From: W. Trevor King <wking@drexel.edu>
Date: Sat, 16 Apr 2011 14:36:02 +0000 (-0400)
Subject: Convert entrez.py to use SOAP interface (much simpler code).
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=276ee35e1becf172183461c3230abcd520efdb3a;p=blog.git

Convert entrez.py to use SOAP interface (much simpler code).
---

diff --git a/posts/entrez/entrez.py b/posts/entrez/entrez.py
index e026929..90dd914 100755
--- a/posts/entrez/entrez.py
+++ b/posts/entrez/entrez.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # Copyright (C) 1998-2004 Frederic Gobry
 # Copyright (C) 2008-2011 W. Trevor King
@@ -15,677 +15,88 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with This program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# Code following John Vu's medline query code pybliographer/Pyblio/Query.py,
-#
-# Python interface to the Entrez databases.
-# See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
-# Current as of August 1, 2007
-#
-# Rules:
-# * Run retrieval scripts on weekends or between 9 pm and 5 am
-#   Eastern Time weekdays for any series of more than 100 requests.
-# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov,
-#   not the standard NCBI Web address.
-# * Make no more than one request every 3 seconds.
-# * Use the URL parameter email, and tool for distributed software,
-#   so that we can track your project and contact you if there is a
-#   problem.
-# * NCBI's Disclaimer and Copyright notice must be evident to users
-#   of your service.
-# * NLM does not claim the copyright on the abstracts in PubMed;
-#   however, journal publishers or authors may.
-# * NLM provides no legal advice concerning distribution of
-#   copyrighted materials, consult your legal counsel.
-#
-# For a good Python-and-XML-DOM intro, see
-#   http://www.boddie.org.uk/python/XML_intro.html
-# for the official docs, see
-#   http://docs.python.org/lib/module-xml.dom.html
-"""Python bindings on Entrez database queries.
+"""Python interface to Entrez_ SOAP_ using the suds_ module.
+
+Before you use this program, read the rules_.
+
+.. _Entrez: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+.. _SOAP: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/DOC/esoap_help.html
+.. _suds: https://fedorahosted.org/suds/
+.. _rules: http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html#UserSystemRequirements
+
+To discover services using suds, try:
+
+>>> print EUTILS_CLIENT  # doctest: +ELLIPSIS, +REPORT_UDIFF
+
+Suds ( https://fedorahosted.org/suds/ )  version: ...  build: ...
+
+Service ( eUtilsService ) tns="http://www.ncbi.nlm.nih.gov/soap/eutils/"
+   Prefixes (6)
+      ns0 = "http://www.ncbi.nlm.nih.gov/soap/eutils/egquery"
+      ns1 = "http://www.ncbi.nlm.nih.gov/soap/eutils/einfo"
+      ns2 = "http://www.ncbi.nlm.nih.gov/soap/eutils/elink"
+      ns3 = "http://www.ncbi.nlm.nih.gov/soap/eutils/epost"
+      ns4 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esearch"
+      ns5 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esummary"
+   Ports (1):
+      (eUtilsServiceSoap)
+         Methods (7):
+            run_eGquery(xs:string term, xs:string tool, xs:string email, )
+            run_eInfo(xs:string db, xs:string tool, xs:string email, )
+            run_eLink(xs:string db, xs:string[] id, xs:string reldate, ...)
+            run_ePost(xs:string db, xs:string id, xs:string WebEnv, ...)
+            run_eSearch(xs:string db, xs:string term, xs:string WebEnv, ...)
+            run_eSpell(xs:string db, xs:string term, xs:string tool, ...)
+            run_eSummary(xs:string db, xs:string id, xs:string WebEnv, ...)
+         Types (34):
+            ns1:DbInfoType
+            ns1:DbListType
+            ...
+            ns0:eGQueryResultType
+
+"""
 
-import logging
-import re
-import string
-import sys
-import time # for querying date ranges of publications
-import urllib
-
-# DOM module for parsing XML,
-# supports Document Object Model (DOM) Level 1 Specification
-# http://docs.python.org/lib/module-xml.dom.minidom.html
-import xml.dom.minidom as dom
+import errno  # for the EINTR check in Pipe._communicate below
+import logging as _logging
+import subprocess as _subprocess
+import sys as _sys
+import time as _time
 
-# For calling the bibutils conversion programs
-from subprocess import Popen, PIPE
+import suds as _suds
+from suds.client import Client as _Client
 
 # Platform constants
-_MSWINDOWS = sys.platform == 'win32'
+_MSWINDOWS = _sys.platform == 'win32'
 _POSIX = not _MSWINDOWS
 
 if _POSIX:
-    import os
-    import select
+    import os as _os
+    import select as _select
 
 
 __version__ = '0.2'
 
-# Entrez access points
-EINFO_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
-ESEARCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
-EFETCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
-ELINK_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
+
+EUTILS_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/eutils.wsdl'
+EFETCH_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/efetch_%s.wsdl'
+
+EUTILS_CLIENT = _Client(EUTILS_WSDL_URL)
 
 # Entrez-requested tracking information
 TOOL = 'entrezpy'
 EMAIL = 'wking@drexel.edu'
 
-# Logger
-
-LOG = logging.getLogger(TOOL)
-LOG.setLevel(logging.WARN)
-_handler = logging.StreamHandler()
-_formatter = logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
+# Logging
+LOG = _logging.getLogger(TOOL)
+LOG.setLevel(_logging.WARN)
+_handler = _logging.StreamHandler()
+_formatter = _logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
 _handler.setFormatter(_formatter)
 LOG.addHandler(_handler)
 del _handler, _formatter
 
-## XML and list utility functions
-
-def urlencode(param_dict):
-    return urllib.urlencode(
-        [(k,v) for k,v in param_dict.iteritems() if v is not None])
-
-def get_text(node):
-    """
-    Given a node (<node> in the following example),
-    extract some-text from '<node>some-text</node>'
-    returns u'some-text'.
-    However, if the xml is '<node/>' returns None
-    """
-    if len(node.childNodes) == 1:
-        data = node.childNodes[0].data
-    elif len(node.childNodes) == 0: # empty node
-        data = None
-    else:
-        raise Exception, "Node contains more than text"
-    return data
-
-def get_child_nodes(node, child_name):
-    """
-    Given a node (<node> in the following example),
-    returns an array of nodes matching <child_name>
-    """
-    ret = []
-    for n in node.childNodes:
-        if n.nodeType != n.ELEMENT_NODE:
-            continue # ignore text, comment, etc. nodes
-        if n.tagName == child_name:
-            ret.append(n)
-    return ret
-
-def get_child_node(node, child_name):
-    """
-    Given a node (<node> in the following example),
-    returns the node matching <child_name>
-    """
-    nodes = get_child_nodes(node, child_name)
-    assert len(nodes) == 1, '%d child nodes named %s' % (
-        len(nodes), child_name)
-    return nodes[0]
-
-def get_child_contents(node, child_name):
-    """
-    Given a node (<node> in the following example),
-    extract some-text from '<node>
-    <child_name>some-text</child_name>
-    <other_tag>other-text</other_tag>
-    <child_name>some-other-text</child_name>
-    ...
-    </node>'
-    Returns ['some-text', 'some-other-text', ...]
-    """
-    nodes = get_child_nodes(node, child_name)
-    ret = []
-    for n in nodes:
-        ret.append(get_text(n))
-    return ret
-
-def get_child_dict(node):
-    """
-    Given a node (<node> in the following example),
-    extract some-text from '<node>
-    <some-tag>some-text</some-tag>
-    <other-tag>other-text</other-tag>
-    <some-tag>some-other-text</some-tag>
-    ...
-    </node>'
-    Returns {'some-tag':['some-text', 'some-other-text', ...],
-             'other-tag':['some-other-text']}
-    """
-    dict = {}
-    tags = [] # to preserve order of tags
-    for n in node.childNodes:
-        if n.nodeType != n.ELEMENT_NODE:
-            continue # ignore text, comment, etc. nodes
-        try: # another entry for an existing tag
-            dict[n.tagName].append(get_text(n))
-        except KeyError: # new tag
-            dict[n.tagName] = [get_text(n)]
-            tags.append(n.tagName)
-    return (dict, tags)
-
-def delist_dict(dict):
-    """
-    Given a dict
-    e.g. {'some-tag':['some-text', 'some-other-text', ...],
-          'other-tag':['some-other-text'], ...} ,
-    replaces any values in an array of length 1 with the element,
-    e.g. {'some-tag':['some-text', 'some-other-text', ...],
-          'other-tag':'some-other-text', ...} ,
-    """
-    for key,value in dict.items():
-        if isinstance(value, list) and len(value) == 1:
-            dict[key] = value[0]
-    return dict
-
-## Get information about the Entrez databases themselves
-
-def _query_einfo(db=None):
-    """
-    Get information about the Entrez databases themselves.
-    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
-
-    Either list all available databases with `db=None`, or specific
-    information on a particular database (e.g. pubmed) with
-    `db=pubmed`.
-    """
-    params = urlencode({
-        'db': db,
-        'tool': TOOL,
-        'email': EMAIL})
-
-    LOG.info("getting einfo from '%s?%s'" % (EINFO_URL, params))
-    f = urllib.urlopen("%s?%s" % (EINFO_URL, params))
-    string = f.read()
-    f.close()
-    LOG.debug('got:\n%s' % string)
-    return string
-
-def get_parsed_einfo(db=None, page=None, parsed=None):
-    """
-    Helper function for various einfo processing functions.
-    Allow each processor to function
-    independently (page=None, parsed=None),
-    with a shared xml string (page=<xml-string>, parsed=None), or
-    with a shared parsed xml structure (page=*, parsed=<parsed-xml>).
-    Use clean_parsed_einfo() for cleanup
-    """
-    if page == None and parsed == None:
-        LOG.info('downloading new einfo page')
-        page = _query_einfo(db)
-    if parsed == None:
-        LOG.info('parsing new einfo page')
-        parsed = dom.parseString(page)
-        parsed_islocal = True
-    else:
-        LOG.info('using old einfo parsing')
-        parsed_islocal = False
-    return (parsed, parsed_islocal)
-
-def clean_parsed_einfo(parsed, parsed_islocal=True):
-    """
-    Helper function for various einfo processing functions.
-    Clean up the parsed xml structure if the calling function created it.
-    """
-    if parsed_islocal == True:
-        LOG.info('cleaning up einfo parsing')
-        parsed.unlink() # clean up the DOM
-
-def database_list(page=None, parsed=None):
-    parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed)
-    databases = []
-    for node in parsed.getElementsByTagName("DbName"):
-        # Extract some-text from '<DbName>some-text</DbName>'
-        # by default, xml.dom.minidom uses unicode,
-        # so strings get printed: "u'string contents'"
-        databases.append(get_text(node))
-    clean_parsed_einfo(parsed,parsed_islocal)
-    return databases
-
-def field_dict(db='pubmed', page=None, parsed=None):
-    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
-    fields = []
-    tags = set()
-    field_info = {}
-    fieldlists = parsed.getElementsByTagName("FieldList")
-    assert len(fieldlists) == 1, '%s\n\n%d FieldLists!' % (
-        parsed.toxml(), len(fieldlists))
-    fieldlist = fieldlists[0]
-    for node in fieldlist.childNodes:
-        if node.nodeType != node.ELEMENT_NODE:
-            continue # ignore text, comment, etc. nodes
-        assert node.tagName == 'Field', (
-            "Unrecognized tag '%s' in FieldList" % node.tagName)
-        field,new_tags = get_child_dict(node)
-        assert len(field['Name']) == 1, (
-            'Multiple field names %s' % str(field['Name']))
-        field = delist_dict(field)
-        fields.append(field['Name'])
-        new_tags = tags.union(new_tags)
-        if tags:
-            assert new_tags == tags, "Inconsistent tags"
-        tags = new_tags
-        field_info[field['Name']] = field
-    clean_parsed_einfo(parsed,parsed_islocal)
-    return (fields, tags, field_info)
-
-def link_dict(db='pubmed', page=None, parsed=None):
-    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
-    links = []
-    tags = set()
-    link_info = []
-    linklists = parsed.getElementsByTagName("LinkList")
-    assert len(linklists) == 1, (
-        '%s\n\n%d LinkLists!' % (parsed.toxml(), len(linklists)))
-    linklist = linklists[0]
-    for node in linklist.childNodes:
-        if node.nodeType != node.ELEMENT_NODE:
-            continue # ignore text, comment, etc. nodes
-        assert node.tagName == 'Link', (
-            "Unrecognized tag '%s' in LinkList" % node.tagName)
-        link,new_tags = get_child_dict(node)
-        assert len(link['Name']) == 1, (
-            'Multiple link names %s' % str(link['Name']))
-        link = delist_dict(link)
-        links.append(link['Name'])
-        new_tags = tags.union(new_tags)
-        if tags:
-            assert new_tags == tags, "Inconsistent tags"
-        tags = new_tags
-        link_info[link['Name']] = link
-    clean_parsed_einfo(parsed,parsed_islocal)
-    return (links, tags, link_info)
-
-def database_info(db='pubmed', page=None, parsed=None):
-    "Convenience function to call both field_dict and link_dict"
-    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
-    fields,field_tags,field_info = field_dict(db=db, parsed=parsed)
-    links,link_tags,link_info = link_dict(db=db, parsed=parsed)
-    clean_parsed_einfo(parsed,parsed_islocal)
-    return (fields, field_tags, field_info, links, link_tags, link_info)
-
-def validate_field(field, fields):
-    "Ensure that field is a valid field for the database db."
-    try:
-        fields.index(field.upper())
-    except ValueError:
-        raise Exception("Field '%s' invalid\nValid fields are\n  %s"
-                        % (field, str(fields)))
-
-def strip_fields_from_term(term):
-    "HACK: really stupid algorithm"
-    fields = []
-    infield = False
-    for i in range(len(term)):
-        if term[i] == '[' and infield == False:
-            infield = True
-            field_start = i+1
-        elif term[i] == ']' and infield == True:
-            infield = False
-            fields.append(term[field_start:i])
-    return fields
-
-def validate_search_term(term, fields):
-    "Ensure that the fields in term are valid fields for the database db."
-    for field in strip_fields_from_term(term):
-        validate_field(field, fields)
-
-
-## Search an Entrez database
-
-def _query_esearch(term, db='pubmed', field=None,
-                   reldate=None, daterange=None, datetype=None,
-                   retmax=None, rettype=None, sort=None,
-                   validate=False, valid_fields=None, debug=False):
-    """
-    Search an Entrez database.
-    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
-
-    Does not currently support the usehistory, WebEnv, query_key,
-    retstart, or retmode parameters.
-
-    Help with the arguments adapted from esearch_help.html:
-
-    term: This command uses search terms or phrases with or without
-          Boolean operators.
-          You can search in several fields using the [term field] tag.
-          You can search in a single field using the 'field' parameter below.
-          ?You may also tag search terms using field=tag.? I don't
-          understand this line
-          For example: term=asthma[MESH]+OR+hay+fever[MESH]
-          'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
-          ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )
-
-    db: This command selects the database to be searched
-        For example: db=pubmed
-
-    field: Use this command to specify a specific search field.
-           PubMed fields: affl, auth, ecno, jour, iss, mesh,...
-           Retrieve with field_dict('pubmed')
-           For example: field=auth
-
-    reldate: Limit items a number of days immediately preceding today's date.
-             For example: reldate=365
-
-    daterange: Limit results bounded by two specific dates.
-               For example: daterange=('2001', '2002/01/01')
-               (implemented as mindate=2001&maxdate=2002/01/01)
-
-    datetype: Limit dates to a specific date field based on database.
-              For example: datetype=edat
-
-    retmax: Limit the number of items retrieved
-            For example: retmax=100
-
-    rettype: Select the retrieval type
-             PubMed values: count, uilist (default)
-
-    sort: Sort the returned uilist
-          PubMed values: author, last+author, journal, pub+date
-
-    """
-    if daterange != None:
-        assert len(daterange) == 2, (
-            "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
-            % (daterange,))
-        reldate == None, "Specifying date with daterange AND reldate!"
-        mindate = daterange[0]
-        maxdate = daterange[1]
-    else:
-        mindate = None
-        maxdate = None
-    if validate:
-        assert len(valid_fields) > 0, (
-            'Need a list of valid fields to validate')
-        if field != None:
-            validate_field(field)
-        validate_search_term(term, valid_fields)
-    params = urlencode({
-        'tool': TOOL,
-        'email': EMAIL,
-        'term': term,
-        'db': db,
-        'field': field,
-        'reldate': reldate,
-        'mindate': mindate,
-        'maxdate': maxdate,
-        'datetype': datetype,
-        'maxdate': maxdate,
-        'retmax': retmax,
-        'rettype': rettype,
-        'sort': sort})
-    LOG.info("getting esearch from '%s?%s'" % (ESEARCH_URL, params))
-    f = urllib.urlopen("%s?%s" % (ESEARCH_URL, params))
-    string = f.read()
-    f.close()
-    LOG.debug('got:\n%s' % string)
-    return string
-
-def parse_esearch(page):
-    "Parse the xml returned by _query_esearch()"
-    parsed = dom.parseString(page)
-
-    pid_list = []
-    for node in parsed.getElementsByTagName("Id"):
-        pid_list.append(get_text(node))
-
-    parsed.unlink()
-
-    return pid_list
-
-
-## Fetch records by Primary ID from an Entrez database
-
-def _query_efetch(id, db='pubmed',
-                  retmax=None, retmode='xml', rettype='medline'):
-    """
-    Fetch records by primary ID from an Entrez database.
-    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
-    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
-
-
-    Does not currently support the usehistory, WebEnv, query_key, or
-    retstart parameters.
-
-    Help with the arguments adapted from efetchlit_help.html:
-
-    id: Primary UIs identifying the documents to fetch
-        For example: 'id=11877539, 11822933,11871444'
-
-    db: This command selects the database to be searched
-        For example: db=pubmed
-
-    retmax: Limit the number of items retrieved (default 20)
-            For example: retmax=100
-
-    retmode: Select the retrieval output format
-             xml (not journals)
-             html
-             text
-             asn.1 (not journals)
-
-    rettype: Select the retrieval type
-             uilist
-             abstract (not omim)
-             citation (not omim)
-             medline (not omim)
-             full (journals and omim)
-
-    Not all retmodes are possible with all rettypes:
-     PubMed Options:
-                uilist  abstract  citation  medline
-        xml       x        x*        x*       x*
-        text      x        x         x        x
-        html      x        x         x        x
-        asn.1    n/a       x*        x*       x
-       x   = retrieval mode available
-       *   returned retrieval type is the complete record in the retrieval mode
-       n/a - not available
-     OMIM Options: (not case sensitive)
-                uilist    docsum  synopsis   variants  detailed  ExternalLink
-                (MIM              (Clinical  (Allelic
-                numbers)          synopsis)  Variants)
-        xml       x        x*      x*         x*        x*        x*
-        text      x        x       x          x         x*        x*
-        html      x        x       x          x         x*        x*
-        asn.1     x*       x*      x*         x*        x*        x*
-       x   = retrieval mode available
-       *   returned retrieval type is the complete record in the retrieval mode
-       n/a - not available
-
-    """
-    idstring = ""
-    for d in id:
-        idstring += "%s," % d
-    idstring = idstring[:-1] # remove trailing comma
-    params = urlencode({
-        'tool': TOOL,
-        'email': EMAIL,
-        'id': idstring,
-        'db': db,
-        'retmax': retmax,
-        'retmode': retmode,
-        'rettype': rettype})
-
-    LOG.info("getting efetch from '%s?%s'" % (EFETCH_URL, params))
-    f = urllib.urlopen("%s?%s" % (EFETCH_URL, params))
-    string = f.read()
-    f.close()
-    LOG.debug('got:\n%s' % string)
-    return string
-
-
-## Fetch links by Primary ID from an Entrez database
-
-def _query_elink(id, term=None, db='all', dbfrom='pubmed',
-                 cmd=None, linkname=None, holding=None,
-                 version=1,
-                 reldate=None, daterange=None, datetype=None,
-                 retmode='xml'):
-    """
-    Fetch links from a list of primary IDs in an Entrez database.
-    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
-    http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html
-
-    Does not currently support the WebEnv or query_key parameters.
-
-    Help with the arguments adapted from efetchlit_help.html:
-
-    id: Primary UIs identifying the documents to fetch
-        For example: 'id=11877539, 11822933,11871444'
-
-    term: This command uses search terms or phrases with or without
-          Boolean operators to limit the returned matching links.
-
-    db: This command selects the databases to be searched for link targets.
-        For example: db=all
-
-    dbfrom: This command selects the database containing the ids.
-            For example: dbfrom=pubmed
-
-
-    cmd: Link commands
-     * prlinks - List the hyperlink to the primary LinkOut provider for
-       multiple IDs and database. Each ID is processed separately.
-     * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut
-       provider for a single ID and database.
-       Return the elink command, since fetching
-       it breaks the relative links in the
-       publisher's page.
-     * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
-       multiple IDs and database. Each ID is processed separately.
-     * llinkslib - List LinkOut URLs and Attributes for multiple IDs and
-       database. Each ID is processed separately.
-     * lcheck - Check for the existence (Y or N) of an external link for
-       multiple IDs and database.
-     * ncheck - Check for the existence of a neighbor link for each ID within
-       a database, e.g., Related Articles in PubMed.
-     * neighbor - Display neighbors within a database.
-     * neighbor_history - Create history (WebEnv & query_key) for use in other
-       EUtilities.
-     * acheck - Lists Entrez databases links for multiple IDs from a single
-       database.
-
-    linkname: link to a specific neighbor subset
-              For example: linkname=nucleotide_nucleotide_comp
-
-    holding: List LinkOut URLs for the specified holding provider, (library).
-             Used only in conjunction with cmd=llinks or cmd=llinkslib
-             For example: cmd=llinkslib&holding=medlib
-
-    version: Include a version number to refer to the latest DTD.
-             For example: version=1
-             retrieves the latest DTD (eLink_050511.dtd) that includes the additional
-             elements, MenuTag, LinkInfo and IdLinkSet.
-
-    Date commands are only valid for dbfrom=pubmed & cmd=neighbor
-    reldate: Limit items a number of days immediately preceding today's date.
-             For example: reldate=365
-
-    daterange: Limit results bounded by two specific dates.
-               For example: daterange=('2001', '2002/01/01')
-               (implemented as mindate=2001&maxdate=2002/01/01)
-
-    datetype: Limit dates to a specific date field based on database.
-              For example: datetype=edat
-
-    retmode: Select the retrieval output format
-             xml (default)
-             ref (only used with cmd=prlinks for one ID)
-
-    """
-    idstring = ""
-    for d in id:
-        idstring += "%s," % d
-    idstring = idstring[:-1] # remove trailing comma
-
-    params = urlencode({
-        'tool': TOOL,
-        'email': EMAIL,
-        'id': idstring,
-        'term': term,
-        'db': db,
-        'dbfrom': dbfrom,
-        'cmd': cmd,
-        'linkname': linkname,
-        'holding': holding,
-        'version': version,
-        'reldate': reldate,
-        'daterange': daterange,
-        'datetype': datetype,
-        'retmode': retmode})
-
-    LOG.info("getting elink from '%s?%s'" % (ELINK_URL, params))
-    f = urllib.urlopen("%s?%s" % (ELINK_URL, params))
-
-    if cmd == 'prlinks' and retmode == 'ref':
-        # Just get the link, we don't need the provider's webpage HTML.
-        url = f.geturl()
-        f.close()
-        return url
-
-    string = f.read()
-    f.close()
-    LOG.debug('got:\n%s' % string)
-    return string
-
-
-## Combining the searching and parsing (dropping some of the less used
-## features)
-
-def search_fetch_xml(term, db='pubmed', field=None,
-                     reldate=None, daterange=None, datetype=None,
-                     retmax=None, sort=None,
-                     validate=False, valid_fields=None,
-                     retmode='xml', rettype='medline'):
-    if validate and valid_fields == None:
-        valid_fields,field_tags,field_info = field_dict(db)
-    search_page = _query_esearch(term, db, field,
-                                 reldate, daterange, datetype,
-                                 retmax, rettype='uilist', sort=sort,
-                                 validate=validate, valid_fields=valid_fields)
-    pid_list = parse_esearch(search_page)
-    if not pid_list:
-        return None
-    fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype)
-    return fetch_page
-
-def search_link(term, db='pubmed', field=None,
-                reldate=None, daterange=None, datetype=None,
-                retmax=None, sort=None,
-                validate=False, valid_fields=None,
-                link_term=None, fromdb=None,
-                cmd=None, linkname=None, link_holding=None,
-                version=1,
-                link_reldate=None, link_daterange=None, link_datetype=None,
-                link_retmode='xml'):
-    if validate and valid_fields == None:
-        valid_fields,field_tags,field_info = field_dict(db)
-    search_page = _query_esearch(term, db, field,
-                                 reldate, daterange, datetype,
-                                 retmax, rettype='uilist', sort=sort,
-                                 validate=validate, valid_fields=valid_fields)
-    pid_list = parse_esearch(search_page)
-    link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
-                             cmd=cmd, linkname=linkname, holding=link_holding,
-                             version=version,reldate=link_reldate,
-                             daterange=link_daterange, datetype=link_datetype,
-                             retmode=link_retmode)
-    return link_page
 
 
 ## Use the external bibutils package to convert to BibTeX format
@@ -721,7 +132,7 @@ class Pipe (object):
     def __init__(self, cmds, stdin=None):
         if isinstance(stdin, str):
             stdin_str = stdin
-            stdin = PIPE
+            stdin = _subprocess.PIPE
         else:
             stdin_str = None
 
@@ -734,8 +145,9 @@ class Pipe (object):
             kwargs = {}
             if _POSIX:
                 kwargs['close_fds'] = True
-            self._procs.append(Popen(
-                cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs))
+            self._procs.append(_subprocess.Popen(
+                cmd, stdin=stdin, stdout=_subprocess.PIPE,
+                stderr=_subprocess.PIPE, **kwargs))
 
         self.stdout,self.stderrs = self._communicate(input=stdin_str)
 
@@ -813,12 +225,12 @@ class Pipe (object):
         input_offset = 0
         while read_set or write_set:
-            LOG.debug('select on read %s, write %s' % (read_set,write_set))
+            LOG.debug('select on read %s, write %s' %(read_set, write_set))
            try:
-                rlist,wlist,xlist = select.select(read_set, write_set, [])
-            except select.error, e:
+                rlist,wlist,xlist = _select.select(read_set, write_set, [])
+            except _select.error, e:
                 if e.args[0] == errno.EINTR:
-                    LOG.debug('EINTR')
+                    LOG.debug('EINTR: %s' % e)
                     continue
                 raise
             LOG.debug('selected read %s, write %s, exception %s'
@@ -829,7 +241,7 @@ class Pipe (object):
                 # blocking.  POSIX defines PIPE_BUF >= 512
                 LOG.debug('write to stdin for process 0')
                 chunk = input[input_offset:input_offset+512]
-                bytes_written = os.write(
+                bytes_written = _os.write(
                     self._procs[0].stdin.fileno(), chunk)
                 input_offset += bytes_written
                 if input_offset >= len(input):
@@ -839,7 +251,7 @@ class Pipe (object):
                     LOG.debug('stdin complete')
             if self._procs[-1].stdout in rlist:
                 LOG.debug('read stdout for final process')
-                data = os.read(self._procs[-1].stdout.fileno(), 1024)
+                data = _os.read(self._procs[-1].stdout.fileno(), 1024)
                 if data == '':
                     self._procs[-1].stdout.close()
                     read_set.remove(self._procs[-1].stdout)
@@ -848,7 +260,7 @@ class Pipe (object):
             for i,proc in enumerate(self._procs):
                 if proc.stderr in rlist:
                     LOG.debug('read stderr for process %i' % i)
-                    data = os.read(proc.stderr.fileno(), 1024)
+                    data = _os.read(proc.stderr.fileno(), 1024)
                     if data == '':
                         proc.stderr.close()
                         read_set.remove(proc.stderr)
@@ -873,7 +285,7 @@ def medline_xml_to_bibtex(fetch_page):
    ... '',
    ... '',
+    ... '/DTD/pubmed_110101.dtd">',
    ... '',
    ... '  ',
    ... '    ',
@@ -926,7 +338,7 @@ def medline_xml_to_bibtex(fetch_page):
    ... '    ',
    ... '',
    ... ])
-    >>> print medline_xml_to_bibtex(xml)
+    >>> print medline_xml_to_bibtex(xml)  # doctest: +REPORT_UDIFF
    @Article{King2010,
      author = "William T. King and Meihong Su and Guoliang Yang",
      title = "Monte Carlo simulation of mechanical unfolding of
@@ -940,30 +352,19 @@ def medline_xml_to_bibtex(fetch_page):
      pages = "159--166",
      ISSN = "1879-0003",
      doi = "10.1016/j.ijbiomac.2009.12.001",
+      URL = "http://www.ncbi.nlm.nih.gov/pubmed/20004685",
    }
    """
-    LOG.info('convert medline XML to BibTeX\n%s' % fetch_page)
+    LOG.info('convert medline XML to BibTeX')
+    LOG.debug('convert from\n%s' % fetch_page)
     p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
              stdin=fetch_page)
     LOG.debug('converted to\n%s' % p.stdout)
     return p.stdout
 
-## Random
-
-def hints():
-    "Print Entrez search hints and exit"
-
-    print """
-free full text [sb]
-
-
-"""
-
-## Test with a mini-searching application
-
-if __name__ == "__main__":
+if __name__ == '__main__':
     from optparse import OptionParser
 
     usage_string = '\n'.join([
@@ -972,7 +373,7 @@ if __name__ == "__main__":
         ' (print medline xml matching search)',
         '| %prog -l [options] SEARCH_TERM'
         ' (print links to entries matching search)',
-        '| %prog -L [-d DATABASE] [-f FILE]     (list databases)',
+        '| %prog -L [-f FILE]     (list databases)',
         '| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]'
         ' (list fields in a database, or details on a single field)',
         '',
@@ -981,6 +382,16 @@ if __name__ == "__main__":
         'See the docstrings in %prog or',
         ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
         'eutils_help.html',
+        ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
+        'eutils_help.html#UserSystemRequirements',
+        ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+        'einfo_help.html',
+        ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+        'esearch_help.html',
+        ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+        'efetch_help.html',
+        ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+        'elink_help.html',
         'for more details.'
         ])
 
@@ -994,19 +405,16 @@ if __name__ == "__main__":
    # the value of that option.
# " - parser.add_option('-d', '--database', dest="database", + parser.add_option('-d', '--database', dest='database', help="Search DATABASE (default '%default')", - type='string', metavar="DATABASE", default='pubmed') - parser.add_option('-f', '--file', dest="filename", - help="write output to FILE (default stdout)", - type='string', metavar="FILE") - parser.add_option('-v', '--verbose', dest="verbose", action="store_true", - help="Print lots of debugging information", - default=False) - parser.add_option('-H', '--hints', callback=hints, - help="Print Entrez search hints and exit", - action="callback") - + type='string', metavar='DATABASE', default='pubmed') + parser.add_option('-f', '--file', dest='filename', + help='write output to FILE (default stdout)', + type='string', metavar='FILE') + parser.add_option('-v', '--verbose', dest='verbose', action='count', + help=('Print minimal debugging information. Use twice ' + 'to get lots of debugging info.'), + default=0) # mode control options mode = 'search' @@ -1019,159 +427,196 @@ if __name__ == "__main__": mode = 'explain' parser.add_option('-L', '--list-mode', callback=set_mode, - help="Run in list mode", action="callback") + help='Run in list mode', action='callback') parser.add_option('-X', '--explain-mode', callback=set_mode, - help="Run in explain mode", action="callback") + help='Run in explain mode', action='callback') # search-fetch-xml-to-? options output = 'bibtex' def set_output(option, opt_str, value, parser): global output long_option = option.get_opt_string() + if long_option == '--output-xml': + output = 'medline' + if long_option == '--output-bibtex': + output = 'bibtex' if long_option == '--output-link': output = 'link' - parser.add_option('-W', '--raw', dest="raw", action="store_true", - help="Output raw Entrez xml", default=False) - parser.add_option('-F', '--field', dest="field", - help="Limit SEARCH_TERM to FIELD", - type='string', metavar="FIELD") - parser.add_option('-r', '--reldate', dest="reldate", - help="Limit search to dates within DAYS of today", - type='string', metavar="DAYS") - parser.add_option('-R', '--daterange', dest="daterange", - help=("Limit search to dates within DATERANGE " - "(e.g. '2001/1/1,2002')"), - type='string', metavar="DATERANGE") - parser.add_option('-t', '--datetype', dest="datetype", + parser.add_option('-x', '--output-xml', callback=set_output, + help='Output search results as Medline XML', + action='callback') + parser.add_option('-b', '--output-bibtex', callback=set_output, + help='Output search results as BibTeX', + action='callback') + parser.add_option('-F', '--field', dest='field', + help='Limit SEARCH_TERM to FIELD', + type='string', metavar='FIELD') + parser.add_option('-r', '--reldate', dest='reldate', + help='Limit search to dates within DAYS of today', + type='string', metavar='DAYS') + parser.add_option('--mindate', dest='mindate', + help=('Limit search to date after MINDATE ' + "(e.g. '2001/1/1' or '2002')"), + type='string', metavar='MINDATE') + parser.add_option('--maxdate', dest='maxdate', + help=('Limit search to date after MAXDATE ' + "(e.g. '2001/1/1' or '2002')"), + type='string', metavar='MAXDATE') + parser.add_option('-t', '--datetype', dest='datetype', help=("Select field to apply date limits to " "(e.g. 
-                      type='string', metavar="DATETYPE")
-    parser.add_option('-m', '--retmax', dest="retmax",
-                      help=('Return at max RETMAX items from a successful '
-                            'search (default %default)'),
-                      type='string', metavar="RETMAX", default=20)
-    parser.add_option('-M', '--retmode', dest="retmode",
-                      help="Select fetch/link output format",
-                      type='string', metavar="RETMODE", default='xml')
-    parser.add_option('-V', '--validate', dest="validate", action="store_true",
+                      type='string', metavar='DATETYPE')
+    parser.add_option('-m', '--retmax', dest='retmax',
+                      help=('Return at most RETMAX items from a successful '
+                            'search (default %default)'),
+                      type='int', metavar='RETMAX', default=20)
+    parser.add_option('-s', '--retstart', dest='retstart',
+                      help=('Index of first returned search item from a '
+                            'successful search (default %default)'),
+                      type='int', metavar='RETSTART', default=0)
+    parser.add_option('-V', '--validate', dest='validate', action='store_true',
                       help=('Check that FIELD and field tags in SEARCH_TERM '
                             'are valid for DB'),
                       default=False)
 
     # output link options
     parser.add_option('-l', '--output-link', callback=set_output,
-                      help="Output a link (instead of xml citations)",
-                      action="callback")
-    parser.add_option('-c', '--link-cmd', dest="link_cmd",
-                      help="Select link output",
-                      type='string', metavar="LINK_CMD")
-    parser.add_option('-T', '--link-term', dest="link_term",
-                      help="Limit links to those matching LINK_TERM",
-                      type='string', metavar="LINK_TERM")
-    parser.add_option('-D', '--from-database', dest="fromdb",
-                      help="Limit links to those from FROMDATABASE)",
-                      type='string', metavar="FROMDATABASE")
-    parser.add_option('-n', '--link-name', dest="linkname",
-                      help="Limit links to a specific neighbor",
-                      type='string', metavar="LINKNAME")
+                      help='Output a link (instead of xml citations).',
+                      action='callback')
+    parser.add_option('-c', '--link-cmd', dest='link_cmd',
+                      help='Select link output',
+                      type='string', metavar='LINK_CMD')
+    parser.add_option('-T', '--link-term', dest='link_term',
+                      help='Limit links to those matching LINK_TERM',
+                      type='string', metavar='LINK_TERM')
+    parser.add_option('-D', '--from-database', dest='dbfrom',
+                      help='Limit links to those from FROMDATABASE',
+                      type='string', metavar='FROMDATABASE')
+    parser.add_option('-n', '--link-name', dest='linkname',
+                      help='Limit links to a specific neighbor',
+                      type='string', metavar='LINKNAME')
 
     (options, args) = parser.parse_args()
     parser.destroy()
 
     # open the output file if specified
     if options.filename == None:
-        outfile = sys.stdout
+        outfile = _sys.stdout
     else:
         outfile = file(options.filename, 'w')
 
-    if options.verbose:
-        LOG.setLevel(logging.DEBUG)
+    if options.verbose == 1:
+        LOG.setLevel(_logging.INFO)
+    elif options.verbose > 1:
+        LOG.setLevel(_logging.DEBUG)
 
     LOG.debug('operating in %s mode' % mode)
 
     if mode == 'list':
-        print >> outfile, "Available databases:"
-        databases = database_list()
-        for db in databases:
-            print >> outfile, "\t%s" % db
+        outfile.write('# available databases:\n')
+        LOG.info('run eInfo to get list of databases')
+        q = EUTILS_CLIENT.service.run_eInfo(tool=TOOL, email=EMAIL)
+        if hasattr(q, 'ERROR'):
+            raise Exception(q.ERROR)
+
+        for db in q.DbList.DbName:
+            outfile.write('%s\n' % db)
 
     elif mode == 'explain':
-        fields,tags,field_info = field_dict(db=options.database)
-        if options.field == None:
-            print >> outfile, "Available fields in %s:" % options.database
-            field_size = [0,0]
-            for field in fields:
-                if len(field) > field_size[0]:
-                    field_size[0] = len(field)
-                if len(field_info[field]['FullName']) > field_size[1]:
-                    field_size[1] = len(field_info[field]['FullName'])
-            for field in fields:
-                print >> outfile, ('\t%*.*s\t%-*.*s'
-                                   % (field_size[0], field_size[0], field,
-                                      field_size[1], field_size[1],
-                                      field_info[field]['FullName']))
-        else:
-            print >> outfile, (
-                'Field %s in %s:' % (options.field,options.database))
+        LOG.info('run eInfo on %s' % options.database)
+        q = EUTILS_CLIENT.service.run_eInfo(
+            db=options.database, tool=TOOL, email=EMAIL)
+        if hasattr(q, 'ERROR'):
+            raise Exception(q.ERROR)
+
+        if options.field:  # print specific info about this field
+            outfile.write(
+                'field %s in %s:\n' % (options.field, options.database))
+            fields = dict(
+                [(field.Name, field) for field in q.DbInfo.FieldList.Field])
+            field = fields[options.field]
+            attributes = sorted(
+                [(a, getattr(field, a)) for a in dir(field)
+                 if not a.startswith('_')])
+            field_size = [0]
+            for attribute,value in attributes:
+                if len(attribute) > field_size[0]:
+                    field_size[0] = len(attribute)
+            for attribute,value in attributes:
+                outfile.write(
+                    '%*.*s\t%s\n'
+                    % (field_size[0], field_size[0], attribute, value))
+        else:  # print general info
+            outfile.write('database: %s\n' % q.DbInfo.DbName)
+            outfile.write('description: %s\n' % q.DbInfo.Description)
+            outfile.write('available fields:\n')
             field_size = [0,0]
-            for key in tags:
-                if len(key) > field_size[0]:
-                    field_size[0] = len(key)
-                if len(field_info[options.field][key]) > field_size[1]:
-                    field_size[1] = len(field_info[options.field][key])
-            for key in tags:
-                print >> outfile, ('\t%*.*s\t%-*.*s'
-                                   % (field_size[0], field_size[0], key,
-                                      field_size[1], field_size[1],
-                                      field_info[options.field][key]))
+            for field in q.DbInfo.FieldList.Field:
+                if len(field.Name) > field_size[0]:
+                    field_size[0] = len(field.Name)
+                if len(field.FullName) > field_size[1]:
+                    field_size[1] = len(field.FullName)
+            for field in q.DbInfo.FieldList.Field:
+                outfile.write(
+                    '%*.*s\t%-*.*s\t%s\n'
+                    % (field_size[0], field_size[0], field.Name,
+                       field_size[1], field_size[1], field.FullName,
+                       field.Description))
 
     elif mode == 'search':
         search_term = args[0]
         LOG.debug('output %s' % output)
 
-        if output == 'bibtex':
-            medline_xml = search_fetch_xml(term=search_term,
-                                           db=options.database,
-                                           field=options.field,
-                                           reldate=options.reldate,
-                                           daterange=options.daterange,
-                                           datetype=options.datetype,
-                                           retmax=options.retmax,
-                                           validate=options.validate,
-                                           retmode=options.retmode,
-                                           rettype='medline')
-            if medline_xml:
-                if options.raw:
-                    print >> outfile, medline_xml
-                else:
-                    bibtex = medline_xml_to_bibtex(medline_xml)
-                    print >> outfile, bibtex
-
-        elif output == 'link':
-            # Assume that if you're looking for links
-            # your search is already pretty refined,
-            # so use the date options for link-limiting.
-            link_xml = search_link(term=search_term,
-                                   db=options.database,
-                                   field=options.field,
-                                   reldate=None,
-                                   daterange=None,
-                                   datetype=None,
-                                   retmax=None,
-                                   sort=None,
-                                   validate=options.validate,
-                                   valid_fields=None,
-                                   link_term=options.link_term,
-                                   fromdb=options.fromdb,
-                                   cmd=options.link_cmd,
-                                   linkname=options.linkname,
-                                   link_holding=None,
-                                   version=1,
-                                   link_reldate=options.reldate,
-                                   link_daterange=options.daterange,
-                                   link_datetype=options.datetype,
-                                   link_retmode=options.retmode,)
-            print >> outfile, link_xml
+        LOG.info('maxdate: %r, mindate %r' % (options.maxdate, options.mindate))
+        if options.mindate and not options.maxdate:
+            options.maxdate = _time.strftime('%Y/%m/%d')
+            LOG.info('fill in maximum date: %s' % options.maxdate)
+        elif options.maxdate and not options.mindate:
+            options.mindate = '0'
+            LOG.info('fill in minimum date: %s' % options.mindate)
+
+        LOG.info('run eSearch on %s' % options.database)
+        q = EUTILS_CLIENT.service.run_eSearch(
+            db=options.database, term=search_term, tool=TOOL, email=EMAIL,
+            field=options.field, reldate=options.reldate,
+            mindate=options.mindate, maxdate=options.maxdate,
+            datetype=options.datetype,
+            RetStart=options.retstart, RetMax=options.retmax,
+            #sort=
+            )
+        if hasattr(q, 'ERROR'):
+            raise Exception(q.ERROR)
+        if hasattr(q.IdList, 'Id'):
+            ret = int(len(q.IdList.Id))
+        else:
+            ret = 0
+        LOG.info('search returned %d of %d items' % (ret, int(q.Count)))
+
+        if ret > 0:
+            if output in ['medline', 'bibtex']:
+                LOG.info('run eFetch on %s' % options.database)
+                efetch_client = _Client(EFETCH_WSDL_URL % options.database)
+                f = efetch_client.service.run_eFetch(
+                    id=','.join(q.IdList.Id), tool=TOOL, email=EMAIL)
+                if hasattr(f, 'ERROR'):
+                    raise Exception(f.ERROR)
+
+            if output == 'medline':
+                outfile.write(str(efetch_client.last_received()).rstrip()+'\n')
+            elif output == 'bibtex':
+                outfile.write(
+                    medline_xml_to_bibtex(str(efetch_client.last_received())))
+            elif output == 'link':
+                LOG.info('run eLink on %s' % options.database)
+                f = EUTILS_CLIENT.service.run_eLink(
+                    db=options.database, id=','.join(q.IdList.Id),
+                    #reldate=, mindate=, maxdate=, datetype=,
+                    term=options.link_term, dbfrom=options.dbfrom,
+                    linkname=options.linkname, cmd=options.link_cmd,
+                    tool=TOOL, email=EMAIL)
+                outfile.write(str(EUTILS_CLIENT.last_received()).rstrip()+'\n')
+            else:
+                raise KeyError(output)
 
     if options.filename != None:
         outfile.close()
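
Editor's note: for readers who want to try the converted interface without the full script, below is a minimal sketch of the eSearch-to-eFetch round trip the new code performs. It is not part of the commit; the search term is arbitrary, and it assumes suds is installed and that NCBI still serves the two WSDL URLs defined in the module (EUTILS_WSDL_URL and EFETCH_WSDL_URL).

    # Sketch (Python 2, matching the script): eSearch -> eFetch with suds.
    # Assumes the WSDL URLs from the commit above are still live at NCBI.
    from suds.client import Client

    TOOL = 'entrezpy'           # Entrez-requested tracking information
    EMAIL = 'wking@drexel.edu'

    eutils = Client('http://eutils.ncbi.nlm.nih.gov/soap/v2.0/eutils.wsdl')
    search = eutils.service.run_eSearch(
        db='pubmed', term='mechanical unfolding[TITL]', RetMax=3,
        tool=TOOL, email=EMAIL)
    if hasattr(search.IdList, 'Id'):  # no 'Id' attribute means no hits
        # eFetch uses a per-database WSDL (EFETCH_WSDL_URL % 'pubmed').
        efetch = Client(
            'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/efetch_pubmed.wsdl')
        efetch.service.run_eFetch(
            id=','.join(search.IdList.Id), tool=TOOL, email=EMAIL)
        # Raw Medline XML, ready for medline_xml_to_bibtex():
        print efetch.last_received()

The same flow is available from the command line once the script is saved, e.g. `python entrez.py -b 'mechanical unfolding[TITL]'` for BibTeX output.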