-#!/usr/bin/python
+#!/usr/bin/env python
#
# Copyright (C) 1998-2004 Frederic Gobry
# Copyright (C) 2008-2011 W. Trevor King
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
-#
-# Code following John Vu's medline query code pybliographer/Pyblio/Query.py,
-#
-# Python interface to the Entrez databases.
-# See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
-# Current as of August 1, 2007
-#
-# Rules:
-# * Run retrieval scripts on weekends or between 9 pm and 5 am
-# Eastern Time weekdays for any series of more than 100 requests.
-# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov,
-# not the standard NCBI Web address.
-# * Make no more than one request every 3 seconds.
-# * Use the URL parameter email, and tool for distributed software,
-# so that we can track your project and contact you if there is a
-# problem.
-# * NCBI's Disclaimer and Copyright notice must be evident to users
-# of your service.
-# * NLM does not claim the copyright on the abstracts in PubMed;
-# however, journal publishers or authors may.
-# * NLM provides no legal advice concerning distribution of
-# copyrighted materials, consult your legal counsel.
-#
-# For a good Python-and-XML-DOM intro, see
-# http://www.boddie.org.uk/python/XML_intro.html
-# for the official docs, see
-# http://docs.python.org/lib/module-xml.dom.html
-"""Python bindings on Entrez database queries.
+"""Python interface to Entrez_ SOAP_ using the suds_ module.
+
+Before you use this program, read the rules_.
+
+.. _Entrez: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+.. _SOAP: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/DOC/esoap_help.html
+.. _suds: https://fedorahosted.org/suds/
+.. _rules: http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html#UserSystemRequirements
+
+To discover services using suds, try:
+
+>>> print EUTILS_CLIENT # doctest: +ELLIPSIS, +REPORT_UDIFF
+<BLANKLINE>
+Suds ( https://fedorahosted.org/suds/ ) version: ... build: ...
+<BLANKLINE>
+Service ( eUtilsService ) tns="http://www.ncbi.nlm.nih.gov/soap/eutils/"
+ Prefixes (6)
+ ns0 = "http://www.ncbi.nlm.nih.gov/soap/eutils/egquery"
+ ns1 = "http://www.ncbi.nlm.nih.gov/soap/eutils/einfo"
+ ns2 = "http://www.ncbi.nlm.nih.gov/soap/eutils/elink"
+ ns3 = "http://www.ncbi.nlm.nih.gov/soap/eutils/epost"
+ ns4 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esearch"
+ ns5 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esummary"
+ Ports (1):
+ (eUtilsServiceSoap)
+ Methods (7):
+ run_eGquery(xs:string term, xs:string tool, xs:string email, )
+ run_eInfo(xs:string db, xs:string tool, xs:string email, )
+ run_eLink(xs:string db, xs:string[] id, xs:string reldate, ...)
+ run_ePost(xs:string db, xs:string id, xs:string WebEnv, ...)
+ run_eSearch(xs:string db, xs:string term, xs:string WebEnv, ...)
+ run_eSpell(xs:string db, xs:string term, xs:string tool, ...)
+ run_eSummary(xs:string db, xs:string id, xs:string WebEnv, ...)
+ Types (34):
+ ns1:DbInfoType
+ ns1:DbListType
+ ...
+ ns0:eGQueryResultType
+<BLANKLINE>
+<BLANKLINE>
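+
+A minimal live query through the client might look like this (a
+sketch; skipped under doctest because it needs network access):
+
+>>> q = EUTILS_CLIENT.service.run_eInfo(tool=TOOL, email=EMAIL)  # doctest: +SKIP
+>>> 'pubmed' in [str(db) for db in q.DbList.DbName]  # doctest: +SKIP
+True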
"""
-import logging
-import re
-import string
-import sys
-import time # for querying date ranges of publications
-import urllib
-
-# DOM module for parsing XML,
-# supports Document Object Model (DOM) Level 1 Specification
-# http://docs.python.org/lib/module-xml.dom.minidom.html
-import xml.dom.minidom as dom
+import logging as _logging
+import subprocess as _subprocess
+import sys as _sys
+import time as _time
-# For calling the bibutils conversion programs
-from subprocess import Popen, PIPE
+import suds as _suds
+from suds.client import Client as _Client
# Platform constants
-_MSWINDOWS = sys.platform == 'win32'
+_MSWINDOWS = _sys.platform == 'win32'
_POSIX = not _MSWINDOWS
if _POSIX:
- import os
- import select
+ import os as _os
+ import select as _select
__version__ = '0.2'
-# Entrez access points
-EINFO_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
-ESEARCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
-EFETCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
-ELINK_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
+
+EUTILS_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/eutils.wsdl'
+EFETCH_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/efetch_%s.wsdl'
+
+EUTILS_CLIENT = _Client(EUTILS_WSDL_URL)
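+# The eFetch WSDL is database-specific, so a matching client is built
+# on demand, e.g. (a sketch): _Client(EFETCH_WSDL_URL % 'pubmed')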
# Entrez-requested tracking information
TOOL = 'entrezpy'
EMAIL = 'wking@drexel.edu'
-# Logger
-
-LOG = logging.getLogger(TOOL)
-LOG.setLevel(logging.WARN)
-_handler = logging.StreamHandler()
-_formatter = logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
+# Logging
+LOG = _logging.getLogger(TOOL)
+LOG.setLevel(_logging.WARN)
+_handler = _logging.StreamHandler()
+_formatter = _logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
_handler.setFormatter(_formatter)
LOG.addHandler(_handler)
del _handler, _formatter
-## XML and list utility functions
-
-def urlencode(param_dict):
- return urllib.urlencode(
- [(k,v) for k,v in param_dict.iteritems() if v is not None])
-
-def get_text(node):
- """
- Given a node (<node-name> in the following example),
- extract some-text from '<node-name>some-text</node-name>'
- returns u'some-text'.
-    However, if the xml is '<node-name/>' (an empty node), returns None.
- """
- if len(node.childNodes) == 1:
- data = node.childNodes[0].data
- elif len(node.childNodes) == 0: # empty node
- data = None
- else:
- raise Exception, "Node contains more than text"
- return data
-
-def get_child_nodes(node, child_name):
- """
- Given a node (<node-name> in the following example),
- returns an array of nodes matching <child-name>
- """
- ret = []
- for n in node.childNodes:
- if n.nodeType != n.ELEMENT_NODE:
- continue # ignore text, comment, etc. nodes
- if n.tagName == child_name:
- ret.append(n)
- return ret
-
-def get_child_node(node, child_name):
-    """
-    Given a node (<node-name> in the following example),
-    returns the single child node matching <child-name>
-    """
-    nodes = get_child_nodes(node, child_name)
-    assert len(nodes) == 1, '%d child nodes named %s' % (
-        len(nodes), child_name)
-    return nodes[0]
-
-def get_child_contents(node, child_name):
- """
- Given a node (<node-name> in the following example),
- extract some-text from '<node-name>
- <some-tag>some-text</some-tag>
- <other-tag>other-text</other-tag>
- <some-tag>some-other-text</some-tag>
- ...
- </node-name>'
- Returns ['some-text', 'some-other-text', ...]
- """
- nodes = get_child_nodes(node, child_name)
- ret = []
- for n in nodes:
- ret.append(get_text(n))
- return ret
-
-def get_child_dict(node):
- """
- Given a node (<node-name> in the following example),
- extract some-text from '<node-name>
- <some-tag>some-text</some-tag>
- <other-tag>other-text</other-tag>
- <some-tag>some-other-text</some-tag>
- ...
- </node-name>'
- Returns {'some-tag':['some-text', 'some-other-text', ...],
- 'other-tag':['some-other-text']}
- """
- dict = {}
- tags = [] # to preserve order of tags
- for n in node.childNodes:
- if n.nodeType != n.ELEMENT_NODE:
- continue # ignore text, comment, etc. nodes
- try: # another entry for an existing tag
- dict[n.tagName].append(get_text(n))
- except KeyError: # new tag
- dict[n.tagName] = [get_text(n)]
- tags.append(n.tagName)
- return (dict, tags)
-
-def delist_dict(dict):
- """
- Given a dict
- e.g. {'some-tag':['some-text', 'some-other-text', ...],
- 'other-tag':['some-other-text'], ...} ,
- replaces any values in an array of length 1 with the element,
- e.g. {'some-tag':['some-text', 'some-other-text', ...],
- 'other-tag':'some-other-text', ...} ,
- """
- for key,value in dict.items():
- if isinstance(value, list) and len(value) == 1:
- dict[key] = value[0]
- return dict
-
-## Get information about the Entrez databases themselves
-
-def _query_einfo(db=None):
- """
- Get information about the Entrez databases themselves.
- http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
-
- Either list all available databases with `db=None`, or specific
- information on a particular database (e.g. pubmed) with
- `db=pubmed`.
- """
- params = urlencode({
- 'db': db,
- 'tool': TOOL,
- 'email': EMAIL})
-
- LOG.info("getting einfo from '%s?%s'" % (EINFO_URL, params))
- f = urllib.urlopen("%s?%s" % (EINFO_URL, params))
- string = f.read()
- f.close()
- LOG.debug('got:\n%s' % string)
- return string
-
-def get_parsed_einfo(db=None, page=None, parsed=None):
- """
- Helper function for various einfo processing functions.
- Allow each processor to function
- independently (page=None, parsed=None),
- with a shared xml string (page=<xml-string>, parsed=None), or
- with a shared parsed xml structure (page=*, parsed=<parsed_xml>).
- Use clean_parsed_einfo() for cleanup
- """
- if page == None and parsed == None:
- LOG.info('downloading new einfo page')
- page = _query_einfo(db)
- if parsed == None:
- LOG.info('parsing new einfo page')
- parsed = dom.parseString(page)
- parsed_islocal = True
- else:
- LOG.info('using old einfo parsing')
- parsed_islocal = False
- return (parsed, parsed_islocal)
-
-def clean_parsed_einfo(parsed, parsed_islocal=True):
- """
- Helper function for various einfo processing functions.
- Clean up the parsed xml structure if the calling function created it.
- """
- if parsed_islocal == True:
- LOG.info('cleaning up einfo parsing')
- parsed.unlink() # clean up the DOM
-
-def database_list(page=None, parsed=None):
- parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed)
- databases = []
- for node in parsed.getElementsByTagName("DbName"):
- # Extract some-text from '<DbName>some-text</DbName>'
- # by default, xml.dom.minidom uses unicode,
- # so strings get printed: "u'string contents'"
- databases.append(get_text(node))
- clean_parsed_einfo(parsed,parsed_islocal)
- return databases
-
-def field_dict(db='pubmed', page=None, parsed=None):
- parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
- fields = []
- tags = set()
- field_info = {}
- fieldlists = parsed.getElementsByTagName("FieldList")
- assert len(fieldlists) == 1, '%s\n\n%d FieldLists!' % (
- parsed.toxml(), len(fieldlists))
- fieldlist = fieldlists[0]
- for node in fieldlist.childNodes:
- if node.nodeType != node.ELEMENT_NODE:
- continue # ignore text, comment, etc. nodes
- assert node.tagName == 'Field', (
- "Unrecognized tag '%s' in FieldList" % node.tagName)
- field,new_tags = get_child_dict(node)
- assert len(field['Name']) == 1, (
- 'Multiple field names %s' % str(field['Name']))
- field = delist_dict(field)
- fields.append(field['Name'])
- new_tags = tags.union(new_tags)
- if tags:
- assert new_tags == tags, "Inconsistent tags"
- tags = new_tags
- field_info[field['Name']] = field
- clean_parsed_einfo(parsed,parsed_islocal)
- return (fields, tags, field_info)
-
-def link_dict(db='pubmed', page=None, parsed=None):
- parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
- links = []
- tags = set()
-    link_info = {}
- linklists = parsed.getElementsByTagName("LinkList")
- assert len(linklists) == 1, (
- '%s\n\n%d LinkLists!' % (parsed.toxml(), len(linklists)))
- linklist = linklists[0]
- for node in linklist.childNodes:
- if node.nodeType != node.ELEMENT_NODE:
- continue # ignore text, comment, etc. nodes
- assert node.tagName == 'Link', (
- "Unrecognized tag '%s' in LinkList" % node.tagName)
- link,new_tags = get_child_dict(node)
- assert len(link['Name']) == 1, (
- 'Multiple link names %s' % str(link['Name']))
- link = delist_dict(link)
- links.append(link['Name'])
- new_tags = tags.union(new_tags)
- if tags:
- assert new_tags == tags, "Inconsistent tags"
- tags = new_tags
- link_info[link['Name']] = link
- clean_parsed_einfo(parsed,parsed_islocal)
- return (links, tags, link_info)
-
-def database_info(db='pubmed', page=None, parsed=None):
- "Convenience function to call both field_dict and link_dict"
- parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
- fields,field_tags,field_info = field_dict(db=db, parsed=parsed)
- links,link_tags,link_info = link_dict(db=db, parsed=parsed)
- clean_parsed_einfo(parsed,parsed_islocal)
- return (fields, field_tags, field_info, links, link_tags, link_info)
-
-def validate_field(field, fields):
- "Ensure that field is a valid field for the database db."
- try:
- fields.index(field.upper())
- except ValueError:
- raise Exception("Field '%s' invalid\nValid fields are\n %s"
- % (field, str(fields)))
-
-def strip_fields_from_term(term):
- "HACK: really stupid algorithm"
- fields = []
- infield = False
- for i in range(len(term)):
- if term[i] == '[' and infield == False:
- infield = True
- field_start = i+1
- elif term[i] == ']' and infield == True:
- infield = False
- fields.append(term[field_start:i])
- return fields
-
-def validate_search_term(term, fields):
- "Ensure that the fields in term are valid fields for the database db."
- for field in strip_fields_from_term(term):
- validate_field(field, fields)
-
-
-## Search an Entrez database
-
-def _query_esearch(term, db='pubmed', field=None,
- reldate=None, daterange=None, datetype=None,
- retmax=None, rettype=None, sort=None,
- validate=False, valid_fields=None, debug=False):
- """
- Search an Entrez database.
- http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
-
- Does not currently support the usehistory, WebEnv, query_key,
- retstart, or retmode parameters.
-
- Help with the arguments adapted from esearch_help.html:
-
- term: This command uses search terms or phrases with or without
- Boolean operators.
- You can search in several fields using the [term field] tag.
- You can search in a single field using the 'field' parameter below.
-    "You may also tag search terms using field=tag." (quoted from
-    esearch_help.html; I don't understand this line)
- For example: term=asthma[MESH]+OR+hay+fever[MESH]
- 'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
- ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )
-
- db: This command selects the database to be searched
- For example: db=pubmed
-
- field: Use this command to specify a specific search field.
- PubMed fields: affl, auth, ecno, jour, iss, mesh,...
- Retrieve with field_dict('pubmed')
- For example: field=auth
-
-    reldate: Limit items to a number of days immediately preceding today's date.
- For example: reldate=365
-
- daterange: Limit results bounded by two specific dates.
- For example: daterange=('2001', '2002/01/01')
- (implemented as mindate=2001&maxdate=2002/01/01)
-
- datetype: Limit dates to a specific date field based on database.
- For example: datetype=edat
-
- retmax: Limit the number of items retrieved
- For example: retmax=100
-
- rettype: Select the retrieval type
- PubMed values: count, uilist (default)
-
- sort: Sort the returned uilist
- PubMed values: author, last+author, journal, pub+date
-
- """
- if daterange != None:
- assert len(daterange) == 2, (
- "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
- % (daterange,))
-        assert reldate == None, "Specifying date with daterange AND reldate!"
- mindate = daterange[0]
- maxdate = daterange[1]
- else:
- mindate = None
- maxdate = None
- if validate:
- assert len(valid_fields) > 0, (
- 'Need a list of valid fields to validate')
- if field != None:
- validate_field(field)
- validate_search_term(term, valid_fields)
- params = urlencode({
- 'tool': TOOL,
- 'email': EMAIL,
- 'term': term,
- 'db': db,
- 'field': field,
- 'reldate': reldate,
- 'mindate': mindate,
- 'maxdate': maxdate,
- 'datetype': datetype,
- 'retmax': retmax,
- 'rettype': rettype,
- 'sort': sort})
- LOG.info("getting esearch from '%s?%s'" % (ESEARCH_URL, params))
- f = urllib.urlopen("%s?%s" % (ESEARCH_URL, params))
- string = f.read()
- f.close()
- LOG.debug('got:\n%s' % string)
- return string
-
-def parse_esearch(page):
- "Parse the xml returned by _query_esearch()"
- parsed = dom.parseString(page)
-
- pid_list = []
- for node in parsed.getElementsByTagName("Id"):
- pid_list.append(get_text(node))
-
- parsed.unlink()
-
- return pid_list
-
-
-## Fetch records by Primary ID from an Entrez database
-
-def _query_efetch(id, db='pubmed',
- retmax=None, retmode='xml', rettype='medline'):
- """
- Fetch records by primary ID from an Entrez database.
- http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
- http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
-
-
- Does not currently support the usehistory, WebEnv, query_key, or
- retstart parameters.
-
- Help with the arguments adapted from efetchlit_help.html:
-
- id: Primary UIs identifying the documents to fetch
- For example: 'id=11877539, 11822933,11871444'
-
- db: This command selects the database to be searched
- For example: db=pubmed
-
- retmax: Limit the number of items retrieved (default 20)
- For example: retmax=100
-
- retmode: Select the retrieval output format
- xml (not journals)
- html
- text
- asn.1 (not journals)
-
- rettype: Select the retrieval type
- uilist
- abstract (not omim)
- citation (not omim)
- medline (not omim)
- full (journals and omim)
-
- Not all retmodes are possible with all rettypes:
- PubMed Options:
- uilist abstract citation medline
- xml x x* x* x*
- text x x x x
- html x x x x
- asn.1 n/a x* x* x
- x = retrieval mode available
- * returned retrieval type is the complete record in the retrieval mode
- n/a - not available
- OMIM Options: (not case sensitive)
- uilist docsum synopsis variants detailed ExternalLink
- (MIM (Clinical (Allelic
- numbers) synopsis) Variants)
- xml x x* x* x* x* x*
- text x x x x x* x*
- html x x x x x* x*
- asn.1 x* x* x* x* x* x*
- x = retrieval mode available
- * returned retrieval type is the complete record in the retrieval mode
- n/a - not available
-
- """
- idstring = ""
- for d in id:
- idstring += "%s," % d
- idstring = idstring[:-1] # remove trailing comma
- params = urlencode({
- 'tool': TOOL,
- 'email': EMAIL,
- 'id': idstring,
- 'db': db,
- 'retmax': retmax,
- 'retmode': retmode,
- 'rettype': rettype})
-
- LOG.info("getting efetch from '%s?%s'" % (EFETCH_URL, params))
- f = urllib.urlopen("%s?%s" % (EFETCH_URL, params))
- string = f.read()
- f.close()
- LOG.debug('got:\n%s' % string)
- return string
-
-
-## Fetch links by Primary ID from an Entrez database
-
-def _query_elink(id, term=None, db='all', dbfrom='pubmed',
- cmd=None, linkname=None, holding=None,
- version=1,
- reldate=None, daterange=None, datetype=None,
- retmode='xml'):
- """
- Fetch links from a list of primary IDs in an Entrez database.
- http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
- http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html
-
- Does not currently support the WebEnv or query_key parameters.
-
- Help with the arguments adapted from efetchlit_help.html:
-
- id: Primary UIs identifying the documents to fetch
- For example: 'id=11877539, 11822933,11871444'
-
- term: This command uses search terms or phrases with or without
- Boolean operators to limit the returned matching links.
-
- db: This command selects the databases to be searched for link targets.
- For example: db=all
-
- dbfrom: This command selects the database containing the ids.
- For example: dbfrom=pubmed
-
-
- cmd: Link commands
- * prlinks - List the hyperlink to the primary LinkOut provider for
- multiple IDs and database. Each ID is processed separately.
- * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut
- provider for a single ID and database.
- Return the elink command, since fetching
- it breaks the relative links in the
- publisher's page.
- * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
- multiple IDs and database. Each ID is processed separately.
- * llinkslib - List LinkOut URLs and Attributes for multiple IDs and
- database. Each ID is processed separately.
-    * lcheck - Check for the existence (Y or N) of an external link for
- multiple IDs and database.
- * ncheck - Check for the existence of a neighbor link for each ID within
- a database, e.g., Related Articles in PubMed.
- * neighbor - Display neighbors within a database.
- * neighbor_history - Create history (WebEnv & query_key) for use in other
- EUtilities.
- * acheck - Lists Entrez databases links for multiple IDs from a single
- database.
-
- linkname: link to a specific neighbor subset
- For example: linkname=nucleotide_nucleotide_comp
-
- holding: List LinkOut URLs for the specified holding provider, (library).
- Used only in conjunction with cmd=llinks or cmd=llinkslib
- For example: cmd=llinkslib&holding=medlib
-
- version: Include a version number to refer to the latest DTD.
- For example: version=1
- retrieves the latest DTD (eLink_050511.dtd) that includes the additional
- elements, MenuTag, LinkInfo and IdLinkSet.
-
-    Date commands are only valid for dbfrom=pubmed & cmd=neighbor
-    reldate: Limit items to a number of days immediately preceding today's date.
- For example: reldate=365
-
- daterange: Limit results bounded by two specific dates.
- For example: daterange=('2001', '2002/01/01')
- (implemented as mindate=2001&maxdate=2002/01/01)
-
- datetype: Limit dates to a specific date field based on database.
- For example: datetype=edat
-
- retmode: Select the retrieval output format
- xml (default)
- ref (only used with cmd=prlinks for one ID)
-
- """
- idstring = ""
- for d in id:
- idstring += "%s," % d
- idstring = idstring[:-1] # remove trailing comma
-
- params = urlencode({
- 'tool': TOOL,
- 'email': EMAIL,
- 'id': idstring,
- 'term': term,
- 'db': db,
- 'dbfrom': dbfrom,
- 'cmd': cmd,
- 'linkname': linkname,
- 'holding': holding,
- 'version': version,
- 'reldate': reldate,
-        'mindate': mindate,
-        'maxdate': maxdate,
- 'datetype': datetype,
- 'retmode': retmode})
-
- LOG.info("getting elink from '%s?%s'" % (ELINK_URL, params))
- f = urllib.urlopen("%s?%s" % (ELINK_URL, params))
-
- if cmd == 'prlinks' and retmode == 'ref':
- # Just get the link, we don't need the provider's webpage HTML.
- url = f.geturl()
- f.close()
- return url
-
- string = f.read()
- f.close()
- LOG.debug('got:\n%s' % string)
- return string
-
-
-## Combining the searching and parsing (dropping some of the less used
-## features)
-
-def search_fetch_xml(term, db='pubmed', field=None,
- reldate=None, daterange=None, datetype=None,
- retmax=None, sort=None,
- validate=False, valid_fields=None,
- retmode='xml', rettype='medline'):
- if validate and valid_fields == None:
- valid_fields,field_tags,field_info = field_dict(db)
- search_page = _query_esearch(term, db, field,
- reldate, daterange, datetype,
- retmax, rettype='uilist', sort=sort,
- validate=validate, valid_fields=valid_fields)
- pid_list = parse_esearch(search_page)
- if not pid_list:
- return None
- fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype)
- return fetch_page
-
-def search_link(term, db='pubmed', field=None,
- reldate=None, daterange=None, datetype=None,
- retmax=None, sort=None,
- validate=False, valid_fields=None,
- link_term=None, fromdb=None,
- cmd=None, linkname=None, link_holding=None,
- version=1,
- link_reldate=None, link_daterange=None, link_datetype=None,
- link_retmode='xml'):
- if validate and valid_fields == None:
- valid_fields,field_tags,field_info = field_dict(db)
- search_page = _query_esearch(term, db, field,
- reldate, daterange, datetype,
- retmax, rettype='uilist', sort=sort,
- validate=validate, valid_fields=valid_fields)
- pid_list = parse_esearch(search_page)
- link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
- cmd=cmd, linkname=linkname, holding=link_holding,
- version=version,reldate=link_reldate,
- daterange=link_daterange, datetype=link_datetype,
- retmode=link_retmode)
- return link_page
## Use the external bibutils package to convert to BibTeX format
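# A sketch of how this pipeline is used (see medline_xml_to_bibtex
# below): Pipe chains the commands, feeds `stdin` to the first one, and
# exposes the final command's output as `.stdout`:
#
#   p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
#            stdin=medline_xml)
#   bibtex = p.stdout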
def __init__(self, cmds, stdin=None):
if isinstance(stdin, str):
stdin_str = stdin
- stdin = PIPE
+ stdin = _subprocess.PIPE
else:
stdin_str = None
kwargs = {}
if _POSIX:
kwargs['close_fds'] = True
- self._procs.append(Popen(
- cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs))
+ self._procs.append(_subprocess.Popen(
+ cmd, stdin=stdin, stdout=_subprocess.PIPE,
+ stderr=_subprocess.PIPE, **kwargs))
self.stdout,self.stderrs = self._communicate(input=stdin_str)
input_offset = 0
while read_set or write_set:
- LOG.debug('select on read %s, write %s' % (read_set,write_set))
+ LOG.debug('select on read %s, write %s' %(read_set, write_set))
try:
- rlist,wlist,xlist = select.select(read_set, write_set, [])
- except select.error, e:
+ rlist,wlist,xlist = _select.select(read_set, write_set, [])
+ except _select.error, e:
if e.args[0] == errno.EINTR:
- LOG.debug('EINTR')
+ LOG.debug('EINTR: %s' % e)
continue
raise
LOG.debug('selected read %s, write %s, exception %s'
# blocking. POSIX defines PIPE_BUF >= 512
LOG.debug('write to stdin for process 0')
chunk = input[input_offset:input_offset+512]
- bytes_written = os.write(
+ bytes_written = _os.write(
self._procs[0].stdin.fileno(), chunk)
input_offset += bytes_written
if input_offset >= len(input):
LOG.debug('stdin complete')
if self._procs[-1].stdout in rlist:
LOG.debug('read stdout for final process')
- data = os.read(self._procs[-1].stdout.fileno(), 1024)
+ data = _os.read(self._procs[-1].stdout.fileno(), 1024)
if data == '':
self._procs[-1].stdout.close()
read_set.remove(self._procs[-1].stdout)
for i,proc in enumerate(self._procs):
if proc.stderr in rlist:
LOG.debug('read stderr for process %i' % i)
- data = os.read(proc.stderr.fileno(), 1024)
+ data = _os.read(proc.stderr.fileno(), 1024)
if data == '':
proc.stderr.close()
read_set.remove(proc.stderr)
... '<?xml version="1.0"?>',
... '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, '
... '1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query'
- ,,, '/DTD/pubmed_110101.dtd">',
+ ... '/DTD/pubmed_110101.dtd">',
... '<PubmedArticleSet>',
... ' <PubmedArticle>',
... ' <MedlineCitation Owner="NLM" Status="MEDLINE">',
... ' </PubmedArticle>',
... '</PubmedArticleSet>',
... ])
- >>> print medline_xml_to_bibtex(xml)
+ >>> print medline_xml_to_bibtex(xml) # doctest: +REPORT_UDIFF
@Article{King2010,
author = "William T. King and Meihong Su and Guoliang Yang",
title = "Monte Carlo simulation of mechanical unfolding of
pages = "159--166",
ISSN = "1879-0003",
doi = "10.1016/j.ijbiomac.2009.12.001",
+ URL = "http://www.ncbi.nlm.nih.gov/pubmed/20004685",
}
<BLANKLINE>
"""
- LOG.info('convert medline XML to BibTeX\n%s' % fetch_page)
+ LOG.info('convert medline XML to BibTeX')
+ LOG.debug('convert from\n%s' % fetch_page)
p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
stdin=fetch_page)
LOG.debug('converted to\n%s' % p.stdout)
return p.stdout
-## Random
-
-def hints():
- "Print Entrez search hints and exit"
-
- print """
-free full text [sb]
-
-
-"""
-
-## Test with a mini-searching application
-
-if __name__ == "__main__":
+if __name__ == '__main__':
from optparse import OptionParser
usage_string = '\n'.join([
' (print medline xml matching search)',
'| %prog -l [options] SEARCH_TERM'
' (print links to entries matching search)',
- '| %prog -L [-d DATABASE] [-f FILE] (list databases)',
+ '| %prog -L [-f FILE] (list databases)',
'| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]'
' (list fields in a database, or details on a single field)',
'',
'See the docstrings in %prog or',
' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
'eutils_help.html',
+ ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
+ 'eutils_help.html#UserSystemRequirements',
+ ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+ 'einfo_help.html',
+ ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+        'esearch_help.html',
+ ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+ 'efetch_help.html',
+ ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
+ 'elink_help.html',
'for more details.'
])
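    # Example invocations (a sketch; '%prog' stands for this script and
    # the options are defined below):
    #   %prog -L                      # list the available databases
    #   %prog -X -d pubmed            # describe pubmed's search fields
    #   %prog 'monte carlo[title]'    # search pubmed, print BibTeX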
# the value of that option.
# "
- parser.add_option('-d', '--database', dest="database",
+ parser.add_option('-d', '--database', dest='database',
help="Search DATABASE (default '%default')",
- type='string', metavar="DATABASE", default='pubmed')
- parser.add_option('-f', '--file', dest="filename",
- help="write output to FILE (default stdout)",
- type='string', metavar="FILE")
- parser.add_option('-v', '--verbose', dest="verbose", action="store_true",
- help="Print lots of debugging information",
- default=False)
- parser.add_option('-H', '--hints', callback=hints,
- help="Print Entrez search hints and exit",
- action="callback")
-
+ type='string', metavar='DATABASE', default='pubmed')
+ parser.add_option('-f', '--file', dest='filename',
+ help='write output to FILE (default stdout)',
+ type='string', metavar='FILE')
+ parser.add_option('-v', '--verbose', dest='verbose', action='count',
+ help=('Print minimal debugging information. Use twice '
+ 'to get lots of debugging info.'),
+ default=0)
# mode control options
mode = 'search'
mode = 'explain'
parser.add_option('-L', '--list-mode', callback=set_mode,
- help="Run in list mode", action="callback")
+ help='Run in list mode', action='callback')
parser.add_option('-X', '--explain-mode', callback=set_mode,
- help="Run in explain mode", action="callback")
+ help='Run in explain mode', action='callback')
# search-fetch-xml-to-? options
output = 'bibtex'
def set_output(option, opt_str, value, parser):
global output
long_option = option.get_opt_string()
+ if long_option == '--output-xml':
+ output = 'medline'
+ if long_option == '--output-bibtex':
+ output = 'bibtex'
if long_option == '--output-link':
output = 'link'
- parser.add_option('-W', '--raw', dest="raw", action="store_true",
- help="Output raw Entrez xml", default=False)
- parser.add_option('-F', '--field', dest="field",
- help="Limit SEARCH_TERM to FIELD",
- type='string', metavar="FIELD")
- parser.add_option('-r', '--reldate', dest="reldate",
- help="Limit search to dates within DAYS of today",
- type='string', metavar="DAYS")
- parser.add_option('-R', '--daterange', dest="daterange",
- help=("Limit search to dates within DATERANGE "
- "(e.g. '2001/1/1,2002')"),
- type='string', metavar="DATERANGE")
- parser.add_option('-t', '--datetype', dest="datetype",
+ parser.add_option('-x', '--output-xml', callback=set_output,
+ help='Output search results as Medline XML',
+ action='callback')
+ parser.add_option('-b', '--output-bibtex', callback=set_output,
+ help='Output search results as BibTeX',
+ action='callback')
+ parser.add_option('-F', '--field', dest='field',
+ help='Limit SEARCH_TERM to FIELD',
+ type='string', metavar='FIELD')
+ parser.add_option('-r', '--reldate', dest='reldate',
+ help='Limit search to dates within DAYS of today',
+ type='string', metavar='DAYS')
+ parser.add_option('--mindate', dest='mindate',
+ help=('Limit search to date after MINDATE '
+ "(e.g. '2001/1/1' or '2002')"),
+ type='string', metavar='MINDATE')
+ parser.add_option('--maxdate', dest='maxdate',
+                      help=('Limit search to date before MAXDATE '
+ "(e.g. '2001/1/1' or '2002')"),
+ type='string', metavar='MAXDATE')
+ parser.add_option('-t', '--datetype', dest='datetype',
help=("Select field to apply date limits to "
"(e.g. 'edat' for Entrez date)"),
- type='string', metavar="DATETYPE")
- parser.add_option('-m', '--retmax', dest="retmax",
- help=('Return at max RETMAX items from a successful '
+ type='string', metavar='DATETYPE')
+ parser.add_option('-m', '--retmax', dest='retmax',
+ help=('Return at most RETMAX items from a successful '
'search (default %default)'),
- type='string', metavar="RETMAX", default=20)
- parser.add_option('-M', '--retmode', dest="retmode",
- help="Select fetch/link output format",
- type='string', metavar="RETMODE", default='xml')
- parser.add_option('-V', '--validate', dest="validate", action="store_true",
+ type='int', metavar='RETMAX', default=20)
+ parser.add_option('-s', '--retstart', dest='retstart',
+ help=('Index of first returned search item from a '
+ 'successful search (default %default)'),
+ type='int', metavar='RETSTART', default=0)
+ parser.add_option('-V', '--validate', dest='validate', action='store_true',
help=('Check that FIELD and field tags in SEARCH_TERM '
'are valid for DB'),
default=False)
# output link options
parser.add_option('-l', '--output-link', callback=set_output,
- help="Output a link (instead of xml citations)",
- action="callback")
- parser.add_option('-c', '--link-cmd', dest="link_cmd",
- help="Select link output",
- type='string', metavar="LINK_CMD")
- parser.add_option('-T', '--link-term', dest="link_term",
- help="Limit links to those matching LINK_TERM",
- type='string', metavar="LINK_TERM")
- parser.add_option('-D', '--from-database', dest="fromdb",
-                      help="Limit links to those from FROMDATABASE",
- type='string', metavar="FROMDATABASE")
- parser.add_option('-n', '--link-name', dest="linkname",
- help="Limit links to a specific neighbor",
- type='string', metavar="LINKNAME")
+ help='Output a link (instead of xml citations).',
+ action='callback')
+ parser.add_option('-c', '--link-cmd', dest='link_cmd',
+ help='Select link output',
+ type='string', metavar='LINK_CMD')
+ parser.add_option('-T', '--link-term', dest='link_term',
+ help='Limit links to those matching LINK_TERM',
+ type='string', metavar='LINK_TERM')
+ parser.add_option('-D', '--from-database', dest='dbfrom',
+                      help='Limit links to those from FROMDATABASE',
+ type='string', metavar='FROMDATABASE')
+ parser.add_option('-n', '--link-name', dest='linkname',
+ help='Limit links to a specific neighbor',
+ type='string', metavar='LINKNAME')
(options, args) = parser.parse_args()
parser.destroy()
# open the output file if specified
if options.filename == None:
- outfile = sys.stdout
+ outfile = _sys.stdout
else:
outfile = file(options.filename, 'w')
- if options.verbose:
- LOG.setLevel(logging.DEBUG)
+ if options.verbose == 1:
+ LOG.setLevel(_logging.INFO)
+ elif options.verbose > 1:
+ LOG.setLevel(_logging.DEBUG)
LOG.debug('operating in %s mode' % mode)
if mode == 'list':
- print >> outfile, "Available databases:"
- databases = database_list()
- for db in databases:
- print >> outfile, "\t%s" % db
+ outfile.write('# available databases:\n')
+ LOG.info('run eInfo to get list of databases')
+ q = EUTILS_CLIENT.service.run_eInfo(tool=TOOL, email=EMAIL)
+ if hasattr(q, 'ERROR'):
+ raise Exception(q.ERROR)
+
+ for db in q.DbList.DbName:
+ outfile.write('%s\n' % db)
elif mode == 'explain':
- fields,tags,field_info = field_dict(db=options.database)
- if options.field == None:
- print >> outfile, "Available fields in %s:" % options.database
- field_size = [0,0]
- for field in fields:
- if len(field) > field_size[0]:
- field_size[0] = len(field)
- if len(field_info[field]['FullName']) > field_size[1]:
- field_size[1] = len(field_info[field]['FullName'])
- for field in fields:
- print >> outfile, ('\t%*.*s\t%-*.*s'
- % (field_size[0], field_size[0], field,
- field_size[1], field_size[1],
- field_info[field]['FullName']))
- else:
- print >> outfile, (
- 'Field %s in %s:' % (options.field,options.database))
+ LOG.info('run eInfo on %s' % options.database)
+ q = EUTILS_CLIENT.service.run_eInfo(
+ db=options.database, tool=TOOL, email=EMAIL)
+ if hasattr(q, 'ERROR'):
+ raise Exception(q.ERROR)
+
+ if options.field: # print specific info about this field
+ outfile.write(
+ 'field %s in %s:\n' % (options.field, options.database))
+ fields = dict(
+ [(field.Name, field) for field in q.DbInfo.FieldList.Field])
+ field = fields[options.field]
+ attributes = sorted(
+ [(a, getattr(field, a)) for a in dir(field)
+ if not a.startswith('_')])
+ field_size = [0]
+ for attribute,value in attributes:
+ if len(attribute) > field_size[0]:
+ field_size[0] = len(attribute)
+ for attribute,value in attributes:
+ outfile.write(
+ '%*.*s\t%s\n'
+ % (field_size[0], field_size[0], attribute, value))
+ else: # print general info
+ outfile.write('database: %s\n' % q.DbInfo.DbName)
+ outfile.write('description: %s\n' % q.DbInfo.Description)
+ outfile.write('available fields:\n')
field_size = [0,0]
- for key in tags:
- if len(key) > field_size[0]:
- field_size[0] = len(key)
- if len(field_info[options.field][key]) > field_size[1]:
- field_size[1] = len(field_info[options.field][key])
- for key in tags:
- print >> outfile, ('\t%*.*s\t%-*.*s'
- % (field_size[0], field_size[0], key,
- field_size[1], field_size[1],
- field_info[options.field][key]))
+ for field in q.DbInfo.FieldList.Field:
+ if len(field.Name) > field_size[0]:
+ field_size[0] = len(field.Name)
+ if len(field.FullName) > field_size[1]:
+ field_size[1] = len(field.FullName)
+ for field in q.DbInfo.FieldList.Field:
+ outfile.write(
+ '%*.*s\t%-*.*s\t%s\n'
+ % (field_size[0], field_size[0], field.Name,
+ field_size[1], field_size[1], field.FullName,
+ field.Description))
elif mode == 'search':
search_term = args[0]
LOG.debug('output %s' % output)
- if output == 'bibtex':
- medline_xml = search_fetch_xml(term=search_term,
- db=options.database,
- field=options.field,
- reldate=options.reldate,
- daterange=options.daterange,
- datetype=options.datetype,
- retmax=options.retmax,
- validate=options.validate,
- retmode=options.retmode,
- rettype='medline')
- if medline_xml:
- if options.raw:
-                print >> outfile, medline_xml
- else:
- bibtex = medline_xml_to_bibtex(medline_xml)
- print >> outfile, bibtex
-
- elif output == 'link':
- # Assume that if you're looking for links
- # your search is already pretty refined,
- # so use the date options for link-limiting.
- link_xml = search_link(term=search_term,
- db=options.database,
- field=options.field,
- reldate=None,
- daterange=None,
- datetype=None,
- retmax=None,
- sort=None,
- validate=options.validate,
- valid_fields=None,
- link_term=options.link_term,
- fromdb=options.fromdb,
- cmd=options.link_cmd,
- linkname=options.linkname,
- link_holding=None,
- version=1,
- link_reldate=options.reldate,
- link_daterange=options.daterange,
- link_datetype=options.datetype,
- link_retmode=options.retmode,)
- print >> outfile, link_xml
+    LOG.info('maxdate: %r, mindate: %r' % (options.maxdate, options.mindate))
+ if options.mindate and not options.maxdate:
+        options.maxdate = _time.strftime('%Y/%m/%d')
+ LOG.info('fill in maximum date: %s' % options.maxdate)
+ elif options.maxdate and not options.mindate:
+ options.mindate = '0'
+ LOG.info('fill in minimum date: %s' % options.mindate)
+
+    LOG.info('run eSearch on %s' % options.database)
+ q = EUTILS_CLIENT.service.run_eSearch(
+ db=options.database, term=search_term, tool=TOOL, email=EMAIL,
+ field=options.field, reldate=options.reldate,
+ mindate=options.mindate, maxdate=options.maxdate,
+ datetype=options.datetype,
+ RetStart=options.retstart, RetMax=options.retmax,
+        # sort= is not yet supported
+ )
+ if hasattr(q, 'ERROR'):
+ raise Exception(q.ERROR)
+ if hasattr(q.IdList, 'Id'):
+        ret = len(q.IdList.Id)
+ else:
+ ret = 0
+ LOG.info('search returned %d of %d items' % (ret, int(q.Count)))
+
+ if ret > 0:
+ if output in ['medline', 'bibtex']:
+ LOG.info('run eFetch on %s' % options.database)
+ efetch_client = _Client(EFETCH_WSDL_URL % options.database)
+ f = efetch_client.service.run_eFetch(
+ id=','.join(q.IdList.Id), tool=TOOL, email=EMAIL)
+ if hasattr(f, 'ERROR'):
+ raise Exception(f.ERROR)
+
+ if output == 'medline':
+ outfile.write(str(efetch_client.last_received()).rstrip()+'\n')
+ elif output == 'bibtex':
+ outfile.write(
+ medline_xml_to_bibtex(str(efetch_client.last_received())))
+ elif output == 'link':
+ LOG.info('run eLink on %s' % options.database)
+ f = EUTILS_CLIENT.service.run_eLink(
+ db=options.database, id=','.join(q.IdList.Id),
+ #reldate=, mindate=, maxdate=, datetype=,
+ term=options.link_term, dbfrom=options.dbfrom,
+ linkname=options.linkname, cmd=options.link_cmd,
+ tool=TOOL, email=EMAIL)
+ outfile.write(str(EUTILS_CLIENT.last_received()).rstrip()+'\n')
+ else:
+ raise KeyError(output)
if options.filename != None:
outfile.close()