From 276ee35e1becf172183461c3230abcd520efdb3a Mon Sep 17 00:00:00 2001
From: "W. Trevor King" <wking@drexel.edu>
Date: Sat, 16 Apr 2011 10:36:02 -0400
Subject: [PATCH] Convert entrez.py to use SOAP interface (much simpler code).

---
 posts/entrez/entrez.py | 1051 ++++++++++------------------------
 1 file changed, 248 insertions(+), 803 deletions(-)

diff --git a/posts/entrez/entrez.py b/posts/entrez/entrez.py
index e026929..90dd914 100755
--- a/posts/entrez/entrez.py
+++ b/posts/entrez/entrez.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # Copyright (C) 1998-2004 Frederic Gobry
 # Copyright (C) 2008-2011 W. Trevor King <wking@drexel.edu>
@@ -15,677 +15,88 @@
 #
 # You should have received a copy of the GNU General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# Code following John Vu's medline query code pybliographer/Pyblio/Query.py,
-#
-# Python interface to the Entrez databases.
-# See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
-# Current as of August 1, 2007
-#
-# Rules:
-#  * Run retrieval scripts on weekends or between 9 pm and 5 am
-#    Eastern Time weekdays for any series of more than 100 requests.
-#  * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov,
-#    not the standard NCBI Web address.
-#  * Make no more than one request every 3 seconds.
-#  * Use the URL parameter email, and tool for distributed software,
-#    so that we can track your project and contact you if there is a
-#    problem.
-#  * NCBI's Disclaimer and Copyright notice must be evident to users
-#    of your service.
-#  * NLM does not claim the copyright on the abstracts in PubMed;
-#    however, journal publishers or authors may.
-#  * NLM provides no legal advice concerning distribution of
-#    copyrighted materials, consult your legal counsel.
-#
-# For a good Python-and-XML-DOM intro, see
-#   http://www.boddie.org.uk/python/XML_intro.html
-# for the official docs, see
-#   http://docs.python.org/lib/module-xml.dom.html
-"""Python bindings on Entrez database queries.
+"""Python interface to Entrez_ SOAP_ using the suds_ module.
+
+Before you use this program, read the rules_.
+
+.. _Entrez: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+.. _SOAP: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/DOC/esoap_help.html
+.. _suds: https://fedorahosted.org/suds/
+.. _rules: http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html#UserSystemRequirements
+
+To discover services using suds, try:
+
+>>> print EUTILS_CLIENT  # doctest: +ELLIPSIS, +REPORT_UDIFF
+
+Suds ( https://fedorahosted.org/suds/ )  version: ... build: ...
+
+Service ( eUtilsService ) tns="http://www.ncbi.nlm.nih.gov/soap/eutils/"
+   Prefixes (6)
+      ns0 = "http://www.ncbi.nlm.nih.gov/soap/eutils/egquery"
+      ns1 = "http://www.ncbi.nlm.nih.gov/soap/eutils/einfo"
+      ns2 = "http://www.ncbi.nlm.nih.gov/soap/eutils/elink"
+      ns3 = "http://www.ncbi.nlm.nih.gov/soap/eutils/epost"
+      ns4 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esearch"
+      ns5 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esummary"
+   Ports (1):
+      (eUtilsServiceSoap)
+         Methods (7):
+            run_eGquery(xs:string term, xs:string tool, xs:string email, )
+            run_eInfo(xs:string db, xs:string tool, xs:string email, )
+            run_eLink(xs:string db, xs:string[] id, xs:string reldate, ...)
+            run_ePost(xs:string db, xs:string id, xs:string WebEnv, ...)
+            run_eSearch(xs:string db, xs:string term, xs:string WebEnv, ...)
+            run_eSpell(xs:string db, xs:string term, xs:string tool, ...)
+            run_eSummary(xs:string db, xs:string id, xs:string WebEnv, ...)
+   Types (34):
+      ns1:DbInfoType
+      ns1:DbListType
+      ...
+      ns0:eGQueryResultType
+
+"""

-import logging
-import re
-import string
-import sys
-import time # for querying date ranges of publications
-import urllib
-
-# DOM module for parsing XML,
-# supports Document Object Model (DOM) Level 1 Specification
-# http://docs.python.org/lib/module-xml.dom.minidom.html
-import xml.dom.minidom as dom
+import logging as _logging
+import subprocess as _subprocess
+import sys as _sys
+import time as _time

-# For calling the bibutils conversion programs
-from subprocess import Popen, PIPE
+import suds as _suds
+from suds.client import Client as _Client

 # Platform constants
-_MSWINDOWS = sys.platform == 'win32'
+_MSWINDOWS = _sys.platform == 'win32'
 _POSIX = not _MSWINDOWS

 if _POSIX:
-    import os
-    import select
+    import os as _os
+    import select as _select


 __version__ = '0.2'

-# Entrez access points
-EINFO_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
-ESEARCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
-EFETCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
-ELINK_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
+
+EUTILS_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/eutils.wsdl'
+EFETCH_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/efetch_%s.wsdl'
+
+EUTILS_CLIENT = _Client(EUTILS_WSDL_URL)

 # Entrez-requested tracking information
 TOOL = 'entrezpy'
 EMAIL = 'wking@drexel.edu'

-# Logger
-
-LOG = logging.getLogger(TOOL)
-LOG.setLevel(logging.WARN)
-_handler = logging.StreamHandler()
-_formatter = logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
+# Logging
+LOG = _logging.getLogger(TOOL)
+LOG.setLevel(_logging.WARN)
+_handler = _logging.StreamHandler()
+_formatter = _logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
 _handler.setFormatter(_formatter)
 LOG.addHandler(_handler)
 del _handler, _formatter

-
-## XML and list utility functions
-
-def urlencode(param_dict):
-    return urllib.urlencode(
-        [(k,v) for k,v in param_dict.iteritems() if v is not None])
-
-def get_text(node):
-    """
-    Given a node (<node> in the following example),
-    extract some-text from '<node>some-text</node>'
-    returns u'some-text'.
-    However, if the xml is '<node/>' returns None
-    """
-    if len(node.childNodes) == 1:
-        data = node.childNodes[0].data
-    elif len(node.childNodes) == 0: # empty node
-        data = None
-    else:
-        raise Exception, "Node contains more than text"
-    return data
-
-def get_child_nodes(node, child_name):
-    """
-    Given a node (<node> in the following example),
-    returns an array of nodes matching <child_name>
-    """
-    ret = []
-    for n in node.childNodes:
-        if n.nodeType != n.ELEMENT_NODE:
-            continue # ignore text, comment, etc. nodes
-        if n.tagName == child_name:
-            ret.append(n)
-    return ret
-
-def get_child_node(node, child_name):
-    """
-    Given a node (<node> in the following example),
-    returns the node matching <child_name>
-    """
-    nodes = get_child_nodes(node, child_name)
-    assert len(nodes) == 1, '%d child nodes named %s' % (
-        len(nodes), child_name)
-    return nodes[0]
-
-def get_child_contents(node, child_name):
-    """
-    Given a node (<node> in the following example),
-    extract some-text from '<node>
-      <child_name>some-text</child_name>
-      <other-tag>other-text</other-tag>
-      <child_name>some-other-text</child_name>
-      ...
-      '
-    Returns ['some-text', 'some-other-text', ...]
-    """
-    nodes = get_child_nodes(node, child_name)
-    ret = []
-    for n in nodes:
-        ret.append(get_text(n))
-    return ret
-
-def get_child_dict(node):
-    """
-    Given a node (<node> in the following example),
-    extract some-text from '<node>
-      <some-tag>some-text</some-tag>
-      <other-tag>other-text</other-tag>
-      <some-tag>some-other-text</some-tag>
-      ...
- ' - Returns {'some-tag':['some-text', 'some-other-text', ...], - 'other-tag':['some-other-text']} - """ - dict = {} - tags = [] # to preserve order of tags - for n in node.childNodes: - if n.nodeType != n.ELEMENT_NODE: - continue # ignore text, comment, etc. nodes - try: # another entry for an existing tag - dict[n.tagName].append(get_text(n)) - except KeyError: # new tag - dict[n.tagName] = [get_text(n)] - tags.append(n.tagName) - return (dict, tags) - -def delist_dict(dict): - """ - Given a dict - e.g. {'some-tag':['some-text', 'some-other-text', ...], - 'other-tag':['some-other-text'], ...} , - replaces any values in an array of length 1 with the element, - e.g. {'some-tag':['some-text', 'some-other-text', ...], - 'other-tag':'some-other-text', ...} , - """ - for key,value in dict.items(): - if isinstance(value, list) and len(value) == 1: - dict[key] = value[0] - return dict - -## Get information about the Entrez databases themselves - -def _query_einfo(db=None): - """ - Get information about the Entrez databases themselves. - http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html - - Either list all available databases with `db=None`, or specific - information on a particular database (e.g. pubmed) with - `db=pubmed`. - """ - params = urlencode({ - 'db': db, - 'tool': TOOL, - 'email': EMAIL}) - - LOG.info("getting einfo from '%s?%s'" % (EINFO_URL, params)) - f = urllib.urlopen("%s?%s" % (EINFO_URL, params)) - string = f.read() - f.close() - LOG.debug('got:\n%s' % string) - return string - -def get_parsed_einfo(db=None, page=None, parsed=None): - """ - Helper function for various einfo processing functions. - Allow each processor to function - independently (page=None, parsed=None), - with a shared xml string (page=, parsed=None), or - with a shared parsed xml structure (page=*, parsed=). - Use clean_parsed_einfo() for cleanup - """ - if page == None and parsed == None: - LOG.info('downloading new einfo page') - page = _query_einfo(db) - if parsed == None: - LOG.info('parsing new einfo page') - parsed = dom.parseString(page) - parsed_islocal = True - else: - LOG.info('using old einfo parsing') - parsed_islocal = False - return (parsed, parsed_islocal) - -def clean_parsed_einfo(parsed, parsed_islocal=True): - """ - Helper function for various einfo processing functions. - Clean up the parsed xml structure if the calling function created it. - """ - if parsed_islocal == True: - LOG.info('cleaning up einfo parsing') - parsed.unlink() # clean up the DOM - -def database_list(page=None, parsed=None): - parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed) - databases = [] - for node in parsed.getElementsByTagName("DbName"): - # Extract some-text from 'some-text' - # by default, xml.dom.minidom uses unicode, - # so strings get printed: "u'string contents'" - databases.append(get_text(node)) - clean_parsed_einfo(parsed,parsed_islocal) - return databases - -def field_dict(db='pubmed', page=None, parsed=None): - parsed,parsed_islocal = get_parsed_einfo(db, page, parsed) - fields = [] - tags = set() - field_info = {} - fieldlists = parsed.getElementsByTagName("FieldList") - assert len(fieldlists) == 1, '%s\n\n%d FieldLists!' % ( - parsed.toxml(), len(fieldlists)) - fieldlist = fieldlists[0] - for node in fieldlist.childNodes: - if node.nodeType != node.ELEMENT_NODE: - continue # ignore text, comment, etc. 
nodes - assert node.tagName == 'Field', ( - "Unrecognized tag '%s' in FieldList" % node.tagName) - field,new_tags = get_child_dict(node) - assert len(field['Name']) == 1, ( - 'Multiple field names %s' % str(field['Name'])) - field = delist_dict(field) - fields.append(field['Name']) - new_tags = tags.union(new_tags) - if tags: - assert new_tags == tags, "Inconsistent tags" - tags = new_tags - field_info[field['Name']] = field - clean_parsed_einfo(parsed,parsed_islocal) - return (fields, tags, field_info) - -def link_dict(db='pubmed', page=None, parsed=None): - parsed,parsed_islocal = get_parsed_einfo(db, page, parsed) - links = [] - tags = set() - link_info = [] - linklists = parsed.getElementsByTagName("LinkList") - assert len(linklists) == 1, ( - '%s\n\n%d LinkLists!' % (parsed.toxml(), len(linklists))) - linklist = linklists[0] - for node in linklist.childNodes: - if node.nodeType != node.ELEMENT_NODE: - continue # ignore text, comment, etc. nodes - assert node.tagName == 'Link', ( - "Unrecognized tag '%s' in LinkList" % node.tagName) - link,new_tags = get_child_dict(node) - assert len(link['Name']) == 1, ( - 'Multiple link names %s' % str(link['Name'])) - link = delist_dict(link) - links.append(link['Name']) - new_tags = tags.union(new_tags) - if tags: - assert new_tags == tags, "Inconsistent tags" - tags = new_tags - link_info[link['Name']] = link - clean_parsed_einfo(parsed,parsed_islocal) - return (links, tags, link_info) - -def database_info(db='pubmed', page=None, parsed=None): - "Convenience function to call both field_dict and link_dict" - parsed,parsed_islocal = get_parsed_einfo(db, page, parsed) - fields,field_tags,field_info = field_dict(db=db, parsed=parsed) - links,link_tags,link_info = link_dict(db=db, parsed=parsed) - clean_parsed_einfo(parsed,parsed_islocal) - return (fields, field_tags, field_info, links, link_tags, link_info) - -def validate_field(field, fields): - "Ensure that field is a valid field for the database db." - try: - fields.index(field.upper()) - except ValueError: - raise Exception("Field '%s' invalid\nValid fields are\n %s" - % (field, str(fields))) - -def strip_fields_from_term(term): - "HACK: really stupid algorithm" - fields = [] - infield = False - for i in range(len(term)): - if term[i] == '[' and infield == False: - infield = True - field_start = i+1 - elif term[i] == ']' and infield == True: - infield = False - fields.append(term[field_start:i]) - return fields - -def validate_search_term(term, fields): - "Ensure that the fields in term are valid fields for the database db." - for field in strip_fields_from_term(term): - validate_field(field, fields) - - -## Search an Entrez database - -def _query_esearch(term, db='pubmed', field=None, - reldate=None, daterange=None, datetype=None, - retmax=None, rettype=None, sort=None, - validate=False, valid_fields=None, debug=False): - """ - Search an Entrez database. - http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html - - Does not currently support the usehistory, WebEnv, query_key, - retstart, or retmode parameters. - - Help with the arguments adapted from esearch_help.html: - - term: This command uses search terms or phrases with or without - Boolean operators. - You can search in several fields using the [term field] tag. - You can search in a single field using the 'field' parameter below. - ?You may also tag search terms using field=tag.? 
I don't - understand this line - For example: term=asthma[MESH]+OR+hay+fever[MESH] - 'term=asthma[MESH]' is the same as 'term=asthma&field=MESH' - ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea ) - - db: This command selects the database to be searched - For example: db=pubmed - - field: Use this command to specify a specific search field. - PubMed fields: affl, auth, ecno, jour, iss, mesh,... - Retrieve with field_dict('pubmed') - For example: field=auth - - reldate: Limit items a number of days immediately preceding today's date. - For example: reldate=365 - - daterange: Limit results bounded by two specific dates. - For example: daterange=('2001', '2002/01/01') - (implemented as mindate=2001&maxdate=2002/01/01) - - datetype: Limit dates to a specific date field based on database. - For example: datetype=edat - - retmax: Limit the number of items retrieved - For example: retmax=100 - - rettype: Select the retrieval type - PubMed values: count, uilist (default) - - sort: Sort the returned uilist - PubMed values: author, last+author, journal, pub+date - - """ - if daterange != None: - assert len(daterange) == 2, ( - "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')" - % (daterange,)) - reldate == None, "Specifying date with daterange AND reldate!" - mindate = daterange[0] - maxdate = daterange[1] - else: - mindate = None - maxdate = None - if validate: - assert len(valid_fields) > 0, ( - 'Need a list of valid fields to validate') - if field != None: - validate_field(field) - validate_search_term(term, valid_fields) - params = urlencode({ - 'tool': TOOL, - 'email': EMAIL, - 'term': term, - 'db': db, - 'field': field, - 'reldate': reldate, - 'mindate': mindate, - 'maxdate': maxdate, - 'datetype': datetype, - 'maxdate': maxdate, - 'retmax': retmax, - 'rettype': rettype, - 'sort': sort}) - LOG.info("getting esearch from '%s?%s'" % (ESEARCH_URL, params)) - f = urllib.urlopen("%s?%s" % (ESEARCH_URL, params)) - string = f.read() - f.close() - LOG.debug('got:\n%s' % string) - return string - -def parse_esearch(page): - "Parse the xml returned by _query_esearch()" - parsed = dom.parseString(page) - - pid_list = [] - for node in parsed.getElementsByTagName("Id"): - pid_list.append(get_text(node)) - - parsed.unlink() - - return pid_list - - -## Fetch records by Primary ID from an Entrez database - -def _query_efetch(id, db='pubmed', - retmax=None, retmode='xml', rettype='medline'): - """ - Fetch records by primary ID from an Entrez database. - http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html - http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html - - - Does not currently support the usehistory, WebEnv, query_key, or - retstart parameters. 
- - Help with the arguments adapted from efetchlit_help.html: - - id: Primary UIs identifying the documents to fetch - For example: 'id=11877539, 11822933,11871444' - - db: This command selects the database to be searched - For example: db=pubmed - - retmax: Limit the number of items retrieved (default 20) - For example: retmax=100 - - retmode: Select the retrieval output format - xml (not journals) - html - text - asn.1 (not journals) - - rettype: Select the retrieval type - uilist - abstract (not omim) - citation (not omim) - medline (not omim) - full (journals and omim) - - Not all retmodes are possible with all rettypes: - PubMed Options: - uilist abstract citation medline - xml x x* x* x* - text x x x x - html x x x x - asn.1 n/a x* x* x - x = retrieval mode available - * returned retrieval type is the complete record in the retrieval mode - n/a - not available - OMIM Options: (not case sensitive) - uilist docsum synopsis variants detailed ExternalLink - (MIM (Clinical (Allelic - numbers) synopsis) Variants) - xml x x* x* x* x* x* - text x x x x x* x* - html x x x x x* x* - asn.1 x* x* x* x* x* x* - x = retrieval mode available - * returned retrieval type is the complete record in the retrieval mode - n/a - not available - - """ - idstring = "" - for d in id: - idstring += "%s," % d - idstring = idstring[:-1] # remove trailing comma - params = urlencode({ - 'tool': TOOL, - 'email': EMAIL, - 'id': idstring, - 'db': db, - 'retmax': retmax, - 'retmode': retmode, - 'rettype': rettype}) - - LOG.info("getting efetch from '%s?%s'" % (EFETCH_URL, params)) - f = urllib.urlopen("%s?%s" % (EFETCH_URL, params)) - string = f.read() - f.close() - LOG.debug('got:\n%s' % string) - return string - - -## Fetch links by Primary ID from an Entrez database - -def _query_elink(id, term=None, db='all', dbfrom='pubmed', - cmd=None, linkname=None, holding=None, - version=1, - reldate=None, daterange=None, datetype=None, - retmode='xml'): - """ - Fetch links from a list of primary IDs in an Entrez database. - http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html - http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html - - Does not currently support the WebEnv or query_key parameters. - - Help with the arguments adapted from efetchlit_help.html: - - id: Primary UIs identifying the documents to fetch - For example: 'id=11877539, 11822933,11871444' - - term: This command uses search terms or phrases with or without - Boolean operators to limit the returned matching links. - - db: This command selects the databases to be searched for link targets. - For example: db=all - - dbfrom: This command selects the database containing the ids. - For example: dbfrom=pubmed - - - cmd: Link commands - * prlinks - List the hyperlink to the primary LinkOut provider for - multiple IDs and database. Each ID is processed separately. - * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut - provider for a single ID and database. - Return the elink command, since fetching - it breaks the relative links in the - publisher's page. - * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for - multiple IDs and database. Each ID is processed separately. - * llinkslib - List LinkOut URLs and Attributes for multiple IDs and - database. Each ID is processed separately. - * lcheck - Check for the existence (Y or N) of an external link in for - multiple IDs and database. - * ncheck - Check for the existence of a neighbor link for each ID within - a database, e.g., Related Articles in PubMed. 
- * neighbor - Display neighbors within a database. - * neighbor_history - Create history (WebEnv & query_key) for use in other - EUtilities. - * acheck - Lists Entrez databases links for multiple IDs from a single - database. - - linkname: link to a specific neighbor subset - For example: linkname=nucleotide_nucleotide_comp - - holding: List LinkOut URLs for the specified holding provider, (library). - Used only in conjunction with cmd=llinks or cmd=llinkslib - For example: cmd=llinkslib&holding=medlib - - version: Include a version number to refer to the latest DTD. - For example: version=1 - retrieves the latest DTD (eLink_050511.dtd) that includes the additional - elements, MenuTag, LinkInfo and IdLinkSet. - - Date command are only valid for dbfrom=pubmed & cmd=neighbor - reldate: Limit items a number of days immediately preceding today's date. - For example: reldate=365 - - daterange: Limit results bounded by two specific dates. - For example: daterange=('2001', '2002/01/01') - (implemented as mindate=2001&maxdate=2002/01/01) - - datetype: Limit dates to a specific date field based on database. - For example: datetype=edat - - retmode: Select the retrieval output format - xml (default) - ref (only used with cmd=prlinks for one ID) - - """ - idstring = "" - for d in id: - idstring += "%s," % d - idstring = idstring[:-1] # remove trailing comma - - params = urlencode({ - 'tool': TOOL, - 'email': EMAIL, - 'id': idstring, - 'term': term, - 'db': db, - 'dbfrom': dbfrom, - 'cmd': cmd, - 'linkname': linkname, - 'holding': holding, - 'version': version, - 'reldate': reldate, - 'daterange': daterange, - 'datetype': datetype, - 'retmode': retmode}) - - LOG.info("getting elink from '%s?%s'" % (ELINK_URL, params)) - f = urllib.urlopen("%s?%s" % (ELINK_URL, params)) - - if cmd == 'prlinks' and retmode == 'ref': - # Just get the link, we don't need the provider's webpage HTML. 
- url = f.geturl() - f.close() - return url - - string = f.read() - f.close() - LOG.debug('got:\n%s' % string) - return string - - -## Combining the searching and parsing (dropping some of the less used -## features) - -def search_fetch_xml(term, db='pubmed', field=None, - reldate=None, daterange=None, datetype=None, - retmax=None, sort=None, - validate=False, valid_fields=None, - retmode='xml', rettype='medline'): - if validate and valid_fields == None: - valid_fields,field_tags,field_info = field_dict(db) - search_page = _query_esearch(term, db, field, - reldate, daterange, datetype, - retmax, rettype='uilist', sort=sort, - validate=validate, valid_fields=valid_fields) - pid_list = parse_esearch(search_page) - if not pid_list: - return None - fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype) - return fetch_page - -def search_link(term, db='pubmed', field=None, - reldate=None, daterange=None, datetype=None, - retmax=None, sort=None, - validate=False, valid_fields=None, - link_term=None, fromdb=None, - cmd=None, linkname=None, link_holding=None, - version=1, - link_reldate=None, link_daterange=None, link_datetype=None, - link_retmode='xml'): - if validate and valid_fields == None: - valid_fields,field_tags,field_info = field_dict(db) - search_page = _query_esearch(term, db, field, - reldate, daterange, datetype, - retmax, rettype='uilist', sort=sort, - validate=validate, valid_fields=valid_fields) - pid_list = parse_esearch(search_page) - link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb, - cmd=cmd, linkname=linkname, holding=link_holding, - version=version,reldate=link_reldate, - daterange=link_daterange, datetype=link_datetype, - retmode=link_retmode) - return link_page ## Use the external bibutils package to convert to BibTeX format @@ -721,7 +132,7 @@ class Pipe (object): def __init__(self, cmds, stdin=None): if isinstance(stdin, str): stdin_str = stdin - stdin = PIPE + stdin = _subprocess.PIPE else: stdin_str = None @@ -734,8 +145,9 @@ class Pipe (object): kwargs = {} if _POSIX: kwargs['close_fds'] = True - self._procs.append(Popen( - cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs)) + self._procs.append(_subprocess.Popen( + cmd, stdin=stdin, stdout=_subprocess.PIPE, + stderr=_subprocess.PIPE, **kwargs)) self.stdout,self.stderrs = self._communicate(input=stdin_str) @@ -813,12 +225,12 @@ class Pipe (object): input_offset = 0 while read_set or write_set: - LOG.debug('select on read %s, write %s' % (read_set,write_set)) + LOG.debug('select on read %s, write %s' %(read_set, write_set)) try: - rlist,wlist,xlist = select.select(read_set, write_set, []) - except select.error, e: + rlist,wlist,xlist = _select.select(read_set, write_set, []) + except _select.error, e: if e.args[0] == errno.EINTR: - LOG.debug('EINTR') + LOG.debug('EINTR: %s' % e) continue raise LOG.debug('selected read %s, write %s, exception %s' @@ -829,7 +241,7 @@ class Pipe (object): # blocking. 
POSIX defines PIPE_BUF >= 512 LOG.debug('write to stdin for process 0') chunk = input[input_offset:input_offset+512] - bytes_written = os.write( + bytes_written = _os.write( self._procs[0].stdin.fileno(), chunk) input_offset += bytes_written if input_offset >= len(input): @@ -839,7 +251,7 @@ class Pipe (object): LOG.debug('stdin complete') if self._procs[-1].stdout in rlist: LOG.debug('read stdout for final process') - data = os.read(self._procs[-1].stdout.fileno(), 1024) + data = _os.read(self._procs[-1].stdout.fileno(), 1024) if data == '': self._procs[-1].stdout.close() read_set.remove(self._procs[-1].stdout) @@ -848,7 +260,7 @@ class Pipe (object): for i,proc in enumerate(self._procs): if proc.stderr in rlist: LOG.debug('read stderr for process %i' % i) - data = os.read(proc.stderr.fileno(), 1024) + data = _os.read(proc.stderr.fileno(), 1024) if data == '': proc.stderr.close() read_set.remove(proc.stderr) @@ -873,7 +285,7 @@ def medline_xml_to_bibtex(fetch_page): ... '', ... '', + ... '/DTD/pubmed_110101.dtd">', ... '', ... ' ', ... ' ', @@ -926,7 +338,7 @@ def medline_xml_to_bibtex(fetch_page): ... ' ', ... '', ... ]) - >>> print medline_xml_to_bibtex(xml) + >>> print medline_xml_to_bibtex(xml) # doctest: +REPORT_UDIFF @Article{King2010, author = "William T. King and Meihong Su and Guoliang Yang", title = "Monte Carlo simulation of mechanical unfolding of @@ -940,30 +352,19 @@ def medline_xml_to_bibtex(fetch_page): pages = "159--166", ISSN = "1879-0003", doi = "10.1016/j.ijbiomac.2009.12.001", + URL = "http://www.ncbi.nlm.nih.gov/pubmed/20004685", } """ - LOG.info('convert medline XML to BibTeX\n%s' % fetch_page) + LOG.info('convert medline XML to BibTeX') + LOG.debug('convert from\n%s' % fetch_page) p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']], stdin=fetch_page) LOG.debug('converted to\n%s' % p.stdout) return p.stdout -## Random - -def hints(): - "Print Entrez search hints and exit" - - print """ -free full text [sb] - - -""" - -## Test with a mini-searching application - -if __name__ == "__main__": +if __name__ == '__main__': from optparse import OptionParser usage_string = '\n'.join([ @@ -972,7 +373,7 @@ if __name__ == "__main__": ' (print medline xml matching search)', '| %prog -l [options] SEARCH_TERM' ' (print links to entries matching search)', - '| %prog -L [-d DATABASE] [-f FILE] (list databases)', + '| %prog -L [-f FILE] (list databases)', '| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]' ' (list fields in a database, or details on a single field)', '', @@ -981,6 +382,16 @@ if __name__ == "__main__": 'See the docstrings in %prog or', ' http://www.ncbi.nlm.nih.gov/entrez/query/static/' 'eutils_help.html', + ' http://www.ncbi.nlm.nih.gov/entrez/query/static/' + 'eutils_help.html#UserSystemRequirements', + ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/' + 'einfo_help.html', + ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/' + ' esearch_help.html', + ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/' + 'efetch_help.html', + ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/' + 'elink_help.html', 'for more details.' ]) @@ -994,19 +405,16 @@ if __name__ == "__main__": # the value of that option. 
# " - parser.add_option('-d', '--database', dest="database", + parser.add_option('-d', '--database', dest='database', help="Search DATABASE (default '%default')", - type='string', metavar="DATABASE", default='pubmed') - parser.add_option('-f', '--file', dest="filename", - help="write output to FILE (default stdout)", - type='string', metavar="FILE") - parser.add_option('-v', '--verbose', dest="verbose", action="store_true", - help="Print lots of debugging information", - default=False) - parser.add_option('-H', '--hints', callback=hints, - help="Print Entrez search hints and exit", - action="callback") - + type='string', metavar='DATABASE', default='pubmed') + parser.add_option('-f', '--file', dest='filename', + help='write output to FILE (default stdout)', + type='string', metavar='FILE') + parser.add_option('-v', '--verbose', dest='verbose', action='count', + help=('Print minimal debugging information. Use twice ' + 'to get lots of debugging info.'), + default=0) # mode control options mode = 'search' @@ -1019,159 +427,196 @@ if __name__ == "__main__": mode = 'explain' parser.add_option('-L', '--list-mode', callback=set_mode, - help="Run in list mode", action="callback") + help='Run in list mode', action='callback') parser.add_option('-X', '--explain-mode', callback=set_mode, - help="Run in explain mode", action="callback") + help='Run in explain mode', action='callback') # search-fetch-xml-to-? options output = 'bibtex' def set_output(option, opt_str, value, parser): global output long_option = option.get_opt_string() + if long_option == '--output-xml': + output = 'medline' + if long_option == '--output-bibtex': + output = 'bibtex' if long_option == '--output-link': output = 'link' - parser.add_option('-W', '--raw', dest="raw", action="store_true", - help="Output raw Entrez xml", default=False) - parser.add_option('-F', '--field', dest="field", - help="Limit SEARCH_TERM to FIELD", - type='string', metavar="FIELD") - parser.add_option('-r', '--reldate', dest="reldate", - help="Limit search to dates within DAYS of today", - type='string', metavar="DAYS") - parser.add_option('-R', '--daterange', dest="daterange", - help=("Limit search to dates within DATERANGE " - "(e.g. '2001/1/1,2002')"), - type='string', metavar="DATERANGE") - parser.add_option('-t', '--datetype', dest="datetype", + parser.add_option('-x', '--output-xml', callback=set_output, + help='Output search results as Medline XML', + action='callback') + parser.add_option('-b', '--output-bibtex', callback=set_output, + help='Output search results as BibTeX', + action='callback') + parser.add_option('-F', '--field', dest='field', + help='Limit SEARCH_TERM to FIELD', + type='string', metavar='FIELD') + parser.add_option('-r', '--reldate', dest='reldate', + help='Limit search to dates within DAYS of today', + type='string', metavar='DAYS') + parser.add_option('--mindate', dest='mindate', + help=('Limit search to date after MINDATE ' + "(e.g. '2001/1/1' or '2002')"), + type='string', metavar='MINDATE') + parser.add_option('--maxdate', dest='maxdate', + help=('Limit search to date after MAXDATE ' + "(e.g. '2001/1/1' or '2002')"), + type='string', metavar='MAXDATE') + parser.add_option('-t', '--datetype', dest='datetype', help=("Select field to apply date limits to " "(e.g. 
'edat' for Entrez date)"), - type='string', metavar="DATETYPE") - parser.add_option('-m', '--retmax', dest="retmax", - help=('Return at max RETMAX items from a successful ' + type='string', metavar='DATETYPE') + parser.add_option('-m', '--retmax', dest='retmax', + help=('Return at most RETMAX items from a successful ' 'search (default %default)'), - type='string', metavar="RETMAX", default=20) - parser.add_option('-M', '--retmode', dest="retmode", - help="Select fetch/link output format", - type='string', metavar="RETMODE", default='xml') - parser.add_option('-V', '--validate', dest="validate", action="store_true", + type='int', metavar='RETMAX', default=20) + parser.add_option('-s', '--retstart', dest='retstart', + help=('Index of first returned search item from a ' + 'successful search (default %default)'), + type='int', metavar='RETSTART', default=0) + parser.add_option('-V', '--validate', dest='validate', action='store_true', help=('Check that FIELD and field tags in SEARCH_TERM ' 'are valid for DB'), default=False) # output link options parser.add_option('-l', '--output-link', callback=set_output, - help="Output a link (instead of xml citations)", - action="callback") - parser.add_option('-c', '--link-cmd', dest="link_cmd", - help="Select link output", - type='string', metavar="LINK_CMD") - parser.add_option('-T', '--link-term', dest="link_term", - help="Limit links to those matching LINK_TERM", - type='string', metavar="LINK_TERM") - parser.add_option('-D', '--from-database', dest="fromdb", - help="Limit links to those from FROMDATABASE)", - type='string', metavar="FROMDATABASE") - parser.add_option('-n', '--link-name', dest="linkname", - help="Limit links to a specific neighbor", - type='string', metavar="LINKNAME") + help='Output a link (instead of xml citations).', + action='callback') + parser.add_option('-c', '--link-cmd', dest='link_cmd', + help='Select link output', + type='string', metavar='LINK_CMD') + parser.add_option('-T', '--link-term', dest='link_term', + help='Limit links to those matching LINK_TERM', + type='string', metavar='LINK_TERM') + parser.add_option('-D', '--from-database', dest='dbfrom', + help='Limit links to those from FROMDATABASE)', + type='string', metavar='FROMDATABASE') + parser.add_option('-n', '--link-name', dest='linkname', + help='Limit links to a specific neighbor', + type='string', metavar='LINKNAME') (options, args) = parser.parse_args() parser.destroy() # open the output file if specified if options.filename == None: - outfile = sys.stdout + outfile = _sys.stdout else: outfile = file(options.filename, 'w') - if options.verbose: - LOG.setLevel(logging.DEBUG) + if options.verbose == 1: + LOG.setLevel(_logging.INFO) + elif options.verbose > 1: + LOG.setLevel(_logging.DEBUG) LOG.debug('operating in %s mode' % mode) if mode == 'list': - print >> outfile, "Available databases:" - databases = database_list() - for db in databases: - print >> outfile, "\t%s" % db + outfile.write('# available databases:\n') + LOG.info('run eInfo to get list of databases') + q = EUTILS_CLIENT.service.run_eInfo(tool=TOOL, email=EMAIL) + if hasattr(q, 'ERROR'): + raise Exception(q.ERROR) + + for db in q.DbList.DbName: + outfile.write('%s\n' % db) elif mode == 'explain': - fields,tags,field_info = field_dict(db=options.database) - if options.field == None: - print >> outfile, "Available fields in %s:" % options.database - field_size = [0,0] - for field in fields: - if len(field) > field_size[0]: - field_size[0] = len(field) - if len(field_info[field]['FullName']) > 
field_size[1]: - field_size[1] = len(field_info[field]['FullName']) - for field in fields: - print >> outfile, ('\t%*.*s\t%-*.*s' - % (field_size[0], field_size[0], field, - field_size[1], field_size[1], - field_info[field]['FullName'])) - else: - print >> outfile, ( - 'Field %s in %s:' % (options.field,options.database)) + LOG.info('run eInfo on %s' % options.database) + q = EUTILS_CLIENT.service.run_eInfo( + db=options.database, tool=TOOL, email=EMAIL) + if hasattr(q, 'ERROR'): + raise Exception(q.ERROR) + + if options.field: # print specific info about this field + outfile.write( + 'field %s in %s:\n' % (options.field, options.database)) + fields = dict( + [(field.Name, field) for field in q.DbInfo.FieldList.Field]) + field = fields[options.field] + attributes = sorted( + [(a, getattr(field, a)) for a in dir(field) + if not a.startswith('_')]) + field_size = [0] + for attribute,value in attributes: + if len(attribute) > field_size[0]: + field_size[0] = len(attribute) + for attribute,value in attributes: + outfile.write( + '%*.*s\t%s\n' + % (field_size[0], field_size[0], attribute, value)) + else: # print general info + outfile.write('database: %s\n' % q.DbInfo.DbName) + outfile.write('description: %s\n' % q.DbInfo.Description) + outfile.write('available fields:\n') field_size = [0,0] - for key in tags: - if len(key) > field_size[0]: - field_size[0] = len(key) - if len(field_info[options.field][key]) > field_size[1]: - field_size[1] = len(field_info[options.field][key]) - for key in tags: - print >> outfile, ('\t%*.*s\t%-*.*s' - % (field_size[0], field_size[0], key, - field_size[1], field_size[1], - field_info[options.field][key])) + for field in q.DbInfo.FieldList.Field: + if len(field.Name) > field_size[0]: + field_size[0] = len(field.Name) + if len(field.FullName) > field_size[1]: + field_size[1] = len(field.FullName) + for field in q.DbInfo.FieldList.Field: + outfile.write( + '%*.*s\t%-*.*s\t%s\n' + % (field_size[0], field_size[0], field.Name, + field_size[1], field_size[1], field.FullName, + field.Description)) elif mode == 'search': search_term = args[0] LOG.debug('output %s' % output) - if output == 'bibtex': - medline_xml = search_fetch_xml(term=search_term, - db=options.database, - field=options.field, - reldate=options.reldate, - daterange=options.daterange, - datetype=options.datetype, - retmax=options.retmax, - validate=options.validate, - retmode=options.retmode, - rettype='medline') - if medline_xml: - if options.raw: - print outfile, medline_xml - else: - bibtex = medline_xml_to_bibtex(medline_xml) - print >> outfile, bibtex - - elif output == 'link': - # Assume that if you're looking for links - # your search is already pretty refined, - # so use the date options for link-limiting. 
-        link_xml = search_link(term=search_term,
-                               db=options.database,
-                               field=options.field,
-                               reldate=None,
-                               daterange=None,
-                               datetype=None,
-                               retmax=None,
-                               sort=None,
-                               validate=options.validate,
-                               valid_fields=None,
-                               link_term=options.link_term,
-                               fromdb=options.fromdb,
-                               cmd=options.link_cmd,
-                               linkname=options.linkname,
-                               link_holding=None,
-                               version=1,
-                               link_reldate=options.reldate,
-                               link_daterange=options.daterange,
-                               link_datetype=options.datetype,
-                               link_retmode=options.retmode,)
-        print >> outfile, link_xml
+        LOG.info('maxdate: %r, mindate: %r' % (options.maxdate, options.mindate))
+        if options.mindate and not options.maxdate:
+            options.maxdate = _time.strftime('%Y/%m/%d')
+            LOG.info('fill in maximum date: %s' % options.maxdate)
+        elif options.maxdate and not options.mindate:
+            options.mindate = '0'
+            LOG.info('fill in minimum date: %s' % options.mindate)
+
+        LOG.info('run eSearch on %s' % options.database)
+        q = EUTILS_CLIENT.service.run_eSearch(
+            db=options.database, term=search_term, tool=TOOL, email=EMAIL,
+            field=options.field, reldate=options.reldate,
+            mindate=options.mindate, maxdate=options.maxdate,
+            datetype=options.datetype,
+            RetStart=options.retstart, RetMax=options.retmax,
+            #sort=)
+            )
+        if hasattr(q, 'ERROR'):
+            raise Exception(q.ERROR)
+        if hasattr(q.IdList, 'Id'):
+            ret = int(len(q.IdList.Id))
+        else:
+            ret = 0
+        LOG.info('search returned %d of %d items' % (ret, int(q.Count)))
+
+        if ret > 0:
+            if output in ['medline', 'bibtex']:
+                LOG.info('run eFetch on %s' % options.database)
+                efetch_client = _Client(EFETCH_WSDL_URL % options.database)
+                f = efetch_client.service.run_eFetch(
+                    id=','.join(q.IdList.Id), tool=TOOL, email=EMAIL)
+                if hasattr(f, 'ERROR'):
+                    raise Exception(f.ERROR)
+
+            if output == 'medline':
+                outfile.write(str(efetch_client.last_received()).rstrip()+'\n')
+            elif output == 'bibtex':
+                outfile.write(
+                    medline_xml_to_bibtex(str(efetch_client.last_received())))
+            elif output == 'link':
+                LOG.info('run eLink on %s' % options.database)
+                f = EUTILS_CLIENT.service.run_eLink(
+                    db=options.database, id=','.join(q.IdList.Id),
+                    #reldate=, mindate=, maxdate=, datetype=,
+                    term=options.link_term, dbfrom=options.dbfrom,
+                    linkname=options.linkname, cmd=options.link_cmd,
+                    tool=TOOL, email=EMAIL)
+                outfile.write(str(EUTILS_CLIENT.last_received()).rstrip()+'\n')
+            else:
+                raise KeyError(output)

     if options.filename != None:
         outfile.close()
-- 
2.26.2
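
The new search pipeline is just a couple of SOAP calls. As a quick smoke test
outside the script, something like the following should work (a minimal
sketch: it assumes the patched file is importable as a module named `entrez`,
and it reuses the module's own constants; the query term is only an example):

    import suds.client
    import entrez  # module name assumed from the file path

    # eSearch: ask PubMed for IDs matching a term (the same call main() makes).
    q = entrez.EUTILS_CLIENT.service.run_eSearch(
        db='pubmed', term='mechanical unfolding', tool=entrez.TOOL,
        email=entrez.EMAIL, RetMax=3)
    if hasattr(q, 'ERROR'):
        raise Exception(q.ERROR)

    if hasattr(q.IdList, 'Id'):
        # eFetch uses one WSDL per database, so it needs its own client.
        efetch = suds.client.Client(entrez.EFETCH_WSDL_URL % 'pubmed')
        efetch.service.run_eFetch(
            id=','.join(q.IdList.Id), tool=entrez.TOOL, email=entrez.EMAIL)
        # last_received() holds the raw XML reply; bibutils converts it.
        print entrez.medline_xml_to_bibtex(str(efetch.last_received()))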
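
One thing the SOAP conversion does not change: NCBI's rules_ still ask for at
most one E-utilities request every 3 seconds, and suds will not throttle calls
for you. A sketch of a guard you could wrap around the service methods (the
`throttled` helper is hypothetical, not part of the module):

    import time

    _last_request = [0.0]  # a list, so the closure can rebind it in Python 2

    def throttled(method, min_interval=3.0):
        # Return a wrapper that keeps successive calls to `method` at
        # least `min_interval` seconds apart, per NCBI's request rules.
        def call(*args, **kwargs):
            wait = _last_request[0] + min_interval - time.time()
            if wait > 0:
                time.sleep(wait)
            _last_request[0] = time.time()
            return method(*args, **kwargs)
        return call

    # For example:
    #   run_eSearch = throttled(EUTILS_CLIENT.service.run_eSearch)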