posts/entrez/entrez.py

   1 #!/usr/bin/python
   2 #
   3 # Copyright (C) 1998-2004 Frederic Gobry
   4 # Copyright (C) 2008-2011 W. Trevor King
   5 #
   6 # This program is free software; you can redistribute it and/or modify
   7 # it under the terms of the GNU General Public License as published by
   8 # the Free Software Foundation, either version 2 of the License, or
   9 # (at your option) any later version.
  10 #
  11 # This program is distributed in the hope that it will be useful, but
  12 # WITHOUT ANY WARRANTY; without even the implied warranty of
  13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14 # General Public License for more details.
  15 #
  16 # You should have received a copy of the GNU General Public License
  17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  18 #
  19 # This code follows John Vu's Medline query code in pybliographer/Pyblio/Query.py.
  20 #
  21 # Python interface to the Entrez databases.
  22 # See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
  23 # Current as of August 1, 2007
  24 #
  25 # Rules:
  26 #    * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests.
  27 #    * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address.
  28 #    * Make no more than one request every 3 seconds.
  29 #    * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem.
  30 #    * NCBI's Disclaimer and Copyright notice must be evident to users of your service.
  31 #      NLM does not claim the copyright on the abstracts in PubMed; however, journal publishers or authors may.
  32 #      NLM provides no legal advice concerning distribution of copyrighted materials, consult your legal counsel.
  33 #
  34 # For a good Python-and-XML-DOM intro, see
  35 #  http://www.boddie.org.uk/python/XML_intro.html
  36 # for the official docs, see
  37 #  http://docs.python.org/lib/module-xml.dom.html
  38
  39 """Python bindings on Entrez database queries.
  40 """
  41
  42 import logging
  43 import re
  44 import string
  45 import sys
  46 import time    # for querying date ranges of publications
  47 import urllib
  48
  49 # DOM module for parsing XML,
  50 # supports Document Object Model (DOM) Level 1 Specification
  51 # http://docs.python.org/lib/module-xml.dom.minidom.html
  52 import xml.dom.minidom as dom
  53
  54 # For calling the bibutils conversion programs
  55 from subprocess import Popen, PIPE
  56
# Platform constants
# Exactly one of these is True: every platform other than 'win32' is
# treated as POSIX.
_MSWINDOWS = sys.platform == 'win32'
_POSIX = not _MSWINDOWS

if _POSIX:
    # Only imported on non-Windows platforms.
    # NOTE(review): presumably needed by the POSIX pipe handling further
    # down the file -- confirm before removing.
    import os
    import select
  64
  65
# Entrez access points
# One base URL per E-utility; the _query_*() functions append
# '?<parameters>' to these.
einfo_url   = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
esearch_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
efetch_url  = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
elink_url   = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'

# Entrez-requested tracking information
# Sent as the 'tool' and 'email' parameters of every request (see the
# usage rules in the header of this file).
TOOL = 'entrezpy'
EMAIL = 'wking@drexel.edu'
  75
# Logger

# Module-level logger named after TOOL.  Its level is set to WARN here,
# so the LOG.info()/LOG.debug() calls below are dropped unless the
# level is lowered.
LOG = logging.getLogger(TOOL)
LOG.setLevel(logging.WARN)
# A stream handler with a fixed 'name: level message' layout is attached
# when the module is imported.
_handler = logging.StreamHandler()
_formatter = logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
_handler.setFormatter(_formatter)
LOG.addHandler(_handler)
# Remove the temporaries from the module namespace.
del _handler, _formatter
  85
  86 ## XML and list utility functions
  87
  88 def urlencode(param_dict) :
  89     params = ""
  90     for key,value in param_dict.items() :
  91         if value == None :
  92             continue # ignore unused parameter
  93         #if type(value)== : # convert True/False to 'y'/<no-entry>
  94         #    if value == True :
  95         #        params += "%s=y&" % (key,)
  96         #    #else :
  97         #    #    params += "%s=n&" % (key,)
  98         if value != None :
  99             params += "%s=%s&" % (key, str(value))
 100     if len(params) > 1 :
 101         params = params[:-1] # remove trailing &
 102     return params
 103
 104 def unique(seq, keepstr=True):
 105     """
 106     Return the sequence (list, tuple, etc) without repeating entries
 107     by Paul Rubin and Jordan Callicoat.
 108     http://groups.google.com/group/comp.lang.python/browse_thread/thread/40c6c455f4fd5154/744a1a338afe1331?lnk=gst&rnum=7#744a1a338afe1331
 109
 110     for example [1,2,3,1,2] -> [1,2,3]
 111     """
 112     t = type(seq)
 113     if t in (str, unicode):
 114         t = (list, ''.join)[bool(keepstr)]
 115     seen = []
 116     return t(c for c in seq if not (c in seen or seen.append(c)))
 117
 118 def get_text(node) :
 119     """
 120     Given a node (<node-name> in the following example),
 121      extract some-text from '<node-name>some-text</node-name>'
 122      returns u'some-text'.
 123     However, if the xml is '</node-name>' returns None
 124     """
 125     if len(node.childNodes) == 1:
 126         data = node.childNodes[0].data
 127     elif len(node.childNodes) == 0: # empty node
 128         data = None
 129     else :
 130         raise Exception, "Node contains more than text"
 131     return data
 132
 133 def get_child_nodes(node, child_name):
 134     """
 135     Given a node (<node-name> in the following example),
 136     returns an array of nodes matching <child-name>
 137     """
 138     ret = []
 139     for n in node.childNodes:
 140         if n.nodeType != n.ELEMENT_NODE:
 141             continue # ignore text, comment, etc. nodes
 142         if n.tagName == child_name :
 143             ret.append(n)
 144     return ret
 145
 146 def get_child_nodes(node, child_name):
 147     """
 148     Given a node (<node-name> in the following example),
 149     returns an the node matching <child-name>
 150     """
 151     nodes = get_child_node(node, child_name)
 152     assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name)
 153     return node[0]
 154
 155 def get_child_contents(node, child_name):
 156     """
 157     Given a node (<node-name> in the following example),
 158     extract some-text from '<node-name>
 159                               <some-tag>some-text</some-tag>
 160                               <other-tag>other-text</other-tag>
 161                               <some-tag>some-other-text</some-tag>
 162                               ...
 163                             </node-name>'
 164     Returns ['some-text', 'some-other-text', ...]
 165     """
 166     nodes = get_child_nodes(node, child_name)
 167     ret = []
 168     for n in nodes:
 169         ret.append(get_text(n))
 170     return ret
 171
 172 def get_child_dict(node):
 173     """
 174     Given a node (<node-name> in the following example),
 175     extract some-text from '<node-name>
 176                               <some-tag>some-text</some-tag>
 177                               <other-tag>other-text</other-tag>
 178                               <some-tag>some-other-text</some-tag>
 179                               ...
 180                             </node-name>'
 181     Returns {'some-tag':['some-text', 'some-other-text', ...],
 182              'other-tag':['some-other-text']}
 183     """
 184     dict = {}
 185     tags = [] # to preserve order of tags
 186     for n in node.childNodes:
 187         if n.nodeType != n.ELEMENT_NODE:
 188             continue # ignore text, comment, etc. nodes
 189         try: # another entry for an existing tag
 190             dict[n.tagName].append(get_text(n))
 191         except KeyError: # new tag
 192             dict[n.tagName] = [get_text(n)]
 193             tags.append(n.tagName)
 194     return (dict, tags)
 195
 196 def delist_dict(dict) :
 197     """
 198     Given a dict
 199         e.g. {'some-tag':['some-text', 'some-other-text', ...],
 200               'other-tag':['some-other-text'], ...}  ,
 201     replaces any values in an array of length 1 with the element,
 202         e.g. {'some-tag':['some-text', 'some-other-text', ...],
 203               'other-tag':'some-other-text', ...}  ,
 204     """
 205     for key,value in dict.items() :
 206         if isinstance(value, list) and len(value) == 1 :
 207             dict[key] = value[0]
 208     return dict
 209
 210 ## Get information about the Entrez databases themselves
 211
 212 def _query_einfo(db=None):
 213     """
 214     Get information about the Entrez databases themselves.
 215     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
 216
 217     Either list all available databases with db=None, or
 218     Specific information on a particular database (e.g. pubmed) with db=pubmed.
 219     """
 220     params = urlencode ({
 221             'db': db,
 222             'tool' : TOOL,
 223             'email' : EMAIL})
 224
 225     LOG.info("getting einfo from '%s?%s'" % (einfo_url, params))
 226     f = urllib.urlopen ("%s?%s" % (einfo_url, params))
 227     string = f.read()
 228     f.close()
 229     LOG.debug('got:\n%s' % string)
 230     return string
 231
 232 def get_parsed_einfo(db=None, page=None, parsed=None):
 233     """
 234     Helper function for various einfo processing functions.
 235     Allow each processor to function
 236       independently                      (page=None, parsed=None),
 237       with a shared xml string           (page=<xml-string>, parsed=None), or
 238       with a shared parsed xml structure (page=*, parsed=<parsed_xml>).
 239     Use clean_parsed_einfo() for cleanup
 240     """
 241     if page == None and parsed == None:
 242         LOG.info('downloading new einfo page')
 243         page = _query_einfo(db)
 244     if parsed == None :
 245         LOG.info('parsing new einfo page')
 246         parsed = dom.parseString(page)
 247         parsed_islocal = True
 248     else :
 249         LOG.info('using old einfo parsing')
 250         parsed_islocal = False
 251     return (parsed, parsed_islocal)
 252
 253 def clean_parsed_einfo(parsed, parsed_islocal=True):
 254     """
 255     Helper function for various einfo processing functions.
 256     Clean up the parsed xml structure if the calling function created it.
 257     """
 258     if parsed_islocal == True :
 259         LOG.info('cleaning up einfo parsing')
 260         parsed.unlink() # clean up the DOM
 261
 262 def database_list(page=None, parsed=None):
 263     parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed)
 264     databases = []
 265     for node in parsed.getElementsByTagName("DbName"):
 266         # Extract some-text from '<DbName>some-text</DbName>'
 267         # by default, xml.dom.minidom uses unicode,
 268         # so strings get printed: "u'string contents'"
 269         databases.append(get_text(node))
 270     clean_parsed_einfo(parsed,parsed_islocal)
 271     return databases
 272
 273 def field_dict(db='pubmed', page=None, parsed=None):
 274     parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
 275     fields = []
 276     tags = []
 277     field_info = {}
 278     fieldlists = parsed.getElementsByTagName("FieldList")
 279     assert len(fieldlists) == 1, "%s\n\n%d FieldLists!" % (parsed.toxml(), len(fieldlists))
 280     fieldlist = fieldlists[0]
 281     for node in fieldlist.childNodes:
 282         if node.nodeType != node.ELEMENT_NODE :
 283             continue # ignore text, comment, etc. nodes
 284         assert node.tagName == "Field", "Unrecognized tag '%s' in FieldList" % node.tagName
 285         field,new_tags = get_child_dict(node)
 286         assert len(field['Name']) == 1, "Multiple field names %s" % str(field['Name'])
 287         field = delist_dict(field)
 288         fields.append(field['Name'])
 289         new_tags = unique(tags + new_tags)
 290         if tags != []:
 291             assert new_tags == tags, "Inconsistent tags"
 292         tags = new_tags
 293         field_info[field['Name']] = field
 294     clean_parsed_einfo(parsed,parsed_islocal)
 295     return (fields, tags, field_info)
 296
 297 def link_dict(db='pubmed', page=None, parsed=None):
 298     parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
 299     links = []
 300     tags = []
 301     link_info = []
 302     linklists = parsed.getElementsByTagName("LinkList")
 303     assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists))
 304     linklist = linklists[0]
 305     for node in linklist.childNodes:
 306         if node.nodeType != node.ELEMENT_NODE :
 307             continue # ignore text, comment, etc. nodes
 308         assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName
 309         link,new_tags = get_child_dict(node)
 310         assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name'])
 311         link = delist_dict(link)
 312         links.append(link['Name'])
 313         new_tags = unique(tags + new_tags)
 314         if tags != []:
 315             assert new_tags == tags, "Inconsistent tags"
 316         tags = new_tags
 317         link_info[link['Name']] = link
 318     clean_parsed_einfo(parsed,parsed_islocal)
 319     return (links, tags, link_info)
 320
 321 def database_info(db='pubmed', page=None, parsed=None):
 322     "Convenience function to call both field_dict and link_dict"
 323     parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
 324     fields,field_tags,field_info = field_dict(db=db, parsed=parsed)
 325     links,link_tags,link_info = link_dict(db=db, parsed=parsed)
 326     clean_parsed_einfo(parsed,parsed_islocal)
 327     return (fields, field_tags, field_info, links, link_tags, link_info)
 328
 329 def validate_field(field, fields):
 330     "Ensure that field is a valid field for the database db."
 331     try :
 332         fields.index(field.upper())
 333     except ValueError:
 334         raise Exception, "Field '%s' invalid\nValid fields are\n %s" \
 335                          % (field, str(fields))
 336
 337 def strip_fields_from_term(term):
 338     "HACK: really stupid algorithm"
 339     fields = []
 340     infield = False
 341     for i in range(len(term)):
 342         if term[i] == '[' and infield == False :
 343             infield = True
 344             field_start = i+1
 345         elif term[i] == ']' and infield == True :
 346             infield = False
 347             fields.append(term[field_start:i])
 348     return fields
 349
 350 def validate_search_term(term, fields):
 351     "Ensure that the fields in term are valid fields for the database db."
 352     for field in strip_fields_from_term(term) :
 353         validate_field(field, fields)
 354
 355
 356 ## Search an Entrez database
 357
 358 def _query_esearch(term, db='pubmed', field=None,
 359                    reldate=None, daterange=None, datetype=None,
 360                    retmax=None, rettype=None, sort=None,
 361                    validate=False, valid_fields=None, debug=False) :
 362     """
 363     Search an Entrez database.
 364     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
 365
 366     Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters.
 367
 368     Help with the arguments adapted from esearch_help.html:
 369
 370     term: This command uses search terms or phrases with or without Boolean operators.
 371      You can search in several fields using the [term field] tag.
 372      You can search in a single field using the 'field' parameter below.
 373      ?You may also tag search terms using field=tag.? I don't understand this line
 374      For example: term=asthma[MESH]+OR+hay+fever[MESH]
 375       'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
 376       ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )
 377
 378     db: This command selects the database to be searched
 379      For example: db=pubmed
 380
 381     field: Use this command to specify a specific search field.
 382      PubMed fields: affl, auth, ecno, jour, iss, mesh,...
 383      Retrieve with field_dict('pubmed')
 384      For example: field=auth
 385
 386     reldate: Limit items a number of days immediately preceding today's date.
 387      For example: reldate=365
 388
 389     daterange:  Limit results bounded by two specific dates.
 390      For example: daterange=('2001', '2002/01/01')
 391      (implemented as mindate=2001&maxdate=2002/01/01)
 392
 393     datetype:  Limit dates to a specific date field based on database.
 394      For example: datetype=edat
 395
 396     retmax: Limit the number of items retrieved
 397      For example: retmax=100
 398
 399     rettype: Select the retrieval type
 400      PubMed values: count, uilist (default)
 401
 402     sort: Sort the returned uilist
 403      PubMed values: author, last+author, journal, pub+date
 404
 405     """
 406     if daterange != None :
 407         assert len(daterange) == 2, "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
 408         reldate == None, "Specifying date with daterange AND reldate!"
 409         mindate = daterange[0]
 410         maxdate = daterange[1]
 411     else :
 412         mindate = None
 413         maxdate = None
 414     if validate :
 415         assert len(valid_fields) > 0, "Need a list of valid fields to validate"
 416         if field != None :
 417             validate_field(field)
 418         validate_search_term(term, valid_fields)
 419     params = urlencode ({
 420             'tool' : TOOL,
 421             'email' : EMAIL,
 422             'term' : term,
 423             'db': db,
 424             'field' : field,
 425             'reldate' : reldate,
 426             'mindate' : mindate,
 427             'maxdate' : maxdate,
 428             'datetype' : datetype,
 429             'maxdate' : maxdate,
 430             'retmax' : retmax,
 431             'rettype' : rettype,
 432             'sort' : sort})
 433
 434     LOG.info("getting esearch from '%s?%s'" % (esearch_url, params))
 435     f = urllib.urlopen ("%s?%s" % (esearch_url, params))
 436     string = f.read()
 437     f.close()
 438     LOG.debug('got:\n%s' % string)
 439     return string
 440
 441 def parse_esearch(page):
 442     "Parse the xml returned by _query_esearch()"
 443     parsed = dom.parseString(page)
 444
 445     pid_list = []
 446     for node in parsed.getElementsByTagName("Id"):
 447         pid_list.append(get_text(node))
 448
 449     parsed.unlink()
 450
 451     return pid_list
 452
 453
 454 ## Fetch records by Primary ID from an Entrez database
 455
 456 def _query_efetch(id, db='pubmed',
 457                   retmax=None, retmode='xml', rettype='medline'):
 458     """
 459     Fetch records by primary ID from an Entrez database.
 460     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
 461     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
 462
 463
 464     Does not currently support the usehistory, WebEnv, query_key, or retstart parameters.
 465
 466     Help with the arguments adapted from efetchlit_help.html:
 467
 468     id: Primary UIs identifying the documents to fetch
 469      For example: 'id=11877539, 11822933,11871444'
 470
 471     db: This command selects the database to be searched
 472      For example: db=pubmed
 473
 474     retmax: Limit the number of items retrieved (default 20)
 475      For example: retmax=100
 476
 477     retmode: Select the retrieval output format
 478      xml   (not journals)
 479      html
 480      text
 481      asn.1 (not journals)
 482
 483     rettype: Select the retrieval type
 484      uilist
 485      abstract (not omim)
 486      citation (not omim)
 487      medline  (not omim)
 488      full     (journals and omim)
 489
 490     Not all retmodes are possible with all rettypes:
 491      PubMed Options:
 492             uilist abstract citation medline
 493       xml     x       x*       x*       x*
 494       text    x       x        x        x
 495       html    x       x        x        x
 496       asn.1  n/a      x*       x*       x
 497       x = retrieval mode available
 498       * returned retrieval type is the complete record in the retrieval mode
 499       n/a - not available
 500      OMIM Options: (not case sensitive)
 501            uilist    docsum synopsis   variants   detailed ExternalLink
 502            (MIM             (Clinical  (Allelic
 503             numbers)         synopsis)  Variants)
 504       xml     x        x*       x*        x*         x*         x*
 505       text    x        x        x         x          x*         x*
 506       html    x        x        x         x          x*         x*
 507       asn.1   x*       x*       x*        x*         x*         x*
 508       x = retrieval mode available
 509       * returned retrieval type is the complete record in the retrieval mode
 510       n/a - not available
 511
 512     """
 513     idstring = ""
 514     for d in id :
 515         idstring += "%s," % d
 516     idstring = idstring[:-1] # remove trailing comma
 517     params = urlencode ({
 518             'tool' : TOOL,
 519             'email' : EMAIL,
 520             'id' : idstring,
 521             'db': db,
 522             'retmax' : retmax,
 523             'retmode' : retmode,
 524             'rettype' : rettype})
 525
 526     LOG.info("getting efetch from '%s?%s'" % (efetch_url, params))
 527     f = urllib.urlopen ("%s?%s" % (efetch_url, params))
 528     string = f.read()
 529     f.close()
 530     LOG.debug('got:\n%s' % string)
 531     return string
 532
 533
 534 ## Fetch links by Primary ID from an Entrez database
 535
 536 def _query_elink(id, term=None, db='all', dbfrom='pubmed',
 537                  cmd=None, linkname=None, holding=None,
 538                  version=1,
 539                  reldate=None, daterange=None, datetype=None,
 540                  retmode='xml'):
 541     """
 542     Fetch links from a list of primary IDs in an Entrez database.
 543     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
 544     http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html
 545
 546     Does not currently support the WebEnv or query_key parameters.
 547
 548     Help with the arguments adapted from efetchlit_help.html:
 549
 550     id: Primary UIs identifying the documents to fetch
 551      For example: 'id=11877539, 11822933,11871444'
 552
 553     term: This command uses search terms or phrases with or without Boolean operators
 554      to limit the returned matching links.
 555
 556     db: This command selects the databases to be searched for link targets.
 557      For example: db=all
 558
 559     dbfrom: This command selects the database containing the ids.
 560      For example: dbfrom=pubmed
 561
 562
 563     cmd: Link commands
 564      * prlinks - List the hyperlink to the primary LinkOut provider for
 565                  multiple IDs and database. Each ID is processed separately.
 566      * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider
 567                              for a single ID and database.  Return the elink
 568                              command, since fetching it breaks the relative
 569                              links in the publisher's page.
 570      * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
 571                 multiple IDs and database. Each ID is processed separately.
 572      * llinkslib - List LinkOut URLs and Attributes for multiple IDs and
 573                    database. Each ID is processed separately.
 574      * lcheck - Check for the existence (Y or N) of an external link in for
 575                 multiple IDs and database.
 576      * ncheck - Check for the existence of a neighbor link for each ID within
 577                 a database, e.g., Related Articles in PubMed.
 578      * neighbor - Display neighbors within a database.
 579      * neighbor_history - Create history (WebEnv & query_key) for use in other
 580                           EUtilities.
 581      * acheck - Lists Entrez databases links for multiple IDs from a single
 582                 database.
 583
 584     linkname: link to a specific neighbor subset
 585      For example: linkname=nucleotide_nucleotide_comp
 586
 587     holding: List LinkOut URLs for the specified holding provider, (library).
 588      Used only in conjunction with cmd=llinks or cmd=llinkslib
 589      For example: cmd=llinkslib&holding=medlib
 590
 591     version: Include a version number to refer to the latest DTD.
 592      For example: version=1
 593       retrieves the latest DTD (eLink_050511.dtd) that includes the additional
 594       elements, MenuTag, LinkInfo and IdLinkSet.
 595
 596     Date command are only valid for dbfrom=pubmed & cmd=neighbor
 597     reldate: Limit items a number of days immediately preceding today's date.
 598      For example: reldate=365
 599
 600     daterange:  Limit results bounded by two specific dates.
 601      For example: daterange=('2001', '2002/01/01')
 602      (implemented as mindate=2001&maxdate=2002/01/01)
 603
 604     datetype:  Limit dates to a specific date field based on database.
 605      For example: datetype=edat
 606
 607     retmode: Select the retrieval output format
 608      xml  (default)
 609      ref  (only used with cmd=prlinks for one ID)
 610
 611     """
 612     idstring = ""
 613     for d in id :
 614         idstring += "%s," % d
 615     idstring = idstring[:-1] # remove trailing comma
 616
 617     params = urlencode ({
 618             'tool' : TOOL,
 619             'email' : EMAIL,
 620             'id' : idstring,
 621             'term': term,
 622             'db': db,
 623             'dbfrom': dbfrom,
 624             'cmd': cmd,
 625             'linkname': linkname,
 626             'holding': holding,
 627             'version': version,
 628             'reldate': reldate,
 629             'daterange': daterange,
 630             'datetype': datetype,
 631             'retmode' : retmode})
 632
 633     LOG.info("getting elink from '%s?%s'" % (elink_url, params))
 634     f = urllib.urlopen ("%s?%s" % (elink_url, params))
 635
 636     if cmd == 'prlinks' and retmode == 'ref' :
 637         # Just get the link, we don't need the provider's webpage HTML.
 638         url = f.geturl()
 639         f.close()
 640         return url
 641
 642     string = f.read()
 643     f.close()
 644     LOG.debug('got:\n%s' % string)
 645     return string
 646
 647
 648 ## Combining the searching and parsing (dropping some of the less used features)
 649
 650 def search_fetch_xml(term, db='pubmed', field=None,
 651                      reldate=None, daterange=None, datetype=None,
 652                      retmax=None, sort=None,
 653                      validate=False, valid_fields=None,
 654                      retmode='xml', rettype='medline'):
 655     if validate and valid_fields == None:
 656         valid_fields,field_tags,field_info = field_dict(db)
 657     search_page = _query_esearch(term, db, field,
 658                                  reldate, daterange, datetype,
 659                                  retmax, rettype='uilist', sort=sort,
 660                                  validate=validate, valid_fields=valid_fields)
 661     pid_list = parse_esearch(search_page)
 662     if not pid_list:
 663         return None
 664     fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype)
 665     return fetch_page
 666
 667 def search_link(term, db='pubmed', field=None,
 668                 reldate=None, daterange=None, datetype=None,
 669                 retmax=None, sort=None,
 670                 validate=False, valid_fields=None,
 671                 link_term=None, fromdb=None,
 672                 cmd=None, linkname=None, link_holding=None,
 673                 version=1,
 674                 link_reldate=None, link_daterange=None, link_datetype=None,
 675                 link_retmode='xml'):
 676     if validate and valid_fields == None:
 677         valid_fields,field_tags,field_info = field_dict(db)
 678     search_page = _query_esearch(term, db, field,
 679                                  reldate, daterange, datetype,
 680                                  retmax, rettype='uilist', sort=sort,
 681                                  validate=validate, valid_fields=valid_fields)
 682     pid_list = parse_esearch(search_page)
 683     link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
 684                              cmd=cmd, linkname=linkname, holding=link_holding,
 685                              version=version,reldate=link_reldate,
 686                              daterange=link_daterange, datetype=link_datetype,
 687                              retmode=link_retmode)
 688     return link_page
 689
 690 ## Use the external bibutils package to convert to BibTeX format
 691
 692
 693 class Pipe (object):
 694     """Simple interface for executing POSIX-style pipes.
 695
 696     Based on the subprocess module.  The only complication is the
 697     adaptation of `subprocess.Popen._communicate` to listen to the
 698     stderrs of all processes involved in the pipe, as well as the
 699     terminal process' stdout.  There are two implementations of
 700     `Pipe._communicate`, one for MS Windows, and one for POSIX
 701     systems.  The MS Windows implementation is currently untested.
 702
 703     >>> p = Pipe([['find', '/etc/'], ['grep', '^/etc/ssh$']])
 704     >>> p.stdout
 705     '/etc/ssh\\n'
 706     >>> p.status
 707     1
 708     >>> p.statuses
 709     [1, 0]
 710     >>> p.stderrs # doctest: +ELLIPSIS
 711     [...find: ...: Permission denied..., '']
 712
 713     >>> p = Pipe([['cat'], ['head']], stdin='line 1\\nline 2\\nline 3\\n')
 714     >>> p.stdout
 715     'line 1\\nline 2\\nline 3\\n'
 716     >>> p.statuses
 717     [0, 0]
 718     >>> p.stderrs
 719     ['', '']
 720     """
 721     def __init__(self, cmds, stdin=None):
 722         if isinstance(stdin, str):
 723             stdin_str = stdin
 724             stdin = PIPE
 725         else:
 726             stdin_str = None
 727
 728         # spawn processes
 729         self._procs = []
 730         for cmd in cmds:
 731             if len(self._procs) != 0:
 732                 stdin = self._procs[-1].stdout
 733             LOG.debug('run command %s' % cmd)
 734             kwargs = {}
 735             if _POSIX:
 736                 kwargs['close_fds'] = True
 737             self._procs.append(Popen(
 738                     cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs))
 739
 740         self.stdout,self.stderrs = self._communicate(input=stdin_str)
 741
 742         # collect process statuses
 743         self.statuses = []
 744         self.status = 0
 745         for proc in self._procs:
 746             self.statuses.append(proc.wait())
 747             LOG.debug('join %s (status %d)' % (proc, self.statuses[-1]))
 748             if self.statuses[-1] != 0:
 749                 self.status = self.statuses[-1]
 750
 751     # Code excerpted from subprocess.Popen._communicate()
 752     if _MSWINDOWS == True:
 753         def _communicate(self, input=None):
 754             LOG.debug('communicate with pipe')
 755             assert input == None, 'stdin != None not yet supported'
 756             # listen to each process' stderr
 757             threads = []
 758             std_X_arrays = []
 759             for proc in self._procs:
 760                 stderr_array = []
 761                 thread = Thread(target=proc._readerthread,
 762                                 args=(proc.stderr, stderr_array))
 763                 thread.setDaemon(True)
 764                 thread.start()
 765                 threads.append(thread)
 766                 std_X_arrays.append(stderr_array)
 767
 768             # also listen to the last processes stdout
 769             stdout_array = []
 770             thread = Thread(target=proc._readerthread,
 771                             args=(proc.stdout, stdout_array))
 772             thread.setDaemon(True)
 773             thread.start()
 774             threads.append(thread)
 775             std_X_arrays.append(stdout_array)
 776
 777             # join threads as they die
 778             for thread in threads:
 779                 thread.join()
 780
 781             # read output from reader threads
 782             std_X_strings = []
 783             for std_X_array in std_X_arrays:
 784                 std_X_strings.append(std_X_array[0])
 785
 786             stdout = std_X_strings.pop(-1)
 787             stderrs = std_X_strings
 788             LOG.debug('pipe communication complete')
 789             return (stdout, stderrs)
 790     else:
 791         assert _POSIX==True, 'invalid platform'
 792         def _communicate(self, input=None):
 793             LOG.debug('communicate with pipe')
 794             read_set = []
 795             write_set = []
 796             read_arrays = []
 797             stdout = None # Return
 798             stderr = None # Return
 799
 800             if self._procs[0].stdin:
 801                 # Flush stdio buffer.  This might block, if the user has
 802                 # been writing to .stdin in an uncontrolled fashion.
 803                 self._procs[0].stdin.flush()
 804                 if input:
 805                     write_set.append(self._procs[0].stdin)
 806                 else:
 807                     self._procs[0].stdin.close()
 808             for proc in self._procs:
 809                 read_set.append(proc.stderr)
 810                 read_arrays.append([])
 811             read_set.append(self._procs[-1].stdout)
 812             read_arrays.append([])
 813
 814             input_offset = 0
 815             while read_set or write_set:
 816                 LOG.debug('select on read %s, write %s' % (read_set,write_set))
 817                 try:
 818                     rlist, wlist, xlist = select.select(read_set, write_set, [])
 819                 except select.error, e:
 820                     if e.args[0] == errno.EINTR:
 821                         LOG.debug('EINTR')
 822                         continue
 823                     raise
 824                 LOG.debug('selected read %s, write %s, exception %s'
 825                           % (rlist, wlist, xlist))
 826                 if self._procs[0].stdin in wlist:
 827                     # When select has indicated that the file is writable,
 828                     # we can write up to PIPE_BUF bytes without risk
 829                     # blocking.  POSIX defines PIPE_BUF >= 512
 830                     LOG.debug('write to stdin for process 0')
 831                     chunk = input[input_offset : input_offset + 512]
 832                     bytes_written = os.write(
 833                         self._procs[0].stdin.fileno(), chunk)
 834                     input_offset += bytes_written
 835                     if input_offset >= len(input):
 836                         self._procs[0].stdin.flush()
 837                         self._procs[0].stdin.close()
 838                         write_set.remove(self._procs[0].stdin)
 839                         LOG.debug('stdin complete')
 840                 if self._procs[-1].stdout in rlist:
 841                     LOG.debug('read stdout for final process')
 842                     data = os.read(self._procs[-1].stdout.fileno(), 1024)
 843                     if data == '':
 844                         self._procs[-1].stdout.close()
 845                         read_set.remove(self._procs[-1].stdout)
 846                         LOG.debug('stdout complete')
 847                     read_arrays[-1].append(data)
 848                 for i,proc in enumerate(self._procs):
 849                     if proc.stderr in rlist:
 850                         LOG.debug('read stderr for process %i' % i)
 851                         data = os.read(proc.stderr.fileno(), 1024)
 852                         if data == '':
 853                             proc.stderr.close()
 854                             read_set.remove(proc.stderr)
 855                             LOG.debug('stderr complete for process %d' % i)
 856                         read_arrays[i].append(data)
 857
 858             # All data exchanged.  Translate lists into strings.
 859             read_strings = []
 860             for read_array in read_arrays:
 861                 read_strings.append(''.join(read_array))
 862
 863             stdout = read_strings.pop(-1)
 864             stderrs = read_strings
 865             LOG.debug('pipe communication complete')
 866             return (stdout, stderrs)
 867
 868
 869 def medline_xml_to_bibtex(fetch_page):
 870     """Convert medline XML to BibTeX
 871
 872     >>> xml = '\\n'.join([
 873     ...     '<?xml version="1.0"?>',
 874     ...     '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_110101.dtd">',
 875     ...     '<PubmedArticleSet>',
 876     ...     ' <PubmedArticle>',
 877     ...     '  <MedlineCitation Owner="NLM" Status="MEDLINE">',
 878     ...     '   <PMID Version="1">20004685</PMID>',
 879     ...     '   <Article PubModel="Print-Electronic">',
 880     ...     '    <Journal>',
 881     ...     '     <ISSN IssnType="Electronic">1879-0003</ISSN>',
 882     ...     '     <JournalIssue CitedMedium="Internet">',
 883     ...     '      <Volume>46</Volume><Issue>2</Issue>',
 884     ...     '      <PubDate>',
 885     ...     '       <Year>2010</Year><Month>Mar</Month><Day>1</Day>',
 886     ...     '      </PubDate>',
 887     ...     '     </JournalIssue>',
 888     ...     '    </Journal>',
 889     ...     '    <ArticleTitle>Monte Carlo simulation of mechanical unfolding '
 890     ...          'of proteins based on a simple two-state model.'
 891     ...          '</ArticleTitle>',
 892     ...     '    <Pagination><MedlinePgn>159-66</MedlinePgn></Pagination>',
 893     ...     '    <AuthorList CompleteYN="Y">',
 894     ...     '     <Author ValidYN="Y">',
 895     ...     '      <LastName>King</LastName>',
 896     ...     '      <ForeName>William T</ForeName>',
 897     ...     '      <Initials>WT</Initials>',
 898     ...     '     </Author>',
 899     ...     '     <Author ValidYN="Y">',
 900     ...     '      <LastName>Su</LastName>',
 901     ...     '      <ForeName>Meihong</ForeName>',
 902     ...     '      <Initials>M</Initials>',
 903     ...     '     </Author>',
 904     ...     '     <Author ValidYN="Y">',
 905     ...     '      <LastName>Yang</LastName>',
 906     ...     '      <ForeName>Guoliang</ForeName>',
 907     ...     '      <Initials>G</Initials>',
 908     ...     '     </Author>',
 909     ...     '    </AuthorList>',
 910     ...     '    <MedlineJournalInfo>',
 911     ...     '     <MedlineTA>Int J Biol Macromol</MedlineTA>',
 912     ...     '    </MedlineJournalInfo>',
 913     ...     '   </Article>',
 914     ...     '   <MedlineJournalInfo>',
 915     ...     '    <MedlineTA>Int J Biol Macromol</MedlineTA>',
 916     ...     '   </MedlineJournalInfo>',
 917     ...     '  </MedlineCitation>',
 918     ...     '  <PubmedData>',
 919     ...     '   <ArticleIdList>',
 920     ...     '    <ArticleId IdType="doi">10.1016/j.ijbiomac.2009.12.001'
 921     ...          '</ArticleId>',
 922     ...     '   </ArticleIdList>',
 923     ...     '  </PubmedData>',
 924     ...     ' </PubmedArticle>',
 925     ...     '</PubmedArticleSet>',
 926     ...     ])
 927     >>> print medline_xml_to_bibtex(xml)
 928     @Article{King2010,
 929       author =       "William T. King and Meihong Su and Guoliang Yang",
 930       title =        "Monte Carlo simulation of mechanical unfolding of
 931                      proteins based on a simple two-state model.",
 932       journal =      "Int J Biol Macromol",
 933       year =         "2010",
 934       month =        mar,
 935       day =          "01",
 936       volume =       "46",
 937       number =       "2",
 938       pages =        "159--166",
 939       ISSN =         "1879-0003",
 940       doi =          "10.1016/j.ijbiomac.2009.12.001",
 941     }
 942     <BLANKLINE>
 943     """
 944     LOG.info('convert medline XML to BibTeX\n%s' % fetch_page)
 945     p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
 946              stdin=fetch_page)
 947     LOG.debug('converted to\n%s' % p.stdout)
 948     return p.stdout
 949
 950
 951 ## Random
 952
 953 def hints() :
 954     "Print Entrez search hints and exit"
 955
 956     print """
 957 free full text [sb]
 958
 959
 960 """
 961
 962 ## Test with a mini-searching application
 963
 964 if __name__ == "__main__" :
 965     from optparse import OptionParser
 966
 967     usage_string = """%prog [options] SEARCH_TERM       (print medline xml matching search)
 968      | %prog -l [options] SEARCH_TERM    (print links to entries matching search)
 969      | %prog -L [-d DATABASE] [-f FILE]  (list databases)
 970      | %prog -X [-d DATABASE] [-F FIELD] [-f FILE]  (list fields in a database, or details on a single field)
 971
 972 2008, W. Trevor King.
 973
 974 See the docstrings in %prog or
 975  http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
 976 for more details.
 977 """
 978     parser = OptionParser(usage=usage_string, version="%prog 0.1")
 979
 980     # Explaination by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
 981     # "
 982     # metavar is the name used in the help for that options required text,
 983     # and dest is the name of the property you'll use to access the value of that option.
 984     # "
 985
 986     parser.add_option('-d', '--database', dest="database",
 987                       help="Search DATABASE (default '%default')",
 988                       type='string', metavar="DATABASE", default='pubmed')
 989     parser.add_option('-f', '--file', dest="filename",
 990                       help="write output to FILE (default stdout)",
 991                       type='string', metavar="FILE")
 992     parser.add_option('-v', '--verbose', dest="verbose", action="store_true",
 993                       help="Print lots of debugging information",
 994                       default=False)
 995     parser.add_option('-H', '--hints', callback=hints,
 996                       help="Print Entrez search hints and exit",
 997                       action="callback")
 998
 999
1000     # mode control options
1001     mode = 'search'
1002     def set_mode(option, opt_str, value, parser):
1003         global mode
1004         long_option = option.get_opt_string()
1005         if long_option == '--list-mode' :
1006             mode = 'list'
1007         elif long_option == '--explain-mode' :
1008             mode = 'explain'
1009
1010     parser.add_option('-L', '--list-mode', callback=set_mode,
1011                       help="Run in list mode", action="callback")
1012     parser.add_option('-X', '--explain-mode', callback=set_mode,
1013                       help="Run in explain mode", action="callback")
1014
1015     # search-fetch-xml-to-? options
1016     output = 'bibtex'
1017     def set_output(option, opt_str, value, parser):
1018         global output
1019         long_option = option.get_opt_string()
1020         if long_option == '--output-link' :
1021             output = 'link'
1022     parser.add_option('-W', '--raw', dest="raw", action="store_true",
1023                       help="Output raw Entrez xml", default=False)
1024     parser.add_option('-F', '--field', dest="field",
1025                       help="Limit SEARCH_TERM to FIELD",
1026                       type='string', metavar="FIELD")
1027     parser.add_option('-r', '--reldate', dest="reldate",
1028                       help="Limit search to dates within DAYS of today",
1029                       type='string', metavar="DAYS")
1030     parser.add_option('-R', '--daterange', dest="daterange",
1031                       help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')",
1032                       type='string', metavar="DATERANGE")
1033     parser.add_option('-t', '--datetype', dest="datetype",
1034                       help="Select field to apply date limits to (e.g. 'edat' for Entrez date)",
1035                       type='string', metavar="DATETYPE")
1036     parser.add_option('-m', '--retmax', dest="retmax",
1037                       help="Return at max RETMAX items from a successful search (default %default)",
1038                       type='string', metavar="RETMAX", default=20)
1039     parser.add_option('-M', '--retmode', dest="retmode",
1040                       help="Select fetch/link output format",
1041                       type='string', metavar="RETMODE", default='xml')
1042     parser.add_option('-V', '--validate', dest="validate", action="store_true",
1043                       help="Check that FIELD and field tags in SEARCH_TERM are valid for DB",
1044                       default=False)
1045
1046     # output link options
1047     parser.add_option('-l', '--output-link', callback=set_output,
1048                       help="Output a link (instead of xml citations)",
1049                       action="callback")
1050     parser.add_option('-c', '--link-cmd', dest="link_cmd",
1051                       help="Select link output",
1052                       type='string', metavar="LINK_CMD")
1053     parser.add_option('-T', '--link-term', dest="link_term",
1054                       help="Limit links to those matching LINK_TERM",
1055                       type='string', metavar="LINK_TERM")
1056     parser.add_option('-D', '--from-database', dest="fromdb",
1057                       help="Limit links to those from FROMDATABASE)",
1058                       type='string', metavar="FROMDATABASE")
1059     parser.add_option('-n', '--link-name', dest="linkname",
1060                       help="Limit links to a specific neighbor",
1061                       type='string', metavar="LINKNAME")
1062
1063     (options, args) = parser.parse_args()
1064     parser.destroy()
1065
1066     # open the output file if specified
1067     if options.filename == None :
1068         outfile = sys.stdout
1069     else :
1070         outfile = file(options.filename, 'w')
1071
1072     if options.verbose :
1073         LOG.setLevel(logging.DEBUG)
1074
1075     LOG.debug('operating in %s mode' % mode)
1076
1077     if mode == 'list' :
1078         print >> outfile, "Available databases:"
1079         databases = database_list()
1080         for db in databases:
1081             print >> outfile, "\t%s" % db
1082
1083     elif mode == 'explain':
1084         fields,tags,field_info = field_dict(db=options.database)
1085         if options.field == None :
1086             print >> outfile, "Available fields in %s:" % options.database
1087             field_size = [0,0]
1088             for field in fields :
1089                 if len(field) > field_size[0] :
1090                     field_size[0] = len(field)
1091                 if len(field_info[field]['FullName']) > field_size[1] :
1092                     field_size[1] = len(field_info[field]['FullName'])
1093             for field in fields :
1094                 print >> outfile, "\t%*.*s\t%-*.*s" \
1095                     % (field_size[0], field_size[0], field,
1096                        field_size[1], field_size[1], field_info[field]['FullName'])
1097         else :
1098             print >> outfile, "Field %s in %s:" % (options.field,options.database)
1099             field_size = [0,0]
1100             for key in tags:
1101                 if len(key) > field_size[0] :
1102                     field_size[0] = len(key)
1103                 if len(field_info[options.field][key]) > field_size[1] :
1104                     field_size[1] = len(field_info[options.field][key])
1105             for key in tags:
1106                 print >> outfile, "\t%*.*s\t%-*.*s" \
1107                     % (field_size[0], field_size[0], key,
1108                        field_size[1], field_size[1], field_info[options.field][key])
1109
1110     elif mode == 'search':
1111         search_term = args[0]
1112         LOG.debug('output %s' % output)
1113
1114         if output == 'bibtex' :
1115             medline_xml = search_fetch_xml(term=search_term,
1116                                            db=options.database,
1117                                            field=options.field,
1118                                            reldate=options.reldate,
1119                                            daterange=options.daterange,
1120                                            datetype=options.datetype,
1121                                            retmax=options.retmax,
1122                                            validate=options.validate,
1123                                            retmode=options.retmode,
1124                                            rettype='medline')
1125             if medline_xml:
1126                 if options.raw :
1127                     print outfile, medline_xml
1128                 else:
1129                     bibtex = medline_xml_to_bibtex(medline_xml)
1130                     print >> outfile, bibtex
1131
1132         elif output == 'link' :
1133             # Assume that if you're looking for links
1134             # your search is already pretty refined,
1135             # so use the date options for link-limiting.
1136             link_xml = search_link(term=search_term,
1137                                    db=options.database,
1138                                    field=options.field,
1139                                    reldate=None,
1140                                    daterange=None,
1141                                    datetype=None,
1142                                    retmax=None,
1143                                    sort=None,
1144                                    validate=options.validate,
1145                                    valid_fields=None,
1146                                    link_term=options.link_term,
1147                                    fromdb=options.fromdb,
1148                                    cmd=options.link_cmd,
1149                                    linkname=options.linkname,
1150                                    link_holding=None,
1151                                    version=1,
1152                                    link_reldate=options.reldate,
1153                                    link_daterange=options.daterange,
1154                                    link_datetype=options.datetype,
1155                                    link_retmode=options.retmode,)
1156             print >> outfile, link_xml
1157
1158     if options.filename != None :
1159         outfile.close()