--- /dev/null
+#!/usr/bin/python
+#
+# Copyright (C) 1998-2004 Frederic Gobry
+# Copyright (C) 2008 W. Trevor King
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+# Code following John Vu's medline query code in pybliographer/Pyblio/Query.py.
+#
+# Python interface to the Entrez databases.
+# See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+# Current as of August 1, 2007
+#
+# Rules:
+# * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests.
+# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address.
+# * Make no more than one request every 3 seconds.
+# * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem.
+# * NCBI's Disclaimer and Copyright notice must be evident to users of your service.
+# NLM does not claim the copyright on the abstracts in PubMed; however, journal publishers or authors may.
+# NLM provides no legal advice concerning distribution of copyrighted materials, consult your legal counsel.
+#
+# For a good Python-and-XML-DOM intro, see
+# http://www.boddie.org.uk/python/XML_intro.html
+# for the official docs, see
+# http://docs.python.org/lib/module-xml.dom.html
+
+"""Python bindings on Entrez database queries.
+"""
+
+import urllib, sys
+
+# DOM module for parsing XML,
+# supports Document Object Model (DOM) Level 1 Specification
+# http://docs.python.org/lib/module-xml.dom.minidom.html
+import xml.dom.minidom as dom
+
+# For calling the bibutils conversion programs
+from popen2 import popen2
+
+# Entrez access points
+einfo_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
+esearch_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
+efetch_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
+elink_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
+
+# Entrez-requested tracking information
+TOOL = 'entrezpy'
+EMAIL = 'wking@drexel.edu'
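+
+# NCBI asks for no more than one request every 3 seconds (see the Rules
+# above).  A minimal throttling sketch for a scripted series of queries
+# (query_fn and terms are hypothetical placeholders):
+#   import time
+#   for term in terms :
+#       page = query_fn(term)
+#       time.sleep(3) # at most one request every 3 seconds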
+
+## XML and list utility functions
+
+def urlencode(param_dict) :
+    """
+    Build a URL query string from a parameter dict,
+    skipping any parameters whose value is None.
+    """
+    params = ""
+    for key,value in param_dict.items() :
+        if value == None :
+            continue # ignore unused parameter
+        params += "%s=%s&" % (key, str(value))
+    if len(params) > 0 :
+        params = params[:-1] # remove trailing &
+    return params
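+
+# For example (key order follows dict iteration order, which is arbitrary):
+#   urlencode({'db':'pubmed', 'retmax':None, 'tool':TOOL})
+#   # -> 'db=pubmed&tool=entrezpy' (None-valued parameters are dropped)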
+
+def unique(seq, keepstr=True):
+ """
+ Return the sequence (list, tuple, etc) without repeating entries
+ by Paul Rubin and Jordan Callicoat.
+ http://groups.google.com/group/comp.lang.python/browse_thread/thread/40c6c455f4fd5154/744a1a338afe1331?lnk=gst&rnum=7#744a1a338afe1331
+
+ for example [1,2,3,1,2] -> [1,2,3]
+ """
+ t = type(seq)
+ if t in (str, unicode):
+ t = (list, ''.join)[bool(keepstr)]
+ seen = []
+ return t(c for c in seq if not (c in seen or seen.append(c)))
+
+def get_text(node) :
+ """
+ Given a node (<node-name> in the following example),
+ extract some-text from '<node-name>some-text</node-name>'
+ returns u'some-text'.
+ However, if the xml is '</node-name>' returns None
+ """
+ if len(node.childNodes) == 1:
+ data = node.childNodes[0].data
+ elif len(node.childNodes) == 0: # empty node
+ data = None
+ else :
+ raise Exception, "Node contains more than text"
+ return data
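+
+# For example (a minimal sketch using xml.dom.minidom directly):
+#   doc = dom.parseString('<DbName>pubmed</DbName>')
+#   get_text(doc.documentElement)  # -> u'pubmed'
+#   doc.unlink()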
+
+def get_child_nodes(node, child_name):
+    """
+    Given a node (<node-name> in the following example),
+    return an array of child nodes matching <child-name>
+    """
+    ret = []
+    for n in node.childNodes:
+        if n.nodeType != n.ELEMENT_NODE:
+            continue # ignore text, comment, etc. nodes
+        if n.tagName == child_name :
+            ret.append(n)
+    return ret
+
+def get_child_node(node, child_name):
+    """
+    Given a node (<node-name> in the following example),
+    return the single child node matching <child-name>
+    """
+    nodes = get_child_nodes(node, child_name)
+    assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name)
+    return nodes[0]
+
+def get_child_contents(node, child_name):
+ """
+ Given a node (<node-name> in the following example),
+ extract some-text from '<node-name>
+ <some-tag>some-text</some-tag>
+ <other-tag>other-text</other-tag>
+ <some-tag>some-other-text</some-tag>
+ ...
+ </node-name>'
+ Returns ['some-text', 'some-other-text', ...]
+ """
+ nodes = get_child_nodes(node, child_name)
+ ret = []
+ for n in nodes:
+ ret.append(get_text(n))
+ return ret
+
+def get_child_dict(node):
+    """
+    Given a node (<node-name> in the following example),
+    extract the text contents of '<node-name>
+                                    <some-tag>some-text</some-tag>
+                                    <other-tag>other-text</other-tag>
+                                    <some-tag>some-other-text</some-tag>
+                                    ...
+                                  </node-name>'
+    Returns ({'some-tag':['some-text', 'some-other-text', ...],
+              'other-tag':['other-text']},
+             ['some-tag', 'other-tag'])  # tag order is preserved
+    """
+    d = {}
+    tags = [] # to preserve order of tags
+    for n in node.childNodes:
+        if n.nodeType != n.ELEMENT_NODE:
+            continue # ignore text, comment, etc. nodes
+        try: # another entry for an existing tag
+            d[n.tagName].append(get_text(n))
+        except KeyError: # new tag
+            d[n.tagName] = [get_text(n)]
+            tags.append(n.tagName)
+    return (d, tags)
+
+def delist_dict(d) :
+    """
+    Given a dict,
+    e.g. {'some-tag':['some-text', 'some-other-text', ...],
+          'other-tag':['other-text'], ...} ,
+    replace any value that is a list of length 1 with its single element,
+    e.g. {'some-tag':['some-text', 'some-other-text', ...],
+          'other-tag':'other-text', ...}
+    """
+    for key,value in d.items() :
+        if isinstance(value, list) and len(value) == 1 :
+            d[key] = value[0]
+    return d
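+
+# For example (an illustrative sketch):
+#   doc = dom.parseString(
+#       '<Field><Name>AFFL</Name><FullName>Affiliation</FullName></Field>')
+#   d,tags = get_child_dict(doc.documentElement)
+#   # d    -> {u'Name': [u'AFFL'], u'FullName': [u'Affiliation']}
+#   # tags -> [u'Name', u'FullName']
+#   delist_dict(d)  # -> {u'Name': u'AFFL', u'FullName': u'Affiliation'}
+#   doc.unlink()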
+
+## Get information about the Entrez databases themselves
+
+def _query_einfo(db=None, debug=False) :
+ """
+ Get information about the Entrez databases themselves.
+ http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
+
+    Either list all available databases (db=None), or get specific
+    information on a particular database (e.g. db='pubmed').
+ """
+ params = urlencode ({
+ 'db': db,
+ 'tool' : TOOL,
+ 'email' : EMAIL})
+
+ if debug :
+ print "Getting einfo from '%s?%s'" % (einfo_url, params)
+ f = urllib.urlopen ("%s?%s" % (einfo_url, params))
+ string = f.read()
+ f.close()
+ if debug == True:
+ print string
+ print ""
+ return string
+
+def get_parsed_einfo(db=None, page=None, parsed=None, debug=False):
+ """
+ Helper function for various einfo processing functions.
+ Allow each processor to function
+ independently (page=None, parsed=None),
+ with a shared xml string (page=<xml-string>, parsed=None), or
+ with a shared parsed xml structure (page=*, parsed=<parsed_xml>).
+ Use clean_parsed_einfo() for cleanup
+ """
+ if page == None and parsed == None:
+ if debug == True : print "Downloading new einfo page"
+ page = _query_einfo(db)
+ if parsed == None :
+ if debug == True : print "Parsing new einfo page"
+ parsed = dom.parseString(page)
+ parsed_islocal = True
+ else :
+ if debug == True : print "Using old einfo parsing"
+ parsed_islocal = False
+ return (parsed, parsed_islocal)
+
+def clean_parsed_einfo(parsed, parsed_islocal=True, debug=False):
+ """
+ Helper function for various einfo processing functions.
+ Clean up the parsed xml structure if the calling function created it.
+ """
+ if parsed_islocal == True :
+ if debug == True : print "Cleaning up einfo parsing"
+ parsed.unlink() # clean up the DOM
+
+def database_list(page=None, parsed=None, debug=False):
+ parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed, debug=debug)
+ databases = []
+ for node in parsed.getElementsByTagName("DbName"):
+ # Extract some-text from '<DbName>some-text</DbName>'
+ # by default, xml.dom.minidom uses unicode,
+ # so strings get printed: "u'string contents'"
+ databases.append(get_text(node))
+ clean_parsed_einfo(parsed,parsed_islocal, debug=debug)
+ return databases
+
+def field_dict(db='pubmed', page=None, parsed=None, debug=False):
+ parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+ fields = []
+ tags = []
+ field_info = {}
+ fieldlists = parsed.getElementsByTagName("FieldList")
+ assert len(fieldlists) == 1, "%s\n\n%d FieldLists!" % (parsed.toxml(), len(fieldlists))
+ fieldlist = fieldlists[0]
+ for node in fieldlist.childNodes:
+ if node.nodeType != node.ELEMENT_NODE :
+ continue # ignore text, comment, etc. nodes
+ assert node.tagName == "Field", "Unrecognized tag '%s' in FieldList" % node.tagName
+ field,new_tags = get_child_dict(node)
+ assert len(field['Name']) == 1, "Multiple field names %s" % str(field['Name'])
+ field = delist_dict(field)
+ fields.append(field['Name'])
+ new_tags = unique(tags + new_tags)
+ if tags != []:
+ assert new_tags == tags, "Inconsistent tags"
+ tags = new_tags
+ field_info[field['Name']] = field
+ clean_parsed_einfo(parsed,parsed_islocal, debug)
+ return (fields, tags, field_info)
+
+def link_dict(db='pubmed', page=None, parsed=None, debug=False):
+ parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+ links = []
+ tags = []
+    link_info = {}
+ linklists = parsed.getElementsByTagName("LinkList")
+ assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists))
+ linklist = linklists[0]
+ for node in linklist.childNodes:
+ if node.nodeType != node.ELEMENT_NODE :
+ continue # ignore text, comment, etc. nodes
+ assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName
+ link,new_tags = get_child_dict(node)
+ assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name'])
+ link = delist_dict(link)
+ links.append(link['Name'])
+ new_tags = unique(tags + new_tags)
+ if tags != []:
+ assert new_tags == tags, "Inconsistent tags"
+ tags = new_tags
+ link_info[link['Name']] = link
+ clean_parsed_einfo(parsed,parsed_islocal, debug)
+ return (links, tags, link_info)
+
+def database_info(db='pubmed', page=None, parsed=None, debug=False):
+ "Convenience function to call both field_dict and link_dict"
+ parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+ fields,field_tags,field_info = field_dict(db=db, parsed=parsed, debug=debug)
+ links,link_tags,link_info = link_dict(db=db, parsed=parsed, debug=debug)
+ clean_parsed_einfo(parsed,parsed_islocal, debug=debug)
+ return (fields, field_tags, field_info, links, link_tags, link_info)
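+
+# For example (an illustrative sketch; performs a network request):
+#   page = _query_einfo('pubmed')
+#   fields,field_tags,field_info,links,link_tags,link_info = \
+#       database_info(db='pubmed', page=page)
+#   # 'AUTH' should appear in fields for pubmed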
+
+def validate_field(field, fields):
+ "Ensure that field is a valid field for the database db."
+ try :
+ fields.index(field.upper())
+ except ValueError:
+ raise Exception, "Field '%s' invalid\nValid fields are\n %s" \
+ % (field, str(fields))
+
+def strip_fields_from_term(term):
+ "HACK: really stupid algorithm"
+ fields = []
+ infield = False
+ for i in range(len(term)):
+ if term[i] == '[' and infield == False :
+ infield = True
+ field_start = i+1
+ elif term[i] == ']' and infield == True :
+ infield = False
+ fields.append(term[field_start:i])
+ return fields
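+
+# For example:
+#   strip_fields_from_term('asthma[MESH]+OR+hay+fever[MESH]')
+#   # -> ['MESH', 'MESH']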
+
+def validate_search_term(term, fields):
+ "Ensure that the fields in term are valid fields for the database db."
+ for field in strip_fields_from_term(term) :
+ validate_field(field, fields)
+
+
+## Search an Entrez database
+
+def _query_esearch(term, db='pubmed', field=None,
+ reldate=None, daterange=None, datetype=None,
+ retmax=None, rettype=None, sort=None,
+ validate=False, valid_fields=None, debug=False) :
+ """
+ Search an Entrez database.
+ http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
+
+ Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters.
+
+ Help with the arguments adapted from esearch_help.html:
+
+ term: This command uses search terms or phrases with or without Boolean operators.
+ You can search in several fields using the [term field] tag.
+ You can search in a single field using the 'field' parameter below.
+       "You may also tag search terms using field=tag." (so says the
+       esearch help; the intended meaning of that line is unclear)
+ For example: term=asthma[MESH]+OR+hay+fever[MESH]
+ 'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
+ ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )
+
+ db: This command selects the database to be searched
+ For example: db=pubmed
+
+ field: Use this command to specify a specific search field.
+ PubMed fields: affl, auth, ecno, jour, iss, mesh,...
+ Retrieve with field_dict('pubmed')
+ For example: field=auth
+
+    reldate: Limit items to the number of days immediately preceding today's date.
+ For example: reldate=365
+
+ daterange: Limit results bounded by two specific dates.
+ For example: daterange=('2001', '2002/01/01')
+ (implemented as mindate=2001&maxdate=2002/01/01)
+
+ datetype: Limit dates to a specific date field based on database.
+ For example: datetype=edat
+
+ retmax: Limit the number of items retrieved
+ For example: retmax=100
+
+ rettype: Select the retrieval type
+ PubMed values: count, uilist (default)
+
+ sort: Sort the returned uilist
+ PubMed values: author, last+author, journal, pub+date
+
+ """
+    if daterange != None :
+        assert len(daterange) == 2, \
+            "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')" \
+            % (daterange,)
+        assert reldate == None, "Specifying date with daterange AND reldate!"
+        mindate = daterange[0]
+        maxdate = daterange[1]
+    else :
+        mindate = None
+        maxdate = None
+ if validate :
+ assert len(valid_fields) > 0, "Need a list of valid fields to validate"
+ if field != None :
+            validate_field(field, valid_fields)
+ validate_search_term(term, valid_fields)
+    params = urlencode ({
+        'tool' : TOOL,
+        'email' : EMAIL,
+        'term' : term,
+        'db': db,
+        'field' : field,
+        'reldate' : reldate,
+        'mindate' : mindate,
+        'maxdate' : maxdate,
+        'datetype' : datetype,
+        'retmax' : retmax,
+        'rettype' : rettype,
+        'sort' : sort})
+
+ if debug :
+ print "Getting esearch from '%s?%s'" % (esearch_url, params)
+ f = urllib.urlopen ("%s?%s" % (esearch_url, params))
+ string = f.read()
+ f.close()
+ if debug == True:
+ print string
+ print ""
+ return string
+
+def parse_esearch(page):
+ "Parse the xml returned by _query_esearch()"
+ parsed = dom.parseString(page)
+
+ pid_list = []
+ for node in parsed.getElementsByTagName("Id"):
+ pid_list.append(get_text(node))
+
+ parsed.unlink()
+
+ return pid_list
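+
+# For example (an illustrative sketch; performs a network request, and
+# the returned IDs will vary):
+#   page = _query_esearch('arrhenius[AUTH]', db='pubmed', retmax=3)
+#   pid_list = parse_esearch(page)  # e.g. [u'11877539', ...]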
+
+
+## Fetch records by Primary ID from an Entrez database
+
+def _query_efetch(id, db='pubmed',
+ retmax=None, retmode='xml', rettype='medline',
+ debug=False) :
+ """
+ Fetch records by primary ID from an Entrez database.
+ http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
+ http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
+
+
+ Does not currently support the usehistory, WebEnv, query_key, or retstart parameters.
+
+ Help with the arguments adapted from efetchlit_help.html:
+
+ id: Primary UIs identifying the documents to fetch
+ For example: 'id=11877539, 11822933,11871444'
+
+ db: This command selects the database to be searched
+ For example: db=pubmed
+
+ retmax: Limit the number of items retrieved (default 20)
+ For example: retmax=100
+
+ retmode: Select the retrieval output format
+ xml (not journals)
+ html
+ text
+ asn.1 (not journals)
+
+ rettype: Select the retrieval type
+ uilist
+ abstract (not omim)
+ citation (not omim)
+ medline (not omim)
+ full (journals and omim)
+
+ Not all retmodes are possible with all rettypes:
+ PubMed Options:
+ uilist abstract citation medline
+ xml x x* x* x*
+ text x x x x
+ html x x x x
+ asn.1 n/a x* x* x
+ x = retrieval mode available
+ * returned retrieval type is the complete record in the retrieval mode
+ n/a - not available
+ OMIM Options: (not case sensitive)
+ uilist docsum synopsis variants detailed ExternalLink
+ (MIM (Clinical (Allelic
+ numbers) synopsis) Variants)
+ xml x x* x* x* x* x*
+ text x x x x x* x*
+ html x x x x x* x*
+ asn.1 x* x* x* x* x* x*
+ x = retrieval mode available
+ * returned retrieval type is the complete record in the retrieval mode
+ n/a - not available
+
+ """
+ idstring = ""
+ for d in id :
+ idstring += "%s," % d
+ idstring = idstring[:-1] # remove trailing comma
+ params = urlencode ({
+ 'tool' : TOOL,
+ 'email' : EMAIL,
+ 'id' : idstring,
+ 'db': db,
+ 'retmax' : retmax,
+ 'retmode' : retmode,
+ 'rettype' : rettype})
+
+ if debug :
+ print "Getting efetch from '%s?%s'" % (efetch_url, params)
+ f = urllib.urlopen ("%s?%s" % (efetch_url, params))
+ string = f.read()
+ f.close()
+ if debug == True:
+ print string
+ print ""
+ return string
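+
+# For example (an illustrative sketch; performs a network request):
+#   xml_page = _query_efetch(['11877539'], db='pubmed',
+#                            retmode='xml', rettype='medline')
+#   # xml_page holds the medline xml for that citation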
+
+
+## Fetch links by Primary ID from an Entrez database
+
+def _query_elink(id, term=None, db='all', dbfrom='pubmed',
+ cmd=None, linkname=None, holding=None,
+ version=1,
+ reldate=None, daterange=None, datetype=None,
+ retmode='xml',
+ debug=False) :
+ """
+ Fetch links from a list of primary IDs in an Entrez database.
+ http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
+ http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html
+
+ Does not currently support the WebEnv or query_key parameters.
+
+ Help with the arguments adapted from efetchlit_help.html:
+
+ id: Primary UIs identifying the documents to fetch
+ For example: 'id=11877539, 11822933,11871444'
+
+ term: This command uses search terms or phrases with or without Boolean operators
+ to limit the returned matching links.
+
+ db: This command selects the databases to be searched for link targets.
+ For example: db=all
+
+ dbfrom: This command selects the database containing the ids.
+ For example: dbfrom=pubmed
+
+
+ cmd: Link commands
+ * prlinks - List the hyperlink to the primary LinkOut provider for
+ multiple IDs and database. Each ID is processed separately.
+ * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider
+ for a single ID and database. Return the elink
+ command, since fetching it breaks the relative
+ links in the publisher's page.
+ * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
+ multiple IDs and database. Each ID is processed separately.
+ * llinkslib - List LinkOut URLs and Attributes for multiple IDs and
+ database. Each ID is processed separately.
+ * lcheck - Check for the existence (Y or N) of an external link in for
+ multiple IDs and database.
+ * ncheck - Check for the existence of a neighbor link for each ID within
+ a database, e.g., Related Articles in PubMed.
+ * neighbor - Display neighbors within a database.
+ * neighbor_history - Create history (WebEnv & query_key) for use in other
+ EUtilities.
+ * acheck - Lists Entrez databases links for multiple IDs from a single
+ database.
+
+ linkname: link to a specific neighbor subset
+ For example: linkname=nucleotide_nucleotide_comp
+
+ holding: List LinkOut URLs for the specified holding provider, (library).
+ Used only in conjunction with cmd=llinks or cmd=llinkslib
+ For example: cmd=llinkslib&holding=medlib
+
+ version: Include a version number to refer to the latest DTD.
+ For example: version=1
+ retrieves the latest DTD (eLink_050511.dtd) that includes the additional
+ elements, MenuTag, LinkInfo and IdLinkSet.
+
+    Date commands are only valid for dbfrom=pubmed & cmd=neighbor
+    reldate: Limit items to the number of days immediately preceding today's date.
+ For example: reldate=365
+
+ daterange: Limit results bounded by two specific dates.
+ For example: daterange=('2001', '2002/01/01')
+ (implemented as mindate=2001&maxdate=2002/01/01)
+
+ datetype: Limit dates to a specific date field based on database.
+ For example: datetype=edat
+
+ retmode: Select the retrieval output format
+ xml (default)
+ ref (only used with cmd=prlinks for one ID)
+
+ """
+ idstring = ""
+ for d in id :
+ idstring += "%s," % d
+ idstring = idstring[:-1] # remove trailing comma
+
+ params = urlencode ({
+ 'tool' : TOOL,
+ 'email' : EMAIL,
+ 'id' : idstring,
+ 'term': term,
+ 'db': db,
+ 'dbfrom': dbfrom,
+ 'cmd': cmd,
+ 'linkname': linkname,
+ 'holding': holding,
+ 'version': version,
+ 'reldate': reldate,
+ 'daterange': daterange,
+ 'datetype': datetype,
+ 'retmode' : retmode})
+
+ if debug :
+ print "Getting elink from '%s?%s'" % (elink_url, params)
+ f = urllib.urlopen ("%s?%s" % (elink_url, params))
+
+ if cmd == 'prlinks' and retmode == 'ref' :
+ # Just get the link, we don't need the provider's webpage HTML.
+ url = f.geturl()
+ f.close()
+ return url
+
+ string = f.read()
+ f.close()
+ if debug == True:
+ print string
+ print ""
+ return string
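+
+# For example (an illustrative sketch; performs a network request):
+#   url = _query_elink(['11877539'], dbfrom='pubmed',
+#                      cmd='prlinks', retmode='ref')
+#   # url should point at the primary LinkOut provider for that article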
+
+
+## Combining the searching and parsing (dropping some of the less used features)
+
+def search_fetch_xml(term, db='pubmed', field=None,
+ reldate=None, daterange=None, datetype=None,
+ retmax=None, sort=None,
+ validate=False, valid_fields=None,
+ retmode='xml', rettype='medline',
+ debug=False) :
+ if validate and valid_fields == None:
+ valid_fields,field_tags,field_info = field_dict(db, debug=debug)
+ search_page = _query_esearch(term, db, field,
+ reldate, daterange, datetype,
+ retmax, rettype='uilist', sort=sort,
+ validate=validate, valid_fields=valid_fields,
+ debug=debug)
+ pid_list = parse_esearch(search_page)
+ fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype, debug)
+ return fetch_page
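+
+# For example (an illustrative sketch; performs two network requests):
+#   medline_xml = search_fetch_xml('arrhenius[AUTH]', retmax=3)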
+
+def search_link(term, db='pubmed', field=None,
+ reldate=None, daterange=None, datetype=None,
+ retmax=None, sort=None,
+ validate=False, valid_fields=None,
+ link_term=None, fromdb=None,
+ cmd=None, linkname=None, link_holding=None,
+ version=1,
+ link_reldate=None, link_daterange=None, link_datetype=None,
+ link_retmode='xml',
+ debug=False) :
+ if validate and valid_fields == None:
+ valid_fields,field_tags,field_info = field_dict(db, debug=debug)
+ search_page = _query_esearch(term, db, field,
+ reldate, daterange, datetype,
+ retmax, rettype='uilist', sort=sort,
+ validate=validate, valid_fields=valid_fields,
+ debug=debug)
+ pid_list = parse_esearch(search_page)
+ link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
+ cmd=cmd, linkname=linkname, holding=link_holding,
+ version=version,reldate=link_reldate,
+ daterange=link_daterange, datetype=link_datetype,
+ retmode=link_retmode,
+ debug=debug)
+ return link_page
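+
+# For example (an illustrative sketch; performs two network requests):
+#   link_xml = search_link('arrhenius[AUTH]', retmax=3, cmd='llinks')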
+
+## Use the external bibutils package to convert to BibTeX format
+
+def medline_xml_to_bibtex(fetch_page):
+    """
+    Convert medline xml to BibTeX by piping it through the external
+    med2xml, xml2bib (bibutils), and bibclean programs.
+    """
+    child_stdout,child_stdin = popen2("med2xml | xml2bib -fc | bibclean")
+    print >> child_stdin, fetch_page
+    child_stdin.close()
+    bibtex = child_stdout.read()
+    child_stdout.close()
+    return bibtex
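+
+# For example (assumes the bibutils tools med2xml and xml2bib, plus
+# bibclean, are installed and on your PATH):
+#   print medline_xml_to_bibtex(search_fetch_xml('arrhenius[AUTH]', retmax=1))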
+
+## Random
+
+def hints(option=None, opt_str=None, value=None, parser=None) :
+    """
+    Print Entrez search hints and exit.
+    The optional arguments let this double as an optparse callback.
+    """
+    print """
+free full text [sb]
+"""
+    sys.exit(0)
+
+## Test with a mini-searching application
+
+if __name__ == "__main__" :
+ from optparse import OptionParser
+
+ usage_string = """%prog [options] SEARCH_TERM (print medline xml matching search)
+ | %prog -l [options] SEARCH_TERM (print links to entries matching search)
+ | %prog -L [-d DATABASE] [-f FILE] (list databases)
+ | %prog -X [-d DATABASE] [-F FIELD] [-f FILE] (list fields in a database, or details on a single field)
+
+2008, W. Trevor King.
+
+See the docstrings in %prog or
+ http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
+for more details.
+"""
+ parser = OptionParser(usage=usage_string, version="%prog 0.1")
+
+    # Explanation by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
+ # "
+ # metavar is the name used in the help for that options required text,
+ # and dest is the name of the property you'll use to access the value of that option.
+ # "
+
+ parser.add_option('-d', '--database', dest="database",
+ help="Search DATABASE (default '%default')",
+ type='string', metavar="DATABASE", default='pubmed')
+ parser.add_option('-f', '--file', dest="filename",
+ help="write output to FILE (default stdout)",
+ type='string', metavar="FILE")
+ parser.add_option('-v', '--verbose', dest="verbose", action="store_true",
+ help="Print lots of debugging information",
+ default=False)
+ parser.add_option('-H', '--hints', callback=hints,
+ help="Print Entrez search hints and exit",
+ action="callback")
+
+
+ # mode control options
+ mode = 'search'
+ def set_mode(option, opt_str, value, parser):
+ global mode
+ long_option = option.get_opt_string()
+ if long_option == '--list-mode' :
+ mode = 'list'
+ elif long_option == '--explain-mode' :
+ mode = 'explain'
+
+ parser.add_option('-L', '--list-mode', callback=set_mode,
+ help="Run in list mode", action="callback")
+ parser.add_option('-X', '--explain-mode', callback=set_mode,
+ help="Run in explain mode", action="callback")
+
+ # search-fetch-xml-to-? options
+ output = 'bibtex'
+ def set_output(option, opt_str, value, parser):
+ global output
+ long_option = option.get_opt_string()
+ if long_option == '--output-link' :
+ output = 'link'
+ parser.add_option('-W', '--raw', dest="raw", action="store_true",
+ help="Output raw Entrez xml", default=False)
+ parser.add_option('-F', '--field', dest="field",
+ help="Limit SEARCH_TERM to FIELD",
+ type='string', metavar="FIELD")
+ parser.add_option('-r', '--reldate', dest="reldate",
+ help="Limit search to dates within DAYS of today",
+ type='string', metavar="DAYS")
+ parser.add_option('-R', '--daterange', dest="daterange",
+ help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')",
+ type='string', metavar="DATERANGE")
+ parser.add_option('-t', '--datetype', dest="datetype",
+ help="Select field to apply date limits to (e.g. 'edat' for Entrez date)",
+ type='string', metavar="DATETYPE")
+    parser.add_option('-m', '--retmax', dest="retmax",
+                      help="Return at most RETMAX items from a successful search (default %default)",
+                      type='string', metavar="RETMAX", default=20)
+ parser.add_option('-M', '--retmode', dest="retmode",
+ help="Select fetch/link output format",
+ type='string', metavar="RETMODE", default='xml')
+ parser.add_option('-V', '--validate', dest="validate", action="store_true",
+ help="Check that FIELD and field tags in SEARCH_TERM are valid for DB",
+ default=False)
+
+ # output link options
+ parser.add_option('-l', '--output-link', callback=set_output,
+ help="Output a link (instead of xml citations)",
+ action="callback")
+ parser.add_option('-c', '--link-cmd', dest="link_cmd",
+ help="Select link output",
+ type='string', metavar="LINK_CMD")
+ parser.add_option('-T', '--link-term', dest="link_term",
+ help="Limit links to those matching LINK_TERM",
+ type='string', metavar="LINK_TERM")
+    parser.add_option('-D', '--from-database', dest="fromdb",
+                      help="Limit links to those from FROMDATABASE",
+                      type='string', metavar="FROMDATABASE")
+ parser.add_option('-n', '--link-name', dest="linkname",
+ help="Limit links to a specific neighbor",
+ type='string', metavar="LINKNAME")
+
+ (options, args) = parser.parse_args()
+ parser.destroy()
+
+ # open the output file if specified
+ if options.filename == None :
+ outfile = sys.stdout
+ else :
+ outfile = file(options.filename, 'w')
+
+ if options.verbose :
+ print >> sys.stdout, "Operating in %s mode" % mode
+
+ if mode == 'list' :
+ print >> outfile, "Available databases:"
+ databases = database_list(debug=options.verbose)
+ for db in databases:
+ print >> outfile, "\t%s" % db
+
+ elif mode == 'explain':
+ fields,tags,field_info = field_dict(db=options.database,
+ debug=options.verbose)
+ if options.field == None :
+ print >> outfile, "Available fields in %s:" % options.database
+ field_size = [0,0]
+ for field in fields :
+ if len(field) > field_size[0] :
+ field_size[0] = len(field)
+ if len(field_info[field]['FullName']) > field_size[1] :
+ field_size[1] = len(field_info[field]['FullName'])
+ for field in fields :
+ print >> outfile, "\t%*.*s\t%-*.*s" \
+ % (field_size[0], field_size[0], field,
+ field_size[1], field_size[1], field_info[field]['FullName'])
+ else :
+ print >> outfile, "Field %s in %s:" % (options.field,options.database)
+ field_size = [0,0]
+ for key in tags:
+ if len(key) > field_size[0] :
+ field_size[0] = len(key)
+ if len(field_info[options.field][key]) > field_size[1] :
+ field_size[1] = len(field_info[options.field][key])
+ for key in tags:
+ print >> outfile, "\t%*.*s\t%-*.*s" \
+ % (field_size[0], field_size[0], key,
+ field_size[1], field_size[1], field_info[options.field][key])
+
+    elif mode == 'search':
+        if len(args) != 1 :
+            print >> sys.stderr, "Expected a single SEARCH_TERM argument"
+            sys.exit(1)
+        search_term = args[0]
+ if options.verbose :
+ print >> sys.stdout, "Output %s" % output
+
+ if output == 'bibtex' :
+ medline_xml = search_fetch_xml(term=search_term,
+ db=options.database,
+ field=options.field,
+ reldate=options.reldate,
+ daterange=options.daterange,
+ datetype=options.datetype,
+ retmax=options.retmax,
+ validate=options.validate,
+ retmode=options.retmode,
+ rettype='medline',
+ debug=options.verbose)
+            if options.raw :
+                print >> outfile, medline_xml
+ else :
+ bibtex = medline_xml_to_bibtex(medline_xml)
+ print >> outfile, bibtex
+
+ elif output == 'link' :
+ # Assume that if you're looking for links
+ # your search is already pretty refined,
+ # so use the date options for link-limiting.
+ link_xml = search_link(term=search_term,
+ db=options.database,
+ field=options.field,
+ reldate=None,
+ daterange=None,
+ datetype=None,
+ retmax=None,
+ sort=None,
+ validate=options.validate,
+ valid_fields=None,
+ link_term=options.link_term,
+ fromdb=options.fromdb,
+ cmd=options.link_cmd,
+ linkname=options.linkname,
+ link_holding=None,
+ version=1,
+ link_reldate=options.reldate,
+ link_daterange=options.daterange,
+ link_datetype=options.datetype,
+ link_retmode=options.retmode,
+ debug=options.verbose)
+ print >> outfile, link_xml
+
+ if options.filename != None :
+ outfile.close()