# Current as of August 1, 2007
#
# Rules:
-# * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests.
-# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address.
+# * Run retrieval scripts on weekends or between 9 pm and 5 am
+# Eastern Time weekdays for any series of more than 100 requests.
+# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov,
+# not the standard NCBI Web address.
# * Make no more than one request every 3 seconds.
-# * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem.
-# * NCBI's Disclaimer and Copyright notice must be evident to users of your service.
-# NLM does not claim the copyright on the abstracts in PubMed; however, journal publishers or authors may.
-# NLM provides no legal advice concerning distribution of copyrighted materials, consult your legal counsel.
+# * Use the URL parameter email, and tool for distributed software,
+# so that we can track your project and contact you if there is a
+# problem.
+# * NCBI's Disclaimer and Copyright notice must be evident to users
+# of your service.
+# * NLM does not claim the copyright on the abstracts in PubMed;
+# however, journal publishers or authors may.
+# * NLM provides no legal advice concerning distribution of
+# copyrighted materials, consult your legal counsel.
#
# For a good Python-and-XML-DOM intro, see
# http://www.boddie.org.uk/python/XML_intro.html
import select
+__version__ = '0.2'
+
# Entrez access points
EINFO_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
ESEARCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
returns an the node matching <child-name>
"""
nodes = get_child_node(node, child_name)
- assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name)
+ assert len(nodes) == 1, '%d child nodes named %s' % (
+ len(nodes), child_name)
return node[0]
def get_child_contents(node, child_name):
Get information about the Entrez databases themselves.
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
- Either list all available databases with db=None, or
- Specific information on a particular database (e.g. pubmed) with db=pubmed.
+ Either list all available databases with `db=None`, or specific
+ information on a particular database (e.g. pubmed) with
+ `db=pubmed`.
"""
params = urlencode({
'db': db,
tags = []
field_info = {}
fieldlists = parsed.getElementsByTagName("FieldList")
- assert len(fieldlists) == 1, "%s\n\n%d FieldLists!" % (parsed.toxml(), len(fieldlists))
+ assert len(fieldlists) == 1, '%s\n\n%d FieldLists!' % (
+ parsed.toxml(), len(fieldlists))
fieldlist = fieldlists[0]
for node in fieldlist.childNodes:
if node.nodeType != node.ELEMENT_NODE:
continue # ignore text, comment, etc. nodes
- assert node.tagName == "Field", "Unrecognized tag '%s' in FieldList" % node.tagName
+ assert node.tagName == 'Field', (
+ "Unrecognized tag '%s' in FieldList" % node.tagName)
field,new_tags = get_child_dict(node)
- assert len(field['Name']) == 1, "Multiple field names %s" % str(field['Name'])
+ assert len(field['Name']) == 1, (
+ 'Multiple field names %s' % str(field['Name']))
field = delist_dict(field)
fields.append(field['Name'])
new_tags = unique(tags + new_tags)
tags = []
link_info = []
linklists = parsed.getElementsByTagName("LinkList")
- assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists))
+ assert len(linklists) == 1, (
+ '%s\n\n%d LinkLists!' % (parsed.toxml(), len(linklists)))
linklist = linklists[0]
for node in linklist.childNodes:
if node.nodeType != node.ELEMENT_NODE:
continue # ignore text, comment, etc. nodes
- assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName
+ assert node.tagName == 'Link', (
+ "Unrecognized tag '%s' in LinkList" % node.tagName)
link,new_tags = get_child_dict(node)
- assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name'])
+ assert len(link['Name']) == 1, (
+ 'Multiple link names %s' % str(link['Name']))
link = delist_dict(link)
links.append(link['Name'])
new_tags = unique(tags + new_tags)
try:
fields.index(field.upper())
except ValueError:
- raise Exception, "Field '%s' invalid\nValid fields are\n %s" \
- % (field, str(fields))
+ raise Exception("Field '%s' invalid\nValid fields are\n %s"
+ % (field, str(fields)))
def strip_fields_from_term(term):
"HACK: really stupid algorithm"
Search an Entrez database.
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
- Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters.
+ Does not currently support the usehistory, WebEnv, query_key,
+ retstart, or retmode parameters.
Help with the arguments adapted from esearch_help.html:
- term: This command uses search terms or phrases with or without Boolean operators.
+ term: This command uses search terms or phrases with or without
+ Boolean operators.
You can search in several fields using the [term field] tag.
You can search in a single field using the 'field' parameter below.
- ?You may also tag search terms using field=tag.? I don't understand this line
+ ?You may also tag search terms using field=tag.? I don't
+ understand this line
For example: term=asthma[MESH]+OR+hay+fever[MESH]
'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )
"""
if daterange != None:
- assert len(daterange) == 2, "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
+ assert len(daterange) == 2, (
+ "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
+ % (daterange,))
reldate == None, "Specifying date with daterange AND reldate!"
mindate = daterange[0]
maxdate = daterange[1]
mindate = None
maxdate = None
if validate:
- assert len(valid_fields) > 0, "Need a list of valid fields to validate"
+ assert len(valid_fields) > 0, (
+ 'Need a list of valid fields to validate')
if field != None:
validate_field(field)
validate_search_term(term, valid_fields)
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
- Does not currently support the usehistory, WebEnv, query_key, or retstart parameters.
+ Does not currently support the usehistory, WebEnv, query_key, or
+ retstart parameters.
Help with the arguments adapted from efetchlit_help.html:
id: Primary UIs identifying the documents to fetch
For example: 'id=11877539, 11822933,11871444'
- term: This command uses search terms or phrases with or without Boolean operators
- to limit the returned matching links.
+ term: This command uses search terms or phrases with or without
+ Boolean operators to limit the returned matching links.
db: This command selects the databases to be searched for link targets.
For example: db=all
cmd: Link commands
* prlinks - List the hyperlink to the primary LinkOut provider for
multiple IDs and database. Each ID is processed separately.
- * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider
- for a single ID and database. Return the elink
- command, since fetching it breaks the relative
- links in the publisher's page.
+ * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut
+ provider for a single ID and database.
+ Return the elink command, since fetching
+ it breaks the relative links in the
+ publisher's page.
* llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
multiple IDs and database. Each ID is processed separately.
* llinkslib - List LinkOut URLs and Attributes for multiple IDs and
return string
-## Combining the searching and parsing (dropping some of the less used features)
+## Combining the searching and parsing (dropping some of the less used
+## features)
def search_fetch_xml(term, db='pubmed', field=None,
reldate=None, daterange=None, datetype=None,
while read_set or write_set:
LOG.debug('select on read %s, write %s' % (read_set,write_set))
try:
- rlist, wlist, xlist = select.select(read_set, write_set, [])
+ rlist,wlist,xlist = select.select(read_set, write_set, [])
except select.error, e:
if e.args[0] == errno.EINTR:
LOG.debug('EINTR')
>>> xml = '\\n'.join([
... '<?xml version="1.0"?>',
- ... '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_110101.dtd">',
+ ... '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, '
+ ... '1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query'
+ ... '/DTD/pubmed_110101.dtd">',
... '<PubmedArticleSet>',
... ' <PubmedArticle>',
... ' <MedlineCitation Owner="NLM" Status="MEDLINE">',
if __name__ == "__main__":
from optparse import OptionParser
- usage_string = """%prog [options] SEARCH_TERM (print medline xml matching search)
- | %prog -l [options] SEARCH_TERM (print links to entries matching search)
- | %prog -L [-d DATABASE] [-f FILE] (list databases)
- | %prog -X [-d DATABASE] [-F FIELD] [-f FILE] (list fields in a database, or details on a single field)
-
-2008, W. Trevor King.
-
-See the docstrings in %prog or
- http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
-for more details.
-"""
- parser = OptionParser(usage=usage_string, version="%prog 0.1")
+ usage_string = '\n'.join([
+ '',
+ ' %prog [options] SEARCH_TERM'
+ ' (print medline xml matching search)',
+ '| %prog -l [options] SEARCH_TERM'
+ ' (print links to entries matching search)',
+ '| %prog -L [-d DATABASE] [-f FILE] (list databases)',
+ '| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]'
+ ' (list fields in a database, or details on a single field)',
+ '',
+ '2008-2011, W. Trevor King.',
+ '',
+ 'See the docstrings in %prog or',
+ ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
+ 'eutils_help.html',
+ 'for more details.'
+ ])
+
+ parser = OptionParser(
+ usage=usage_string, version='%%prog %s' % __version__)
# Explaination by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
# "
- # metavar is the name used in the help for that options required text,
- # and dest is the name of the property you'll use to access the value of that option.
+ # metavar is the name used in the help for that options required
+ # text, and dest is the name of the property you'll use to access
+ # the value of that option.
# "
parser.add_option('-d', '--database', dest="database",
help="Limit search to dates within DAYS of today",
type='string', metavar="DAYS")
parser.add_option('-R', '--daterange', dest="daterange",
- help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')",
+ help=("Limit search to dates within DATERANGE "
+ "(e.g. '2001/1/1,2002')"),
type='string', metavar="DATERANGE")
parser.add_option('-t', '--datetype', dest="datetype",
- help="Select field to apply date limits to (e.g. 'edat' for Entrez date)",
+ help=("Select field to apply date limits to "
+ "(e.g. 'edat' for Entrez date)"),
type='string', metavar="DATETYPE")
parser.add_option('-m', '--retmax', dest="retmax",
- help="Return at max RETMAX items from a successful search (default %default)",
+ help=('Return at max RETMAX items from a successful '
+ 'search (default %default)'),
type='string', metavar="RETMAX", default=20)
parser.add_option('-M', '--retmode', dest="retmode",
help="Select fetch/link output format",
type='string', metavar="RETMODE", default='xml')
parser.add_option('-V', '--validate', dest="validate", action="store_true",
- help="Check that FIELD and field tags in SEARCH_TERM are valid for DB",
+ help=('Check that FIELD and field tags in SEARCH_TERM '
+ 'are valid for DB'),
default=False)
# output link options
if len(field_info[field]['FullName']) > field_size[1]:
field_size[1] = len(field_info[field]['FullName'])
for field in fields:
- print >> outfile, "\t%*.*s\t%-*.*s" \
- % (field_size[0], field_size[0], field,
- field_size[1], field_size[1], field_info[field]['FullName'])
+ print >> outfile, ('\t%*.*s\t%-*.*s'
+ % (field_size[0], field_size[0], field,
+ field_size[1], field_size[1],
+ field_info[field]['FullName']))
else:
- print >> outfile, "Field %s in %s:" % (options.field,options.database)
+ print >> outfile, (
+ 'Field %s in %s:' % (options.field,options.database))
field_size = [0,0]
for key in tags:
if len(key) > field_size[0]:
if len(field_info[options.field][key]) > field_size[1]:
field_size[1] = len(field_info[options.field][key])
for key in tags:
- print >> outfile, "\t%*.*s\t%-*.*s" \
- % (field_size[0], field_size[0], key,
- field_size[1], field_size[1], field_info[options.field][key])
+ print >> outfile, ('\t%*.*s\t%-*.*s'
+ % (field_size[0], field_size[0], key,
+ field_size[1], field_size[1],
+ field_info[options.field][key]))
elif mode == 'search':
search_term = args[0]