From: W. Trevor King Date: Sat, 16 Apr 2011 02:01:22 +0000 (-0400) Subject: Wrap long lines in entrez.py. X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=012c283303511ac31fb35b320107c5556e82f8bb;p=mw2txt.git Wrap long lines in entrez.py. --- diff --git a/posts/entrez/entrez.py b/posts/entrez/entrez.py index 61948b2..bdf6235 100755 --- a/posts/entrez/entrez.py +++ b/posts/entrez/entrez.py @@ -23,13 +23,20 @@ # Current as of August 1, 2007 # # Rules: -# * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests. -# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address. +# * Run retrieval scripts on weekends or between 9 pm and 5 am +# Eastern Time weekdays for any series of more than 100 requests. +# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, +# not the standard NCBI Web address. # * Make no more than one request every 3 seconds. -# * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem. -# * NCBI's Disclaimer and Copyright notice must be evident to users of your service. -# NLM does not claim the copyright on the abstracts in PubMed; however, journal publishers or authors may. -# NLM provides no legal advice concerning distribution of copyrighted materials, consult your legal counsel. +# * Use the URL parameter email, and tool for distributed software, +# so that we can track your project and contact you if there is a +# problem. +# * NCBI's Disclaimer and Copyright notice must be evident to users +# of your service. +# * NLM does not claim the copyright on the abstracts in PubMed; +# however, journal publishers or authors may. +# * NLM provides no legal advice concerning distribution of +# copyrighted materials, consult your legal counsel. # # For a good Python-and-XML-DOM intro, see # http://www.boddie.org.uk/python/XML_intro.html @@ -63,6 +70,8 @@ if _POSIX: import select +__version__ = '0.2' + # Entrez access points EINFO_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi' ESEARCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' @@ -137,7 +146,8 @@ def get_child_nodes(node, child_name): returns an the node matching """ nodes = get_child_node(node, child_name) - assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name) + assert len(nodes) == 1, '%d child nodes named %s' % ( + len(nodes), child_name) return node[0] def get_child_contents(node, child_name): @@ -202,8 +212,9 @@ def _query_einfo(db=None): Get information about the Entrez databases themselves. http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html - Either list all available databases with db=None, or - Specific information on a particular database (e.g. pubmed) with db=pubmed. + Either list all available databases with `db=None`, or specific + information on a particular database (e.g. pubmed) with + `db=pubmed`. """ params = urlencode({ 'db': db, @@ -264,14 +275,17 @@ def field_dict(db='pubmed', page=None, parsed=None): tags = [] field_info = {} fieldlists = parsed.getElementsByTagName("FieldList") - assert len(fieldlists) == 1, "%s\n\n%d FieldLists!" % (parsed.toxml(), len(fieldlists)) + assert len(fieldlists) == 1, '%s\n\n%d FieldLists!' % ( + parsed.toxml(), len(fieldlists)) fieldlist = fieldlists[0] for node in fieldlist.childNodes: if node.nodeType != node.ELEMENT_NODE: continue # ignore text, comment, etc. nodes - assert node.tagName == "Field", "Unrecognized tag '%s' in FieldList" % node.tagName + assert node.tagName == 'Field', ( + "Unrecognized tag '%s' in FieldList" % node.tagName) field,new_tags = get_child_dict(node) - assert len(field['Name']) == 1, "Multiple field names %s" % str(field['Name']) + assert len(field['Name']) == 1, ( + 'Multiple field names %s' % str(field['Name'])) field = delist_dict(field) fields.append(field['Name']) new_tags = unique(tags + new_tags) @@ -288,14 +302,17 @@ def link_dict(db='pubmed', page=None, parsed=None): tags = [] link_info = [] linklists = parsed.getElementsByTagName("LinkList") - assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists)) + assert len(linklists) == 1, ( + '%s\n\n%d LinkLists!' % (parsed.toxml(), len(linklists))) linklist = linklists[0] for node in linklist.childNodes: if node.nodeType != node.ELEMENT_NODE: continue # ignore text, comment, etc. nodes - assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName + assert node.tagName == 'Link', ( + "Unrecognized tag '%s' in LinkList" % node.tagName) link,new_tags = get_child_dict(node) - assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name']) + assert len(link['Name']) == 1, ( + 'Multiple link names %s' % str(link['Name'])) link = delist_dict(link) links.append(link['Name']) new_tags = unique(tags + new_tags) @@ -319,8 +336,8 @@ def validate_field(field, fields): try: fields.index(field.upper()) except ValueError: - raise Exception, "Field '%s' invalid\nValid fields are\n %s" \ - % (field, str(fields)) + raise Exception("Field '%s' invalid\nValid fields are\n %s" + % (field, str(fields))) def strip_fields_from_term(term): "HACK: really stupid algorithm" @@ -351,14 +368,17 @@ def _query_esearch(term, db='pubmed', field=None, Search an Entrez database. http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html - Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters. + Does not currently support the usehistory, WebEnv, query_key, + retstart, or retmode parameters. Help with the arguments adapted from esearch_help.html: - term: This command uses search terms or phrases with or without Boolean operators. + term: This command uses search terms or phrases with or without + Boolean operators. You can search in several fields using the [term field] tag. You can search in a single field using the 'field' parameter below. - ?You may also tag search terms using field=tag.? I don't understand this line + ?You may also tag search terms using field=tag.? I don't + understand this line For example: term=asthma[MESH]+OR+hay+fever[MESH] 'term=asthma[MESH]' is the same as 'term=asthma&field=MESH' ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea ) @@ -392,7 +412,9 @@ def _query_esearch(term, db='pubmed', field=None, """ if daterange != None: - assert len(daterange) == 2, "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')" + assert len(daterange) == 2, ( + "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')" + % (daterange,)) reldate == None, "Specifying date with daterange AND reldate!" mindate = daterange[0] maxdate = daterange[1] @@ -400,7 +422,8 @@ def _query_esearch(term, db='pubmed', field=None, mindate = None maxdate = None if validate: - assert len(valid_fields) > 0, "Need a list of valid fields to validate" + assert len(valid_fields) > 0, ( + 'Need a list of valid fields to validate') if field != None: validate_field(field) validate_search_term(term, valid_fields) @@ -449,7 +472,8 @@ def _query_efetch(id, db='pubmed', http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html - Does not currently support the usehistory, WebEnv, query_key, or retstart parameters. + Does not currently support the usehistory, WebEnv, query_key, or + retstart parameters. Help with the arguments adapted from efetchlit_help.html: @@ -538,8 +562,8 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed', id: Primary UIs identifying the documents to fetch For example: 'id=11877539, 11822933,11871444' - term: This command uses search terms or phrases with or without Boolean operators - to limit the returned matching links. + term: This command uses search terms or phrases with or without + Boolean operators to limit the returned matching links. db: This command selects the databases to be searched for link targets. For example: db=all @@ -551,10 +575,11 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed', cmd: Link commands * prlinks - List the hyperlink to the primary LinkOut provider for multiple IDs and database. Each ID is processed separately. - * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider - for a single ID and database. Return the elink - command, since fetching it breaks the relative - links in the publisher's page. + * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut + provider for a single ID and database. + Return the elink command, since fetching + it breaks the relative links in the + publisher's page. * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for multiple IDs and database. Each ID is processed separately. * llinkslib - List LinkOut URLs and Attributes for multiple IDs and @@ -633,7 +658,8 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed', return string -## Combining the searching and parsing (dropping some of the less used features) +## Combining the searching and parsing (dropping some of the less used +## features) def search_fetch_xml(term, db='pubmed', field=None, reldate=None, daterange=None, datetype=None, @@ -803,7 +829,7 @@ class Pipe (object): while read_set or write_set: LOG.debug('select on read %s, write %s' % (read_set,write_set)) try: - rlist, wlist, xlist = select.select(read_set, write_set, []) + rlist,wlist,xlist = select.select(read_set, write_set, []) except select.error, e: if e.args[0] == errno.EINTR: LOG.debug('EINTR') @@ -859,7 +885,9 @@ def medline_xml_to_bibtex(fetch_page): >>> xml = '\\n'.join([ ... '', - ... '', + ... '', ... '', ... ' ', ... ' ', @@ -952,23 +980,32 @@ free full text [sb] if __name__ == "__main__": from optparse import OptionParser - usage_string = """%prog [options] SEARCH_TERM (print medline xml matching search) - | %prog -l [options] SEARCH_TERM (print links to entries matching search) - | %prog -L [-d DATABASE] [-f FILE] (list databases) - | %prog -X [-d DATABASE] [-F FIELD] [-f FILE] (list fields in a database, or details on a single field) - -2008, W. Trevor King. - -See the docstrings in %prog or - http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html -for more details. -""" - parser = OptionParser(usage=usage_string, version="%prog 0.1") + usage_string = '\n'.join([ + '', + ' %prog [options] SEARCH_TERM' + ' (print medline xml matching search)', + '| %prog -l [options] SEARCH_TERM' + ' (print links to entries matching search)', + '| %prog -L [-d DATABASE] [-f FILE] (list databases)', + '| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]' + ' (list fields in a database, or details on a single field)', + '', + '2008-2011, W. Trevor King.', + '', + 'See the docstrings in %prog or', + ' http://www.ncbi.nlm.nih.gov/entrez/query/static/' + 'eutils_help.html', + 'for more details.' + ]) + + parser = OptionParser( + usage=usage_string, version='%%prog %s' % __version__) # Explaination by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511 # " - # metavar is the name used in the help for that options required text, - # and dest is the name of the property you'll use to access the value of that option. + # metavar is the name used in the help for that options required + # text, and dest is the name of the property you'll use to access + # the value of that option. # " parser.add_option('-d', '--database', dest="database", @@ -1016,19 +1053,23 @@ for more details. help="Limit search to dates within DAYS of today", type='string', metavar="DAYS") parser.add_option('-R', '--daterange', dest="daterange", - help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')", + help=("Limit search to dates within DATERANGE " + "(e.g. '2001/1/1,2002')"), type='string', metavar="DATERANGE") parser.add_option('-t', '--datetype', dest="datetype", - help="Select field to apply date limits to (e.g. 'edat' for Entrez date)", + help=("Select field to apply date limits to " + "(e.g. 'edat' for Entrez date)"), type='string', metavar="DATETYPE") parser.add_option('-m', '--retmax', dest="retmax", - help="Return at max RETMAX items from a successful search (default %default)", + help=('Return at max RETMAX items from a successful ' + 'search (default %default)'), type='string', metavar="RETMAX", default=20) parser.add_option('-M', '--retmode', dest="retmode", help="Select fetch/link output format", type='string', metavar="RETMODE", default='xml') parser.add_option('-V', '--validate', dest="validate", action="store_true", - help="Check that FIELD and field tags in SEARCH_TERM are valid for DB", + help=('Check that FIELD and field tags in SEARCH_TERM ' + 'are valid for DB'), default=False) # output link options @@ -1079,11 +1120,13 @@ for more details. if len(field_info[field]['FullName']) > field_size[1]: field_size[1] = len(field_info[field]['FullName']) for field in fields: - print >> outfile, "\t%*.*s\t%-*.*s" \ - % (field_size[0], field_size[0], field, - field_size[1], field_size[1], field_info[field]['FullName']) + print >> outfile, ('\t%*.*s\t%-*.*s' + % (field_size[0], field_size[0], field, + field_size[1], field_size[1], + field_info[field]['FullName'])) else: - print >> outfile, "Field %s in %s:" % (options.field,options.database) + print >> outfile, ( + 'Field %s in %s:' % (options.field,options.database)) field_size = [0,0] for key in tags: if len(key) > field_size[0]: @@ -1091,9 +1134,10 @@ for more details. if len(field_info[options.field][key]) > field_size[1]: field_size[1] = len(field_info[options.field][key]) for key in tags: - print >> outfile, "\t%*.*s\t%-*.*s" \ - % (field_size[0], field_size[0], key, - field_size[1], field_size[1], field_info[options.field][key]) + print >> outfile, ('\t%*.*s\t%-*.*s' + % (field_size[0], field_size[0], key, + field_size[1], field_size[1], + field_info[options.field][key])) elif mode == 'search': search_term = args[0]