Wrap long lines in entrez.py.

author W. Trevor King <wking@drexel.edu>

Sat, 16 Apr 2011 02:01:22 +0000 (22:01 -0400)

committer W. Trevor King <wking@drexel.edu>

Sat, 16 Apr 2011 02:01:22 +0000 (22:01 -0400)
author W. Trevor King <wking@drexel.edu>
Sat, 16 Apr 2011 02:01:22 +0000 (22:01 -0400)
committer W. Trevor King <wking@drexel.edu>
Sat, 16 Apr 2011 02:01:22 +0000 (22:01 -0400)
diff --git a/posts/entrez/entrez.py b/posts/entrez/entrez.py

index 61948b272f8d9751854205b46aa1be2a60642bcc..bdf6235bb75ca63ee22aaf8dea415b492b7c5b8c 100755 (executable)
--- a/posts/entrez/entrez.py
+++ b/posts/entrez/entrez.py
@@ -23,13 +23,20 @@
  # Current as of August 1, 2007
  #
  # Rules:
-#    * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests.
-#    * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address.
+#    * Run retrieval scripts on weekends or between 9 pm and 5 am
+#      Eastern Time weekdays for any series of more than 100 requests.
+#    * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov,
+#      not the standard NCBI Web address.
  #    * Make no more than one request every 3 seconds.
-#    * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem.
-#    * NCBI's Disclaimer and Copyright notice must be evident to users of your service.
-#      NLM does not claim the copyright on the abstracts in PubMed; however, journal publishers or authors may.
-#      NLM provides no legal advice concerning distribution of copyrighted materials, consult your legal counsel.
+#    * Use the URL parameter email, and tool for distributed software,
+#      so that we can track your project and contact you if there is a
+#      problem.
+#    * NCBI's Disclaimer and Copyright notice must be evident to users
+#      of your service.
+#    * NLM does not claim the copyright on the abstracts in PubMed;
+#      however, journal publishers or authors may.
+#    * NLM provides no legal advice concerning distribution of
+#      copyrighted materials, consult your legal counsel.
  #
  # For a good Python-and-XML-DOM intro, see
  #  http://www.boddie.org.uk/python/XML_intro.html
@@ -63,6 +70,8 @@ if _POSIX:
      import select
  
  
+__version__ = '0.2'
+
  # Entrez access points
  EINFO_URL   = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
  ESEARCH_URL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
@@ -137,7 +146,8 @@ def get_child_nodes(node, child_name):
      returns the node matching <child-name>
      """
      nodes = get_child_node(node, child_name)
-    assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name)
+    assert len(nodes) == 1, '%d child nodes named %s' % (
+        len(nodes), child_name)
      return node[0]
  
  def get_child_contents(node, child_name):
@@ -202,8 +212,9 @@ def _query_einfo(db=None):
      Get information about the Entrez databases themselves.
      http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
  
-    Either list all available databases with db=None, or
-    Specific information on a particular database (e.g. pubmed) with db=pubmed.
+    Either list all available databases with `db=None`, or specific
+    information on a particular database (e.g. pubmed) with
+    `db=pubmed`.
      """
      params = urlencode({
              'db': db,
@@ -264,14 +275,17 @@ def field_dict(db='pubmed', page=None, parsed=None):
      tags = []
      field_info = {}
      fieldlists = parsed.getElementsByTagName("FieldList")
-    assert len(fieldlists) == 1, "%s\n\n%d FieldLists!" % (parsed.toxml(), len(fieldlists))
+    assert len(fieldlists) == 1, '%s\n\n%d FieldLists!' % (
+        parsed.toxml(), len(fieldlists))
      fieldlist = fieldlists[0]
      for node in fieldlist.childNodes:
          if node.nodeType != node.ELEMENT_NODE:
              continue # ignore text, comment, etc. nodes
-        assert node.tagName == "Field", "Unrecognized tag '%s' in FieldList" % node.tagName
+        assert node.tagName == 'Field', (
+            "Unrecognized tag '%s' in FieldList" % node.tagName)
          field,new_tags = get_child_dict(node)
-        assert len(field['Name']) == 1, "Multiple field names %s" % str(field['Name'])
+        assert len(field['Name']) == 1, (
+            'Multiple field names %s' % str(field['Name']))
          field = delist_dict(field)
          fields.append(field['Name'])
          new_tags = unique(tags + new_tags)
@@ -288,14 +302,17 @@ def link_dict(db='pubmed', page=None, parsed=None):
      tags = []
      link_info = []
      linklists = parsed.getElementsByTagName("LinkList")
-    assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists))
+    assert len(linklists) == 1, (
+        '%s\n\n%d LinkLists!' % (parsed.toxml(), len(linklists)))
      linklist = linklists[0]
      for node in linklist.childNodes:
          if node.nodeType != node.ELEMENT_NODE:
              continue # ignore text, comment, etc. nodes
-        assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName
+        assert node.tagName == 'Link', (
+            "Unrecognized tag '%s' in LinkList" % node.tagName)
          link,new_tags = get_child_dict(node)
-        assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name'])
+        assert len(link['Name']) == 1, (
+            'Multiple link names %s' % str(link['Name']))
          link = delist_dict(link)
          links.append(link['Name'])
          new_tags = unique(tags + new_tags)
@@ -319,8 +336,8 @@ def validate_field(field, fields):
      try:
          fields.index(field.upper())
      except ValueError:
-        raise Exception, "Field '%s' invalid\nValid fields are\n %s" \
-                         % (field, str(fields))
+        raise Exception("Field '%s' invalid\nValid fields are\n %s"
+                        % (field, str(fields)))
  
  def strip_fields_from_term(term):
      "HACK: really stupid algorithm"
@@ -351,14 +368,17 @@ def _query_esearch(term, db='pubmed', field=None,
      Search an Entrez database.
      http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html
  
-    Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters.
+    Does not currently support the usehistory, WebEnv, query_key,
+    retstart, or retmode parameters.
  
      Help with the arguments adapted from esearch_help.html:
  
-    term: This command uses search terms or phrases with or without Boolean operators.
+    term: This command uses search terms or phrases with or without
+    Boolean operators.
       You can search in several fields using the [term field] tag.
       You can search in a single field using the 'field' parameter below.
-     ?You may also tag search terms using field=tag.? I don't understand this line
+     ?You may also tag search terms using field=tag.? I don't
+       understand this line
       For example: term=asthma[MESH]+OR+hay+fever[MESH]
        'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
        ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )
@@ -392,7 +412,9 @@ def _query_esearch(term, db='pubmed', field=None,
  
      """
      if daterange != None:
-        assert len(daterange) == 2, "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
+        assert len(daterange) == 2, (
+            "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
+            % (daterange,))
          reldate == None, "Specifying date with daterange AND reldate!"
          mindate = daterange[0]
          maxdate = daterange[1]
@@ -400,7 +422,8 @@ def _query_esearch(term, db='pubmed', field=None,
          mindate = None
          maxdate = None
      if validate:
-        assert len(valid_fields) > 0, "Need a list of valid fields to validate"
+        assert len(valid_fields) > 0, (
+            'Need a list of valid fields to validate')
          if field != None:
              validate_field(field)
          validate_search_term(term, valid_fields)
@@ -449,7 +472,8 @@ def _query_efetch(id, db='pubmed',
      http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html
  
  
-    Does not currently support the usehistory, WebEnv, query_key, or retstart parameters.
+    Does not currently support the usehistory, WebEnv, query_key, or
+    retstart parameters.
  
      Help with the arguments adapted from efetchlit_help.html:
  
@@ -538,8 +562,8 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed',
      id: Primary UIs identifying the documents to fetch
       For example: 'id=11877539, 11822933,11871444'
  
-    term: This command uses search terms or phrases with or without Boolean operators
-     to limit the returned matching links.
+    term: This command uses search terms or phrases with or without
+     Boolean operators to limit the returned matching links.
  
      db: This command selects the databases to be searched for link targets.
       For example: db=all
@@ -551,10 +575,11 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed',
      cmd: Link commands
       * prlinks - List the hyperlink to the primary LinkOut provider for
                   multiple IDs and database. Each ID is processed separately.
-     * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider
-                             for a single ID and database.  Return the elink
-                             command, since fetching it breaks the relative
-                             links in the publisher's page.
+     * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut
+                             provider for a single ID and database.
+                             Return the elink command, since fetching
+                             it breaks the relative links in the
+                             publisher's page.
       * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
                  multiple IDs and database. Each ID is processed separately.
       * llinkslib - List LinkOut URLs and Attributes for multiple IDs and
@@ -633,7 +658,8 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed',
      return string
  
  
-## Combining the searching and parsing (dropping some of the less used features)
+## Combining the searching and parsing (dropping some of the less used
+## features)
  
  def search_fetch_xml(term, db='pubmed', field=None,
                       reldate=None, daterange=None, datetype=None,
@@ -803,7 +829,7 @@ class Pipe (object):
              while read_set or write_set:
                  LOG.debug('select on read %s, write %s' % (read_set,write_set))
                  try:
-                    rlist, wlist, xlist = select.select(read_set, write_set, [])
+                    rlist,wlist,xlist = select.select(read_set, write_set, [])
                  except select.error, e:
                      if e.args[0] == errno.EINTR:
                          LOG.debug('EINTR')
@@ -859,7 +885,9 @@ def medline_xml_to_bibtex(fetch_page):
  
      >>> xml = '\\n'.join([
      ...     '<?xml version="1.0"?>',
-    ...     '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_110101.dtd">',
+    ...     '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, '
+    ...     '1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query'
+    ...     '/DTD/pubmed_110101.dtd">',
      ...     '<PubmedArticleSet>',
      ...     ' <PubmedArticle>',
      ...     '  <MedlineCitation Owner="NLM" Status="MEDLINE">',
@@ -952,23 +980,32 @@ free full text [sb]
  if __name__ == "__main__":
      from optparse import OptionParser
  
-    usage_string = """%prog [options] SEARCH_TERM       (print medline xml matching search)
-     | %prog -l [options] SEARCH_TERM    (print links to entries matching search)
-     | %prog -L [-d DATABASE] [-f FILE]  (list databases)
-     | %prog -X [-d DATABASE] [-F FIELD] [-f FILE]  (list fields in a database, or details on a single field)
-
-2008, W. Trevor King.
-
-See the docstrings in %prog or
- http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
-for more details.
-"""
-    parser = OptionParser(usage=usage_string, version="%prog 0.1")
+    usage_string = '\n'.join([
+            '',
+            '  %prog [options] SEARCH_TERM'
+             '       (print medline xml matching search)',
+            '| %prog -l [options] SEARCH_TERM'
+             '    (print links to entries matching search)',
+            '| %prog -L [-d DATABASE] [-f FILE]  (list databases)',
+            '| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]'
+             '  (list fields in a database, or details on a single field)',
+            '',
+            '2008-2011, W. Trevor King.',
+            '',
+            'See the docstrings in %prog or',
+            ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
+             'eutils_help.html',
+            'for more details.'
+            ])
+
+    parser = OptionParser(
+        usage=usage_string, version='%%prog %s' % __version__)
  
      # Explanation by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
      # "
-    # metavar is the name used in the help for that options required text,
-    # and dest is the name of the property you'll use to access the value of that option.
+    # metavar is the name used in the help for that options required
+    # text, and dest is the name of the property you'll use to access
+    # the value of that option.
      # "
  
      parser.add_option('-d', '--database', dest="database",
@@ -1016,19 +1053,23 @@ for more details.
                        help="Limit search to dates within DAYS of today",
                        type='string', metavar="DAYS")
      parser.add_option('-R', '--daterange', dest="daterange",
-                      help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')",
+                      help=("Limit search to dates within DATERANGE "
+                            "(e.g. '2001/1/1,2002')"),
                        type='string', metavar="DATERANGE")
      parser.add_option('-t', '--datetype', dest="datetype",
-                      help="Select field to apply date limits to (e.g. 'edat' for Entrez date)",
+                      help=("Select field to apply date limits to "
+                            "(e.g. 'edat' for Entrez date)"),
                        type='string', metavar="DATETYPE")
      parser.add_option('-m', '--retmax', dest="retmax",
-                      help="Return at max RETMAX items from a successful search (default %default)",
+                      help=('Return at max RETMAX items from a successful '
+                            'search (default %default)'),
                        type='string', metavar="RETMAX", default=20)
      parser.add_option('-M', '--retmode', dest="retmode",
                        help="Select fetch/link output format",
                        type='string', metavar="RETMODE", default='xml')
      parser.add_option('-V', '--validate', dest="validate", action="store_true",
-                      help="Check that FIELD and field tags in SEARCH_TERM are valid for DB",
+                      help=('Check that FIELD and field tags in SEARCH_TERM '
+                            'are valid for DB'),
                        default=False)
  
      # output link options
@@ -1079,11 +1120,13 @@ for more details.
                  if len(field_info[field]['FullName']) > field_size[1]:
                      field_size[1] = len(field_info[field]['FullName'])
              for field in fields:
-                print >> outfile, "\t%*.*s\t%-*.*s" \
-                    % (field_size[0], field_size[0], field,
-                       field_size[1], field_size[1], field_info[field]['FullName'])
+                print >> outfile, ('\t%*.*s\t%-*.*s'
+                                   % (field_size[0], field_size[0], field,
+                                      field_size[1], field_size[1],
+                                      field_info[field]['FullName']))
          else:
-            print >> outfile, "Field %s in %s:" % (options.field,options.database)
+            print >> outfile, (
+                'Field %s in %s:' % (options.field,options.database))
              field_size = [0,0]
              for key in tags:
                  if len(key) > field_size[0]:
@@ -1091,9 +1134,10 @@ for more details.
                  if len(field_info[options.field][key]) > field_size[1]:
                      field_size[1] = len(field_info[options.field][key])
              for key in tags:
-                print >> outfile, "\t%*.*s\t%-*.*s" \
-                    % (field_size[0], field_size[0], key,
-                       field_size[1], field_size[1], field_info[options.field][key])
+                print >> outfile, ('\t%*.*s\t%-*.*s'
+                                   % (field_size[0], field_size[0], key,
+                                      field_size[1], field_size[1],
+                                      field_info[options.field][key]))
  
      elif mode == 'search':
          search_term = args[0]
author	W. Trevor King <wking@drexel.edu>
	Sat, 16 Apr 2011 02:01:22 +0000 (22:01 -0400)
committer	W. Trevor King <wking@drexel.edu>
	Sat, 16 Apr 2011 02:01:22 +0000 (22:01 -0400)