From d4ed3fce418da02cf1fb27f7c764a63496e1b0da Mon Sep 17 00:00:00 2001
From: "W. Trevor King" <wking@drexel.edu>
Date: Fri, 15 Apr 2011 21:15:47 -0400
Subject: [PATCH] Move entrez.py from popen2.popen2 to subprocess.Popen.

popen2 is deprecated for Python >= 2.6.  The Pipe object is borrowed
from Bugs Everywhere.

Also added:

* logging (instead of the old debug printouts)
* doctests for medline_xml_to_bibtex()
* don't fetch medline or process to BibTeX if a search returns no hits
---
 posts/entrez/entrez.py | 424 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 341 insertions(+), 83 deletions(-)

diff --git a/posts/entrez/entrez.py b/posts/entrez/entrez.py
index 6dc7a9b..89b9da1 100755
--- a/posts/entrez/entrez.py
+++ b/posts/entrez/entrez.py
@@ -39,8 +39,12 @@
 """Python bindings on Entrez database queries.
 """
 
-# The time module is added for querying date ranges of publications
-import urllib, sys, re, string, time
+import logging
+import re
+import string
+import sys
+import time  # for querying date ranges of publications
+import urllib
 
 # DOM module for parsing XML,
 # supports Document Object Model (DOM) Level 1 Specification
@@ -48,7 +52,18 @@ import urllib, sys, re, string, time
 import xml.dom.minidom as dom
 
 # For calling the bibutils conversion programs
-from popen2 import popen2
+from subprocess import Popen, PIPE
+
+# Platform constants
+_MSWINDOWS = sys.platform == 'win32'
+_POSIX = not _MSWINDOWS
+
+if _POSIX:
+    import errno
+    import os
+    import select
+else:
+    from threading import Thread
 
 
 # Entrez access points
@@ -61,6 +76,16 @@ elink_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
 TOOL = 'entrezpy'
 EMAIL = 'wking@drexel.edu'
 
+# Logger
+
+LOG = logging.getLogger(TOOL)
+LOG.setLevel(logging.WARN)
+_handler = logging.StreamHandler()
+_formatter = logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
+_handler.setFormatter(_formatter)
+LOG.addHandler(_handler)
+del _handler, _formatter
+
 
 ## XML and list utility functions
 
 def urlencode(param_dict) :
@@ -187,7 +212,7 @@ def delist_dict(dict) :
 
 ## Get information about the Entrez databases themselves
 
-def _query_einfo(db=None, debug=False) :
+def _query_einfo(db=None):
     """
     Get information about the Entrez databases themselves.
     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
@@ -200,17 +225,14 @@
                          'tool' : TOOL,
                          'email' : EMAIL})
 
-    if debug :
-        print "Getting einfo from '%s?%s'" % (einfo_url, params)
+    LOG.info("getting einfo from '%s?%s'" % (einfo_url, params))
 
     f = urllib.urlopen ("%s?%s" % (einfo_url, params))
     string = f.read()
     f.close()
-    if debug == True:
-        print string
-        print ""
+    LOG.debug('got:\n%s' % string)
     return string
 
-def get_parsed_einfo(db=None, page=None, parsed=None, debug=True):
+def get_parsed_einfo(db=None, page=None, parsed=None):
     """ Helper function for various einfo processing functions.
     Allow each processor to function
@@ -220,39 +242,39 @@
     Use clean_parsed_einfo() for cleanup
     """
     if page == None and parsed == None:
-        if debug == True : print "Downloading new einfo page"
+        LOG.info('downloading new einfo page')
        page = _query_einfo(db)
     if parsed == None :
-        if debug == True : print "Parsing new einfo page"
+        LOG.info('parsing new einfo page')
         parsed = dom.parseString(page)
         parsed_islocal = True
     else :
-        if debug == True : print "Using old einfo parsing"
+        LOG.info('using old einfo parsing')
         parsed_islocal = False
     return (parsed, parsed_islocal)
 
-def clean_parsed_einfo(parsed, parsed_islocal=True, debug=False):
+def clean_parsed_einfo(parsed, parsed_islocal=True):
     """ Helper function for various einfo processing functions.
     Clean up the parsed xml structure if the calling function created it.
     """
     if parsed_islocal == True :
-        if debug == True : print "Cleaning up einfo parsing"
+        LOG.info('cleaning up einfo parsing')
         parsed.unlink() # clean up the DOM
 
-def database_list(page=None, parsed=None, debug=False):
-    parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed, debug=debug)
+def database_list(page=None, parsed=None):
+    parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed)
     databases = []
     for node in parsed.getElementsByTagName("DbName"):
         # Extract some-text from '<DbName>some-text</DbName>'
         # by default, xml.dom.minidom uses unicode,
         # so strings get printed: "u'string contents'"
         databases.append(get_text(node))
-    clean_parsed_einfo(parsed,parsed_islocal, debug=debug)
+    clean_parsed_einfo(parsed,parsed_islocal)
     return databases
 
-def field_dict(db='pubmed', page=None, parsed=None, debug=False):
-    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+def field_dict(db='pubmed', page=None, parsed=None):
+    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
     fields = []
     tags = []
     field_info = {}
@@ -272,11 +294,11 @@ def field_dict(db='pubmed', page=None, parsed=None, debug=False):
             assert new_tags == tags, "Inconsistent tags"
         tags = new_tags
         field_info[field['Name']] = field
-    clean_parsed_einfo(parsed,parsed_islocal, debug)
+    clean_parsed_einfo(parsed,parsed_islocal)
     return (fields, tags, field_info)
 
-def link_dict(db='pubmed', page=None, parsed=None, debug=False):
-    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+def link_dict(db='pubmed', page=None, parsed=None):
+    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
     links = []
     tags = []
     link_info = []
@@ -296,15 +318,15 @@ def link_dict(db='pubmed', page=None, parsed=None, debug=False):
             assert new_tags == tags, "Inconsistent tags"
         tags = new_tags
         link_info[link['Name']] = link
-    clean_parsed_einfo(parsed,parsed_islocal, debug)
+    clean_parsed_einfo(parsed,parsed_islocal)
     return (links, tags, link_info)
 
-def database_info(db='pubmed', page=None, parsed=None, debug=False):
+def database_info(db='pubmed', page=None, parsed=None):
     "Convenience function to call both field_dict and link_dict"
-    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
-    fields,field_tags,field_info = field_dict(db=db, parsed=parsed, debug=debug)
-    links,link_tags,link_info = link_dict(db=db, parsed=parsed, debug=debug)
-    clean_parsed_einfo(parsed,parsed_islocal, debug=debug)
+    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
+    fields,field_tags,field_info = field_dict(db=db, parsed=parsed)
+    links,link_tags,link_info = link_dict(db=db, parsed=parsed)
+    clean_parsed_einfo(parsed,parsed_islocal)
     return (fields, field_tags, field_info, links, link_tags, link_info)
 
 def validate_field(field, fields):
@@ -412,14 +434,11 @@ def _query_esearch(term, db='pubmed', field=None,
                          'rettype' : rettype,
                          'sort' : sort})
 
-    if debug :
-        print "Getting esearch from '%s?%s'" % (esearch_url, params)
+    LOG.info("getting esearch from '%s?%s'" % (esearch_url, params))
 
     f = urllib.urlopen ("%s?%s" % (esearch_url, params))
     string = f.read()
     f.close()
-    if debug == True:
-        print string
-        print ""
+    LOG.debug('got:\n%s' % string)
     return string
 
 def parse_esearch(page):
@@ -438,8 +457,7 @@ def parse_esearch(page):
 ## Fetch records by Primary ID from an Entrez database
 
 def _query_efetch(id, db='pubmed',
-                  retmax=None, retmode='xml', rettype='medline',
-                  debug=False) :
+                  retmax=None, retmode='xml', rettype='medline'):
     """
     Fetch records by primary ID from an Entrez database.
     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
@@ -508,14 +526,11 @@ def _query_efetch(id, db='pubmed',
                          'retmode' : retmode,
                          'rettype' : rettype})
 
-    if debug :
-        print "Getting efetch from '%s?%s'" % (efetch_url, params)
+    LOG.info("getting efetch from '%s?%s'" % (efetch_url, params))
 
     f = urllib.urlopen ("%s?%s" % (efetch_url, params))
     string = f.read()
     f.close()
-    if debug == True:
-        print string
-        print ""
+    LOG.debug('got:\n%s' % string)
     return string
 
 
@@ -525,8 +540,7 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed',
                  cmd=None, linkname=None, holding=None,
                  version=1,
                  reldate=None, daterange=None, datetype=None,
-                 retmode='xml',
-                 debug=False) :
+                 retmode='xml'):
     """
     Fetch links from a list of primary IDs in an Entrez database.
     http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
@@ -619,8 +633,7 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed',
                          'datetype': datetype,
                          'retmode' : retmode})
 
-    if debug :
-        print "Getting elink from '%s?%s'" % (elink_url, params)
+    LOG.info("getting elink from '%s?%s'" % (elink_url, params))
 
     f = urllib.urlopen ("%s?%s" % (elink_url, params))
 
     if cmd == 'prlinks' and retmode == 'ref' :
@@ -631,9 +644,7 @@ def _query_elink(id, term=None, db='all', dbfrom='pubmed',
 
     string = f.read()
     f.close()
-    if debug == True:
-        print string
-        print ""
+    LOG.debug('got:\n%s' % string)
     return string
 
 
@@ -643,17 +654,17 @@ def search_fetch_xml(term, db='pubmed', field=None,
                      reldate=None, daterange=None, datetype=None,
                      retmax=None, sort=None,
                      validate=False, valid_fields=None,
-                     retmode='xml', rettype='medline',
-                     debug=False) :
+                     retmode='xml', rettype='medline'):
     if validate and valid_fields == None:
-        valid_fields,field_tags,field_info = field_dict(db, debug=debug)
+        valid_fields,field_tags,field_info = field_dict(db)
     search_page = _query_esearch(term, db, field,
                                  reldate, daterange, datetype,
                                  retmax, rettype='uilist', sort=sort,
-                                 validate=validate, valid_fields=valid_fields,
-                                 debug=debug)
+                                 validate=validate, valid_fields=valid_fields)
     pid_list = parse_esearch(search_page)
-    fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype, debug)
+    if not pid_list:
+        return None
+    fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype)
     return fetch_page
 
 def search_link(term, db='pubmed', field=None,
@@ -664,33 +675,281 @@ def search_link(term, db='pubmed', field=None,
                 cmd=None, linkname=None, link_holding=None,
                 version=1,
                 link_reldate=None, link_daterange=None, link_datetype=None,
-                link_retmode='xml',
-                debug=False) :
+                link_retmode='xml'):
     if validate and valid_fields == None:
-        valid_fields,field_tags,field_info = field_dict(db, debug=debug)
+        valid_fields,field_tags,field_info = field_dict(db)
     search_page = _query_esearch(term, db, field,
                                  reldate, daterange, datetype,
                                  retmax, rettype='uilist', sort=sort,
-                                 validate=validate, valid_fields=valid_fields,
-                                 debug=debug)
+                                 validate=validate, valid_fields=valid_fields)
     pid_list = parse_esearch(search_page)
 
     link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
                              cmd=cmd, linkname=linkname, holding=link_holding,
                              version=version,reldate=link_reldate,
                              daterange=link_daterange, datetype=link_datetype,
-                             retmode=link_retmode,
-                             debug=debug)
+                             retmode=link_retmode)
     return link_page
 
 
 ## Use the external bibutils package to convert to BibTeX format
 
+class Pipe (object):
+    """Simple interface for executing POSIX-style pipes.
+
+    Based on the subprocess module.  The only complication is the
+    adaptation of `subprocess.Popen._communicate` to listen to the
+    stderrs of all processes involved in the pipe, as well as the
+    terminal process' stdout.  There are two implementations of
+    `Pipe._communicate`, one for MS Windows, and one for POSIX
+    systems.  The MS Windows implementation is currently untested.
+
+    >>> p = Pipe([['find', '/etc/'], ['grep', '^/etc/ssh$']])
+    >>> p.stdout
+    '/etc/ssh\\n'
+    >>> p.status
+    1
+    >>> p.statuses
+    [1, 0]
+    >>> p.stderrs # doctest: +ELLIPSIS
+    [...find: ...: Permission denied..., '']
+
+    >>> p = Pipe([['cat'], ['head']], stdin='line 1\\nline 2\\nline 3\\n')
+    >>> p.stdout
+    'line 1\\nline 2\\nline 3\\n'
+    >>> p.statuses
+    [0, 0]
+    >>> p.stderrs
+    ['', '']
+    """
+    def __init__(self, cmds, stdin=None):
+        if isinstance(stdin, str):
+            stdin_str = stdin
+            stdin = PIPE
+        else:
+            stdin_str = None
+
+        # spawn processes
+        self._procs = []
+        for cmd in cmds:
+            if len(self._procs) != 0:
+                stdin = self._procs[-1].stdout
+            LOG.debug('run command %s' % cmd)
+            kwargs = {}
+            if _POSIX:
+                kwargs['close_fds'] = True
+            self._procs.append(Popen(
+                cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs))
+
+        self.stdout,self.stderrs = self._communicate(input=stdin_str)
+
+        # collect process statuses
+        self.statuses = []
+        self.status = 0
+        for proc in self._procs:
+            self.statuses.append(proc.wait())
+            LOG.debug('join %s (status %d)' % (proc, self.statuses[-1]))
+            if self.statuses[-1] != 0:
+                self.status = self.statuses[-1]
+
+    # Code excerpted from subprocess.Popen._communicate()
+    if _MSWINDOWS == True:
+        def _communicate(self, input=None):
+            LOG.debug('communicate with pipe')
+            assert input == None, 'stdin != None not yet supported'
+            # listen to each process' stderr
+            threads = []
+            std_X_arrays = []
+            for proc in self._procs:
+                stderr_array = []
+                thread = Thread(target=proc._readerthread,
+                                args=(proc.stderr, stderr_array))
+                thread.setDaemon(True)
+                thread.start()
+                threads.append(thread)
+                std_X_arrays.append(stderr_array)
+
+            # also listen to the last process's stdout
+            stdout_array = []
+            thread = Thread(target=proc._readerthread,
+                            args=(proc.stdout, stdout_array))
+            thread.setDaemon(True)
+            thread.start()
+            threads.append(thread)
+            std_X_arrays.append(stdout_array)
+
+            # join threads as they die
+            for thread in threads:
+                thread.join()
+
+            # read output from reader threads
+            std_X_strings = []
+            for std_X_array in std_X_arrays:
+                std_X_strings.append(std_X_array[0])
+
+            stdout = std_X_strings.pop(-1)
+            stderrs = std_X_strings
+            LOG.debug('pipe communication complete')
+            return (stdout, stderrs)
+    else:
+        assert _POSIX==True, 'invalid platform'
+        def _communicate(self, input=None):
+            LOG.debug('communicate with pipe')
+            read_set = []
+            write_set = []
+            read_arrays = []
+            stdout = None # Return
+            stderr = None # Return
+
+            if self._procs[0].stdin:
+                # Flush stdio buffer.  This might block, if the user has
+                # been writing to .stdin in an uncontrolled fashion.
+                self._procs[0].stdin.flush()
+                if input:
+                    write_set.append(self._procs[0].stdin)
+                else:
+                    self._procs[0].stdin.close()
+            for proc in self._procs:
+                read_set.append(proc.stderr)
+                read_arrays.append([])
+            read_set.append(self._procs[-1].stdout)
+            read_arrays.append([])
+
+            input_offset = 0
+            while read_set or write_set:
+                LOG.debug('select on read %s, write %s' % (read_set,write_set))
+                try:
+                    rlist, wlist, xlist = select.select(read_set, write_set, [])
+                except select.error, e:
+                    if e.args[0] == errno.EINTR:
+                        LOG.debug('EINTR')
+                        continue
+                    raise
+                LOG.debug('selected read %s, write %s, exception %s'
+                          % (rlist, wlist, xlist))
+                if self._procs[0].stdin in wlist:
+                    # When select has indicated that the file is writable,
+                    # we can write up to PIPE_BUF bytes without risk
+                    # blocking.  POSIX defines PIPE_BUF >= 512
+                    LOG.debug('write to stdin for process 0')
+                    chunk = input[input_offset : input_offset + 512]
+                    bytes_written = os.write(
+                        self._procs[0].stdin.fileno(), chunk)
+                    input_offset += bytes_written
+                    if input_offset >= len(input):
+                        self._procs[0].stdin.flush()
+                        self._procs[0].stdin.close()
+                        write_set.remove(self._procs[0].stdin)
+                        LOG.debug('stdin complete')
+                if self._procs[-1].stdout in rlist:
+                    LOG.debug('read stdout for final process')
+                    data = os.read(self._procs[-1].stdout.fileno(), 1024)
+                    if data == '':
+                        self._procs[-1].stdout.close()
+                        read_set.remove(self._procs[-1].stdout)
+                        LOG.debug('stdout complete')
+                    read_arrays[-1].append(data)
+                for i,proc in enumerate(self._procs):
+                    if proc.stderr in rlist:
+                        LOG.debug('read stderr for process %i' % i)
+                        data = os.read(proc.stderr.fileno(), 1024)
+                        if data == '':
+                            proc.stderr.close()
+                            read_set.remove(proc.stderr)
+                            LOG.debug('stderr complete for process %d' % i)
+                        read_arrays[i].append(data)
+
+            # All data exchanged.  Translate lists into strings.
+            read_strings = []
+            for read_array in read_arrays:
+                read_strings.append(''.join(read_array))
+
+            stdout = read_strings.pop(-1)
+            stderrs = read_strings
+            LOG.debug('pipe communication complete')
+            return (stdout, stderrs)
+
+
 def medline_xml_to_bibtex(fetch_page):
-    child_stdout,child_stdin = popen2("med2xml | xml2bib -fc | bibclean")
-    print >> child_stdin, fetch_page
-    child_stdin.close()
-    bibtex = child_stdout.read()
-    child_stdout.close()
-    return bibtex
+    """Convert medline XML to BibTeX
+
+    >>> xml = '\\n'.join([
+    ...     '<?xml version="1.0"?>',
+    ...     '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2010//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_100101.dtd">',
+    ...     '<PubmedArticleSet>',
+    ...     ' <PubmedArticle>',
+    ...     '  <MedlineCitation Owner="NLM" Status="MEDLINE">',
+    ...     '   <PMID>20004685</PMID>',
+    ...     '   <Article PubModel="Print-Electronic">',
+    ...     '    <Journal>',
+    ...     '     <ISSN IssnType="Electronic">1879-0003</ISSN>',
+    ...     '     <JournalIssue CitedMedium="Internet">',
+    ...     '      <Volume>46</Volume><Issue>2</Issue>',
+    ...     '      <PubDate>',
+    ...     '       <Year>2010</Year><Month>Mar</Month><Day>1</Day>',
+    ...     '      </PubDate>',
+    ...     '     </JournalIssue>',
+    ...     '    </Journal>',
+    ...     '    <ArticleTitle>Monte Carlo simulation of mechanical unfolding '
+    ...     'of proteins based on a simple two-state model.'
+    ...     '</ArticleTitle>',
+    ...     '    <Pagination><MedlinePgn>159-66</MedlinePgn></Pagination>',
+    ...     '    <AuthorList CompleteYN="Y">',
+    ...     '     <Author ValidYN="Y">',
+    ...     '      <LastName>King</LastName>',
+    ...     '      <ForeName>William T</ForeName>',
+    ...     '      <Initials>WT</Initials>',
+    ...     '     </Author>',
+    ...     '     <Author ValidYN="Y">',
+    ...     '      <LastName>Su</LastName>',
+    ...     '      <ForeName>Meihong</ForeName>',
+    ...     '      <Initials>M</Initials>',
+    ...     '     </Author>',
+    ...     '     <Author ValidYN="Y">',
+    ...     '      <LastName>Yang</LastName>',
+    ...     '      <ForeName>Guoliang</ForeName>',
+    ...     '      <Initials>G</Initials>',
+    ...     '     </Author>',
+    ...     '    </AuthorList>',
+    ...     '    <Journal>',
+    ...     '     <Title>Int J Biol Macromol</Title>',
+    ...     '    </Journal>',
+    ...     '   </Article>',
+    ...     '   <MedlineJournalInfo><MedlineTA>Int J Biol Macromol</MedlineTA>',
+    ...     '   </MedlineJournalInfo>',
+    ...     '  </MedlineCitation>',
+    ...     '  <PubmedData>',
+    ...     '   <ArticleIdList>',
+    ...     '    <ArticleId IdType="doi">10.1016/j.ijbiomac.2009.12.001'
+    ...     '</ArticleId>',
+    ...     '   </ArticleIdList>',
+    ...     '  </PubmedData>',
+    ...     ' </PubmedArticle>',
+    ...     '</PubmedArticleSet>',
+    ...     ])
+    >>> print medline_xml_to_bibtex(xml)
+    @Article{King2010,
+      author =       "William T. King and Meihong Su and Guoliang Yang",
+      title =        "Monte Carlo simulation of mechanical unfolding of
+                     proteins based on a simple two-state model.",
+      journal =      "Int J Biol Macromol",
+      year =         "2010",
+      month =        mar,
+      day =          "01",
+      volume =       "46",
+      number =       "2",
+      pages =        "159--166",
+      ISSN =         "1879-0003",
+      doi =          "10.1016/j.ijbiomac.2009.12.001",
+    }
+
+    """
+    LOG.info('convert medline XML to BibTeX\n%s' % fetch_page)
+    p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
+             stdin=fetch_page)
+    LOG.debug('converted to\n%s' % p.stdout)
+    return p.stdout
+
 
 ## Random
@@ -814,17 +1073,18 @@ for more details.
     outfile = file(options.filename, 'w')
 
     if options.verbose :
-        print >> sys.stdout, "Operating in %s mode" % mode
-
+        LOG.setLevel(logging.DEBUG)
+
+    LOG.debug('operating in %s mode' % mode)
+
     if mode == 'list' :
         print >> outfile, "Available databases:"
-        databases = database_list(debug=options.verbose)
+        databases = database_list()
         for db in databases:
             print >> outfile, "\t%s" % db
 
     elif mode == 'explain':
-        fields,tags,field_info = field_dict(db=options.database,
-                                            debug=options.verbose)
+        fields,tags,field_info = field_dict(db=options.database)
         if options.field == None :
             print >> outfile, "Available fields in %s:" % options.database
             field_size = [0,0]
@@ -852,8 +1112,7 @@ for more details.
 
     elif mode == 'search':
         search_term = args[0]
-        if options.verbose :
-            print >> sys.stdout, "Output %s" % output
+        LOG.debug('output %s' % output)
 
         if output == 'bibtex' :
             medline_xml = search_fetch_xml(term=search_term,
@@ -865,13 +1124,13 @@ for more details.
                                            retmax=options.retmax,
                                            validate=options.validate,
                                            retmode=options.retmode,
-                                           rettype='medline',
-                                           debug=options.verbose)
-            if options.raw :
-                print outfile, medline_xml
-            else :
-                bibtex = medline_xml_to_bibtex(medline_xml)
-                print >> outfile, bibtex
+                                           rettype='medline')
+            if medline_xml:
+                if options.raw :
+                    print >> outfile, medline_xml
+                else:
+                    bibtex = medline_xml_to_bibtex(medline_xml)
+                    print >> outfile, bibtex
 
         elif output == 'link' :
             # Assume that if you're looking for links
@@ -896,8 +1155,7 @@ for more details.
                                    link_reldate=options.reldate,
                                    link_daterange=options.daterange,
                                    link_datetype=options.datetype,
-                                   link_retmode=options.retmode,
-                                   debug=options.verbose)
+                                   link_retmode=options.retmode)
             print >> outfile, link_xml
 
     if options.filename != None :
-- 
2.26.2
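
For anyone skimming the conversion change: the heart of the migration is
replacing popen2's shell-string pipeline with an explicit chain of
subprocess.Popen objects.  A minimal sketch, not part of the patch, assuming
the bibutils tools (med2xml, xml2bib, bibclean) are installed and on PATH;
the function name here is hypothetical:

    from subprocess import Popen, PIPE

    def _bibtex_pipeline_sketch(xml):
        # med2xml | xml2bib -fc | bibclean, chained by hand
        med2xml = Popen(['med2xml'], stdin=PIPE, stdout=PIPE)
        xml2bib = Popen(['xml2bib', '-fc'], stdin=med2xml.stdout, stdout=PIPE)
        bibclean = Popen(['bibclean'], stdin=xml2bib.stdout, stdout=PIPE)
        # drop the parent's copies of the intermediate pipe ends, so a
        # stage that exits early delivers SIGPIPE to its upstream neighbor
        med2xml.stdout.close()
        xml2bib.stdout.close()
        med2xml.stdin.write(xml)   # feed the medline XML to the first stage
        med2xml.stdin.close()      # send EOF so the pipeline can drain
        return bibclean.communicate()[0]

The Pipe class in the patch goes further than this sketch: it select()s on
every stage's stderr while writing stdin in PIPE_BUF-sized chunks, so a
stage that fills its stderr pipe cannot deadlock the pipeline, and it
records each stage's exit status instead of discarding them.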