"""Python bindings on Entrez database queries.
"""
-# The time module is added for querying date ranges of publications
-import urllib, sys, re, string, time
+import logging
+import re
+import string
+import sys
+import time # for querying date ranges of publications
+import urllib
# DOM module for parsing XML,
# supports Document Object Model (DOM) Level 1 Specification
import xml.dom.minidom as dom
# For calling the bibutils conversion programs
-from popen2 import popen2
+from subprocess import Popen, PIPE
+
+# Platform constants
+_MSWINDOWS = sys.platform == 'win32'
+_POSIX = not _MSWINDOWS
+
+if _POSIX:
+    import errno
+    import os
+    import select
+else:
+    from threading import Thread
# Entrez access points
TOOL = 'entrezpy'
EMAIL = 'wking@drexel.edu'
+
+# Logger
+LOG = logging.getLogger(TOOL)
+LOG.setLevel(logging.WARNING)
+_handler = logging.StreamHandler()
+_formatter = logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s')
+_handler.setFormatter(_formatter)
+LOG.addHandler(_handler)
+del _handler, _formatter
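+# Applications embedding this module can raise the verbosity; a minimal
+# sketch, assuming the logger keeps the TOOL name 'entrezpy':
+#     import logging
+#     logging.getLogger('entrezpy').setLevel(logging.DEBUG)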
+
## XML and list utility functions
def urlencode(param_dict) :
## Get information about the Entrez databases themselves
-def _query_einfo(db=None, debug=False) :
+def _query_einfo(db=None):
"""
Get information about the Entrez databases themselves.
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html
'tool' : TOOL,
'email' : EMAIL})
- if debug :
- print "Getting einfo from '%s?%s'" % (einfo_url, params)
+ LOG.info("getting einfo from '%s?%s'" % (einfo_url, params))
f = urllib.urlopen ("%s?%s" % (einfo_url, params))
string = f.read()
f.close()
- if debug == True:
- print string
- print ""
+ LOG.debug('got:\n%s' % string)
return string
-def get_parsed_einfo(db=None, page=None, parsed=None, debug=True):
+def get_parsed_einfo(db=None, page=None, parsed=None):
"""
Helper function for various einfo processing functions.
Allow each processor to function
Use clean_parsed_einfo() for cleanup
"""
if page == None and parsed == None:
- if debug == True : print "Downloading new einfo page"
+ LOG.info('downloading new einfo page')
page = _query_einfo(db)
if parsed == None :
- if debug == True : print "Parsing new einfo page"
+ LOG.info('parsing new einfo page')
parsed = dom.parseString(page)
parsed_islocal = True
else :
- if debug == True : print "Using old einfo parsing"
+ LOG.info('using old einfo parsing')
parsed_islocal = False
return (parsed, parsed_islocal)
-def clean_parsed_einfo(parsed, parsed_islocal=True, debug=False):
+def clean_parsed_einfo(parsed, parsed_islocal=True):
"""
Helper function for various einfo processing functions.
Clean up the parsed xml structure if the calling function created it.
"""
if parsed_islocal == True :
- if debug == True : print "Cleaning up einfo parsing"
+ LOG.info('cleaning up einfo parsing')
parsed.unlink() # clean up the DOM
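+# Sketch of the intended pattern for the two helpers above (it mirrors
+# database_list() below; get_text() is defined elsewhere in this module):
+#     parsed,parsed_islocal = get_parsed_einfo(db)
+#     names = [get_text(n) for n in parsed.getElementsByTagName('DbName')]
+#     clean_parsed_einfo(parsed, parsed_islocal)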
-def database_list(page=None, parsed=None, debug=False):
- parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed, debug=debug)
+def database_list(page=None, parsed=None):
+ parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed)
databases = []
for node in parsed.getElementsByTagName("DbName"):
# Extract some-text from '<DbName>some-text</DbName>'
# by default, xml.dom.minidom uses unicode,
# so strings get printed: "u'string contents'"
databases.append(get_text(node))
- clean_parsed_einfo(parsed,parsed_islocal, debug=debug)
+ clean_parsed_einfo(parsed,parsed_islocal)
return databases
-def field_dict(db='pubmed', page=None, parsed=None, debug=False):
- parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+def field_dict(db='pubmed', page=None, parsed=None):
+ parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
fields = []
tags = []
field_info = {}
assert new_tags == tags, "Inconsistent tags"
tags = new_tags
field_info[field['Name']] = field
- clean_parsed_einfo(parsed,parsed_islocal, debug)
+ clean_parsed_einfo(parsed,parsed_islocal)
return (fields, tags, field_info)
-def link_dict(db='pubmed', page=None, parsed=None, debug=False):
- parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
+def link_dict(db='pubmed', page=None, parsed=None):
+ parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
links = []
tags = []
link_info = {}
assert new_tags == tags, "Inconsistent tags"
tags = new_tags
link_info[link['Name']] = link
- clean_parsed_einfo(parsed,parsed_islocal, debug)
+ clean_parsed_einfo(parsed,parsed_islocal)
return (links, tags, link_info)
-def database_info(db='pubmed', page=None, parsed=None, debug=False):
+def database_info(db='pubmed', page=None, parsed=None):
"Convenience function to call both field_dict and link_dict"
- parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
- fields,field_tags,field_info = field_dict(db=db, parsed=parsed, debug=debug)
- links,link_tags,link_info = link_dict(db=db, parsed=parsed, debug=debug)
- clean_parsed_einfo(parsed,parsed_islocal, debug=debug)
+ parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
+ fields,field_tags,field_info = field_dict(db=db, parsed=parsed)
+ links,link_tags,link_info = link_dict(db=db, parsed=parsed)
+ clean_parsed_einfo(parsed,parsed_islocal)
return (fields, field_tags, field_info, links, link_tags, link_info)
def validate_field(field, fields):
'rettype' : rettype,
'sort' : sort})
- if debug :
- print "Getting esearch from '%s?%s'" % (esearch_url, params)
+ LOG.info("getting esearch from '%s?%s'" % (esearch_url, params))
f = urllib.urlopen ("%s?%s" % (esearch_url, params))
string = f.read()
f.close()
- if debug == True:
- print string
- print ""
+ LOG.debug('got:\n%s' % string)
return string
def parse_esearch(page):
## Fetch records by Primary ID from an Entrez database
def _query_efetch(id, db='pubmed',
- retmax=None, retmode='xml', rettype='medline',
- debug=False) :
+ retmax=None, retmode='xml', rettype='medline'):
"""
Fetch records by primary ID from an Entrez database.
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
'retmode' : retmode,
'rettype' : rettype})
- if debug :
- print "Getting efetch from '%s?%s'" % (efetch_url, params)
+ LOG.info("getting efetch from '%s?%s'" % (efetch_url, params))
f = urllib.urlopen ("%s?%s" % (efetch_url, params))
string = f.read()
f.close()
- if debug == True:
- print string
- print ""
+ LOG.debug('got:\n%s' % string)
return string
cmd=None, linkname=None, holding=None,
version=1,
reldate=None, daterange=None, datetype=None,
- retmode='xml',
- debug=False) :
+ retmode='xml'):
"""
Fetch links from a list of primary IDs in an Entrez database.
http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
'datetype': datetype,
'retmode' : retmode})
- if debug :
- print "Getting elink from '%s?%s'" % (elink_url, params)
+ LOG.info("getting elink from '%s?%s'" % (elink_url, params))
f = urllib.urlopen ("%s?%s" % (elink_url, params))
if cmd == 'prlinks' and retmode == 'ref' :
string = f.read()
f.close()
- if debug == True:
- print string
- print ""
+ LOG.debug('got:\n%s' % string)
return string
reldate=None, daterange=None, datetype=None,
retmax=None, sort=None,
validate=False, valid_fields=None,
- retmode='xml', rettype='medline',
- debug=False) :
+ retmode='xml', rettype='medline'):
if validate and valid_fields == None:
- valid_fields,field_tags,field_info = field_dict(db, debug=debug)
+ valid_fields,field_tags,field_info = field_dict(db)
search_page = _query_esearch(term, db, field,
reldate, daterange, datetype,
retmax, rettype='uilist', sort=sort,
- validate=validate, valid_fields=valid_fields,
- debug=debug)
+ validate=validate, valid_fields=valid_fields)
pid_list = parse_esearch(search_page)
- fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype, debug)
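+    # esearch can come back empty; skip the efetch round trip in that case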
+ if not pid_list:
+ return None
+ fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype)
return fetch_page
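+# A hypothetical end-to-end sketch (the search term is made up; this
+# requires network access to NCBI and returns None on an empty search):
+#     xml = search_fetch_xml(term='example search term')
+#     if xml:
+#         print medline_xml_to_bibtex(xml)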
def search_link(term, db='pubmed', field=None,
cmd=None, linkname=None, link_holding=None,
version=1,
link_reldate=None, link_daterange=None, link_datetype=None,
- link_retmode='xml',
- debug=False) :
+ link_retmode='xml'):
if validate and valid_fields == None:
- valid_fields,field_tags,field_info = field_dict(db, debug=debug)
+ valid_fields,field_tags,field_info = field_dict(db)
search_page = _query_esearch(term, db, field,
reldate, daterange, datetype,
retmax, rettype='uilist', sort=sort,
- validate=validate, valid_fields=valid_fields,
- debug=debug)
+ validate=validate, valid_fields=valid_fields)
pid_list = parse_esearch(search_page)
link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
cmd=cmd, linkname=linkname, holding=link_holding,
version=version,reldate=link_reldate,
daterange=link_daterange, datetype=link_datetype,
- retmode=link_retmode,
- debug=debug)
+ retmode=link_retmode)
return link_page
## Use the external bibutils package to convert to BibTeX format
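+# The conversion pipeline below shells out to med2xml and xml2bib (both
+# from bibutils) and to bibclean; all three must be on the caller's PATH.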
+
+class Pipe(object):
+ """Simple interface for executing POSIX-style pipes.
+
+ Based on the subprocess module. The only complication is the
+ adaptation of `subprocess.Popen._communicate` to listen to the
+ stderrs of all processes involved in the pipe, as well as the
+ terminal process' stdout. There are two implementations of
+ `Pipe._communicate`, one for MS Windows, and one for POSIX
+ systems. The MS Windows implementation is currently untested.
+
+ >>> p = Pipe([['find', '/etc/'], ['grep', '^/etc/ssh$']])
+ >>> p.stdout
+ '/etc/ssh\\n'
+ >>> p.status
+ 1
+ >>> p.statuses
+ [1, 0]
+ >>> p.stderrs # doctest: +ELLIPSIS
+ [...find: ...: Permission denied..., '']
+
+ >>> p = Pipe([['cat'], ['head']], stdin='line 1\\nline 2\\nline 3\\n')
+ >>> p.stdout
+ 'line 1\\nline 2\\nline 3\\n'
+ >>> p.statuses
+ [0, 0]
+ >>> p.stderrs
+ ['', '']
+ """
+ def __init__(self, cmds, stdin=None):
+ if isinstance(stdin, str):
+ stdin_str = stdin
+ stdin = PIPE
+ else:
+ stdin_str = None
+
+ # spawn processes
+ self._procs = []
+ for cmd in cmds:
+ if len(self._procs) != 0:
+ stdin = self._procs[-1].stdout
+ LOG.debug('run command %s' % cmd)
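+            # close_fds keeps each child from inheriting the other pipes'
+            # descriptors; a leaked write end would delay EOF downstream.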
+ kwargs = {}
+ if _POSIX:
+ kwargs['close_fds'] = True
+ self._procs.append(Popen(
+ cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs))
+
+ self.stdout,self.stderrs = self._communicate(input=stdin_str)
+
+ # collect process statuses
+ self.statuses = []
+ self.status = 0
+ for proc in self._procs:
+ self.statuses.append(proc.wait())
+ LOG.debug('join %s (status %d)' % (proc, self.statuses[-1]))
+ if self.statuses[-1] != 0:
+ self.status = self.statuses[-1]
+
+ # Code excerpted from subprocess.Popen._communicate()
+    if _MSWINDOWS:
+ def _communicate(self, input=None):
+ LOG.debug('communicate with pipe')
+ assert input == None, 'stdin != None not yet supported'
+ # listen to each process' stderr
+ threads = []
+ std_X_arrays = []
+ for proc in self._procs:
+ stderr_array = []
+ thread = Thread(target=proc._readerthread,
+ args=(proc.stderr, stderr_array))
+ thread.setDaemon(True)
+ thread.start()
+ threads.append(thread)
+ std_X_arrays.append(stderr_array)
+
+            # also listen to the last process's stdout
+ stdout_array = []
+ thread = Thread(target=proc._readerthread,
+ args=(proc.stdout, stdout_array))
+ thread.setDaemon(True)
+ thread.start()
+ threads.append(thread)
+ std_X_arrays.append(stdout_array)
+
+ # join threads as they die
+ for thread in threads:
+ thread.join()
+
+ # read output from reader threads
+ std_X_strings = []
+ for std_X_array in std_X_arrays:
+ std_X_strings.append(std_X_array[0])
+
+ stdout = std_X_strings.pop(-1)
+ stderrs = std_X_strings
+ LOG.debug('pipe communication complete')
+ return (stdout, stderrs)
+ else:
+        assert _POSIX, 'invalid platform'
+ def _communicate(self, input=None):
+ LOG.debug('communicate with pipe')
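+            # select() multiplexes every stderr plus the final stdout, so a
+            # full pipe buffer on any one of them cannot deadlock the pipe.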
+ read_set = []
+ write_set = []
+ read_arrays = []
+            stdout = None # Return value
+            stderrs = None # Return value
+
+ if self._procs[0].stdin:
+ # Flush stdio buffer. This might block, if the user has
+ # been writing to .stdin in an uncontrolled fashion.
+ self._procs[0].stdin.flush()
+ if input:
+ write_set.append(self._procs[0].stdin)
+ else:
+ self._procs[0].stdin.close()
+ for proc in self._procs:
+ read_set.append(proc.stderr)
+ read_arrays.append([])
+ read_set.append(self._procs[-1].stdout)
+ read_arrays.append([])
+
+ input_offset = 0
+ while read_set or write_set:
+ LOG.debug('select on read %s, write %s' % (read_set,write_set))
+ try:
+ rlist, wlist, xlist = select.select(read_set, write_set, [])
+ except select.error, e:
+ if e.args[0] == errno.EINTR:
+ LOG.debug('EINTR')
+ continue
+ raise
+ LOG.debug('selected read %s, write %s, exception %s'
+ % (rlist, wlist, xlist))
+ if self._procs[0].stdin in wlist:
+ # When select has indicated that the file is writable,
+                    # we can write up to PIPE_BUF bytes without risk of
+                    # blocking.  POSIX defines PIPE_BUF >= 512.
+ LOG.debug('write to stdin for process 0')
+ chunk = input[input_offset : input_offset + 512]
+ bytes_written = os.write(
+ self._procs[0].stdin.fileno(), chunk)
+ input_offset += bytes_written
+ if input_offset >= len(input):
+ self._procs[0].stdin.flush()
+ self._procs[0].stdin.close()
+ write_set.remove(self._procs[0].stdin)
+ LOG.debug('stdin complete')
+ if self._procs[-1].stdout in rlist:
+ LOG.debug('read stdout for final process')
+ data = os.read(self._procs[-1].stdout.fileno(), 1024)
+ if data == '':
+ self._procs[-1].stdout.close()
+ read_set.remove(self._procs[-1].stdout)
+ LOG.debug('stdout complete')
+ read_arrays[-1].append(data)
+ for i,proc in enumerate(self._procs):
+ if proc.stderr in rlist:
+ LOG.debug('read stderr for process %i' % i)
+ data = os.read(proc.stderr.fileno(), 1024)
+ if data == '':
+ proc.stderr.close()
+ read_set.remove(proc.stderr)
+ LOG.debug('stderr complete for process %d' % i)
+ read_arrays[i].append(data)
+
+ # All data exchanged. Translate lists into strings.
+ read_strings = []
+ for read_array in read_arrays:
+ read_strings.append(''.join(read_array))
+
+ stdout = read_strings.pop(-1)
+ stderrs = read_strings
+ LOG.debug('pipe communication complete')
+ return (stdout, stderrs)
+
+
def medline_xml_to_bibtex(fetch_page):
- child_stdout,child_stdin = popen2("med2xml | xml2bib -fc | bibclean")
- print >> child_stdin, fetch_page
- child_stdin.close()
- bibtex = child_stdout.read()
- child_stdout.close()
- return bibtex
+ """Convert medline XML to BibTeX
+
+ >>> xml = '\\n'.join([
+ ... '<?xml version="1.0"?>',
+ ... '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_110101.dtd">',
+ ... '<PubmedArticleSet>',
+ ... ' <PubmedArticle>',
+ ... ' <MedlineCitation Owner="NLM" Status="MEDLINE">',
+ ... ' <PMID Version="1">20004685</PMID>',
+ ... ' <Article PubModel="Print-Electronic">',
+ ... ' <Journal>',
+ ... ' <ISSN IssnType="Electronic">1879-0003</ISSN>',
+ ... ' <JournalIssue CitedMedium="Internet">',
+ ... ' <Volume>46</Volume><Issue>2</Issue>',
+ ... ' <PubDate>',
+ ... ' <Year>2010</Year><Month>Mar</Month><Day>1</Day>',
+ ... ' </PubDate>',
+ ... ' </JournalIssue>',
+ ... ' </Journal>',
+ ... ' <ArticleTitle>Monte Carlo simulation of mechanical unfolding '
+ ... 'of proteins based on a simple two-state model.'
+ ... '</ArticleTitle>',
+ ... ' <Pagination><MedlinePgn>159-66</MedlinePgn></Pagination>',
+ ... ' <AuthorList CompleteYN="Y">',
+ ... ' <Author ValidYN="Y">',
+ ... ' <LastName>King</LastName>',
+ ... ' <ForeName>William T</ForeName>',
+ ... ' <Initials>WT</Initials>',
+ ... ' </Author>',
+ ... ' <Author ValidYN="Y">',
+ ... ' <LastName>Su</LastName>',
+ ... ' <ForeName>Meihong</ForeName>',
+ ... ' <Initials>M</Initials>',
+ ... ' </Author>',
+ ... ' <Author ValidYN="Y">',
+ ... ' <LastName>Yang</LastName>',
+ ... ' <ForeName>Guoliang</ForeName>',
+ ... ' <Initials>G</Initials>',
+ ... ' </Author>',
+ ... ' </AuthorList>',
+ ... ' </Article>',
+ ... ' <MedlineJournalInfo>',
+ ... ' <MedlineTA>Int J Biol Macromol</MedlineTA>',
+ ... ' </MedlineJournalInfo>',
+ ... ' </MedlineCitation>',
+ ... ' <PubmedData>',
+ ... ' <ArticleIdList>',
+ ... ' <ArticleId IdType="doi">10.1016/j.ijbiomac.2009.12.001'
+ ... '</ArticleId>',
+ ... ' </ArticleIdList>',
+ ... ' </PubmedData>',
+ ... ' </PubmedArticle>',
+ ... '</PubmedArticleSet>',
+ ... ])
+ >>> print medline_xml_to_bibtex(xml)
+ @Article{King2010,
+ author = "William T. King and Meihong Su and Guoliang Yang",
+ title = "Monte Carlo simulation of mechanical unfolding of
+ proteins based on a simple two-state model.",
+ journal = "Int J Biol Macromol",
+ year = "2010",
+ month = mar,
+ day = "01",
+ volume = "46",
+ number = "2",
+ pages = "159--166",
+ ISSN = "1879-0003",
+ doi = "10.1016/j.ijbiomac.2009.12.001",
+ }
+ <BLANKLINE>
+ """
+ LOG.info('convert medline XML to BibTeX\n%s' % fetch_page)
+ p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
+ stdin=fetch_page)
+ LOG.debug('converted to\n%s' % p.stdout)
+ return p.stdout
+
## Random
outfile = file(options.filename, 'w')
if options.verbose :
- print >> sys.stdout, "Operating in %s mode" % mode
-
+ LOG.setLevel(logging.DEBUG)
+
+ LOG.debug('operating in %s mode' % mode)
+
if mode == 'list' :
print >> outfile, "Available databases:"
- databases = database_list(debug=options.verbose)
+ databases = database_list()
for db in databases:
print >> outfile, "\t%s" % db
elif mode == 'explain':
- fields,tags,field_info = field_dict(db=options.database,
- debug=options.verbose)
+ fields,tags,field_info = field_dict(db=options.database)
if options.field == None :
print >> outfile, "Available fields in %s:" % options.database
field_size = [0,0]
elif mode == 'search':
search_term = args[0]
- if options.verbose :
- print >> sys.stdout, "Output %s" % output
+ LOG.debug('output %s' % output)
if output == 'bibtex' :
medline_xml = search_fetch_xml(term=search_term,
retmax=options.retmax,
validate=options.validate,
retmode=options.retmode,
- rettype='medline',
- debug=options.verbose)
- if options.raw :
- print outfile, medline_xml
- else :
- bibtex = medline_xml_to_bibtex(medline_xml)
- print >> outfile, bibtex
+ rettype='medline')
+ if medline_xml:
+ if options.raw :
+                print >> outfile, medline_xml
+ else:
+ bibtex = medline_xml_to_bibtex(medline_xml)
+ print >> outfile, bibtex
elif output == 'link' :
# Assume that if you're looking for links
link_reldate=options.reldate,
link_daterange=options.daterange,
link_datetype=options.datetype,
- link_retmode=options.retmode,
- debug=options.verbose)
+                                link_retmode=options.retmode)
print >> outfile, link_xml
if options.filename != None :