3 # Copyright (C) 1998-2004 Frederic Gobry
4 # Copyright (C) 2008-2011 W. Trevor King
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
19 """Python interface to Entrez_ SOAP_ using the suds_ module.
21 Before you use this program, read the rules_.
23 .. _Entrez: http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
24 .. _SOAP: http://eutils.ncbi.nlm.nih.gov/entrez/eutils/soap/v2.0/DOC/esoap_help.html
25 .. _suds: https://fedorahosted.org/suds/
26 .. _rules: http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html#UserSystemRequirements
28 To discover services using suds, try:
30 >>> print EUTILS_CLIENT # doctest: +ELLIPSIS, +REPORT_UDIFF
32 Suds ( https://fedorahosted.org/suds/ ) version: ... build: ...
34 Service ( eUtilsService ) tns="http://www.ncbi.nlm.nih.gov/soap/eutils/"
36 ns0 = "http://www.ncbi.nlm.nih.gov/soap/eutils/egquery"
37 ns1 = "http://www.ncbi.nlm.nih.gov/soap/eutils/einfo"
38 ns2 = "http://www.ncbi.nlm.nih.gov/soap/eutils/elink"
39 ns3 = "http://www.ncbi.nlm.nih.gov/soap/eutils/epost"
40 ns4 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esearch"
41 ns5 = "http://www.ncbi.nlm.nih.gov/soap/eutils/esummary"
45 run_eGquery(xs:string term, xs:string tool, xs:string email, )
46 run_eInfo(xs:string db, xs:string tool, xs:string email, )
47 run_eLink(xs:string db, xs:string[] id, xs:string reldate, ...)
48 run_ePost(xs:string db, xs:string id, xs:string WebEnv, ...)
49 run_eSearch(xs:string db, xs:string term, xs:string WebEnv, ...)
50 run_eSpell(xs:string db, xs:string term, xs:string tool, ...)
51 run_eSummary(xs:string db, xs:string id, xs:string WebEnv, ...)
61 import logging as _logging
62 import subprocess as _subprocess
67 from suds.client import Client as _Client
70 _MSWINDOWS = _sys.platform == 'win32'
71 _POSIX = not _MSWINDOWS
75 import select as _select
81 EUTILS_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/eutils.wsdl'
82 EFETCH_WSDL_URL = 'http://eutils.ncbi.nlm.nih.gov/soap/v2.0/efetch_%s.wsdl'
84 EUTILS_CLIENT = _Client(EUTILS_WSDL_URL)
86 # Entrez-requested tracking information
88 EMAIL = 'wking@drexel.edu'
# Module-wide logger, named after the tool.  Warnings and above are
# shown by default; a stream handler with a fixed-width layout is
# attached here so messages appear without further configuration.
LOG = _logging.getLogger(TOOL)
LOG.setLevel(_logging.WARN)
_handler = _logging.StreamHandler()
_handler.setFormatter(
    _logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s'))
LOG.addHandler(_handler)
# keep the setup temporary out of the module namespace
del _handler
101 ## Use the external bibutils package to convert to BibTeX format
105 """Simple interface for executing POSIX-style pipes.
107 Based on the subprocess module. The only complication is the
108 adaptation of `subprocess.Popen._communicate` to listen to the
109 stderrs of all processes involved in the pipe, as well as the
110 terminal process' stdout. There are two implementations of
111 `Pipe._communicate`, one for MS Windows, and one for POSIX
112 systems. The MS Windows implementation is currently untested.
114 >>> p = Pipe([['find', '/etc/'], ['grep', '^/etc/ssh$']])
121 >>> p.stderrs # doctest: +ELLIPSIS
122 [...find: ...: Permission denied..., '']
124 >>> p = Pipe([['cat'], ['head']], stdin='line 1\\nline 2\\nline 3\\n')
126 'line 1\\nline 2\\nline 3\\n'
def __init__(self, cmds, stdin=None):
    """Spawn `cmds` as a pipeline and run it to completion.

    `cmds` is a sequence of argument lists, one per process, in
    pipeline order.  `stdin` is either a string, which is written to
    the first process through a pipe, or any value accepted by
    `subprocess.Popen` for its own `stdin` argument.

    On return the following attributes are set:

    * `stdout` and `stderrs`: whatever `self._communicate` returned.
    * `statuses`: the exit status of each process, in pipeline order.
    * `status`: the last non-zero entry of `statuses`, or 0 if every
      process exited cleanly.
    """
    if isinstance(stdin, str):
        # A string is data for the first process, not a file handle:
        # open a pipe and let `_communicate` write the string to it.
        stdin_str = stdin
        stdin = _subprocess.PIPE
    else:
        stdin_str = None

    # spawn processes, wiring each stdout to the next process' stdin
    self._procs = []
    for cmd in cmds:
        upstream = None
        if self._procs:
            upstream = self._procs[-1]
            stdin = upstream.stdout
        LOG.debug('run command %s' % cmd)
        kwargs = {}
        if _POSIX:
            # don't leak the parent's other descriptors into children
            kwargs['close_fds'] = True
        self._procs.append(_subprocess.Popen(
                cmd, stdin=stdin, stdout=_subprocess.PIPE,
                stderr=_subprocess.PIPE, **kwargs))
        if upstream is not None:
            # The child now owns its copy of the read end.  Close the
            # parent's copy so the upstream process receives SIGPIPE
            # (instead of blocking forever on a full pipe) if the
            # downstream process exits early.
            upstream.stdout.close()

    self.stdout, self.stderrs = self._communicate(input=stdin_str)

    # collect process statuses
    self.statuses = []
    self.status = 0
    for proc in self._procs:
        self.statuses.append(proc.wait())
        LOG.debug('join %s (status %d)' % (proc, self.statuses[-1]))
        if self.statuses[-1] != 0:
            self.status = self.statuses[-1]
163 # Code excerpted from subprocess.Popen._communicate()
164 if _MSWINDOWS == True:
165 def _communicate(self, input=None):
166 LOG.debug('communicate with pipe')
167 assert input == None, 'stdin != None not yet supported'
168 # listen to each process' stderr
171 for proc in self._procs:
173 thread = Thread(target=proc._readerthread,
174 args=(proc.stderr, stderr_array))
175 thread.setDaemon(True)
177 threads.append(thread)
178 std_X_arrays.append(stderr_array)
180 # also listen to the last processes stdout
182 thread = Thread(target=proc._readerthread,
183 args=(proc.stdout, stdout_array))
184 thread.setDaemon(True)
186 threads.append(thread)
187 std_X_arrays.append(stdout_array)
189 # join threads as they die
190 for thread in threads:
193 # read output from reader threads
195 for std_X_array in std_X_arrays:
196 std_X_strings.append(std_X_array[0])
198 stdout = std_X_strings.pop(-1)
199 stderrs = std_X_strings
200 LOG.debug('pipe communication complete')
201 return (stdout, stderrs)
203 assert _POSIX==True, 'invalid platform'
def _communicate(self, input=None):
    """Feed `input` to the first process and drain the pipeline.

    POSIX implementation, built on `select`.  Listens to the stderr
    of every process in `self._procs` and to the stdout of the final
    process, while writing `input` (if any) to the stdin of the first
    process in chunks.

    Returns a `(stdout, stderrs)` tuple: the final process' stdout as
    a single string, and a list holding one stderr string per
    process, in pipeline order.

    Raises `select.error` for any `select` failure other than EINTR.
    """
    # Imported locally so the EINTR check below does not depend on a
    # module-level `errno` name being available.
    import errno as _errno

    LOG.debug('communicate with pipe')
    first_stdin = self._procs[0].stdin
    last_stdout = self._procs[-1].stdout
    read_set = []
    write_set = []
    read_arrays = []  # one list of chunks per entry in the read set

    if first_stdin:
        # Flush stdio buffer.  This might block, if the user has
        # been writing to .stdin in an uncontrolled fashion.
        first_stdin.flush()
        if input:
            write_set.append(first_stdin)
        else:
            first_stdin.close()
    for proc in self._procs:
        read_set.append(proc.stderr)
        read_arrays.append([])
    # the final slot (index -1) collects the last process' stdout
    read_set.append(last_stdout)
    read_arrays.append([])

    input_offset = 0
    while read_set or write_set:
        LOG.debug('select on read %s, write %s' %(read_set, write_set))
        try:
            rlist, wlist, xlist = _select.select(read_set, write_set, [])
        except _select.error as e:
            if e.args[0] == _errno.EINTR:
                # interrupted by a signal; just retry the select
                LOG.debug('EINTR: %s' % e)
                continue
            raise
        LOG.debug('selected read %s, write %s, exception %s'
                  % (rlist, wlist, xlist))
        if first_stdin in wlist:
            # When select has indicated that the file is writable,
            # we can write up to PIPE_BUF bytes without risk
            # blocking.  POSIX defines PIPE_BUF >= 512
            LOG.debug('write to stdin for process 0')
            chunk = input[input_offset:input_offset+512]
            bytes_written = _os.write(first_stdin.fileno(), chunk)
            input_offset += bytes_written
            if input_offset >= len(input):
                first_stdin.flush()
                first_stdin.close()
                write_set.remove(first_stdin)
                LOG.debug('stdin complete')
        if last_stdout in rlist:
            LOG.debug('read stdout for final process')
            data = _os.read(last_stdout.fileno(), 1024)
            if not data:
                # an empty read means end-of-file
                last_stdout.close()
                read_set.remove(last_stdout)
                LOG.debug('stdout complete')
            read_arrays[-1].append(data)
        for i, proc in enumerate(self._procs):
            if proc.stderr in rlist:
                LOG.debug('read stderr for process %i' % i)
                data = _os.read(proc.stderr.fileno(), 1024)
                if not data:
                    proc.stderr.close()
                    read_set.remove(proc.stderr)
                    LOG.debug('stderr complete for process %d' % i)
                read_arrays[i].append(data)

    # All data exchanged.  Translate lists into strings.
    read_strings = [''.join(read_array) for read_array in read_arrays]

    stdout = read_strings.pop(-1)
    stderrs = read_strings
    LOG.debug('pipe communication complete')
    return (stdout, stderrs)
281 def medline_xml_to_bibtex(fetch_page):
282 """Convert medline XML to BibTeX
284 >>> xml = '\\n'.join([
285 ... '<?xml version="1.0"?>',
286 ... '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, '
287 ... '1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query'
288 ... '/DTD/pubmed_110101.dtd">',
289 ... '<PubmedArticleSet>',
290 ... ' <PubmedArticle>',
291 ... ' <MedlineCitation Owner="NLM" Status="MEDLINE">',
292 ... ' <PMID Version="1">20004685</PMID>',
293 ... ' <Article PubModel="Print-Electronic">',
295 ... ' <ISSN IssnType="Electronic">1879-0003</ISSN>',
296 ... ' <JournalIssue CitedMedium="Internet">',
297 ... ' <Volume>46</Volume><Issue>2</Issue>',
299 ... ' <Year>2010</Year><Month>Mar</Month><Day>1</Day>',
301 ... ' </JournalIssue>',
303 ... ' <ArticleTitle>Monte Carlo simulation of mechanical unfolding '
304 ... 'of proteins based on a simple two-state model.'
305 ... '</ArticleTitle>',
306 ... ' <Pagination><MedlinePgn>159-66</MedlinePgn></Pagination>',
307 ... ' <AuthorList CompleteYN="Y">',
308 ... ' <Author ValidYN="Y">',
309 ... ' <LastName>King</LastName>',
310 ... ' <ForeName>William T</ForeName>',
311 ... ' <Initials>WT</Initials>',
313 ... ' <Author ValidYN="Y">',
314 ... ' <LastName>Su</LastName>',
315 ... ' <ForeName>Meihong</ForeName>',
316 ... ' <Initials>M</Initials>',
318 ... ' <Author ValidYN="Y">',
319 ... ' <LastName>Yang</LastName>',
320 ... ' <ForeName>Guoliang</ForeName>',
321 ... ' <Initials>G</Initials>',
323 ... ' </AuthorList>',
324 ... ' <MedlineJournalInfo>',
325 ... ' <MedlineTA>Int J Biol Macromol</MedlineTA>',
326 ... ' </MedlineJournalInfo>',
328 ... ' <MedlineJournalInfo>',
329 ... ' <MedlineTA>Int J Biol Macromol</MedlineTA>',
330 ... ' </MedlineJournalInfo>',
331 ... ' </MedlineCitation>',
333 ... ' <ArticleIdList>',
334 ... ' <ArticleId IdType="doi">10.1016/j.ijbiomac.2009.12.001'
336 ... ' </ArticleIdList>',
337 ... ' </PubmedData>',
338 ... ' </PubmedArticle>',
339 ... '</PubmedArticleSet>',
341 >>> print medline_xml_to_bibtex(xml) # doctest: +REPORT_UDIFF
343 author = "William T. King and Meihong Su and Guoliang Yang",
344 title = "Monte Carlo simulation of mechanical unfolding of
345 proteins based on a simple two-state model.",
346 journal = "Int J Biol Macromol",
354 doi = "10.1016/j.ijbiomac.2009.12.001",
355 URL = "http://www.ncbi.nlm.nih.gov/pubmed/20004685",
359 LOG.info('convert medline XML to BibTeX')
360 LOG.debug('convert from\n%s' % fetch_page)
361 p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
363 LOG.debug('converted to\n%s' % p.stdout)
367 if __name__ == '__main__':
368 from optparse import OptionParser
370 usage_string = '\n'.join([
372 ' %prog [options] SEARCH_TERM'
373 ' (print medline xml matching search)',
374 '| %prog -l [options] SEARCH_TERM'
375 ' (print links to entries matching search)',
376 '| %prog -L [-f FILE] (list databases)',
377 '| %prog -X [-d DATABASE] [-F FIELD] [-f FILE]'
378 ' (list fields in a database, or details on a single field)',
380 '2008-2011, W. Trevor King.',
382 'See the docstrings in %prog or',
383 ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
385 ' http://www.ncbi.nlm.nih.gov/entrez/query/static/'
386 'eutils_help.html#UserSystemRequirements',
387 ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
389 ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
391 ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
393 ' http://www.ncbi.nlm.nih.gov/corehtml/query/static/'
398 parser = OptionParser(
399 usage=usage_string, version='%%prog %s' % __version__)
# Explanation by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
# metavar is the name used in the help for that option's required
# text, and dest is the name of the property you'll use to access
# the value of that option.
408 parser.add_option('-d', '--database', dest='database',
409 help="Search DATABASE (default '%default')",
410 type='string', metavar='DATABASE', default='pubmed')
411 parser.add_option('-f', '--file', dest='filename',
412 help='write output to FILE (default stdout)',
413 type='string', metavar='FILE')
414 parser.add_option('-v', '--verbose', dest='verbose', action='count',
415 help=('Print minimal debugging information. Use twice '
416 'to get lots of debugging info.'),
419 # mode control options
421 def set_mode(option, opt_str, value, parser):
423 long_option = option.get_opt_string()
424 if long_option == '--list-mode':
426 elif long_option == '--explain-mode':
429 parser.add_option('-L', '--list-mode', callback=set_mode,
430 help='Run in list mode', action='callback')
431 parser.add_option('-X', '--explain-mode', callback=set_mode,
432 help='Run in explain mode', action='callback')
434 # search-fetch-xml-to-? options
436 def set_output(option, opt_str, value, parser):
438 long_option = option.get_opt_string()
439 if long_option == '--output-xml':
441 if long_option == '--output-bibtex':
443 if long_option == '--output-link':
445 parser.add_option('-x', '--output-xml', callback=set_output,
446 help='Output search results as Medline XML',
448 parser.add_option('-b', '--output-bibtex', callback=set_output,
449 help='Output search results as BibTeX',
451 parser.add_option('-F', '--field', dest='field',
452 help='Limit SEARCH_TERM to FIELD',
453 type='string', metavar='FIELD')
454 parser.add_option('-r', '--reldate', dest='reldate',
455 help='Limit search to dates within DAYS of today',
456 type='string', metavar='DAYS')
457 parser.add_option('--mindate', dest='mindate',
458 help=('Limit search to date after MINDATE '
459 "(e.g. '2001/1/1' or '2002')"),
460 type='string', metavar='MINDATE')
# Upper date bound for the search; the counterpart of --mindate.
parser.add_option('--maxdate', dest='maxdate',
                  help=('Limit search to date before MAXDATE '
                        "(e.g. '2001/1/1' or '2002')"),
                  type='string', metavar='MAXDATE')
465 parser.add_option('-t', '--datetype', dest='datetype',
466 help=("Select field to apply date limits to "
467 "(e.g. 'edat' for Entrez date)"),
468 type='string', metavar='DATETYPE')
469 parser.add_option('-m', '--retmax', dest='retmax',
470 help=('Return at most RETMAX items from a successful '
471 'search (default %default)'),
472 type='int', metavar='RETMAX', default=20)
473 parser.add_option('-s', '--retstart', dest='retstart',
474 help=('Index of first returned search item from a '
475 'successful search (default %default)'),
476 type='int', metavar='RETSTART', default=0)
477 parser.add_option('-V', '--validate', dest='validate', action='store_true',
478 help=('Check that FIELD and field tags in SEARCH_TERM '
482 # output link options
483 parser.add_option('-l', '--output-link', callback=set_output,
484 help='Output a link (instead of xml citations).',
486 parser.add_option('-c', '--link-cmd', dest='link_cmd',
487 help='Select link output',
488 type='string', metavar='LINK_CMD')
489 parser.add_option('-T', '--link-term', dest='link_term',
490 help='Limit links to those matching LINK_TERM',
491 type='string', metavar='LINK_TERM')
# Source database for link queries; stored in `options.dbfrom`.
parser.add_option('-D', '--from-database', dest='dbfrom',
                  help='Limit links to those from FROMDATABASE',
                  type='string', metavar='FROMDATABASE')
495 parser.add_option('-n', '--link-name', dest='linkname',
496 help='Limit links to a specific neighbor',
497 type='string', metavar='LINKNAME')
499 (options, args) = parser.parse_args()
# open the output file if specified; otherwise write to stdout
if options.filename is None:
    outfile = _sys.stdout
else:
    # `open` is the documented way to create file objects; the
    # `file` builtin exists only in Python 2.
    outfile = open(options.filename, 'w')
508 if options.verbose == 1:
509 LOG.setLevel(_logging.INFO)
510 elif options.verbose > 1:
511 LOG.setLevel(_logging.DEBUG)
513 LOG.debug('operating in %s mode' % mode)
516 outfile.write('# available databases:\n')
517 LOG.info('run eInfo to get list of databases')
518 q = EUTILS_CLIENT.service.run_eInfo(tool=TOOL, email=EMAIL)
519 if hasattr(q, 'ERROR'):
520 raise Exception(q.ERROR)
522 for db in q.DbList.DbName:
523 outfile.write('%s\n' % db)
525 elif mode == 'explain':
526 LOG.info('run eInfo on %s' % options.database)
527 q = EUTILS_CLIENT.service.run_eInfo(
528 db=options.database, tool=TOOL, email=EMAIL)
529 if hasattr(q, 'ERROR'):
530 raise Exception(q.ERROR)
532 if options.field: # print specific info about this field
534 'field %s in %s:\n' % (options.field, options.database))
536 [(field.Name, field) for field in q.DbInfo.FieldList.Field])
537 field = fields[options.field]
539 [(a, getattr(field, a)) for a in dir(field)
540 if not a.startswith('_')])
542 for attribute,value in attributes:
543 if len(attribute) > field_size[0]:
544 field_size[0] = len(attribute)
545 for attribute,value in attributes:
548 % (field_size[0], field_size[0], attribute, value))
549 else: # print general info
550 outfile.write('database: %s\n' % q.DbInfo.DbName)
551 outfile.write('description: %s\n' % q.DbInfo.Description)
552 outfile.write('available fields:\n')
554 for field in q.DbInfo.FieldList.Field:
555 if len(field.Name) > field_size[0]:
556 field_size[0] = len(field.Name)
557 if len(field.FullName) > field_size[1]:
558 field_size[1] = len(field.FullName)
559 for field in q.DbInfo.FieldList.Field:
561 '%*.*s\t%-*.*s\t%s\n'
562 % (field_size[0], field_size[0], field.Name,
563 field_size[1], field_size[1], field.FullName,
566 elif mode == 'search':
567 search_term = args[0]
568 LOG.debug('output %s' % output)
# When only one date bound was given, fill in the other one.
if options.mindate and not options.maxdate:
    # Default the upper bound to today.  '%m' is the month; the
    # upper-case '%M' directive would insert the current minute.
    options.maxdate = _time.strftime('%Y/%m/%d')
    LOG.info('fill in maximum date: %s' % options.maxdate)
elif options.maxdate and not options.mindate:
    options.mindate = '0'
    LOG.info('fill in minimum date: %s' % options.mindate)
# announce the eSearch request that follows
LOG.info('run eSearch on %s' % options.database)
578 q = EUTILS_CLIENT.service.run_eSearch(
579 db=options.database, term=search_term, tool=TOOL, email=EMAIL,
580 field=options.field, reldate=options.reldate,
581 mindate=options.mindate, maxdate=options.maxdate,
582 datetype=options.datetype,
583 RetStart=options.retstart, RetMax=options.retmax,
586 if hasattr(q, 'ERROR'):
587 raise Exception(q.ERROR)
588 if hasattr(q.IdList, 'Id'):
589 ret = int(len(q.IdList.Id))
592 LOG.info('search returned %d of %d items' % (ret, int(q.Count)))
595 if output in ['medline', 'bibtex']:
596 LOG.info('run eFetch on %s' % options.database)
597 efetch_client = _Client(EFETCH_WSDL_URL % options.database)
598 f = efetch_client.service.run_eFetch(
599 id=','.join(q.IdList.Id), tool=TOOL, email=EMAIL)
600 if hasattr(f, 'ERROR'):
601 raise Exception(f.ERROR)
603 if output == 'medline':
604 outfile.write(str(efetch_client.last_received()).rstrip()+'\n')
605 elif output == 'bibtex':
607 medline_xml_to_bibtex(str(efetch_client.last_received())))
608 elif output == 'link':
609 LOG.info('run eLink on %s' % options.database)
610 f = EUTILS_CLIENT.service.run_eLink(
611 db=options.database, id=','.join(q.IdList.Id),
612 #reldate=, mindate=, maxdate=, datetype=,
613 term=options.link_term, dbfrom=options.dbfrom,
614 linkname=options.linkname, cmd=options.link_cmd,
615 tool=TOOL, email=EMAIL)
616 outfile.write(str(EUTILS_CLIENT.last_received()).rstrip()+'\n')
618 raise KeyError(output)
620 if options.filename != None: