3 # Copyright (C) 1998-2004 Frederic Gobry
4 # Copyright (C) 2008-2011 W. Trevor King
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License as published by
8 # the Free Software Foundation, either version 2 of the License, or
9 # (at your option) any later version.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 # Code following John Vu's medline query code pybliographer/Pyblio/Query.py,
21 # Python interface to the Entrez databases.
22 # See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
23 # Current as of August 1, 2007
26 # * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests.
27 # * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address.
28 # * Make no more than one request every 3 seconds.
29 # * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem.
30 # * NCBI's Disclaimer and Copyright notice must be evident to users of your service.
31 # NLM does not claim the copyright on the abstracts in PubMed; however, journal publishers or authors may.
32 # NLM provides no legal advice concerning distribution of copyrighted materials, consult your legal counsel.
34 # For a good Python-and-XML-DOM intro, see
35 # http://www.boddie.org.uk/python/XML_intro.html
36 # for the official docs, see
37 # http://docs.python.org/lib/module-xml.dom.html
39 """Python bindings for Entrez database queries.
46 import time # for querying date ranges of publications
49 # DOM module for parsing XML,
50 # supports Document Object Model (DOM) Level 1 Specification
51 # http://docs.python.org/lib/module-xml.dom.minidom.html
52 import xml.dom.minidom as dom
54 # For calling the bibutils conversion programs
55 from subprocess import Popen, PIPE
# Platform detection: Pipe._communicate (below) has separate MS Windows
# and POSIX implementations.
_MSWINDOWS = sys.platform == 'win32'
_POSIX = not _MSWINDOWS
# Entrez access points
einfo_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
esearch_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
efetch_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
elink_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'

# Entrez-requested tracking information, sent with every request so NCBI
# can contact the tool author if there is a problem (see usage policy in
# the header comments).
EMAIL = 'wking@drexel.edu'
# Module logger: warnings and above go to stderr with a short
# 'name: LEVEL message' format.
LOG = logging.getLogger(TOOL)
LOG.setLevel(logging.WARN)
_stream_handler = logging.StreamHandler()
_stream_handler.setFormatter(
    logging.Formatter('%(name)-8s: %(levelname)-6s %(message)s'))
LOG.addHandler(_stream_handler)
del _stream_handler
86 ## XML and list utility functions
def urlencode(param_dict) :
    """Assemble a CGI query string from `param_dict`.

    Entries whose value is None are dropped, so callers can pass a full
    parameter dict and let unused options fall away.  Values are str()-ed
    but NOT percent-escaped (unlike the stdlib urlencode).

    Returns e.g. 'db=pubmed&retmax=5' (no trailing '&'), or '' for an
    empty/all-None dict.
    """
    params = ""
    for key,value in param_dict.items() :
        if value is None :
            continue # ignore unused parameter
        params += "%s=%s&" % (key, str(value))
    params = params[:-1] # remove trailing &
    return params
def unique(seq, keepstr=True):
    """
    Return the sequence (list, tuple, etc) without repeating entries
    by Paul Rubin and Jordan Callicoat.
    http://groups.google.com/group/comp.lang.python/browse_thread/thread/40c6c455f4fd5154/744a1a338afe1331?lnk=gst&rnum=7#744a1a338afe1331

    for example [1,2,3,1,2] -> [1,2,3]
    """
    # NOTE(review): lines are missing from this copy -- `t = type(seq)`
    # and `seen = []` must be bound before the lines below.
    if t in (str, unicode):  # `unicode` makes this Python 2 only
        t = (list, ''.join)[bool(keepstr)]
    # seen.append() returns None (falsy), so the side effect records each
    # new element while the membership test filters repeats.
    return t(c for c in seq if not (c in seen or seen.append(c)))
def get_text(node):
    """
    Given a node (<node-name> in the following example),
    extract some-text from '<node-name>some-text</node-name>'
    returns u'some-text'.
    However, if the xml is '<node-name/>' returns None
    """
    if len(node.childNodes) == 1:
        data = node.childNodes[0].data
    elif len(node.childNodes) == 0: # empty node
        data = None
    else:
        # call-form raise works on both Python 2 and 3
        raise Exception("Node contains more than text")
    return data
def get_child_nodes(node, child_name):
    """
    Given a node (<node-name> in the following example),
    returns an array of child element nodes matching <child-name>.
    Text, comment, etc. children are skipped.
    """
    nodes = []
    for n in node.childNodes:
        if n.nodeType != n.ELEMENT_NODE:
            continue # ignore text, comment, etc. nodes
        if n.tagName == child_name :
            nodes.append(n)
    return nodes
def get_child_node(node, child_name):
    """
    Given a node (<node-name> in the following example),
    returns the single child node matching <child-name>.
    Asserts that exactly one match exists.
    """
    # NOTE(review): this def was misnamed `get_child_nodes` in the source,
    # which shadowed the list-returning helper above and called an
    # undefined `get_child_node`; the singular name restores the intended
    # pairing.
    nodes = get_child_nodes(node, child_name)
    assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name)
    return nodes[0]
def get_child_contents(node, child_name):
    """
    Given a node (<node-name> in the following example),
    extract some-text from '<node-name>
      <some-tag>some-text</some-tag>
      <other-tag>other-text</other-tag>
      <some-tag>some-other-text</some-tag>
    </node-name>'
    Returns ['some-text', 'some-other-text', ...]
    """
    nodes = get_child_nodes(node, child_name)
    ret = []
    for n in nodes:
        ret.append(get_text(n))
    return ret
def get_child_dict(node):
    """
    Given a node (<node-name> in the following example),
    extract some-text from '<node-name>
      <some-tag>some-text</some-tag>
      <other-tag>other-text</other-tag>
      <some-tag>some-other-text</some-tag>
    </node-name>'
    Returns ({'some-tag':['some-text', 'some-other-text', ...],
              'other-tag':['other-text']},
             ['some-tag', 'other-tag'])
    where the second element preserves first-seen tag order.
    """
    # local renamed from `dict` to avoid shadowing the builtin
    children = {}
    tags = [] # to preserve order of tags
    for n in node.childNodes:
        if n.nodeType != n.ELEMENT_NODE:
            continue # ignore text, comment, etc. nodes
        try: # another entry for an existing tag
            children[n.tagName].append(get_text(n))
        except KeyError: # new tag
            children[n.tagName] = [get_text(n)]
            tags.append(n.tagName)
    return (children, tags)
def delist_dict(dict) :
    """
    Given a dict containing lists,
    e.g. {'some-tag':['some-text', 'some-other-text', ...],
          'other-tag':['some-other-text'], ...} ,
    replaces any values in an array of length 1 with the element,
    e.g. {'some-tag':['some-text', 'some-other-text', ...],
          'other-tag':'some-other-text', ...} .
    Mutates and returns the same dict.
    """
    # NOTE: parameter name shadows the builtin `dict`; kept for
    # backward compatibility with existing callers.
    for key,value in dict.items() :
        if isinstance(value, list) and len(value) == 1 :
            dict[key] = value[0]
    return dict
210 ## Get information about the Entrez databases themselves
def _query_einfo(db=None):
    """
    Get information about the Entrez databases themselves.
    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Either list all available databases with db=None, or
    Specific information on a particular database (e.g. pubmed) with db=pubmed.
    """
    params = urlencode ({
    # NOTE(review): the parameter entries (presumably db, tool, email)
    # and the closing '})' are missing from this copy of the file.
    LOG.info("getting einfo from '%s?%s'" % (einfo_url, params))
    f = urllib.urlopen ("%s?%s" % (einfo_url, params))
    # NOTE(review): the read/close/return lines are missing from this
    # copy; `string` below is presumably f.read().
    LOG.debug('got:\n%s' % string)
def get_parsed_einfo(db=None, page=None, parsed=None):
    """
    Helper function for various einfo processing functions.
    Allow each processor to function
    independently (page=None, parsed=None),
    with a shared xml string (page=<xml-string>, parsed=None), or
    with a shared parsed xml structure (page=*, parsed=<parsed_xml>).
    Use clean_parsed_einfo() for cleanup
    """
    if page == None and parsed == None:
        LOG.info('downloading new einfo page')
        page = _query_einfo(db)
    # NOTE(review): an `if parsed == None:` line appears to be missing
    # from this copy before the next three lines.
        LOG.info('parsing new einfo page')
        parsed = dom.parseString(page)
        parsed_islocal = True
    # NOTE(review): the matching `else :` line is missing from this copy.
        LOG.info('using old einfo parsing')
        parsed_islocal = False
    # parsed_islocal tells the caller whether it owns the DOM and must
    # unlink it via clean_parsed_einfo().
    return (parsed, parsed_islocal)
def clean_parsed_einfo(parsed, parsed_islocal=True):
    """
    Helper function for various einfo processing functions.
    Clean up the parsed xml structure if the calling function created it.
    """
    # truthiness test instead of `== True` comparison
    if parsed_islocal :
        LOG.info('cleaning up einfo parsing')
        parsed.unlink() # clean up the DOM
def database_list(page=None, parsed=None):
    """Return the list of available Entrez database names (einfo query)."""
    parsed,parsed_islocal = get_parsed_einfo(page=page, parsed=parsed)
    databases = []
    for node in parsed.getElementsByTagName("DbName"):
        # Extract some-text from '<DbName>some-text</DbName>'
        # by default, xml.dom.minidom uses unicode,
        # so strings get printed: "u'string contents'"
        databases.append(get_text(node))
    clean_parsed_einfo(parsed,parsed_islocal)
    return databases
def field_dict(db='pubmed', page=None, parsed=None):
    # Return (fields, tags, field_info) describing the search fields of
    # database `db`, parsed from an einfo page.
    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
    # NOTE(review): initialization lines are missing from this copy --
    # `fields = []`, `tags = []`, `field_info = {}` are required below.
    fieldlists = parsed.getElementsByTagName("FieldList")
    assert len(fieldlists) == 1, "%s\n\n%d FieldLists!" % (parsed.toxml(), len(fieldlists))
    fieldlist = fieldlists[0]
    for node in fieldlist.childNodes:
        if node.nodeType != node.ELEMENT_NODE :
            continue # ignore text, comment, etc. nodes
        assert node.tagName == "Field", "Unrecognized tag '%s' in FieldList" % node.tagName
        field,new_tags = get_child_dict(node)
        assert len(field['Name']) == 1, "Multiple field names %s" % str(field['Name'])
        field = delist_dict(field)
        fields.append(field['Name'])
        new_tags = unique(tags + new_tags)
        # NOTE(review): a first-iteration guard (e.g. `if tags != []:`)
        # and a `tags = new_tags` update appear to be missing here.
        assert new_tags == tags, "Inconsistent tags"
        field_info[field['Name']] = field
    clean_parsed_einfo(parsed,parsed_islocal)
    return (fields, tags, field_info)
def link_dict(db='pubmed', page=None, parsed=None):
    # Return (links, tags, link_info) describing the link types of
    # database `db`, parsed from an einfo page (mirrors field_dict above).
    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed)
    # NOTE(review): initialization lines are missing from this copy --
    # `links = []`, `tags = []`, `link_info = {}` are required below.
    linklists = parsed.getElementsByTagName("LinkList")
    assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists))
    linklist = linklists[0]
    for node in linklist.childNodes:
        if node.nodeType != node.ELEMENT_NODE :
            continue # ignore text, comment, etc. nodes
        assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName
        link,new_tags = get_child_dict(node)
        assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name'])
        link = delist_dict(link)
        links.append(link['Name'])
        new_tags = unique(tags + new_tags)
        # NOTE(review): a first-iteration guard and a `tags = new_tags`
        # update appear to be missing here (same pattern as field_dict).
        assert new_tags == tags, "Inconsistent tags"
        link_info[link['Name']] = link
    clean_parsed_einfo(parsed,parsed_islocal)
    return (links, tags, link_info)
def database_info(db='pubmed', page=None, parsed=None):
    """Gather both field and link information for one Entrez database.

    Convenience wrapper running field_dict() and link_dict() against a
    single shared einfo parsing; returns
    (fields, field_tags, field_info, links, link_tags, link_info).
    """
    parsed, is_local = get_parsed_einfo(db, page, parsed)
    field_data = field_dict(db=db, parsed=parsed)
    link_data = link_dict(db=db, parsed=parsed)
    clean_parsed_einfo(parsed, is_local)
    return field_data + link_data
def validate_field(field, fields):
    """Ensure that `field` (case-insensitive) is one of the valid
    (upper-case) field names in `fields`; raise Exception otherwise."""
    if field.upper() not in fields :
        # call-form raise works on both Python 2 and 3
        raise Exception("Field '%s' invalid\nValid fields are\n  %s"
                        % (field, str(fields)))
def strip_fields_from_term(term):
    "HACK: really stupid algorithm"
    # Scan for bracketed field tags, e.g. 'asthma[MESH]' -> ['MESH'].
    # Nested or unbalanced brackets are not handled.
    fields = []
    infield = False
    for i in range(len(term)):
        if term[i] == '[' and infield == False :
            infield = True
            field_start = i+1
        elif term[i] == ']' and infield == True :
            infield = False
            fields.append(term[field_start:i])
    return fields
def validate_search_term(term, fields):
    """Check every [TAG] used in `term` against the valid `fields` list."""
    for tag in strip_fields_from_term(term):
        validate_field(tag, fields)
356 ## Search an Entrez database
def _query_esearch(term, db='pubmed', field=None,
                   reldate=None, daterange=None, datetype=None,
                   retmax=None, rettype=None, sort=None,
                   validate=False, valid_fields=None, debug=False) :
    """
    Search an Entrez database.
    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters.

    Help with the arguments adapted from esearch_help.html:

    term: This command uses search terms or phrases with or without Boolean operators.
          You can search in several fields using the [term field] tag.
          You can search in a single field using the 'field' parameter below.
          ?You may also tag search terms using field=tag.? I don't understand this line
          For example: term=asthma[MESH]+OR+hay+fever[MESH]
          'term=asthma[MESH]' is the same as 'term=asthma&field=MESH'
          ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea )

    db: This command selects the database to be searched
        For example: db=pubmed

    field: Use this command to specify a specific search field.
           PubMed fields: affl, auth, ecno, jour, iss, mesh,...
           Retrieve with field_dict('pubmed')
           For example: field=auth

    reldate: Limit items a number of days immediately preceding today's date.
             For example: reldate=365

    daterange: Limit results bounded by two specific dates.
               For example: daterange=('2001', '2002/01/01')
               (implemented as mindate=2001&maxdate=2002/01/01)

    datetype: Limit dates to a specific date field based on database.
              For example: datetype=edat

    retmax: Limit the number of items retrieved
            For example: retmax=100

    rettype: Select the retrieval type
             PubMed values: count, uilist (default)

    sort: Sort the returned uilist
          PubMed values: author, last+author, journal, pub+date
    """
    if daterange != None :
        assert len(daterange) == 2, "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')"
        # NOTE(review): the message above contains '%s' with no format
        # arguments, so a failure would not show the bad value.
        reldate == None, "Specifying date with daterange AND reldate!"
        # NOTE(review): the line above is missing a leading `assert` in
        # this copy -- as written it is a no-op tuple expression.
        mindate = daterange[0]
        maxdate = daterange[1]
    # NOTE(review): an `else:` branch (presumably mindate = maxdate =
    # None) and an `if validate:` guard appear to be missing here.
        assert len(valid_fields) > 0, "Need a list of valid fields to validate"
        validate_field(field)
        # NOTE(review): validate_field() takes two arguments; the second
        # (`valid_fields`) appears to be missing here.
        validate_search_term(term, valid_fields)
    params = urlencode ({
        # NOTE(review): most parameter entries (term, db, field, dates,
        # tool, email, ...) are missing from this copy.
        'datetype' : datetype,
    LOG.info("getting esearch from '%s?%s'" % (esearch_url, params))
    f = urllib.urlopen ("%s?%s" % (esearch_url, params))
    # NOTE(review): read/close lines are missing; `string` below is
    # presumably f.read().
    LOG.debug('got:\n%s' % string)
    # NOTE(review): `return string` is missing from this copy.
def parse_esearch(page):
    """Parse the xml returned by _query_esearch(), returning the list of
    primary IDs (text of each <Id> element)."""
    parsed = dom.parseString(page)
    pid_list = []
    for node in parsed.getElementsByTagName("Id"):
        pid_list.append(get_text(node))
    parsed.unlink() # clean up the DOM
    return pid_list
454 ## Fetch records by Primary ID from an Entrez database
def _query_efetch(id, db='pubmed',
                  retmax=None, retmode='xml', rettype='medline'):
    """
    Fetch records by primary ID from an Entrez database.
    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html
    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html

    Does not currently support the usehistory, WebEnv, query_key, or retstart parameters.

    Help with the arguments adapted from efetchlit_help.html:

    id: Primary UIs identifying the documents to fetch
        For example: 'id=11877539, 11822933,11871444'

    db: This command selects the database to be searched
        For example: db=pubmed

    retmax: Limit the number of items retrieved (default 20)
            For example: retmax=100

    retmode: Select the retrieval output format

    rettype: Select the retrieval type
             full (journals and omim)

    Not all retmodes are possible with all rettypes:
              uilist abstract citation medline
      x = retrieval mode available
      * returned retrieval type is the complete record in the retrieval mode

    OMIM Options: (not case sensitive)
              uilist docsum synopsis variants detailed ExternalLink
                     (MIM    (Clinical (Allelic
                     numbers) synopsis) Variants)
      asn.1     x*     x*       x*        x*      x*       x*
      x = retrieval mode available
      * returned retrieval type is the complete record in the retrieval mode
    """
    # NOTE(review): initialization lines are missing from this copy --
    # `idstring = ""` and the `for d in id :` loop header.
        idstring += "%s," % d
    idstring = idstring[:-1] # remove trailing comma
    params = urlencode ({
        # NOTE(review): the id/db/tool/email/retmax/retmode entries are
        # missing from this copy.
        'rettype' : rettype})
    LOG.info("getting efetch from '%s?%s'" % (efetch_url, params))
    f = urllib.urlopen ("%s?%s" % (efetch_url, params))
    # NOTE(review): read/close lines are missing; `string` below is
    # presumably f.read().
    LOG.debug('got:\n%s' % string)
    # NOTE(review): `return string` is missing from this copy.
534 ## Fetch links by Primary ID from an Entrez database
def _query_elink(id, term=None, db='all', dbfrom='pubmed',
                 cmd=None, linkname=None, holding=None,
                 # NOTE(review): a `version=...` parameter line appears to
                 # be missing here (docstring below documents `version`).
                 reldate=None, daterange=None, datetype=None,
                 # NOTE(review): the final parameter line (presumably
                 # `retmode='xml') :`) is missing from this copy.
    """
    Fetch links from a list of primary IDs in an Entrez database.
    http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html
    http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html

    Does not currently support the WebEnv or query_key parameters.

    Help with the arguments adapted from efetchlit_help.html:

    id: Primary UIs identifying the documents to fetch
        For example: 'id=11877539, 11822933,11871444'

    term: This command uses search terms or phrases with or without Boolean operators
          to limit the returned matching links.

    db: This command selects the databases to be searched for link targets.

    dbfrom: This command selects the database containing the ids.
            For example: dbfrom=pubmed

      * prlinks - List the hyperlink to the primary LinkOut provider for
                  multiple IDs and database. Each ID is processed separately.
      * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider
                  for a single ID and database. Return the elink
                  command, since fetching it breaks the relative
                  links in the publisher's page.
      * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for
                  multiple IDs and database. Each ID is processed separately.
      * llinkslib - List LinkOut URLs and Attributes for multiple IDs and
                  database. Each ID is processed separately.
      * lcheck - Check for the existence (Y or N) of an external link in for
                  multiple IDs and database.
      * ncheck - Check for the existence of a neighbor link for each ID within
                  a database, e.g., Related Articles in PubMed.
      * neighbor - Display neighbors within a database.
      * neighbor_history - Create history (WebEnv & query_key) for use in other
      * acheck - Lists Entrez databases links for multiple IDs from a single

    linkname: link to a specific neighbor subset
              For example: linkname=nucleotide_nucleotide_comp

    holding: List LinkOut URLs for the specified holding provider, (library).
             Used only in conjunction with cmd=llinks or cmd=llinkslib
             For example: cmd=llinkslib&holding=medlib

    version: Include a version number to refer to the latest DTD.
             For example: version=1
             retrieves the latest DTD (eLink_050511.dtd) that includes the additional
             elements, MenuTag, LinkInfo and IdLinkSet.

    Date command are only valid for dbfrom=pubmed & cmd=neighbor
    reldate: Limit items a number of days immediately preceding today's date.
             For example: reldate=365

    daterange: Limit results bounded by two specific dates.
               For example: daterange=('2001', '2002/01/01')
               (implemented as mindate=2001&maxdate=2002/01/01)

    datetype: Limit dates to a specific date field based on database.
              For example: datetype=edat

    retmode: Select the retrieval output format
             ref (only used with cmd=prlinks for one ID)
    """
    # NOTE(review): initialization lines are missing from this copy --
    # `idstring = ""` and the `for d in id :` loop header.
        idstring += "%s," % d
    idstring = idstring[:-1] # remove trailing comma

    params = urlencode ({
        # NOTE(review): several parameter entries (id, term, db, dbfrom,
        # cmd, holding, ...) are missing from this copy.
        'linkname': linkname,
        'daterange': daterange,
        # NOTE(review): the docstring says daterange is implemented as
        # mindate/maxdate -- passing 'daterange' directly looks
        # inconsistent with that; verify against the E-utilities API.
        'datetype': datetype,
        'retmode' : retmode})
    LOG.info("getting elink from '%s?%s'" % (elink_url, params))
    f = urllib.urlopen ("%s?%s" % (elink_url, params))

    if cmd == 'prlinks' and retmode == 'ref' :
        # Just get the link, we don't need the provider's webpage HTML.
        # NOTE(review): the geturl/early-return handling for this branch
        # is missing from this copy.
    # NOTE(review): read/close lines are missing; `string` below is
    # presumably f.read().
    LOG.debug('got:\n%s' % string)
    # NOTE(review): `return string` is missing from this copy.
648 ## Combining the searching and parsing (dropping some of the less used features)
def search_fetch_xml(term, db='pubmed', field=None,
                     reldate=None, daterange=None, datetype=None,
                     retmax=None, sort=None,
                     validate=False, valid_fields=None,
                     retmode='xml', rettype='medline'):
    # Search `db` for `term` (esearch) and fetch the matching records
    # (efetch) in one call, dropping some of the less used features.
    if validate and valid_fields == None:
        valid_fields,field_tags,field_info = field_dict(db)
    search_page = _query_esearch(term, db, field,
                                 reldate, daterange, datetype,
                                 retmax, rettype='uilist', sort=sort,
                                 validate=validate, valid_fields=valid_fields)
    pid_list = parse_esearch(search_page)
    # NOTE(review): an empty-result guard appears to be missing from this
    # copy before the fetch below.
    fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype)
    # NOTE(review): `return fetch_page` is missing from this copy.
def search_link(term, db='pubmed', field=None,
                reldate=None, daterange=None, datetype=None,
                retmax=None, sort=None,
                validate=False, valid_fields=None,
                link_term=None, fromdb=None,
                cmd=None, linkname=None, link_holding=None,
                # NOTE(review): a `version=...` parameter line appears to
                # be missing here (it is passed to _query_elink below).
                link_reldate=None, link_daterange=None, link_datetype=None,
                # NOTE(review): the final parameter line (presumably
                # `link_retmode='xml') :`) is missing from this copy.
    # Search `db` for `term` (esearch), then fetch links for the matching
    # IDs (elink) in one call.
    if validate and valid_fields == None:
        valid_fields,field_tags,field_info = field_dict(db)
    search_page = _query_esearch(term, db, field,
                                 reldate, daterange, datetype,
                                 retmax, rettype='uilist', sort=sort,
                                 validate=validate, valid_fields=valid_fields)
    pid_list = parse_esearch(search_page)
    link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb,
                             cmd=cmd, linkname=linkname, holding=link_holding,
                             version=version,reldate=link_reldate,
                             daterange=link_daterange, datetype=link_datetype,
                             retmode=link_retmode)
    # NOTE(review): `return link_page` is missing from this copy.
690 ## Use the external bibutils package to convert to BibTeX format
694 """Simple interface for executing POSIX-style pipes.
696 Based on the subprocess module. The only complication is the
697 adaptation of `subprocess.Popen._communicate` to listen to the
698 stderrs of all processes involved in the pipe, as well as the
699 terminal process' stdout. There are two implementations of
700 `Pipe._communicate`, one for MS Windows, and one for POSIX
701 systems. The MS Windows implementation is currently untested.
703 >>> p = Pipe([['find', '/etc/'], ['grep', '^/etc/ssh$']])
710 >>> p.stderrs # doctest: +ELLIPSIS
711 [...find: ...: Permission denied..., '']
713 >>> p = Pipe([['cat'], ['head']], stdin='line 1\\nline 2\\nline 3\\n')
715 'line 1\\nline 2\\nline 3\\n'
    def __init__(self, cmds, stdin=None):
        # Launch each command in `cmds`, chaining each process's stdout
        # into the next one's stdin, then communicate and collect exit
        # statuses.
        if isinstance(stdin, str):
        # NOTE(review): several lines are missing from this copy (saving
        # the stdin string as `stdin_str`, initializing `self._procs`,
        # the `for cmd in cmds:` loop header, and the `kwargs` setup).
            if len(self._procs) != 0:
                # chain: previous process's stdout feeds this stdin
                stdin = self._procs[-1].stdout
            LOG.debug('run command %s' % cmd)
                kwargs['close_fds'] = True
            self._procs.append(Popen(
                cmd, stdin=stdin, stdout=PIPE, stderr=PIPE, **kwargs))

        self.stdout,self.stderrs = self._communicate(input=stdin_str)

        # collect process statuses
        # NOTE(review): `self.statuses = []` (and presumably a default
        # `self.status`) are missing from this copy.
        for proc in self._procs:
            self.statuses.append(proc.wait())
            LOG.debug('join %s (status %d)' % (proc, self.statuses[-1]))
            if self.statuses[-1] != 0:
                # overall status is the last nonzero exit status
                self.status = self.statuses[-1]
    # Code excerpted from subprocess.Popen._communicate()
    if _MSWINDOWS == True:
        def _communicate(self, input=None):
            # MS Windows implementation: one reader thread per stream
            # (currently untested, per the class docstring).
            LOG.debug('communicate with pipe')
            assert input == None, 'stdin != None not yet supported'
            # listen to each process' stderr
            # NOTE(review): initializations (`threads`, `std_X_arrays`,
            # per-process `stderr_array`) are missing from this copy.
            for proc in self._procs:
                thread = Thread(target=proc._readerthread,
                                args=(proc.stderr, stderr_array))
                thread.setDaemon(True)
                threads.append(thread)
                std_X_arrays.append(stderr_array)

            # also listen to the last processes stdout
            thread = Thread(target=proc._readerthread,
                            args=(proc.stdout, stdout_array))
            thread.setDaemon(True)
            threads.append(thread)
            std_X_arrays.append(stdout_array)

            # join threads as they die
            for thread in threads:
            # NOTE(review): the thread.start()/thread.join() lines are
            # missing from this copy.

            # read output from reader threads
            for std_X_array in std_X_arrays:
                std_X_strings.append(std_X_array[0])
            # last array is the terminal process's stdout; the rest are
            # the per-process stderrs, in order
            stdout = std_X_strings.pop(-1)
            stderrs = std_X_strings
            LOG.debug('pipe communication complete')
            return (stdout, stderrs)
    # NOTE(review): an `else:` introducing this POSIX branch appears to
    # be missing from this copy.
        assert _POSIX==True, 'invalid platform'
        def _communicate(self, input=None):
            # POSIX implementation: multiplex all the pipes with select().
            LOG.debug('communicate with pipe')
            # NOTE(review): initializations are missing from this copy
            # (`read_set`, `write_set`, `read_arrays`, `input_offset`).
            stdout = None # Return
            stderr = None # Return

            if self._procs[0].stdin:
                # Flush stdio buffer. This might block, if the user has
                # been writing to .stdin in an uncontrolled fashion.
                self._procs[0].stdin.flush()
                # NOTE(review): the `if input:` / else branching around
                # the next two lines is partially missing from this copy.
                write_set.append(self._procs[0].stdin)
                self._procs[0].stdin.close()
            # watch every process's stderr, plus the last stdout
            for proc in self._procs:
                read_set.append(proc.stderr)
                read_arrays.append([])
            read_set.append(self._procs[-1].stdout)
            read_arrays.append([])

            while read_set or write_set:
                LOG.debug('select on read %s, write %s' % (read_set,write_set))
                try:
                    rlist, wlist, xlist = select.select(read_set, write_set, [])
                except select.error, e:  # Python 2 `except` syntax
                    if e.args[0] == errno.EINTR:
                    # NOTE(review): the EINTR-continue / re-raise lines
                    # are missing from this copy.
                LOG.debug('selected read %s, write %s, exception %s'
                          % (rlist, wlist, xlist))
                if self._procs[0].stdin in wlist:
                    # When select has indicated that the file is writable,
                    # we can write up to PIPE_BUF bytes without risk
                    # blocking. POSIX defines PIPE_BUF >= 512
                    LOG.debug('write to stdin for process 0')
                    chunk = input[input_offset : input_offset + 512]
                    bytes_written = os.write(
                        self._procs[0].stdin.fileno(), chunk)
                    input_offset += bytes_written
                    if input_offset >= len(input):
                        self._procs[0].stdin.flush()
                        self._procs[0].stdin.close()
                        write_set.remove(self._procs[0].stdin)
                        LOG.debug('stdin complete')
                if self._procs[-1].stdout in rlist:
                    LOG.debug('read stdout for final process')
                    data = os.read(self._procs[-1].stdout.fileno(), 1024)
                    # NOTE(review): the empty-read (EOF) guard around the
                    # next three lines is missing from this copy.
                        self._procs[-1].stdout.close()
                        read_set.remove(self._procs[-1].stdout)
                        LOG.debug('stdout complete')
                    read_arrays[-1].append(data)
                for i,proc in enumerate(self._procs):
                    if proc.stderr in rlist:
                        LOG.debug('read stderr for process %i' % i)
                        data = os.read(proc.stderr.fileno(), 1024)
                        # NOTE(review): the EOF guard (and stderr close)
                        # is missing here too.
                            read_set.remove(proc.stderr)
                            LOG.debug('stderr complete for process %d' % i)
                        read_arrays[i].append(data)

            # All data exchanged. Translate lists into strings.
            # NOTE(review): `read_strings = []` is missing from this copy.
            for read_array in read_arrays:
                read_strings.append(''.join(read_array))
            # last entry is the terminal stdout; the rest are stderrs
            stdout = read_strings.pop(-1)
            stderrs = read_strings
            LOG.debug('pipe communication complete')
            return (stdout, stderrs)
def medline_xml_to_bibtex(fetch_page):
    """Convert medline XML to BibTeX

    Pipes the XML through the external bibutils tools
    (med2xml | xml2bib -fc | bibclean); those programs must be installed.

    >>> xml = '\\n'.join([
    ...     '<?xml version="1.0"?>',
    ...     '<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2011//EN" "http://www.ncbi.nlm.nih.gov/entrez/query/DTD/pubmed_110101.dtd">',
    ...     '<PubmedArticleSet>',
    ...     ' <PubmedArticle>',
    ...     '  <MedlineCitation Owner="NLM" Status="MEDLINE">',
    ...     '   <PMID Version="1">20004685</PMID>',
    ...     '   <Article PubModel="Print-Electronic">',
    ...     '     <ISSN IssnType="Electronic">1879-0003</ISSN>',
    ...     '     <JournalIssue CitedMedium="Internet">',
    ...     '      <Volume>46</Volume><Issue>2</Issue>',
    ...     '       <Year>2010</Year><Month>Mar</Month><Day>1</Day>',
    ...     '     </JournalIssue>',
    ...     '    <ArticleTitle>Monte Carlo simulation of mechanical unfolding '
    ...     'of proteins based on a simple two-state model.'
    ...     '</ArticleTitle>',
    ...     '    <Pagination><MedlinePgn>159-66</MedlinePgn></Pagination>',
    ...     '    <AuthorList CompleteYN="Y">',
    ...     '     <Author ValidYN="Y">',
    ...     '      <LastName>King</LastName>',
    ...     '      <ForeName>William T</ForeName>',
    ...     '      <Initials>WT</Initials>',
    ...     '     <Author ValidYN="Y">',
    ...     '      <LastName>Su</LastName>',
    ...     '      <ForeName>Meihong</ForeName>',
    ...     '      <Initials>M</Initials>',
    ...     '     <Author ValidYN="Y">',
    ...     '      <LastName>Yang</LastName>',
    ...     '      <ForeName>Guoliang</ForeName>',
    ...     '      <Initials>G</Initials>',
    ...     '    </AuthorList>',
    ...     '    <MedlineJournalInfo>',
    ...     '     <MedlineTA>Int J Biol Macromol</MedlineTA>',
    ...     '    </MedlineJournalInfo>',
    ...     '    <MedlineJournalInfo>',
    ...     '     <MedlineTA>Int J Biol Macromol</MedlineTA>',
    ...     '    </MedlineJournalInfo>',
    ...     '  </MedlineCitation>',
    ...     '   <ArticleIdList>',
    ...     '    <ArticleId IdType="doi">10.1016/j.ijbiomac.2009.12.001'
    ...     '   </ArticleIdList>',
    ...     '  </PubmedData>',
    ...     ' </PubmedArticle>',
    ...     '</PubmedArticleSet>',
    >>> print medline_xml_to_bibtex(xml)
        author = "William T. King and Meihong Su and Guoliang Yang",
        title = "Monte Carlo simulation of mechanical unfolding of
    proteins based on a simple two-state model.",
        journal = "Int J Biol Macromol",
        doi = "10.1016/j.ijbiomac.2009.12.001",
    """
    # NOTE(review): the doctest above is missing lines in this copy (the
    # list closer, expected-output header lines, etc.) -- it will not run
    # as-is.
    LOG.info('convert medline XML to BibTeX\n%s' % fetch_page)
    p = Pipe(cmds=[['med2xml'], ['xml2bib', '-fc'], ['bibclean']],
    # NOTE(review): the continuation line (presumably `stdin=fetch_page)`)
    # is missing from this copy.
    LOG.debug('converted to\n%s' % p.stdout)
    # NOTE(review): the return of p.stdout is missing from this copy.
954 "Print Entrez search hints and exit"
962 ## Test with a mini-searching application
## Test with a mini-searching application.
# NOTE(review): throughout this __main__ block, a number of lines are
# missing from this copy (closing parentheses of add_option calls, mode
# and output assignments, some if/else headers); they are flagged inline.
if __name__ == "__main__" :
    from optparse import OptionParser

    usage_string = """%prog [options] SEARCH_TERM (print medline xml matching search)
| %prog -l [options] SEARCH_TERM (print links to entries matching search)
| %prog -L [-d DATABASE] [-f FILE] (list databases)
| %prog -X [-d DATABASE] [-F FIELD] [-f FILE] (list fields in a database, or details on a single field)
2008, W. Trevor King.
See the docstrings in %prog or
http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
    # NOTE(review): the closing triple-quote of usage_string is missing
    # from this copy.

    parser = OptionParser(usage=usage_string, version="%prog 0.1")

    # Explaination by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
    # metavar is the name used in the help for that options required text,
    # and dest is the name of the property you'll use to access the value of that option.

    parser.add_option('-d', '--database', dest="database",
                      help="Search DATABASE (default '%default')",
                      type='string', metavar="DATABASE", default='pubmed')
    parser.add_option('-f', '--file', dest="filename",
                      help="write output to FILE (default stdout)",
                      type='string', metavar="FILE")
    parser.add_option('-v', '--verbose', dest="verbose", action="store_true",
                      help="Print lots of debugging information",
    # NOTE(review): the closing arguments (e.g. default=False) ) are
    # missing from this copy.
    parser.add_option('-H', '--hints', callback=hints,
                      help="Print Entrez search hints and exit",
    # NOTE(review): the closing arguments (action="callback") ) are
    # missing from this copy.

    # mode control options
    def set_mode(option, opt_str, value, parser):
        # optparse callback: record which major mode was requested.
        # NOTE(review): the `global mode` declaration and the mode
        # assignments in both branches are missing from this copy.
        long_option = option.get_opt_string()
        if long_option == '--list-mode' :
        elif long_option == '--explain-mode' :
    parser.add_option('-L', '--list-mode', callback=set_mode,
                      help="Run in list mode", action="callback")
    parser.add_option('-X', '--explain-mode', callback=set_mode,
                      help="Run in explain mode", action="callback")

    # search-fetch-xml-to-? options
    def set_output(option, opt_str, value, parser):
        # optparse callback: select the output flavor (bibtex vs link).
        # NOTE(review): the `global output` declaration and the output
        # assignment are missing from this copy.
        long_option = option.get_opt_string()
        if long_option == '--output-link' :
    parser.add_option('-W', '--raw', dest="raw", action="store_true",
                      help="Output raw Entrez xml", default=False)
    parser.add_option('-F', '--field', dest="field",
                      help="Limit SEARCH_TERM to FIELD",
                      type='string', metavar="FIELD")
    parser.add_option('-r', '--reldate', dest="reldate",
                      help="Limit search to dates within DAYS of today",
                      type='string', metavar="DAYS")
    parser.add_option('-R', '--daterange', dest="daterange",
                      help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')",
                      type='string', metavar="DATERANGE")
    parser.add_option('-t', '--datetype', dest="datetype",
                      help="Select field to apply date limits to (e.g. 'edat' for Entrez date)",
                      type='string', metavar="DATETYPE")
    parser.add_option('-m', '--retmax', dest="retmax",
                      help="Return at max RETMAX items from a successful search (default %default)",
                      type='string', metavar="RETMAX", default=20)
    parser.add_option('-M', '--retmode', dest="retmode",
                      help="Select fetch/link output format",
                      type='string', metavar="RETMODE", default='xml')
    parser.add_option('-V', '--validate', dest="validate", action="store_true",
                      help="Check that FIELD and field tags in SEARCH_TERM are valid for DB",
    # NOTE(review): closing arguments missing from this copy.

    # output link options
    parser.add_option('-l', '--output-link', callback=set_output,
                      help="Output a link (instead of xml citations)",
    # NOTE(review): closing arguments missing from this copy.
    parser.add_option('-c', '--link-cmd', dest="link_cmd",
                      help="Select link output",
                      type='string', metavar="LINK_CMD")
    parser.add_option('-T', '--link-term', dest="link_term",
                      help="Limit links to those matching LINK_TERM",
                      type='string', metavar="LINK_TERM")
    parser.add_option('-D', '--from-database', dest="fromdb",
                      help="Limit links to those from FROMDATABASE)",
                      type='string', metavar="FROMDATABASE")
    parser.add_option('-n', '--link-name', dest="linkname",
                      help="Limit links to a specific neighbor",
                      type='string', metavar="LINKNAME")

    (options, args) = parser.parse_args()

    # open the output file if specified
    if options.filename == None :
        outfile = sys.stdout
    # NOTE(review): the `else :` line is missing from this copy.
        outfile = file(options.filename, 'w')  # `file()` is Python 2 only

    if options.verbose :
        LOG.setLevel(logging.DEBUG)

    LOG.debug('operating in %s mode' % mode)

    # NOTE(review): an `if mode == 'list' :` line appears to be missing
    # before this branch.
        print >> outfile, "Available databases:"
        databases = database_list()
        for db in databases:
            print >> outfile, "\t%s" % db

    elif mode == 'explain':
        fields,tags,field_info = field_dict(db=options.database)
        if options.field == None :
            print >> outfile, "Available fields in %s:" % options.database
            # NOTE(review): `field_size = [0,0]` initialization missing.
            for field in fields :
                if len(field) > field_size[0] :
                    field_size[0] = len(field)
                if len(field_info[field]['FullName']) > field_size[1] :
                    field_size[1] = len(field_info[field]['FullName'])
            for field in fields :
                print >> outfile, "\t%*.*s\t%-*.*s" \
                    % (field_size[0], field_size[0], field,
                       field_size[1], field_size[1], field_info[field]['FullName'])
        # NOTE(review): the `else :` line is missing from this copy.
            print >> outfile, "Field %s in %s:" % (options.field,options.database)
            # NOTE(review): `field_size` initialization and the
            # `for key in ... :` loop header are missing here.
                if len(key) > field_size[0] :
                    field_size[0] = len(key)
                if len(field_info[options.field][key]) > field_size[1] :
                    field_size[1] = len(field_info[options.field][key])
            # NOTE(review): the second `for key in ... :` loop header is
            # missing here.
                print >> outfile, "\t%*.*s\t%-*.*s" \
                    % (field_size[0], field_size[0], key,
                       field_size[1], field_size[1], field_info[options.field][key])

    elif mode == 'search':
        search_term = args[0]
        LOG.debug('output %s' % output)

        if output == 'bibtex' :
            medline_xml = search_fetch_xml(term=search_term,
                                           db=options.database,
                                           field=options.field,
                                           reldate=options.reldate,
                                           daterange=options.daterange,
                                           datetype=options.datetype,
                                           retmax=options.retmax,
                                           validate=options.validate,
                                           retmode=options.retmode,
            # NOTE(review): the closing argument/paren and the
            # `if options.raw :` guard are missing from this copy.
            print outfile, medline_xml
            # NOTE(review): missing '>>' above -- as written this prints
            # the file object itself to stdout instead of redirecting the
            # xml to outfile.
            bibtex = medline_xml_to_bibtex(medline_xml)
            print >> outfile, bibtex

        elif output == 'link' :
            # Assume that if you're looking for links
            # your search is already pretty refined,
            # so use the date options for link-limiting.
            link_xml = search_link(term=search_term,
                                   db=options.database,
                                   field=options.field,
            # NOTE(review): several keyword arguments (dates, retmax, ...)
            # are missing from this copy.
                                   validate=options.validate,
                                   link_term=options.link_term,
                                   fromdb=options.fromdb,
                                   cmd=options.link_cmd,
                                   linkname=options.linkname,
            # NOTE(review): link_holding/version arguments appear to be
            # missing here.
                                   link_reldate=options.reldate,
                                   link_daterange=options.daterange,
                                   link_datetype=options.datetype,
                                   link_retmode=options.retmode,)
            print >> outfile, link_xml

    if options.filename != None :
        # NOTE(review): the `outfile.close()` line is missing from this
        # copy.