From bf150e9990d5dd21793284b1b434af515f8a7099 Mon Sep 17 00:00:00 2001 From: "W. Trevor King" Date: Fri, 15 Apr 2011 14:50:58 -0400 Subject: [PATCH] Began versioning my entrez.py script. --- posts/entrez/entrez.py | 904 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 904 insertions(+) create mode 100755 posts/entrez/entrez.py diff --git a/posts/entrez/entrez.py b/posts/entrez/entrez.py new file mode 100755 index 0000000..af27730 --- /dev/null +++ b/posts/entrez/entrez.py @@ -0,0 +1,904 @@ +#!/usr/bin/python +# +# Copyright (C) 1998-2004 Frederic Gobry +# Copyright (C) 2008 W. Trevor King +# +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version 2 +# of the License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# Code following John Vu's medline query code pybliographer/Pyblio/Query.py, +# +# Python interface to the Entrez databases. +# See http://eutils.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html +# Current as of August 1, 2007 +# +# Rules: +# * Run retrieval scripts on weekends or between 9 pm and 5 am Eastern Time weekdays for any series of more than 100 requests. +# * Send E-utilities requests to http://eutils.ncbi.nlm.nih.gov, not the standard NCBI Web address. +# * Make no more than one request every 3 seconds. +# * Use the URL parameter email, and tool for distributed software, so that we can track your project and contact you if there is a problem. 
def urlencode(param_dict) :
    """
    Build a URL query string from a dict of parameters.

    Parameters whose value is None are treated as unused and left out.
    Every other key and value is converted with str() and percent-encoded,
    so that spaces and reserved characters ('&', '=', '#', '[', ...) in a
    value can no longer corrupt the query.  Spaces become '+'.  A literal
    '+', '/' or ',' is passed through untouched, which keeps terms that
    already use '+' as a word separator, dates such as 2002/01/01, and
    comma-separated ID lists readable.

    Returns the 'key=value' pairs joined by '&' in the dict's iteration
    order, or '' if there is nothing to send.
    """
    # The quoting helpers live in urllib on Python 2 and in urllib.parse
    # on Python 3; accept either layout.
    try :
        from urllib import quote_plus
    except ImportError :
        from urllib.parse import quote_plus
    pairs = []
    for key,value in param_dict.items() :
        if value is None :
            continue # ignore unused parameter
        pairs.append("%s=%s" % (quote_plus(str(key)),
                                quote_plus(str(value), safe='+/,')))
    return "&".join(pairs)
def get_child_nodes(node, child_name):
    """
    Return the direct child elements of `node` whose tag is `child_name`.

    Only element nodes are considered; text, comment, etc. children are
    ignored.  The result is a (possibly empty) list in document order.
    """
    return [n for n in node.childNodes
            if n.nodeType == n.ELEMENT_NODE and n.tagName == child_name]

def get_child_node(node, child_name):
    """
    Return the single direct child element of `node` named `child_name`.

    Raises AssertionError unless exactly one such child exists.

    This used to be a second definition of get_child_nodes(), which
    shadowed the list-returning version above, called an undefined
    get_child_node(), and indexed the parent node instead of the list
    of matches.
    """
    nodes = get_child_nodes(node, child_name)
    assert len(nodes) == 1, "%d child nodes named %s" % (len(nodes), child_name)
    return nodes[0]
def delist_dict(dict) :
    """
    Unwrap single-element lists stored in a dict, in place.

    Any value that is a list of length 1 is replaced by its only element,
    e.g. {'some-tag':['some-text', 'some-other-text'],
          'other-tag':['some-other-text']}
    becomes
         {'some-tag':['some-text', 'some-other-text'],
          'other-tag':'some-other-text'}
    Longer lists, empty lists and non-list values are left untouched.
    The (modified) input dict is also returned for convenience.
    """
    # Collect the keys first, then rewrite them, so the dict is not
    # modified while it is being scanned.
    singles = [name for name in dict
               if isinstance(dict[name], list) and len(dict[name]) == 1]
    for name in singles :
        dict[name] = dict[name][0]
    return dict
def clean_parsed_einfo(parsed, parsed_islocal=True, debug=False):
    """
    Helper function for various einfo processing functions.
    Release the parsed xml structure if the calling function created it.

    parsed:         parsed DOM document; only its unlink() method is used
    parsed_islocal: true if the caller built `parsed` itself and is
                    therefore responsible for releasing it
    debug:          if true, print a progress message

    Returns None.  Nothing happens when parsed_islocal is false, since
    the DOM then belongs to whoever passed it in.
    """
    # Plain truth tests: the old `== True` comparisons silently skipped
    # the cleanup for truthy flags that were not exactly True/1.
    if not parsed_islocal :
        return
    if debug :
        print("Cleaning up einfo parsing")
    parsed.unlink() # clean up the DOM
def link_dict(db='pubmed', page=None, parsed=None, debug=False):
    """
    Collect the links listed in the einfo record of database `db`.

    db:     database name, used if a new einfo page has to be downloaded
    page:   optional einfo xml string to parse instead of downloading
    parsed: optional already-parsed einfo DOM to reuse
    debug:  if true, print progress information

    Returns (links, tags, link_info) where
      links     is the list of link names, in document order,
      tags      is the list of child tag names found in the Link entries,
      link_info maps each link name to a dict of that link's tag values.

    Raises AssertionError if the document does not contain exactly one
    LinkList, if the LinkList holds anything but Link elements, if a Link
    does not have exactly one Name, or if a Link introduces tags that
    earlier Links did not have.
    """
    parsed,parsed_islocal = get_parsed_einfo(db, page, parsed, debug)
    try :
        links = []
        tags = []
        link_info = {} # keyed by link name (was a list, which cannot be)
        linklists = parsed.getElementsByTagName("LinkList")
        assert len(linklists) == 1, "%s\n\n%d LinkLists!" % (parsed.toxml(), len(linklists))
        linklist = linklists[0]
        for node in linklist.childNodes:
            if node.nodeType != node.ELEMENT_NODE :
                continue # ignore text, comment, etc. nodes
            assert node.tagName == "Link", "Unrecognized tag '%s' in LinkList" % node.tagName
            link,new_tags = get_child_dict(node)
            assert len(link['Name']) == 1, "Multiple link names %s" % str(link['Name'])
            link = delist_dict(link)
            links.append(link['Name'])
            new_tags = unique(tags + new_tags)
            if tags != []:
                assert new_tags == tags, "Inconsistent tags"
            tags = new_tags
            link_info[link['Name']] = link
    finally :
        # release the DOM even when one of the sanity checks fails
        clean_parsed_einfo(parsed,parsed_islocal, debug)
    return (links, tags, link_info)
+ "Ensure that field is a valid field for the database db." + try : + fields.index(field.upper()) + except ValueError: + raise Exception, "Field '%s' invalid\nValid fields are\n %s" \ + % (field, str(fields)) + +def strip_fields_from_term(term): + "HACK: really stupid algorithm" + fields = [] + infield = False + for i in range(len(term)): + if term[i] == '[' and infield == False : + infield = True + field_start = i+1 + elif term[i] == ']' and infield == True : + infield = False + fields.append(term[field_start:i]) + return fields + +def validate_search_term(term, fields): + "Ensure that the fields in term are valid fields for the database db." + for field in strip_fields_from_term(term) : + validate_field(field, fields) + + +## Search an Entrez database + +def _query_esearch(term, db='pubmed', field=None, + reldate=None, daterange=None, datetype=None, + retmax=None, rettype=None, sort=None, + validate=False, valid_fields=None, debug=False) : + """ + Search an Entrez database. + http://eutils.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html + + Does not currently support the usehistory, WebEnv, query_key, retstart, or retmode parameters. + + Help with the arguments adapted from esearch_help.html: + + term: This command uses search terms or phrases with or without Boolean operators. + You can search in several fields using the [term field] tag. + You can search in a single field using the 'field' parameter below. + ?You may also tag search terms using field=tag.? I don't understand this line + For example: term=asthma[MESH]+OR+hay+fever[MESH] + 'term=asthma[MESH]' is the same as 'term=asthma&field=MESH' + ( http://www.ncbi.nlm.nih.gov/books/bv.fcgi?rid=helpentrez.section.EntrezHelp.Writing_Advanced_Sea ) + + db: This command selects the database to be searched + For example: db=pubmed + + field: Use this command to specify a specific search field. + PubMed fields: affl, auth, ecno, jour, iss, mesh,... 
+ Retrieve with field_dict('pubmed') + For example: field=auth + + reldate: Limit items a number of days immediately preceding today's date. + For example: reldate=365 + + daterange: Limit results bounded by two specific dates. + For example: daterange=('2001', '2002/01/01') + (implemented as mindate=2001&maxdate=2002/01/01) + + datetype: Limit dates to a specific date field based on database. + For example: datetype=edat + + retmax: Limit the number of items retrieved + For example: retmax=100 + + rettype: Select the retrieval type + PubMed values: count, uilist (default) + + sort: Sort the returned uilist + PubMed values: author, last+author, journal, pub+date + + """ + if daterange != None : + assert len(daterange) == 2, "Invalid daterange '%s', should be e.g. ('2001', '2002/01/01')" + reldate == None, "Specifying date with daterange AND reldate!" + mindate = daterange[0] + maxdate = daterange[1] + else : + mindate = None + maxdate = None + if validate : + assert len(valid_fields) > 0, "Need a list of valid fields to validate" + if field != None : + validate_field(field) + validate_search_term(term, valid_fields) + params = urlencode ({ + 'tool' : TOOL, + 'email' : EMAIL, + 'term' : term, + 'db': db, + 'field' : field, + 'reldate' : reldate, + 'mindate' : mindate, + 'maxdate' : maxdate, + 'datetype' : datetype, + 'maxdate' : maxdate, + 'retmax' : retmax, + 'rettype' : rettype, + 'sort' : sort}) + + if debug : + print "Getting esearch from '%s?%s'" % (esearch_url, params) + f = urllib.urlopen ("%s?%s" % (esearch_url, params)) + string = f.read() + f.close() + if debug == True: + print string + print "" + return string + +def parse_esearch(page): + "Parse the xml returned by _query_esearch()" + parsed = dom.parseString(page) + + pid_list = [] + for node in parsed.getElementsByTagName("Id"): + pid_list.append(get_text(node)) + + parsed.unlink() + + return pid_list + + +## Fetch records by Primary ID from an Entrez database + +def _query_efetch(id, db='pubmed', + 
retmax=None, retmode='xml', rettype='medline', + debug=False) : + """ + Fetch records by primary ID from an Entrez database. + http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html + http://eutils.ncbi.nlm.nih.gov/entrez/query/static/efetchlit_help.html + + + Does not currently support the usehistory, WebEnv, query_key, or retstart parameters. + + Help with the arguments adapted from efetchlit_help.html: + + id: Primary UIs identifying the documents to fetch + For example: 'id=11877539, 11822933,11871444' + + db: This command selects the database to be searched + For example: db=pubmed + + retmax: Limit the number of items retrieved (default 20) + For example: retmax=100 + + retmode: Select the retrieval output format + xml (not journals) + html + text + asn.1 (not journals) + + rettype: Select the retrieval type + uilist + abstract (not omim) + citation (not omim) + medline (not omim) + full (journals and omim) + + Not all retmodes are possible with all rettypes: + PubMed Options: + uilist abstract citation medline + xml x x* x* x* + text x x x x + html x x x x + asn.1 n/a x* x* x + x = retrieval mode available + * returned retrieval type is the complete record in the retrieval mode + n/a - not available + OMIM Options: (not case sensitive) + uilist docsum synopsis variants detailed ExternalLink + (MIM (Clinical (Allelic + numbers) synopsis) Variants) + xml x x* x* x* x* x* + text x x x x x* x* + html x x x x x* x* + asn.1 x* x* x* x* x* x* + x = retrieval mode available + * returned retrieval type is the complete record in the retrieval mode + n/a - not available + + """ + idstring = "" + for d in id : + idstring += "%s," % d + idstring = idstring[:-1] # remove trailing comma + params = urlencode ({ + 'tool' : TOOL, + 'email' : EMAIL, + 'id' : idstring, + 'db': db, + 'retmax' : retmax, + 'retmode' : retmode, + 'rettype' : rettype}) + + if debug : + print "Getting efetch from '%s?%s'" % (efetch_url, params) + f = urllib.urlopen ("%s?%s" % 
(efetch_url, params)) + string = f.read() + f.close() + if debug == True: + print string + print "" + return string + + +## Fetch links by Primary ID from an Entrez database + +def _query_elink(id, term=None, db='all', dbfrom='pubmed', + cmd=None, linkname=None, holding=None, + version=1, + reldate=None, daterange=None, datetype=None, + retmode='xml', + debug=False) : + """ + Fetch links from a list of primary IDs in an Entrez database. + http://eutils.ncbi.nlm.nih.gov/entrez/query/static/elink_help.html + http://www.ncbi.nlm.nih.gov/entrez/query/static/entrezlinks.html + + Does not currently support the WebEnv or query_key parameters. + + Help with the arguments adapted from efetchlit_help.html: + + id: Primary UIs identifying the documents to fetch + For example: 'id=11877539, 11822933,11871444' + + term: This command uses search terms or phrases with or without Boolean operators + to limit the returned matching links. + + db: This command selects the databases to be searched for link targets. + For example: db=all + + dbfrom: This command selects the database containing the ids. + For example: dbfrom=pubmed + + + cmd: Link commands + * prlinks - List the hyperlink to the primary LinkOut provider for + multiple IDs and database. Each ID is processed separately. + * prlinks&retmode=ref - Create a hyperlink to the primary LinkOut provider + for a single ID and database. Return the elink + command, since fetching it breaks the relative + links in the publisher's page. + * llinks - List LinkOut URLs and Attributes, except PubMed libraries, for + multiple IDs and database. Each ID is processed separately. + * llinkslib - List LinkOut URLs and Attributes for multiple IDs and + database. Each ID is processed separately. + * lcheck - Check for the existence (Y or N) of an external link in for + multiple IDs and database. + * ncheck - Check for the existence of a neighbor link for each ID within + a database, e.g., Related Articles in PubMed. 
+ * neighbor - Display neighbors within a database. + * neighbor_history - Create history (WebEnv & query_key) for use in other + EUtilities. + * acheck - Lists Entrez databases links for multiple IDs from a single + database. + + linkname: link to a specific neighbor subset + For example: linkname=nucleotide_nucleotide_comp + + holding: List LinkOut URLs for the specified holding provider, (library). + Used only in conjunction with cmd=llinks or cmd=llinkslib + For example: cmd=llinkslib&holding=medlib + + version: Include a version number to refer to the latest DTD. + For example: version=1 + retrieves the latest DTD (eLink_050511.dtd) that includes the additional + elements, MenuTag, LinkInfo and IdLinkSet. + + Date command are only valid for dbfrom=pubmed & cmd=neighbor + reldate: Limit items a number of days immediately preceding today's date. + For example: reldate=365 + + daterange: Limit results bounded by two specific dates. + For example: daterange=('2001', '2002/01/01') + (implemented as mindate=2001&maxdate=2002/01/01) + + datetype: Limit dates to a specific date field based on database. + For example: datetype=edat + + retmode: Select the retrieval output format + xml (default) + ref (only used with cmd=prlinks for one ID) + + """ + idstring = "" + for d in id : + idstring += "%s," % d + idstring = idstring[:-1] # remove trailing comma + + params = urlencode ({ + 'tool' : TOOL, + 'email' : EMAIL, + 'id' : idstring, + 'term': term, + 'db': db, + 'dbfrom': dbfrom, + 'cmd': cmd, + 'linkname': linkname, + 'holding': holding, + 'version': version, + 'reldate': reldate, + 'daterange': daterange, + 'datetype': datetype, + 'retmode' : retmode}) + + if debug : + print "Getting elink from '%s?%s'" % (elink_url, params) + f = urllib.urlopen ("%s?%s" % (elink_url, params)) + + if cmd == 'prlinks' and retmode == 'ref' : + # Just get the link, we don't need the provider's webpage HTML. 
+ url = f.geturl() + f.close() + return url + + string = f.read() + f.close() + if debug == True: + print string + print "" + return string + + +## Combining the searching and parsing (dropping some of the less used features) + +def search_fetch_xml(term, db='pubmed', field=None, + reldate=None, daterange=None, datetype=None, + retmax=None, sort=None, + validate=False, valid_fields=None, + retmode='xml', rettype='medline', + debug=False) : + if validate and valid_fields == None: + valid_fields,field_tags,field_info = field_dict(db, debug=debug) + search_page = _query_esearch(term, db, field, + reldate, daterange, datetype, + retmax, rettype='uilist', sort=sort, + validate=validate, valid_fields=valid_fields, + debug=debug) + pid_list = parse_esearch(search_page) + fetch_page = _query_efetch(pid_list, db, retmax, retmode, rettype, debug) + return fetch_page + +def search_link(term, db='pubmed', field=None, + reldate=None, daterange=None, datetype=None, + retmax=None, sort=None, + validate=False, valid_fields=None, + link_term=None, fromdb=None, + cmd=None, linkname=None, link_holding=None, + version=1, + link_reldate=None, link_daterange=None, link_datetype=None, + link_retmode='xml', + debug=False) : + if validate and valid_fields == None: + valid_fields,field_tags,field_info = field_dict(db, debug=debug) + search_page = _query_esearch(term, db, field, + reldate, daterange, datetype, + retmax, rettype='uilist', sort=sort, + validate=validate, valid_fields=valid_fields, + debug=debug) + pid_list = parse_esearch(search_page) + link_page = _query_elink(pid_list, term=link_term, db=db, dbfrom=fromdb, + cmd=cmd, linkname=linkname, holding=link_holding, + version=version,reldate=link_reldate, + daterange=link_daterange, datetype=link_datetype, + retmode=link_retmode, + debug=debug) + return link_page + +## Use the external bibutils package to convert to BibTeX format + +def medline_xml_to_bibtex(fetch_page): + child_stdout,child_stdin = popen2("med2xml | xml2bib -fc | 
## Random

def hints(*callback_args) :
    """Print Entrez search hints and exit.

    Accepts (and ignores) the (option, opt_str, value, parser) arguments
    that optparse hands to a callback, so it can be registered directly
    as the handler for -H/--hints.  Calling it without arguments works
    as well.  Always raises SystemExit(0) after printing.
    """

    print("""
free full text [sb]


""")
    sys.exit(0)

## Test with a mini-searching application

if __name__ == "__main__" :
    from optparse import OptionParser

    usage_string = """%prog [options] SEARCH_TERM (print medline xml matching search)
       | %prog -l [options] SEARCH_TERM (print links to entries matching search)
       | %prog -L [-d DATABASE] [-f FILE] (list databases)
       | %prog -X [-d DATABASE] [-F FIELD] [-f FILE] (list fields in a database, or details on a single field)

2008, W. Trevor King.

See the docstrings in %prog or
 http://www.ncbi.nlm.nih.gov/entrez/query/static/eutils_help.html
for more details.
"""
    parser = OptionParser(usage=usage_string, version="%prog 0.1")

    # Explaination by Jerry Stratton, http://www.hoboes.com/Mimsy/?ART=511
    # "
    # metavar is the name used in the help for that options required text,
    # and dest is the name of the property you'll use to access the value of that option.
    # "

    parser.add_option('-d', '--database', dest="database",
                      help="Search DATABASE (default '%default')",
                      type='string', metavar="DATABASE", default='pubmed')
    parser.add_option('-f', '--file', dest="filename",
                      help="write output to FILE (default stdout)",
                      type='string', metavar="FILE")
    parser.add_option('-v', '--verbose', dest="verbose", action="store_true",
                      help="Print lots of debugging information",
                      default=False)
    parser.add_option('-H', '--hints', callback=hints,
                      help="Print Entrez search hints and exit",
                      action="callback")


    # mode control options
    mode = 'search'
    def set_mode(option, opt_str, value, parser):
        # optparse callback: remember which mode flag was given
        global mode
        long_option = option.get_opt_string()
        if long_option == '--list-mode' :
            mode = 'list'
        elif long_option == '--explain-mode' :
            mode = 'explain'

    parser.add_option('-L', '--list-mode', callback=set_mode,
                      help="Run in list mode", action="callback")
    parser.add_option('-X', '--explain-mode', callback=set_mode,
                      help="Run in explain mode", action="callback")

    # search-fetch-xml-to-? options
    output = 'bibtex'
    def set_output(option, opt_str, value, parser):
        # optparse callback: switch from citation output to link output
        global output
        long_option = option.get_opt_string()
        if long_option == '--output-link' :
            output = 'link'
    parser.add_option('-W', '--raw', dest="raw", action="store_true",
                      help="Output raw Entrez xml", default=False)
    parser.add_option('-F', '--field', dest="field",
                      help="Limit SEARCH_TERM to FIELD",
                      type='string', metavar="FIELD")
    parser.add_option('-r', '--reldate', dest="reldate",
                      help="Limit search to dates within DAYS of today",
                      type='string', metavar="DAYS")
    parser.add_option('-R', '--daterange', dest="daterange",
                      help="Limit search to dates within DATERANGE (e.g. '2001/1/1,2002')",
                      type='string', metavar="DATERANGE")
    parser.add_option('-t', '--datetype', dest="datetype",
                      help="Select field to apply date limits to (e.g. 'edat' for Entrez date)",
                      type='string', metavar="DATETYPE")
    parser.add_option('-m', '--retmax', dest="retmax",
                      help="Return at max RETMAX items from a successful search (default %default)",
                      type='string', metavar="RETMAX", default=20)
    parser.add_option('-M', '--retmode', dest="retmode",
                      help="Select fetch/link output format",
                      type='string', metavar="RETMODE", default='xml')
    parser.add_option('-V', '--validate', dest="validate", action="store_true",
                      help="Check that FIELD and field tags in SEARCH_TERM are valid for DB",
                      default=False)

    # output link options
    parser.add_option('-l', '--output-link', callback=set_output,
                      help="Output a link (instead of xml citations)",
                      action="callback")
    parser.add_option('-c', '--link-cmd', dest="link_cmd",
                      help="Select link output",
                      type='string', metavar="LINK_CMD")
    parser.add_option('-T', '--link-term', dest="link_term",
                      help="Limit links to those matching LINK_TERM",
                      type='string', metavar="LINK_TERM")
    parser.add_option('-D', '--from-database', dest="fromdb",
                      help="Limit links to those from FROMDATABASE)",
                      type='string', metavar="FROMDATABASE")
    parser.add_option('-n', '--link-name', dest="linkname",
                      help="Limit links to a specific neighbor",
                      type='string', metavar="LINKNAME")

    (options, args) = parser.parse_args()
    if mode == 'search' and not args :
        # report a usage error instead of an IndexError traceback
        parser.error("SEARCH_TERM is required in search mode")
    parser.destroy()

    # The query functions expect daterange as a (mindate, maxdate) pair,
    # while the command line supplies 'MINDATE,MAXDATE'.
    daterange = options.daterange
    if daterange is not None :
        daterange = tuple(daterange.split(','))

    # open the output file if specified
    if options.filename is None :
        outfile = sys.stdout
    else :
        outfile = open(options.filename, 'w')

    def emit(text) :
        # write one line of program output to the selected destination
        outfile.write("%s\n" % (text,))

    try :
        if options.verbose :
            print("Operating in %s mode" % mode)

        if mode == 'list' :
            emit("Available databases:")
            databases = database_list(debug=options.verbose)
            for db in databases:
                emit("\t%s" % db)

        elif mode == 'explain':
            fields,tags,field_info = field_dict(db=options.database,
                                                debug=options.verbose)
            if options.field is None :
                emit("Available fields in %s:" % options.database)
                # column widths: longest field name, longest full name
                field_size = [0,0]
                for field in fields :
                    if len(field) > field_size[0] :
                        field_size[0] = len(field)
                    if len(field_info[field]['FullName']) > field_size[1] :
                        field_size[1] = len(field_info[field]['FullName'])
                for field in fields :
                    emit("\t%*.*s\t%-*.*s"
                         % (field_size[0], field_size[0], field,
                            field_size[1], field_size[1], field_info[field]['FullName']))
            else :
                emit("Field %s in %s:" % (options.field,options.database))
                # column widths: longest tag name, longest tag value
                field_size = [0,0]
                for key in tags:
                    if len(key) > field_size[0] :
                        field_size[0] = len(key)
                    if len(field_info[options.field][key]) > field_size[1] :
                        field_size[1] = len(field_info[options.field][key])
                for key in tags:
                    emit("\t%*.*s\t%-*.*s"
                         % (field_size[0], field_size[0], key,
                            field_size[1], field_size[1], field_info[options.field][key]))

        elif mode == 'search':
            search_term = args[0]
            if options.verbose :
                print("Output %s" % output)

            if output == 'bibtex' :
                medline_xml = search_fetch_xml(term=search_term,
                                               db=options.database,
                                               field=options.field,
                                               reldate=options.reldate,
                                               daterange=daterange,
                                               datetype=options.datetype,
                                               retmax=options.retmax,
                                               validate=options.validate,
                                               retmode=options.retmode,
                                               rettype='medline',
                                               debug=options.verbose)
                if options.raw :
                    # was `print outfile, medline_xml`, which sent the xml
                    # (and the file object's repr) to stdout instead
                    emit(medline_xml)
                else :
                    bibtex = medline_xml_to_bibtex(medline_xml)
                    emit(bibtex)

            elif output == 'link' :
                # Assume that if you're looking for links
                # your search is already pretty refined,
                # so use the date options for link-limiting.
                link_xml = search_link(term=search_term,
                                       db=options.database,
                                       field=options.field,
                                       reldate=None,
                                       daterange=None,
                                       datetype=None,
                                       retmax=None,
                                       sort=None,
                                       validate=options.validate,
                                       valid_fields=None,
                                       link_term=options.link_term,
                                       fromdb=options.fromdb,
                                       cmd=options.link_cmd,
                                       linkname=options.linkname,
                                       link_holding=None,
                                       version=1,
                                       link_reldate=options.reldate,
                                       link_daterange=daterange,
                                       link_datetype=options.datetype,
                                       link_retmode=options.retmode,
                                       debug=options.verbose)
                emit(link_xml)
    finally :
        # close the output file even if a query or conversion fails
        if options.filename is not None :
            outfile.close()