Add ticker post and script.
author W. Trevor King <wking@drexel.edu>
Tue, 28 Dec 2010 21:37:29 +0000 (16:37 -0500)
committer W. Trevor King <wking@drexel.edu>
Tue, 28 Dec 2010 21:37:29 +0000 (16:37 -0500)
posts/ticker.mdwn [new file with mode: 0644]
posts/ticker/ticker.py [new file with mode: 0755]

diff --git a/posts/ticker.mdwn b/posts/ticker.mdwn
new file mode 100644 (file)
index 0000000..1d0153d
--- /dev/null
@@ -0,0 +1,10 @@
+[[ticker.py]] is a simple stock-quote scraper using [[Python]]'s
+[urllib2][] to grab pages and [lxml][] to parse the HTML.  It's a
+pretty straightforward example of elementary scraping in Python.
+
+[urllib2]: http://docs.python.org/library/urllib2.html
+[lxml]: http://codespeak.net/lxml/
+
+[[!tag tags/fun]]
+[[!tag tags/python]]
+[[!tag tags/tools]]
diff --git a/posts/ticker/ticker.py b/posts/ticker/ticker.py
new file mode 100755 (executable)
index 0000000..a175fc5
--- /dev/null
@@ -0,0 +1,161 @@
+#!/usr/bin/env python
+# Copyright (C) 2010  W. Trevor King <wking@drexel.edu>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Grab stock prices by ticker symbol.
+"""
+
+import logging
+from StringIO import StringIO
+import urllib2
+
+from lxml import etree
+
+
+class Grabber (object):
+    """Base class for website-specific quote scrapers.
+
+    Subclasses pass a `url` template containing a single `%s`
+    placeholder for the ticker symbol, and override `._parse_html()`
+    to extract the price from the fetched page.
+    """
+    def __init__(self, url):
+        self._url = url  # `%s`-style template, filled in by `._get_url()`
+
+    def quote(self, ticker):
+        "Floating point quote for the given `ticker` symbol string."
+        url = self._get_url(ticker)
+        logging.info('get quote for %s from %s using %s'
+                     % (ticker, url, self))
+        # `info` (the page headers) is fetched but currently unused;
+        # only the HTML body is parsed.
+        info,html = self._get_html(url)
+        quote = self._parse_html(html)
+        return quote
+
+    def _get_url(self, ticker):
+        "URL listing the quote for the given `ticker` symbol string."
+        return self._url % ticker
+
+    def _get_html(self, url):
+        "Page info and html associated with the given `url`."
+        f = urllib2.urlopen(url)
+        info = f.info()
+        html = f.read()
+        f.close()
+        return (info, html)
+
+    def _parse_html(self, html):
+        """Extract the floating point quote from the page's `html`.
+
+        This method must be overridden by website-specific subclasses.
+        """
+        raise NotImplementedError()
+
+
+class GoogleGrabber (Grabber):
+    "Grab quotes from Google Finance."
+    def __init__(self):
+        super(GoogleGrabber, self).__init__(
+            url='http://www.google.com/finance?q=%s')
+
+    def _parse_html(self, html):
+        """Extract quote from a snippet that looks like::
+
+            <span class="pr">
+              <span id="ref_29312_l">
+                64.77
+              </span>
+            </span>
+        """
+        # lxml's HTMLParser is lenient about malformed real-world HTML.
+        parser = etree.HTMLParser()
+        tree = etree.parse(StringIO(html), parser)
+        root = tree.getroot()
+        # NOTE(review): assumes exactly one <span class="pr"> on the page;
+        # an IndexError here means the site's markup has changed.
+        span = root.xpath(".//span[@class='pr']")[0]
+        # Join the text of the nested spans, then strip the surrounding
+        # whitespace before converting to float.
+        text = ''.join(span.itertext()).strip()
+        return float(text)
+
+
+class YahooGrabber (Grabber):
+    "Grab quotes from Yahoo! Finance."
+    def __init__(self):
+        super(YahooGrabber, self).__init__(
+            url='http://finance.yahoo.com/q?s=%s')
+
+    def _parse_html(self, html):
+        """Extract quote from a snippet that looks like::
+
+            <tr>
+              <th ...>Last Trade:</th>
+              <td ...>
+                <big>
+                  <b>
+                    <span ...>
+                      64.74
+                    </span>
+                  </b>
+                </big>
+              </td>
+            </tr>
+
+        For the implementation, see the `LXML tutorial`_.
+
+        .. _LXML tutorial:
+          http://codespeak.net/lxml/tutorial.html#using-xpath-to-find-text
+        """
+        parser = etree.HTMLParser()
+        tree = etree.parse(StringIO(html), parser)
+        root = tree.getroot()
+        # Grab every table row, then scan for the one labeled 'Last Trade:'.
+        rows = root.xpath('.//tr')
+        for row in rows:
+            # This XPath comparison evaluates to a boolean: does some <th>
+            # in this row have exactly the text 'Last Trade:'?
+            has_label = row.xpath(".//th/text() = 'Last Trade:'")
+            if has_label:
+                break
+        # On failure, dump all rows to ease debugging markup changes.
+        # NOTE(review): `assert` is stripped under `python -O`, and `row`
+        # is unbound if the page contains no <tr> at all; raising an
+        # explicit exception would be more robust.
+        assert has_label, '\n---\n\n'.join([
+                etree.tostring(row,  pretty_print=True) for row in rows])
+        # The first <td> in the matched row holds the nested price markup.
+        data = row.xpath('.//td')[0]
+        text = ''.join(data.itertext()).strip()
+        return float(text)
+
+
+GRABBERS = {}
+# Auto-register every Grabber subclass defined above under a short
+# lowercase name, e.g.
+#   GRABBERS['google'] = GoogleGrabber
+for name,obj in locals().items():
+    match = False
+    try:
+        # issubclass() raises TypeError for non-class objects (modules,
+        # functions, ...), so guard the check rather than pre-filtering.
+        if issubclass(obj, Grabber) and obj != Grabber:
+            match = True
+    except TypeError:
+        pass
+    if match:
+        # Strip the 'Grabber' suffix: 'GoogleGrabber' -> 'google'.
+        n = name[:-len('Grabber')].lower()
+        GRABBERS[n] = obj
+# Clean up the loop variables so they don't pollute the module namespace.
+del name, obj, match
+
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+
+    # Command-line interface: ticker.py [-v ...] [-g GRABBER] TICKER ...
+    p = OptionParser(usage='%prog [options] TICKER ...')
+    p.disable_interspersed_args()
+    # Each -v raises the verbosity one level (ERROR -> WARNING -> ...).
+    p.add_option('-v', '--verbose', dest='verbose', default=0, action='count',
+                 help='increment verbosity')
+    grabbers = sorted(GRABBERS.keys())
+    p.add_option('-g', '--grabber', dest='grabber', default='google',
+                 type='choice', choices=grabbers,
+                 help='select grabber from %s (%%default)' % grabbers)
+
+    options,args = p.parse_args()
+
+    # Map the -v count onto logging levels, clamping at DEBUG.
+    log_levels = [logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]
+    log_level = log_levels[min(options.verbose, len(log_levels)-1)]
+    logging.basicConfig(level=log_level)
+
+    # Instantiate the selected grabber and print one tab-separated quote
+    # per ticker symbol given on the command line.
+    g = GRABBERS[options.grabber]()
+    print '\t'.join([str(g.quote(ticker)) for ticker in args])