From: W. Trevor King
Date: Tue, 28 Dec 2010 21:37:29 +0000 (-0500)
Subject: Add ticker post and script.
X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=ae319e06852b8e6b3ef8bf91a721db5a55b8546f;p=blog.git

Add ticker post and script.
---

diff --git a/posts/ticker.mdwn b/posts/ticker.mdwn
new file mode 100644
index 0000000..1d0153d
--- /dev/null
+++ b/posts/ticker.mdwn
@@ -0,0 +1,10 @@
+[[ticker.py]] is a simple stock-quote scraper using [[Python]]'s
+[urllib2][] to grab pages and [lxml][] to parse the HTML.  It's a
+pretty straightforward example of elementary scraping in Python.
+
+[urllib2]: http://docs.python.org/library/urllib2.html
+[lxml]: http://codespeak.net/lxml/
+
+[[!tag tags/fun]]
+[[!tag tags/python]]
+[[!tag tags/tools]]
diff --git a/posts/ticker/ticker.py b/posts/ticker/ticker.py
new file mode 100755
index 0000000..a175fc5
--- /dev/null
+++ b/posts/ticker/ticker.py
@@ -0,0 +1,152 @@
+#!/usr/bin/env python
+# Copyright (C) 2010 W. Trevor King
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+"""Grab stock prices by ticker symbol.
+"""
+
+import logging
+from StringIO import StringIO
+import urllib2
+
+from lxml import etree
+
+
+class Grabber (object):
+    "Base class for website-specific quote scrapers."
+    def __init__(self, url):
+        self._url = url
+
+    def quote(self, ticker):
+        "Floating point quote for the given `ticker` symbol string."
+        url = self._get_url(ticker)
+        logging.info('get quote for %s from %s using %s'
+                     % (ticker, url, self))
+        info,html = self._get_html(url)
+        quote = self._parse_html(html)
+        return quote
+
+    def _get_url(self, ticker):
+        "URL listing the quote for the given `ticker` symbol string."
+        return self._url % ticker
+
+    def _get_html(self, url):
+        "Page info and html associated with the given `url`."
+        f = urllib2.urlopen(url)
+        info = f.info()
+        html = f.read()
+        f.close()
+        return (info, html)
+
+    def _parse_html(self, html):
+        """Extract the floating point quote from the page's `html`.
+
+        This method must be overridden by website-specific subclasses.
+        """
+        raise NotImplementedError()
+
+
+class GoogleGrabber (Grabber):
+    "Grab quotes from Google Finance."
+    def __init__(self):
+        super(GoogleGrabber, self).__init__(
+            url='http://www.google.com/finance?q=%s')
+
+    def _parse_html(self, html):
+        """Extract quote from a snippet that looks like::
+
+            <span class="pr">
+                64.77
+            </span>
+        """
+        parser = etree.HTMLParser()
+        tree = etree.parse(StringIO(html), parser)
+        root = tree.getroot()
+        span = root.xpath(".//span[@class='pr']")[0]
+        text = ''.join(span.itertext()).strip()
+        return float(text)
+
+
+class YahooGrabber (Grabber):
+    "Grab quotes from Yahoo! Finance."
+    def __init__(self):
+        super(YahooGrabber, self).__init__(
+            url='http://finance.yahoo.com/q?s=%s')
+
+    def _parse_html(self, html):
+        """Extract quote from a snippet that looks like::
+
+            <tr>
+              <th>Last Trade:</th>
+              <td><span>64.74</span></td>
+            </tr>
+
+        For the implementation, see the `LXML tutorial`_.
+
+        .. _LXML tutorial:
+          http://codespeak.net/lxml/tutorial.html#using-xpath-to-find-text
+        """
+        parser = etree.HTMLParser()
+        tree = etree.parse(StringIO(html), parser)
+        root = tree.getroot()
+        rows = root.xpath('.//tr')
+        has_label = False  # initialized so empty `rows` fails the assert, not NameError
+        for row in rows:
+            has_label = row.xpath(".//th/text() = 'Last Trade:'")
+            if has_label:
+                break
+        assert has_label, '\n---\n\n'.join([
+            etree.tostring(row, pretty_print=True) for row in rows])
+        data = row.xpath('.//td')[0]
+        text = ''.join(data.itertext()).strip()
+        return float(text)
+
+
+GRABBERS = {}
+# Create a dictionary of (name, grabber) pairs.  For example
+#   GRABBERS['google'] = GoogleGrabber
+for name,obj in locals().items():
+    match = False
+    try:
+        if issubclass(obj, Grabber) and obj is not Grabber:
+            match = True
+    except TypeError:
+        pass  # obj is not a class at all; skip it
+    if match:
+        n = name[:-len('Grabber')].lower()
+        GRABBERS[n] = obj
+del name, obj, match
+
+
+if __name__ == '__main__':
+    from optparse import OptionParser
+
+    p = OptionParser(usage='%prog [options] TICKER ...')
+    p.disable_interspersed_args()
+    p.add_option('-v', '--verbose', dest='verbose', default=0, action='count',
+                 help='increment verbosity')
+    grabbers = sorted(GRABBERS.keys())
+    p.add_option('-g', '--grabber', dest='grabber', default='google',
+                 type='choice', choices=grabbers,
+                 help='select grabber from %s (%%default)' % grabbers)
+
+    options,args = p.parse_args()
+
+    log_levels = [logging.ERROR, logging.WARNING, logging.INFO, logging.DEBUG]
+    log_level = log_levels[min(options.verbose, len(log_levels)-1)]
+    logging.basicConfig(level=log_level)
+
+    g = GRABBERS[options.grabber]()
+    print '\t'.join([str(g.quote(ticker)) for ticker in args])