Add nasdaq.py scraper and update README.
authorW. Trevor King <wking@drexel.edu>
Thu, 21 Jul 2011 19:36:12 +0000 (15:36 -0400)
committerW. Trevor King <wking@drexel.edu>
Thu, 21 Jul 2011 19:36:12 +0000 (15:36 -0400)
README
insider/scrape/__init__.py [new file with mode: 0644]
insider/scrape/nasdaq.py [new file with mode: 0644]

diff --git a/README b/README
index 2e4ccbdca72863fdd355c4e60f5920a366bb7b6e..8063787b6e169b9d950928c0666541e3968cb98b 100644 (file)
--- a/README
+++ b/README
@@ -44,6 +44,21 @@ look like
 
   $ PYTHONPATH=".:$PYTHONPATH" python example/manage.py runserver
 
+Scraping
+========
+
+Entering transaction data by hand can be tedious and error-prone.  To
+automate the task, you should write scrapers to look up and enter
+transaction data automatically.  To get you started, I've written
+`insider/scrape/nasdaq.py`, which scrapes `NASDAQ's interface`__ to
+`EDGAR`_\'s data.  Use the scraper with something like::
+
+  $ export PYTHONPATH='.'
+  $ export DJANGO_SETTINGS_MODULE='example.settings'
+  $ python insider/scrape/nasdaq.py NYSE:RHT NASDAQ:GOOG
+
+__ NASDAQ_
+
 Hacking
 =======
 
@@ -58,4 +73,6 @@ That's a good place to start if you're new to Django.
 .. _dt2-docs: http://django-tables2.readthedocs.org/en/latest/
 .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/
 .. _Django documentation: https://docs.djangoproject.com/
+.. _NASDAQ: http://www.nasdaq.com/reference/ownership.stm
+.. _EDGAR: http://www.edgar-online.com/
 .. _Django tutorial: https://docs.djangoproject.com/en/1.3/intro/tutorial01/
diff --git a/insider/scrape/__init__.py b/insider/scrape/__init__.py
new file mode 100644 (file)
index 0000000..e69de29
diff --git a/insider/scrape/nasdaq.py b/insider/scrape/nasdaq.py
new file mode 100644 (file)
index 0000000..5a1e616
--- /dev/null
@@ -0,0 +1,136 @@
+"""Scrape insider trade information from `holdings.nasdaq.com`
+"""
+
+import datetime
+import decimal
+import urllib2
+
+from BeautifulSoup import BeautifulSoup
+
+from insider.models import add_transaction
+
+
+# Lookup tables for decoding NASDAQ's abbreviated cell values.  The
+# top-level keys are the field names used by `get_trades`; a cell
+# value missing from any table other than 'person' makes `get_trades`
+# print a diagnostic and re-raise the `KeyError`.
+CODE = {
+    'person': {
+        # Convert NASDAQ's people (generally LAST FIRST[ MIDDLE]) to
+        # Person.name.  This is for hardcoded overrides when the
+        # default algorithm fails.
+        },
+    'relation': {'OFF': 'officer', 'DIR': 'Director'},
+    # `get_trades` inspects the decoded transaction names: names
+    # containing 'Non Open Market' mark the trade as off-exchange, and
+    # names containing 'sell', 'option' or 'disposition' (case
+    # insensitive) flip the sign of the traded share count.
+    'transaction': {
+        '': None,  # an empty cell decodes to "no transaction"
+        'AS': 'Automatic Sell',
+        'AB': 'Automatic Buy',
+        'JS': 'Disposition (Non Open Market)',
+        'JB': 'Acquisition (Non Open Market)',
+        'OE': 'Option Exercise',
+        'S': 'Sell',
+        'B': 'Buy',
+        },
+    'ownership': {
+        'D': 'direct',
+        'IN': 'indirect',
+        },
+    }
+
+
+def get_trades(ticker):
+    exchange_symbol,company_symbol = ticker.split(':')
+    url = 'http://holdings.nasdaq.com/asp/Form4.asp?selected={}'.format(
+        company_symbol)
+    html = urllib2.urlopen(url).read()
+    soup = BeautifulSoup(html)
+    table = soup('table', {'class': 'holdings', 'width': '100%'})[1]
+    for row in table('tr'):
+        # extract data values
+        tds = row('td')
+        line_match = False
+        data = {'exchange': 'UNKNOWN', 'exchange_symbol': exchange_symbol,
+                'company': 'UNKNOWN', 'company_symbol': company_symbol,
+                'source': url}
+        for i,(field,class_) in enumerate([
+                (None, None),  # first column is blank
+                ('person', 'Holddata'),
+                ('relation', 'Holddata'),
+                ('date', 'date'),
+                ('form', 'Holddata'),
+                ('transaction', 'Holddata'),
+                ('ownership', 'Holddata'),
+                ('shares traded', 'Holdnum'),
+                ('last price', 'Holdnum'),
+                ('shares held', 'Holdnum'),
+                ('', None)
+                ]):
+            if i >= len(tds):
+                break # not enough columns in this row
+            if field == None:
+                continue  # nothing interesting in this field
+            elif field == '':
+                line_match = True
+                break  # we made it to the end of the list
+            if dict(tds[i].attrs).get('class', None) != class_:
+                break  # wrong class
+            value = tds[i].text
+            if class_ == 'Holdnum':  # decode numerical values
+                if value == '-':
+                    value = None
+                else:
+                    value = float(value.strip('$()').replace(',', ''))
+            elif field in CODE:  # decode abbreviated values
+                code = CODE[field]
+                try:
+                    value = code[value]
+                except KeyError:
+                    if field == 'person':  # fall back to default name
+                        last,first_plus = value.title().split(' ', 1)
+                        value = ' '.join((first_plus, last))
+                    else:
+                        print('unknown code {} for {} field\n{}'.format(
+                                value, field, url))
+                        raise
+            elif field == 'date':
+                value = datetime.datetime.strptime(value, '%m/%d/%Y')
+            data[field] = value
+        if not line_match:
+            continue
+        if data['transaction'] and 'Non Open Market' in data['transaction']:
+            data['exchange'] = 'non-open market'
+            data['exchange_symbol'] = '-'
+        if data['shares traded'] and data['transaction']:
+            for key in ['sell', 'option', 'disposition']:
+                if key in data['transaction'].lower():
+                    data['shares traded'] *= -1  # - for selling
+        # estimate total price
+        price = data['last price'] or 0
+        shares = data['shares traded'] or 0
+        value = decimal.Decimal(price * shares)
+        data['value'] = value.quantize(decimal.Decimal('.01'))
+        yield data
+
+
+if __name__ == '__main__':
+    import sys
+
+    verbose = False
+
+    for ticker in sys.argv[1:]:
+        for trade in get_trades(ticker=ticker):
+            if verbose:
+                # display the trade we're looking at
+                for field,value in sorted(trade.items()):
+                    print('{}\t{}'.format(field, value))
+
+            add_transaction(
+                person=trade['person'],
+                date=trade['date'],
+                exchange=trade['exchange'],
+                exchange_symbol=trade['exchange_symbol'],
+                company=trade['company'],
+                company_symbol=trade['company_symbol'],
+                shares=trade['shares traded'],
+                value=trade['value'],
+                source=trade['source'],
+                )
+
+            if verbose:
+                print('')