From: W. Trevor King Date: Thu, 21 Jul 2011 19:36:12 +0000 (-0400) Subject: Add nasdaq.py scraper and update README. X-Git-Url: http://git.tremily.us/?a=commitdiff_plain;h=90bf87cb78eebeb076edcaf1783c8edef9876199;p=insider.git Add nasdaq.py scraper and update README. --- diff --git a/README b/README index 2e4ccbd..8063787 100644 --- a/README +++ b/README @@ -44,6 +44,21 @@ look like $ PYTHONPATH=".:$PYTHONPATH" python example/manage.py runserver +Scraping +======== + +Entering transaction data by hand can be tedious and error prone. To +automate the task, you should write scrapers to look up and enter +transaction data automatically. To get you started, I've written +`insider/scrape/nasdaq.py`, which scrapes `NASDAQ's interface`__ to +`EDGAR`_\'s data. Use the scraper with something like:: + + $ export PYTHONPATH='.' + $ export DJANGO_SETTINGS_MODULE='example.settings' + $ python insider/scrape/nasdaq.py NYSE:RHT NASDAQ:GOOG + +__ NASDAQ_ + Hacking ======= @@ -58,4 +73,6 @@ That's a good place to start if you're new to Django. .. _dt2-docs: http://django-tables2.readthedocs.org/en/latest/ .. _BeautifulSoup: http://www.crummy.com/software/BeautifulSoup/ .. _Django documentation: https://docs.djangoproject.com/ +.. _NASDAQ: http://www.nasdaq.com/reference/ownership.stm +.. _EDGAR: http://www.edgar-online.com/ .. 
# Patch context preserved from the commit diff this module was extracted from
# (tail of the README hunk and the new-file headers):
#   _Django tutorial: https://docs.djangoproject.com/en/1.3/intro/tutorial01/
#   diff --git a/insider/scrape/__init__.py b/insider/scrape/__init__.py
#   new file mode 100644
#   index 0000000..e69de29
#   diff --git a/insider/scrape/nasdaq.py b/insider/scrape/nasdaq.py
#   new file mode 100644
#   index 0000000..5a1e616
#   --- /dev/null
#   +++ b/insider/scrape/nasdaq.py
#   @@ -0,0 +1,136 @@
"""Scrape insider trade information from `holdings.nasdaq.com`
"""

import contextlib
import datetime
import decimal
import urllib2

from BeautifulSoup import BeautifulSoup

from insider.models import add_transaction


CODE = {
    'person': {
        # Convert NASDAQ's people (generally LAST FIRST[ MIDDLE]) to
        # Person.name.  This is for hardcoded overrides when the
        # default algorithm fails.
        },
    'relation': {'OFF': 'officer', 'DIR': 'Director'},
    'transaction': {
        '': None,
        'AS': 'Automatic Sell',
        'AB': 'Automatic Buy',
        'JS': 'Disposition (Non Open Market)',
        'JB': 'Acquisition (Non Open Market)',
        'OE': 'Option Exercise',
        'S': 'Sell',
        'B': 'Buy',
        },
    'ownership': {
        'D': 'direct',
        'IN': 'indirect',
        },
    }

# Expected layout of a trade row, as (data key, required <td> class).
# A key of None marks a column to skip; the trailing '' entry is a
# sentinel: reaching it means every preceding column matched.
_COLUMNS = [
    (None, None),  # first column is blank
    ('person', 'Holddata'),
    ('relation', 'Holddata'),
    ('date', 'date'),
    ('form', 'Holddata'),
    ('transaction', 'Holddata'),
    ('ownership', 'Holddata'),
    ('shares traded', 'Holdnum'),
    ('last price', 'Holdnum'),
    ('shares held', 'Holdnum'),
    ('', None),
    ]

# Lower-case substrings of a decoded transaction name for which the
# traded share count is recorded as negative.
_NEGATIVE_KEYWORDS = ('sell', 'option', 'disposition')


def _default_person_name(raw):
    """Reorder a ``LAST FIRST[ MIDDLE]`` name to ``First[ Middle] Last``.

    A single-word name has nothing to reorder and is returned
    title-cased as is (a bare two-way unpack would raise ValueError).
    """
    last, _, first_plus = raw.title().partition(' ')
    if not first_plus:
        return last
    return ' '.join((first_plus, last))


def _decode_cell(field, class_, text, url):
    """Convert the text of one table cell into its Python value.

    * ``Holdnum`` cells become floats (``None`` for a ``-`` placeholder).
    * Fields with an entry in `CODE` are expanded from their
      abbreviation; unknown people fall back to
      `_default_person_name`, any other unknown code is reported and
      the `KeyError` is re-raised.
    * ``date`` cells are parsed as ``%m/%d/%Y``.
    * Anything else is returned unchanged.
    """
    if class_ == 'Holdnum':  # decode numerical values
        if text == '-':
            return None
        return float(text.strip('$()').replace(',', ''))
    if field in CODE:  # decode abbreviated values
        try:
            return CODE[field][text]
        except KeyError:
            if field == 'person':  # fall back to default name
                return _default_person_name(text)
            print('unknown code {} for {} field\n{}'.format(
                    text, field, url))
            raise
    if field == 'date':
        return datetime.datetime.strptime(text, '%m/%d/%Y')
    return text


def _parse_row(tds, defaults, url):
    """Decode one table row, or return None if it is not a trade row.

    A row qualifies only if it has a cell for every entry of
    `_COLUMNS` and each inspected cell carries the expected class.
    """
    data = dict(defaults)
    for i, (field, class_) in enumerate(_COLUMNS):
        if i >= len(tds):
            return None  # not enough columns in this row
        if field is None:
            continue  # nothing interesting in this field
        if field == '':
            return data  # we made it to the end of the list
        if dict(tds[i].attrs).get('class', None) != class_:
            return None  # wrong class
        data[field] = _decode_cell(field, class_, tds[i].text, url)
    return None


def get_trades(ticker):
    """Yield one dict per insider trade listed for `ticker`.

    `ticker` is ``EXCHANGE:COMPANY`` (e.g. ``NASDAQ:GOOG``); the
    company symbol is looked up on `holdings.nasdaq.com`.

    Each yielded dict has the keys ``exchange``, ``exchange_symbol``,
    ``company``, ``company_symbol``, ``source`` (the scraped URL),
    ``person``, ``relation``, ``date``, ``form``, ``transaction``,
    ``ownership``, ``shares traded``, ``last price``, ``shares held``
    and ``value`` (``last price * shares traded`` as a `Decimal`
    rounded to cents).  Numeric cells are floats, or ``None`` when the
    page shows ``-``.

    Raises `ValueError` if `ticker` does not contain exactly one
    ``:``, `IndexError` if the page lacks the second ``holdings``
    table, and `KeyError` for an abbreviation missing from `CODE`.
    """
    exchange_symbol, company_symbol = ticker.split(':')
    url = 'http://holdings.nasdaq.com/asp/Form4.asp?selected={}'.format(
        company_symbol)
    # Close the HTTP response deterministically instead of leaking it.
    with contextlib.closing(urllib2.urlopen(url)) as response:
        html = response.read()
    soup = BeautifulSoup(html)
    table = soup('table', {'class': 'holdings', 'width': '100%'})[1]
    defaults = {
        'exchange': 'UNKNOWN', 'exchange_symbol': exchange_symbol,
        'company': 'UNKNOWN', 'company_symbol': company_symbol,
        'source': url}
    for row in table('tr'):
        data = _parse_row(row('td'), defaults, url)
        if data is None:
            continue
        transaction = data['transaction']
        if transaction and 'Non Open Market' in transaction:
            data['exchange'] = 'non-open market'
            data['exchange_symbol'] = '-'
        if data['shares traded'] and transaction:
            lowered = transaction.lower()
            # Flip the sign at most once, however many keywords match.
            if any(key in lowered for key in _NEGATIVE_KEYWORDS):
                data['shares traded'] *= -1  # - for selling
        # estimate total price
        price = data['last price'] or 0
        shares = data['shares traded'] or 0
        # Multiply in Decimal (via repr) so binary-float error in the
        # product cannot leak into the cent rounding.
        value = decimal.Decimal(repr(price)) * decimal.Decimal(repr(shares))
        data['value'] = value.quantize(decimal.Decimal('.01'))
        yield data


if __name__ == '__main__':
    import sys

    # Flip to True to dump every scraped trade before it is stored.
    verbose = False

    # Each command-line argument is an EXCHANGE:COMPANY ticker.
    for ticker in sys.argv[1:]:
        for trade in get_trades(ticker=ticker):
            if verbose:
                # display the trade we're looking at
                for field, value in sorted(trade.items()):
                    print('{}\t{}'.format(field, value))

            add_transaction(
                person=trade['person'],
                date=trade['date'],
                exchange=trade['exchange'],
                exchange_symbol=trade['exchange_symbol'],
                company=trade['company'],
                company_symbol=trade['company_symbol'],
                shares=trade['shares traded'],
                value=trade['value'],
                source=trade['source'],
                )

            if verbose:
                print('')