#!/usr/bin/env python
#
# Recovered from chemdb commit b54b1733cba30b38554f15657197b9b2e986c73c
# (W. Trevor King, 2 Apr 2012): "Add script to convert v0.4 databases
# into fixtures for Django import."
# Path in that commit: contrib/chemdb-text-to-fixture.py

"""Convert a tab-delimited text database into a Django fixture file.

This allows you to upgrade a database from ChemDB v0.4 and earlier so
you can use it with ChemDB v0.5.

The fixture is written to stdout as YAML; diagnostics go to stderr.
"""

import contextlib as _contextlib
import re as _re
import sys as _sys
import time as _time

try:  # Python 2
    import urllib2 as _urllib2
except ImportError:  # Python 3
    import urllib.request as _urllib2


# Captures the text of the page's <title> element.
TITLE_REGEXP = _re.compile(r'<title>([^<]*)</title>')
MONTHS = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
          'nov', 'dec']
SPECIALS = {  # match with initial_data.yaml fixture
    'OX': 1,
    'W': 2,
    'Simple asphyxiant': 3,
    }

# 'CAS#' values that do not name a real CAS number and must not be looked up.
_CAS_PLACEHOLDERS = frozenset(['', '?', '+secret-non-hazardous solids', 'na'])
_UNKNOWN = ('', '?')


def _cas_ids(entry):
    """Return the CAS identifiers listed in an entry's 'CAS#' column.

    The column holds comma-separated items; everything from the first
    ':' of an item onwards is dropped, as is surrounding whitespace.
    """
    return [item.split(':', 1)[0].strip()
            for item in entry['CAS#'].split(',')]


def _print_header(model, pk):
    """Print the opening lines of one fixture record."""
    # Django's YAML fixture layout: 'pk' and 'fields' are keys of the
    # '- model:' list item, so they are indented to line up with 'model'.
    print('- model: {}'.format(model))
    print('  pk: {}'.format(pk))
    print('  fields:')


def _print_field(name, value):
    """Print one 'name: value' line nested under 'fields:'."""
    print('    {}: {}'.format(name, value))


def _print_named_records(model, names):
    """Print one name/abbrev record per distinct name.

    Returns a dict mapping each name to the primary key it was given
    (1-based, in sorted-name order).
    """
    pks = {}
    for pk, name in enumerate(sorted(set(names)), 1):
        _print_header(model, pk)
        _print_field('name', name)
        _print_field('abbrev', name)
        pks[name] = pk
    print('')
    return pks


def get_cas_number_name(cas_number):
    """Look up the name for a CAS number in the NIST WebBook.

    Fetches the WebBook page for `cas_number` and returns the contents
    of its <title> element.  Sleeps one second after every request.

    Raises ValueError (carrying the page text) if no title is found.
    """
    url = 'http://webbook.nist.gov/cgi/cbook.cgi?ID={}&Units=SI'.format(
        cas_number)
    # closing() releases the connection even if read() fails.
    with _contextlib.closing(_urllib2.urlopen(url)) as f:
        html = f.read()
    _time.sleep(1)  # don't flood the server
    if not isinstance(html, str):  # Python 3 hands back bytes
        html = html.decode('utf-8', 'replace')
    match = TITLE_REGEXP.search(html)
    if match:
        return match.group(1)
    raise ValueError(html)


def date(string):
    """Convert a database date string to 'YYYY-MM-DD'.

    Accepts 'M/D/Y' with numeric parts, or 'M-D-Y' where the month may
    also be one of the three-letter abbreviations in MONTHS (any case).
    Returns None for '' and '?'.

    Raises ValueError if the string cannot be parsed or the day is
    greater than 31.
    """
    if string in _UNKNOWN:
        return None
    try:
        month, day, year = [int(x) for x in string.split('/')]
    except ValueError:
        try:
            month, day, year = string.split('-')
        except ValueError:
            # Report the offending value on stderr so the fixture on
            # stdout is not polluted.
            _sys.stderr.write('{}\n'.format(string))
            raise
        try:
            month = MONTHS.index(month.lower()) + 1
        except ValueError:
            month = int(month)
        day = int(day)
        year = int(year)
    # NOTE(review): only years 0-10 are shifted into the 2000s; any
    # other two-digit year is emitted unchanged -- confirm intended.
    if year <= 10:
        year += 2000
    if day > 31:
        raise ValueError(string)
    return '{:04d}-{:02d}-{:02d}'.format(year, month, day)


def print_cas_numbers(entries):
    """Print a chemdb.CASNumber record for every distinct CAS number.

    Each number's name is fetched with get_cas_number_name(), so this
    makes one (rate-limited) network request per CAS number.

    Returns a dict mapping CAS number to its primary key.
    """
    cas_numbers = set()
    for entry in entries:
        cas_numbers.update(_cas_ids(entry))
    # Set difference tolerates placeholders that never occur in the
    # data (set.remove() would raise KeyError for those).
    cas_numbers -= _CAS_PLACEHOLDERS
    cas_dict = {}
    for pk, cas_number in enumerate(sorted(cas_numbers), 1):
        name = get_cas_number_name(cas_number)
        _print_header('chemdb.CASNumber', pk)
        _print_field('name', name)
        _print_field('abbrev', name)
        _print_field('cas', cas_number)
        cas_dict[cas_number] = pk
    print('')
    return cas_dict


def print_chemicals(entries, cas_numbers):
    """Print a chemdb.Chemical record for every distinct 'Name'.

    The record's properties come from the first entry carrying that
    name.  `cas_numbers` is the dict returned by print_cas_numbers();
    CAS identifiers missing from it are silently left out.

    Returns a dict mapping chemical name to its primary key.  If an
    entry cannot be converted, it is written to stderr and the
    exception is re-raised.
    """
    first_entry = {}
    for entry in entries:
        first_entry.setdefault(entry['Name'], entry)
    chemical_dict = {}
    for pk, chemical in enumerate(sorted(first_entry), 1):
        entry = first_entry[chemical]
        _print_header('chemdb.Chemical', pk)
        _print_field('name', chemical)
        _print_field('abbrev', chemical)
        cas_indexes = [cas_numbers[cas] for cas in _cas_ids(entry)
                       if cas in cas_numbers]
        if cas_indexes:
            _print_field('cas', cas_indexes)
        # No 'msds' field is emitted: the original left that export
        # permanently disabled.
        try:
            for name, key in [('health', 'H'),
                              ('fire', 'F'),
                              ('reactivity', 'R'),
                              ]:
                if entry[key] not in _UNKNOWN:
                    _print_field(name, entry[key])
            if entry['O'] not in _UNKNOWN:
                # An unrecognised code raises KeyError on purpose.
                _print_field('special', [SPECIALS[entry['O']]])
            for name, key in [('mutagen', 'M'),
                              ('carcinogen', 'C'),
                              ('teratogen', 'T'),
                              ]:
                if entry[key] not in _UNKNOWN:
                    _print_field(name, True)
            note = entry['Note']
            if note:
                if ':' in note:
                    # Quote so the colon is not read as a YAML mapping;
                    # embedded single quotes are escaped by doubling.
                    note = "'{}'".format(note.replace("'", "''"))
                _print_field('note', note)
        except Exception:
            _sys.stderr.write('{}\n'.format(entry))
            raise
        chemical_dict[chemical] = pk
    print('')
    return chemical_dict


def print_locations(entries):
    """Print a chemdb.Location record for every distinct 'Location'.

    Empty locations are replaced by 'unknown' *in the entries
    themselves*, so later lookups by entry['Location'] succeed.

    Returns a dict mapping location name to its primary key.
    """
    for entry in entries:
        if not entry['Location']:
            entry['Location'] = 'unknown'
    return _print_named_records(
        'chemdb.Location', (entry['Location'] for entry in entries))


def print_vendors(entries):
    """Print a chemdb.Vendor record for every distinct 'Vendor'.

    Vendors given as '' or '?' are replaced by 'unknown' *in the
    entries themselves*, so later lookups by entry['Vendor'] succeed.

    Returns a dict mapping vendor name to its primary key.
    """
    for entry in entries:
        if entry['Vendor'] in _UNKNOWN:
            entry['Vendor'] = 'unknown'
    return _print_named_records(
        'chemdb.Vendor', (entry['Vendor'] for entry in entries))


def print_chemical_instances(entries, chemicals, locations, vendors):
    """Print a chemdb.ChemicalInstance record for every entry.

    `chemicals`, `locations` and `vendors` are the name -> primary key
    dicts returned by the corresponding print_* functions.  'Cat#' and
    'Amount' values of '', '?' or '-' are replaced by 'unknown' in the
    entries themselves.
    """
    for entry in entries:
        for key in ('Cat#', 'Amount'):
            if entry[key] in ('', '?', '-'):
                entry[key] = 'unknown'
    # Dicts are not orderable on Python 3, so order by their sorted
    # (column, value) pairs to keep primary keys deterministic.
    ordered = sorted(entries, key=lambda entry: sorted(entry.items()))
    for pk, entry in enumerate(ordered, 1):
        _print_header('chemdb.ChemicalInstance', pk)
        _print_field('chemical', chemicals[entry['Name']])
        _print_field('location', locations[entry['Location']])
        _print_field('amount', entry['Amount'])
        _print_field('vendor', vendors[entry['Vendor']])
        _print_field('catalog', entry['Cat#'])
        if 'Received' in entry:
            received = entry['Received']
        else:
            # Fall back to the misspelled column name.
            received = entry.get('Recieved', '')
        for name, raw in [('received', received),
                          ('disposed', entry.get('Disposed', ''))]:
            d = date(raw)
            if d:
                _print_field(name, d)


def upgrade(filename):
    """Read the text database `filename` and print it as a fixture.

    The first line must start with '#' and hold the tab-separated
    column names.  Blank lines and further '#' lines are skipped; rows
    with fewer values than columns are padded with empty strings.

    Raises ValueError if the header line is missing.
    """
    with open(filename, 'r') as f:
        header = f.readline()
        if not header.startswith('#'):
            raise ValueError(
                "expected a header line starting with '#', got {!r}".format(
                    header))
        fields = [x.strip() for x in header[1:].split('\t')]
        entries = []
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            values = [x.strip() for x in line.split('\t')]
            # A non-positive repeat count yields [], so full rows are
            # left untouched.
            values.extend([''] * (len(fields) - len(values)))
            entries.append(dict(zip(fields, values)))
    cas_numbers = print_cas_numbers(entries)
    chemicals = print_chemicals(entries, cas_numbers=cas_numbers)
    locations = print_locations(entries)
    vendors = print_vendors(entries)
    print_chemical_instances(entries, chemicals, locations, vendors)


if __name__ == '__main__':
    if len(_sys.argv) != 2:
        _sys.exit('usage: {} DATABASE.txt'.format(_sys.argv[0]))
    upgrade(_sys.argv[1])