--- /dev/null
+#!/usr/bin/env python
+
+"""Convert a tab-delimited text database into a Django fixture file.
+
+This allows you to upgrade a database from ChemDB v0.4 and earlier so
+you can use it with ChemDB v0.5.
+"""
+
+import re as _re
+import sys as _sys
+import time as _time
+import urllib2 as _urllib2
+
+
+# Captures the text between <title> and </title> in a fetched HTML page.
+TITLE_REGEXP = _re.compile(r'<title>([^<]*)</title>')
+
+# Lower-case three-letter month abbreviations; list index + 1 is the
+# month number.
+MONTHS = [
+    'jan', 'feb', 'mar', 'apr',
+    'may', 'jun', 'jul', 'aug',
+    'sep', 'oct', 'nov', 'dec',
+    ]
+
+# Codes found in the "O" column, mapped to the ids written into a
+# chemical's "special" list.
+SPECIALS = dict([  # match with initial_data.yaml fixture
+    ('OX', 1),
+    ('W', 2),
+    ('Simple asphyxiant', 3),
+    ])
+
+
+def get_cas_number_name(cas_number):
+    """Return the page title webbook.nist.gov serves for a CAS number.
+
+    The page for `cas_number` is downloaded and the text of its first
+    ``<title>`` element is returned.  The function sleeps for one
+    second after every download so that repeated calls do not flood
+    the server.
+
+    Raises `ValueError`, carrying the downloaded page, if the page has
+    no ``<title>`` element.
+    """
+    url = 'http://webbook.nist.gov/cgi/cbook.cgi?ID={}&Units=SI'.format(
+        cas_number)
+    f = _urllib2.urlopen(url)
+    try:
+        html = f.read()
+    finally:
+        f.close()  # release the connection even if the read fails
+    _time.sleep(1)  # don't flood the server
+    match = TITLE_REGEXP.search(html)
+    if match:
+        return match.group(1)
+    raise ValueError(html)
+
+def date(string):
+    """Convert a legacy date string to ``YYYY-MM-DD``.
+
+    Accepted forms are ``month/day/year`` and ``month-day-year``.  The
+    month may be a number or a three-letter abbreviation from `MONTHS`
+    (any case).  Years of 10 or less are taken to mean 2000-2010.
+
+    Returns None for the placeholders ``''`` and ``'?'``.
+
+    Raises `ValueError` if the string does not have three parts, a part
+    is not a number (or month abbreviation), or the month or day is out
+    of range.  Nothing is printed on failure: stdout carries the
+    fixture being generated, so the offending string travels in the
+    exception instead.
+    """
+    if string in ['', '?']:
+        return None
+    for separator in ('/', '-'):
+        parts = string.split(separator)
+        if len(parts) == 3:
+            break
+    else:
+        raise ValueError(string)
+    month, day, year = parts
+    try:
+        month = int(month)
+    except ValueError:
+        # not a number, so it has to be a month abbreviation
+        month = MONTHS.index(month.lower()) + 1
+    day = int(day)
+    year = int(year)
+    if year <= 10:
+        year += 2000
+    if not 1 <= month <= 12 or not 1 <= day <= 31:
+        raise ValueError(string)
+    return '{:04d}-{:02d}-{:02d}'.format(year, month, day)
+
+def print_cas_numbers(entries):
+    """Print a ``chemdb.CASNumber`` fixture record for each CAS number.
+
+    Every entry's ``CAS#`` field is split on commas and anything from
+    the first colon of an item onwards is dropped.  Placeholder values
+    are ignored.  The name of each remaining number is looked up with
+    `get_cas_number_name`, and the records are printed to stdout in
+    sorted order with primary keys counting from 1, followed by one
+    blank line.
+
+    Returns a dict mapping each CAS number string to its primary key.
+    """
+    cas_numbers = set()
+    for entry in entries:
+        cas_numbers.update(
+            cas.split(':', 1)[0] for cas in entry['CAS#'].split(','))
+    # discard(), not remove(): a database need not contain every
+    # placeholder, and a missing one must not abort the conversion.
+    for placeholder in ['', '?', '+secret-non-hazardous solids', 'na']:
+        cas_numbers.discard(placeholder)
+    cas_dict = {}
+    for i, cas_number in enumerate(sorted(cas_numbers)):
+        pk = i + 1
+        name = get_cas_number_name(cas_number)
+        # Keys are nested under the record and under "fields:" so the
+        # output parses as a YAML fixture.
+        print('- model: chemdb.CASNumber')
+        print('  pk: {}'.format(pk))
+        print('  fields:')
+        print('    name: {}'.format(name))
+        print('    abbrev: {}'.format(name))
+        print('    cas: {}'.format(cas_number))
+        cas_dict[cas_number] = pk
+    print('')
+    return cas_dict
+
+def print_chemicals(entries, cas_numbers):
+    """Print a ``chemdb.Chemical`` fixture record for each chemical name.
+
+    One record is printed per distinct ``Name``, in sorted order with
+    primary keys counting from 1, followed by one blank line.  The
+    record's fields come from the first entry carrying that name:
+
+    * ``cas``: primary keys (from `cas_numbers`) of the entry's CAS
+      numbers; numbers missing from `cas_numbers` are skipped.
+    * ``health``, ``fire``, ``reactivity``: columns ``H``, ``F``, ``R``,
+      unless empty or ``'?'``.
+    * ``special``: the `SPECIALS` id for column ``O``, if set.
+    * ``mutagen``, ``carcinogen``, ``teratogen``: True when column
+      ``M``, ``C`` or ``T`` is neither empty nor ``'?'``.
+    * ``note``: column ``Note``, if set.
+
+    The ``msds`` field is not written (the export was already disabled).
+
+    If a field of an entry cannot be converted, the entry is written to
+    stderr and the exception is re-raised.
+
+    Returns a dict mapping each chemical name to its primary key.
+    """
+    # First entry seen for each name, collected in a single pass.
+    first_entries = {}
+    for entry in entries:
+        first_entries.setdefault(entry['Name'], entry)
+    chemical_dict = {}
+    for i, chemical in enumerate(sorted(first_entries)):
+        entry = first_entries[chemical]
+        pk = i + 1
+        # Keys are nested under the record and under "fields:" so the
+        # output parses as a YAML fixture.
+        print('- model: chemdb.Chemical')
+        print('  pk: {}'.format(pk))
+        print('  fields:')
+        print('    name: {}'.format(chemical))
+        print('    abbrev: {}'.format(chemical))
+        cas_names = [cas.split(':', 1)[0] for cas in entry['CAS#'].split(',')]
+        cas_indexes = [cas_numbers[cas_name] for cas_name in cas_names
+                       if cas_name in cas_numbers]
+        if cas_indexes:
+            print('    cas: {}'.format(str(cas_indexes)))
+        try:
+            for name, key in [('health', 'H'),
+                              ('fire', 'F'),
+                              ('reactivity', 'R'),
+                              ]:
+                if entry[key] not in ['', '?']:
+                    print('    {}: {}'.format(name, entry[key]))
+            if entry['O']:
+                special = [SPECIALS[entry['O']]]
+                print('    special: {}'.format(special))
+            for name, key in [('mutagen', 'M'),
+                              ('carcinogen', 'C'),
+                              ('teratogen', 'T'),
+                              ]:
+                if entry[key] not in ['', '?']:
+                    print('    {}: {}'.format(name, True))
+            if entry['Note']:
+                note = entry['Note']
+                if ':' in note:
+                    # A colon would start a nested mapping, so quote the
+                    # note; inside single quotes YAML spells ' as ''.
+                    note = "'{}'".format(note.replace("'", "''"))
+                print('    note: {}'.format(note))
+        except Exception:
+            # show which entry could not be converted before giving up
+            _sys.stderr.write('{}\n'.format(entry))
+            raise
+        chemical_dict[chemical] = pk
+    print('')
+    return chemical_dict
+
+def print_locations(entries):
+    """Print a ``chemdb.Location`` fixture record for each location.
+
+    Entries with an empty ``Location`` are changed in place to
+    ``'unknown'`` first.  One record is then printed to stdout per
+    distinct location, in sorted order with primary keys counting from
+    1, followed by one blank line.
+
+    Returns a dict mapping each location name to its primary key.
+    """
+    for entry in entries:
+        if not entry['Location']:
+            entry['Location'] = 'unknown'
+    location_dict = {}
+    distinct = sorted(set(entry['Location'] for entry in entries))
+    for i, location in enumerate(distinct):
+        pk = i + 1
+        # Keys are nested under the record and under "fields:" so the
+        # output parses as a YAML fixture.
+        print('- model: chemdb.Location')
+        print('  pk: {}'.format(pk))
+        print('  fields:')
+        print('    name: {}'.format(location))
+        print('    abbrev: {}'.format(location))
+        location_dict[location] = pk
+    print('')
+    return location_dict
+
+def print_vendors(entries):
+    """Print a ``chemdb.Vendor`` fixture record for each vendor.
+
+    Entries whose ``Vendor`` is empty or ``'?'`` are changed in place
+    to ``'unknown'`` first.  One record is then printed to stdout per
+    distinct vendor, in sorted order with primary keys counting from 1,
+    followed by one blank line.
+
+    Returns a dict mapping each vendor name to its primary key.
+    """
+    for entry in entries:
+        if entry['Vendor'] in ['', '?']:
+            entry['Vendor'] = 'unknown'
+    vendor_dict = {}
+    distinct = sorted(set(entry['Vendor'] for entry in entries))
+    for i, vendor in enumerate(distinct):
+        pk = i + 1
+        # Keys are nested under the record and under "fields:" so the
+        # output parses as a YAML fixture.
+        print('- model: chemdb.Vendor')
+        print('  pk: {}'.format(pk))
+        print('  fields:')
+        print('    name: {}'.format(vendor))
+        print('    abbrev: {}'.format(vendor))
+        vendor_dict[vendor] = pk
+    print('')
+    return vendor_dict
+
+def print_chemical_instances(entries, chemicals, locations, vendors):
+    """Print a ``chemdb.ChemicalInstance`` fixture record for each entry.
+
+    ``Cat#`` and ``Amount`` values of ``''``, ``'?'`` or ``'-'`` are
+    changed in place to ``'unknown'`` first.  Each record refers to its
+    chemical, location and vendor by primary key, looked up in the
+    `chemicals`, `locations` and `vendors` dicts under the entry's
+    ``Name``, ``Location`` and ``Vendor`` values.  The ``received`` and
+    ``disposed`` fields are written only when `date` returns a value
+    for the corresponding column.
+
+    Records are printed to stdout with primary keys counting from 1.
+    """
+    for entry in entries:
+        for field in ['Cat#', 'Amount']:
+            if entry[field] in ['', '?', '-']:
+                entry[field] = 'unknown'
+    # Order by the entries' sorted items rather than comparing the
+    # dicts themselves: dict comparison is arbitrary on Python 2 and a
+    # TypeError on Python 3.
+    ordered = sorted(entries, key=lambda entry: sorted(entry.items()))
+    for i, entry in enumerate(ordered):
+        # Keys are nested under the record and under "fields:" so the
+        # output parses as a YAML fixture.
+        print('- model: chemdb.ChemicalInstance')
+        print('  pk: {}'.format(i + 1))
+        print('  fields:')
+        print('    chemical: {}'.format(chemicals[entry['Name']]))
+        print('    location: {}'.format(locations[entry['Location']]))
+        print('    amount: {}'.format(entry['Amount']))
+        print('    vendor: {}'.format(vendors[entry['Vendor']]))
+        print('    catalog: {}'.format(entry['Cat#']))
+        # Some databases spell the column "Recieved"; accept either
+        # spelling and treat a missing column as blank.
+        received = entry.get('Received', entry.get('Recieved', ''))
+        disposed = entry.get('Disposed', '')
+        for name, value in [('received', received), ('disposed', disposed)]:
+            d = date(value)
+            if d:
+                print('    {}: {}'.format(name, d))
+def _read_entries(filename):
+    """Parse the tab-delimited database into a list of dicts, one per row."""
+    with open(filename, 'r') as f:
+        header = f.readline()
+        if not header.startswith('#'):
+            raise ValueError(
+                'expected a "#" header line, got {!r}'.format(header))
+        fields = [x.strip() for x in header[1:].split('\t')]
+        entries = []
+        for line in f:
+            stripped = line.strip()
+            if not stripped or stripped.startswith('#'):
+                continue
+            # Split the unstripped line: stripping first would eat
+            # leading tabs and shift every value of a row whose first
+            # column is empty.
+            values = [x.strip() for x in line.split('\t')]
+            if len(values) < len(fields):
+                values.extend([''] * (len(fields) - len(values)))
+            entries.append(dict(zip(fields, values)))
+    return entries
+
+
+def upgrade(filename):
+    """Convert the database in `filename` to a fixture on stdout.
+
+    The first line of the file must start with ``#`` and name the
+    tab-separated columns.  Blank lines and lines whose first
+    non-blank character is ``#`` are skipped; every other line is one
+    entry, padded with empty values if it has fewer columns than the
+    header.
+
+    The CAS number, chemical, location, vendor and chemical instance
+    records are printed in that order.
+
+    Raises `ValueError` if the header line is missing.
+    """
+    entries = _read_entries(filename)
+    cas_numbers = print_cas_numbers(entries)
+    chemicals = print_chemicals(entries, cas_numbers=cas_numbers)
+    locations = print_locations(entries)
+    vendors = print_vendors(entries)
+    print_chemical_instances(entries, chemicals, locations, vendors)
+
+
+if __name__ == '__main__':
+    # Usage: <script> DATABASE_FILE, with the fixture written to stdout.
+    if len(_sys.argv) < 2:
+        # exit with a usage message on stderr instead of an IndexError
+        _sys.exit('usage: {} DATABASE_FILE'.format(_sys.argv[0]))
+    upgrade(_sys.argv[1])