Add script to convert v0.4 databases into fixtures for Django import.

author W. Trevor King <wking@drexel.edu>

Mon, 2 Apr 2012 00:56:45 +0000 (20:56 -0400)

committer W. Trevor King <wking@drexel.edu>

Mon, 2 Apr 2012 01:01:39 +0000 (21:01 -0400)
author W. Trevor King <wking@drexel.edu>
Mon, 2 Apr 2012 00:56:45 +0000 (20:56 -0400)
committer W. Trevor King <wking@drexel.edu>
Mon, 2 Apr 2012 01:01:39 +0000 (21:01 -0400)
diff --git a/contrib/chemdb-text-to-fixture.py b/contrib/chemdb-text-to-fixture.py

new file mode 100755 (executable)

index 0000000..39d1b67
--- /dev/null
+++ b/contrib/chemdb-text-to-fixture.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python
+
+"""Convert a tab-delimited text database into a Django fixture file.
+
+This allows you to upgrade a database from ChemDB v0.4 and earlier so
+you can use it with ChemDB v0.5.
+"""
+
+import re as _re
+import sys as _sys
+import time as _time
+import urllib2 as _urllib2
+
+
+# Captures the text between <title> and </title>; get_cas_number_name()
+# applies it to the HTML it fetches to extract a compound's name.
+TITLE_REGEXP = _re.compile('<title>([^<]*)</title>')
+# Lower-case three-letter month abbreviations in calendar order; date()
+# converts a month name to its number via list position plus one.
+MONTHS = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct',
+          'nov', 'dec']
+# Maps the text of an entry's 'O' column to the primary key that
+# print_chemicals() writes into the 'special' field.
+SPECIALS = {  # match with initial_data.yaml fixture
+    'OX': 1,
+    'W': 2,
+    'Simple asphyxiant': 3,
+    }
+
+
+def get_cas_number_name(cas_number):
+    """Look up the name for `cas_number` on webbook.nist.gov.
+
+    Fetches the WebBook page for the given CAS number and returns the
+    text of its <title> element.  Sleeps for one second after each
+    successful fetch so repeated calls do not flood the server.
+
+    Raises ValueError (carrying the fetched HTML) if the page contains
+    no <title> element.
+    """
+    url = 'http://webbook.nist.gov/cgi/cbook.cgi?ID={}&Units=SI'.format(
+        cas_number)
+    f = _urllib2.urlopen(url)
+    try:
+        html = f.read()
+    finally:
+        # Release the connection even if the read fails part-way.
+        f.close()
+    _time.sleep(1)  # don't flood the server
+    match = TITLE_REGEXP.search(html)
+    if match:
+        return match.group(1)
+    raise ValueError(html)
+
+def date(string):
+    """Convert a date from the text database into 'YYYY-MM-DD' form.
+
+    Accepts 'month/day/year' with numeric parts, or 'month-day-year'
+    where the month is either a number or a three-letter name listed in
+    MONTHS (case-insensitive).  A year of 10 or less is taken to mean
+    2000 + year.
+
+    Returns None for the placeholders '' and '?'.
+
+    Raises ValueError if the string cannot be parsed, or if the month is
+    outside 1-12 or the day is outside 1-31.
+    """
+    if string in ['', '?']:
+        return None
+    try:
+        month,day,year = [int(x) for x in string.split('/')]
+    except ValueError:
+        try:
+            month,day,year = string.split('-')
+        except ValueError:
+            # stdout carries the fixture itself, so report the offending
+            # value on stderr instead of mixing it into the output.
+            _sys.stderr.write('{}\n'.format(string))
+            raise
+        try:
+            month = MONTHS.index(month.lower()) + 1
+        except ValueError:
+            # Not a month name; fall back to a numeric month.
+            month = int(month)
+        day = int(day)
+        year = int(year)
+    if year <= 10:
+        year += 2000
+    # Reject values that cannot form a calendar date rather than emitting
+    # a string the fixture loader would have to choke on later.
+    if not 1 <= month <= 12 or not 1 <= day <= 31:
+        raise ValueError(string)
+    return '{:04d}-{:02d}-{:02d}'.format(year, month, day)
+
+def print_cas_numbers(entries):
+    """Print a chemdb.CASNumber fixture record for each distinct CAS number.
+
+    Each entry's 'CAS#' field is split on ',' and, for every item, only
+    the text before the first ':' is kept.  Placeholder values are
+    dropped, and the name of every remaining number is fetched with
+    get_cas_number_name().
+
+    Returns a dict mapping each CAS number to its primary key (1-based,
+    assigned in sorted order).
+    """
+    cas_numbers = set()
+    for entry in entries:
+        cas = [cas.split(':', 1)[0] for cas in entry['CAS#'].split(',')]
+        cas_numbers.update(cas)
+    # discard() rather than remove(): a database need not contain every
+    # placeholder, and remove() raises KeyError for a missing element.
+    for cas in ['', '?', '+secret-non-hazardous solids', 'na']:
+        cas_numbers.discard(cas)
+    cas_dict = {}
+    for i,cas_number in enumerate(sorted(cas_numbers)):
+        name = get_cas_number_name(cas_number)
+        print('- model: chemdb.CASNumber')
+        print('  pk: {}'.format(i+1))
+        print('  fields:')
+        print('    name: {}'.format(name))
+        print('    abbrev: {}'.format(name))
+        print('    cas: {}'.format(cas_number))
+        cas_dict[cas_number] = i+1
+    print('')
+    return cas_dict
+
+def print_chemicals(entries, cas_numbers):
+    """Print a chemdb.Chemical fixture record for each distinct chemical name.
+
+    Field values come from the first entry carrying each name:
+
+    * 'cas' lists the primary keys (from `cas_numbers`) of the entry's
+      CAS numbers that appear in that dict.
+    * 'health', 'fire' and 'reactivity' come from columns 'H', 'F' and
+      'R' when those are not '' or '?'.
+    * 'special' is looked up in SPECIALS from a non-empty 'O' column.
+    * 'mutagen', 'carcinogen' and 'teratogen' are True when columns
+      'M', 'C' and 'T' are not '' or '?'.
+    * 'note' is single-quoted when it contains ':'.
+
+    If a field cannot be produced, the entry is written to stderr and
+    the exception is re-raised.
+
+    Returns a dict mapping each chemical name to its primary key (1-based,
+    assigned in sorted order).
+    """
+    # Index the first entry for each name in one pass instead of
+    # rescanning `entries` for every chemical.
+    first_entry = {}
+    for entry in entries:
+        first_entry.setdefault(entry['Name'], entry)
+    chemical_dict = {}
+    for i,chemical in enumerate(sorted(first_entry)):
+        entry = first_entry[chemical]
+        print('- model: chemdb.Chemical')
+        print('  pk: {}'.format(i+1))
+        print('  fields:')
+        print('    name: {}'.format(chemical))
+        print('    abbrev: {}'.format(chemical))
+        cas_names = [cas.split(':', 1)[0] for cas in entry['CAS#'].split(',')]
+        cas_indexes = [cas_numbers[cas_name] for cas_name in cas_names
+                       if cas_name in cas_numbers]
+        if cas_indexes:
+            print('    cas: {}'.format(str(cas_indexes)))
+        try:
+            if False:
+                # Disabled placeholder for an 'msds' field; never executes.
+                print('    msds: {}'.format(entry['']))
+            for name,key in [('health', 'H'),
+                             ('fire', 'F'),
+                             ('reactivity', 'R'),
+                             ]:
+                if entry[key] not in ['', '?']:
+                    print('    {}: {}'.format(name, entry[key]))
+            if entry['O']:
+                special = [SPECIALS[entry['O']],]
+                print('    special: {}'.format(special))
+            for name,key in [('mutagen', 'M'),
+                             ('carcinogen', 'C'),
+                             ('teratogen', 'T'),
+                             ]:
+                if entry[key] not in ['', '?']:
+                    print('    {}: {}'.format(name, True))
+            if entry['Note']:
+                note = entry['Note']
+                if ':' in note:
+                    # Quote so the ':' is not read as a mapping separator;
+                    # inside a single-quoted YAML scalar a literal quote
+                    # is written by doubling it.
+                    note = "'{}'".format(note.replace("'", "''"))
+                print('    note: {}'.format(note))
+        except Exception:
+            # Show which entry was being processed, then let the error
+            # propagate unchanged.
+            _sys.stderr.write('{}\n'.format(entry))
+            raise
+        chemical_dict[chemical] = i+1
+    print('')
+    return chemical_dict
+
+def print_locations(entries):
+    """Print a chemdb.Location fixture record for each distinct location.
+
+    Entries with an empty 'Location' are rewritten in place to
+    'unknown' before the distinct values are collected.
+
+    Returns a dict mapping each location name to its primary key (1-based,
+    assigned in sorted order).
+    """
+    for record in entries:
+        if record['Location']:
+            continue
+        record['Location'] = 'unknown'
+    distinct = {record['Location'] for record in entries}
+    location_dict = {}
+    for pk, place in enumerate(sorted(distinct), 1):
+        lines = [
+            '- model: chemdb.Location',
+            '  pk: {}'.format(pk),
+            '  fields:',
+            '    name: {}'.format(place),
+            '    abbrev: {}'.format(place),
+            ]
+        print('\n'.join(lines))
+        location_dict[place] = pk
+    print('')
+    return location_dict
+
+def print_vendors(entries):
+    """Print a chemdb.Vendor fixture record for each distinct vendor.
+
+    Entries whose 'Vendor' is '' or '?' are rewritten in place to
+    'unknown' before the distinct values are collected.
+
+    Returns a dict mapping each vendor name to its primary key (1-based,
+    assigned in sorted order).
+    """
+    for record in entries:
+        if record['Vendor'] not in ['', '?']:
+            continue
+        record['Vendor'] = 'unknown'
+    distinct = {record['Vendor'] for record in entries}
+    vendor_dict = {}
+    for pk, supplier in enumerate(sorted(distinct), 1):
+        lines = [
+            '- model: chemdb.Vendor',
+            '  pk: {}'.format(pk),
+            '  fields:',
+            '    name: {}'.format(supplier),
+            '    abbrev: {}'.format(supplier),
+            ]
+        print('\n'.join(lines))
+        vendor_dict[supplier] = pk
+    print('')
+    return vendor_dict
+
+def print_chemical_instances(entries, chemicals, locations, vendors):
+    """Print a chemdb.ChemicalInstance fixture record for every entry.
+
+    `chemicals`, `locations` and `vendors` map the entry's 'Name',
+    'Location' and 'Vendor' values to primary keys.  'Cat#' and 'Amount'
+    values of '', '?' or '-' are rewritten in place to 'unknown'.
+    'received' and 'disposed' are printed only when date() returns a
+    value for the corresponding column.
+    """
+    for entry in entries:
+        if entry['Cat#'] in ['', '?', '-']:
+            entry['Cat#'] = 'unknown'
+        if entry['Amount'] in ['', '?', '-']:
+            entry['Amount'] = 'unknown'
+    # Order by each entry's sorted (field, value) pairs explicitly rather
+    # than relying on implicit dict-to-dict comparison.
+    ordered = sorted(entries, key=lambda entry: sorted(entry.items()))
+    for i,entry in enumerate(ordered):
+        print('- model: chemdb.ChemicalInstance')
+        print('  pk: {}'.format(i+1))
+        print('  fields:')
+        print('    chemical: {}'.format(chemicals[entry['Name']]))
+        print('    location: {}'.format(locations[entry['Location']]))
+        print('    amount: {}'.format(entry['Amount']))
+        print('    vendor: {}'.format(vendors[entry['Vendor']]))
+        print('    catalog: {}'.format(entry['Cat#']))
+        for cat in ['Received', 'Disposed']:
+            if cat == 'Received' and cat not in entry:
+                # Fall back to the misspelled column name, and treat the
+                # date as missing if neither spelling is present (the
+                # same way a missing 'Disposed' column is handled).
+                d = date(entry.get('Recieved', ''))
+            else:
+                d = date(entry.get(cat, ''))
+            if d:
+                print('    {}: {}'.format(cat.lower(), d))
+
+def upgrade(filename):
+    """Read a tab-delimited text database and print it as a fixture.
+
+    The first line of `filename` must start with '#'; the rest of that
+    line names the tab-separated fields.  Blank lines and lines whose
+    first non-blank character is '#' are skipped, and rows with fewer
+    values than fields are padded with empty strings.
+
+    The records are printed in this order: CAS numbers, chemicals,
+    locations, vendors, chemical instances.
+    """
+    with open(filename, 'r') as f:
+        header = f.readline()
+        assert header.startswith('#'), header
+        fields = [x.strip() for x in header[1:].split('\t')]
+        entries = []
+        for line in f:
+            stripped = line.strip()
+            if not stripped or stripped.startswith('#'):
+                continue
+            # Split the unstripped line: stripping first would eat leading
+            # tabs and shift every value one column to the left whenever
+            # the first field is empty.  Each value is stripped on its own,
+            # which also removes the trailing newline.
+            values = [x.strip() for x in line.split('\t')]
+            if len(values) < len(fields):
+                values.extend([''] * (len(fields) - len(values)))
+            # zip() drops any values beyond the last named field.
+            entries.append(dict(zip(fields, values)))
+    cas_numbers = print_cas_numbers(entries)
+    chemicals = print_chemicals(entries, cas_numbers=cas_numbers)
+    locations = print_locations(entries)
+    vendors = print_vendors(entries)
+    print_chemical_instances(entries, chemicals, locations, vendors)
+
+
+if __name__ == '__main__':
+    # The database path is the first command-line argument; exit with a
+    # usage message on stderr instead of an IndexError traceback when it
+    # is missing.  Any further arguments are ignored.
+    if len(_sys.argv) < 2:
+        _sys.exit('usage: {} FILENAME'.format(_sys.argv[0]))
+    upgrade(_sys.argv[1])
author	W. Trevor King <wking@drexel.edu>
	Mon, 2 Apr 2012 00:56:45 +0000 (20:56 -0400)
committer	W. Trevor King <wking@drexel.edu>
	Mon, 2 Apr 2012 01:01:39 +0000 (21:01 -0400)