generate_data.py

   1 import sys
   2 import os
   3 import math
   4 from random import choice, randint, random
   5 import calendar
   6
   7 class Person:
   8     maxCI = 25
   9     # teenagers are hereby declared to be between 11 and 20 years old
  10     birthyears = range(1991,2000)
  11     repeatFraction = 0.1
  12
  13     names = ['john', 'paul', 'george', 'ringo',\
  14         'baby','scary','posh','ginger','madonna',\
  15         'prince','robyn','beyonce','jay']
  16     words =['Beatle','Spice','Backstreet','Sync','Jonas',\
  17         'Lennon','McCartney','Starr','Harrison','Z',\
  18         'Carrot','Broccoli','Asparagus','Beet']
  19     CIs=range(1,maxCI+1)
  20     birthmonths= range(1,13)
  21     #ensure unique ids
  22     serialNum=173
  23     sexes=['M','F','N']
  24
  25     def age(self, curyr=2011, curmo=11):
  26         return curyr+(1.*curmo-1.)/12. - self.birthyear - 1.*(self.birthmonth-1.)/12.
  27
  28     def __init__(self):
  29         self.subject = choice(Person.names)+choice(Person.words)+ ('%03d' % Person.serialNum)
  30         Person.serialNum = Person.serialNum + 1
  31
  32         self.birthyear  = choice(Person.birthyears)
  33         self.birthmonth = choice(Person.birthmonths)
  34
  35         self.sex = choice(Person.sexes)
  36         age = self.age(2011,11)
  37         self.CI = choice(Person.CIs)
  38
  39         # newer CIs have better volume, discrimination;
  40         # range goes down with age.  (say).
  41
  42         CInewness = (self.CI-1.)/(1.*max(Person.CIs))
  43         # from oldest CI to newest, gain 2 volume pts:
  44         self.trueVolume = randint(0,4)+randint(1,4)+round(2.*CInewness)
  45
  46         # from oldest CI to newest, gain 3 discrimination pts:
  47         self.trueDiscrimination = randint(0,3)+randint(1,4)+round(3.*CInewness)
  48
  49         # 21-year-olds would lose 3 range points over 10 year olds (say)
  50         self.trueRange = randint(0,4)+randint(1,6)+round((10.-(self.age()-11.))*3./10.)
  51
  52         # Most people don't repeat; those that do take the test 2-5 times
  53         if (random() > Person.repeatFraction):
  54             self.repeats = 1
  55         else:
  56             self.repeats=choice(range(2,6))
  57
  58
  59 from numpy import polyfit, array
  60 def test_peopleCorrelations():
  61     testpeople = []
  62     npeople = 4000
  63     for pnum in xrange(1,npeople):
  64         testpeople.append(Person())
  65
  66     data = [[p.age(), p.CI, p.trueVolume, p.trueRange, p.trueDiscrimination] for p in testpeople]
  67     ages, cis, vols, ranges, discs = zip(*data)
  68
  69     CIVolParam, dummy   = polyfit(cis, vols, 1)
  70     CIRangeParam, dummy = polyfit(cis, ranges, 1)
  71     CIDiscParam, dummy  = polyfit(cis, discs, 1)
  72
  73     AgeVolParam, dummy   = polyfit(ages, vols, 1)
  74     AgeRangeParam, dummy = polyfit(ages, ranges, 1)
  75     AgeDiscParam, dummy  = polyfit(ages, discs, 1)
  76
  77     assert CIVolParam > 0.75*(2./25.) and CIVolParam < 1.25*(2./25.)
  78     assert CIDiscParam > 0.75*(3./25.) and CIDiscParam < 1.25*(3./25.)
  79     assert AgeRangeParam < 0.75*(-3./10.) and AgeRangeParam > 1.25*(-3./10.)
  80
  81     zeroTol = 0.03
  82     assert abs(CIRangeParam) < zeroTol
  83     assert abs(AgeVolParam)  < zeroTol
  84     assert abs(AgeDiscParam) < zeroTol
  85
  86
  87
  88 class Measurement:
  89     incompleteFraction = 0.05
  90     serialNum = 211
  91     def randomDate(self):
  92         hrs = range(8,17)
  93         mins = range(1,60)
  94         secs = range(1,60)
  95         months = range(5,10)
  96
  97         month = choice(months)
  98         monthname = calendar.month_abbr[month]
  99         day = choice(range(1,calendar.monthrange(2011, month)[1]))
 100         dayname = calendar.day_abbr[calendar.weekday(2011, month, day)]
 101         hr = choice(hrs)
 102         min = choice(mins)
 103         sec = choice(secs)
 104
 105         datestring = '%s %s %d %02d:%02d:%02d %s' % (dayname, monthname, day, hr, min, sec, '2011')
 106         return [datestring, month, day, hr, min, sec]
 107
 108     def limit(self,n):
 109         if n < 1 :
 110             n = 1
 111         if n > 10 :
 112             n = 10
 113         return n
 114
 115     def __init__(self, p):
 116         """Generate a result"""
 117         self.person = p
 118         self.datestring, self.month, self.day, self.hr, self.min, self.sec = self.randomDate();
 119
 120         self.serialNum = Measurement.serialNum
 121         Measurement.serialNum = Measurement.serialNum + 1
 122
 123         # +/- 1 random measurement error
 124         self.volume = self.person.trueVolume + choice([-1,0,0,0,+1])
 125         self.range  = self.person.trueRange + choice([-1,0,0,0,+1])
 126         self.discrimination  = self.person.trueDiscrimination + choice([-1,0,0,0,+1])
 127
 128         self.volume = self.limit(self.volume)
 129         self.range = self.limit(self.range)
 130         self.discrimination = self.limit(self.discrimination)
 131
 132         # before this date, things were being recorded 0..9 rather than 1..10
 133         fixmonth = 8
 134         fixday = 18
 135         fixhr = 10
 136
 137         fixdate = fixmonth*10000 + fixday*100 + fixhr
 138         checkdate = self.month*10000 + self.day*100 + self.hr
 139         if checkdate < fixdate:
 140             self.volume = self.volume - 1
 141             self.range = self.range - 1
 142             self.discrimination = self.discrimination - 1
 143
 144         if (random() < Measurement.incompleteFraction):
 145             self.discrimination = None
 146
 147
 148     def __str__(self):
 149         text = '# ' + '\n'
 150         text += "%s: %s\n" % ( 'Reported', self.datestring )
 151         text += "%s: %s\n" % ( 'Subject',  self.person.subject )
 152         text += "%s: %4d/%02d\n" % ( 'Year/month of birth', self.person.birthyear,  self.person.birthmonth )
 153         text += "%s: %s\n" % ( 'Sex', self.person.sex )
 154         text += "%s: %d\n" % ( 'CI type', self.person.CI )
 155         text += "%s: %d\n" % ( 'Volume', self.volume )
 156         text += "%s: %d\n" % ( 'Range', self.range )
 157         if self.discrimination is None :
 158             text += "%s: \n" % ( 'Discrimination' )
 159         else:
 160             text += "%s: %d\n" % ( 'Discrimination', self.discrimination )
 161
 162         return text
 163
 164 class Datataker:
 165     names = ['angela', 'JamesD', 'jamesm', 'Frank_Richard',\
 166         'lab183','THOMAS','alexander','Beth','Lawrence',\
 167         'Toni', 'gerdal', 'Bert', 'Ernie', 'olivia', 'Leandra',\
 168         'sonya_p', 'h_jackson']
 169     filenamestyles = ['data_%d','Data%04d','%d','%04d','audioresult-%05d']
 170     suffixstyles = ['.dat','.txt','','','.DATA']
 171     tookNotesFraction = 0.5
 172     notes = ['Took data on Thursday and Friday until 4pm;\nAll day saturday.\n',\
 173              'Contact Janice about new calibration for data in August.\n',\
 174              'Submission of hours last week shows only 7 hours because \none was spent cleaning the lab.\n',\
 175              'Had some trouble accessing data submission form on Saturday,\nso fewer submissions then.\n',\
 176              'Third subject had real problems with the discrimiation test, so omitted.\n',\
 177              'Discrimination test seems kind of flaky - had to skip in several cases\n',\
 178              'Fuse blew midway through this weeks data taking,\nfewer results than last week.\n']
 179     notefilenames = ['notes.txt','NOTES','ReadMe','misc.txt','About']
 180
 181     def __init__(self):
 182         self.name = choice(Datataker.names)
 183         Datataker.names.remove(self.name)
 184         self.filenameprefix = choice(Datataker.filenamestyles)
 185         self.filenamesuffix = choice(Datataker.suffixstyles)
 186         self.measures = []
 187         self.tookNotes = False
 188         if (random() < Datataker.tookNotesFraction) :
 189             self.tookNotes = True
 190             self.notes = choice(Datataker.notes)
 191             self.noteFilename = choice(Datataker.notefilenames)
 192
 193     def addmeasurement(self,measurement):
 194         self.measures.append(measurement)
 195
 196     def write(self):
 197         os.mkdir(self.name)
 198         os.chdir(self.name)
 199
 200         if (self.tookNotes):
 201             fname = self.noteFilename
 202             file = open(fname, 'w')
 203             file.write(self.notes)
 204             file.close()
 205
 206         for m in self.measures:
 207             fname = self.filenameprefix % m.serialNum + self.filenamesuffix
 208             file = open(fname, 'w')
 209             file.write(str(m))
 210             file.close()
 211         os.chdir('..')
 212
 213
 214 def main():
 215     #test_peopleCorrelations()
 216
 217     npeople = 300 # should generate ~ .9*300 + 3.5*.1*300 ~ 375 files
 218     nfiles = 351
 219
 220     people = []
 221     for pnum in range(npeople):
 222         people.append(Person())
 223
 224     measurements = []
 225     for p in people:
 226         for m in range(p.repeats):
 227             measurements.append(Measurement(p))
 228
 229     nexperimenters = 7
 230     experimenters = []
 231     for i in range(nexperimenters):
 232         experimenters.append(Datataker())
 233
 234     for fnum in xrange(min(len(measurements), nfiles)):
 235         ex = choice(experimenters)
 236         ex.addmeasurement(measurements[fnum])
 237
 238     os.mkdir('data')
 239     os.chdir('data')
 240     for ex in experimenters:
 241         ex.write()
 242     os.chdir('..')
 243
 244 if __name__=='__main__':
 245     sys.exit(main())
 246