pcluster.py

   1 #!/usr/bin/env python
   2
   3 from mdp import pca
   4 from libhooke import WX_GOOD, ClickedPoint
   5 import wxversion
   6 wxversion.select(WX_GOOD)
   7 from wx import PostEvent
   8 import numpy as np
   9 import scipy as sp
  10 import copy
  11 import os.path
  12 import time
  13 import libhookecurve as lhc
  14
  15 import warnings
  16 warnings.simplefilter('ignore',np.RankWarning)
  17
  18
  19 class pclusterCommands:
  20
  21     def _plug_init(self):
  22         self.pca_paths=[]
  23         self.pca_myArray=None
  24         self.clustplot=None
  25
  26     def do_pcluster(self,args):
  27         '''
  28         pCLUSTER
  29         (pcluster.py)
  30         Automatically measures peaks and extracts informations for further clustering
  31         (c)Paolo Pancaldi, Massimo Sandal 2009
  32         '''
  33         #--Custom persistent length
  34         pl_value=None
  35         for arg in args.split():
  36             #look for a persistent length argument.
  37             if 'pl=' in arg:
  38                 pl_expression=arg.split('=')
  39                 pl_value=float(pl_expression[1]) #actual value
  40             else:
  41                 pl_value=None
  42
  43         #configuration variables
  44         min_npks = self.convfilt_config['minpeaks']
  45         min_deviation = self.convfilt_config['mindeviation']
  46
  47         pclust_filename=raw_input('Automeasure filename? ')
  48         realclust_filename=raw_input('Coordinates filename? ')
  49
  50         f=open(pclust_filename,'w+')
  51         f.write('Analysis started '+time.asctime()+'\n')
  52         f.write('----------------------------------------\n')
  53         f.write('; Contour length (nm)  ;  Persistence length (nm) ;  Max.Force (pN)  ;  Slope (N/m) ;  Sigma contour (nm) ; Sigma persistence (nm)\n')
  54         f.close()
  55
  56         f=open(realclust_filename,'w+')
  57         f.write('Analysis started '+time.asctime()+'\n')
  58         f.write('----------------------------------------\n')
  59         f.write('; Peak number ; Mean delta (nm)  ;  Median delta (nm) ;  Mean force (pN)  ;  Median force (pN) ; First peak length (nm) ; Last peak length (nm) ; Max force (pN) ; Min force (pN) ; Max delta (nm) ; Min delta (nm)\n')
  60         f.close()
  61         # ------ FUNCTION ------
  62         def fit_interval_nm(start_index,plot,nm,backwards):
  63             '''
  64             Calculates the number of points to fit, given a fit interval in nm
  65             start_index: index of point
  66             plot: plot to use
  67             backwards: if true, finds a point backwards.
  68             '''
  69             whatset=1 #FIXME: should be decidable
  70             x_vect=plot.vectors[1][0]
  71
  72             c=0
  73             i=start_index
  74             start=x_vect[start_index]
  75             maxlen=len(x_vect)
  76             while abs(x_vect[i]-x_vect[start_index])*(10**9) < nm:
  77                 if i==0 or i==maxlen-1: #we reached boundaries of vector!
  78                     return c
  79                 if backwards:
  80                     i-=1
  81                 else:
  82                     i+=1
  83                 c+=1
  84             return c
  85
  86         def plot_informations(itplot,pl_value):
  87             '''
  88             OUR VARIABLES
  89             contact_point.absolute_coords               (2.4584142802103689e-007, -6.9647135616234017e-009)
  90             peak_point.absolute_coords                  (3.6047748250571423e-008, -7.7142802788854212e-009)
  91             other_fit_point.absolute_coords     (4.1666139243838867e-008, -7.3759393477579707e-009)
  92             peak_location                                                                               [510, 610, 703, 810, 915, 1103]
  93             peak_size                                                                                           [-1.2729111505202212e-009, -9.1632775347399312e-010, -8.1707438353929907e-010, -8.0335812578148904e-010, -8.7483955226387558e-010, -3.6269619757067322e-009]
  94             params                                                                                                      [2.2433999931959462e-007, 3.3230248825175678e-010]
  95             fit_errors                                                                                  [6.5817195369767644e-010, 2.4415923138871498e-011]
  96             '''
  97             fit_points=int(self.config['auto_fit_points']) # number of points to fit before the peak maximum <50>
  98
  99             T=self.config['temperature'] #temperature of the system in kelvins. By default it is 293 K. <301.0>
 100             cindex=self.find_contact_point(itplot[0]) #Automatically find contact point <158, libhooke.ClickedPoint>
 101             contact_point=self._clickize(itplot[0].vectors[1][0], itplot[0].vectors[1][1], cindex)
 102             self.basepoints=[]
 103             base_index_0=peak_location[-1]+fit_interval_nm(peak_location[-1], itplot[0], self.config['auto_right_baseline'],False)
 104             self.basepoints.append(self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],base_index_0))
 105             base_index_1=self.basepoints[0].index+fit_interval_nm(self.basepoints[0].index, itplot[0], self.config['auto_left_baseline'],False)
 106             self.basepoints.append(self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],base_index_1))
 107             self.basecurrent=self.current.path
 108             boundaries=[self.basepoints[0].index, self.basepoints[1].index]
 109             boundaries.sort()
 110             to_average=itplot[0].vectors[1][1][boundaries[0]:boundaries[1]] #y points to average
 111             avg=np.mean(to_average)
 112             return fit_points, contact_point, pl_value, T, cindex, avg
 113
 114         def features_peaks(itplot, peak, fit_points, contact_point, pl_value, T, cindex, avg):
 115             '''
 116             calculate informations for each peak and add they in
 117             c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes
 118             '''
 119             c_leng=None
 120             p_leng=None
 121             sigma_c_leng=None
 122             sigma_p_leng=None
 123             force=None
 124             slope=None
 125
 126             delta_force=10
 127             slope_span=int(self.config['auto_slope_span'])
 128
 129             peak_point=self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],peak)
 130             other_fit_point=self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],peak-fit_points)
 131
 132             points=[contact_point, peak_point, other_fit_point]
 133
 134             params, yfit, xfit, fit_errors = self.wlc_fit(points, itplot[0].vectors[1][0], itplot[0].vectors[1][1], pl_value, T, return_errors=True)
 135
 136             #Measure forces
 137             delta_to_measure=itplot[0].vectors[1][1][peak-delta_force:peak+delta_force]
 138             y=min(delta_to_measure)
 139             #Measure slopes
 140             slope=self.linefit_between(peak-slope_span,peak)[0]
 141             #check fitted data and, if right, add peak to the measurement
 142             if len(params)==1: #if we did choose 1-value fit
 143                 p_leng=pl_value
 144                 c_leng=params[0]*(1.0e+9)
 145                 sigma_p_leng=0
 146                 sigma_c_leng=fit_errors[0]*(1.0e+9)
 147                 force = abs(y-avg)*(1.0e+12)
 148             else: #2-value fit
 149                 p_leng=params[1]*(1.0e+9)
 150                 #check if persistent length makes sense. otherwise, discard peak.
 151                 if p_leng>self.config['auto_min_p'] and p_leng<self.config['auto_max_p']:
 152                     '''
 153                     p_lengths.append(p_leng)
 154                     c_lengths.append(params[0]*(1.0e+9))
 155                     sigma_c_lengths.append(fit_errors[0]*(1.0e+9))
 156                     sigma_p_lengths.append(fit_errors[1]*(1.0e+9))
 157                     forces.append(abs(y-avg)*(1.0e+12))
 158                     slopes.append(slope)
 159                     '''
 160                     c_leng=params[0]*(1.0e+9)
 161                     sigma_c_leng=fit_errors[0]*(1.0e+9)
 162                     sigma_p_leng=fit_errors[1]*(1.0e+9)
 163                     force=abs(y-avg)*(1.0e+12)
 164                 else:
 165                     p_leng=None
 166                     slope=None
 167             #return c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes
 168             return  c_leng, p_leng, sigma_c_leng, sigma_p_leng, force, slope
 169
 170
 171         # ------ PROGRAM -------
 172         c=0
 173         for item in self.current_list:
 174             c+=1
 175             item.identify(self.drivers)
 176             itplot=item.curve.default_plots()
 177             flatten=self._find_plotmanip('flatten') #extract flatten plot manipulator
 178             itplot[0]=flatten(itplot[0], item, customvalue=1)
 179             try:
 180                 peak_location,peak_size=self.exec_has_peaks(item,min_deviation)
 181             except:
 182                 #We have troubles with exec_has_peaks (bad curve, whatever).
 183                 #Print info and go to next cycle.
 184                 print 'Cannot process ',item.path
 185                 continue
 186
 187             if len(peak_location)==0:
 188                 print 'No peaks!'
 189                 continue
 190
 191             fit_points, contact_point, pl_value, T, cindex, avg = plot_informations(itplot,pl_value)
 192
 193             print '\n\nCurve',item.path, 'is',c,'of',len(self.current_list),': found '+str(len(peak_location))+' peaks.'
 194
 195             #initialize output data vectors
 196             c_lengths=[]
 197             p_lengths=[]
 198             sigma_c_lengths=[]
 199             sigma_p_lengths=[]
 200             forces=[]
 201             slopes=[]
 202
 203             #loop each peak of my curve
 204             for peak in peak_location:
 205                 c_leng, p_leng, sigma_c_leng, sigma_p_leng, force, slope = features_peaks(itplot, peak, fit_points, contact_point, pl_value, T, cindex, avg)
 206                 for var, vector in zip([c_leng, p_leng, sigma_c_leng, sigma_p_leng, force, slope],[c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes]):
 207                     if var is not None:
 208                         vector.append(var)
 209
 210             #FIXME: We need a dictionary here...
 211             allvects=[c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes]
 212             for vect in allvects:
 213                 if len(vect)==0:
 214                     for i in range(len(c_lengths)):
 215                         vect.append(0)
 216
 217             print 'Measurements for all peaks detected:'
 218             print 'contour (nm)', c_lengths
 219             print 'sigma contour (nm)',sigma_c_lengths
 220             print 'p (nm)',p_lengths
 221             print 'sigma p (nm)',sigma_p_lengths
 222             print 'forces (pN)',forces
 223             print 'slopes (N/m)',slopes
 224
 225             '''
 226             write automeasure text file
 227             '''
 228             print 'Saving automatic measurement...'
 229             f=open(pclust_filename,'a+')
 230             f.write(item.path+'\n')
 231             for i in range(len(c_lengths)):
 232                 f.write(' ; '+str(c_lengths[i])+' ; '+str(p_lengths[i])+' ; '+str(forces[i])+' ; '+str(slopes[i])+' ; '+str(sigma_c_lengths[i])+' ; '+str(sigma_p_lengths[i])+'\n')
 233             f.close()
 234
 235             '''
 236             calculate clustering coordinates
 237             '''
 238             peak_number=len(c_lengths)
 239             if peak_number > 1:
 240                 deltas=[]
 241                 for i in range(len(c_lengths)-1):
 242                     deltas.append(c_lengths[i+1]-c_lengths[i])
 243
 244                 delta_mean=np.mean(deltas)
 245                 delta_median=np.median(deltas)
 246
 247                 force_mean=np.mean(forces)
 248                 force_median=np.median(forces)
 249
 250                 first_peak_cl=c_lengths[0]
 251                 last_peak_cl=c_lengths[-1]
 252
 253                 max_force=max(forces[:-1])
 254                 min_force=min(forces)
 255
 256                 max_delta=max(deltas)
 257                 min_delta=min(deltas)
 258
 259                 delta_stdev=np.std(deltas)
 260                 forces_stdev=np.std(forces[:-1])
 261
 262                 print 'Coordinates'
 263                 print 'Peaks',peak_number
 264                 print 'Mean delta',delta_mean
 265                 print 'Median delta',delta_median
 266                 print 'Mean force',force_mean
 267                 print 'Median force',force_median
 268                 print 'First peak',first_peak_cl
 269                 print 'Last peak',last_peak_cl
 270                 print 'Max force',max_force
 271                 print 'Min force',min_force
 272                 print 'Max delta',max_delta
 273                 print 'Min delta',min_delta
 274                 print 'Delta stdev',delta_stdev
 275                 print 'Forces stdev',forces_stdev
 276
 277                 '''
 278                 write clustering coordinates
 279                 '''
 280                 f=open(realclust_filename,'a+')
 281                 f.write(item.path+'\n')
 282                 f.write(' ; '+str(peak_number)+     # non considerato
 283                         ' ; '+str(delta_mean)+      # 0
 284                         ' ; '+str(delta_median)+    # 1 -
 285                         ' ; '+str(force_mean)+      # 2
 286                         ' ; '+str(force_median)+    # 3 -
 287                         ' ; '+str(first_peak_cl)+   # 4 -
 288                         ' ; '+str(last_peak_cl)+    # 5 -
 289                         ' ; '+str(max_force)+       # 6
 290                         ' ; '+str(min_force)+       # 7
 291                         ' ; '+str(max_delta)+       # 8
 292                         ' ; '+str(min_delta)+       # 9
 293                         ' ; '+str(delta_stdev)+     # 10
 294                         ' ; '+str(forces_stdev)+    # 11
 295                         '\n')
 296                 f.close()
 297             else:
 298                 pass
 299
 300     def do_pca(self,args):
 301         '''
 302         PCA -> "pca gaeta_coor_blind50.txt 1,3,6"
 303         Automatically measures pca from coordinates filename and shows two interactives plots
 304         With the second argument (arbitrary) you can select the columns and the multiplier factor
 305         to use for the pca (for es "1,3*50,6,8x10,9"). Dont use spaces. "*" or "x" are the same thing.
 306         Without second argument it reads pca_config.txt file
 307         (c)Paolo Pancaldi, Massimo Sandal 2009
 308         '''
 309
 310         # reads the columns of pca
 311         conf=open("pca_config.txt")
 312         config = conf.readlines()
 313         conf.close()
 314
 315         self.pca_myArray = []
 316         self.pca_paths = {}
 317         plot_path_temp = ""
 318         nPlotTot = 0
 319         nPlotGood = 0
 320
 321         # prende in inpunt un arg (nome del file)
 322         # e il secondo e' arbitrario riceve x es "row[1],row2,row[3]"
 323         arg = args.split(" ")
 324         if arg[0]==args:
 325             file_name=args
 326         else:
 327             file_name=arg[0]
 328             config[0] = arg[1]
 329
 330         f=open(file_name)
 331         rows = f.readlines()
 332         for row in rows:
 333             if row[0]=="/":
 334                 nPlotTot = nPlotTot+1
 335                 #plot_path_temp = row.split("/")[6][:-1]
 336                 plot_path_temp = row
 337             if row[0]==" " and row.find('nan')==-1:
 338                 row = row[row.index(";",2)+2:].split(" ; ")     # non considero la prima colonna col #picchi
 339                 row = [float(i) for i in row]
 340
 341                 #0:Mean delta, 1:Median delta, 2:Mean force, 3:Median force, 4:First peak length, 5:Last peak length
 342                 #6:Max delta 7:Min delta 8:Max force 9:Min force 10:Std delta 11:Std force
 343                 if (row[0]<500 and row[1]<500 and row[2]<500 and row[3]<500 and row[4]<500 and row[5]<500):
 344                     if (row[0]>0 and row[1]>0 and row[2]>0 and row[3]>0 and row[4]>0 and row[5]>0):
 345                         self.pca_paths[nPlotGood] = plot_path_temp
 346                         #row = row[0], row[2], row[3]*3, row[6], row[7]*56, row[8]
 347                         res=[]
 348                         for cols in config[0].split(","):
 349                             if cols.find("*")!=-1:
 350                                 col = int(cols.split("*")[0])
 351                                 molt = int(cols.split("*")[1])
 352                             elif cols.find("x")!=-1:
 353                                 col = int(cols.split("x")[0])
 354                                 molt = int(cols.split("x")[1])
 355                             else:
 356                                 col = int(cols)
 357                                 molt = 1
 358                             res.append(row[col]*molt)
 359                         self.pca_myArray.append(res)
 360                         nPlotGood = nPlotGood+1
 361
 362         f.close()
 363         print nPlotGood, "of", nPlotTot
 364
 365         # array convert, calculate PCA, transpose
 366         self.pca_myArray = np.array(self.pca_myArray,dtype='float')
 367         print self.pca_myArray.shape
 368         self.pca_myArray = pca(self.pca_myArray, output_dim=2)  #other way -> y = mdp.nodes.PCANode(output_dim=2)(gigi)
 369         myArrayTr = np.transpose(self.pca_myArray)
 370
 371         # plotting
 372         X=myArrayTr[0]
 373         Y=myArrayTr[1]
 374
 375         X=list(X)
 376         Y=list(Y)
 377
 378         clustplot=lhc.PlotObject()
 379
 380         #FIXME
 381         #our dataset-specific stuff
 382         #This will go away after testing :)
 383         Xsyn=[]
 384         Ysyn=[]
 385
 386         Xgb1=[]
 387         Ygb1=[]
 388
 389         Xbad=[]
 390         Ybad=[]
 391
 392         goodnamefile=open('gaeta_good_blind50.log','r')
 393         #goodnamefile=open('/home/massimo/python/hooke/dataset_clust/roslin_blind50.log','r')
 394         goodnames=goodnamefile.readlines()
 395         goodnames=[i.split()[0] for i in goodnames[1:]]
 396
 397
 398         for index in range(len(self.pca_paths)):
 399             '''
 400             if '3s3' in self.pca_paths[index] and not 'bad' in self.pca_paths[index]:
 401                 Xsyn.append(X[index])
 402                 Ysyn.append(Y[index])
 403             elif 'bad' in self.pca_paths[index]:
 404                 Xbad.append(X[index])
 405                 Ybad.append(Y[index])
 406             else:
 407                 Xgb1.append(X[index])
 408                 Ygb1.append(Y[index])
 409             '''
 410             #print self.pca_paths
 411             if self.pca_paths[index][:-1] in goodnames:
 412                 Xsyn.append(X[index])
 413                 Ysyn.append(Y[index])
 414             else:
 415                 Xbad.append(X[index])
 416                 Ybad.append(Y[index])
 417
 418         print 'blath',len(Xsyn),len(Ysyn)
 419
 420         #clustplot.add_set(Xgb1,Ygb1)
 421         clustplot.add_set(Xbad,Ybad)
 422         clustplot.add_set(Xsyn,Ysyn)
 423         clustplot.normalize_vectors()
 424         clustplot.styles=['scatter', 'scatter','scatter']
 425         clustplot.colors=[None,'red','green']
 426         #clustplot.styles=['scatter',None]
 427         clustplot.destination=1
 428         self._send_plot([clustplot])
 429         self.clustplot=clustplot
 430
 431         # -- exporting coordinates and plot! --
 432
 433         #builds coordinate s file
 434         '''
 435         f = open('coordinate_punti.txt','w')
 436         for i in range(len(X)):
 437             f.write (str(i) + "\t" + str(X[i]) + "\t" + str(Y[i]) + "\n")
 438         f.close()
 439         '''
 440         #save plot
 441         config = config[0].replace("*", "x")
 442         self.do_export("png/" + config + " 1")
 443
 444     def do_multipca(self,args):
 445         '''
 446         MULTIPCA -> "multipca gaeta_coor_blind50.txt 3"
 447         Automatically multiply the column suggest in second argument for value between 1-100 (step of 2),
 448         measures pca from coordinates filename and save the png plots.
 449         (c)Paolo Pancaldi, Massimo Sandal 2009
 450         '''
 451         # reads the columns of pca
 452         conf=open("pca_config.txt")
 453         config = conf.readlines() # config[0] = "1,2,3"
 454         conf.close()
 455         # cycling pca
 456         arg = args.split(" ")
 457         file_name=arg[0]
 458         column=str(arg[1])
 459         for i in range(1, 101, 2):
 460             self.do_pca(file_name + " " + config[0].replace(column,column+"*"+str(i),1))
 461
 462     def do_pclick(self,args):
 463
 464         self._send_plot([self.clustplot]) #quick workaround for BAD problems in the GUI
 465         print 'Click point'
 466         point = self._measure_N_points(N=1, whatset=0)
 467         indice = point[0].index
 468         plot_file = self.pca_paths[indice]
 469         dot_coord = self.pca_myArray[indice]
 470         print "file: " + str(plot_file).rstrip()
 471         print "id: " + str(indice)
 472         print "coord: " + str(dot_coord)
 473         self.do_genlist(str(plot_file))
 474         #self.do_jump(str(plot_file))