PCA improved: it takes columns and multiplier factor from pca_config.txt or from...
[hooke.git] / pcluster.py
1 #!/usr/bin/env python
2
3 from mdp import pca
4 from libhooke import WX_GOOD, ClickedPoint
5 import wxversion
6 wxversion.select(WX_GOOD)
7 from wx import PostEvent
8 import numpy as np
9 import scipy as sp
10 import copy
11 import os.path
12 import time
13 import libhookecurve as lhc
14
15 import warnings
16 warnings.simplefilter('ignore',np.RankWarning)
17
18
19 class pclusterCommands:
20     
21     def _plug_init(self):
22         self.pca_paths=[]
23         self.pca_myArray=None
24         self.clustplot=None
25
26     def do_pcluster(self,args):
27         '''
28         pCLUSTER
29         (pcluster.py)
30         Automatically measures peaks and extracts informations for further clustering
31         (c)Paolo Pancaldi, Massimo Sandal 2009
32         '''
33         #--Custom persistent length
34         pl_value=None
35         for arg in args.split():
36             #look for a persistent length argument.
37             if 'pl=' in arg:
38                 pl_expression=arg.split('=')
39                 pl_value=float(pl_expression[1]) #actual value
40             else:
41                 pl_value=None
42                 
43         #configuration variables
44         min_npks = self.convfilt_config['minpeaks']
45         min_deviation = self.convfilt_config['mindeviation']
46         
47         pclust_filename=raw_input('Automeasure filename? ')
48         realclust_filename=raw_input('Coordinates filename? ')
49         
50         f=open(pclust_filename,'w+')
51         f.write('Analysis started '+time.asctime()+'\n')
52         f.write('----------------------------------------\n')
53         f.write('; Contour length (nm)  ;  Persistence length (nm) ;  Max.Force (pN)  ;  Slope (N/m) ;  Sigma contour (nm) ; Sigma persistence (nm)\n')
54         f.close()
55         
56         f=open(realclust_filename,'w+')
57         f.write('Analysis started '+time.asctime()+'\n')
58         f.write('----------------------------------------\n')
59         f.write('; Peak number ; Mean delta (nm)  ;  Median delta (nm) ;  Mean force (pN)  ;  Median force (pN) ; First peak length (nm) ; Last peak length (nm) ; Max force (pN) ; Min force (pN) ; Max delta (nm) ; Min delta (nm)\n')
60         f.close()
61         # ------ FUNCTION ------
62         def fit_interval_nm(start_index,plot,nm,backwards):
63             '''
64             Calculates the number of points to fit, given a fit interval in nm
65             start_index: index of point
66             plot: plot to use
67             backwards: if true, finds a point backwards.
68             '''
69             whatset=1 #FIXME: should be decidable
70             x_vect=plot.vectors[1][0]
71             
72             c=0
73             i=start_index
74             start=x_vect[start_index]
75             maxlen=len(x_vect)
76             while abs(x_vect[i]-x_vect[start_index])*(10**9) < nm:
77                 if i==0 or i==maxlen-1: #we reached boundaries of vector!
78                     return c
79                 if backwards:
80                     i-=1
81                 else:
82                     i+=1
83                 c+=1
84             return c
85
86         def plot_informations(itplot,pl_value):
87             '''
88             OUR VARIABLES
89             contact_point.absolute_coords               (2.4584142802103689e-007, -6.9647135616234017e-009)
90             peak_point.absolute_coords                  (3.6047748250571423e-008, -7.7142802788854212e-009)
91             other_fit_point.absolute_coords     (4.1666139243838867e-008, -7.3759393477579707e-009)
92             peak_location                                                                               [510, 610, 703, 810, 915, 1103]
93             peak_size                                                                                           [-1.2729111505202212e-009, -9.1632775347399312e-010, -8.1707438353929907e-010, -8.0335812578148904e-010, -8.7483955226387558e-010, -3.6269619757067322e-009]
94             params                                                                                                      [2.2433999931959462e-007, 3.3230248825175678e-010]
95             fit_errors                                                                                  [6.5817195369767644e-010, 2.4415923138871498e-011]
96             '''
97             fit_points=int(self.config['auto_fit_points']) # number of points to fit before the peak maximum <50>
98             
99             T=self.config['temperature'] #temperature of the system in kelvins. By default it is 293 K. <301.0>
100             cindex=self.find_contact_point(itplot[0]) #Automatically find contact point <158, libhooke.ClickedPoint>
101             contact_point=self._clickize(itplot[0].vectors[1][0], itplot[0].vectors[1][1], cindex)
102             self.basepoints=[]
103             base_index_0=peak_location[-1]+fit_interval_nm(peak_location[-1], itplot[0], self.config['auto_right_baseline'],False)
104             self.basepoints.append(self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],base_index_0))
105             base_index_1=self.basepoints[0].index+fit_interval_nm(self.basepoints[0].index, itplot[0], self.config['auto_left_baseline'],False)
106             self.basepoints.append(self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],base_index_1))
107             self.basecurrent=self.current.path
108             boundaries=[self.basepoints[0].index, self.basepoints[1].index]
109             boundaries.sort()
110             to_average=itplot[0].vectors[1][1][boundaries[0]:boundaries[1]] #y points to average
111             avg=np.mean(to_average)
112             return fit_points, contact_point, pl_value, T, cindex, avg
113
114         def features_peaks(itplot, peak, fit_points, contact_point, pl_value, T, cindex, avg):
115             '''
116             calculate informations for each peak and add they in 
117             c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes
118             '''
119             c_leng=None
120             p_leng=None
121             sigma_c_leng=None
122             sigma_p_leng=None
123             force=None
124             slope=None
125             
126             delta_force=10
127             slope_span=int(self.config['auto_slope_span'])
128             
129             peak_point=self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],peak)
130             other_fit_point=self._clickize(itplot[0].vectors[1][0],itplot[0].vectors[1][1],peak-fit_points)
131             
132             points=[contact_point, peak_point, other_fit_point]
133             
134             params, yfit, xfit, fit_errors = self.wlc_fit(points, itplot[0].vectors[1][0], itplot[0].vectors[1][1], pl_value, T, return_errors=True)
135             
136             #Measure forces
137             delta_to_measure=itplot[0].vectors[1][1][peak-delta_force:peak+delta_force]
138             y=min(delta_to_measure)
139             #Measure slopes
140             slope=self.linefit_between(peak-slope_span,peak)[0]
141             #check fitted data and, if right, add peak to the measurement
142             if len(params)==1: #if we did choose 1-value fit
143                 p_leng=pl_value
144                 c_leng=params[0]*(1.0e+9)
145                 sigma_p_leng=0
146                 sigma_c_leng=fit_errors[0]*(1.0e+9)
147                 force = abs(y-avg)*(1.0e+12)
148             else: #2-value fit
149                 p_leng=params[1]*(1.0e+9)
150                 #check if persistent length makes sense. otherwise, discard peak.
151                 if p_leng>self.config['auto_min_p'] and p_leng<self.config['auto_max_p']:
152                     '''
153                     p_lengths.append(p_leng)       
154                     c_lengths.append(params[0]*(1.0e+9))
155                     sigma_c_lengths.append(fit_errors[0]*(1.0e+9))
156                     sigma_p_lengths.append(fit_errors[1]*(1.0e+9))
157                     forces.append(abs(y-avg)*(1.0e+12))
158                     slopes.append(slope)     
159                     '''
160                     c_leng=params[0]*(1.0e+9)
161                     sigma_c_leng=fit_errors[0]*(1.0e+9)
162                     sigma_p_leng=fit_errors[1]*(1.0e+9)
163                     force=abs(y-avg)*(1.0e+12)
164                 else:
165                     p_leng=None
166                     slope=None
167             #return c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes
168             return  c_leng, p_leng, sigma_c_leng, sigma_p_leng, force, slope
169
170
171         # ------ PROGRAM -------
172         c=0
173         for item in self.current_list:
174             c+=1
175             item.identify(self.drivers)
176             itplot=item.curve.default_plots()
177             flatten=self._find_plotmanip('flatten') #extract flatten plot manipulator
178             itplot[0]=flatten(itplot[0], item, customvalue=1)
179             try:
180                 peak_location,peak_size=self.exec_has_peaks(item,min_deviation)
181             except: 
182                 #We have troubles with exec_has_peaks (bad curve, whatever).
183                 #Print info and go to next cycle.
184                 print 'Cannot process ',item.path
185                 continue 
186
187             if len(peak_location)==0:
188                 print 'No peaks!'
189                 continue
190
191             fit_points, contact_point, pl_value, T, cindex, avg = plot_informations(itplot,pl_value)
192             
193             print '\n\nCurve',item.path, 'is',c,'of',len(self.current_list),': found '+str(len(peak_location))+' peaks.'
194
195             #initialize output data vectors
196             c_lengths=[]
197             p_lengths=[]
198             sigma_c_lengths=[]
199             sigma_p_lengths=[]
200             forces=[]
201             slopes=[]
202
203             #loop each peak of my curve
204             for peak in peak_location:
205                 c_leng, p_leng, sigma_c_leng, sigma_p_leng, force, slope = features_peaks(itplot, peak, fit_points, contact_point, pl_value, T, cindex, avg)
206                 for var, vector in zip([c_leng, p_leng, sigma_c_leng, sigma_p_leng, force, slope],[c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes]):
207                     if var is not None:
208                         vector.append(var)
209
210             #FIXME: We need a dictionary here...
211             allvects=[c_lengths, p_lengths, sigma_c_lengths, sigma_p_lengths, forces, slopes]
212             for vect in allvects:
213                 if len(vect)==0:
214                     for i in range(len(c_lengths)):
215                         vect.append(0)
216             
217             print 'Measurements for all peaks detected:'
218             print 'contour (nm)', c_lengths
219             print 'sigma contour (nm)',sigma_c_lengths
220             print 'p (nm)',p_lengths
221             print 'sigma p (nm)',sigma_p_lengths
222             print 'forces (pN)',forces
223             print 'slopes (N/m)',slopes
224             
225             '''
226             write automeasure text file
227             '''
228             print 'Saving automatic measurement...'
229             f=open(pclust_filename,'a+')
230             f.write(item.path+'\n')
231             for i in range(len(c_lengths)):
232                 f.write(' ; '+str(c_lengths[i])+' ; '+str(p_lengths[i])+' ; '+str(forces[i])+' ; '+str(slopes[i])+' ; '+str(sigma_c_lengths[i])+' ; '+str(sigma_p_lengths[i])+'\n')
233             f.close()
234             
235             '''
236             calculate clustering coordinates
237             '''
238             peak_number=len(c_lengths)
239             if peak_number > 1:
240                 deltas=[]
241                 for i in range(len(c_lengths)-1):
242                     deltas.append(c_lengths[i+1]-c_lengths[i])
243                 
244                 delta_mean=np.mean(deltas)
245                 delta_median=np.median(deltas)
246                 
247                 force_mean=np.mean(forces)
248                 force_median=np.median(forces)
249                 
250                 first_peak_cl=c_lengths[0]
251                 last_peak_cl=c_lengths[-1]
252                 
253                 max_force=max(forces[:-1])
254                 min_force=min(forces)
255                 
256                 max_delta=max(deltas)
257                 min_delta=min(deltas)
258                 
259                 delta_stdev=np.std(deltas)
260                 forces_stdev=np.std(forces[:-1])
261                 
262                 print 'Coordinates'
263                 print 'Peaks',peak_number
264                 print 'Mean delta',delta_mean
265                 print 'Median delta',delta_median
266                 print 'Mean force',force_mean
267                 print 'Median force',force_median
268                 print 'First peak',first_peak_cl
269                 print 'Last peak',last_peak_cl
270                 print 'Max force',max_force
271                 print 'Min force',min_force
272                 print 'Max delta',max_delta
273                 print 'Min delta',min_delta
274                 print 'Delta stdev',delta_stdev
275                 print 'Forces stdev',forces_stdev
276                 
277                 '''
278                 write clustering coordinates
279                 '''
280                 f=open(realclust_filename,'a+')
281                 f.write(item.path+'\n')
282                 f.write(' ; '+str(peak_number)+     # non considerato
283                         ' ; '+str(delta_mean)+      # 0
284                         ' ; '+str(delta_median)+    # 1 -
285                         ' ; '+str(force_mean)+      # 2
286                         ' ; '+str(force_median)+    # 3 -
287                         ' ; '+str(first_peak_cl)+   # 4 -
288                         ' ; '+str(last_peak_cl)+    # 5 -
289                         ' ; '+str(max_force)+       # 6
290                         ' ; '+str(min_force)+       # 7
291                         ' ; '+str(max_delta)+       # 8
292                         ' ; '+str(min_delta)+       # 9
293                         ' ; '+str(delta_stdev)+     # 10
294                         ' ; '+str(forces_stdev)+    # 11
295                         '\n')
296                 f.close()
297             else:
298                 pass
299                 
300     def do_pca(self,args):
301         '''
302         PCA -> "pca gaeta_coor_blind50.txt 1,3,6"
303         Automatically measures pca from coordinates filename and shows two interactives plots
304         With the second argument (arbitrary) you can select the columns and the multiplier factor 
305         to use for the pca (for es "1,3*50,6,8x10,9"). Dont use spaces. "*" or "x" are the same thing.
306         Without second argument it reads pca_config.txt file
307         (c)Paolo Pancaldi, Massimo Sandal 2009
308         '''
309         
310         # reads the columns of pca
311         conf=open("pca_config.txt")
312         config = conf.readlines()
313         conf.close()
314         
315         self.pca_myArray = []
316         self.pca_paths = {}
317         plot_path_temp = ""
318         nPlotTot = 0
319         nPlotGood = 0
320         
321         # prende in inpunt un arg (nome del file) 
322         # e il secondo e' arbitrario riceve x es "row[1],row2,row[3]"
323         arg = args.split(" ")
324         if arg[0]==args:
325             file_name=args
326         else:
327             file_name=arg[0]
328             config[0] = arg[1]
329         
330         f=open(file_name)
331         rows = f.readlines()
332         for row in rows:
333             if row[0]=="/":
334                 nPlotTot = nPlotTot+1
335                 #plot_path_temp = row.split("/")[6][:-1]
336                 plot_path_temp = row
337             if row[0]==" " and row.find('nan')==-1:
338                 row = row[row.index(";",2)+2:].split(" ; ")     # non considero la prima colonna col #picchi
339                 row = [float(i) for i in row]
340                         
341                 #0:Mean delta, 1:Median delta, 2:Mean force, 3:Median force, 4:First peak length, 5:Last peak length
342                 #6:Max delta 7:Min delta 8:Max force 9:Min force 10:Std delta 11:Std force
343                 if (row[0]<500 and row[1]<500 and row[2]<500 and row[3]<500 and row[4]<500 and row[5]<500):
344                     if (row[0]>0 and row[1]>0 and row[2]>0 and row[3]>0 and row[4]>0 and row[5]>0):
345                         self.pca_paths[nPlotGood] = plot_path_temp
346                         #row = row[0], row[2], row[3]*3, row[6], row[7]*56, row[8]
347                         res=[]
348                         for cols in config[0].split(","):
349                             if cols.find("*")!=-1:
350                                 col = int(cols.split("*")[0])
351                                 molt = int(cols.split("*")[1])
352                             elif cols.find("x")!=-1:
353                                 col = int(cols.split("x")[0])
354                                 molt = int(cols.split("x")[1])
355                             else:
356                                 col = int(cols)
357                                 molt = 1
358                             res.append(row[col]*molt)
359                         self.pca_myArray.append(res)
360                         nPlotGood = nPlotGood+1
361                         
362         f.close()
363         print nPlotGood, "of", nPlotTot
364         
365         # array convert, calculate PCA, transpose
366         self.pca_myArray = np.array(self.pca_myArray,dtype='float')
367         print self.pca_myArray.shape
368         self.pca_myArray = pca(self.pca_myArray, output_dim=2)  #other way -> y = mdp.nodes.PCANode(output_dim=2)(gigi)
369         myArrayTr = np.transpose(self.pca_myArray)
370         
371         # plotting
372         X=myArrayTr[0]
373         Y=myArrayTr[1]
374         
375         X=list(X)
376         Y=list(Y)
377         
378         clustplot=lhc.PlotObject()
379         
380         #FIXME
381         #our dataset-specific stuff
382         #This will go away after testing :)
383         Xsyn=[]
384         Ysyn=[]
385         
386         Xgb1=[]
387         Ygb1=[]
388         
389         Xbad=[]
390         Ybad=[]
391         
392         goodnamefile=open('gaeta_good_blind50.log','r')
393         #goodnamefile=open('/home/massimo/python/hooke/dataset_clust/roslin_blind50.log','r')
394         goodnames=goodnamefile.readlines()
395         goodnames=[i.split()[0] for i in goodnames[1:]]
396         
397         
398         for index in range(len(self.pca_paths)):
399             '''
400             if '3s3' in self.pca_paths[index] and not 'bad' in self.pca_paths[index]:
401                 Xsyn.append(X[index])
402                 Ysyn.append(Y[index])
403             elif 'bad' in self.pca_paths[index]:
404                 Xbad.append(X[index])
405                 Ybad.append(Y[index])
406             else:
407                 Xgb1.append(X[index])
408                 Ygb1.append(Y[index])
409             '''
410             #print self.pca_paths
411             if self.pca_paths[index][:-1] in goodnames:
412                 Xsyn.append(X[index])
413                 Ysyn.append(Y[index])
414             else:
415                 Xbad.append(X[index])
416                 Ybad.append(Y[index])
417             
418         print 'blath',len(Xsyn),len(Ysyn)
419         
420         #clustplot.add_set(Xgb1,Ygb1)
421         clustplot.add_set(Xbad,Ybad)
422         clustplot.add_set(Xsyn,Ysyn)
423         clustplot.normalize_vectors()
424         clustplot.styles=['scatter', 'scatter','scatter']
425         clustplot.colors=[None,'red','green']
426         #clustplot.styles=['scatter',None]
427         clustplot.destination=1
428         self._send_plot([clustplot])
429         self.clustplot=clustplot
430         
431         # -- exporting coordinates and plot! --
432         
433         #builds coordinate s file
434         '''
435         f = open('coordinate_punti.txt','w')
436         for i in range(len(X)):
437             f.write (str(i) + "\t" + str(X[i]) + "\t" + str(Y[i]) + "\n")
438         f.close()
439         '''
440         #save plot
441         config = config[0].replace("*", "x")
442         self.do_export("png/" + config + " 1")
443             
444     def do_multipca(self,args):
445         '''
446         MULTIPCA -> "multipca gaeta_coor_blind50.txt 3"
447         Automatically multiply the column suggest in second argument for value between 1-100 (step of 2), 
448         measures pca from coordinates filename and save the png plots.
449         (c)Paolo Pancaldi, Massimo Sandal 2009
450         '''
451         # reads the columns of pca
452         conf=open("pca_config.txt")
453         config = conf.readlines() # config[0] = "1,2,3"
454         conf.close()
455         # cycling pca
456         arg = args.split(" ")
457         file_name=arg[0]
458         column=str(arg[1])
459         for i in range(1, 101, 2):
460             self.do_pca(file_name + " " + config[0].replace(column,column+"*"+str(i),1))
461         
462     def do_pclick(self,args):        
463         
464         self._send_plot([self.clustplot]) #quick workaround for BAD problems in the GUI
465         print 'Click point'
466         point = self._measure_N_points(N=1, whatset=0)
467         indice = point[0].index
468         plot_file = self.pca_paths[indice]
469         dot_coord = self.pca_myArray[indice]
470         print "file: " + str(plot_file).rstrip()
471         print "id: " + str(indice)
472         print "coord: " + str(dot_coord)
473         self.do_genlist(str(plot_file))
474         #self.do_jump(str(plot_file))