Bez popisu

dataanalysis.py 7.3KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263
  1. from readsample import *
  2. from operator import itemgetter
  3. class dataanalysis:
  4. def getPeaks(self, samples, n_peaks, hash_key = "intensity"):
  5. "Gets the top n_peaks massspec values in the file"
  6. from operator import itemgetter
  7. all_value = []
  8. sample = samples.next()
  9. while sample:
  10. all_value.append(sample)
  11. sample = samples.next()
  12. all_value = sorted(all_value, key=itemgetter(hash_key), reverse=True)
  13. return all_value[:n_peaks]
  14. def getPeaksFromFile(self, samples, peaks, peak_range=4):
  15. "Returns the n_peaks in a smaple file with key ms_key"
  16. #peaks_hash = {key: {} for key in peaks}
  17. peaks_hash = {}
  18. for key in peaks:
  19. peaks_hash[key] = {}
  20. for peak in peaks_hash.keys():
  21. for val in range(int(peak) - peak_range, int(peak) + peak_range + 1):
  22. peaks_hash[peak][val] = {"peak": val, "intensity": 0}
  23. #print peaks_hash
  24. sample = samples.next()
  25. while sample:
  26. for peak in peaks:
  27. # Extrack the samples in range of each peak
  28. if sample["peak"] >= (int(peak) - peak_range) and sample["peak"] <= (int(peak) + peak_range+1):
  29. #peaks_hash[peak][0].append(sample)
  30. # Keep track of the max_intensity
  31. key = int(sample["peak"])
  32. if peaks_hash[peak][key]["intensity"] < sample["intensity"]:
  33. peaks_hash[peak][key] = sample
  34. sample = samples.next()
  35. # Final filtering. Removing the false peaks.
  36. self.FilterPeaks(peaks_hash)
  37. return peaks_hash
  38. def FilterPeaks(self, peaks_hash):
  39. # Final filtering. Removing the false peaks.
  40. for peak in peaks_hash.keys():
  41. # Find max peak
  42. max_peak = self.getMaxPeak(peaks_hash[peak])
  43. keys = peaks_hash[peak].keys()
  44. #print peaks_hash[peak]
  45. keys.sort()
  46. # Get list position of the max peak
  47. position = keys.index(int(max_peak["peak"]))
  48. # from the max down remove if intensity is greater than the next peak or
  49. # if decimal is not decimal of the peak
  50. current_intensity = max_peak["intensity"]
  51. max_dec = float(max_peak["peak"]) -1 #int(str(max_peak["peak"]).split(".")[1][0])
  52. discard = 0 # After one point is discarded all others are.
  53. for i in range(position-1, -1, -1):
  54. current_dec = float(peaks_hash[peak][keys[i]]["peak"])#int(str(peaks_hash[peak][keys[i]]["peak"]).split(".")[1][0])
  55. #print current_dec
  56. if discard or current_dec < max_dec - .12 or current_dec > max_dec + .12 or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity:
  57. del peaks_hash[peak][keys[i]]
  58. discard = 1
  59. else:
  60. current_intensity = peaks_hash[peak][keys[i]]["intensity"]
  61. max_dec -= 1
  62. # From the max up remove if intensity is greater than the next peak or
  63. # if decimal is not decimal of the peak
  64. current_intensity = max_peak["intensity"]
  65. max_dec = float(max_peak["peak"]) + 1
  66. discard = 0 # After one point is discarded all others are.
  67. for i in range(position+1, len(keys)):
  68. current_dec = float(peaks_hash[peak][keys[i]]["peak"]) #int(str(peaks_hash[peak][keys[i]]["peak"]).split(".")[1][0])
  69. if discard or current_dec > max_dec + .12 or current_dec < max_dec - .12 or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity:
  70. del peaks_hash[peak][keys[i]]
  71. discard = 1
  72. else:
  73. current_intensity = peaks_hash[peak][keys[i]]["intensity"]
  74. max_dec += 1
  75. def getMaxPeak(self, peaks):
  76. max_peak = {"peak": 0, "intensity": -1}
  77. for peak in peaks.keys():
  78. if max_peak["intensity"] < peaks[peak]["intensity"]:
  79. max_peak = peaks[peak]
  80. return max_peak
  81. def getPeaksFromFileSet(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4):
  82. "Returns a set of n_peaks in a set of sample files with keys found in peaks_file"
  83. peak_set = {}
  84. for filename in file_set:
  85. samples = readsample(filename, sample_keys)
  86. peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range)
  87. samples.close()
  88. return peak_set
  89. def getPeaksFromFileList(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4):
  90. "Returns a set of n_peaks in a set of sample files with keys found in list peaks"
  91. peak_set = {}
  92. for file in file_set:
  93. samples = readsample(filename, sample_keys)
  94. peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range)
  95. samples.close()
  96. return peak_set
  97. def computePeakIntensity(self, peaks, hash_key="intensity"):
  98. acum = 0
  99. if type(peaks) == dict:
  100. for key in peaks.keys():
  101. acum += peaks[key][hash_key]
  102. # As comes from SQL DB
  103. else:
  104. for peak in peaks:
  105. acum += peak["peak"]
  106. return acum
  107. def computePeaksRelativeIntensity(self, peaks_childs, hash_key="intensity"):
  108. totalIntentisty = 0
  109. relative_intensities = {}
  110. if type(peaks_childs) == dict:
  111. for peak in peaks_childs.keys():
  112. relative_intensities[peak] = self.computePeakIntensity(peaks_childs[peak], hash_key)
  113. totalIntentisty += relative_intensities[peak]
  114. for peak in peaks_childs.keys():
  115. relative_intensities[peak] = relative_intensities[peak] / float(totalIntentisty)
  116. # As comes from SQL DB
  117. else:
  118. for peak in peaks_childs:
  119. relative_intensities[peak['peak']] = self.computePeakIntensity(peak["intensities"])
  120. totalIntentisty += relative_intensities[peak['peak']]
  121. for peak in peaks_childs:
  122. relative_intensities[peak['peak']] = relative_intensities[peak['peak']] / float(totalIntentisty)
  123. return relative_intensities
  124. def relativeAbundanceAverage(self, abundances):
  125. abundanceAverages = {}
  126. peaks = abundances[0].keys()
  127. for peak in peaks:
  128. abundanceAverages[peak] = 0
  129. for i in range(len(abundances)):
  130. abundanceAverages[peak] += abundances[i][peak]
  131. abundanceAverages[peak] = abundanceAverages[peak]/len(abundances)
  132. return abundanceAverages
  133. def relativeAbundanceStdDeviation(self, abundances, average):
  134. import math
  135. abundanceStdDev = {}
  136. peaks = average.keys()
  137. for peak in peaks:
  138. abundanceStdDev[peak] = 0
  139. for i in range(len(abundances)):
  140. abundanceStdDev[peak] += pow(abundances[i][peak] - average[peak], 2)
  141. abundanceStdDev[peak] = math.sqrt(1/float((len(abundances) - 1)) * abundanceStdDev[peak])
  142. return abundanceStdDev
  143. def relativeAbundanceSDM(self, deviations, sample_size):
  144. import math
  145. abundanceSDM = {}
  146. for peak in deviations.keys():
  147. abundanceSDM[peak] = deviations[peak]/float(math.sqrt(sample_size))
  148. return abundanceSDM
  149. def printPeaksChilds(self, peaks_childs):
  150. for peak in peaks_childs.keys():
  151. print peak
  152. for child in peaks_childs[peak].keys():
  153. print "\t", peaks_childs[peak][child]["peak"], peaks_childs[peak][child]["intensity"],
  154. def main():
  155. "Main to test the library"
  156. peakread = readsample(sys.argv[1], None, None, 0)
  157. peaks = peakread.readAll()
  158. peaks = [float(x[0]) for x in peaks]
  159. print peaks
  160. samples = readsample(sys.argv[2], ["peak", "intensity"])
  161. da = dataanalysis()
  162. peaks_childs = da.getPeaksFromFile(samples, peaks)
  163. da.printPeaksChilds(peaks_childs)
  164. print da.computePeaksRelativeIntensity(peaks_childs)
  165. sys.exit(0)
  166. files_peak_childs = da.getPeaksFromFileSet(sys.argv[2:], peaks)
  167. for key in files_peak_childs.keys():
  168. print key
  169. da.printPeaksChilds(files_peak_childs[key])
  170. samples.close()
  171. ### Sort by peak
  172. samples = readsample(sys.argv[2])
  173. peaks = da.getPeaks(samples, 60)
  174. for peak in peaks:
  175. print peak
  176. print
  177. print
  178. peaks = sorted(peaks, key=itemgetter("peak"), reverse=True)
  179. for peak in peaks:
  180. print peak
  181. samples.close()
  182. if __name__ == '__main__':
  183. main()