"Peak extraction and relative-abundance statistics for mass-spec sample files"

import math
import sys
from operator import itemgetter

from readsample import *


class dataanalysis:

    def getPeaks(self, samples, n_peaks, hash_key="intensity"):
        "Gets the top n_peaks mass-spec values in the file"

        all_values = []
        sample = samples.next()
        while sample:
            all_values.append(sample)
            sample = samples.next()

        # Sort by the requested key (intensity by default), largest first.
        all_values = sorted(all_values, key=itemgetter(hash_key), reverse=True)

        return all_values[:n_peaks]
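
    # Example (hypothetical usage; assumes readsample yields dicts with "peak"
    # and "intensity" keys and returns a falsy value once the file is exhausted,
    # as the loop above relies on):
    #
    #   samples = readsample("run01.txt", ["peak", "intensity"])
    #   top10 = dataanalysis().getPeaks(samples, 10)
    #   for s in top10:
    #       print s
    #   samples.close()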

    def getPeaksFromFile(self, samples, peaks, peak_range=4):
        "Returns the candidate peaks found in a sample file around each target peak"

        # One sub-dict per target peak, with one placeholder slot per integer
        # m/z bin in the window [peak - peak_range, peak + peak_range].
        peaks_hash = {}
        for key in peaks:
            peaks_hash[key] = {}

        for peak in peaks_hash.keys():
            for val in range(int(peak) - peak_range, int(peak) + peak_range + 1):
                peaks_hash[peak][val] = {"peak": val, "intensity": 0}

        sample = samples.next()
        while sample:
            for peak in peaks:
                # Extract the samples in range of each peak and keep only the
                # most intense sample seen in each integer m/z bin.
                if (int(peak) - peak_range) <= sample["peak"] < (int(peak) + peak_range + 1):
                    key = int(sample["peak"])
                    if peaks_hash[peak][key]["intensity"] < sample["intensity"]:
                        peaks_hash[peak][key] = sample

            sample = samples.next()

        # Final filtering: remove the false peaks.
        self.FilterPeaks(peaks_hash)

        return peaks_hash
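
    # The returned structure maps each target peak to a dict of integer m/z
    # bins, each holding the strongest sample kept for that bin after
    # FilterPeaks has pruned the window, e.g. (hypothetical values):
    #
    #   {433.9: {433: {"peak": 433.91, "intensity": 1204.5},
    #            434: {"peak": 434.92, "intensity": 645.1}}}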

    def FilterPeaks(self, peaks_hash):
        "Removes false peaks, keeping a decreasing envelope spaced ~1 m/z around the apex"

        for peak in peaks_hash.keys():
            # Find the most intense entry (the apex) for this target peak.
            max_peak = self.getMaxPeak(peaks_hash[peak])
            keys = peaks_hash[peak].keys()
            keys.sort()

            # Get the list position of the apex.
            position = keys.index(int(max_peak["peak"]))

            # Walk down from the apex: drop a bin if its intensity does not
            # keep decreasing or if its m/z drifts more than 0.12 from the
            # expected -1 spacing; once one bin is dropped, all lower bins are.
            current_intensity = max_peak["intensity"]
            max_dec = float(max_peak["peak"]) - 1
            discard = 0
            for i in range(position - 1, -1, -1):
                current_dec = float(peaks_hash[peak][keys[i]]["peak"])
                if (discard or current_dec < max_dec - .12 or current_dec > max_dec + .12
                        or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity):
                    del peaks_hash[peak][keys[i]]
                    discard = 1
                else:
                    current_intensity = peaks_hash[peak][keys[i]]["intensity"]
                    max_dec -= 1

            # Walk up from the apex with the same rule, using a +1 spacing.
            current_intensity = max_peak["intensity"]
            max_dec = float(max_peak["peak"]) + 1
            discard = 0
            for i in range(position + 1, len(keys)):
                current_dec = float(peaks_hash[peak][keys[i]]["peak"])
                if (discard or current_dec > max_dec + .12 or current_dec < max_dec - .12
                        or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity):
                    del peaks_hash[peak][keys[i]]
                    discard = 1
                else:
                    current_intensity = peaks_hash[peak][keys[i]]["intensity"]
                    max_dec += 1
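
    # Example of the filtering rule (hypothetical numbers): with an apex at
    # m/z 433.91 (intensity 1000), a bin at 434.92 / 600 is kept (within 0.12
    # of the expected 434.91 and weaker than the apex), while a bin at
    # 435.40 / 300 and every bin above it is dropped, because 435.40 is more
    # than 0.12 away from the next expected position, 435.91.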

    def getMaxPeak(self, peaks):
        "Returns the entry with the highest intensity among the given bins"
        max_peak = {"peak": 0, "intensity": -1}
        for peak in peaks.keys():
            if max_peak["intensity"] < peaks[peak]["intensity"]:
                max_peak = peaks[peak]
        return max_peak

    def getPeaksFromFileSet(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4):
        "Returns, for each file in file_set, the peaks found around each target peak"

        peak_set = {}

        for filename in file_set:
            samples = readsample(filename, sample_keys)
            # Key the result by the filename with whitespace removed.
            peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range)
            samples.close()
        return peak_set
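
    # Example (hypothetical file names; assumes each file can be parsed by
    # readsample with "peak" and "intensity" columns):
    #
    #   da = dataanalysis()
    #   results = da.getPeaksFromFileSet(["run01.txt", "run02.txt"], [433.9, 521.4])
    #   for name in results.keys():
    #       da.printPeaksChilds(results[name])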

    def getPeaksFromFileList(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4):
        "Returns, for each file in the list file_set, the peaks found around each target peak"

        peak_set = {}

        for filename in file_set:
            samples = readsample(filename, sample_keys)
            peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range)
            samples.close()
        return peak_set

    def computePeakIntensity(self, peaks, hash_key="intensity"):
        "Accumulates the total intensity of the entries kept for a single target peak"
        acum = 0

        if type(peaks) == dict:
            for key in peaks.keys():
                acum += peaks[key][hash_key]

        # As comes from the SQL DB
        else:
            for peak in peaks:
                acum += peak["peak"]

        return acum
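
    # Example with the dict form (hypothetical values), summing two kept bins:
    #
    #   da.computePeakIntensity({433: {"peak": 433.91, "intensity": 10.0},
    #                            434: {"peak": 434.92, "intensity": 5.0}})
    #   # -> 15.0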

    def computePeaksRelativeIntensity(self, peaks_childs, hash_key="intensity"):
        "Returns each target peak's share of the total intensity across all target peaks"

        totalIntensity = 0
        relative_intensities = {}

        if type(peaks_childs) == dict:

            for peak in peaks_childs.keys():
                relative_intensities[peak] = self.computePeakIntensity(peaks_childs[peak], hash_key)
                totalIntensity += relative_intensities[peak]

            for peak in peaks_childs.keys():
                relative_intensities[peak] = relative_intensities[peak] / float(totalIntensity)

        # As comes from the SQL DB
        else:
            for peak in peaks_childs:
                relative_intensities[peak['peak']] = self.computePeakIntensity(peak["intensities"])
                totalIntensity += relative_intensities[peak['peak']]

            for peak in peaks_childs:
                relative_intensities[peak['peak']] = relative_intensities[peak['peak']] / float(totalIntensity)

        return relative_intensities
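
    # The relative intensities sum to 1 across the target peaks. Example
    # (hypothetical values): if peak 433.9 accumulates 30.0 and peak 521.4
    # accumulates 10.0, the result is {433.9: 0.75, 521.4: 0.25}.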

    def relativeAbundanceAverage(self, abundances):
        "Averages each peak's relative abundance over a list of replicate runs"

        abundanceAverages = {}
        peaks = abundances[0].keys()
        for peak in peaks:
            abundanceAverages[peak] = 0
            for i in range(len(abundances)):
                abundanceAverages[peak] += abundances[i][peak]

            abundanceAverages[peak] = abundanceAverages[peak] / float(len(abundances))

        return abundanceAverages

    def relativeAbundanceStdDeviation(self, abundances, average):
        "Sample standard deviation of each peak's relative abundance across replicate runs"

        abundanceStdDev = {}
        peaks = average.keys()
        for peak in peaks:
            abundanceStdDev[peak] = 0
            for i in range(len(abundances)):
                abundanceStdDev[peak] += pow(abundances[i][peak] - average[peak], 2)

            abundanceStdDev[peak] = math.sqrt(abundanceStdDev[peak] / float(len(abundances) - 1))

        return abundanceStdDev

    def relativeAbundanceSDM(self, deviations, sample_size):
        "Standard deviation of the mean: each peak's deviation divided by sqrt(sample_size)"

        abundanceSDM = {}
        for peak in deviations.keys():
            abundanceSDM[peak] = deviations[peak] / float(math.sqrt(sample_size))

        return abundanceSDM
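
    # Worked example for the three statistics (hypothetical replicate data):
    #
    #   abundances = [{433.9: 0.6, 521.4: 0.4}, {433.9: 0.5, 521.4: 0.5}]
    #   avg = da.relativeAbundanceAverage(abundances)
    #   # avg[433.9] == 0.55
    #   dev = da.relativeAbundanceStdDeviation(abundances, avg)
    #   # dev[433.9] == sqrt((0.05**2 + 0.05**2) / 1) ~= 0.0707
    #   sdm = da.relativeAbundanceSDM(dev, len(abundances))
    #   # sdm[433.9] ~= 0.0707 / sqrt(2) == 0.05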

    def printPeaksChilds(self, peaks_childs):
        "Prints each target peak followed by its kept (m/z, intensity) children"
        for peak in peaks_childs.keys():
            print peak
            for child in peaks_childs[peak].keys():
                print "\t", peaks_childs[peak][child]["peak"], peaks_childs[peak][child]["intensity"],
            # Terminate the children line before the next target peak.
            print


def main():
    "Main to test the library"

    # First argument: a file listing the target peaks, one per line.
    peakread = readsample(sys.argv[1], None, None, 0)
    peaks = peakread.readAll()

    peaks = [float(x[0]) for x in peaks]
    print peaks

    # Second argument: a sample file with peak/intensity columns.
    samples = readsample(sys.argv[2], ["peak", "intensity"])
    da = dataanalysis()

    peaks_childs = da.getPeaksFromFile(samples, peaks)

    da.printPeaksChilds(peaks_childs)

    print da.computePeaksRelativeIntensity(peaks_childs)

    # Stop here by default; the multi-file and sorting demos below are skipped.
    sys.exit(0)

    files_peak_childs = da.getPeaksFromFileSet(sys.argv[2:], peaks)

    for key in files_peak_childs.keys():
        print key
        da.printPeaksChilds(files_peak_childs[key])

    samples.close()

    # Sort by peak
    samples = readsample(sys.argv[2])
    peaks = da.getPeaks(samples, 60)
    for peak in peaks:
        print peak
    print
    print
    peaks = sorted(peaks, key=itemgetter("peak"), reverse=True)
    for peak in peaks:
        print peak

    samples.close()


if __name__ == '__main__':
    main()