from readsample import * from operator import itemgetter class dataanalysis: def getPeaks(self, samples, n_peaks, hash_key = "intensity"): "Gets the top n_peaks massspec values in the file" from operator import itemgetter all_value = [] sample = samples.next() while sample: all_value.append(sample) sample = samples.next() all_value = sorted(all_value, key=itemgetter(hash_key), reverse=True) return all_value[:n_peaks] def getPeaksFromFile(self, samples, peaks, peak_range=4): "Returns the n_peaks in a smaple file with key ms_key" #peaks_hash = {key: {} for key in peaks} peaks_hash = {} for key in peaks: peaks_hash[key] = {} for peak in peaks_hash.keys(): for val in range(int(peak) - peak_range, int(peak) + peak_range + 1): peaks_hash[peak][val] = {"peak": val, "intensity": 0} #print peaks_hash sample = samples.next() while sample: for peak in peaks: # Extrack the samples in range of each peak if sample["peak"] >= (int(peak) - peak_range) and sample["peak"] <= (int(peak) + peak_range+1): #peaks_hash[peak][0].append(sample) # Keep track of the max_intensity key = int(sample["peak"]) if peaks_hash[peak][key]["intensity"] < sample["intensity"]: peaks_hash[peak][key] = sample sample = samples.next() # Final filtering. Removing the false peaks. self.FilterPeaks(peaks_hash) return peaks_hash def FilterPeaks(self, peaks_hash): # Final filtering. Removing the false peaks. for peak in peaks_hash.keys(): # Find max peak max_peak = self.getMaxPeak(peaks_hash[peak]) keys = peaks_hash[peak].keys() #print peaks_hash[peak] keys.sort() # Get list position of the max peak position = keys.index(int(max_peak["peak"])) # from the max down remove if intensity is greater than the next peak or # if decimal is not decimal of the peak current_intensity = max_peak["intensity"] max_dec = float(max_peak["peak"]) -1 #int(str(max_peak["peak"]).split(".")[1][0]) discard = 0 # After one point is discarded all others are. for i in range(position-1, -1, -1): current_dec = float(peaks_hash[peak][keys[i]]["peak"])#int(str(peaks_hash[peak][keys[i]]["peak"]).split(".")[1][0]) #print current_dec if discard or current_dec < max_dec - .12 or current_dec > max_dec + .12 or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity: del peaks_hash[peak][keys[i]] discard = 1 else: current_intensity = peaks_hash[peak][keys[i]]["intensity"] max_dec -= 1 # From the max up remove if intensity is greater than the next peak or # if decimal is not decimal of the peak current_intensity = max_peak["intensity"] max_dec = float(max_peak["peak"]) + 1 discard = 0 # After one point is discarded all others are. for i in range(position+1, len(keys)): current_dec = float(peaks_hash[peak][keys[i]]["peak"]) #int(str(peaks_hash[peak][keys[i]]["peak"]).split(".")[1][0]) if discard or current_dec > max_dec + .12 or current_dec < max_dec - .12 or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity: del peaks_hash[peak][keys[i]] discard = 1 else: current_intensity = peaks_hash[peak][keys[i]]["intensity"] max_dec += 1 def getMaxPeak(self, peaks): max_peak = {"peak": 0, "intensity": -1} for peak in peaks.keys(): if max_peak["intensity"] < peaks[peak]["intensity"]: max_peak = peaks[peak] return max_peak def getPeaksFromFileSet(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4): "Returns a set of n_peaks in a set of sample files with keys found in peaks_file" peak_set = {} for filename in file_set: samples = readsample(filename, sample_keys) peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range) samples.close() return peak_set def getPeaksFromFileList(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4): "Returns a set of n_peaks in a set of sample files with keys found in list peaks" peak_set = {} for file in file_set: samples = readsample(filename, sample_keys) peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range) samples.close() return peak_set def computePeakIntensity(self, peaks, hash_key="intensity"): acum = 0 if type(peaks) == dict: for key in peaks.keys(): acum += peaks[key][hash_key] # As comes from SQL DB else: for peak in peaks: acum += peak["peak"] return acum def computePeaksRelativeIntensity(self, peaks_childs, hash_key="intensity"): totalIntentisty = 0 relative_intensities = {} if type(peaks_childs) == dict: for peak in peaks_childs.keys(): relative_intensities[peak] = self.computePeakIntensity(peaks_childs[peak], hash_key) totalIntentisty += relative_intensities[peak] for peak in peaks_childs.keys(): relative_intensities[peak] = relative_intensities[peak] / float(totalIntentisty) # As comes from SQL DB else: for peak in peaks_childs: relative_intensities[peak['peak']] = self.computePeakIntensity(peak["intensities"]) totalIntentisty += relative_intensities[peak['peak']] for peak in peaks_childs: relative_intensities[peak['peak']] = relative_intensities[peak['peak']] / float(totalIntentisty) return relative_intensities def relativeAbundanceAverage(self, abundances): abundanceAverages = {} peaks = abundances[0].keys() for peak in peaks: abundanceAverages[peak] = 0 for i in range(len(abundances)): abundanceAverages[peak] += abundances[i][peak] abundanceAverages[peak] = abundanceAverages[peak]/len(abundances) return abundanceAverages def relativeAbundanceStdDeviation(self, abundances, average): import math abundanceStdDev = {} peaks = average.keys() for peak in peaks: abundanceStdDev[peak] = 0 for i in range(len(abundances)): abundanceStdDev[peak] += pow(abundances[i][peak] - average[peak], 2) abundanceStdDev[peak] = math.sqrt(1/float((len(abundances) - 1)) * abundanceStdDev[peak]) return abundanceStdDev def relativeAbundanceSDM(self, deviations, sample_size): import math abundanceSDM = {} for peak in deviations.keys(): abundanceSDM[peak] = deviations[peak]/float(math.sqrt(sample_size)) return abundanceSDM def printPeaksChilds(self, peaks_childs): for peak in peaks_childs.keys(): print peak for child in peaks_childs[peak].keys(): print "\t", peaks_childs[peak][child]["peak"], peaks_childs[peak][child]["intensity"], def main(): "Main to test the library" peakread = readsample(sys.argv[1], None, None, 0) peaks = peakread.readAll() peaks = [float(x[0]) for x in peaks] print peaks samples = readsample(sys.argv[2], ["peak", "intensity"]) da = dataanalysis() peaks_childs = da.getPeaksFromFile(samples, peaks) da.printPeaksChilds(peaks_childs) print da.computePeaksRelativeIntensity(peaks_childs) sys.exit(0) files_peak_childs = da.getPeaksFromFileSet(sys.argv[2:], peaks) for key in files_peak_childs.keys(): print key da.printPeaksChilds(files_peak_childs[key]) samples.close() ### Sort by peak samples = readsample(sys.argv[2]) peaks = da.getPeaks(samples, 60) for peak in peaks: print peak print print peaks = sorted(peaks, key=itemgetter("peak"), reverse=True) for peak in peaks: print peak samples.close() if __name__ == '__main__': main()