"Peak extraction and relative-abundance statistics for mass-spec sample files"

import math
import sys
from operator import itemgetter

from readsample import *


class dataanalysis:

    def getPeaks(self, samples, n_peaks, hash_key="intensity"):
        "Gets the top n_peaks mass-spec values in the file"

        all_values = []
        sample = samples.next()
        while sample:
            all_values.append(sample)
            sample = samples.next()

        # Sort by the requested key (intensity by default), largest first.
        all_values = sorted(all_values, key=itemgetter(hash_key), reverse=True)

        return all_values[:n_peaks]
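
    # Example (hypothetical usage; assumes readsample yields dicts with "peak"
    # and "intensity" keys and returns a falsy value once the file is exhausted,
    # as the loop above relies on):
    #
    #   samples = readsample("run01.txt", ["peak", "intensity"])
    #   top10 = dataanalysis().getPeaks(samples, 10)
    #   for s in top10:
    #       print s
    #   samples.close()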

    def getPeaksFromFile(self, samples, peaks, peak_range=4):
        "Returns the candidate peaks found in a sample file around each target peak"

        # One sub-dict per target peak, with one placeholder slot per integer
        # m/z bin in the window [peak - peak_range, peak + peak_range].
        peaks_hash = {}
        for key in peaks:
            peaks_hash[key] = {}

        for peak in peaks_hash.keys():
            for val in range(int(peak) - peak_range, int(peak) + peak_range + 1):
                peaks_hash[peak][val] = {"peak": val, "intensity": 0}

        sample = samples.next()
        while sample:
            for peak in peaks:
                # Extract the samples in range of each peak and keep only the
                # most intense sample seen in each integer m/z bin.
                if (int(peak) - peak_range) <= sample["peak"] < (int(peak) + peak_range + 1):
                    key = int(sample["peak"])
                    if peaks_hash[peak][key]["intensity"] < sample["intensity"]:
                        peaks_hash[peak][key] = sample

            sample = samples.next()

        # Final filtering: remove the false peaks.
        self.FilterPeaks(peaks_hash)

        return peaks_hash
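
    # The returned structure maps each target peak to a dict of integer m/z
    # bins, each holding the strongest sample kept for that bin after
    # FilterPeaks has pruned the window, e.g. (hypothetical values):
    #
    #   {433.9: {433: {"peak": 433.91, "intensity": 1204.5},
    #            434: {"peak": 434.92, "intensity": 645.1}}}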

    def FilterPeaks(self, peaks_hash):
        "Removes false peaks, keeping a decreasing envelope spaced ~1 m/z around the apex"

        for peak in peaks_hash.keys():
            # Find the most intense entry (the apex) for this target peak.
            max_peak = self.getMaxPeak(peaks_hash[peak])
            keys = peaks_hash[peak].keys()
            keys.sort()

            # Get the list position of the apex.
            position = keys.index(int(max_peak["peak"]))

            # Walk down from the apex: drop a bin if its intensity does not
            # keep decreasing or if its m/z drifts more than 0.12 from the
            # expected -1 spacing; once one bin is dropped, all lower bins are.
            current_intensity = max_peak["intensity"]
            max_dec = float(max_peak["peak"]) - 1
            discard = 0
            for i in range(position - 1, -1, -1):
                current_dec = float(peaks_hash[peak][keys[i]]["peak"])
                if (discard or current_dec < max_dec - .12 or current_dec > max_dec + .12
                        or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity):
                    del peaks_hash[peak][keys[i]]
                    discard = 1
                else:
                    current_intensity = peaks_hash[peak][keys[i]]["intensity"]
                    max_dec -= 1

            # Walk up from the apex with the same rule, using a +1 spacing.
            current_intensity = max_peak["intensity"]
            max_dec = float(max_peak["peak"]) + 1
            discard = 0
            for i in range(position + 1, len(keys)):
                current_dec = float(peaks_hash[peak][keys[i]]["peak"])
                if (discard or current_dec > max_dec + .12 or current_dec < max_dec - .12
                        or peaks_hash[peak][keys[i]]["intensity"] >= current_intensity):
                    del peaks_hash[peak][keys[i]]
                    discard = 1
                else:
                    current_intensity = peaks_hash[peak][keys[i]]["intensity"]
                    max_dec += 1
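
    # Example of the filtering rule (hypothetical numbers): with an apex at
    # m/z 433.91 (intensity 1000), a bin at 434.92 / 600 is kept (within 0.12
    # of the expected 434.91 and weaker than the apex), while a bin at
    # 435.40 / 300 and every bin above it is dropped, because 435.40 is more
    # than 0.12 away from the next expected position, 435.91.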

    def getMaxPeak(self, peaks):
        "Returns the entry with the highest intensity among the given bins"
        max_peak = {"peak": 0, "intensity": -1}
        for peak in peaks.keys():
            if max_peak["intensity"] < peaks[peak]["intensity"]:
                max_peak = peaks[peak]
        return max_peak

    def getPeaksFromFileSet(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4):
        "Returns, for each file in file_set, the peaks found around each target peak"

        peak_set = {}

        for filename in file_set:
            samples = readsample(filename, sample_keys)
            # Key the result by the filename with whitespace removed.
            peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range)
            samples.close()
        return peak_set
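
    # Example (hypothetical file names; assumes each file can be parsed by
    # readsample with "peak" and "intensity" columns):
    #
    #   da = dataanalysis()
    #   results = da.getPeaksFromFileSet(["run01.txt", "run02.txt"], [433.9, 521.4])
    #   for name in results.keys():
    #       da.printPeaksChilds(results[name])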

    def getPeaksFromFileList(self, file_set, peaks, sample_keys=["peak", "intensity"], peak_range=4):
        "Returns, for each file in the list file_set, the peaks found around each target peak"

        peak_set = {}

        for filename in file_set:
            samples = readsample(filename, sample_keys)
            peak_set[''.join(filename.split())] = self.getPeaksFromFile(samples, peaks, peak_range)
            samples.close()
        return peak_set

    def computePeakIntensity(self, peaks, hash_key="intensity"):
        "Accumulates the total intensity of the entries kept for a single target peak"
        acum = 0

        if type(peaks) == dict:
            for key in peaks.keys():
                acum += peaks[key][hash_key]

        # As comes from the SQL DB
        else:
            for peak in peaks:
                acum += peak["peak"]

        return acum
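
    # Example with the dict form (hypothetical values), summing two kept bins:
    #
    #   da.computePeakIntensity({433: {"peak": 433.91, "intensity": 10.0},
    #                            434: {"peak": 434.92, "intensity": 5.0}})
    #   # -> 15.0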

    def computePeaksRelativeIntensity(self, peaks_childs, hash_key="intensity"):
        "Returns each target peak's share of the total intensity across all target peaks"

        totalIntensity = 0
        relative_intensities = {}

        if type(peaks_childs) == dict:

            for peak in peaks_childs.keys():
                relative_intensities[peak] = self.computePeakIntensity(peaks_childs[peak], hash_key)
                totalIntensity += relative_intensities[peak]

            for peak in peaks_childs.keys():
                relative_intensities[peak] = relative_intensities[peak] / float(totalIntensity)

        # As comes from the SQL DB
        else:
            for peak in peaks_childs:
                relative_intensities[peak['peak']] = self.computePeakIntensity(peak["intensities"])
                totalIntensity += relative_intensities[peak['peak']]

            for peak in peaks_childs:
                relative_intensities[peak['peak']] = relative_intensities[peak['peak']] / float(totalIntensity)

        return relative_intensities
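
    # The relative intensities sum to 1 across the target peaks. Example
    # (hypothetical values): if peak 433.9 accumulates 30.0 and peak 521.4
    # accumulates 10.0, the result is {433.9: 0.75, 521.4: 0.25}.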

    def relativeAbundanceAverage(self, abundances):
        "Averages each peak's relative abundance over a list of replicate runs"

        abundanceAverages = {}
        peaks = abundances[0].keys()
        for peak in peaks:
            abundanceAverages[peak] = 0
            for i in range(len(abundances)):
                abundanceAverages[peak] += abundances[i][peak]

            abundanceAverages[peak] = abundanceAverages[peak] / float(len(abundances))

        return abundanceAverages

    def relativeAbundanceStdDeviation(self, abundances, average):
        "Sample standard deviation of each peak's relative abundance across replicate runs"

        abundanceStdDev = {}
        peaks = average.keys()
        for peak in peaks:
            abundanceStdDev[peak] = 0
            for i in range(len(abundances)):
                abundanceStdDev[peak] += pow(abundances[i][peak] - average[peak], 2)

            abundanceStdDev[peak] = math.sqrt(abundanceStdDev[peak] / float(len(abundances) - 1))

        return abundanceStdDev

    def relativeAbundanceSDM(self, deviations, sample_size):
        "Standard deviation of the mean: each peak's deviation divided by sqrt(sample_size)"

        abundanceSDM = {}
        for peak in deviations.keys():
            abundanceSDM[peak] = deviations[peak] / float(math.sqrt(sample_size))

        return abundanceSDM
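
    # Worked example for the three statistics (hypothetical replicate data):
    #
    #   abundances = [{433.9: 0.6, 521.4: 0.4}, {433.9: 0.5, 521.4: 0.5}]
    #   avg = da.relativeAbundanceAverage(abundances)
    #   # avg[433.9] == 0.55
    #   dev = da.relativeAbundanceStdDeviation(abundances, avg)
    #   # dev[433.9] == sqrt((0.05**2 + 0.05**2) / 1) ~= 0.0707
    #   sdm = da.relativeAbundanceSDM(dev, len(abundances))
    #   # sdm[433.9] ~= 0.0707 / sqrt(2) == 0.05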

    def printPeaksChilds(self, peaks_childs):
        "Prints each target peak followed by its kept (m/z, intensity) children"
        for peak in peaks_childs.keys():
            print peak
            for child in peaks_childs[peak].keys():
                print "\t", peaks_childs[peak][child]["peak"], peaks_childs[peak][child]["intensity"],
            # Terminate the children line before the next target peak.
            print


def main():
    "Main to test the library"

    # First argument: a file listing the target peaks, one per line.
    peakread = readsample(sys.argv[1], None, None, 0)
    peaks = peakread.readAll()

    peaks = [float(x[0]) for x in peaks]
    print peaks

    # Second argument: a sample file with peak/intensity columns.
    samples = readsample(sys.argv[2], ["peak", "intensity"])
    da = dataanalysis()

    peaks_childs = da.getPeaksFromFile(samples, peaks)

    da.printPeaksChilds(peaks_childs)

    print da.computePeaksRelativeIntensity(peaks_childs)

    # Stop here by default; the multi-file and sorting demos below are skipped.
    sys.exit(0)

    files_peak_childs = da.getPeaksFromFileSet(sys.argv[2:], peaks)

    for key in files_peak_childs.keys():
        print key
        da.printPeaksChilds(files_peak_childs[key])

    samples.close()

    # Sort by peak
    samples = readsample(sys.argv[2])
    peaks = da.getPeaks(samples, 60)
    for peak in peaks:
        print peak
    print
    print
    peaks = sorted(peaks, key=itemgetter("peak"), reverse=True)
    for peak in peaks:
        print peak

    samples.close()


if __name__ == '__main__':
    main()