####################################################
# Researcher: Sara Schwarz
# Advisor: Dr. Jose Ortiz
# VERSION #2
# Program Objective:
#
# In this work we seek to develop algorithms (brute-force approach, TRW, etc.) to detect network and port
# scanners in large-scale network traffic, such as that of Internet Service Providers,
# Big Data research centers, and Science DMZ networks deployed at research institutions, using network
# flows. This particular program follows a brute-force approach: it reads the network flows
# and records, for each source IP, the destination IPs and the distinct destination ports it connected to.
# The number of connected ports is then compared against a threshold to classify each source IP as
# either a suspicious scanner or not.
# To run this algorithm we use high-performance computing methods such as map and reduce,
# specifically Python's multiprocessing Pool class.
###############################################################


from silk import *
import multiprocessing as mp


# input  => list of files, where each file holds a list of network flows
# output => three-way hash table of the form {sip: {dip: {dport: # of flows seen to that dport}}}
def verify_type(filenames):
    # print("Files", len(filenames))

    count_flow = 0
    dportHash = {}  # will hold the count of each dport per dip per sip
    # iterate through each file that holds a list of network flows
    for f in filenames:

        # iterate through each network flow, reading it with silkfile_open, PySiLK's flow-repository reader
        for rec in silkfile_open(f, READ):
            count_flow += 1
            sip = str(rec.sip)
            dip = str(rec.dip)
            dport = rec.dport
            # for our current research, we limit ourselves to verifying IPv4 network flows only
            if ':' in sip:
                continue
            # using the nested hash table structure, keep a record of all the times each
            # destination port of each destination IP was contacted by the same source IP
            else:
                if sip in dportHash:
                    if dip in dportHash[sip]:
                        if dport in dportHash[sip][dip]:
                            dportHash[sip][dip][dport] += 1
                        else:
                            dportHash[sip][dip][dport] = 1
                    else:
                        dportHash[sip][dip] = {dport: 1}
                else:
                    dportHash[sip] = {dip: {dport: 1}}

    # print("flows", count_flow)
    return dportHash

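# A minimal sketch of the structure verify_type builds, on hypothetical flows:
# two flows 10.0.0.1 -> 10.0.0.2 on port 22 and one flow 10.0.0.1 -> 10.0.0.3
# on port 80 would yield {'10.0.0.1': {'10.0.0.2': {22: 2}, '10.0.0.3': {80: 1}}}
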
# input  => list of hash tables of the form {sip: {dip: {dport: # of flows seen to that dport}}}
# output => single hash table of the same form with all of the inputs merged
# method => iterates through each hash and folds it into the merged table
def join_hash(hash_list):
    complete_hash = {}  # will hold the merge of all the hash tables once formed

    for i in hash_list:  # iterate through each hash in the list
        # iterate through each source IP key (sip) whose value is a hash of the form {dip: {dport: #}}
        for sip, sip_hash in i.items():
            # check whether that source IP is already in the merged hash
            if sip in complete_hash:
                # iterate through each destination IP key (dip) whose value is a hash of the form {dport: #}
                for dip, dports in i[sip].items():
                    # check whether that destination IP is already under this sip in the merged hash
                    if dip in complete_hash[sip]:
                        # iterate through each destination port key with the total # of times it was contacted as the value
                        for number, value in dports.items():
                            # check whether that destination port is already under this dip in the merged hash
                            if number in complete_hash[sip][dip]:
                                # add the count to the dport's running total in the merged hash
                                complete_hash[sip][dip][number] += value
                            else:
                                # add the destination port with its count to the merged hash
                                complete_hash[sip][dip][number] = value
                    else:
                        # add the destination IP with its hash of dports to the merged hash
                        complete_hash[sip][dip] = dports
            else:
                # add the source IP with its hash of destination IPs and dports to the merged hash
                complete_hash[sip] = sip_hash
    return complete_hash
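
# A small sketch of the merge on hypothetical inputs:
#   join_hash([{'10.0.0.1': {'10.0.0.2': {22: 1}}},
#              {'10.0.0.1': {'10.0.0.2': {22: 2, 80: 1}}}])
# would return {'10.0.0.1': {'10.0.0.2': {22: 3, 80: 1}}}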


def main():
    # for accessing the network flows during a specific interval of time
    startDate = "2018/08/01"
    endDate = "2018/10/15"
    otherHash = {}  # hash table of suspicious source IPs with their destination IPs
    counter = 0
    threshold = 100
    # object for retrieving files from the SiLK data store; specifies the interval of
    # time, the SiLK configuration file location, and the SiLK data root directory
    files = FGlob(classname="all", type="all", start_date=startDate, end_date=endDate, site_config_file="/etc/silk/conf-v9/silk.conf", data_rootdir="/home/scratch/flow/rwflowpack/")
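    # note: FGlob yields the paths of the repository data files matching the query;
    # the paths above (site config file, data root) are site-specific and will
    # differ on other SiLK installations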
    # ****************************************
    # Using Python's Pool class for multiprocessing
    # ****************************************
    process_num = 2
    pool = mp.Pool(processes=process_num)
    # change files from an FGlob object into a list of SiLK data file paths
    files1 = [x for x in files]
    files_list = []
    # send each process a roughly equal number of files
    blocksize = len(files1) // process_num
    for x in range(process_num):
        files_list.append(files1[0:blocksize])
        files1 = files1[blocksize:]
    # in case the number of files is not divisible by the number of processes,
    # hand out the leftover files one per process
    for idx, leftover in enumerate(files1):
        files_list[idx].append(leftover)
    # Pool.map runs verify_type in each worker process and returns one hash per process
    fileHash = pool.map(verify_type, files_list)
    # merge the per-process hashes into a single hash
    flowHash = join_hash(fileHash)
    # print(len(flowHash))
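    # For example (hypothetical numbers): with 7 files and process_num = 2,
    # blocksize is 3, each worker first gets 3 files, and the single leftover
    # file goes to worker 0, giving final block sizes of [4, 3]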
    # ****************************************

    # ****************************************
    # Verifying the source IPs
    # ****************************************
    # iterate through the hash of sips, dips, and dports, checking whether the number
    # of distinct dports per destination IP contacted by the same source IP meets or
    # exceeds our threshold; if it does, add the source IP to the hash of suspicious sips
    for sips in flowHash:
        for dips, dports in flowHash[sips].items():
            if len(dports) >= threshold:
                if sips in otherHash:
                    otherHash[sips][dips] = dports
                else:
                    otherHash[sips] = {dips: dports}

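    # e.g., with threshold = 100, a source IP that contacted 150 distinct
    # destination ports on a single destination IP is recorded in otherHash,
    # while one that touched only 99 is not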
    # count and print the total number of suspicious source IPs
    for sips in otherHash:
        counter += 1
    print(counter)


if __name__ == "__main__":
    main()


# (Unused) earlier filtering approach, kept commented out for reference:
# def Filter(fd, sip, dip, sport, dport, dportHash):
#
#     if dport > 1024 and (sport <= 1024 or (sport >= 8000 and sport < 9000)):
#         return
#     if sip in dportHash:
#         # if dip in dportHash[sip]["dips"]:
#         #     dportHash[sip]["dips"][dip] += 1
#         # else:
#         #     dportHash[sip]["dips"][dip] = 1
#         if dport in dportHash[sip]["dports"]:
#             dportHash[sip]["dports"][dport] += 1
#             # return
#         else:
#             dportHash[sip]["dports"][dport] = 1
#     else:
#         dportHash[sip] = {"dports": {}}
#         dportHash[sip]["dips"] = {}
#     # fd.write("%s:%s:%s:%s\n" % (sip, dip, sport, dport))