-
Notifications
You must be signed in to change notification settings - Fork 1
/
mal2csv.py
412 lines (367 loc) · 24.3 KB
/
mal2csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
#mal2csv (malformed access log to CSV) - Written by Ryan Boyle [email protected]
#This script represents an effort to parse and properly align web logs for import into analysis tools.
#Caution! While this script tries to not truncate data to make it align; some data truncation may occur due to malformed data (rare but will happen when a return character is included in the last field).
#Preprocessing may not be required but is recommended to avoid parsing errors
#Combined Log Format will break down to the following columns
#RemoteIP,RemoteLogName,RemoteUser,EventTime,TimeZone,Request,StatusCode,Size,Referrer,UserAgent
import csv
import io
import os
import re
import time
import sys
import json
from optparse import OptionParser
from Web_Log_Deobfuscate import Deobfuscate_Web_Log
#config section
strInputFilePath = ""  # Leave blank to process the directory specified in strInputPath. Use to specify a specific log file to process
strInputPath = ""  # Path to folder containing log files to format. Separate from output path
strOutputPath = ""  # Folder path to output formatted logs. Make sure the folder path exists - script will not create the folder
#Input settings
inputEncoding = "utf-8"  # set to "" to use system default
# Regex to ensure each line starts with a valid value (default matches a dotted-quad IPv4
# address, i.e. Common Log Format). Set to "" to disable; modify for other formats.
# Raw string avoids the invalid-escape SyntaxWarning the old "\." literal produced.
strLineBeginingRE = r"^(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)"
quotecharacter = '\"'
csv_quotechar = '\"'
strdateFormat = "%d/%b/%Y:%H:%M:%S"  # apache datetime format "%d/%b/%Y:%H:%M:%S" / IIS format "%Y-%b-%d %H:%M:%S"
outputDateFormat = '%Y-%m-%d %H:%M:%S'
columnCount = 0  # set to zero to dynamically identify the number of columns from the header row (first row). Note not all web servers log header rows
boolPreprocess = False  # preprocessing may be required. See if you get "Error on Row: " message and if so set to True.
boolExpectDefaultFormat = True  # added to improve accuracy of Common/Combined Log Format. Set to False for IIS logs
#Output settings
outputEncoding = "utf-8"
boolSingleFile = True  # Create one output file or many
boolOutputInteresting = False  # This can be useful for finding potential suspicious anomalies
boolDeobfuscate = False  # Use Web_Log_Deobfuscate to decode fields and improve readability
boolOutputSuspicious = False  # If deobfuscating entries then output suspicious entries
boolphpids = False  # Run log entries against phpids rules
boolOutputIDS = False  # Output PHPIDS rule match information
boolOutputUnformatted = False  # This is only useful when debugging
boolIIS = False  # Use IIS settings (set boolExpectDefaultFormat = False and strdateFormat = "")
#end config section
boolSuspiciousLineFound = False  # tracks when the current line contains encoded/suspicious data
phpidSignatures = {}  # phpids signatures (lazy-loaded by phpIDS on first use)
customSignatures = {}  # custom IDS signatures for deobfuscated log entries (lazy-loaded by customIDS)
boolHead = False  # set True once an IIS "#Fields:" header row has been emitted
def build_cli_parser():
    """Construct the OptionParser describing this script's command line.

    Returns an optparse.OptionParser whose parsed options mirror the
    module-level config flags (input/output paths plus the boolean switches).
    """
    parser = OptionParser(usage="%prog [options]", description="Format malformed access logs to CSV")
    # (short flag, long flag, action, default, dest, help text) for every supported option
    option_specs = [
        ("-i", "--input", "store", None, "InputPath",
         "Path to folder containing logs to be formatted"),
        ("-o", "--output", "store", None, "OutputPath",
         "Formatted log output folder path"),
        ("-d", "--deobfuscate", "store_true", False, "boolDeobfuscate",
         "True or False value to deobfuscate log entries for output"),
        ("-l", "--loginteresting", "store_true", False, "boolOutputSuspicious",
         "True or False value if interesting deobfuscated entries should be logged"),
        ("-p", "--phpids", "store_true", False, "boolphpids",
         "True or False value if PHPIDS rule matching should be performed"),
        ("-r", "--logrules", "store_true", False, "boolOutputIDS",
         "True or False value if PHPIDS rule matches should be logged"),
        ("-f", "--formatlogging", "store_true", False, "boolOutputInteresting",
         "True or False value if suspicious formatting should be logged"),
        ("-m", "--MicrosoftIIS", "store_true", False, "boolIIS",
         "True or False value if Microsoft IIS logs"),
    ]
    for short_flag, long_flag, action, default, dest, help_text in option_specs:
        parser.add_option(short_flag, long_flag, action=action, default=default,
                          dest=dest, help=help_text)
    return parser
def phpIDS(strMatchCheck, idsFileHandle):
    """Check one log column against the PHPIDS rule set.

    The signatures are lazy-loaded from default_filter.json on first call and
    cached in the module-level phpidSignatures dict.  The lowercased input is
    tested against each rule's regex; on the first match the hit is optionally
    logged via logIDS (when boolOutputIDS is set) and True is returned.

    Returns True on a rule match, False otherwise (the original returned an
    implicit None on no-match, which callers then stored in boolean flags).
    """
    global phpidSignatures
    if phpidSignatures == {}:
        # lazy-load the signature file once and reuse it for every column
        with open('default_filter.json', encoding='utf-8') as json_file:
            phpidSignatures = json.load(json_file)
    lowered = strMatchCheck.lower()  # hoisted out of the loop; rules match lowercase
    for signature in phpidSignatures['filters']['filter']:
        if re.search(signature['rule'], lowered):
            if boolOutputIDS == True:
                outputIDS = signature['id'] + "|" + signature['description'] + "|" + strMatchCheck
                logIDS(idsFileHandle, outputIDS)
            return True
    return False  # explicit boolean so callers never receive None
def customIDS(strMatchCheck, idsFileHandle):
    """Check one (already deobfuscated) log column against the custom rule set.

    The signatures are lazy-loaded from custom_filter.json on first call and
    cached in the module-level customSignatures dict.  The lowercased input is
    tested against each rule's regex; on the first match the hit is optionally
    logged via logIDS (when boolOutputIDS is set) and True is returned.

    Returns True on a rule match, False otherwise (the original returned an
    implicit None on no-match, which callers then stored in boolean flags).
    """
    global customSignatures
    if customSignatures == {}:
        # lazy-load the signature file once and reuse it for every column
        with open('custom_filter.json', encoding='utf-8') as json_file:
            customSignatures = json.load(json_file)
    lowered = strMatchCheck.lower()  # hoisted out of the loop; rules match lowercase
    for signature in customSignatures['filters']['filter']:
        if re.search(signature['rule'], lowered):
            if boolOutputIDS == True:
                outputIDS = signature['id'] + "|" + signature['description'] + "|" + strMatchCheck
                logIDS(idsFileHandle, outputIDS)
            return True
    return False  # explicit boolean so callers never receive None
def logIDS(fP, logline):
    """Write a pipe-delimited IDS match record to fP as one quoted CSV line."""
    fields = logline.split("|")
    quoted_line = ",".join('"' + field + '"' for field in fields)
    fP.write(quoted_line + "\n")
def appendQuote(strRow):
    """Return strRow terminated with a double quote, adding one only if missing."""
    if strRow.endswith('"'):
        return strRow
    return strRow + '"'
def right(s, amount):
    """Return the last *amount* characters of *s*.

    Mirrors the classic BASIC Right$ helper.  Note the slice semantics:
    amount of 0 yields the whole string, and amount larger than len(s)
    yields all of s.
    """
    tail = s[-amount:]
    return tail
def deobfuscateEncoding(line):
    """Decode URL/unicode/hex obfuscation in a single log field.

    Time-zone fields (a leading "+" followed only by digits and the closing
    "]") are returned untouched.  When suspicious-output logging is enabled
    and decoding changed the value beyond plain space encoding, the module
    flag boolSuspiciousLineFound is raised.
    """
    global boolSuspiciousLineFound
    # a "+" prefix with only digits (and "]") left after stripping is a time zone
    if line[:1] == "+" and line.replace("+", "").replace("]", "").isnumeric():
        return line
    decoded = Deobfuscate_Web_Log.replaceChar(Deobfuscate_Web_Log.urldecode(line))
    decoded = Deobfuscate_Web_Log.urldecode(decoded)  # second pass catches double encoding such as %2520
    decoded = Deobfuscate_Web_Log.replaceUnicodeChar(decoded)
    for hex_prefix in ('0x', '0X'):
        decoded = Deobfuscate_Web_Log.HexDecode(decoded, hex_prefix)
    if boolOutputSuspicious == True and line != decoded:
        # differences that are only encoded spaces are not considered suspicious
        if line.replace("%2520", " ").replace("%20", " ") != decoded:
            boolSuspiciousLineFound = True
    return decoded
def CheckRemainingColumns(row_Check, intCurrentLoc, boolNumeric):
    """Scan columns from intCurrentLoc onward for the quote char followed by a qualifying column.

    Once a column containing the quote character is seen, return the index of
    the next column when boolNumeric is False, or the index of the next
    numeric column when boolNumeric is True (the "found" state latches in the
    numeric case: non-numeric columns in between are skipped).  Returns -1
    when no qualifying column exists.
    """
    foundQuote = False
    for position in range(intCurrentLoc, len(row_Check)):
        if foundQuote:
            if not boolNumeric:
                return position
            if str.isnumeric(row_Check[position]):
                return position
            # numeric mode: stay latched and keep scanning for a numeric column
        else:
            foundQuote = quotecharacter in row_Check[position]
    return -1
def fileProcess(strInputFpath, strFileName, strOutPath):
    """Parse one (possibly malformed) access log file and append CSV output.

    NOTE(review): the leading indentation of this file was lost upstream; the
    nesting below was reconstructed from the control flow and should be
    confirmed against the original repository before relying on edge cases.

    strInputFpath -- path of the log file to parse (replaced by a temp file
                     when boolPreprocess is set)
    strFileName   -- bare file name, used to name per-file output
    strOutPath    -- output folder/prefix the ".Formatted" (and optional
                     ".IDS"/".interesting"/".Unformatted") files are built from
    """
    global boolSuspiciousLineFound
    global boolHead
    global columnCount
    boolIDSdetection = False
    if boolPreprocess == True:
        # optional preprocessing pass: normalise quoting into a temp file first
        if not os.path.isfile(strInputFpath):
            return None
        elif not os.path.exists(strInputFpath):
            return None
        tmpFilePath = strOutPath +"_preprocessed.tmp"
        if os.path.isfile(tmpFilePath):
            os.remove(tmpFilePath )
        with open(strInputFpath, "rt", encoding=inputEncoding) as inputFile:
            for tmpLineIn in inputFile:
                tmpLineOut = tmpLineIn
                if right(tmpLineOut, 4) == '\\""\n':
                    # collapse an escaped trailing quote pair into a single closing quote
                    tmpLineOut = tmpLineOut[:-4] + '\"\n'
                if " " in tmpLineIn: #encounted with nginx logs
                    tmpLineOut = tmpLineIn.replace(" "," ")
                with io.open(tmpFilePath, "a", encoding=outputEncoding) as outputFile:
                    outputFile.write(tmpLineOut)
        # from here on, parse the preprocessed temp file instead of the original
        strInputFpath = tmpFilePath
        print("file created for parsing " + tmpFilePath)
    if os.path.isdir(strInputFpath):
        return None
    elif not os.path.exists(strInputFpath):
        return None
    if boolSingleFile == True:
        strOutPath = strOutPath + "LogOutput.Formatted"
    else:
        strOutPath = strOutPath + strFileName + ".Formatted"
    if boolphpids == True and boolOutputIDS == True:
        fP = io.open(strOutPath + ".IDS", "a", encoding=outputEncoding) #open file handle for logging IDS matches
    if boolphpids == True or boolOutputSuspicious == True or boolOutputInteresting == True:#open file handle for interesting log output
        fi = open(strOutPath + ".interesting","a", encoding=outputEncoding) #suspicious log entry output
    csv.field_size_limit(2147483647) #increase threshold to avoid length limitation errors
    with open(strInputFpath, "rt", encoding=inputEncoding) as csvfile:
        with io.open(strOutPath , "a", encoding=outputEncoding) as f:
            queuedRows = []
            # space-delimited parse; for IIS the quote char is \x07, effectively disabling quoting
            reader = csv.reader(csvfile, delimiter=' ', quotechar=csv_quotechar)
            for r_row in reader: #loop through each row of input
                queuedRows = [r_row]
                intCheckFirstUserInput = 0
                if strLineBeginingRE != "": #can we validate the row start with regex
                    intListCount = 0
                    boolMatch = re.match(strLineBeginingRE, r_row[0])
                    if not boolMatch: #ensure first item has a valid value
                        # drop leading junk columns until one matches the expected line-start pattern
                        for testColumns in r_row:
                            intListCount +=1
                            if re.match(strLineBeginingRE, testColumns):
                                rowSlice = slice(intListCount -1, len(r_row))
                                queuedRows = [r_row[rowSlice]]
                                break
                if '\n' in "".join(r_row) and columnCount > 0: #handle newline in row
                    if len(r_row) / columnCount >= 2:
                        # row looks like two or more rows fused by an embedded newline:
                        # keep only the columns up to the newline and truncate the rest
                        intListCount = 0
                        for testColumns in r_row:
                            intListCount +=1
                            if '\n' in testColumns:
                                queuedRows = [r_row[:intListCount]]
                                print ("Error on Row: " + "".join(r_row))
                                queuedRows[len(queuedRows)-1][intListCount-1] = testColumns[0:testColumns.find("\n")]
                                break
                for row in queuedRows:
                    if columnCount == 0 and boolIIS == False:
                        columnCount = len(row)#dynamic row length
                    if "\\" in "".join(row) and boolOutputInteresting == True:
                        boolSuspiciousLineFound = True #Trigger logging suspicious line
                    # per-row state for the column re-assembly state machine below
                    outputRow = ""
                    boolSkipColumn = False
                    lastColumnEscaped = False
                    intColumnCount = 0
                    intWriteCount = 0
                    skippedColumns = 0
                    boolDateCoverted = False
                    boolExcludeRow = False #IIS headers are dropped
                    boolRequestEnding = False # this was added to track the request column. Set to true once "HTTP/" is encountered. Example: HTTP/1.1"
                    for column in row:
                        intColumnCount += 1
                        boolQuoteRemoved = False
                        boolEscapeChar = False
                        if boolphpids == True and boolSuspiciousLineFound != True:
                            boolIDSdetection = phpIDS(column, fP)
                            boolSuspiciousLineFound = boolIDSdetection
                        saniColumn = column
                        if boolIIS == True and intColumnCount == 1 and "#Fields:" in saniColumn:
                            if boolHead == False:
                                # first IIS "#Fields:" row becomes the CSV header; later ones are dropped
                                saniColumn = ""
                                boolSkipColumn = True
                                if columnCount ==0: #if dynamic header identification
                                    columnCount = len(row) -2 #dynamic row length
                            else:
                                boolExcludeRow = True
                                break #skip header row
                        if boolIIS == True and intColumnCount == 1 and ("#Software:" in saniColumn or "#Version:" in saniColumn or "#Date:" in saniColumn):
                            boolExcludeRow = True
                            break #drop IIS header rows
                        if boolDeobfuscate == True: #perform decoding
                            saniColumn = deobfuscateEncoding(saniColumn)
                            saniColumn = str.replace(saniColumn, quotecharacter,"").replace("\n", "").replace("\rz", "") #remove format characters
                        if boolphpids == True and boolIDSdetection != True:
                            boolIDSdetection = customIDS(saniColumn.lower(),fP)
                            boolSuspiciousLineFound = boolIDSdetection
                        if 'HTTP/' in saniColumn:
                            boolRequestEnding = True
                        if '\"' in saniColumn:
                            saniColumn = str.replace(saniColumn, quotecharacter,"") # remove quote chars
                            boolQuoteRemoved = True
                        # NOTE(review): row[6]/row[7] below assume the row is long enough — confirm
                        if boolExpectDefaultFormat == True and intColumnCount == 6 and row[6].isnumeric() == True and row[7].isnumeric() == True: #if this is the request column and next two columns are numeric then
                            boolRequestEnding = True #Things line up formatting wise that we don't need to check for escape characters
                        elif boolExpectDefaultFormat == True and intColumnCount == 3 and "[" == row[4][:1]: #if the column after next is the datetime field then we need to merge the next field with this one
                            boolQuoteRemoved = True
                            boolEscapeChar = True
                        elif '\\' in saniColumn:
                            if (boolDateCoverted == True): # if we have made it past the user name and date field (only need escape character checks for fields with user provided input). Example: domainname\x5Cryan.boyle
                                if right(saniColumn,1) == "\\" and boolDeobfuscate == False:
                                    boolQuoteRemoved = True
                                saniColumn = str.replace(saniColumn, "\\","") #remove escape character
                                boolEscapeChar = True
                        elif boolExpectDefaultFormat == True and intColumnCount == 6 and ('GET' in saniColumn or 'POST' in saniColumn or 'PUT' in saniColumn or 'HEAD' in saniColumn or 'PUT' in saniColumn or 'DELETE' in saniColumn) and boolRequestEnding == False:
                            boolEscapeChar = True #specific way to identify the request column and combine
                        elif boolExpectDefaultFormat == True and intColumnCount > 6 and boolRequestEnding == False:
                            boolEscapeChar = True #specific way to identify the request column and combine
                        elif boolExpectDefaultFormat == True and intColumnCount == columnCount and len(row) - intColumnCount - skippedColumns != columnCount - intColumnCount:
                            boolEscapeChar = True #This will cause the script to add up all final columns into the last one
                        if boolDateCoverted == False and saniColumn[0:1] == "[":# format date time
                            boolDateCoverted = True
                            logDateTime = time.strptime( saniColumn[1:], strdateFormat)
                            saniColumn = time.strftime(outputDateFormat, logDateTime)
                        if boolEscapeChar == True and len(row) > columnCount and boolSkipColumn == False and boolQuoteRemoved == True: #escaped character and column mismatch
                            if len(row) - intColumnCount != 0:
                                outputRow = outputRow + ',"' + saniColumn #add new column
                            else:
                                outputRow = outputRow + " " + saniColumn #continue column and add separator char back
                                intWriteCount += 1
                            if len(row) - intColumnCount - skippedColumns != columnCount - intColumnCount: #more columns than what is expected so combine next column
                                boolSkipColumn = True
                        elif boolSkipColumn == True and (len(row) - intColumnCount - skippedColumns != columnCount - intColumnCount): #still more columns than what is expected so combine next column
                            skippedColumns +=1
                            if boolHead == False and boolIIS == True and intColumnCount == 1: #IIS header row manipulation
                                outputRow = "\"" #excluding #Fields: and replacing with a qoute to start our next field
                                continue
                            if intCheckFirstUserInput == 0:
                                intCheckFirstUserInput = CheckRemainingColumns(row, intColumnCount, True) # row, currentColumn, boolCheckNumeric
                            if intCheckFirstUserInput >= intColumnCount:#check for special chars followed by number (In apache logs this is the first non system/user provided column that is followed by a status code)
                                outputRow = outputRow + " " + saniColumn #continue column and add separator char back
                            elif boolQuoteRemoved == True and CheckRemainingColumns(row, intColumnCount, False) <= intColumnCount and not (intColumnCount - skippedColumns == columnCount and len(row) - intColumnCount != 0):#check for special chars ensuring we don't close the column if there is still items to add
                                outputRow = outputRow + " " + saniColumn + '"' #Finish and close column
                                boolSkipColumn = False
                                intWriteCount += 1
                            elif boolHead == False and boolIIS == True and intColumnCount == 2: #IIS header row manipulation continuation
                                outputRow = outputRow + saniColumn #this is actually our first output value entry as we skipped #Fields:
                                boolHead = True
                            else:
                                outputRow = outputRow + " " + saniColumn #continue column and add separator char back
                        elif boolSkipColumn == True and len(row) - intColumnCount - skippedColumns == columnCount - intColumnCount: #New columns are just right
                            skippedColumns +=1
                            outputRow = appendQuote(outputRow) + ',"' + saniColumn + '"' #Close column and add new column
                            boolSkipColumn = False
                            intWriteCount += 1
                        elif outputRow == "": # first column in new row
                            outputRow = '"' + saniColumn + '"'
                            intWriteCount += 1
                        else: # add new column
                            if boolEscapeChar == True:
                                lastColumnEscaped = True
                                outputRow = appendQuote(outputRow) + ',"' + saniColumn #start new column
                                boolSkipColumn = True
                            elif intColumnCount - skippedColumns >= columnCount and len(row) == intColumnCount and columnCount != len(row): #we've got too many columns. Mash last one together
                                outputRow = appendQuote(outputRow) + ',"' + saniColumn + '"' #Close column and add final column
                            elif intWriteCount + 1 == columnCount and len(row) > intColumnCount:
                                outputRow = appendQuote(outputRow) + ',"' + saniColumn #start new column
                            else:
                                outputRow = outputRow + ',"' + saniColumn + '"'#start and close new column
                                intWriteCount += 1
                    if len(row) < columnCount:
                        # pad short rows so every output line has columnCount columns
                        for x in range(0,columnCount - len(row)):
                            outputRow = appendQuote(outputRow) + ',\"ParseError\"'
                    if right(outputRow, 1) != '\"':
                        #outputRow = outputRow + '\"'
                        if boolOutputUnformatted == True:
                            with io.open(strOutPath + ".Unformatted", "a", encoding=outputEncoding) as fU:#Unformatted output that eluded a final quote
                                fU.write(outputRow + "\n")
                        outputRow = appendQuote(outputRow)
                    if boolExcludeRow == False:
                        f.write(outputRow + "\n")
                        if boolSuspiciousLineFound == True:
                            # reset the per-line flags and echo the row to the .interesting file
                            boolSuspiciousLineFound = False
                            boolIDSdetection = False
                            fi.write(outputRow + "\n")
    # NOTE(review): this checks the module-level strInputFilePath with a ".tmp"
    # suffix, while preprocessing writes strOutPath + "_preprocessed.tmp" — the
    # temp-file cleanup likely never fires; confirm intent before changing.
    if os.path.isfile(strInputFilePath +".tmp"):
        os.remove(strInputFilePath +".tmp")
    if boolphpids == True and boolOutputIDS == True:
        fP.close() #close file handle for IDS log output
    if boolphpids == True or boolOutputSuspicious == True or boolOutputInteresting == True:#close file handle for interesting log output
        fi.close() #close file handle for interesting log output
# --- command-line entry point ---------------------------------------------
# Parse CLI options, override the module-level config flags, then process a
# single file or every file (one level of subfolders deep) in the input folder.
parser = build_cli_parser()
opts, args = parser.parse_args(sys.argv[1:])
if opts.InputPath:
    strInputPath = opts.InputPath
    print (strInputPath)
if opts.OutputPath:
    strOutputPath = opts.OutputPath
    print (strOutputPath)
if (not strInputPath and not strInputFilePath) or not strOutputPath:
    # an input (folder or file) and an output folder are both required
    print ("Missing required parameter")
    sys.exit(-1)
# boolean CLI switches override their corresponding config defaults
if opts.boolDeobfuscate:
    boolDeobfuscate = opts.boolDeobfuscate
if opts.boolOutputSuspicious:
    boolOutputSuspicious = opts.boolOutputSuspicious
if opts.boolphpids:
    boolphpids = opts.boolphpids
if opts.boolOutputInteresting:
    boolOutputInteresting = opts.boolOutputInteresting
if opts.boolOutputIDS:
    boolOutputIDS = opts.boolOutputIDS
if opts.boolIIS:
    boolIIS = opts.boolIIS
if boolIIS == True:
    # IIS mode: no Common Log Format assumptions, no date reformat, no
    # line-start validation, and quoting effectively disabled for the reader
    boolExpectDefaultFormat = False
    strdateFormat = ""
    strLineBeginingRE = ""
    csv_quotechar = '\x07' #https://stackoverflow.com/questions/494054/how-can-i-disable-quoting-in-the-python-2-4-csv-reader
if strInputFilePath == "":
    if os.path.isfile(strInputPath):#check if a file path was provided instead of a folder
        strInputFilePath = strInputPath #use file instead of folder
        strInputPath = ""
if os.path.isdir(strInputPath):
    # walk the input folder, descending exactly one level into subfolders
    for file in os.listdir(strInputPath):
        if os.path.isdir(os.path.join(strInputPath, file)):
            for subfile in os.listdir(os.path.join(strInputPath, file)):
                print(os.path.join(os.path.join(strInputPath, file), subfile))
                fileProcess(os.path.join(os.path.join(strInputPath, file), subfile), subfile, strOutputPath)
        else:
            fileProcess(os.path.join(strInputPath, file), file, strOutputPath)
else:
    # single-file mode
    fileName = os.path.basename(strInputFilePath)
    fileProcess(strInputFilePath, fileName, strOutputPath)
print("Completed!")