This repository has been archived by the owner on Oct 30, 2018. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
logreader.py
232 lines (199 loc) · 8.05 KB
/
logreader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
#!/usr/bin/env python
from pymongo import MongoClient
from fabric.api import *
from fabric.contrib import *
import os
import gzip
import re
import json
import time
import datetime
class LogParser(object):
    '''
    Pulls the rotated API access logs from the remote BukGet servers,
    parses them into a single aggregate stats document in MongoDB, and
    recomputes per-plugin popularity trends from the stored daily stats.
    '''
    # Remote API hosts whose rotated access logs are pulled each run.
    servers = [
        'ny.vpn.bukget.org',
        'ca.vpn.bukget.org',
        'de.vpn.bukget.org',
        'fr.vpn.bukget.org'
    ]
    # User-Agent substrings excluded from the user-agent stats
    # (browsers, HTTP libraries, crawlers, etc.).
    ignores = ['java', 'php', 'mozilla', 'chrome', 'opera', 'wget',
               'curl', 'urllib', 'bot', 'spider', 'apache', 'ruby']
    # Captures the leading product token of the User-Agent field (the
    # last quoted field of a combined-format log line).
    ua = re.compile(r'\"[^ ]*\" \"([^\(\/ ]*).*\"$')
    # Captures the HTTP status code following the protocol token.
    rcode = re.compile(r'HTTP/\d\.\d\" (\d{3})')

    def __init__(self):
        # NOTE(review): assumes a MongoDB instance on localhost with a
        # 'bukget' database -- confirm against deployment config.
        self.conn = MongoClient()
        self.db = self.conn.bukget
        self.webstats = self.db.webstats
        self.plugins = self.db.plugins

    def run(self):
        '''
        Perform all of the needed actions: pull logs, parse them, and
        run the popularity contest, reporting elapsed time per stage.
        '''
        starttime = time.time()
        tnxtime = time.time()
        print('\n*** PULLING LOGS FROM REMOTE API SERVERS ***\n')
        for server in self.servers:
            self.get_log(server)
        print('*** LOG PULLS TOOK %ss ***' % int(time.time() - tnxtime))
        logtime = time.time()
        print('\n*** PARSING THE RAW LOG FILES ***\n')
        self.parse_logs()
        print('*** LOG PARSING TOOK %ss ***' % int(time.time() - logtime))
        popularitytime = time.time()
        print('\n*** RUNNING POPULARITY CONTEST ***\n')
        self.popularity()
        print('*** POPULARITY CONTEST TOOK %ss ***' % int(time.time() - popularitytime))
        print('*** LOGPARSE TOOK %s Seconds ***' % int(time.time() - starttime))

    def get_log(self, host):
        '''
        Log Retrieval: fetches today's rotated, gzipped access log from
        the given host over SSH (fabric) into /tmp/bukgetlogs.
        '''
        env.warn_only = True
        env.user = 'root'
        env.key_filename = '/etc/bukget/keys/id_rsa'
        date = datetime.datetime.now()
        log = '/var/log/bukget/api-access.log-%s.gz' % date.strftime('%Y%m%d')
        env.host_string = 'root@%s:22' % host
        if not os.path.exists('/tmp/bukgetlogs'):
            os.makedirs('/tmp/bukgetlogs')
        get(log, '/tmp/bukgetlogs/%s.log.gz' % host)

    def ignore_ua(self, line):
        '''
        Checks the User-Agent against anything we wish to ignore. If there
        are no matches (i.e. a good UA string) then return the string
        (lower-cased, with dots replaced so it is safe as a Mongo key),
        otherwise return False.
        '''
        ua_string = self.ua.findall(line.lower())
        if len(ua_string) > 0:
            ua_string = ua_string[0]
        else:
            return False
        for ignore in self.ignores:
            if ignore in ua_string:
                return False
        if ua_string == '-':
            return False
        # Dots are replaced because MongoDB field names may not contain
        # them; these strings become keys in the user_agents sub-doc.
        return ua_string.replace('.', '_')

    def check_return_code(self, line, logfile):
        '''
        Checks the return code and looks to see if there are any 500
        errors. If the entry does contain a 500 error, then print the
        line to the screen (cron mails the output to the admins).
        '''
        if any(code == '500' for code in self.rcode.findall(line)):
            print('{%s} %s' % (logfile, line.strip('\n')))

    def parse_logs(self):
        '''
        Log Parsing Function: aggregates every downloaded log file in
        /tmp/bukgetlogs into one stats document and saves it to the
        webstats collection. Each file is deleted once processed.
        '''
        data = {
            'total': 0,
            'api1': 0,
            'api2': 0,
            'api3': 0,
            'unique': 0,
            'plugins': {},
            'user_agents': {},
            'bukkitdev': 0,
            'timestamp': int(time.time()),
        }
        # Sets of IPs already counted, keyed by 'total' and by plugin
        # slug. Sets (not lists) keep the per-line membership test O(1).
        ipaddys = {'total': set()}
        ipaddy = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
        api1 = re.compile(r'plugin/([^/ \?]*)')
        api3 = re.compile(r'plugins/[^/ ]*/([^/ \?]*)')
        # Known plugin slugs; a set so the per-line lookup is O(1).
        plist = set(p['slug'] for p in self.plugins.find({}))
        for log in os.listdir('/tmp/bukgetlogs'):
            if log[-2:] != 'gz': continue
            # Context manager closes the handle even if parsing fails
            # (the original leaked it).
            with gzip.open('/tmp/bukgetlogs/%s' % log) as lfile:
                for line in lfile:
                    # Ignore our own monitoring checks.
                    if 'BukGet-Monitor' in line:
                        continue
                    # Get the IP Address; skip malformed lines without
                    # one (the original raised IndexError here).
                    ips = ipaddy.findall(line)
                    if not ips:
                        continue
                    ip = ips[0]
                    # Now to pull the plugin name from the log. As this
                    # can depend on the API we are calling, we will first
                    # look for a match from the api3 regex and fall back
                    # to the regex that finds entries for api1 and api2.
                    plugin = api3.findall(line)
                    if len(plugin) < 1:
                        plugin = api1.findall(line)
                    # Count the hit, and count the IP once in the
                    # unique counter.
                    data['total'] += 1
                    if ip not in ipaddys['total']:
                        ipaddys['total'].add(ip)
                        data['unique'] += 1
                    # Determine which API version was called and
                    # increment the appropriate counter.
                    if 'GET /1/' in line: data['api1'] += 1
                    elif 'GET /2/' in line: data['api2'] += 1
                    elif 'GET /3/' in line: data['api3'] += 1
                    # Tally the user-agent string unless it is ignored.
                    uastring = self.ignore_ua(line)
                    if uastring:
                        key = uastring.lower()
                        if key not in data['user_agents']:
                            data['user_agents'][key] = 0
                        data['user_agents'][key] += 1
                    # Print any 500 errors so that cron will email the
                    # resulting output to the admins.
                    self.check_return_code(line, log)
                    if len(plugin) > 0:
                        p = plugin[0]
                        # Skip our own site's requests and unknown slugs.
                        if 'https://bukget.org' in line or 'http://bukget.org' in line:
                            continue
                        if p not in plist: continue
                        if p not in data['plugins']:
                            data['plugins'][p] = {'unique': 0, 'total': 0}
                            ipaddys[p] = set()
                        if ip not in ipaddys[p]:
                            data['plugins'][p]['unique'] += 1
                            ipaddys[p].add(ip)
                        data['plugins'][p]['total'] += 1
            os.remove('/tmp/bukgetlogs/%s' % log)
        self.webstats.save(data)

    def _sum_uniques(self, days, slug):
        '''Sum the unique-hit counter for slug across the given stats docs.'''
        return sum(day['plugins'][slug]['unique']
                   for day in days if slug in day['plugins'])

    def popularity(self):
        '''
        Popularity Trending: recomputes the daily/weekly/monthly/total
        unique-hit counters for every plugin from the stored webstats
        documents and saves each plugin back.
        '''
        day_trend = list(self.db.webstats.find().sort('_id', -1).limit(1))[0]
        week_trend = list(self.db.webstats.find().sort('_id', -1).limit(7))
        month_trend = list(self.db.webstats.find().sort('_id', -1).limit(30))
        total_trend = list(self.db.webstats.find().sort('_id', -1))
        results = self.plugins.find({})
        results.batch_size(10)
        for plugin in results:
            # Make sure the popularity sub-document exists.
            if 'popularity' not in plugin:
                plugin['popularity'] = {}
            slug = plugin['slug']
            # Daily Trending (most recent stats document only).
            daily = 0
            if slug in day_trend['plugins']:
                daily = day_trend['plugins'][slug]['unique']
            # Now to add all of the new values to the plugin...
            plugin['popularity']['daily'] = daily
            plugin['popularity']['weekly'] = self._sum_uniques(week_trend, slug)
            plugin['popularity']['monthly'] = self._sum_uniques(month_trend, slug)
            plugin['popularity']['total'] = self._sum_uniques(total_trend, slug)
            # Lastly save the changes :)
            self.plugins.save(plugin)
if __name__ == '__main__':
    # Script entry point: build a parser and run the full pipeline.
    LogParser().run()