-
Notifications
You must be signed in to change notification settings - Fork 1
/
collectdata.py
111 lines (98 loc) · 3.07 KB
/
collectdata.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from bs4 import BeautifulSoup
from urllib2 import urlopen
from reverselines import reverse_lines
import time
from selenium import webdriver
# Results-listing URL templates. The '{}' placeholder is filled with a page
# number through str.format() inside generate_data().
URL = ['http://www.oddsportal.com/basketball/usa/nba/results/page/{}/',
'http://www.oddsportal.com/basketball/usa/nba-2012-2013/results/page/{}/',
'http://www.oddsportal.com/basketball/usa/nba-2011-2012/results/page/{}/']
# Page bound for each template above (paired by position). The main script
# iterates range(1, bound), i.e. pages 1 .. bound - 1.
num_pages = [13, 30, 24]
def _scrape_odds_text(url, chromedriver_path):
    """Return the text of every 'odds-nowrp' element on *url*, loaded in Chrome."""
    browser = webdriver.Chrome(chromedriver_path)
    try:
        browser.get(url)
        cells = browser.find_elements_by_class_name('odds-nowrp')
        # Read the text before the session is closed.
        return [cell.text for cell in cells]
    finally:
        # Shut Chrome down even when loading or scraping fails, so a failed
        # page does not leave a browser process behind.
        browser.quit()


def generate_data(page, f, URL,
                  chromedriver_path='C:/DCB/chromedriver_win32/chromedriver.exe'):
    """Scrape one results page and append one line per game to *f*.

    The page is fetched twice: once with urlopen (parsed by BeautifulSoup for
    the participant and score cells) and once with Selenium/Chrome (for the
    'odds-nowrp' cells).

    Each written line has the form::

        <bolded team> > <other team> |H ^<odds> <odds> & <score>

    with '|A' instead of '|H' (and the two odds swapped) when the non-bolded
    team is not the second string inside the participant link.
    NOTE(review): presumably the bolded team is the winner and H/A records
    whether it was listed first (home) -- confirm against the site layout.

    Args:
        page: page number substituted into *URL*.
        f: writable file object receiving the output lines.
        URL: URL template containing one '{}' placeholder for the page.
        chromedriver_path: path of the chromedriver executable; defaults to
            the location the script has always used.

    Raises:
        IndexError: if the page has fewer odds cells than the games need.
    """
    page_url = URL.format(page)

    response = urlopen(page_url)
    try:
        html = response.read()
    finally:
        response.close()
    pg = BeautifulSoup(html)

    odds = _scrape_odds_text(page_url, chromedriver_path)
    oddsindex = 0  # odds cells are consumed two at a time, one pair per game

    data = pg.find_all('td', {'class': 'name table-participant'})
    scores = pg.find_all('td', {'class': 'center bold table-odds'})
    for t, score in zip(data, scores):
        span = t.span
        if span is None or span.string is None:
            # Rare case where neither is bolded (ex. game cancelled)
            continue
        bolded = span.string
        f.write(fix_str(bolded))
        f.write(' > ')

        bolded_key = bolded.strip()
        for order, s in enumerate(t.a.stripped_strings):
            if s.strip() == bolded_key:
                continue
            # First string that is not the bolded team: the other participant.
            f.write(fix_str(s))
            if order == 1:
                f.write(' |H')
                f.write(' ^{} {}'.format(fix_odds(odds[oddsindex]),
                                         fix_odds(odds[oddsindex + 1])))
            else:
                f.write(' |A')
                f.write(' ^{} {}'.format(fix_odds(odds[oddsindex + 1]),
                                         fix_odds(odds[oddsindex])))
            oddsindex += 2
            break

        f.write(' & ')
        f.write(fix_str(score.string))
        f.write('\n')
def fix_odds(s):
    """Convert an odds string to decimal odds.

    Surrounding whitespace is stripped first, then the format is detected:

    * leading '+' (e.g. '+150'): value / 100 + 1          -> '2.5'
    * leading '-' (e.g. '-200'): (value + 100) / value    -> '1.5'
    * contains '/' (e.g. '3/2'): numerator / denominator + 1 -> '2.5'
    * contains '.' (e.g. '1.95'): returned as is (stripped)

    Args:
        s: the odds text.

    Returns:
        A str with the decimal odds for recognised input. For input that is
        not a string, cannot be parsed, or divides by zero, the float 1.0 is
        returned (kept as a float for backward compatibility). Text in no
        known format is also printed before 1.0 is returned.
    """
    try:
        s = s.strip()
        if s.startswith('+'):
            return str(float(s[1:]) / 100 + 1)
        if s.startswith('-'):
            stake = float(s[1:])
            return str((stake + 100) / stake)
        if '/' in s:
            numerator, _, denominator = s.partition('/')
            return str(float(numerator) / float(denominator) + 1.0)
        if '.' in s:
            return s
    except (AttributeError, TypeError, ValueError, ZeroDivisionError):
        # Non-string input, unparsable number or zero denominator/stake.
        return 1.0
    # Unknown format: show it so it can be investigated, then fall back.
    print(s)
    return 1.0
def fix_str(s):
    """Normalise a scraped cell text and return it UTF-8 encoded.

    Text containing ':' is treated as colon-separated integers: anything from
    the first 'OT' onwards is dropped, the integers are sorted from highest
    to lowest and joined with single spaces. Any other text has every '- '
    and then every remaining '-' removed.
    """
    text = s.strip()

    if ':' not in text:
        without_dashes = text.replace('- ', '').replace('-', '')
        return without_dashes.encode('UTF-8')

    if 'OT' in text:
        text = text.partition('OT')[0].strip()
    points = sorted((int(part) for part in text.split(':')), reverse=True)
    return ' '.join(str(value) for value in points).encode('UTF-8')
def readCommand(argv=None):
    """
    Parse the command line and return the output file name.

    The file is chosen with -f / --file; -o is accepted as an alias because
    the usage text used to advertise it. Without the option, a name of the
    form 'out<MM_DD_YY_HH_MM_SS>.txt' built from the local time is returned.

    Args:
        argv: list of arguments to parse; defaults to sys.argv[1:].

    Returns:
        The output file name as a string.
    """
    import sys
    from optparse import OptionParser

    if argv is None:
        argv = sys.argv[1:]
    usageStr = """
    USAGE:      python collectdata.py -f output.txt
    """
    parser = OptionParser(usageStr)

    default_fn = 'out' + time.strftime('%m_%d_%y_%H_%M_%S', time.localtime()) + '.txt'
    parser.add_option('-f', '-o', '--file', dest='fn',
                      help='the output file (default is timestamped)',
                      metavar='FILE', default=default_fn)

    # Positional leftovers are not used by this script.
    options, _ = parser.parse_args(argv)
    return options.fn
if __name__ == '__main__':
    fn = readCommand()

    # Scrape every page into the scratch file 'temp' first. The 'with' block
    # closes it even if a page fails part-way through.
    with open('temp', 'w+') as f:
        for url, p in zip(URL, num_pages):
            # NOTE(review): range(1, p) visits pages 1 .. p - 1. Confirm
            # whether num_pages holds the page count (last page skipped) or
            # the count plus one.
            for i in range(1, p):
                print(i)  # progress indicator
                generate_data(i, f, url)

    # Pass the scratch file through reverse_lines() into the output file,
    # which is opened in append mode.
    # NOTE(review): presumably this reverses the line order -- confirm in
    # the reverselines module.
    with open('temp', 'r') as f:
        with open(fn, 'a') as fout:
            reverse_lines(f, fout)