councillors.py
import re
import scraperwiki
import requests
from lxml.html.soupparser import fromstring
from time import sleep


# Scrape councillor contact details from Oxford City Council's member index
# and each councillor's profile page, then save one row per councillor to the
# ScraperWiki SQLite store.
def search():
    urlBase = "http://mycouncil.oxford.gov.uk/"
    # print "GET 'http://mycouncil.oxford.gov.uk/mgMemberIndex.aspx?FN=ALPHA&VW=TABLE&PIC=1'"
    sleep(2)
    result = requests.get(urlBase + 'mgMemberIndex.aspx?FN=ALPHA&VW=TABLE&PIC=1')
    result_dom = fromstring(result.content)
    councillors = result_dom.xpath("//table[@id='mgTable1']//tr")
    if len(councillors) == 0:
        return
    for index, councillor in enumerate(councillors):
        roles = ""
        eHome = ""
        eWork = ""
        homePhone = ""
        workPhone = ""
        homeMobile = ""
        workMobile = ""
        surgery = ""
        address = ""
        cols = councillor.xpath("td")
        # Data rows have four cells; cols[1] holds the contact details,
        # cols[2] the party and cols[3] the ward.
        if len(cols) == 4:
            paras = cols[1].xpath('p')
            for i, para in enumerate(paras):
                if i == 0:
                    # The first paragraph holds the councillor's name and profile link.
                    name = "".join(para.xpath('./a/text()')).strip()
                    link = "".join(para.xpath('./a/@href')).strip()
                else:
                    pText = "".join(para.xpath('text()')).strip()
                    # print i, pText
                    if len(para.xpath('a')) == 1:
                        link1 = "".join(para.xpath('./a/@href')).strip()
                        if re.search(r'@', link1):
                            # Email links: the label text says whether it is work or home.
                            if re.search(r'work', pText, re.I):
                                eWork = link1
                            elif re.search(r'home', pText, re.I):
                                eHome = link1
                            else:
                                print i, pText, link1
                        else:
                            print "non email address link"
                    else:
                        if re.search(r'OX\d \d[A-Z]{2}', pText):
                            # A paragraph containing an Oxford postcode is the postal address.
                            address = pText
                        else:
                            # Phone numbers are labelled, e.g. "Home: 01865 ...".
                            matchObj = re.search(r'^(.+)?:\s+(0[0-9 ]+)$', pText)
                            if matchObj:
                                number = matchObj.group(2)
                                numberType = matchObj.group(1)
                                if re.search(r'home\s+mob', numberType, re.I):
                                    homeMobile = number
                                elif re.search(r'work\s+mob', numberType, re.I):
                                    workMobile = number
                                elif re.search(r'home', numberType, re.I):
                                    homePhone = number
                                elif re.search(r'work', numberType, re.I):
                                    workPhone = number
                            else:
                                # Anything left over is treated as a role description.
                                # print i, pText
                                roles += pText
            party = "".join(cols[2].xpath('text()')).strip()
            ward = "".join(cols[3].xpath('text()')).strip()
            sleep(2)
            # print "GET " + urlBase + link
            result1 = requests.get(urlBase + link)
            # Normalise curly quotes to apostrophes before parsing the profile page.
            result_dom1 = fromstring(re.sub(u"(\u2018|\u2019)", "'", result1.text))
            mgUserBody = result_dom1.xpath("//div[@class='mgUserBody']")[0]
            mgUserBodySectionTitles = mgUserBody.xpath(".//h2[@class='mgSectionTitle']")
            print len(mgUserBodySectionTitles)
            for mgUserBodySectionTitle in mgUserBodySectionTitles:
                siblings = mgUserBodySectionTitle.xpath('following-sibling::*')
                if len(siblings) > 0:
                    mgUserBodySection = siblings[0]
                    mgUserBodySectionName = "".join(mgUserBodySectionTitle.xpath('text()')).strip()
                    if re.search(r'Surgery details', mgUserBodySectionName, re.I):
                        surgery = re.sub(u"(\u2018|\u2019)", "'", "".join(mgUserBodySection.xpath('text()')).strip())
                    elif re.search(r'terms of office', mgUserBodySectionName, re.I):
                        print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
                    elif re.search(r'More information about this councillor', mgUserBodySectionName, re.I):
                        print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
                    elif re.search(r'committee appointments', mgUserBodySectionName, re.I):
                        print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
                    elif re.search(r'Appointments to outside bodies', mgUserBodySectionName, re.I):
                        print mgUserBodySectionName, len(mgUserBodySection.xpath('li'))
                    elif re.search(r'Additional Information', mgUserBodySectionName, re.I):
                        print mgUserBodySection.tag
                else:
                    print "No next sibling"
            data = {"index": index, "surgery": surgery, "name": name, "link": link,
                    "address": address, "roles": roles, "eWork": eWork, "eHome": eHome,
                    "homePhone": homePhone, "workPhone": workPhone,
                    "homeMobile": homeMobile, "workMobile": workMobile,
                    "party": party, "ward": ward}
            scraperwiki.sqlite.save(unique_keys=['index', 'link'], data=data)
            # print data


search()