From ad207a2133aca2608f983d4387b11668e03a26b8 Mon Sep 17 00:00:00 2001 From: Rachel Shorey Date: Fri, 3 Apr 2015 13:28:50 -0400 Subject: [PATCH 1/2] minor chicago bugfixes --- chicago/bills.py | 11 ++++++++--- chicago/legistar.py | 13 +++++++------ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/chicago/bills.py b/chicago/bills.py index c0d37ed1..c32c56a1 100644 --- a/chicago/bills.py +++ b/chicago/bills.py @@ -181,11 +181,16 @@ def addDetails(self, bill, detail_url) : legislation_details = self.parseDetails(detail_div) - for related_bill in legislation_details.get('Related files', []) : - bill.add_related_bill(identifier = related_bill['label'], + title = bill.title + if ("sundry" in title.lower() + or "miscellaneous" in title.lower()): #these are ominbus + bill.add_related_bill(identifier = related_bill['label'], legislative_session = bill.legislative_session, - relation_type='pending') + relation_type='replaces') + #for now we're skipping related bills if they + #don't contain words that make us think they're + #in a ominbus relationship with each other for i, sponsor in enumerate(legislation_details.get('Sponsors', [])) : if i == 0 : diff --git a/chicago/legistar.py b/chicago/legistar.py index f518589b..44f7da8c 100644 --- a/chicago/legistar.py +++ b/chicago/legistar.py @@ -13,9 +13,9 @@ class LegistarScraper(Scraper): def lxmlize(self, url, payload=None): if payload : - entry = self.urlopen(url, 'POST', payload) + entry = self.post(url, payload).text else : - entry = self.urlopen(url) + entry = self.get(url).text page = lxml.html.fromstring(entry) page.make_links_absolute(url) return page @@ -118,14 +118,15 @@ def parseDataTable(self, table): def _get_link_address(self, link): - if 'onclick' in link.attrib : + url = None + if 'onclick' in link.attrib: onclick = link.attrib['onclick'] - if onclick is not None and onclick.startswith("radopen('"): + if (onclick is not None + and (onclick.startswith("radopen('") + or onclick.startswith("window.open"))): url = self.base_url + onclick.split("'")[1] elif 'href' in link.attrib : url = link.attrib['href'] - else : - url = None return url From b0687ee520f5f8500490739be88832c69fd43c93 Mon Sep 17 00:00:00 2001 From: Rachel Shorey Date: Tue, 7 Apr 2015 10:17:24 -0400 Subject: [PATCH 2/2] fixed some chicago scraper data format bugs --- chicago/bills.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/chicago/bills.py b/chicago/bills.py index c32c56a1..46a5b6bb 100644 --- a/chicago/bills.py +++ b/chicago/bills.py @@ -121,7 +121,11 @@ def scrape(self): def extractVotes(self, action_detail_url) : action_detail_page = self.lxmlize(action_detail_url) - vote_table = action_detail_page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0] + try: + vote_table = action_detail_page.xpath("//table[@id='ctl00_ContentPlaceHolder1_gridVote_ctl00']")[0] + except IndexError: + self.warning("No votes found in table") + return None, [] votes = list(self.parseDataTable(vote_table)) vote_list = [] for vote, _, _ in votes : @@ -249,6 +253,7 @@ def addDetails(self, bill, detail_url) : 'Published in Special Pamphlet' : None, 'Adopted as Substitute' : None, 'Deferred and Published' : None, + 'Approved as Amended' : 'passage', } VOTE_OPTIONS = {'yea' : 'yes',