
Commit

filings

fgregg committed Apr 19, 2024
1 parent a44a32f commit 5adad8b
Showing 6 changed files with 73 additions and 59 deletions.
12 changes: 6 additions & 6 deletions candidates.mk
@@ -1,14 +1,14 @@
 .PHONY: offices
 offices : data/processed/candidate_committees.csv \
-	data/processed/candidate_committees_filings.csv \
+	data/processed/candidate_committee_filings.csv \
 	data/processed/pac_committees.csv \
-	data/processed/pac_committees_filings.csv
+	data/processed/pac_committee_filings.csv
 
 data/processed/candidate_committees.csv \
-data/processed/candidate_committees_filings.csv &:
-	python -m scrapers.office.scrape_search candidates > $@
+data/processed/candidate_committee_filings.csv &:
+	python -m scrapers.office.scrape_search candidates
 
 
 data/processed/pac_committees.csv \
-data/processed/pac_committees_filings.csv &:
-	python -m scrapers.office.scrape_search committees > $@
+data/processed/pac_committee_filings.csv &:
+	python -m scrapers.office.scrape_search committees
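
A note for context: the &: syntax marks these as grouped targets (GNU Make 4.3+), so a single run of scrapers.office.scrape_search is expected to produce both CSVs of a rule. The > $@ redirect is dropped because, per the scrape_search.py change later in this commit, the script now opens its output files under data/processed itself instead of printing one file to stdout. A minimal sketch of that write pattern, with the fieldnames assumed for illustration:

    import csv
    import pathlib

    # Sketch only: a grouped-target recipe's script writing both outputs itself,
    # so the Makefile no longer needs to redirect stdout with "> $@".
    output_dir = pathlib.Path("data/processed")
    output_dir.mkdir(parents=True, exist_ok=True)

    with (
        open(output_dir / "candidate_committees.csv", "w") as committee_file,
        open(output_dir / "candidate_committee_filings.csv", "w") as filing_file,
    ):
        committee_writer = csv.DictWriter(committee_file, ["IDNumber"])  # fieldnames assumed
        filing_writer = csv.DictWriter(filing_file, ["ReportID"])  # fieldnames assumed
        committee_writer.writeheader()
        filing_writer.writeheader()
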
2 changes: 1 addition & 1 deletion data/headers/employment.csv
@@ -1 +1 @@
-FilerName,FilerID,ReportID,FilingYear,FiledDate,Reporting individual status,Office / Board or Commission / Agency Name,"Date Assumed Office, Employed, or Appointed",Employer,Employer's Phone Number,P.O. Box or Street Address of Employer,City State Zip,Title or Position held by reporting individual,Nature of business or occupation,Spouse's Last Name,Spouse's First Name,Spouse's Middle,Name of Spouse’s Employer,Address of Spouse’s Employer,City,State,Zip,Spouse’s title or position held Nature of business or occupation,"Income Source over $5,000",Recieved By
+FilerName,FilerID,ReportID,FilingYear,FiledDate,Reporting individual status,Office / Board or Commission / Agency Name,"Date Assumed Office, Employed, or Appointed",Employer,Employer's Phone Number,P.O. Box or Street Address of Employer,City State Zip,Title or Position held by reporting individual,Nature of business or occupation,Spouse's Last Name,Spouse's First Name,Spouse's Middle,Name of Spouse’s Employer,Address of Spouse’s Employer,City,State,Zip,Spouse’s title or position held Nature of business or occupation
4 changes: 1 addition & 3 deletions scrapers/financial_disclosure/levenshtein_distance.py
@@ -126,8 +126,6 @@ def correct(self, text):
         if text in self.corpus:
             return text
 
-        best_guess = min(
-            self.corpus, key=lambda x: levenshtein_distance(x, text)
-        )
+        best_guess = min(self.corpus, key=lambda x: levenshtein_distance(x, text))
 
         return best_guess
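
For reference, correct() picks the corpus entry closest to the garbled text by edit distance. The module's own levenshtein_distance is not shown in this diff; a standard dynamic-programming version matching the call signature levenshtein_distance(x, text) would look roughly like this (a sketch, not the repository's code):

    def levenshtein_distance(a: str, b: str) -> int:
        # Classic dynamic-programming edit distance: the cost of turning a into b
        # with single-character insertions, deletions, and substitutions.
        previous = list(range(len(b) + 1))
        for i, ca in enumerate(a, start=1):
            current = [i]
            for j, cb in enumerate(b, start=1):
                current.append(min(
                    previous[j] + 1,               # delete ca
                    current[j - 1] + 1,            # insert cb
                    previous[j - 1] + (ca != cb),  # substitute ca -> cb
                ))
            previous = current
        return previous[-1]

correct() then just takes min(self.corpus, key=lambda x: levenshtein_distance(x, text)), as in the reformatted line above.
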
57 changes: 40 additions & 17 deletions scrapers/financial_disclosure/parse_pdf.py
@@ -18,10 +18,13 @@ def __getitem__(self, target_key):
 
 
 def _is_section(row: Row) -> bool:
-    return (
-        re.match(r"^\d+\. ?[A-Z]", row[0]) is not None # type: ignore[arg-type]
-        or row[0].startswith("*Pursuant to NMSA 1978 §") # ensure signature is its own section
-    )
+    return re.match(
+        r"^\d+\. ?[A-Z]", row[0]
+    ) is not None or row[  # type: ignore[arg-type]
+        0
+    ].startswith(
+        "*Pursuant to NMSA 1978 §"
+    )  # ensure signature is its own section
 
 
 def _group_rows(rows: Iterable[Row]) -> dict[str, Rows]:
@@ -76,14 +79,14 @@ def _parse_general_info(rows: Rows) -> dict[str, str | None]:
 
     header, *body = rows
 
-    result = [{"Input": val[0]} for val in body if val[0] != '']
+    result = [{"Input": val[0]} for val in body if val[0] != ""]
 
     return result
 
 
 def parse_pdf(pdf: pdfplumber.PDF) -> dict[str, dict[str, str | None]]:
     table_settings = {
-        "intersection_tolerance": 6, # minimum allowable tolerance to grab all tables
+        "intersection_tolerance": 6,  # minimum allowable tolerance to grab all tables
     }
 
     rows = [tuple(row) for page in pdf.pages if (table := page.extract_table(table_settings=table_settings)) for row in table]  # type: ignore[union-attr]
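
The hunk above is mostly a line-length reformat, but the call it touches is the heart of the parser: pdfplumber extracts every table from every page with a loosened intersection tolerance, and the rows are flattened into a single list. A standalone sketch of that step, assuming a local disclosure PDF path:

    import pdfplumber

    table_settings = {
        "intersection_tolerance": 6,  # minimum allowable tolerance to grab all tables
    }

    # "disclosure.pdf" is a placeholder path for illustration.
    with pdfplumber.open("disclosure.pdf") as pdf:
        rows = [
            tuple(row)
            for page in pdf.pages
            if (table := page.extract_table(table_settings=table_settings))
            for row in table
        ]
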
@@ -101,34 +104,54 @@ def parse_pdf(pdf: pdfplumber.PDF) -> dict[str, dict[str, str | None]]:
             grouped_rows["REPORTING INDIVIDUAL – Current Filing Status"]
         ),
         "income sources": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Income Source(s)"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Income Source(s)"
+            ]
         ),
         "specializations": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Areas of Specialization"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Areas of Specialization"
+            ]
         ),
         "consulting or lobbying": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Consulting and/or Lobbying"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Consulting and/or Lobbying"
+            ]
         ),
         "real estate": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Real Estate"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Real Estate"
+            ]
         ),
         "other business": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Other Business"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Other Business"
+            ]
        ),
         "board membership": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nBoard Membership"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nBoard Membership"
+            ]
         ),
         "professional licenses": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Professional License(s)"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Professional License(s)"
+            ]
         ),
         "provisions to state agencies": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nGoods and/or Services Provided to State Agencies"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nGoods and/or Services Provided to State Agencies"
+            ]
         ),
         "state agency representation": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nState Agency Representation"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nState Agency Representation"
+            ]
         ),
         "general info": _parse_general_info(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – General Information"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – General Information"
+            ]
         ),
     }
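
For orientation (a sketch under assumptions, not the repository's code): _is_section treats a row as a section heading when its first cell looks like a numbered, capitalized title or is the statutory signature note, and _group_rows presumably buckets the rows that follow under that heading so parse_pdf can look sections up by name as above. Roughly:

    import re
    from typing import Iterable

    Row = tuple[str, ...]
    Rows = list[Row]

    def _is_section(row: Row) -> bool:
        # A numbered, capitalized heading (e.g. "3. REAL ESTATE") or the signature note.
        return (
            re.match(r"^\d+\. ?[A-Z]", row[0]) is not None
            or row[0].startswith("*Pursuant to NMSA 1978 §")
        )

    def _group_rows(rows: Iterable[Row]) -> dict[str, Rows]:
        # Assumed behavior: the repository's actual _group_rows is not in this diff.
        grouped: dict[str, Rows] = {}
        current = None
        for row in rows:
            if _is_section(row):
                # Assumption: strip the "N. " prefix so keys match the headings used above.
                current = re.sub(r"^\d+\. ?", "", row[0])
                grouped[current] = []
            elif current is not None:
                grouped[current].append(row)
        return grouped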

40 changes: 14 additions & 26 deletions scrapers/financial_disclosure/scrape_financial_disclosures.py
@@ -106,7 +106,6 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
             open("data/intermediate/provisions.csv", "w") as provisions_file,
             open("data/intermediate/representation.csv", "w") as representation_file,
             open("data/intermediate/general.csv", "w") as general_file,
-
         ):
             filer_writer = csv.DictWriter(
                 filer_file,
@@ -174,41 +173,36 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                     "Reporting individual",
                     "Office / Board or Commission / Agency Name",
                     "Date Assumed Office, Employed, or Appointed",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             income_writer = csv.DictWriter(
                 income_file,
                 [
                     "Income source (*see pg. 4):",
                     "Received by (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             specializations_writer = csv.DictWriter(
                 specializations_file,
                 [
                     "Describe the major areas of specialization or sources of income.",
                     "Received by (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             consulting_writer = csv.DictWriter(
                 consulting_file,
                 [
                     "Client name & address:",
                     "Represented by: List the name of the reporting individual’s firm or spouse’s firm",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             real_estate_writer = csv.DictWriter(
                 real_estate_file,
-                [
-                    "Owner",
-                    "County",
-                    "General Description",
-                    "ReportID"
-                ],
+                ["Owner", "County", "General Description", "ReportID"],
             )
             business_writer = csv.DictWriter(
                 business_file,
@@ -217,47 +211,44 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                     "Position held:",
                     "General statement of business purpose:",
                     "Received by (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             membership_writer = csv.DictWriter(
                 membership_file,
                 [
                     "Name of business:",
                     "Board member (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             licenses_writer = csv.DictWriter(
                 licenses_file,
                 [
                     "Type of license:",
                     "Individual holding license (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             provisions_writer = csv.DictWriter(
                 provisions_file,
                 [
                     "State agency to which goods and/or services were provided:",
                     "Individual providing goods or services (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             representation_writer = csv.DictWriter(
                 representation_file,
                 [
                     "State agency (other than a court):",
                     "Individual assisting client (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             general_writer = csv.DictWriter(
                 general_file,
-                [
-                    "Input",
-                    "ReportID"
-                ],
+                ["Input", "ReportID"],
             )
 
             # pdf fields that return lists of similarly structured dicts
@@ -348,8 +339,8 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                     "corrector": levenshtein_distance.SpellingCorrector(
                         general_writer.fieldnames
                     ),
-                    "accessor": "general info"
-                }
+                    "accessor": "general info",
+                },
             }
 
             for pdf_field in mapping_dict.values():
@@ -403,8 +394,5 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                 writer = pdf_field["writer"]
 
                 for entry in extracted_info[accessor]:
-                    field_data = {
-                        corrector.correct(k): v
-                        for k, v in entry.items()
-                    }
+                    field_data = {corrector.correct(k): v for k, v in entry.items()}
                     writer.writerow(field_data | {"ReportID": report_id})
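
Taken together with the mapping_dict hunk above, the pattern appears to be: each PDF field gets a DictWriter, a SpellingCorrector built over that writer's fieldnames, and an accessor into the parsed-PDF dict, and the final loop snaps whatever header text the PDF yielded back onto the canonical column names before writing. A self-contained sketch under those assumptions, using difflib as a stand-in for the project's Levenshtein-based corrector:

    import csv
    import difflib

    class SpellingCorrector:
        """Stand-in for levenshtein_distance.SpellingCorrector, using difflib instead."""

        def __init__(self, corpus):
            self.corpus = list(corpus)

        def correct(self, text: str) -> str:
            # Closest canonical fieldname by similarity ratio.
            return max(self.corpus, key=lambda x: difflib.SequenceMatcher(None, x, text).ratio())

    # Hypothetical parsed output and report id, trimmed to one field.
    extracted_info = {"general info": [{"Inp ut": "some value"}]}  # garbled header from the PDF
    report_id = "12345"

    with open("general.csv", "w", newline="") as general_file:  # simplified path for the sketch
        general_writer = csv.DictWriter(general_file, ["Input", "ReportID"])
        general_writer.writeheader()

        mapping_dict = {
            "general": {
                "writer": general_writer,
                "corrector": SpellingCorrector(general_writer.fieldnames),
                "accessor": "general info",
            },
        }

        for pdf_field in mapping_dict.values():
            writer, corrector = pdf_field["writer"], pdf_field["corrector"]
            for entry in extracted_info[pdf_field["accessor"]]:
                field_data = {corrector.correct(k): v for k, v in entry.items()}
                writer.writerow(field_data | {"ReportID": report_id})
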
17 changes: 11 additions & 6 deletions scrapers/office/scrape_search.py
@@ -98,7 +98,7 @@ def _versions(self, filing):
         if filing["ReportVersionID"] > 1:
             version_response = self.get(
                 "https://login.cfis.sos.state.nm.us/api///Filing/GetFilingHistory",
-                params={"reportID": filing["ReportVersionID"]},
+                params={"reportID": filing["ReportID"]},
             )
             versions.extend(version_response.json())
 
@@ -152,6 +152,7 @@ def __init__(self, **kwargs):
         self.result_key = "CandidateInformationslist"
         self.id_key = "IDNumber"
         self.detail_endpoint = "https://login.cfis.sos.state.nm.us/api///Organization/GetCandidatesInformation"
+        self.committee_key = "PoliticalPartyCommitteeName"
 
     def _filings(self, search_result):
         payload = {
@@ -188,6 +189,7 @@ def __init__(self, **kwargs):
         self.result_key = "CommitteeInformationlist"
         self.id_key = "IdNumber"
         self.detail_endpoint = "https://login.cfis.sos.state.nm.us/api///Organization/GetCommitteeInformation"
+        self.committee_key = "CommitteeName"
 
     def _filings(self, search_result):
         payload = {
@@ -222,19 +224,22 @@ def _filings(self, search_result):
 
 if __name__ == "__main__":
     import csv
+    import pathlib
     from scrapelib.cache import FileCache
 
     cache = FileCache("cache")
 
+    output_dir = pathlib.Path("data/processed")
+
     if sys.argv[1] == "candidates":
         scraper_klass = CandidateScraper
-        committee_file = open("candidate_committees.csv", "w")
-        filing_file = open("candidate_committee_filings.csv", "w")
+        committee_file = open(output_dir / "candidate_committees.csv", "w")
+        filing_file = open(output_dir / "candidate_committee_filings.csv", "w")
 
     elif sys.argv[1] == "committees":
         scraper_klass = CommitteeScraper
-        committee_file = open("pac_committees.csv", "w")
-        filing_file = open("pac_committee_filings.csv", "w")
+        committee_file = open(output_dir / "pac_committees.csv", "w")
+        filing_file = open(output_dir / "pac_committee_filings.csv", "w")
 
     scraper = scraper_klass(requests_per_minute=0, retry_attempts=3)
     scraper.timeout = 10
@@ -267,7 +272,7 @@ def _filings(self, search_result):
 
         extra = {
             "StateID": years[0]["StateID"],
-            "CommitteeName": years[0]["CommitteeName"],
+            "CommitteeName": years[0][scraper.committee_key],
         }
 
         for filing in filings:
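
The new committee_key attribute keeps the shared __main__ loop generic: candidate search results carry the committee name under PoliticalPartyCommitteeName, while PAC results use CommitteeName, and years[0][scraper.committee_key] reads whichever applies. Reduced to a sketch (class bodies abbreviated, data hypothetical):

    class CandidateScraper:
        committee_key = "PoliticalPartyCommitteeName"

    class CommitteeScraper:
        committee_key = "CommitteeName"

    # Hypothetical registration-year record, trimmed to the fields used here.
    years = [{"StateID": 4321, "PoliticalPartyCommitteeName": "Example Committee"}]

    scraper = CandidateScraper()
    extra = {
        "StateID": years[0]["StateID"],
        "CommitteeName": years[0][scraper.committee_key],  # resolves per scraper type
    }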
