
Commit

filings

fgregg committed Apr 19, 2024
1 parent a44a32f commit 5adad8b
Showing 6 changed files with 73 additions and 59 deletions.
12 changes: 6 additions & 6 deletions candidates.mk
@@ -1,14 +1,14 @@
 .PHONY: offices
 offices : data/processed/candidate_committees.csv \
-	data/processed/candidate_committees_filings.csv \
+	data/processed/candidate_committee_filings.csv \
 	data/processed/pac_committees.csv \
-	data/processed/pac_committees_filings.csv
+	data/processed/pac_committee_filings.csv
 
 data/processed/candidate_committees.csv \
-data/processed/candidate_committees_filings.csv &:
-	python -m scrapers.office.scrape_search candidates > $@
+data/processed/candidate_committee_filings.csv &:
+	python -m scrapers.office.scrape_search candidates
 
 
 data/processed/pac_committees.csv \
-data/processed/pac_committees_filings.csv &:
-	python -m scrapers.office.scrape_search committees > $@
+data/processed/pac_committee_filings.csv &:
+	python -m scrapers.office.scrape_search committees
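
A note for context: the &: syntax marks these as grouped targets (GNU Make 4.3+), so a single run of scrapers.office.scrape_search is expected to produce both CSVs of a rule. The > $@ redirect is dropped because, per the scrape_search.py change later in this commit, the script now opens its output files under data/processed itself instead of printing one file to stdout. A minimal sketch of that write pattern, with the fieldnames assumed for illustration:

    import csv
    import pathlib

    # Sketch only: a grouped-target recipe's script writing both outputs itself,
    # so the Makefile no longer needs to redirect stdout with "> $@".
    output_dir = pathlib.Path("data/processed")
    output_dir.mkdir(parents=True, exist_ok=True)

    with (
        open(output_dir / "candidate_committees.csv", "w") as committee_file,
        open(output_dir / "candidate_committee_filings.csv", "w") as filing_file,
    ):
        committee_writer = csv.DictWriter(committee_file, ["IDNumber"])  # fieldnames assumed
        filing_writer = csv.DictWriter(filing_file, ["ReportID"])  # fieldnames assumed
        committee_writer.writeheader()
        filing_writer.writeheader()
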
2 changes: 1 addition & 1 deletion data/headers/employment.csv
@@ -1 +1 @@
-FilerName,FilerID,ReportID,FilingYear,FiledDate,Reporting individual status,Office / Board or Commission / Agency Name,"Date Assumed Office, Employed, or Appointed",Employer,Employer's Phone Number,P.O. Box or Street Address of Employer,City State Zip,Title or Position held by reporting individual,Nature of business or occupation,Spouse's Last Name,Spouse's First Name,Spouse's Middle,Name of Spouse’s Employer,Address of Spouse’s Employer,City,State,Zip,Spouse’s title or position held Nature of business or occupation,"Income Source over $5,000",Recieved By
+FilerName,FilerID,ReportID,FilingYear,FiledDate,Reporting individual status,Office / Board or Commission / Agency Name,"Date Assumed Office, Employed, or Appointed",Employer,Employer's Phone Number,P.O. Box or Street Address of Employer,City State Zip,Title or Position held by reporting individual,Nature of business or occupation,Spouse's Last Name,Spouse's First Name,Spouse's Middle,Name of Spouse’s Employer,Address of Spouse’s Employer,City,State,Zip,Spouse’s title or position held Nature of business or occupation
4 changes: 1 addition & 3 deletions scrapers/financial_disclosure/levenshtein_distance.py
@@ -126,8 +126,6 @@ def correct(self, text):
         if text in self.corpus:
             return text
 
-        best_guess = min(
-            self.corpus, key=lambda x: levenshtein_distance(x, text)
-        )
+        best_guess = min(self.corpus, key=lambda x: levenshtein_distance(x, text))
 
         return best_guess
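
For reference, correct() picks the corpus entry closest to the garbled text by edit distance. The module's own levenshtein_distance is not shown in this diff; a standard dynamic-programming version matching the call signature levenshtein_distance(x, text) would look roughly like this (a sketch, not the repository's code):

    def levenshtein_distance(a: str, b: str) -> int:
        # Classic dynamic-programming edit distance: the cost of turning a into b
        # with single-character insertions, deletions, and substitutions.
        previous = list(range(len(b) + 1))
        for i, ca in enumerate(a, start=1):
            current = [i]
            for j, cb in enumerate(b, start=1):
                current.append(min(
                    previous[j] + 1,               # delete ca
                    current[j - 1] + 1,            # insert cb
                    previous[j - 1] + (ca != cb),  # substitute ca -> cb
                ))
            previous = current
        return previous[-1]

correct() then just takes min(self.corpus, key=lambda x: levenshtein_distance(x, text)), as in the reformatted line above.
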
57 changes: 40 additions & 17 deletions scrapers/financial_disclosure/parse_pdf.py
@@ -18,10 +18,13 @@ def __getitem__(self, target_key):
 
 
 def _is_section(row: Row) -> bool:
-    return (
-        re.match(r"^\d+\. ?[A-Z]", row[0]) is not None # type: ignore[arg-type]
-        or row[0].startswith("*Pursuant to NMSA 1978 §") # ensure signature is its own section
-    )
+    return re.match(
+        r"^\d+\. ?[A-Z]", row[0]
+    ) is not None or row[  # type: ignore[arg-type]
+        0
+    ].startswith(
+        "*Pursuant to NMSA 1978 §"
+    )  # ensure signature is its own section
 
 
 def _group_rows(rows: Iterable[Row]) -> dict[str, Rows]:
@@ -76,14 +79,14 @@ def _parse_general_info(rows: Rows) -> dict[str, str | None]:
 
     header, *body = rows
 
-    result = [{"Input": val[0]} for val in body if val[0] != '']
+    result = [{"Input": val[0]} for val in body if val[0] != ""]
 
     return result
 
 
 def parse_pdf(pdf: pdfplumber.PDF) -> dict[str, dict[str, str | None]]:
     table_settings = {
-        "intersection_tolerance": 6, # minimum allowable tolerance to grab all tables
+        "intersection_tolerance": 6,  # minimum allowable tolerance to grab all tables
     }
 
     rows = [tuple(row) for page in pdf.pages if (table := page.extract_table(table_settings=table_settings)) for row in table]  # type: ignore[union-attr]
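
The hunk above is mostly a line-length reformat, but the call it touches is the heart of the parser: pdfplumber extracts every table from every page with a loosened intersection tolerance, and the rows are flattened into a single list. A standalone sketch of that step, assuming a local disclosure PDF path:

    import pdfplumber

    table_settings = {
        "intersection_tolerance": 6,  # minimum allowable tolerance to grab all tables
    }

    # "disclosure.pdf" is a placeholder path for illustration.
    with pdfplumber.open("disclosure.pdf") as pdf:
        rows = [
            tuple(row)
            for page in pdf.pages
            if (table := page.extract_table(table_settings=table_settings))
            for row in table
        ]
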
@@ -101,34 +104,54 @@ def parse_pdf(pdf: pdfplumber.PDF) -> dict[str, dict[str, str | None]]:
             grouped_rows["REPORTING INDIVIDUAL – Current Filing Status"]
         ),
         "income sources": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Income Source(s)"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Income Source(s)"
+            ]
         ),
         "specializations": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Areas of Specialization"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Areas of Specialization"
+            ]
         ),
         "consulting or lobbying": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Consulting and/or Lobbying"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE - Consulting and/or Lobbying"
+            ]
         ),
         "real estate": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Real Estate"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Real Estate"
+            ]
         ),
         "other business": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Other Business"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Other Business"
+            ]
        ),
         "board membership": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nBoard Membership"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nBoard Membership"
+            ]
         ),
         "professional licenses": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Professional License(s)"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – Professional License(s)"
+            ]
         ),
         "provisions to state agencies": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nGoods and/or Services Provided to State Agencies"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nGoods and/or Services Provided to State Agencies"
+            ]
         ),
         "state agency representation": _parse_filing_status(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nState Agency Representation"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE\nState Agency Representation"
+            ]
         ),
         "general info": _parse_general_info(
-            grouped_rows["REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – General Information"]
+            grouped_rows[
+                "REPORTING INDIVIDUAL & REPORTING INDIVIDUAL’S SPOUSE – General Information"
+            ]
         ),
     }
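
For orientation (a sketch under assumptions, not the repository's code): _is_section treats a row as a section heading when its first cell looks like a numbered, capitalized title or is the statutory signature note, and _group_rows presumably buckets the rows that follow under that heading so parse_pdf can look sections up by name as above. Roughly:

    import re
    from typing import Iterable

    Row = tuple[str, ...]
    Rows = list[Row]

    def _is_section(row: Row) -> bool:
        # A numbered, capitalized heading (e.g. "3. REAL ESTATE") or the signature note.
        return (
            re.match(r"^\d+\. ?[A-Z]", row[0]) is not None
            or row[0].startswith("*Pursuant to NMSA 1978 §")
        )

    def _group_rows(rows: Iterable[Row]) -> dict[str, Rows]:
        # Assumed behavior: the repository's actual _group_rows is not in this diff.
        grouped: dict[str, Rows] = {}
        current = None
        for row in rows:
            if _is_section(row):
                # Assumption: strip the "N. " prefix so keys match the headings used above.
                current = re.sub(r"^\d+\. ?", "", row[0])
                grouped[current] = []
            elif current is not None:
                grouped[current].append(row)
        return grouped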

40 changes: 14 additions & 26 deletions scrapers/financial_disclosure/scrape_financial_disclosures.py
@@ -106,7 +106,6 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
             open("data/intermediate/provisions.csv", "w") as provisions_file,
             open("data/intermediate/representation.csv", "w") as representation_file,
             open("data/intermediate/general.csv", "w") as general_file,
-
         ):
             filer_writer = csv.DictWriter(
                 filer_file,
@@ -174,41 +173,36 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                     "Reporting individual",
                     "Office / Board or Commission / Agency Name",
                     "Date Assumed Office, Employed, or Appointed",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             income_writer = csv.DictWriter(
                 income_file,
                 [
                     "Income source (*see pg. 4):",
                     "Received by (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             specializations_writer = csv.DictWriter(
                 specializations_file,
                 [
                     "Describe the major areas of specialization or sources of income.",
                     "Received by (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             consulting_writer = csv.DictWriter(
                 consulting_file,
                 [
                     "Client name & address:",
                     "Represented by: List the name of the reporting individual’s firm or spouse’s firm",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             real_estate_writer = csv.DictWriter(
                 real_estate_file,
-                [
-                    "Owner",
-                    "County",
-                    "General Description",
-                    "ReportID"
-                ],
+                ["Owner", "County", "General Description", "ReportID"],
             )
             business_writer = csv.DictWriter(
                 business_file,
@@ -217,47 +211,44 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                     "Position held:",
                     "General statement of business purpose:",
                     "Received by (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             membership_writer = csv.DictWriter(
                 membership_file,
                 [
                     "Name of business:",
                     "Board member (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             licenses_writer = csv.DictWriter(
                 licenses_file,
                 [
                     "Type of license:",
                     "Individual holding license (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             provisions_writer = csv.DictWriter(
                 provisions_file,
                 [
                     "State agency to which goods and/or services were provided:",
                     "Individual providing goods or services (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             representation_writer = csv.DictWriter(
                 representation_file,
                 [
                     "State agency (other than a court):",
                     "Individual assisting client (list the name of the reporting individual or spouse):",
-                    "ReportID"
+                    "ReportID",
                 ],
             )
             general_writer = csv.DictWriter(
                 general_file,
-                [
-                    "Input",
-                    "ReportID"
-                ],
+                ["Input", "ReportID"],
             )
 
             # pdf fields that return lists of similarly structured dicts
@@ -348,8 +339,8 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                     "corrector": levenshtein_distance.SpellingCorrector(
                         general_writer.fieldnames
                     ),
-                    "accessor": "general info"
-                }
+                    "accessor": "general info",
+                },
             }
 
             for pdf_field in mapping_dict.values():
@@ -403,8 +394,5 @@ def scrape(self) -> Generator[dict[str, dict], None, None]:
                 writer = pdf_field["writer"]
 
                 for entry in extracted_info[accessor]:
-                    field_data = {
-                        corrector.correct(k): v
-                        for k, v in entry.items()
-                    }
+                    field_data = {corrector.correct(k): v for k, v in entry.items()}
                     writer.writerow(field_data | {"ReportID": report_id})
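
Taken together with the mapping_dict hunk above, the pattern appears to be: each PDF field gets a DictWriter, a SpellingCorrector built over that writer's fieldnames, and an accessor into the parsed-PDF dict, and the final loop snaps whatever header text the PDF yielded back onto the canonical column names before writing. A self-contained sketch under those assumptions, using difflib as a stand-in for the project's Levenshtein-based corrector:

    import csv
    import difflib

    class SpellingCorrector:
        """Stand-in for levenshtein_distance.SpellingCorrector, using difflib instead."""

        def __init__(self, corpus):
            self.corpus = list(corpus)

        def correct(self, text: str) -> str:
            # Closest canonical fieldname by similarity ratio.
            return max(self.corpus, key=lambda x: difflib.SequenceMatcher(None, x, text).ratio())

    # Hypothetical parsed output and report id, trimmed to one field.
    extracted_info = {"general info": [{"Inp ut": "some value"}]}  # garbled header from the PDF
    report_id = "12345"

    with open("general.csv", "w", newline="") as general_file:  # simplified path for the sketch
        general_writer = csv.DictWriter(general_file, ["Input", "ReportID"])
        general_writer.writeheader()

        mapping_dict = {
            "general": {
                "writer": general_writer,
                "corrector": SpellingCorrector(general_writer.fieldnames),
                "accessor": "general info",
            },
        }

        for pdf_field in mapping_dict.values():
            writer, corrector = pdf_field["writer"], pdf_field["corrector"]
            for entry in extracted_info[pdf_field["accessor"]]:
                field_data = {corrector.correct(k): v for k, v in entry.items()}
                writer.writerow(field_data | {"ReportID": report_id})
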
17 changes: 11 additions & 6 deletions scrapers/office/scrape_search.py
@@ -98,7 +98,7 @@ def _versions(self, filing):
         if filing["ReportVersionID"] > 1:
             version_response = self.get(
                 "https://login.cfis.sos.state.nm.us/api///Filing/GetFilingHistory",
-                params={"reportID": filing["ReportVersionID"]},
+                params={"reportID": filing["ReportID"]},
             )
             versions.extend(version_response.json())
 
@@ -152,6 +152,7 @@ def __init__(self, **kwargs):
         self.result_key = "CandidateInformationslist"
         self.id_key = "IDNumber"
         self.detail_endpoint = "https://login.cfis.sos.state.nm.us/api///Organization/GetCandidatesInformation"
+        self.committee_key = "PoliticalPartyCommitteeName"
 
     def _filings(self, search_result):
         payload = {
@@ -188,6 +189,7 @@ def __init__(self, **kwargs):
         self.result_key = "CommitteeInformationlist"
         self.id_key = "IdNumber"
         self.detail_endpoint = "https://login.cfis.sos.state.nm.us/api///Organization/GetCommitteeInformation"
+        self.committee_key = "CommitteeName"
 
     def _filings(self, search_result):
         payload = {
@@ -222,19 +224,22 @@ def _filings(self, search_result):
 
 if __name__ == "__main__":
     import csv
+    import pathlib
     from scrapelib.cache import FileCache
 
     cache = FileCache("cache")
 
+    output_dir = pathlib.Path("data/processed")
+
     if sys.argv[1] == "candidates":
         scraper_klass = CandidateScraper
-        committee_file = open("candidate_committees.csv", "w")
-        filing_file = open("candidate_committee_filings.csv", "w")
+        committee_file = open(output_dir / "candidate_committees.csv", "w")
+        filing_file = open(output_dir / "candidate_committee_filings.csv", "w")
 
     elif sys.argv[1] == "committees":
         scraper_klass = CommitteeScraper
-        committee_file = open("pac_committees.csv", "w")
-        filing_file = open("pac_committee_filings.csv", "w")
+        committee_file = open(output_dir / "pac_committees.csv", "w")
+        filing_file = open(output_dir / "pac_committee_filings.csv", "w")
 
     scraper = scraper_klass(requests_per_minute=0, retry_attempts=3)
     scraper.timeout = 10
@@ -267,7 +272,7 @@ def _filings(self, search_result):
 
         extra = {
             "StateID": years[0]["StateID"],
-            "CommitteeName": years[0]["CommitteeName"],
+            "CommitteeName": years[0][scraper.committee_key],
         }
 
         for filing in filings:
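
The new committee_key attribute keeps the shared __main__ loop generic: candidate search results carry the committee name under PoliticalPartyCommitteeName, while PAC results use CommitteeName, and years[0][scraper.committee_key] reads whichever applies. Reduced to a sketch (class bodies abbreviated, data hypothetical):

    class CandidateScraper:
        committee_key = "PoliticalPartyCommitteeName"

    class CommitteeScraper:
        committee_key = "CommitteeName"

    # Hypothetical registration-year record, trimmed to the fields used here.
    years = [{"StateID": 4321, "PoliticalPartyCommitteeName": "Example Committee"}]

    scraper = CandidateScraper()
    extra = {
        "StateID": years[0]["StateID"],
        "CommitteeName": years[0][scraper.committee_key],  # resolves per scraper type
    }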
