Merge pull request #599 from biglocalnews/stucka-patch-8
Patch DC, CT for #596
stucka authored Jan 8, 2024
2 parents 62ab2ff + ecafcb3 · commit 1b9c46c
Showing 2 changed files with 27 additions and 6 deletions.
18 changes: 15 additions & 3 deletions warn/scrapers/ct.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -40,8 +41,20 @@ def scrape(
     # We start in 2015
     current_year = datetime.now().year
 
-    # Get the full range of years
-    year_range = range(2015, current_year + 1)
+    if cache.exists(f"ct/{current_year}.html"):
+        # Get the full range of years
+        year_range = range(2015, current_year + 1)
+    else:
+        url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
+        r = requests.head(url)
+        if r.ok:
+            logger.debug(f"Found first entry for {current_year}")
+            year_range = range(2015, current_year + 1)
+        else:
+            logger.debug(
+                f"No data for {current_year} found at {url}. Dropping back a year."
+            )
+            year_range = range(2015, current_year + 0)
 
     output_rows = []
     for year in year_range:
@@ -100,7 +113,6 @@ def _scrape_table(table) -> list:
     row_list = []
     # loop over table to process each row, skipping the header
     for table_row in table[0].find_all("tr")[1:]:
-
         # Get all the cells
         table_cells = table_row.find_all("td")
 
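The CT change reduces to one decision: if this year's page is already cached, keep the full year range; otherwise probe the state site with a lightweight HEAD request and, if the page is not up yet, stop the range one year earlier. A minimal standalone sketch of that pattern follows. The helper name resolve_year_range is hypothetical and not part of the repo; only the URL template and the 2015 start year come from the patch.

    from datetime import datetime

    import requests


    def resolve_year_range(have_cached_current: bool, current_year: int) -> range:
        """Decide whether the scrape range should include the current year.

        Hypothetical helper illustrating the patch's fallback logic.
        """
        if have_cached_current:
            # A cached copy proves the page existed at some point; keep the year.
            return range(2015, current_year + 1)
        url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
        # HEAD retrieves only headers, so probing a not-yet-published page is cheap.
        if requests.head(url).ok:
            return range(2015, current_year + 1)
        # Page not published yet: drop the current year from the range.
        return range(2015, current_year)


    years = resolve_year_range(False, datetime.now().year)

Note that range(2015, current_year + 0) in the patch is just range(2015, current_year); the + 0 keeps the line visually parallel with the + 1 branches.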
15 changes: 12 additions & 3 deletions warn/scrapers/dc.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -38,13 +39,22 @@ def scrape(
     # Get the root page
     today = datetime.today()
     current_year = today.year
-    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+    targetfile = f"dc/{current_year}.html"
+    if not cache.exists(targetfile):  # Check if we have an entry for the latest year
+        url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+        r = requests.head(url)
+        if not r.ok:
+            logger.debug(f"Still no data found for {current_year}. Falling back.")
+            current_year = today.year - 1
+            targetfile = f"dc/{current_year}.html"
+
+    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
 
     r = utils.get_url(url)
     r.encoding = "utf-8"
     root_html = r.text
 
     # Save it to the cache
-    cache.write(f"dc/{current_year}.html", root_html)
+    cache.write(targetfile, root_html)
 
     # Parse the list of links
@@ -70,7 +80,6 @@
         root_html,
     ]
     for href in link_lookup.values():
-
         # Request the HTML
         r = utils.get_url(href)
         r.encoding = "utf-8"
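The DC change applies the same idea in the other order: probe only when the cache lacks this year's file, and on a failed probe fall back exactly one year, updating both the cache key and the URL. A rough sketch of that selection step, assuming requests; pick_year is a hypothetical helper name, not in the repo.

    from datetime import datetime

    import requests

    URL_TEMPLATE = (
        "https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{year}"
    )


    def pick_year(cache_has_file, year: int) -> int:
        """Return the newest year that is either cached or live on does.dc.gov.

        Hypothetical helper; cache_has_file stands in for cache.exists.
        """
        if cache_has_file(f"dc/{year}.html"):
            return year
        if requests.head(URL_TEMPLATE.format(year=year)).ok:
            return year
        # Like the patch, fall back a single year rather than looping further back.
        return year - 1


    year = pick_year(lambda key: False, datetime.today().year)
    url = URL_TEMPLATE.format(year=year)

One consequence the diff appears to keep: the root page is re-fetched and re-written to the cache even when a cached copy exists; the cache check only steers which year gets requested.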
