Merge pull request #599 from biglocalnews/stucka-patch-8
Patch DC, CT for #596
stucka authored Jan 8, 2024
2 parents 62ab2ff + ecafcb3 · commit 1b9c46c
Showing 2 changed files with 27 additions and 6 deletions.
18 changes: 15 additions & 3 deletions warn/scrapers/ct.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -40,8 +41,20 @@ def scrape(
     # We start in 2015
     current_year = datetime.now().year
 
-    # Get the full range of years
-    year_range = range(2015, current_year + 1)
+    if cache.exists(f"ct/{current_year}.html"):
+        # Get the full range of years
+        year_range = range(2015, current_year + 1)
+    else:
+        url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
+        r = requests.head(url)
+        if r.ok:
+            logger.debug(f"Found first entry for {current_year}")
+            year_range = range(2015, current_year + 1)
+        else:
+            logger.debug(
+                f"No data for {current_year} found at {url}. Dropping back a year."
+            )
+            year_range = range(2015, current_year + 0)
 
     output_rows = []
     for year in year_range:
@@ -100,7 +113,6 @@ def _scrape_table(table) -> list:
     row_list = []
     # loop over table to process each row, skipping the header
     for table_row in table[0].find_all("tr")[1:]:
-
         # Get all the cells
         table_cells = table_row.find_all("td")
 
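The CT change reduces to one decision: if this year's page is already cached, keep the full year range; otherwise probe the state site with a lightweight HEAD request and, if the page is not up yet, stop the range one year earlier. A minimal standalone sketch of that pattern follows. The helper name resolve_year_range is hypothetical and not part of the repo; only the URL template and the 2015 start year come from the patch.

    from datetime import datetime

    import requests


    def resolve_year_range(have_cached_current: bool, current_year: int) -> range:
        """Decide whether the scrape range should include the current year.

        Hypothetical helper illustrating the patch's fallback logic.
        """
        if have_cached_current:
            # A cached copy proves the page existed at some point; keep the year.
            return range(2015, current_year + 1)
        url = f"https://www.ctdol.state.ct.us/progsupt/bussrvce/warnreports/warn{current_year}.htm"
        # HEAD retrieves only headers, so probing a not-yet-published page is cheap.
        if requests.head(url).ok:
            return range(2015, current_year + 1)
        # Page not published yet: drop the current year from the range.
        return range(2015, current_year)


    years = resolve_year_range(False, datetime.now().year)

Note that range(2015, current_year + 0) in the patch is just range(2015, current_year); the + 0 keeps the line visually parallel with the + 1 branches.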
15 changes: 12 additions & 3 deletions warn/scrapers/dc.py
@@ -5,6 +5,7 @@
 from pathlib import Path
 
 from bs4 import BeautifulSoup
+import requests
 
 from .. import utils
 from ..cache import Cache
@@ -38,13 +39,22 @@ def scrape(
     # Get the root page
     today = datetime.today()
     current_year = today.year
-    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+    targetfile = f"dc/{current_year}.html"
+    if not cache.exists(targetfile):  # Check if we have an entry for the latest year
+        url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
+        r = requests.head(url)
+        if not r.ok:
+            logger.debug(f"Still no data found for {current_year}. Falling back.")
+            current_year = today.year - 1
+            targetfile = f"dc/{current_year}.html"
+
+    url = f"https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{current_year}"
 
     r = utils.get_url(url)
     r.encoding = "utf-8"
     root_html = r.text
 
     # Save it to the cache
-    cache.write(f"dc/{current_year}.html", root_html)
+    cache.write(targetfile, root_html)
 
     # Parse the list of links
@@ -70,7 +80,6 @@
         root_html,
     ]
     for href in link_lookup.values():
-
         # Request the HTML
         r = utils.get_url(href)
         r.encoding = "utf-8"
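The DC change applies the same idea in the other order: probe only when the cache lacks this year's file, and on a failed probe fall back exactly one year, updating both the cache key and the URL. A rough sketch of that selection step, assuming requests; pick_year is a hypothetical helper name, not in the repo.

    from datetime import datetime

    import requests

    URL_TEMPLATE = (
        "https://does.dc.gov/page/industry-closings-and-layoffs-warn-notifications-{year}"
    )


    def pick_year(cache_has_file, year: int) -> int:
        """Return the newest year that is either cached or live on does.dc.gov.

        Hypothetical helper; cache_has_file stands in for cache.exists.
        """
        if cache_has_file(f"dc/{year}.html"):
            return year
        if requests.head(URL_TEMPLATE.format(year=year)).ok:
            return year
        # Like the patch, fall back a single year rather than looping further back.
        return year - 1


    year = pick_year(lambda key: False, datetime.today().year)
    url = URL_TEMPLATE.format(year=year)

One consequence the diff appears to keep: the root page is re-fetched and re-written to the cache even when a cached copy exists; the cache check only steers which year gets requested.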
