Skip to content

Commit

Permalink
Merge pull request #603 from biglocalnews/stucka-patch-10
Browse files Browse the repository at this point in the history
Back out #600 for now
  • Loading branch information
stucka authored Jan 19, 2024
2 parents e1261e0 + 81e0225 commit c8aedeb
Showing 1 changed file with 6 additions and 9 deletions.
15 changes: 6 additions & 9 deletions warn/scrapers/hi.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
import datetime
import logging
from pathlib import Path
-from urllib.parse import quote

from bs4 import BeautifulSoup

from .. import utils

__authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html", "pdf"]
+__tags__ = ["html"]
__source__ = {
"name": "Workforce Development Hawaii",
"url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
Expand All @@ -29,17 +28,15 @@ def scrape(
cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
Returns: the Path where the file is written
"""
-cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A" # Use Google Cache, per #600
-
-firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
+firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
soup = BeautifulSoup(firstpage.text, features="html5lib")
pagesection = soup.select("div.primary-content")[0]
subpageurls = []
for atag in pagesection.find_all("a"):
href = atag["href"]
if href.endswith("/"):
-href = href # [:-1]
-subpageurls.append(cacheprefix + quote(href))
+href = href[:-1]
+subpageurls.append(href)

headers = ["Company", "Date", "PDF url", "location", "jobs"]
data = [headers]
Expand Down Expand Up @@ -88,8 +85,8 @@ def scrape(
row.append(dates[i])

row.append(url)
-row.append(None) # location
-row.append(None) # jobs
+row.append(None) # location
+row.append(None) # jobs
data.append(row)

output_csv = data_dir / "hi.csv"
Expand Down

0 comments on commit c8aedeb

Please sign in to comment.