From 81e0225f492940faf4f86c6912649497e47cf56d Mon Sep 17 00:00:00 2001
From: Mike Stucka
Date: Fri, 19 Jan 2024 13:17:57 -0500
Subject: [PATCH] Back out #600 for now

Scraper will break, transformer will come back.
---
 warn/scrapers/hi.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/warn/scrapers/hi.py b/warn/scrapers/hi.py
index 1052bf3..78421dd 100644
--- a/warn/scrapers/hi.py
+++ b/warn/scrapers/hi.py
@@ -1,14 +1,13 @@
 import datetime
 import logging
 from pathlib import Path
-from urllib.parse import quote
 
 from bs4 import BeautifulSoup
 
 from .. import utils
 
 __authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html", "pdf"]
+__tags__ = ["html"]
 __source__ = {
     "name": "Workforce Development Hawaii",
     "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -29,17 +28,15 @@ def scrape(
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
 
     Returns: the Path where the file is written
     """
-    cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"  # Use Google Cache, per #600
-
-    firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
+    firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
     soup = BeautifulSoup(firstpage.text, features="html5lib")
     pagesection = soup.select("div.primary-content")[0]
     subpageurls = []
     for atag in pagesection.find_all("a"):
         href = atag["href"]
         if href.endswith("/"):
-            href = href  # [:-1]
-        subpageurls.append(cacheprefix + quote(href))
+            href = href[:-1]
+        subpageurls.append(href)
     headers = ["Company", "Date", "PDF url", "location", "jobs"]
     data = [headers]
@@ -88,8 +85,8 @@ def scrape(
             row.append(dates[i])
             row.append(url)
-            row.append(None) # location
-            row.append(None) # jobs
+            row.append(None)  # location
+            row.append(None)  # jobs
             data.append(row)
 
     output_csv = data_dir / "hi.csv"