Merge pull request #603 from biglocalnews/stucka-patch-10

Back out #600 for now
biglocalnews · Jan 19, 2024 · c8aedeb · c8aedeb
2 parents e1261e0 + 81e0225
commit c8aedeb
Showing 1 changed file with 6 additions and 9 deletions.
diff --git a/warn/scrapers/hi.py b/warn/scrapers/hi.py
@@ -1,14 +1,13 @@
 import datetime
 import logging
 from pathlib import Path
-from urllib.parse import quote
 
 from bs4 import BeautifulSoup
 
 from .. import utils
 
 __authors__ = ["Ash1R", "stucka"]
-__tags__ = ["html", "pdf"]
+__tags__ = ["html"]
 __source__ = {
     "name": "Workforce Development Hawaii",
     "url": "https://labor.hawaii.gov/wdc/real-time-warn-updates/",
@@ -29,17 +28,15 @@ def scrape(
     cache_dir -- the Path where results can be cached (default WARN_CACHE_DIR)
     Returns: the Path where the file is written
     """
-    cacheprefix = "https://webcache.googleusercontent.com/search?q=cache%3A"    # Use Google Cache, per #600
-
-    firstpage = utils.get_url(cacheprefix + quote("https://labor.hawaii.gov/wdc/real-time-warn-updates/"))
+    firstpage = utils.get_url("https://labor.hawaii.gov/wdc/real-time-warn-updates/")
     soup = BeautifulSoup(firstpage.text, features="html5lib")
     pagesection = soup.select("div.primary-content")[0]
     subpageurls = []
     for atag in pagesection.find_all("a"):
         href = atag["href"]
         if href.endswith("/"):
-            href = href         # [:-1]
-        subpageurls.append(cacheprefix + quote(href))
+            href = href[:-1]
+        subpageurls.append(href)
 
     headers = ["Company", "Date", "PDF url", "location", "jobs"]
     data = [headers]
@@ -88,8 +85,8 @@ def scrape(
             row.append(dates[i])
 
             row.append(url)
-            row.append(None)     # location
-            row.append(None)     # jobs
+            row.append(None)  # location
+            row.append(None)  # jobs
             data.append(row)
 
     output_csv = data_dir / "hi.csv"