fix(colo, coloctapp): update scraper for api changes
Solves: #1198

The API had changed: a query without date filters returned older documents instead of the most recent ones.
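In practice, the fix means every request now carries an explicit `date=start..end` window rather than relying on the API's default ordering. Below is a minimal sketch of the kind of URL the patch builds, assuming a hypothetical placeholder base URL; the real scraper sends many more parameters:

    from datetime import datetime, timedelta
    from urllib.parse import urlencode

    BASE_URL = "https://example.com/api/search"  # hypothetical stand-in for base_url

    # Default window mirroring update_url(): last week through tomorrow
    today = datetime.now()
    start, end = today - timedelta(7), today + timedelta(1)
    params = {
        "date": f"{start:%Y-%m-%d}..{end:%Y-%m-%d}",  # explicit date filter
        "sort": "date",
        "t": str(datetime.now().timestamp())[:10],  # cache-busting timestamp
    }
    print(f"{BASE_URL}?{urlencode(params)}")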
grossir committed Nov 20, 2024
Parent: 2567793 · Commit: 5d2f9e7
Showing 1 changed file with 23 additions and 12 deletions.
juriscraper/opinions/united_states/state/colo.py
@@ -12,8 +12,8 @@
 - 2024-07-04: Update to new site, grossir
 """
 
-from datetime import date, datetime
-from typing import Tuple
+from datetime import date, datetime, timedelta
+from typing import Optional, Tuple
 from urllib.parse import urlencode
 
 from juriscraper.AbstractSite import logger
@@ -31,7 +31,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.court_id = self.__module__
         self.params = {
-            "product_id": "WW",
+            "product_id": "COLORADO",
             "jurisdiction": "US",
             "content_type": "2",
             "court": self.api_court_code,
@@ -40,14 +40,13 @@
             "per_page": "30",  # Server breaks down when per_page=500, returns 503
             "page": "1",
             "sort": "date",
-            "type": "document",
             "include_local_exclusive": "true",
             "cbm": "6.0|361.0|5.0|9.0|4.0|2.0=0.01|400.0|1.0|0.001|1.5|0.2",
             "locale": "en",
             "hide_ct6": "true",
-            "t": str(datetime.now().timestamp())[:10],
+            "type": "document",
         }
-        self.url = f"{self.base_url}?{urlencode(self.params)}"
+        self.update_url()
 
         # Request won't work without some of these X- headers
         self.request["headers"].update(
@@ -123,19 +122,31 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
         :return None
         """
         logger.info("Backscraping for range %s %s", *dates)
+        self.update_url(dates)
+        self.html = self._download()
+        self._process_html()
+
+    def update_url(self, dates: Optional[Tuple[date]] = None) -> None:
+        """
+        Set URL with date filters and current timestamp.
+        Request with no date filter was returning very old documents
+        instead of the most recent ones
+        :param dates: start and end date tuple. If not present,
+            scrape last week
+        """
+        if not dates:
+            today = datetime.now()
+            dates = (today - timedelta(7), today + timedelta(1))
+
         start = dates[0].strftime("%Y-%m-%d")
         end = dates[1].strftime("%Y-%m-%d")
         timestamp = str(datetime.now().timestamp())[:10]
         params = {**self.params}
         params.update(
             {
                 "date": f"{start}..{end}",
                 # These are duplicated by the frontend too
                 "locale": ["en", "en"],
                 "hide_ct6": ["true", "true"],
-                "t": [timestamp, timestamp],
+                "t": timestamp,
             }
         )
         self.url = f"{self.base_url}?{urlencode(params)}"
-        self.html = self._download()
-        self._process_html()
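One stdlib detail behind the "duplicated by the frontend too" comment: `urlencode` only expands list values into repeated keys when `doseq=True` is passed, so if the query is built exactly as shown, the list-valued `locale` and `hide_ct6` entries would be serialized as their Python reprs. A quick illustration of the difference (not part of the patch):

    from urllib.parse import urlencode

    params = {"locale": ["en", "en"], "hide_ct6": ["true", "true"]}

    # Default: list values are serialized as their Python repr
    print(urlencode(params))
    # locale=%5B%27en%27%2C+%27en%27%5D&hide_ct6=%5B%27true%27%2C+%27true%27%5D

    # doseq=True: each element becomes a repeated key, matching the frontend
    print(urlencode(params, doseq=True))
    # locale=en&locale=en&hide_ct6=true&hide_ct6=true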
