From 5d48d85d9e8e18eb5198ffe5045c8704547db4f1 Mon Sep 17 00:00:00 2001 From: Adam Kariv Date: Sun, 24 Nov 2024 18:43:07 +0200 Subject: [PATCH] User agent for scraping? --- odds/backend/scanner/website/website_scanner.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/odds/backend/scanner/website/website_scanner.py b/odds/backend/scanner/website/website_scanner.py index 768e16c..a44990b 100644 --- a/odds/backend/scanner/website/website_scanner.py +++ b/odds/backend/scanner/website/website_scanner.py @@ -39,7 +39,7 @@ def __call__(self, tag, name, value): class Scraper: headers = { - 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0' + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36' } WORKER_COUNT = 5 CACHE = CACHE_DIR / 'web-scraper' @@ -89,9 +89,9 @@ async def scrape(self, url: str) -> list[str]: final_url = data.get('final_url') if content is None: - async with httpx.AsyncClient(headers=self.headers, timeout=30) as client: + async with httpx.AsyncClient() as client: await asyncio.sleep(self.WORKER_COUNT / 4) - r = await client.get(url, follow_redirects=True) + r = await client.get(url, follow_redirects=True, headers=self.headers, timeout=30) r.raise_for_status() # check content type to ensure it's html: content_type = r.headers.get('content-type', '').lower()