diff --git a/duckduckgo_search/ddg.py b/duckduckgo_search/ddg.py index 5b466aef..ef5e119b 100755 --- a/duckduckgo_search/ddg.py +++ b/duckduckgo_search/ddg.py @@ -1,7 +1,5 @@ import logging -from requests import ConnectionError - from .utils import SESSION, _do_output, _get_vqd, _normalize logger = logging.getLogger(__name__) @@ -39,7 +37,7 @@ def ddg( # search safesearch_base = {"On": 1, "Moderate": -1, "Off": -2} - params = { + payload = { "q": keywords, "l": region, "p": safesearch_base[safesearch], @@ -50,20 +48,15 @@ def ddg( } results, cache = [], set() - while len(results) < max_results and params["s"] < 200: + while payload["s"] < min(max_results, 200) or len(results) < max_results: # request search results from duckduckgo page_data = None try: - resp = SESSION.get("https://links.duckduckgo.com/d.js", params=params) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp = SESSION.get("https://links.duckduckgo.com/d.js", params=payload) + resp.raise_for_status() page_data = resp.json().get("results", None) - except ConnectionError: - logger.error("Connection Error.") - break except Exception: - logger.exception("Exception.", exc_info=True) + logger.exception("") break if not page_data: @@ -74,7 +67,7 @@ def ddg( # try pagination if "n" in row: - params["s"] += i + payload["s"] += i break # collect results diff --git a/duckduckgo_search/ddg_images.py b/duckduckgo_search/ddg_images.py index f3c03ab6..5f7f10e1 100755 --- a/duckduckgo_search/ddg_images.py +++ b/duckduckgo_search/ddg_images.py @@ -3,8 +3,6 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from datetime import datetime -from requests import ConnectionError - from .utils import SESSION, _do_output, _download_image, _get_vqd logger = logging.getLogger(__name__) @@ -24,7 +22,7 @@ def ddg_images( output=None, download=False, ): - """DuckDuckGo images search. + """DuckDuckGo images search. Query params: https://duckduckgo.com/params Args: keywords (str): keywords for query. @@ -80,19 +78,14 @@ def ddg_images( } results, cache = [], set() - while payload["s"] < max_results or len(results) < max_results: + while payload["s"] < min(max_results, 1000) or len(results) < max_results: page_data = None try: resp = SESSION.get("https://duckduckgo.com/i.js", params=payload) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp.raise_for_status() page_data = resp.json().get("results", None) - except ConnectionError: - logger.error("Connection Error.") - break except Exception: - logger.exception("Exception.", exc_info=True) + logger.exception("") break if not page_data: diff --git a/duckduckgo_search/ddg_maps.py b/duckduckgo_search/ddg_maps.py index 71ecc7dc..c1ac5934 100755 --- a/duckduckgo_search/ddg_maps.py +++ b/duckduckgo_search/ddg_maps.py @@ -4,7 +4,6 @@ from decimal import Decimal import requests -from requests import ConnectionError from .utils import SESSION, _do_output, _get_vqd, _normalize @@ -44,7 +43,7 @@ def ddg_maps( max_results=None, output=None, ): - """DuckDuckGo maps search + """DuckDuckGo maps search. Query params: https://duckduckgo.com/params Args: keywords: keywords for query @@ -110,17 +109,12 @@ def ddg_maps( params=params, headers=headers, ) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp.raise_for_status() coordinates = resp.json()[0]["boundingbox"] lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2]) lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3]) - except ConnectionError: - logger.error("Connection Error.") - return except Exception: - logger.exception("Exception.", exc_info=True) + logger.exception("") return # if a radius is specified, expand the search square @@ -154,15 +148,10 @@ def ddg_maps( page_data = None try: resp = SESSION.get("https://duckduckgo.com/local.js", params=params) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp.raise_for_status() page_data = resp.json()["results"] - except ConnectionError: - logger.error("Connection Error.") - break except Exception: - logger.exception("Exception.", exc_info=True) + logger.exception("") break if not page_data: diff --git a/duckduckgo_search/ddg_news.py b/duckduckgo_search/ddg_news.py index a50ebf96..16e9dee7 100755 --- a/duckduckgo_search/ddg_news.py +++ b/duckduckgo_search/ddg_news.py @@ -1,8 +1,6 @@ import logging from datetime import datetime -from requests import ConnectionError - from .utils import SESSION, _do_output, _get_vqd, _normalize logger = logging.getLogger(__name__) @@ -16,7 +14,7 @@ def ddg_news( max_results=25, output=None, ): - """DuckDuckGo news search + """DuckDuckGo news search. Query params: https://duckduckgo.com/params Args: keywords: keywords for query. @@ -40,7 +38,7 @@ def ddg_news( # get news safesearch_base = {"On": 1, "Moderate": -1, "Off": -2} - params = { + payload = { "l": region, "o": "json", "noamp": "1", @@ -51,19 +49,14 @@ def ddg_news( "s": 0, } results, cache = [], set() - while params["s"] < min(max_results, 240) or len(results) < max_results: + while payload["s"] < min(max_results, 240) or len(results) < max_results: page_data = None try: - resp = SESSION.get("https://duckduckgo.com/news.js", params=params) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp = SESSION.get("https://duckduckgo.com/news.js", params=payload) + resp.raise_for_status() page_data = resp.json().get("results", None) - except ConnectionError: - logger.error("Connection Error.") - break except Exception: - logger.exception("Exception.", exc_info=True) + logger.exception("") break if not page_data: @@ -88,7 +81,7 @@ def ddg_news( break results.extend(page_results) # pagination - params["s"] += 30 + payload["s"] += 30 results = sorted(results[:max_results], key=lambda x: x["date"], reverse=True) if output: diff --git a/duckduckgo_search/ddg_translate.py b/duckduckgo_search/ddg_translate.py index 6425644c..e768cab3 100755 --- a/duckduckgo_search/ddg_translate.py +++ b/duckduckgo_search/ddg_translate.py @@ -1,7 +1,5 @@ import logging -from requests import ConnectionError - from .utils import SESSION, VQD_DICT, _do_output, _get_vqd logger = logging.getLogger(__name__) @@ -34,7 +32,7 @@ def ddg_translate( return None # translate - params = { + payload = { "vqd": vqd, "query": "translate", "from": from_, @@ -49,20 +47,16 @@ def ddg_translate( try: resp = SESSION.post( "https://duckduckgo.com/translation.js", - params=params, + params=payload, data=data.encode("utf-8"), ) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp.raise_for_status() result = resp.json() result["original"] = data results.append(result) - except ConnectionError: - logger.error("Connection Error.") except Exception: VQD_DICT.pop("translate", None) - logger.exception("Exception.", exc_info=True) + logger.exception("") if output: keywords = keywords[0] diff --git a/duckduckgo_search/ddg_videos.py b/duckduckgo_search/ddg_videos.py index cd9c976f..6836555f 100755 --- a/duckduckgo_search/ddg_videos.py +++ b/duckduckgo_search/ddg_videos.py @@ -1,7 +1,5 @@ import logging -from requests import ConnectionError - from .utils import SESSION, _do_output, _get_vqd logger = logging.getLogger(__name__) @@ -18,7 +16,7 @@ def ddg_videos( max_results=50, output=None, ): - """DuckDuckGo videos search + """DuckDuckGo videos search. Query params: https://duckduckgo.com/params Args: keywords: keywords for query. @@ -61,19 +59,14 @@ def ddg_videos( } results, cache = [], set() - while payload["s"] < max_results or len(results) < max_results: + while payload["s"] < min(max_results, 1000) or len(results) < max_results: page_data = None try: resp = SESSION.get("https://duckduckgo.com/v.js", params=payload) - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) + resp.raise_for_status() page_data = resp.json().get("results", None) - except ConnectionError: - logger.error("Connection Error.") - break except Exception: - logger.exception("Exception.", exc_info=True) + logger.exception("") break if not page_data: @@ -87,7 +80,7 @@ def ddg_videos( if not page_results: break results.extend(page_results) - # for pagination + # pagination payload["s"] += 60 results = results[:max_results] diff --git a/duckduckgo_search/utils.py b/duckduckgo_search/utils.py index 7b2eb7ad..32b6b0e6 100755 --- a/duckduckgo_search/utils.py +++ b/duckduckgo_search/utils.py @@ -7,17 +7,16 @@ from time import sleep import requests -from requests import ConnectionError, Timeout -SESSION = requests.Session() +logger = logging.getLogger(__name__) + HEADERS = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0", "Referer": "https://duckduckgo.com/", } +SESSION = requests.Session() SESSION.headers.update(HEADERS) -logger = logging.getLogger(__name__) - RE_CLEAN_HTML = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});") VQD_DICT = dict() @@ -37,28 +36,20 @@ def _get_vqd(keywords): resp = SESSION.post( "https://duckduckgo.com", data=payload, headers=HEADERS, timeout=10 ) - if resp.status_code == 200: - logger.info( - "%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds() - ) - vqd_index_start = resp.content.index(b"vqd='") + 5 - vqd_index_end = resp.content.index(b"'", vqd_index_start) - vqd_bytes = resp.content[vqd_index_start:vqd_index_end] - - if vqd_bytes: - # delete the first key to reduce memory consumption - if len(VQD_DICT) >= 32768: - VQD_DICT.pop(next(iter(VQD_DICT))) - VQD_DICT[keywords] = vqd_bytes - logger.info("keywords=%s. Got vqd=%s", keywords, vqd_bytes) - return vqd_bytes.decode() - logger.info("get_vqd(). response=%s", resp.status_code) - except Timeout: - logger.warning("Connection timeout in get_vqd().") - except ConnectionError: - logger.warning("Connection error in get_vqd().") - except Exception as ex: - logger.exception("Exception in get_vqd().", ex) + resp.raise_for_status() + vqd_index_start = resp.content.index(b"vqd='") + 5 + vqd_index_end = resp.content.index(b"'", vqd_index_start) + vqd_bytes = resp.content[vqd_index_start:vqd_index_end] + + if vqd_bytes: + # delete the first key to reduce memory consumption + if len(VQD_DICT) > 32768: + VQD_DICT.pop(next(iter(VQD_DICT))) + VQD_DICT[keywords] = vqd_bytes + return vqd_bytes.decode() + + except Exception: + logger.exception("") # refresh SESSION if not vqd prev_proxies = SESSION.proxies @@ -70,10 +61,10 @@ def _get_vqd(keywords): "keywords=%s. _get_vqd() is None. Refresh SESSION and retry...", keywords ) VQD_DICT.pop(keywords, None) - sleep(1) + sleep(0.25) # sleep to prevent blocking - sleep(1) + sleep(0.25) def _save_json(jsonfile, data): @@ -102,12 +93,8 @@ def _download_image(image_url, dir_path, filename): file.write(resp.content) logger.info("Image downloaded. image_url=%s", image_url) break - except Timeout: - logger.warning("Connection timeout. image_url=%s", image_url) - except ConnectionError: - logger.warning("Connection error. image_url=%s", image_url) except Exception: - logger.warning("Exception. {image_url=}.", exc_info=True) + logger.exception("") def _normalize(raw_html): diff --git a/duckduckgo_search/version.py b/duckduckgo_search/version.py index ba51cedf..55e47090 100755 --- a/duckduckgo_search/version.py +++ b/duckduckgo_search/version.py @@ -1 +1 @@ -__version__ = "2.2.2" +__version__ = "2.3.0"