v2.3.0 (#21)
1) VQD_DICT is now an LRU cache,
2) simplified logging,
3) corrected the while loop conditions,
4) small code improvements.
deedy5 authored Nov 2, 2022
1 parent 1c0d7ac commit 209616d
Showing 8 changed files with 52 additions and 110 deletions.
19 changes: 6 additions & 13 deletions duckduckgo_search/ddg.py
@@ -1,7 +1,5 @@
import logging

from requests import ConnectionError

from .utils import SESSION, _do_output, _get_vqd, _normalize

logger = logging.getLogger(__name__)
@@ -39,7 +37,7 @@ def ddg(

# search
safesearch_base = {"On": 1, "Moderate": -1, "Off": -2}
params = {
payload = {
"q": keywords,
"l": region,
"p": safesearch_base[safesearch],
@@ -50,20 +48,15 @@
}

results, cache = [], set()
while len(results) < max_results and params["s"] < 200:
while payload["s"] < min(max_results, 200) or len(results) < max_results:
# request search results from duckduckgo
page_data = None
try:
resp = SESSION.get("https://links.duckduckgo.com/d.js", params=params)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp = SESSION.get("https://links.duckduckgo.com/d.js", params=payload)
resp.raise_for_status()
page_data = resp.json().get("results", None)
except ConnectionError:
logger.error("Connection Error.")
break
except Exception:
logger.exception("Exception.", exc_info=True)
logger.exception("")
break

if not page_data:
@@ -74,7 +67,7 @@

# try pagination
if "n" in row:
params["s"] += i
payload["s"] += i
break

# collect results
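
The updated modules now share one pagination pattern: keep requesting pages while the offset "s" is below the endpoint's cap and fewer than max_results items have been collected, de-duplicate rows by URL, and stop as soon as a page comes back empty or a request raises. A minimal sketch of that loop, with a hypothetical fetch_page() standing in for the real SESSION.get(...).json() call and illustrative cap/step values:

# Simplified sketch of the shared pagination loop (not the library's exact code).
# fetch_page(payload) is a hypothetical helper that returns one page of raw results.
def paginate(fetch_page, max_results, cap=200, step=30):
    payload = {"s": 0}                        # "s" is the result offset sent to the endpoint
    results, cache = [], set()
    while payload["s"] < min(max_results, cap) or len(results) < max_results:
        try:
            page_data = fetch_page(payload)
        except Exception:
            break                             # any request error ends the loop
        if not page_data:
            break                             # an empty page means no more results
        for row in page_data:
            href = row.get("u")
            if href and href not in cache:    # de-duplicate by URL
                cache.add(href)
                results.append(row)
        payload["s"] += step                  # advance the offset for the next page
    return results[:max_results]
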
15 changes: 4 additions & 11 deletions duckduckgo_search/ddg_images.py
@@ -3,8 +3,6 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

from requests import ConnectionError

from .utils import SESSION, _do_output, _download_image, _get_vqd

logger = logging.getLogger(__name__)
@@ -24,7 +22,7 @@ def ddg_images(
output=None,
download=False,
):
"""DuckDuckGo images search.
"""DuckDuckGo images search. Query params: https://duckduckgo.com/params
Args:
keywords (str): keywords for query.
@@ -80,19 +78,14 @@
}

results, cache = [], set()
while payload["s"] < max_results or len(results) < max_results:
while payload["s"] < min(max_results, 1000) or len(results) < max_results:
page_data = None
try:
resp = SESSION.get("https://duckduckgo.com/i.js", params=payload)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp.raise_for_status()
page_data = resp.json().get("results", None)
except ConnectionError:
logger.error("Connection Error.")
break
except Exception:
logger.exception("Exception.", exc_info=True)
logger.exception("")
break

if not page_data:
21 changes: 5 additions & 16 deletions duckduckgo_search/ddg_maps.py
@@ -4,7 +4,6 @@
from decimal import Decimal

import requests
from requests import ConnectionError

from .utils import SESSION, _do_output, _get_vqd, _normalize

@@ -44,7 +43,7 @@ def ddg_maps(
max_results=None,
output=None,
):
"""DuckDuckGo maps search
"""DuckDuckGo maps search. Query params: https://duckduckgo.com/params
Args:
keywords: keywords for query
@@ -110,17 +109,12 @@
params=params,
headers=headers,
)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp.raise_for_status()
coordinates = resp.json()[0]["boundingbox"]
lat_t, lon_l = Decimal(coordinates[1]), Decimal(coordinates[2])
lat_b, lon_r = Decimal(coordinates[0]), Decimal(coordinates[3])
except ConnectionError:
logger.error("Connection Error.")
return
except Exception:
logger.exception("Exception.", exc_info=True)
logger.exception("")
return

# if a radius is specified, expand the search square
@@ -154,15 +148,10 @@
page_data = None
try:
resp = SESSION.get("https://duckduckgo.com/local.js", params=params)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp.raise_for_status()
page_data = resp.json()["results"]
except ConnectionError:
logger.error("Connection Error.")
break
except Exception:
logger.exception("Exception.", exc_info=True)
logger.exception("")
break

if not page_data:
21 changes: 7 additions & 14 deletions duckduckgo_search/ddg_news.py
@@ -1,8 +1,6 @@
import logging
from datetime import datetime

from requests import ConnectionError

from .utils import SESSION, _do_output, _get_vqd, _normalize

logger = logging.getLogger(__name__)
@@ -16,7 +14,7 @@ def ddg_news(
max_results=25,
output=None,
):
"""DuckDuckGo news search
"""DuckDuckGo news search. Query params: https://duckduckgo.com/params
Args:
keywords: keywords for query.
@@ -40,7 +38,7 @@

# get news
safesearch_base = {"On": 1, "Moderate": -1, "Off": -2}
params = {
payload = {
"l": region,
"o": "json",
"noamp": "1",
@@ -51,19 +49,14 @@
"s": 0,
}
results, cache = [], set()
while params["s"] < min(max_results, 240) or len(results) < max_results:
while payload["s"] < min(max_results, 240) or len(results) < max_results:
page_data = None
try:
resp = SESSION.get("https://duckduckgo.com/news.js", params=params)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp = SESSION.get("https://duckduckgo.com/news.js", params=payload)
resp.raise_for_status()
page_data = resp.json().get("results", None)
except ConnectionError:
logger.error("Connection Error.")
break
except Exception:
logger.exception("Exception.", exc_info=True)
logger.exception("")
break

if not page_data:
@@ -88,7 +81,7 @@ def ddg_news(
break
results.extend(page_results)
# pagination
params["s"] += 30
payload["s"] += 30

results = sorted(results[:max_results], key=lambda x: x["date"], reverse=True)
if output:
14 changes: 4 additions & 10 deletions duckduckgo_search/ddg_translate.py
@@ -1,7 +1,5 @@
import logging

from requests import ConnectionError

from .utils import SESSION, VQD_DICT, _do_output, _get_vqd

logger = logging.getLogger(__name__)
@@ -34,7 +32,7 @@ def ddg_translate(
return None

# translate
params = {
payload = {
"vqd": vqd,
"query": "translate",
"from": from_,
@@ -49,20 +47,16 @@
try:
resp = SESSION.post(
"https://duckduckgo.com/translation.js",
params=params,
params=payload,
data=data.encode("utf-8"),
)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp.raise_for_status()
result = resp.json()
result["original"] = data
results.append(result)
except ConnectionError:
logger.error("Connection Error.")
except Exception:
VQD_DICT.pop("translate", None)
logger.exception("Exception.", exc_info=True)
logger.exception("")

if output:
keywords = keywords[0]
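
Like the other modules, ddg_translate now routes every failure through a single except Exception handler, and it additionally drops the cached "translate" vqd token so the next call fetches a fresh one. A hedged sketch of that invalidate-on-error idea (cache, fetch_token, and do_request are illustrative names, not the library's API):

# Sketch of the invalidate-on-error pattern used for the vqd token (illustrative names).
import logging

logger = logging.getLogger(__name__)
cache = {}  # stands in for VQD_DICT

def call_with_token(key, fetch_token, do_request):
    token = cache.get(key) or fetch_token(key)
    cache[key] = token
    try:
        return do_request(token)
    except Exception:
        cache.pop(key, None)  # drop the possibly stale token so the next call refreshes it
        logger.exception("")  # log the full traceback, as the modules now do
        return None
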
17 changes: 5 additions & 12 deletions duckduckgo_search/ddg_videos.py
@@ -1,7 +1,5 @@
import logging

from requests import ConnectionError

from .utils import SESSION, _do_output, _get_vqd

logger = logging.getLogger(__name__)
@@ -18,7 +16,7 @@ def ddg_videos(
max_results=50,
output=None,
):
"""DuckDuckGo videos search
"""DuckDuckGo videos search. Query params: https://duckduckgo.com/params
Args:
keywords: keywords for query.
@@ -61,19 +59,14 @@
}

results, cache = [], set()
while payload["s"] < max_results or len(results) < max_results:
while payload["s"] < min(max_results, 1000) or len(results) < max_results:
page_data = None
try:
resp = SESSION.get("https://duckduckgo.com/v.js", params=payload)
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
resp.raise_for_status()
page_data = resp.json().get("results", None)
except ConnectionError:
logger.error("Connection Error.")
break
except Exception:
logger.exception("Exception.", exc_info=True)
logger.exception("")
break

if not page_data:
@@ -87,7 +80,7 @@ def ddg_videos(
if not page_results:
break
results.extend(page_results)
# for pagination
# pagination
payload["s"] += 60

results = results[:max_results]
53 changes: 20 additions & 33 deletions duckduckgo_search/utils.py
@@ -7,17 +7,16 @@
from time import sleep

import requests
from requests import ConnectionError, Timeout

SESSION = requests.Session()
logger = logging.getLogger(__name__)

HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:102.0) Gecko/20100101 Firefox/102.0",
"Referer": "https://duckduckgo.com/",
}
SESSION = requests.Session()
SESSION.headers.update(HEADERS)

logger = logging.getLogger(__name__)

RE_CLEAN_HTML = re.compile("<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});")
VQD_DICT = dict()

@@ -37,28 +36,20 @@ def _get_vqd(keywords):
resp = SESSION.post(
"https://duckduckgo.com", data=payload, headers=HEADERS, timeout=10
)
if resp.status_code == 200:
logger.info(
"%s %s %s", resp.status_code, resp.url, resp.elapsed.total_seconds()
)
vqd_index_start = resp.content.index(b"vqd='") + 5
vqd_index_end = resp.content.index(b"'", vqd_index_start)
vqd_bytes = resp.content[vqd_index_start:vqd_index_end]

if vqd_bytes:
# delete the first key to reduce memory consumption
if len(VQD_DICT) >= 32768:
VQD_DICT.pop(next(iter(VQD_DICT)))
VQD_DICT[keywords] = vqd_bytes
logger.info("keywords=%s. Got vqd=%s", keywords, vqd_bytes)
return vqd_bytes.decode()
logger.info("get_vqd(). response=%s", resp.status_code)
except Timeout:
logger.warning("Connection timeout in get_vqd().")
except ConnectionError:
logger.warning("Connection error in get_vqd().")
except Exception as ex:
logger.exception("Exception in get_vqd().", ex)
resp.raise_for_status()
vqd_index_start = resp.content.index(b"vqd='") + 5
vqd_index_end = resp.content.index(b"'", vqd_index_start)
vqd_bytes = resp.content[vqd_index_start:vqd_index_end]

if vqd_bytes:
# delete the first key to reduce memory consumption
if len(VQD_DICT) > 32768:
VQD_DICT.pop(next(iter(VQD_DICT)))
VQD_DICT[keywords] = vqd_bytes
return vqd_bytes.decode()

except Exception:
logger.exception("")

# refresh SESSION if not vqd
prev_proxies = SESSION.proxies
@@ -70,10 +61,10 @@ def _get_vqd(keywords):
"keywords=%s. _get_vqd() is None. Refresh SESSION and retry...", keywords
)
VQD_DICT.pop(keywords, None)
sleep(1)
sleep(0.25)

# sleep to prevent blocking
sleep(1)
sleep(0.25)


def _save_json(jsonfile, data):
@@ -102,12 +93,8 @@ def _download_image(image_url, dir_path, filename):
file.write(resp.content)
logger.info("Image downloaded. image_url=%s", image_url)
break
except Timeout:
logger.warning("Connection timeout. image_url=%s", image_url)
except ConnectionError:
logger.warning("Connection error. image_url=%s", image_url)
except Exception:
logger.warning("Exception. {image_url=}.", exc_info=True)
logger.exception("")


def _normalize(raw_html):
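
The "LRU cache" mentioned in the commit message is implemented here as a plain dict with insertion-order eviction: once VQD_DICT holds more than 32768 entries, the oldest key is popped before the new vqd token is stored. A small sketch of that eviction step, relying on the insertion-ordered dicts of Python 3.7+ (MAX_ENTRIES is an illustrative name for the hard-coded 32768):

# Sketch of the size-bounded vqd cache (insertion-order eviction, not a strict LRU).
MAX_ENTRIES = 32768  # cap used in utils.py
VQD_DICT = {}

def store_vqd(keywords, vqd_bytes):
    if len(VQD_DICT) > MAX_ENTRIES:
        # dicts keep insertion order in Python 3.7+, so this drops the oldest entry
        VQD_DICT.pop(next(iter(VQD_DICT)))
    VQD_DICT[keywords] = vqd_bytes
    return vqd_bytes.decode()
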
2 changes: 1 addition & 1 deletion duckduckgo_search/version.py
@@ -1 +1 @@
__version__ = "2.2.2"
__version__ = "2.3.0"
