Skip to content

Commit

Permalink
use httpx for requests
Browse files Browse the repository at this point in the history
  • Loading branch information
deedy5 committed Aug 17, 2024
1 parent e93d7f3 commit adfee5d
Show file tree
Hide file tree
Showing 6 changed files with 1,454 additions and 38 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/python-package.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,9 @@ name: Python package

on:
push:
branches: [ main ]
branches: [ httpx ]
pull_request:
branches: [ main ]
branches: [ httpx ]

jobs:
build:
Expand Down
21 changes: 14 additions & 7 deletions duckduckgo_search/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@
from urllib.parse import unquote

import click
import primp
import httpx

from .duckduckgo_search import DDGS
from .utils import _expand_proxy_tb_alias, json_dumps, json_loads
from .utils import _expand_proxy_tb_alias, _get_random_headers, _get_random_ssl_context, json_dumps, json_loads
from .version import __version__

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -83,12 +83,19 @@ def _sanitize_keywords(keywords):

def _download_file(url, dir_path, filename, proxy):
    """Download a single file from *url* into *dir_path* (best effort).

    Any failure (network error, non-2xx status, filesystem error) is logged
    and swallowed so one bad download does not abort a batch of downloads.

    Args:
        url: The file URL to fetch.
        dir_path: Existing directory to write the file into.
        filename: Target file name; truncated to 200 characters to stay
            within common filesystem name-length limits.
        proxy: Optional proxy URL passed through to httpx, or None.
    """
    try:
        resp = httpx.get(
            url,
            headers=_get_random_headers(),
            proxy=proxy,
            timeout=10,
            follow_redirects=True,
            verify=_get_random_ssl_context(),
        )
        # Raise for 4xx/5xx instead of silently writing an error page to disk.
        resp.raise_for_status()
        with open(os.path.join(dir_path, filename[:200]), "wb") as file:
            file.write(resp.content)
    except Exception as ex:
        # Best-effort semantics: record the failure, keep going.
        logger.info(f"download_file url={url} {type(ex).__name__} {ex}")


def _download_results(keywords, results, images=False, proxy=None, threads=None):
Expand Down
42 changes: 15 additions & 27 deletions duckduckgo_search/duckduckgo_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,11 @@
from decimal import Decimal
from functools import cached_property
from itertools import cycle, islice
from random import choice
from threading import Event
from types import TracebackType
from typing import cast
from typing import Any

import primp # type: ignore
import httpx

try:
from lxml.etree import _Element
Expand All @@ -28,6 +27,8 @@
_calculate_distance,
_expand_proxy_tb_alias,
_extract_vqd,
_get_random_headers,
_get_random_ssl_context,
_normalize,
_normalize_url,
_text_extract_json,
Expand All @@ -41,16 +42,6 @@ class DDGS:
"""DuckDuckgo_search class to get search results from duckduckgo.com."""

_executor: ThreadPoolExecutor = ThreadPoolExecutor()
_impersonates = (
"chrome_100", "chrome_101", "chrome_104", "chrome_105", "chrome_106", "chrome_107", "chrome_108",
"chrome_109", "chrome_114", "chrome_116", "chrome_117", "chrome_118", "chrome_119", "chrome_120",
#"chrome_123", "chrome_124", "chrome_126",
"chrome_127",
"safari_ios_16.5", "safari_ios_17.2", "safari_ios_17.4.1", "safari_15.3", "safari_15.5",
"safari_15.6.1", "safari_16", "safari_16.5", "safari_17.2.1", "safari_17.4.1", "safari_17.5",
"edge_101", "edge_122", "edge_127",
# "safari_17.0"(primp>=0.6.0)
) # fmt: skip

def __init__(
self,
Expand All @@ -72,18 +63,15 @@ def __init__(
if not proxy and proxies:
warnings.warn("'proxies' is deprecated, use 'proxy' instead.", stacklevel=1)
self.proxy = proxies.get("http") or proxies.get("https") if isinstance(proxies, dict) else proxies
self.headers = headers if headers else {}
self.headers["Referer"] = "https://duckduckgo.com/"
self.client = primp.Client(
headers=self.headers,
self.client = httpx.Client(
headers=_get_random_headers() or headers,
proxy=self.proxy,
timeout=timeout,
cookie_store=True,
referer=True,
impersonate=choice(self._impersonates),
follow_redirects=False,
verify=False,
http2=True,
verify=_get_random_ssl_context(),
)
self.client.headers["Referer"] = "https://duckduckgo.com/"
self._exception_event = Event()
self._chat_messages: list[dict[str, str]] = []
self._chat_tokens_count = 0
Expand All @@ -98,7 +86,7 @@ def __exit__(
exc_val: BaseException | None = None,
exc_tb: TracebackType | None = None,
) -> None:
pass
self.client.__exit__(exc_type, exc_val, exc_tb)

@cached_property
def parser(self) -> LHTMLParser:
Expand All @@ -111,7 +99,7 @@ def _get_url(
url: str,
params: dict[str, str] | None = None,
content: bytes | None = None,
data: dict[str, str] | bytes | None = None,
data: dict[str, Any] | None = None,
) -> bytes:
if self._exception_event.is_set():
raise DuckDuckGoSearchException("Exception occurred in previous call.")
Expand All @@ -124,7 +112,7 @@ def _get_url(
raise DuckDuckGoSearchException(f"{url} {type(ex).__name__}: {ex}") from ex
logger.debug(f"_get_url() {resp.url} {resp.status_code} {len(resp.content)}")
if resp.status_code == 200:
return cast(bytes, resp.content)
return resp.content
self._exception_event.set()
if resp.status_code in (202, 301, 403):
raise RatelimitException(f"{resp.url} {resp.status_code} Ratelimit")
Expand Down Expand Up @@ -183,9 +171,9 @@ def chat(self, keywords: str, model: str = "gpt-4o-mini", timeout: int = 30) ->
data = ",".join(x for line in resp.text.rstrip("[DONE]LIMT_CVRSA\n").split("data:") if (x := line.strip()))
data = json_loads("[" + data + "]")

results = []
results: list[str] = []
for x in data:
if x.get("action") == "error":
if isinstance(x, dict) and x.get("action") == "error":
err_message = x.get("type", "")
if x.get("status") == 429:
raise (
Expand All @@ -194,7 +182,7 @@ def chat(self, keywords: str, model: str = "gpt-4o-mini", timeout: int = 30) ->
else RatelimitException(err_message)
)
raise DuckDuckGoSearchException(err_message)
elif message := x.get("message"):
elif isinstance(x, dict) and (message := x.get("message")):
results.append(message)
result = "".join(results)

Expand Down
Loading

0 comments on commit adfee5d

Please sign in to comment.