Skip to content

Commit

Permalink
v0.7
Browse files Browse the repository at this point in the history
1. The requests session is now global, so cookies persist across requests
2. Added a 2-second delay between successive page requests to avoid errors from the site
  • Loading branch information
deedy5 authored Sep 22, 2021
1 parent c5b9286 commit b9fab94
Showing 1 changed file with 26 additions and 23 deletions.
49 changes: 26 additions & 23 deletions duckduckgo_search.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,23 @@
from time import sleep
from lxml import html
import requests

__version__ = 0.6
__version__ = 0.7

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0"})

def ddg(keywords, region='wt-wt', safesearch='Moderate', time=None, max_results=30, **kwargs):
'''
DuckDuckGo search
''' DuckDuckGo search
Query parameters, link: https://duckduckgo.com/params:
keywords: keywords for query;
safesearch: On (kp = 1), Moderate (kp = -1), Off (kp = -2);
region: country of results - wt-wt (Global), us-en, uk-en, ru-ru, etc.;
time: 'd' (day), 'w' (week), 'm' (month), 'y' (year), or 'year-month-date..year-month-date';
max_results = 30 gives a number of results not less than 30,
maximum DDG gives out about 200 results.
'''
'''

safesearch_base = {
'On': 1,
'Moderate': -1,
Expand All @@ -25,23 +29,22 @@ def ddg(keywords, region='wt-wt', safesearch='Moderate', time=None, max_results=
'p': safesearch_base[safesearch],
'df': time
}

with requests.Session() as s:
s.headers.update({"User-Agent": "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0"})
results = []
while True:
res = s.post('https://html.duckduckgo.com/html', data=payload, **kwargs)
tree = html.fromstring(res.text)
if tree.xpath('//div[@class="no-results"]/text()'):
return results
for element in tree.xpath('//div[contains(@class, "results_links")]'):
results.append({'title': element.xpath('.//a[contains(@class, "result__a")]/text()')[0],
'href': element.xpath('.//a[contains(@class, "result__a")]/@href')[0],
'body': ''.join(element.xpath('.//a[contains(@class, "result__snippet")]//text()')),})
if len(results) >= max_results:
return results

next_page = tree.xpath('.//div[@class="nav-link"]')[-1]
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
results = []
while True:
res = session.post('https://html.duckduckgo.com/html', data=payload, **kwargs)
tree = html.fromstring(res.text)
if tree.xpath('//div[@class="no-results"]/text()'):
return results
for element in tree.xpath('//div[contains(@class, "results_links")]'):
results.append({'title': element.xpath('.//a[contains(@class, "result__a")]/text()')[0],
'href': element.xpath('.//a[contains(@class, "result__a")]/@href')[0],
'body': ''.join(element.xpath('.//a[contains(@class, "result__snippet")]//text()')),})
if len(results) >= max_results:
return results

next_page = tree.xpath('.//div[@class="nav-link"]')[-1]
names = next_page.xpath('.//input[@type="hidden"]/@name')
values = next_page.xpath('.//input[@type="hidden"]/@value')
payload = {n: v for n, v in zip(names, values)}
sleep(2)

0 comments on commit b9fab94

Please sign in to comment.