
Commit

v2.1.0
deedy5 committed Feb 18, 2024
1 parent c7c4c42 commit 56aa6a9
Showing 5 changed files with 72 additions and 207 deletions.
18 changes: 6 additions & 12 deletions README.md
@@ -20,14 +20,10 @@ CLI examples:
```python3
# user located in Turkey, who speaks Kurdish and is interested in top stories
fake_traffic -c tr -l ku-tr -ca h
# user located in Brazil, who speaks Portuguese and is interested in sports
fake_traffic -c br -l pt-br -ca s
# save logs into 'fake_traffic.log'
fake_traffic -c ru -l ru-ru -ca s -lf
# define wait times between requests
fake_traffic -c fr -l fr-fr -ca b -min_w 1 -max_w 100 -lf
# use non-headless mode
fake_traffic -c en -l en-us -ca t -nh -lf
fake_traffic -c en -l en-us -ca t -nh
```
---
### Simple usage
@@ -42,14 +38,12 @@ FakeTraffic(country='US', language='en-US').crawl()
from fake_traffic import FakeTraffic

ft = FakeTraffic(country='US', language='en-US', category='h', min_wait=1, max_wait=5, headless=True)
""" Imitating an Internet user by mimicking popular web traffic (internet traffic generator).
"""Internet traffic generator.
country = ISO 3166-1 alpha-2 country code (https://www.iso.org/obp/ui/),
language = country-language code per ISO 639 and ISO 3166 (https://www.fincher.org/Utilities/CountryLanguageList.shtml),
category = the user's category of interest (defaults to 'h'):
'all' (all), 'b' (business), 'e' (entertainment),
'm' (health), 's' (sports), 't' (sci/tech), 'h' (top stories);
min_wait = minimal delay between requests (defaults to 1),
max_wait = maximum delay between requests (defaults to 10),
headless = True/False (defaults to True).
"""
ft.crawl()
@@ -91,11 +85,11 @@ Country | Language | Function |
France | French | `FakeTraffic(country="FR", language="fr-FR")` |
Germany | German | `FakeTraffic(country="DE", language="de-DE", category='b')` |
India | English | `FakeTraffic(country="IN", language="en-IN", category='all')` |
India | Hindi | `FakeTraffic(country="IN", language="hi-IN", max_wait=10)` |
India | Hindi | `FakeTraffic(country="IN", language="hi-IN")` |
Russia | English | `FakeTraffic(country="RU", language="en-US", category='b', headless=False)` |
Russia | Russian | `FakeTraffic(country="RU", language="ru-RU", min_wait=0.5, max_wait=3)` |
Brazil | Portuguese | `FakeTraffic(country="BR", language="pt-BR", category='s', threads=2, max_wait=60)` |
Russia | Russian | `FakeTraffic(country="RU", language="ru-RU")` |
Brazil | Portuguese | `FakeTraffic(country="BR", language="pt-BR", category='s')` |
United Kingdom | English | `FakeTraffic(country="GB", language="en-GB")` |
United States | English | `FakeTraffic(country="US", language="en-US", min_wait=60, max_wait=300)` |
United States | English | `FakeTraffic(country="US", language="en-US")` |
United States | Hebrew (Israel) | `FakeTraffic(country="US", language="he-IL")` |

22 changes: 2 additions & 20 deletions fake_traffic/cli.py
@@ -4,9 +4,7 @@
from .fake_traffic import FakeTraffic


parser = argparse.ArgumentParser(
description="fake_traffic. Imitating an Internet user by mimicking popular web traffic (internet traffic generator)."
)
parser = argparse.ArgumentParser(description="Internet traffic generator")
parser.add_argument(
"-c",
"--country",
@@ -29,20 +27,6 @@
choices=["all", "b", "e", "m", "s", "t", "h"],
required=False,
)
parser.add_argument(
"-min_w",
"--min_wait",
default=1,
help="default=1. Minimum wait time between requests.",
required=False,
)
parser.add_argument(
"-max_w",
"--max_wait",
default=10,
help="default=10. Maximum wait time between requests.",
required=False,
)
parser.add_argument(
"-nh",
"--no-headless",
@@ -82,16 +66,14 @@
language_split = args.language.split("-")
language = f"{language_split[0]}-{language_split[1].upper()}"
logging.info(
f"Run crawl with: {country=}, {language=}, category={args.category} min_w={args.min_wait}, max_w={args.max_wait}, headless={args.headless}, logging_level={args.logging_level}, logging_file={args.logging_file}"
f"Run crawl with: {country=}, {language=}, category={args.category}, headless={args.headless}, logging_level={args.logging_level}, logging_file={args.logging_file}"
)


fake_traffic = FakeTraffic(
country=country,
language=language,
category=args.category,
min_wait=int(args.min_wait),
max_wait=int(args.max_wait),
headless=args.headless,
)
fake_traffic.crawl()
231 changes: 60 additions & 171 deletions fake_traffic/fake_traffic.py
@@ -1,12 +1,12 @@
import asyncio
import logging
import subprocess
from collections import deque
from random import choice, randint, shuffle, uniform
from time import sleep
from urllib.parse import urljoin

from playwright.sync_api import sync_playwright
from playwright_stealth import stealth_sync
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async

logging.basicConfig(level=logging.INFO)
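# caps how many pages abrowse() keeps open at once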
SEMAPHORE = asyncio.Semaphore(5)


# playwright install chromium
@@ -19,208 +19,97 @@
)
logging.info(res.stdout)

BLACKLIST = (
".cs",
".css",
".gif",
".ico",
".iso",
".jpeg",
".jpg",
".js",
".json",
".png",
".svg",
".xml",
"/auth/",
"/authorize?",
"/captcha",
"/chat",
"/click",
"/feed?",
"/help",
"/join?",
"/joinchat",
"/privacy",
"/registration",
"/share",
"/showcaptcha",
"/stat/",
"/support",
"/terms",
"/tos",
"/tweet",
"Login",
"Special:",
"_click_",
"bit.ly",
"clickserve",
"https://t.co",
"itunes.apple.com",
"javascript:",
"l.facebook.com",
"legal.twitter.com",
"login",
"mail.",
"mailto:",
"mediawiki",
"messenger.com",
"policies",
"s.click",
"showcaptcha?",
"signup",
"smart-captcha/",
"support.",
"t.umblr.com",
"tel:",
"tg://",
"whatsapp://",
"zendesk",
)


class FakeTraffic:
def __init__(
self,
country="US",
language="en-US",
category="h",
min_wait=1,
max_wait=10,
headless=True,
):
"""Imitating an Internet user by mimicking popular web traffic (internet traffic generator).
"""Internet traffic generator.
country = ISO 3166-1 alpha-2 country code (https://www.iso.org/obp/ui/),
language = country-language code per ISO 639 and ISO 3166 (https://www.fincher.org/Utilities/CountryLanguageList.shtml),
category = the user's category of interest (defaults to 'h'):
'all' (all), 'b' (business), 'e' (entertainment),
'm' (health), 's' (sports), 't' (sci/tech), 'h' (top stories);
min_wait = minimal delay between requests (defaults to 1),
max_wait = maximum delay between requests (defaults to 10),
headless = True/False (defaults to True).
"""
self.country = country
self.language = language
self.category = category
self.min_wait = min_wait
self.max_wait = max_wait
self.headless = headless
self.urls_queue = deque()
self.trends = set()
self.page = self.initialize_browser()

@staticmethod
def url_in_blacklist(url):
if any(x in url for x in BLACKLIST):
logging.info(f"{url}, STATUS: in BLACKLIST")
return True
self.browser = None

@staticmethod
def url_fix(url):
if "https://" not in url and "http://" not in url:
url = f"https://{url}"
url = url.split("#")[0].split("?")[0]
return url
async def abrowse(self, url):
async with SEMAPHORE:
page = await self.browser.new_page()
await stealth_async(page)
try:
await page.goto(url, wait_until="load")
except Exception as ex:
logging.warning(f"{type(ex).__name__}: {ex}")
await page.close()

def initialize_browser(self):
"""Initialize browser"""
try:
p = sync_playwright().__enter__()
browser = p.chromium.launch(
async def acrawl(self):
async with async_playwright() as p:
browser = await p.chromium.launch(
args=["--disable-blink-features=AutomationControlled"],
headless=self.headless,
slow_mo=100,
)
context = browser.new_context(
context = await browser.new_context(
locale=self.language,
viewport={"width": 1920, "height": 1080},
)
page = context.new_page()
stealth_sync(page)
return page
except Exception as ex:
logging.warning(f"{type(ex).__name__}: {ex}")

def get_url(self, url):
url = self.url_fix(url)
if not self.url_in_blacklist(url):
try:
resp = self.page.goto(url, wait_until="load")
logging.info(f"{resp.url} {resp.status}")
return self.page
except Exception as ex:
logging.warning(f"{url} {type(ex).__name__}: {ex}")

def google_search(self, query):
self.page.goto("https://www.google.com")
self.page.fill('textarea[name="q"]', query)
self.page.press('textarea[name="q"]', "Enter")
self.page.wait_for_load_state("load")
result_urls = self.page.query_selector_all(
"//div[starts-with(@class, 'g ')]//span/a[@href]"
)
result_urls = [link.get_attribute("href") for link in result_urls]
logging.info(f"google_search() {query=} GOT {len(result_urls)} results")
return result_urls

def google_trends(self):
url = f"https://trends.google.com/trends/trendingsearches/realtime?geo={self.country}&hl={self.language}&category={self.category}"
self.page.goto(url, wait_until="load")
elements = self.page.query_selector_all("//div[@class='title']")
trends = [x for e in elements for x in e.inner_text().split(" • ")]
logging.info(f"google_trends() GOT {len(trends)} trends")

for e in elements:
e.click()
self.page.wait_for_selector("//div[@class='carousel-wrapper']")
related_urls_elements = self.page.query_selector_all("//div[@class='carousel-wrapper']//a")
related_urls = [link.get_attribute("href") for link in related_urls_elements]
self.urls_queue.extend(related_urls)
return trends

def parse_urls(self, page, base_url):
try:
elements = page.query_selector_all("a")
urls = [
urljoin(base_url, x) for e in elements if (x := e.get_attribute("href"))
]
return urls
except Exception as ex:
logging.warning(f"parse_urls() {type(ex).__name__}: {ex}")
return []

def recursive_browse(self, url, depth):
if depth:
resp = self.get_url(url)
if resp:
urls = self.parse_urls(resp, resp.url)
if urls:
url = choice(urls)
sleep(uniform(self.min_wait, self.max_wait))
self.recursive_browse(url, depth - 1)
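# the new acrawl() stores the browser *context* on self.browser; abrowse() opens its pages from it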
self.browser = context

page = await self.browser.new_page()
await stealth_async(page)

# google trends
url = f"https://trends.google.com/trends/trendingsearches/realtime?geo={self.country}&hl={self.language}&category={self.category}"
await page.goto(url, wait_until="load")
elements = await page.query_selector_all("//div[@class='title']")
keywords = [x for e in elements for x in (await e.inner_text()).split(" • ")]
logging.info(f"google_trends() GOT {len(keywords)} keywords")

# google search
for keyword in keywords:
await page.goto("https://www.google.com")
await page.fill('textarea[name="q"]', keyword)
await page.press('textarea[name="q"]', "Enter")
while True:
# Check for a popup window and close it
if len(self.browser.pages) > 1:
await self.browser.pages[1].close()
# Scroll to the bottom of the page
await page.mouse.wheel(0, 1000)
await page.wait_for_load_state("networkidle")
await asyncio.sleep(0.2)
elements = await page.query_selector_all(
"//div[starts-with(@class, 'g ')]//span/a[@href]"
)
if len(elements) > 50:
break
result_urls = [await link.get_attribute("href") for link in elements]
logging.info(f"google_search() {keyword=} GOT {len(result_urls)} results")

# browse urls in parallel
tasks = [
asyncio.create_task(self.abrowse(url)) for url in result_urls
]
await asyncio.gather(*tasks)

def crawl(self):
while True:
if not self.urls_queue:
if not self.trends:
self.trends = self.google_trends()
shuffle(self.trends)
trend = self.trends.pop()
search_results = self.google_search(trend)
self.urls_queue = deque(search_results)

url = self.urls_queue.popleft()
depth = randint(3, 10)
self.recursive_browse(url, depth)
asyncio.run(self.acrawl())


if __name__ == "__main__":
fake_traffic = FakeTraffic(
country="US",
language="en-US",
category="h",
min_wait=1,
max_wait=10,
headless=True,
)
fake_traffic.crawl()
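
For context, the new crawl flow boils down to one pattern: launch Chromium through Playwright's async API, fan page visits out with `asyncio.gather`, and let a semaphore cap how many pages are open at once. A minimal, self-contained sketch of that pattern (the `LIMIT`, `visit`, and `main` names and the placeholder URL list are illustrative, not part of this commit):

```python3
import asyncio

from playwright.async_api import async_playwright

# at most 5 pages in flight, mirroring SEMAPHORE = asyncio.Semaphore(5) above
LIMIT = asyncio.Semaphore(5)


async def visit(context, url):
    """Open a tab, load the url, and close the tab, holding one semaphore slot."""
    async with LIMIT:
        page = await context.new_page()
        try:
            await page.goto(url, wait_until="load")
        except Exception as ex:
            print(f"{url}: {type(ex).__name__}: {ex}")
        await page.close()


async def main(urls):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        context = await browser.new_context()
        # gather() schedules every visit at once; the semaphore throttles them
        await asyncio.gather(*(visit(context, u) for u in urls))
        await browser.close()


if __name__ == "__main__":
    asyncio.run(main(["https://example.com"] * 10))
```

Using a semaphore instead of a fixed worker pool keeps the code flat: every URL gets its own task, and back-pressure comes from the five slots.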
2 changes: 1 addition & 1 deletion fake_traffic/version.py
@@ -1 +1 @@
__version__ = "2.0.0"
__version__ = "2.1.0"