Skip to content

Commit

Permalink
added headed argument to more robustly handle issues with scrapers he…
Browse files Browse the repository at this point in the history
…adless mode
  • Loading branch information
trislee committed Sep 19, 2023
1 parent 0bd87f9 commit 89d8952
Show file tree
Hide file tree
Showing 5 changed files with 53 additions and 10 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ You should now be ready to start using it.
## About the tool
### Command-line arguments
```
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [hashtags ...]
usage: tiktok-hashtag-analysis [-h] [--file FILE] [-d] [--number NUMBER] [-p] [-t] [--output-dir OUTPUT_DIR] [--config CONFIG] [--log LOG] [--limit LIMIT] [-v] [--headed] [hashtags ...]
Analyze hashtags within posts scraped from TikTok.
Expand All @@ -35,6 +35,7 @@ optional arguments:
--log LOG File to write logs to
--limit LIMIT Maximum number of videos to download for each hashtag
-v, --verbose Increase output verbosity
--headed Don't use headless version of TikTok scraper
```

### Structure of output data
Expand Down
11 changes: 10 additions & 1 deletion tests/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,16 @@

def test_scrape(tmp_path, hashtags):
downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path)
downloader.run(limit=1000, download=True, plot=True, table=True, number=20)
downloader.run(
limit=10, download=True, plot=True, table=True, number=5, headed=True
)


def test_scrape_headless(tmp_path, hashtags):
downloader = TikTokDownloader(hashtags=hashtags[:1], data_dir=tmp_path)
downloader.run(
limit=10, download=True, plot=True, table=True, number=5, headed=False
)


def test_load_hashtags_from_file(tmp_path, hashtags):
Expand Down
4 changes: 4 additions & 0 deletions tests/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
("table", True, "--table"),
("table", True, "-t"),
("verbose", True, "--verbose"),
("headed", True, "--headed"),
("verbose", True, "-v"),
("output_dir", "/tmp/tiktok_download", "--output-dir"),
("config", "~/.tiktok", "--config"),
Expand Down Expand Up @@ -51,6 +52,7 @@ def test_output_dir_spec_noexist_nowrite(tmp_path):
specified_output_dir=specified_output_dir, parser=parser
)
assert system_exit.type == SystemExit
os.chmod(tmp_path, 0o666)


def test_output_dir_spec_exist_nowrite(tmp_path):
Expand All @@ -63,6 +65,7 @@ def test_output_dir_spec_exist_nowrite(tmp_path):
specified_output_dir=specified_output_dir, parser=parser
)
assert system_exit.type == SystemExit
os.chmod(tmp_path, 0o666)


def test_output_dir_unspec_nowrite(monkeypatch, tmp_path):
Expand All @@ -75,6 +78,7 @@ def test_output_dir_unspec_nowrite(monkeypatch, tmp_path):
result = process_output_dir(specified_output_dir=None, parser=parser)
monkeypatch.chdir(cwd)
assert result == DEFAULT_OUTPUT_DIR
os.chmod(tmp_path, 0o666)


def test_output_dir_spec_noexist_write(tmp_path):
Expand Down
38 changes: 31 additions & 7 deletions tiktok_hashtag_analysis/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,15 @@ def load_hashtags_from_file(file: str) -> List[str]:

# Retry upon encountering transient playwright errors
@retry(retry=retry_if_exception_type(Error), stop=stop_after_attempt(3))
async def _fetch_hashtag_data(hashtag: str, limit: int) -> List[Dict]:
async def _fetch_hashtag_data(
hashtag: str, limit: int, headed: bool = False
) -> List[Dict]:
"""Fetch data for videos containing a specified hashtag, asynchronously."""
data = []
async with TikTokApi() as api:
await api.create_sessions(ms_tokens=[], num_sessions=1, sleep_after=3)
await api.create_sessions(
ms_tokens=[], num_sessions=1, sleep_after=3, headless=not headed
)
async for video in api.hashtag(name=hashtag).videos(count=limit):
data.append(video.as_dict)
return data
Expand Down Expand Up @@ -157,7 +161,7 @@ def prioritize_hashtags(self):
}
self.hashtags.sort(key=lambda h: last_edited.get(h, 0))

def get_hashtag_posts(self, hashtag: str, limit: int):
def get_hashtag_posts(self, hashtag: str, limit: int, headed: bool):
"""Fetch data about posts that used a specified hashtag and merge with
existing data, if it exists."""

Expand All @@ -172,8 +176,20 @@ def get_hashtag_posts(self, hashtag: str, limit: int):
already_fetched_data = []
already_fetched_ids = set(video["id"] for video in already_fetched_data)

# Scrape posts that use the specified hashtag
fetched_data = asyncio.run(_fetch_hashtag_data(hashtag=hashtag, limit=limit))
# Scrape posts that use the specified hashtag
# Attempt to be robust against TikTok's countermeasures for headless browsing
# First attempt honours the caller's `headed` choice (headless unless
# `headed` is set). If that attempt raises for any reason, log the error and
# retry once with a visible (headed) browser.
try:
    fetched_data = asyncio.run(
        _fetch_hashtag_data(hashtag=hashtag, limit=limit, headed=headed)
    )
except Exception as e:
    # The message must be an f-string, otherwise the literal text "{e}" is
    # logged and the actual failure reason is lost.
    logger.warning(
        f"Encountered error {e} when fetching data, retrying in headed mode"
    )
    fetched_data = asyncio.run(
        _fetch_hashtag_data(hashtag=hashtag, limit=limit, headed=True)
    )

fetched_ids = set(video["id"] for video in fetched_data)

if len(fetched_data) == 0:
Expand Down Expand Up @@ -303,13 +319,21 @@ def plot(self, hashtag: str, number: int):
plt.savefig(plot_file, bbox_inches="tight", facecolor="white", dpi=300)
logger.info(f"Plot saved to file: {plot_file}")

def run(self, limit: int, download: bool, plot: bool, table: bool, number: int):
def run(
self,
limit: int,
download: bool,
plot: bool,
table: bool,
number: int,
headed: bool,
):
"""Execute the specified operations on all specified hashtags."""

# Scrape all specified hashtags and perform analyses, depending on if
# `--table`, `--plot`, and `--download` flags are used in the command
for hashtag in self.hashtags:
self.get_hashtag_posts(hashtag=hashtag, limit=limit)
self.get_hashtag_posts(hashtag=hashtag, limit=limit, headed=headed)
if plot:
self.plot(hashtag=hashtag, number=number)
if table:
Expand Down
7 changes: 6 additions & 1 deletion tiktok_hashtag_analysis/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,11 @@ def create_parser():
help="Increase output verbosity",
action="store_true",
)

parser.add_argument(
"--headed",
help="Don't use headless version of TikTok scraper",
action="store_true",
)
return parser


Expand Down Expand Up @@ -146,6 +150,7 @@ def main():
plot=args.plot,
table=args.table,
number=args.number,
headed=args.headed,
)


Expand Down

0 comments on commit 89d8952

Please sign in to comment.