-
Notifications
You must be signed in to change notification settings - Fork 20
/
stats-crawler.sh
executable file
·66 lines (49 loc) · 1.66 KB
/
stats-crawler.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/bin/bash
set -x

#####################
# DECLARE VARIABLES #
#####################

# branch currently checked out in the 'awesome-lemmy-instances' repo
current_branch="$(git branch --show-current)"

# instances the crawl starts from, kept one per line so the list is easy to
# read and to diff
start_instances=(
  baraza.africa
  lemmygrad.ml
  lemmy.blahaj.zone
  lemmy.pussthecat.org
  lemmy.studio
  toast.ooo
  iusearchlinux.fyi
  waveform.social
  monero.town
  exploding-heads.com
  reddthat.com
  mander.xyz
  vlemmy.net
  szmer.info
  beehaw.org
)

# the same instances as a single comma-separated string; IFS is changed only
# inside the command substitution's subshell, so the script's IFS is untouched
crawl_list="$(IFS=','; printf '%s' "${start_instances[*]}")"
#############
# FUNCTIONS #
#############
#######################################
# Report a fatal error and terminate the script.
# Arguments: $@ - words of the error message; they are joined into a single
#                 string by the first character of IFS (a space by default)
# Outputs:   writes "FATAL: <message>. Aborting." to stderr
# Returns:   does not return; exits the shell with status 1
#######################################
FATAL() {
  # diagnostics go to stderr so they are never captured by $(...) or
  # redirected into a data file along with regular output
  printf 'FATAL: %s. Aborting.\n' "$*" >&2
  exit 1
}
#######################
# LEMMY-STATS-CRAWLER #
#######################
# Clones LemmyNet/lemmy-stats-crawler, runs it through cargo against the
# instances in ${crawl_list}, and writes the crawler's JSON output to
# lemmy-stats-crawler.json inside the clone.

# Installing rust by piping the rustup script (https://rustup.rs/) straight
# into a shell is unsafe, so the installer stays disabled here.
#curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

# Locate cargo: take it from PATH if possible, otherwise fall back to
# ${HOME}/.cargo/bin/cargo. `command -v` is a shell builtin, so (unlike
# `which`) it does not depend on an external program being installed.
CARGO=$(command -v cargo)
if [[ -z "${CARGO}" ]]; then
  CARGO="${HOME}/.cargo/bin/cargo"
fi
"${CARGO}" --version || FATAL "cargo is not usable at '${CARGO}'"

# git reports its own error if the clone fails; the pushd check below is what
# decides whether there is a directory to work in.
git clone https://github.com/LemmyNet/lemmy-stats-crawler.git

# Without this check a failed pushd would let the crawl run in (and write its
# output to) whatever directory the script was started from.
pushd lemmy-stats-crawler \
  || FATAL "cannot enter the lemmy-stats-crawler directory"

# some pre-run output for debugging
ls

# is this our dev branch?
if [[ "${current_branch}" == "dev" ]]; then
  # this is a run in dev; We limit the `max-crawl-distance` to 0 here (so
  # the crawler does not go to other instances than those explicitly
  # listed), for faster execution. (Switch to `--verbose 4` when debugging.)
  crawler_args=(
    --verbose 1 --start-instances "${crawl_list}"
    --json --max-crawl-distance 0
  )
else
  # this isn't dev; do a full crawl
  crawler_args=(
    --verbose 4 --start-instances "${crawl_list}"
    --json
  )
fi

time "${CARGO}" run -- "${crawler_args[@]}" > lemmy-stats-crawler.json
crawl_status=$?
if (( crawl_status != 0 )); then
  # Make the failure visible, but keep going: the script has always finished
  # with status 0 regardless of the crawler's result.
  printf 'WARNING: lemmy-stats-crawler exited with status %d\n' \
    "${crawl_status}" >&2
fi

# some post-run output for debugging
ls
du -sh lemmy-stats-crawler.json
wc -l lemmy-stats-crawler.json
head lemmy-stats-crawler.json
tail lemmy-stats-crawler.json

exit 0