-
Notifications
You must be signed in to change notification settings - Fork 0
/
appro-fetcher
executable file
·142 lines (122 loc) · 4.28 KB
/
appro-fetcher
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
#!/usr/bin/env bash
# SPDX-FileCopyrightText: 2021 Robin Vobruba <[email protected]>
#
# SPDX-License-Identifier: Unlicense
# See the output of "$0 -h" for details.
# Strict mode: abort on every failing command (also inside functions and
# sub-shells), on every use of an unset variable,
# and when any stage of a pipeline fails;
# see: https://vaneyckt.io/posts/safer_bash_scripts_with_set_euxo_pipefail/
set -o errtrace -o errexit -o nounset -o pipefail
# (variant without pipefail: set -Eeu)

# The directory this script resides in, with symbolic links resolved.
script_dir="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")"
#source "$script_dir/_shared_git"

# Initial default values
APP_NAME="appropedia.org Project Names Fetcher"
root_dir="${script_dir}"
# Intermediate files are generated here ...
build_dir="${root_dir}/target"
# ... and the final results are copied to here.
publish_dir="${root_dir}/public"
raw_html_file="${build_dir}/appro_list.html"
csv_names_file="${build_dir}/appro_proj_names.csv"
csv_upstream_style_file="${build_dir}/appro_yaml_urls.csv"
# The (URL-encoded) project name gets appended to this prefix, for example:
# https://www.appropedia.org/generateOpenKnowHowManifest.php?title=Zero-electricity_refrigerator
APPRO_OKH_GEN_URL_PREFIX="https://www.appropedia.org/generateOpenKnowHowManifest.php?title="
# Prints the usage help of this script to stdout.
# Globals:   APP_NAME (read)
# Arguments: none
# Outputs:   the help text, on stdout
function print_help() {
  # Declared separately from the assignment,
  # so a failure of 'basename' is not masked by 'local'.
  local script_name
  script_name="$(basename "$0")"

  echo "$APP_NAME - Fetches an HTML file from appropedia,"
  echo "containing the list of all the projects (names) they are hosting,"
  echo "and extracts the raw (1 column CSV) list from that."
  echo
  echo "Usage:"
  echo " $script_name [OPTION...]"
  echo "Options:"
  echo " -h, --help Print this usage help and exit"
  echo "Examples:"
  echo " $script_name"
}
# Read the command-line arguments:
# '-h' / '--help' prints the usage help and exits,
# every other argument is collected,
# and restored as a positional parameter afterwards.
POSITIONAL=()
while [[ $# -gt 0 ]]; do
  arg="$1"
  shift # $2 -> $1, $3 -> $2, ...
  case "$arg" in
    -h|--help)
      print_help
      exit 0
      ;;
    *) # non-/unknown option
      POSITIONAL+=("$arg") # save it in an array for later
      ;;
  esac
done
# Restore the positional parameters.
# The '+' expansion yields nothing at all if the array is empty;
# a plain "${POSITIONAL[@]}" of an empty array is an "unbound variable" error
# under 'set -u' in BASH versions before 4.4,
# and running without any arguments is the normal case for this script.
set -- ${POSITIONAL[@]+"${POSITIONAL[@]}"}
# URL-encodes a string, using curl as the encoder.
# Arguments: $1 - the string to encode
# Outputs:   the encoded string, on stdout
# Returns:   always 0 (see the note on '|| true' below)
function url_encode() {
  # Local, so callers' variables of the same name are left alone.
  local input="$1"

  # Alternative: this URL-encodes *all* bytes
  #printf '%s' "$input" | hexdump -v -e '/1 "%02x"' | sed 's/\(..\)/%\1/g'

  # This just URL-encodes the non-standard characters,
  # but it is slow!
  # No request is meant to be sent: the URL is empty and the response body
  # goes to /dev/null; we only want curl to print the URL it assembled
  # from the encoded data ('-w "%{url_effective}"').
  # 'cut -c 3-' drops the first two characters of that URL
  # - presumably the '/?' curl puts in front of the query string;
  # TODO confirm with the curl version in use.
  # '|| true' keeps a non-zero exit status of the pipeline from aborting the
  # script under 'set -e -o pipefail' - presumably curl reports an error
  # for the empty URL, even though it printed what we need.
  printf '%s' "$input" | curl -Gso /dev/null -w "%{url_effective}" --data-urlencode @- "" | cut -c 3- || true

  # Alternative: pure BASH; converts too much,
  # and seems to fail at special chars like ae
  # local input_len=${#input}
  # local encoded=""
  # local pos c o
  # for (( pos=0 ; pos<input_len ; pos++ )); do
  #   c=${input:$pos:1}
  #   case "$c" in
  #     [-_.~a-zA-Z0-9] ) o="${c}" ;;
  #     * ) printf -v o '%%%02x' "'$c"
  #   esac
  #   encoded+="${o}"
  # done
  # printf '%s\n' "$encoded" # You can either set a return variable (FASTER)
  #REPLY="$encoded" # +or echo the result (EASIER)... or both... :p
}
# Downloads the answer of appropedia's 'Special:Export' page
# for the category "Projects", and stores it in a file.
# The request is a form POST ('--data-raw'), with the same headers and
# form fields a browser would send when pressing "Add" on that page.
# Arguments: $1 - path of the file to write the response body to
# Returns:   the exit status of curl
function store_raw_html_to_file() {
  # Local, so callers' variables of the same name are left alone.
  local out_file="$1"

  # '--fail' makes curl exit non-zero on an HTTP error status (>= 400),
  # instead of storing the server's error page as if it were the list;
  # under 'set -e', that stops the script before anything gets published.
  curl \
    'https://www.appropedia.org/Special:Export' \
    --fail \
    -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' \
    --compressed \
    -H 'Content-Type: application/x-www-form-urlencoded' \
    -H 'Origin: https://www.appropedia.org' \
    -H 'Referer: https://www.appropedia.org/Special:Export?catname=Projects' \
    -H 'Upgrade-Insecure-Requests: 1' \
    --data-raw 'catname=Projects&addcat=Add&pages=&curonly=1&wpDownload=1&wpEditToken=%2B%5C&title=Special%3AExport' \
    --output "$out_file"
}
# Make sure the build directory exists, and download the raw HTML into it.
mkdir -p "$build_dir"
store_raw_html_to_file "$raw_html_file"

# Extract the project names from the HTML
# (appro-parser.awk is not part of this file; it is expected to print
# one project name per line), decode the HTML entities '&quot;' and '&amp;',
# and store the sorted list.
# '&amp;' is decoded last, so an escaped ampersand
# (like in '&amp;quot;') never gets decoded twice.
awk -f "$script_dir/appro-parser.awk" \
  < "$raw_html_file" \
  | sed -e 's|&quot;|"|g' -e 's|&amp;|\&|g' \
  | sort \
  > "$csv_names_file"
# Replace the names list with its URL-encoded form, because some names
# contain commas, which a one column CSV file can not hold.
# The plain list is parked under a temporary name while the encoded
# version is written to the original path, and is removed afterwards.
names_plain_file="${csv_names_file}_TMP"
mv "$csv_names_file" "$names_plain_file"
while read -r plain_name; do
  url_encode "$plain_name"
done < "$names_plain_file" > "$csv_names_file"
rm "$names_plain_file"
# Write the upstream-style CSV: a header row,
# followed by one row per project name read from the names list.
{
  date="$(date +"%Y-%m-%d")"
  printf '%s\n' "Project name,date added(YYYY-MM-DD),link to YML file"
  while read -r proj_name; do
    # The project name has to be URL-encoded, to ensure it contains no commas
    # (-> would be bad in a CSV), and to ensure it forms a valid URL.
    # NOTE It is already URL-encoded in the input list,
    #      so there is no need for this any more:
    #proj_name="$(url_encode "$proj_name")"
    printf '%s\n' "appropedia_org--${proj_name},${date},${APPRO_OKH_GEN_URL_PREFIX}${proj_name}"
  done
} < "$csv_names_file" > "$csv_upstream_style_file"
# Publish the results: empty the publish directory (creating it if missing),
# then copy both CSV files into it.
# ':?' aborts if publish_dir is empty or unset, so this can never expand to '/*'.
mkdir -p "$publish_dir"
rm -Rf "${publish_dir:?}/"*
cp "$csv_names_file" "$csv_upstream_style_file" "$publish_dir/"

# Generate an HTML listing of the publish directory.
# It is written to the build directory first, gets a "Generated at" line
# inserted right after the opening body tag, and is only then moved
# next to the files it lists.
index_file="$build_dir/index.html"
/usr/bin/tree -H . -o "$index_file" "$publish_dir/"
sed -i -e 's|<body>|<body>\nGenerated at '"$(date)"'<br>|' "$index_file"
mv "$index_file" "$publish_dir/"