generated from amosproj/amos202Xss0Y-projname
-
Notifications
You must be signed in to change notification settings - Fork 1
/
run_all.sh
194 lines (163 loc) · 4.65 KB
/
run_all.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#!/bin/bash
# Exit on error, treat unset variables as an error, and make pipeline errors propagate
set -euo pipefail
log() {
echo "$(date '+%Y-%m-%d %H:%M:%S') - $1"
}
# Load environment variables from a file
load_env() {
if [ -f ".env" ]; then
log "Loading environment variables from .env file..."
export $(grep -v '^#' .env | xargs -d '\n')
log "Environment variables loaded."
else
log "No .env file found. Make sure to create one with the required environment variables."
exit 1
fi
}
# Create and activate Python virtual environment
setup_virtual_environment() {
if [ ! -d "venv" ]; then
log "Creating virtual environment..."
python3 -m venv venv
log "Virtual environment created."
else
log "Virtual environment already exists."
fi
log "Activating virtual environment..."
source venv/bin/activate
log "Installing required packages from requirements.txt..."
pip install --upgrade pip
pip install -r requirements.txt
python -m spacy download en_core_web_sm
log "Required packages installed."
}
run_python_script() {
local script_name=$1
local script_path="$(dirname "$script_name")" # Extract directory from script path
local script_basename="$(basename "$script_name")" # Extract script basename
log "Running ${script_name}..."
(
cd "$script_path" # Change directory
if [[ "$script_basename" == "upload_to_huggingface.py" ]]; then
if python3 -u "$script_basename" "$2"; then
log "${script_name} completed successfully."
else
log "Error: ${script_name} failed"
exit 1
fi
else
if python3 -u "$script_basename"; then
log "${script_name} completed successfully."
else
log "Error: ${script_name} failed"
exit 1
fi
fi
)
}
run_scrapy_spider() {
log "Running Scrapy spider..."
(
cd src/landscape_scraper # Change directory to where the Scrapy project is
if scrapy crawl files -O output.json; then
log "Scrapy spider completed successfully."
else
log "Error: Scrapy spider failed"
exit 1
fi
)
}
run_etl() {
log "Starting ETL process..."
etl_scripts=(
"src/scripts/landscape_explorer.py"
)
for script in "${etl_scripts[@]}"; do
run_python_script "$script"
done
run_scrapy_spider
etl_scripts_continued=(
"src/scripts/augment_landscape.py"
"src/scripts/landscape_extractor.py"
"src/scripts/Unified_format_conversation.py"
"src/scripts/upload_to_huggingface.py"
)
for script in "${etl_scripts_continued[@]}"; do
if [[ $script == *"upload_to_huggingface.py" ]]; then
run_python_script "$script" "$DATA_SET_ID"
else
run_python_script "$script"
fi
done
log "ETL process completed."
}
run_qa() {
log "Starting Q&A generation process..."
qa_scripts=(
"src/scripts/qa_generation/qa_generation.py"
"src/scripts/upload_qa_to_hugging_face.py"
)
for script in "${qa_scripts[@]}"; do
if [[ $script == *.py ]]; then
run_python_script "$script"
else
log "Running ${script}..."
if bash "$script"; then
log "${script} completed successfully."
else
log "Error: ${script} failed"
exit 1
fi
fi
done
log "Q&A generation process completed."
}
# check first if input is valid, check if all arguments are either etl or qa or dataset id
if [ $# -eq 0 ]; then
log "No arguments provided. Usage: $0 [etl] [qa] <data_set_id>"
exit 1
fi
# Validate and process arguments
ETL=false
QA=false
DATA_SET_ID=""
for arg in "$@"; do
case $arg in
etl)
ETL=true
;;
qa)
QA=true
;;
*)
if [[ -z "$DATA_SET_ID" ]]; then
DATA_SET_ID="$arg"
else
log "Invalid argument: $arg"
log "Usage: $0 [etl] [qa] <data_set_id>"
exit 1
fi
;;
esac
done
if [[ -z "$DATA_SET_ID" ]]; then
log "Error: Data set ID is required."
log "Usage: $0 [etl] [qa] <data_set_id>"
exit 1
fi
log "Data set ID: $DATA_SET_ID"
setup_virtual_environment
load_env
if $ETL; then
run_etl
fi
if $QA; then
run_qa
fi
if ! $ETL && ! $QA; then
log "No ETL or QA specified, running both"
run_etl
run_qa
fi
log "All selected tasks ran successfully."