-
Notifications
You must be signed in to change notification settings - Fork 1
/
ood_experiment_at_scale_script.sh
182 lines (159 loc) · 10.3 KB
/
ood_experiment_at_scale_script.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
#!/bin/bash
#
# ---------------------------------------------------------------------------
# EXPERIMENT CONFIGURATION
# ---------------------------------------------------------------------------
# GPU made visible to the training processes started by this script.
export CUDA_VISIBLE_DEVICES=1

# Google Drive "download ids", one per dataset archive.
declare -A DATA_DLIDS=(
  [MNIST]="10pULG3xRIkl5tDo6VJHUK6NHIguMT7Jq"
  [FASHIONMNIST]="11AJ-OEgtj7XDeLHPVEyWuvHzFIIadj9_"
  [CIFAR10]="1O6uarg54CwtZ3h_B6YzD1KW9nbQ-E7Xl"
  [TINYIMAGENET]="10i1FV1SgXxMWfgTEpAXr0qe7q7s6Ko6e"
  [SVHN]="1wgTQJOtGxWPLNKPMuDc7vzYejYnWJsVr"
  [SVHN-different]="10HbYSMt3CbHeieUBqO675eNpAef-UY3_"
  [GaussianNoise]="1GXljou_EJGcdfVsfVJiVMVo7-RKNu106"
  [SALTANDPEPPER]="1iBwKyR7M4_ca2Ti7xW5FJ-BNdqEDG-vK"
  [FASHIONPRODUCT]="1zN1BF1u1SJl81JpH6hexYgrVvivaGxa5"
)

# Location of each dataset relative to the working directory once unpacked.
declare -A DATA_FPATHS=(
  [MNIST]="data/MNIST/"
  [FASHIONMNIST]="data/FASHIONMNIST/"
  [CIFAR10]="data/CIFAR10/"
  [TINYIMAGENET]="data/TINYIMAGENET/"
  [SVHN-different]="data/SVHN-different/"
  [GaussianNoise]="data/GaussianNoise/"
  [SALTANDPEPPER]="data/SALTANDPEPPER/"
  [SVHN]="data/SVHN/"
  [FASHIONPRODUCT]="data/FASHIONPRODUCT/"
)

# Dataset the experiment is centred on; it supplies the in-distribution data.
BASE_DATA=CIFAR10
# Contrasting dataset used as OOD source when OOD_TYPE is "different"
# (a single dataset name, not an array).
DIFFERENT_DATA=MNIST
# OOD share of the unlabeled data, in percentage points.
OOD_PERC_PP_LIST=(0 50 100)
# Number of unlabeled samples (original note: should be 3000).
NUM_UNLABELED=3000
# Lowest / highest class id present in the dataset.
MIN_CLASS_ID=0
MAX_CLASS_ID=9
# How many classes are drawn as in-distribution classes.
NUM_CLASSES_IN_DIST=5
# Labeled-sample counts to sweep over.
NUM_LABELED_LIST=(60 100 150)
# "half-half": OOD samples are the remaining classes of the same dataset.
# "different": OOD samples come from DIFFERENT_DATA.
OOD_TYPE=different
# NOTE: the class draw was meant to be seeded by the batch id, but the
# --random-source option on the shuf calls further down is commented out.
#
# ---------------------------------------------------------------------------
# RUN CONFIGURATION (a run = one iteration of the experiment on one of the
# random data batches)
# ---------------------------------------------------------------------------
# Batch ids, one per run.
BATCHES=({0..9})
#
# ---------------------------------------------------------------------------
# MIXMATCH ALGORITHM CONFIGURATION
# ---------------------------------------------------------------------------
MODEL=wide_resnet
DATASET=CIFAR10-BASELINE
RESULTS_FILE=stats_OOD_4_SSDL.csv
WORKERS=1
EPOCHS=50          # original note: should be 50
BATCH_SIZE=16      # original note: should be 16
LR=0.0002
WEIGHT_DECAY=0.0001
K_TRANSFORMS=2
T_SHARPENING=0.25
ALPHA_MIX=0.75
MODE=ssdl
BALANCED=5         # original note: -1 = no balancing, 5 = balanced
GAMMA_US=25        # gamma for the unsupervised loss
IMG_SIZE=32
NORM_STATS=MNIST   # marked as unused by the original author
#
LOG_FOLDER=logs
SAVE_WEIGHTS=FALSE
WEIGHTS_PATH=
RAMPUP_COEFFICIENT=3000
#
# Number of training processes started before the launcher waits.
N=10
#DOWNLOAD DATA
#######################################
# Fetch one dataset archive from Google Drive into the current directory,
# unpack it there and delete the archive afterwards.
# Globals:
#   DATA_DLIDS (read) - maps dataset name to its Google Drive file id
# Arguments:
#   $1 - dataset name; key into DATA_DLIDS and base name of the <name>.zip
# Returns:
#   0 on success; non-zero if no id is configured, the download fails or
#   the archive cannot be unpacked.
#######################################
download_dataset() {
  local name=$1
  local base_url="https://docs.google.com/uc?export=download"
  local file_id cookies confirm
  local status=0

  if [[ -z "$name" ]]; then
    echo "download_dataset: no dataset name given" >&2
    return 1
  fi
  file_id=${DATA_DLIDS[$name]:-}
  if [[ -z "$file_id" ]]; then
    echo "download_dataset: no download id configured for '$name'" >&2
    return 1
  fi

  # Private cookie jar instead of a fixed /tmp/cookies.txt, so concurrent
  # invocations of this script cannot clobber each other's session.
  cookies=$(mktemp) || return 1

  # First request: store the session cookies and pull the "confirm=" token
  # out of the returned page (empty if the page carries none).
  confirm=$(wget --quiet --save-cookies "$cookies" --keep-session-cookies --no-check-certificate "${base_url}&id=${file_id}" -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')

  # Second request: the actual download, replaying cookies and token.
  wget --load-cookies "$cookies" "${base_url}&confirm=${confirm}&id=${file_id}" -O "${name}.zip" || status=$?
  rm -f -- "$cookies"
  if ((status != 0)); then
    echo "download_dataset: download of '$name' failed" >&2
    rm -f -- "${name}.zip" # wget -O leaves a (possibly empty) file behind
    return "$status"
  fi

  unzip "${name}.zip" || status=$?
  rm -- "${name}.zip"
  return "$status"
}

# Create the data directory (no error if it already exists) and work inside
# it. Abort if it cannot be entered: otherwise the archives would be unpacked
# into the wrong place and the "cd .." below would leave the work directory.
mkdir -p data
cd data || { echo "Cannot enter data directory" >&2; exit 1; }
# Base (in-distribution) dataset.
download_dataset "$BASE_DATA" || { echo "Could not fetch dataset '$BASE_DATA'" >&2; exit 1; }
# Contrasting dataset used for the "different" OOD setting.
download_dataset "$DIFFERENT_DATA" || { echo "Could not fetch dataset '$DIFFERENT_DATA'" >&2; exit 1; }
# Back to the work directory.
cd ..
#EXPERIMENTS
# Root of the base dataset from which the train/val sets are built.
# Per the original author's note, it is expected to contain a directory
# all/ with one sub-directory per class id, each holding that class's samples.
PATH_BASE=${DATA_FPATHS[$BASE_DATA]}
# Root of the contrasting dataset used as OOD source in the "different" setting.
PATH_DIFFERENT=${DATA_FPATHS[$DIFFERENT_DATA]}
#######################################
# Run one MixMatch training job (MixMatch_OOD_main.py) on a single batch.
# Globals (read):
#   PATH_BASE, DATASET, RESULTS_FILE, WORKERS, EPOCHS, BATCH_SIZE, LR,
#   WEIGHT_DECAY, K_TRANSFORMS, T_SHARPENING, ALPHA_MIX, MODE, BALANCED,
#   GAMMA_US, NUM_LABELED (set by the caller's loop), MODEL,
#   NUM_CLASSES_IN_DIST, IMG_SIZE, LOG_FOLDER, NORM_STATS, SAVE_WEIGHTS,
#   WEIGHTS_PATH, RAMPUP_COEFFICIENT
# Arguments:
#   $1 - batch id; selects the labeled and unlabeled batch directories
#   $2 - OOD share as a decimal (accepted for call compatibility, not used)
#   $3 - OOD share in percentage points; part of the unlabeled dir name
#   $4 - number of unlabeled samples; part of the unlabeled dir name
#   $5 - in-distribution class ids (accepted for call compatibility, not used)
# Returns:
#   exit status of the python process
#######################################
train_batch() {
  local batch_id=$1
  local ood_perc_pp=$3
  local num_unlabeled=$4
  local path_labeled="$PATH_BASE/batches_labeled_in_dist/batch_$batch_id"
  local path_unlabeled="$PATH_BASE/unlabeled/batch_${batch_id}_num_unlabeled_${num_unlabeled}_ood_perc_${ood_perc_pp}"

  # The command is built as an array and every value is quoted, so paths
  # containing whitespace stay a single argument and an empty setting is
  # passed as an empty value instead of shifting the following options.
  local -a cmd=(
    python MixMatch_OOD_main.py
    --dataset "$DATASET"
    --path_labeled "$path_labeled"
    --path_unlabeled "$path_unlabeled"
    --results_file_name "$RESULTS_FILE"
    --workers "$WORKERS"
    --epochs "$EPOCHS"
    --batch_size "$BATCH_SIZE"
    --lr "$LR"
    --weight_decay "$WEIGHT_DECAY"
    --K_transforms "$K_TRANSFORMS"
    --T_sharpening "$T_SHARPENING"
    --alpha_mix "$ALPHA_MIX"
    --mode "$MODE"
    --balanced "$BALANCED"
    --lambda_unsupervised "$GAMMA_US"
    --number_labeled "$NUM_LABELED"
    --model "$MODEL"
    --num_classes "$NUM_CLASSES_IN_DIST"
    --size_image "$IMG_SIZE"
    --log_folder "$LOG_FOLDER"
    --norm_stats "$NORM_STATS"
    --save_weights "$SAVE_WEIGHTS"
    --weights_path_name "$WEIGHTS_PATH"
    --rampup_coefficient "$RAMPUP_COEFFICIENT"
  )
  "${cmd[@]}"
}
# Launch counter for the throttle in run_batches_in_parallel. It is kept
# across grid cells on purpose (it is only ever taken modulo N).
j=0

#######################################
# Draw the in-distribution classes for one batch and build its labeled and
# unlabeled partitions with utilities/dataset_partitioner.py.
# Globals:
#   MIN_CLASS_ID, MAX_CLASS_ID, NUM_CLASSES_IN_DIST, PATH_BASE, OOD_PERC,
#   NUM_UNLABELED (read); IDS_LIST (appended with the drawn class ids)
# Arguments:
#   $1 - batch id
#   $2 - directory the OOD samples are taken from
#######################################
partition_batch() {
  local batch_id=$1
  local path_ood=$2
  local ids in_dist_classes

  # NOTE: the draw is not seeded; the "--random-source" option that was meant
  # to make it reproducible per batch id was disabled by the original author.
  ids=$(shuf -i "${MIN_CLASS_ID}-${MAX_CLASS_ID}" -n "$NUM_CLASSES_IN_DIST")
  IDS_LIST+=("$ids")
  # shuf prints one id per line; the partitioner gets a comma-separated list.
  in_dist_classes=${ids//$'\n'/,}

  python utilities/dataset_partitioner.py --mode train_partitioner --path_base "$PATH_BASE/" --batch_id_num "$batch_id" --list_in_dist_classes "$in_dist_classes"
  python utilities/dataset_partitioner.py --mode unlabeled_partitioner --path_ood "$path_ood" --path_iod "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id/train" --path_dest "$PATH_BASE/unlabeled" --ood_perc "$OOD_PERC" --num_unlabeled "$NUM_UNLABELED" --batch_id_num "$batch_id"
}

#######################################
# Start train_batch for every entry of BATCHES as a background job and block
# until all of them have finished.
# Globals:
#   BATCHES, IDS_LIST, N, OOD_PERC, OOD_PERC_PP, NUM_UNLABELED (read);
#   j (updated)
#######################################
run_batches_in_parallel() {
  local i
  for ((i = 0; i < ${#BATCHES[@]}; ++i)); do
    # Whenever the counter wraps around N, wait for all running jobs before
    # starting more. The short sleep prevents conflicts between runs while
    # they create their documentation files.
    ((j = j % N))
    sleep 3
    ((j++ == 0)) && wait
    train_batch "${BATCHES[i]}" "$OOD_PERC" "$OOD_PERC_PP" "$NUM_UNLABELED" "${IDS_LIST[i]}" &
  done
  wait
}

#######################################
# Delete the partitions generated for the current grid cell so the next
# cell starts from a clean state.
# Globals:
#   PATH_BASE (read); the :? guard aborts instead of deleting below "/" when
#   it is empty.
#######################################
remove_partitions() {
  rm -r "${PATH_BASE:?PATH_BASE is empty}/unlabeled/"
  rm -r "${PATH_BASE:?PATH_BASE is empty}/batches_labeled_in_dist/"
  rm -r "${PATH_BASE:?PATH_BASE is empty}/batches_unlabeled_out_dist/"
}

if [[ "$OOD_TYPE" == "half-half" ]]; then
  for NUM_LABELED in "${NUM_LABELED_LIST[@]}"; do     # axis 1: amount of labeled data
    for OOD_PERC_PP in "${OOD_PERC_PP_LIST[@]}"; do   # axis 2: OOD share of the unlabeled data
      # percentage points -> decimal fraction (e.g. 66 -> 0.66) for the partitioner
      OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01")
      IDS_LIST=() # class ids per batch, in BATCHES order
      # First pass over the batches creates the data; OOD samples are taken
      # from the batch's own batches_unlabeled_out_dist directory.
      for batch_id in "${BATCHES[@]}"; do
        partition_batch "$batch_id" "$PATH_BASE/batches_unlabeled_out_dist/batch_$batch_id"
      done
      # Second pass runs MixMatch in parallel.
      run_batches_in_parallel
      remove_partitions
    done
  done
# String comparison: the former "-eq" compared both sides arithmetically
# (0 == 0), which is true for any value and made the else branch unreachable.
elif [[ "$OOD_TYPE" == "different" ]]; then
  # No loop over several contrasting datasets: experiments are split into
  # individual datasets (original author's note).
  for NUM_LABELED in "${NUM_LABELED_LIST[@]}"; do     # axis 1: amount of labeled data
    for OOD_PERC_PP in "${OOD_PERC_PP_LIST[@]}"; do   # axis 2: OOD share of the unlabeled data
      OOD_PERC=$(bc <<< "${OOD_PERC_PP} * 0.01")
      IDS_LIST=()
      # OOD samples come from the contrasting dataset; per the original note
      # that path has one sub-folder holding all unlabeled images.
      for batch_id in "${BATCHES[@]}"; do
        partition_batch "$batch_id" "$PATH_DIFFERENT"
      done

      # One extra MixMatch invocation with --exp_creator "Yes" before the
      # parallel runs.
      # NOTE(review): the labeled path uses the batch id left over from the
      # loop above (the last batch) while the unlabeled path is hard-coded to
      # batch_0 - kept as in the original; confirm this pairing is intended.
      creator_cmd=(
        python MixMatch_OOD_main.py
        --dataset "$DATASET"
        --path_labeled "$PATH_BASE/batches_labeled_in_dist/batch_$batch_id"
        --path_unlabeled "$PATH_BASE/unlabeled/batch_0_num_unlabeled_${NUM_UNLABELED}_ood_perc_${OOD_PERC_PP}"
        --results_file_name "$RESULTS_FILE"
        --workers "$WORKERS"
        --epochs "$EPOCHS"
        --batch_size "$BATCH_SIZE"
        --lr "$LR"
        --weight_decay "$WEIGHT_DECAY"
        --K_transforms "$K_TRANSFORMS"
        --T_sharpening "$T_SHARPENING"
        --alpha_mix "$ALPHA_MIX"
        --mode "$MODE"
        --balanced "$BALANCED"
        --lambda_unsupervised "$GAMMA_US"
        --number_labeled "$NUM_LABELED"
        --model "$MODEL"
        --num_classes "$NUM_CLASSES_IN_DIST"
        --size_image "$IMG_SIZE"
        --log_folder "$LOG_FOLDER"
        --norm_stats "$NORM_STATS"
        --save_weights "$SAVE_WEIGHTS"
        --weights_path_name "$WEIGHTS_PATH"
        --rampup_coefficient "$RAMPUP_COEFFICIENT"
        --exp_creator "Yes"
      )
      "${creator_cmd[@]}"

      # Each run now receives its own batch's class ids, as in the half-half
      # branch (previously every run was handed the last batch's list).
      run_batches_in_parallel
      remove_partitions
    done
  done
else
  echo "No valid OOD_TYPE was specified. Choose 'half-half' or 'different'" >&2
fi
# Final cleanup: remove the complete data directory, downloads included.
rm -r -- data/
# Note from the original author: when image sizes of the datasets do not match
# exactly, images are resized dynamically - that logic is defined in MixMatch.