-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess.sh
executable file
·56 lines (51 loc) · 1.96 KB
/
preprocess.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
## Variables
# $data_path -> which data to process
# $lang -> language of the data
# $type -> final format of the data
# $pretokenize -> whether to tokenize or not
# $truecase -> whether to truecase or not
# $tc_model -> path to truecase model
# $subword_model_type -> Type of subword model used for preprocessing
# $bpe_model -> path to subword model to use for preprocessing
## Stage 1: tokenization
# Writes ${data_path}.tok.${lang} from ${data_path}.${lang}. The stage is
# skipped when that output file already exists.
# Reads:  data_path, lang, pretokenize, already_tokenized, moses_scripts,
#         kytea_path
# Sets:   outf (suffix of the file this stage produced, used by later stages)
outf="tok"
if [[ ! -f "${data_path}.${outf}.${lang}" ]]; then
  # Tokenize only when tokenization is requested AND the input has not been
  # tokenized yet. Both tests must hold, hence '&&': a single '&' would run
  # the first test in the background and branch on the second one alone.
  if [[ "$pretokenize" == True && "$already_tokenized" == False ]]; then
    echo "Tokenizing $data_path.$lang ..."
    if [[ "$lang" != "ja" ]]; then
      # Punctuation normalization followed by the Moses tokenizer.
      "$moses_scripts/tokenizer/normalize-punctuation.perl" -l "$lang" \
        < "${data_path}.${lang}" \
        | "$moses_scripts/tokenizer/tokenizer.perl" -threads 8 -a -l "$lang" \
          > "${data_path}.${outf}.${lang}"
    else
      # Japanese input is handled by the tool at $kytea_path instead of the
      # Moses tokenizer. The variable is deliberately left unquoted so that a
      # caller-supplied value carrying extra arguments keeps working.
      # shellcheck disable=SC2086
      $kytea_path -out tok -notags \
        < "${data_path}.${lang}" \
        > "${data_path}.${outf}.${lang}"
    fi
  else
    # No tokenization wanted/needed: pass the data through unchanged so the
    # next stage always finds a ".tok" file.
    cat < "${data_path}.${lang}" > "${data_path}.${outf}.${lang}"
  fi
fi
## Stage 2: true-casing / lower-casing
# Writes ${data_path}.tok.tc.${lang} from the previous stage's output. The
# stage is skipped when that output file already exists.
# Reads:  data_path, lang, truecase, tc_model, moses_scripts, outf (suffix
#         left by the previous stage)
# Sets:   inf (suffix of this stage's input), outf (suffix of its output)
inf=$outf
outf="tok.tc"
if [[ ! -f "${data_path}.${outf}.${lang}" ]]; then
  if [[ "$truecase" == True ]]; then
    echo " * True-casing ${data_path}.$inf.$lang ..."
    "$moses_scripts/recaser/truecase.perl" \
      -model "$tc_model" \
      < "${data_path}.${inf}.${lang}" \
      > "${data_path}.${outf}.${lang}"
  else
    # Without true-casing the data is lower-cased instead; the result still
    # gets the ".tok.tc" suffix so the next stage finds its input.
    "$moses_scripts/tokenizer/lowercase.perl" \
      < "${data_path}.${inf}.${lang}" \
      > "${data_path}.${outf}.${lang}"
  fi
fi
## Stage 3: subword segmentation
# When $subword is True and ${data_path}.tok.tc.bpe.${lang} does not exist
# yet, segments the previous stage's output and copies the result to
# ${data_path}.${type}.
# Reads:  data_path, lang, type, subword, subword_model_type, bpe_model,
#         bpe_scripts_path, outf (suffix left by the previous stage)
# Sets:   inf (suffix of this stage's input), outf (suffix of its output)
# NOTE(review): the final copy to ${data_path}.${type} only happens when this
# stage actually runs; with subword != True, or when the ".tok.tc.bpe" file is
# already present, the final file is not (re)written here - confirm that this
# is intended.
inf=$outf
outf="tok.tc.bpe"
if [[ "$subword" == True && ! -f "${data_path}.${outf}.${lang}" ]]; then
  if [[ "$subword_model_type" == "bpe" ]]; then
    # BPE: apply_bpe.py reads stdin and writes stdout, using the
    # per-language codes file ${bpe_model}.${lang}.
    python2 "$bpe_scripts_path/apply_bpe.py" \
      --codes "${bpe_model}.${lang}" \
      < "${data_path}.${inf}.${lang}" \
      > "${data_path}.${outf}.${lang}"
  else
    # Any other model type goes through get_sentencepiece.py (looked up
    # relative to the current directory), called with the per-language model,
    # this stage's input path and its output path as positional arguments.
    python get_sentencepiece.py encode \
      "${bpe_model}.${lang}" "${data_path}.${inf}.${lang}" \
      "${data_path}.${outf}.${lang}"
  fi
  # Publish the segmented data under the final name.
  cat < "${data_path}.${outf}.${lang}" > "${data_path}.${type}"
fi