diff --git a/data-wiki/README.md b/data-wiki/README.md
index 9140ddf..257f42d 100644
--- a/data-wiki/README.md
+++ b/data-wiki/README.md
@@ -66,8 +66,42 @@
 parallel --verbose -j 8 ./convert-doc.sh '<' {} '>' {.}.md \
   < wiki/docs.txt \
   2>&1 | tee convert.log
 ```
+This may take a few days. One downside of using Pandoc here is that it does not handle Wikipedia
+template references: it appears to simply drop them from the output, which leaves some sentences
+with words missing in the middle. Since this happens relatively rarely, it should not be much of
+a problem.
+
+Also, the conversion sometimes crashes:
+```
+/home/leod/src/hncynic/data-wiki/convert-doc.sh < docs/85/Munich%E2%80%93Augsburg_railway.txt > docs/85/Munich%E2%80%93Augsburg_railway.md
+Traceback (most recent call last):
+  File "/home/leod/src/hncynic/data-wiki/filter_markdown.py", line 114, in <module>
+    main()
+  File "/home/leod/src/hncynic/data-wiki/filter_markdown.py", line 98, in main
+    return run_filter(action, prepare=prepare, doc=doc)
+  File "/home/leod/.local/lib/python3.6/site-packages/panflute/io.py", line 260, in run_filter
+    return run_filters([action], *args, **kwargs)
+...
+  File "/home/leod/.local/lib/python3.6/site-packages/panflute/elements.py", line 1061, in __init__
+    self.header = header
+  File "/home/leod/.local/lib/python3.6/site-packages/panflute/elements.py", line 1097, in header
+    raise IndexError(msg)
+IndexError: table header has an incorrect number of cols: 6 rows but expected 8
+pandoc: Error running filter /home/leod/src/hncynic/data-wiki/filter_markdown.py
+Filter returned error status 1
+```
+## Convert to TSV
+We use each section of an article as an individual training example.
+
+```
+find docs -name '*.md' > docs.md.txt
+parallel --verbose -j 8 \
+  ~/src/hncynic/data-wiki/clean_text.sh \
+  '<' {} \
+  '|' ~/src/hncynic/data-wiki/md_to_tsv.py {} \
+  '>' {.}.tsv \
+  < docs.md.txt \
+  > convert.tsv.log 2>&1
+```
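Each nonempty section thus becomes one TSV line: the article title and section headers, a tab, then the section text. A purely illustrative sketch of such a line (the exact header formatting is whatever `md_to_tsv.py` emits; `<NL>` is the paragraph separator introduced by the next two changes):

```
Munich–Augsburg railway : History	First paragraph of the section. <NL> <NL> Second paragraph.
```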
diff --git a/data-wiki/md_to_tsv.py b/data-wiki/md_to_tsv.py
index 4d451e8..4f085a6 100755
--- a/data-wiki/md_to_tsv.py
+++ b/data-wiki/md_to_tsv.py
@@ -41,7 +41,7 @@ def write_tsv(self, out, header_path=None):
       out.write('\t')
       for i in range(len(self.text)-1):
         out.write(self.text[i].strip())
-        out.write('\n\n')
+        out.write(' <NL> <NL> ')
       out.write(self.text[-1])
       out.write('\n')
 
@@ -98,7 +98,6 @@
   title = urllib.parse.unquote(basename.replace('_', ' '))
 
-  with open(filename) as f:
-    tree = parse_md_tree(f, title)
-    #tree.write(sys.stdout)
-    tree.write_tsv(sys.stdout)
+  tree = parse_md_tree(sys.stdin, title)
+  #tree.write(sys.stdout)
+  tree.write_tsv(sys.stdout)
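With this change, `md_to_tsv.py` reads the Markdown from stdin and uses its filename argument only to derive the article title (the URL-decoded basename, with underscores turned into spaces). Expanded for a single file, the `parallel` pipeline above amounts to:

```
# e.g. docs/85/Munich%E2%80%93Augsburg_railway.md gives the title "Munich–Augsburg railway"
~/src/hncynic/data-wiki/clean_text.sh < docs/85/Munich%E2%80%93Augsburg_railway.md \
  | ~/src/hncynic/data-wiki/md_to_tsv.py docs/85/Munich%E2%80%93Augsburg_railway.md \
  > docs/85/Munich%E2%80%93Augsburg_railway.tsv
```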
diff --git a/data/moses_tokenizer_protected.txt b/data/moses_tokenizer_protected.txt
index 599dbaa..b67112e 100644
--- a/data/moses_tokenizer_protected.txt
+++ b/data/moses_tokenizer_protected.txt
@@ -1,4 +1,5 @@
+<NL>
 http://\S+
 https://\S+
 ftp://\S+
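The new first line protects the `<NL>` token from being split into `< NL >` during tokenization. Presumably this file is passed to the Moses tokenizer's `-protected` option; a sketch, assuming a Moses checkout under `~/src/mosesdecoder`:

```
echo 'First paragraph. <NL> <NL> Second paragraph.' \
  | ~/src/mosesdecoder/scripts/tokenizer/tokenizer.perl -l en \
      -protected ~/src/hncynic/data/moses_tokenizer_protected.txt
```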
diff --git a/train/README.md b/train/README.md
index 68404f9..869394c 100644
--- a/train/README.md
+++ b/train/README.md
@@ -36,9 +36,16 @@
 the power plug.
 ```
 onmt-main train --config opennmt_config.yml --model_type Transformer --num_gpus 1
 ```
+I trained the model for about 40K steps with `opennmt_config.yml`. I noticed that the loss wasn't
+improving much after that, so I got worried and increased the batch size (which is known to help
+when training Transformer models) by performing gradient accumulation, as in
+`opennmt_config_larger_batch.yml`. As can be seen in the plot below, this seems to have helped.
 
 ![training loss](train.svg)
 
+Unfortunately, I don't have a plot for the dev loss, since I forgot to turn on dev evaluation.
+What a bummer.
+
 ### Evaluate
 ### Export
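For reference: in OpenNMT-tf 1.x, gradient accumulation is configured through the `gradients_accum` parameter, which accumulates gradients over N mini-batches before applying an update, multiplying the effective batch size by N. A hypothetical sketch of what `opennmt_config_larger_batch.yml` might contain (the actual file is in the repo; all values here are made up):

```
params:
  gradients_accum: 2     # accumulate 2 batches per update

train:
  batch_size: 3072       # tokens per mini-batch; effective batch is then 6144 tokens
  batch_type: tokens
```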