diff --git a/.travis.yml b/.travis.yml index 5e0aff001..4fc315147 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,6 +2,21 @@ language: python python: - "2.7" - "3.6" +env: + global: + - T2T_PROBLEM=algorithmic_reverse_binary40_test + - T2T_DATA_DIR=/tmp/t2t-data + - T2T_TRAIN_DIR=/tmp/t2t-train + matrix: + - TF_VERSION="1.4.*" + - TF_VERSION="1.5.*" + - TF_VERSION="1.6.0rc1" +matrix: + exclude: + - python: "3.6" + env: TF_VERSION="1.4.*" + - python: "3.6" + env: TF_VERSION="1.6.0rc1" before_install: - echo "deb [arch=amd64] http://storage.googleapis.com/tensorflow-serving-apt stable tensorflow-model-server tensorflow-model-server-universal" | sudo tee /etc/apt/sources.list.d/tensorflow-serving.list - curl https://storage.googleapis.com/tensorflow-serving-apt/tensorflow-serving.release.pub.gpg | sudo apt-key add - @@ -9,24 +24,27 @@ before_install: - sudo apt-get install -qq libhdf5-dev - sudo apt-get install -qq tensorflow-model-server install: - - pip install -q .[tensorflow] + - pip install -q "tensorflow==$TF_VERSION" - pip install -q .[tests] # Make sure we have the latest version of numpy - avoid problems we were # seeing with Python 3 - pip install -q -U numpy -env: - global: - - T2T_PROBLEM=algorithmic_reverse_binary40_test - - T2T_DATA_DIR=/tmp/t2t-data - - T2T_TRAIN_DIR=/tmp/t2t-train script: # Check import - python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)" # Run tests - - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py --ignore=tensor2tensor/bin/t2t_trainer_test.py + - pytest + --ignore=tensor2tensor/utils/registry_test.py + --ignore=tensor2tensor/utils/trainer_lib_test.py + --ignore=tensor2tensor/visualization/visualization_test.py + --ignore=tensor2tensor/problems_test.py + --ignore=tensor2tensor/bin/t2t_trainer_test.py + --ignore=tensor2tensor/data_generators/algorithmic_math_test.py + --ignore=tensor2tensor/rl/rl_trainer_lib_test.py - pytest tensor2tensor/utils/registry_test.py - pytest tensor2tensor/utils/trainer_lib_test.py + - pytest tensor2tensor/visualization/visualization_test.py # Run installed scripts - t2t-datagen 2>&1 | grep translate && echo passed @@ -41,8 +59,8 @@ script: - t2t-decoder --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR --decode_hparams='num_samples=10' # Export and query (on Python 2 only) - - t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then + - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]] && [[ "$TF_VERSION" == "1.5.*" ]]; then + t2t-exporter --problems=$T2T_PROBLEM --data_dir=$T2T_DATA_DIR --model=transformer --hparams_set=transformer_tiny --output_dir=$T2T_TRAIN_DIR; pip install tensorflow-serving-api; tensorflow_model_server --port=9000 --model_name=my_model --model_base_path=$T2T_TRAIN_DIR/export/Servo & sleep 10; diff --git a/docs/new_problem.md b/docs/new_problem.md index 342d7abb1..7564e4ad8 100644 --- a/docs/new_problem.md +++ b/docs/new_problem.md @@ -9,286 +9,232 @@ welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg)](CO [![Gitter](https://img.shields.io/gitter/room/nwjs/nw.js.svg)](https://gitter.im/tensor2tensor/Lobby) 
[![License](https://img.shields.io/badge/License-Apache%202.0-brightgreen.svg)](https://opensource.org/licenses/Apache-2.0)

-Let's add a new dataset together and train the transformer model. We'll be learning to define English words by training the transformer to "translate" between English words and their definitions on a character level.
+Let's add a new dataset together and train the
+[Transformer](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/models/transformer.py)
+model on it. We'll give the model a line of poetry, and it will learn to
+generate the next line.

-# About the Problem
+# Defining the `Problem`

-For each problem we want to tackle we create a new problem class and register it. Let's call our problem `Word2def`.
+For each problem we want to tackle, we create a new subclass of `Problem` and
+register it. Let's call our problem `PoetryLines`.

-Since many text2text problems share similar methods, there's already a class
-called `Text2TextProblem` that extends the base problem class, `Problem`
-(both found in
-[`problem.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)).
+Since many text-to-text problems share similar methods, there's already a class
+called
+[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/text_problems.py)
+that extends the base problem class
+[`Problem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py)
+and makes it easy to add text-to-text problems.

-For our problem, we can go ahead and create the file `word2def.py` in the
-[`data_generators`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/)
-folder and add our new problem, `Word2def`, which extends
-[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py).
-Let's also register it while we're at it so we can specify the problem through
-flags.
+In that same file, there are other base classes that make it easy to add text
+classification tasks (`Text2ClassProblem`) and language modeling tasks
+(`Text2SelfProblem`).

-```python
-@registry.register_problem
-class Word2def(problem.Text2TextProblem):
-  """Problem spec for English word to dictionary definition."""
-  @property
-  def is_character_level(self):
-    ...
-```
-
-We need to implement the following methods from
-[`Text2TextProblem`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py).
-in our new class:
-* is_character_level
-* targeted_vocab_size
-* generator
-* input_space_id
-* target_space_id
-* num_shards
-* vocab_name
-* use_subword_tokenizer
-
-Let's tackle them one by one:
-
-**input_space_id, target_space_id, is_character_level, targeted_vocab_size, use_subword_tokenizer**:
-
-SpaceIDs tell Tensor2Tensor what sort of space the input and target tensors are
-in. These are things like, EN_CHR (English character), EN_TOK (English token),
-AUDIO_WAV (audio waveform), IMAGE, DNA (genetic bases). The complete list can be
-found at
-[`data_generators/problem.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/problem.py).
-in the class `SpaceID`.
+For our problem, let's create the file `poetry_lines.py`, add our new problem
+`PoetryLines`, which extends `Text2TextProblem`, and register it so that it is
+accessible via a command-line flag.
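+
+Registration is what makes the problem visible to the command-line tools. As a
+minimal sketch (the full problem definition follows below), the registered name
+is simply the `snake_case` version of the class name:
+
+```python
+from tensor2tensor.utils import registry
+
+# Once the module defining PoetryLines has been imported, the problem can be
+# looked up (and passed to t2t-datagen / t2t-trainer) by its snake_case name.
+poetry_problem = registry.problem("poetry_lines")
+```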
-Since we're generating definitions and feeding in words at the character level, we set `is_character_level` to true, and use the same SpaceID, EN_CHR, for both input and target. Additionally, since we aren't using tokens, we don't need to give a `targeted_vocab_size` or define `use_subword_tokenizer`. - -**vocab_name**: - -`vocab_name` will be used to name your vocabulary files. We can call ours `'vocab.word2def.en'` - -**num_shards**: - -The number of shards to break data files into. +Here's the Problem in full. We'll go step by step through it. ```python -@registry.register_problem() -class Word2def(problem.Text2TextProblem): - """Problem spec for English word to dictionary definition.""" +import re - @property - def is_character_level(self): - return True +from gutenberg import acquire +from gutenberg import cleanup - @property - def vocab_name(self): - return "vocab.word2def.en" +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_problems +from tensor2tensor.utils import registry - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR +@registry.register_problem +class PoetryLines(text_problems.Text2TextProblem): + """Predict next line of poetry from the last line. From Gutenberg texts.""" @property - def target_space_id(self): - return problem.SpaceID.EN_CHR + def approx_vocab_size(self): + return 2**13 # ~8k @property - def num_shards(self): - return 100 + def is_generate_per_split(self): + # generate_data will shard the data into TRAIN and EVAL for us. + return False @property - def use_subword_tokenizer(self): - return False + def dataset_splits(self): + """Splits of data to produce and number of output shards for each.""" + # 10% evaluation data + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 90, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 10, + }] + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + del data_dir + del tmp_dir + del dataset_split + + + books = [ + # bookid, skip N lines + (19221, 223), + (15553, 522), + ] + + for (book_id, toskip) in books: + text = cleanup.strip_headers(acquire.load_etext(book_id)).strip() + lines = text.split("\n")[toskip:] + prev_line = None + ex_count = 0 + for line in lines: + # Any line that is all upper case is a title or author name + if not line or line.upper() == line: + prev_line = None + continue + + line = re.sub("[^a-z]+", " ", line.strip().lower()) + if prev_line and line: + yield { + "inputs": prev_line, + "targets": line, + } + ex_count += 1 + prev_line = line ``` -**generator**: +## Vocabulary specification -We're almost done. `generator` generates the training and evaluation data and -stores them in files like "word2def_train.lang1" in your DATA_DIR. Thankfully -several commonly used methods like `character_generator`, and `token_generator` -are already written in the file -[`translate.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/translate.py). -We will import `character_generator` and -[`text_encoder`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/text_encoder.py) -to write: +The text generated is encoded with a vocabulary for training. By default, it is +a `SubwordTextEncoder` that is built with an approximate vocab size specified by +the user. It's fully invertible (no out-of-vocab tokens) with a fixed-size vocab +which makes it ideal for text problems. 
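+
+As a rough illustration of that invertibility (the vocab file path below is
+hypothetical; data generation produces the file for you), encoding and then
+decoding a string recovers it exactly:
+
+```python
+from tensor2tensor.data_generators import text_encoder
+
+# Load a generated subword vocabulary (hypothetical path).
+encoder = text_encoder.SubwordTextEncoder("/tmp/t2t-data/vocab.poetry_lines.8192")
+ids = encoder.encode("and miles to go before i sleep")
+assert encoder.decode(ids) == "and miles to go before i sleep"
+```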
-```python - def generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS - return character_generator(datasets[0], datasets[1], character_vocab, EOS) -``` +You can also choose to use a character-level encoder or a token encoder where +you provide the vocab file yourself. See `Text2TextProblem.vocab_type`. -Now our `word2def.py` file looks like the below: +Here we specify that we're going to have a vocabulary with approximately 8,000 +subwords. ```python -@registry.register_problem() -class Word2def(problem.Text2TextProblem): - """Problem spec for English word to dictionary definition.""" - @property - def is_character_level(self): - return True - - @property - def vocab_name(self): - return "vocab.word2def.en" - - def generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS - return character_generator(datasets[0], datasets[1], character_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR - - @property - def target_space_id(self): - return problem.SpaceID.EN_CHR - @property - def num_shards(self): - return 100 - - @property - def use_subword_tokenizer(self): - return False + def approx_vocab_size(self): + return 2**13 # ~8k ``` -## Data: -Now we need to tell Tensor2Tensor where our data is located. - -I've gone ahead and split all words into a train and test set and saved them in files called `words.train.txt`, `words.test.txt`, -`definitions.train.txt`, and `definitions.test.txt` in a directory called `LOCATION_OF_DATA/`. Let's tell T2T where these files are: +## Splitting data between Train and Eval -```python -# English Word2def datasets -_WORD2DEF_TRAIN_DATASETS = [ - LOCATION_OF_DATA + 'words_train.txt', - LOCATION_OF_DATA + 'definitions_train.txt' -] - -_WORD2DEF_TEST_DATASETS = [ - LOCATION_OF_DATA + 'words_test.txt', - LOCATION_OF_DATA + 'definitions_test.txt' -] -``` +By setting `is_generate_per_split=False`, the `generate_samples` method will +only be called once and the data will automatically be split across training and +evaluation data for us. This is useful because for our dataset we don't have +pre-existing "training" and "evaluation" sets. If we did, we'd set +`is_generate_per_split=True` so that `generate_samples` was called once per data +split. -## Putting it all together - -Now our `word2def.py` file looks like: +The `dataset_splits` method determines the fraction that goes to each split. The +training data will be generated into 90 files and the evaluation data into 10. +90% of the data will be for training. 10% of the data will be for evaluation. ```python -""" Problem definition for word to dictionary definition. 
-""" - -import os - -from tensor2tensor.data_generators import problem -from tensor2tensor.data_generators import text_encoder -from tensor2tensor.data_generators.translate import character_generator - -from tensor2tensor.utils import registry - -# English Word2def datasets -_WORD2DEF_TRAIN_DATASETS = [ - LOCATION_OF_DATA+'words_train.txt', - LOCATION_OF_DATA+'definitions_train.txt' -] - -_WORD2DEF_TEST_DATASETS = [ - LOCATION_OF_DATA+'words_test.txt', - LOCATION_OF_DATA+'definitions_test.txt' -] - -@registry.register_problem() -class Word2def(problem.Text2TextProblem): - """Problem spec for English word to dictionary definition.""" @property - def is_character_level(self): - return True - - @property - def vocab_name(self): - return "vocab.word2def.en" - - def generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _WORD2DEF_TRAIN_DATASETS if train else _WORD2DEF_TEST_DATASETS - return character_generator(datasets[0], datasets[1], character_vocab, EOS) + def is_generate_per_split(self): + # generate_data will shard the data into TRAIN and EVAL for us. + return False @property - def input_space_id(self): - return problem.SpaceID.EN_CHR + def dataset_splits(self): + """Splits of data to produce and number of output shards for each.""" + # 10% evaluation data + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 90, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 10, + }] +``` - @property - def target_space_id(self): - return problem.SpaceID.EN_CHR +## Generating samples - @property - def num_shards(self): - return 100 +`generate_samples` is the bulk of the code where we actually produce +dictionaries of poetry line pairs ("inputs" and "targets"). - @property - def use_subword_tokenizer(self): - return False +Some problems might require downloading, which can be done into `tmp_dir`. Some +problems may use their own token vocabulary file, in which case it can be copied +into `data_dir` before yielding samples. -``` - -# Hyperparameters -All hyperparamters inherit from `_default_hparams()` in `problem.py.` If you would like to customize your hyperparameters, register a new hyperparameter set in `word2def.py` like the example provided in the walkthrough. For example: +Here we iterate through the lines of a couple books of poetry and produce pairs +of lines for the model to train against. ```python -from tensor2tensor.models import transformer - -@registry.register_hparams -def word2def_hparams(): - hparams = transformer.transformer_base_single_gpu() # Or whatever you'd like to build off. - hparams.batch_size = 1024 - return hparams + def generate_samples(self, data_dir, tmp_dir, dataset_split): + del data_dir + del tmp_dir + del dataset_split + + books = [ + # bookid, skip N lines + (19221, 223), + (15553, 522), + ] + + for (book_id, toskip) in books: + text = cleanup.strip_headers(acquire.load_etext(book_id)).strip() + lines = text.split("\n")[toskip:] + prev_line = None + ex_count = 0 + for line in lines: + # Any line that is all upper case is a title or author name + if not line or line.upper() == line: + prev_line = None + continue + + line = re.sub("[^a-z]+", " ", line.strip().lower()) + if prev_line and line: + yield { + "inputs": prev_line, + "targets": line, + } + ex_count += 1 + prev_line = line ``` -# Test the data generation +That's all for the problem specification! We're ready to generate the data. 
+
+# Run data generation
 
-You can test data generation of your a problem in your own project with:
+You can run data generation of your problem in your own project with
+`t2t-datagen` and the `--t2t_usr_dir` flag, which should point to the directory
+containing an `__init__.py` file that imports `poetry_lines`, the file we just
+wrote.
 
 ```bash
-PROBLEM=word2def
+USR_DIR=...
+PROBLEM=poetry_lines
 DATA_DIR=$HOME/t2t_data
 TMP_DIR=/tmp/t2t_datagen
 mkdir -p $DATA_DIR $TMP_DIR
 
 t2t-datagen \
-  --t2t_usr_dir=$PATH_TO_YOUR_PROBLEM_DIR \
+  --t2t_usr_dir=$USR_DIR \
   --data_dir=$DATA_DIR \
   --tmp_dir=$TMP_DIR \
   --problem=$PROBLEM
 ```
 
-Where:
-* `PROBLEM` is the name of the class that was registered with
-  `@registry.register_problem()`, but converted from `CamelCase` to
-  `snake_case`.
-* `PATH_TO_YOUR_PROBLEM_DIR` is a path to the directory of your python problem
-  file.
+`PROBLEM` is the name of the class that was registered with
+`@registry.register_problem`, but converted from `CamelCase` to `snake_case`.
 
-If you plan to contribute to the tensor2tensor repository, you can install the
-local cloned version in developer mode with `pip install -e .` from the
-tensor2tensor directory. You can also add your new problem file to
-[`all_problems.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/all_problems.py).
+`USR_DIR` should be a directory with the `poetry_lines.py` file as well as an
+`__init__.py` file that imports it (`from . import poetry_lines`).
 
-# Run the problem
-Now that we've gotten our problem set up, let's train a model and generate
-definitions.
-
-To train, specify the problem name, the model, and hparams:
-```bash
-PROBLEM=word2def
-MODEL=transformer
-HPARAMS=word2def_hparams
-```
+If you plan to contribute problems to the tensor2tensor repository, you can
+clone the repository and install it in developer mode with `pip install -e .`.
 
-The rest of the steps are as given in the [walkthrough](walkthrough.md).
+# Train!
 
-What if we wanted to train a model to generate words given definitions? In T2T,
-we can change the problem name to be `PROBLEM=word2def_rev`.
+You can train exactly as you do in the [walkthrough](walkthrough.md) with flags
+`--problems=poetry_lines` and `--t2t_usr_dir=$USR_DIR`.
 
-All done. Let us know what definitions your model generated.
+All done. Let us know what amazing poetry your model writes!
diff --git a/docs/tutorials/asr_with_transformer.md b/docs/tutorials/asr_with_transformer.md
new file mode 100644
index 000000000..92c847ba8
--- /dev/null
+++ b/docs/tutorials/asr_with_transformer.md
@@ -0,0 +1,65 @@
+# Automatic Speech Recognition (ASR) with Transformer
+
+## Data set
+
+This tutorial uses the publicly available
+[Librispeech](http://www.openslr.org/12/) ASR corpus.
+
+## Generate the dataset
+
+To generate the dataset, use `t2t-datagen`. You need to create environment
+variables for a data directory `DATA_DIR` where the data is stored and for a
+temporary directory `TMP_DIR` where necessary data is downloaded.
+
+As the audio import in `t2t-datagen` uses `sox` to generate normalized
+waveforms, please install it as appropriate (e.g. `apt-get install sox`).
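+
+As a quick, optional check that `sox` is on your `PATH` before generating data:
+
+```
+sox --version
+```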
+
+```
+t2t-datagen --problem=librispeech --data_dir=$DATA_DIR --tmp_dir=$TMP_DIR
+```
+
+You can also use smaller versions of the dataset by replacing `librispeech`
+with `librispeech_clean` or `librispeech_clean_small`.
+
+## Training on GPUs
+
+To train a model on GPU, set up `OUT_DIR` and run the trainer:
+
+```
+t2t-trainer \
+  --model=transformer \
+  --hparams_set=transformer_librispeech \
+  --problems=librispeech \
+  --train_steps=120000 \
+  --eval_steps=3 \
+  --local_eval_frequency=100 \
+  --data_dir=$DATA_DIR \
+  --output_dir=$OUT_DIR
+```
+
+This model should achieve approximately 22% per-sequence accuracy after
+roughly 80,000 steps.
+
+## Training on Cloud TPUs
+
+To train a model on TPU, set up `OUT_DIR` and run the trainer:
+
+```
+t2t-trainer \
+  --model=transformer \
+  --hparams_set=transformer_librispeech_tpu \
+  --problems=librispeech \
+  --train_steps=120000 \
+  --eval_steps=3 \
+  --local_eval_frequency=100 \
+  --data_dir=$DATA_DIR \
+  --output_dir=$OUT_DIR \
+  --cloud_tpu \
+  --cloud_delete_on_done
+```
+
+For more information, see the [Tensor2Tensor
+documentation](https://github.com/tensorflow/tensor2tensor/tree/master/docs/cloud_tpu.md)
+on Cloud TPUs, or the [official Google Cloud Platform
+documentation](https://cloud.google.com/tpu/docs/tutorials/transformer) for
+Cloud TPUs.
diff --git a/setup.py b/setup.py
index b17656b5b..f02efdb2d 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
 setup(
     name='tensor2tensor',
-    version='1.5.1',
+    version='1.5.2',
     description='Tensor2Tensor',
     author='Google Inc.',
     author_email='no-reply@google.com',
@@ -45,8 +45,8 @@
         'six',
     ],
     extras_require={
-        'tensorflow': ['tensorflow>=1.5.0'],
-        'tensorflow_gpu': ['tensorflow-gpu>=1.5.0'],
+        'tensorflow': ['tensorflow>=1.4.1'],
+        'tensorflow_gpu': ['tensorflow-gpu>=1.4.1'],
         'tests': ['pytest', 'h5py', 'mock'],
     },
     classifiers=[
diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py
index af7cb5077..944ef016a 100644
--- a/tensor2tensor/bin/t2t_trainer.py
+++ b/tensor2tensor/bin/t2t_trainer.py
@@ -322,8 +322,8 @@ def main(argv):
   if FLAGS.generate_data:
     generate_data()
 
-  if hasattr(FLAGS, "job_dir") and FLAGS.job_dir:
-    FLAGS.output_dir = FLAGS.job_dir
+  if cloud_mlengine.job_dir():
+    FLAGS.output_dir = cloud_mlengine.job_dir()
 
   if argv:
     set_hparams_from_args(argv[1:])
diff --git a/tensor2tensor/data_generators/celeba.py b/tensor2tensor/data_generators/celeba.py
index cd27e5715..7fd3bddb5 100644
--- a/tensor2tensor/data_generators/celeba.py
+++ b/tensor2tensor/data_generators/celeba.py
@@ -61,7 +61,6 @@ def hparams(self, defaults, unused_model_hparams):
     p.input_modality = {"inputs": ("image:identity", 256)}
     p.target_modality = ("image:identity", 256)
     p.batch_size_multiplier = 256
-    p.max_expected_batch_size_per_shard = 4
     p.input_space_id = 1
     p.target_space_id = 1
 
@@ -168,3 +167,20 @@ def preprocess_example(self, example, unused_mode, unused_hparams):
     example["inputs"] = image_8
     example["targets"] = image_32
     return example
+
+
+@registry.register_problem
+class Img2imgCeleba64(Img2imgCeleba):
+  """8px to 64px problem."""
+
+  def preprocess_example(self, example, unused_mode, unused_hparams):
+    image = example["inputs"]
+    # Remove boundaries in CelebA images. Remove 40 pixels each side
+    # vertically and 20 pixels each side horizontally.
+ image = tf.image.crop_to_bounding_box(image, 40, 20, 218 - 80, 178 - 40) + image_8 = image_utils.resize_by_area(image, 8) + image_64 = image_utils.resize_by_area(image, 64) + + example["inputs"] = image_8 + example["targets"] = image_64 + return example diff --git a/tensor2tensor/data_generators/cifar.py b/tensor2tensor/data_generators/cifar.py index 881e52a7e..ac23a95b5 100644 --- a/tensor2tensor/data_generators/cifar.py +++ b/tensor2tensor/data_generators/cifar.py @@ -43,20 +43,26 @@ "data_batch_5" ] _CIFAR10_TEST_FILES = ["test_batch"] -_CIFAR10_IMAGE_SIZE = 32 +_CIFAR10_IMAGE_SIZE = _CIFAR100_IMAGE_SIZE = 32 +_CIFAR100_URL = "https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz" +_CIFAR100_PREFIX = "cifar-100-python/" +_CIFAR100_TRAIN_FILES = ["train"] +_CIFAR100_TEST_FILES = ["test"] -def _get_cifar10(directory): + +def _get_cifar(directory, url): """Download and extract CIFAR to directory unless it is there.""" - filename = os.path.basename(_CIFAR10_URL) - path = generator_utils.maybe_download(directory, filename, _CIFAR10_URL) + filename = os.path.basename(url) + path = generator_utils.maybe_download(directory, filename, url) tarfile.open(path, "r:gz").extractall(directory) -def cifar10_generator(tmp_dir, training, how_many, start_from=0): - """Image generator for CIFAR-10. +def cifar_generator(cifar_version, tmp_dir, training, how_many, start_from=0): + """Image generator for CIFAR-10 and 100. Args: + cifar_version: string; one of "cifar10" or "cifar100" tmp_dir: path to temporary storage directory. training: a Boolean; if true, we use the train set, otherwise the test set. how_many: how many images and labels to generate. @@ -65,21 +71,33 @@ def cifar10_generator(tmp_dir, training, how_many, start_from=0): Returns: An instance of image_generator that produces CIFAR-10 images and labels. 
""" - _get_cifar10(tmp_dir) - data_files = _CIFAR10_TRAIN_FILES if training else _CIFAR10_TEST_FILES + if cifar_version == "cifar10": + url = _CIFAR10_URL + train_files = _CIFAR10_TRAIN_FILES + test_files = _CIFAR10_TEST_FILES + prefix = _CIFAR10_PREFIX + image_size = _CIFAR10_IMAGE_SIZE + elif cifar_version == "cifar100": + url = _CIFAR100_URL + train_files = _CIFAR100_TRAIN_FILES + test_files = _CIFAR100_TEST_FILES + prefix = _CIFAR100_PREFIX + image_size = _CIFAR100_IMAGE_SIZE + + _get_cifar(tmp_dir, url) + data_files = train_files if training else test_files all_images, all_labels = [], [] for filename in data_files: - path = os.path.join(tmp_dir, _CIFAR10_PREFIX, filename) + path = os.path.join(tmp_dir, prefix, filename) with tf.gfile.Open(path, "r") as f: data = cPickle.load(f) images = data["data"] num_images = images.shape[0] - images = images.reshape((num_images, 3, _CIFAR10_IMAGE_SIZE, - _CIFAR10_IMAGE_SIZE)) + images = images.reshape((num_images, 3, image_size, image_size)) all_images.extend([ np.squeeze(images[j]).transpose((1, 2, 0)) for j in xrange(num_images) ]) - labels = data["labels"] + labels = data["labels" if cifar_version == "cifar10" else "fine_labels"] all_labels.extend([labels[j] for j in xrange(num_images)]) return image_utils.image_generator( all_images[start_from:start_from + how_many], @@ -112,9 +130,9 @@ def preprocess_example(self, example, mode, unused_hparams): def generator(self, data_dir, tmp_dir, is_training): if is_training: - return cifar10_generator(tmp_dir, True, 48000) + return cifar_generator("cifar10", tmp_dir, True, 48000) else: - return cifar10_generator(tmp_dir, True, 2000, 48000) + return cifar_generator("cifar10", tmp_dir, True, 2000, 48000) @registry.register_problem @@ -122,9 +140,9 @@ class ImageCifar10(ImageCifar10Tune): def generator(self, data_dir, tmp_dir, is_training): if is_training: - return cifar10_generator(tmp_dir, True, 50000) + return cifar_generator("cifar10", tmp_dir, True, 50000) else: - return cifar10_generator(tmp_dir, False, 10000) + return cifar_generator("cifar10", tmp_dir, False, 10000) @registry.register_problem @@ -181,6 +199,212 @@ def preprocess_example(self, example, unused_mode, unused_hparams): example["targets"] = image_utils.resize_by_area(inputs, 32) return example + def hparams(self, defaults, unused_model_hparams): + p = defaults + p.input_modality = {"inputs": ("image:identity", 256)} + p.target_modality = ("image:identity", 256) + p.batch_size_multiplier = 256 + p.input_space_id = 1 + p.target_space_id = 1 + + +@registry.register_problem +class ImageCifar100Tune(mnist.ImageMnistTune): + """Cifar-100 Tune.""" + + @property + def num_classes(self): + return 100 + + @property + def num_channels(self): + return 3 + + @property + def class_labels(self): + return [ + "beaver", + "dolphin", + "otter", + "seal", + "whale", + "aquarium fish", + "flatfish", + "ray", + "shark", + "trout", + "orchids", + "poppies", + "roses", + "sunflowers", + "tulips", + "bottles", + "bowls", + "cans", + "cups", + "plates", + "apples", + "mushrooms", + "oranges", + "pears", + "sweet peppers", + "clock", + "computer keyboard", + "lamp", + "telephone", + "television", + "bed", + "chair", + "couch", + "table", + "wardrobe", + "bee", + "beetle", + "butterfly", + "caterpillar", + "cockroach", + "bear", + "leopard", + "lion", + "tiger", + "wolf", + "bridge", + "castle", + "house", + "road", + "skyscraper", + "cloud", + "forest", + "mountain", + "plain", + "sea", + "camel", + "cattle", + "chimpanzee", + "elephant", + "kangaroo", + "fox", + 
"porcupine", + "possum", + "raccoon", + "skunk", + "crab", + "lobster", + "snail", + "spider", + "worm", + "baby", + "boy", + "girl", + "man", + "woman", + "crocodile", + "dinosaur", + "lizard", + "snake", + "turtle", + "hamster", + "mouse", + "rabbit", + "shrew", + "squirrel", + "maple", + "oak", + "palm", + "pine", + "willow", + "bicycle", + "bus", + "motorcycle", + "pickup truck", + "train", + "lawn-mower", + "rocket", + "streetcar", + "tank", + "tractor", + ] + + def preprocess_example(self, example, mode, unused_hparams): + image = example["inputs"] + image.set_shape([_CIFAR100_IMAGE_SIZE, _CIFAR100_IMAGE_SIZE, 3]) + if mode == tf.estimator.ModeKeys.TRAIN: + image = image_utils.cifar_image_augmentation(image) + image = tf.image.per_image_standardization(image) + example["inputs"] = image + return example + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar_generator("cifar100", tmp_dir, True, 48000) + else: + return cifar_generator("cifar100", tmp_dir, True, 2000, 48000) + + +@registry.register_problem +class ImageCifar100(ImageCifar100Tune): + + def generator(self, data_dir, tmp_dir, is_training): + if is_training: + return cifar_generator("cifar100", tmp_dir, True, 50000) + else: + return cifar_generator("cifar100", tmp_dir, False, 10000) + + +@registry.register_problem +class ImageCifar100Plain(ImageCifar100): + + def preprocess_example(self, example, mode, unused_hparams): + image = example["inputs"] + image.set_shape([_CIFAR100_IMAGE_SIZE, _CIFAR100_IMAGE_SIZE, 3]) + image = tf.image.per_image_standardization(image) + example["inputs"] = image + return example + + +@registry.register_problem +class ImageCifar100PlainGen(ImageCifar100Plain): + """CIFAR-100 32x32 for image generation without standardization preprep.""" + + def dataset_filename(self): + return "image_cifar100_plain" # Reuse CIFAR-100 plain data. + + def preprocess_example(self, example, mode, unused_hparams): + example["inputs"].set_shape([_CIFAR100_IMAGE_SIZE, _CIFAR100_IMAGE_SIZE, 3]) + example["inputs"] = tf.to_int64(example["inputs"]) + return example + + +@registry.register_problem +class ImageCifar100Plain8(ImageCifar100): + """CIFAR-100 rescaled to 8x8 for output: Conditional image generation.""" + + def dataset_filename(self): + return "image_cifar100_plain" # Reuse CIFAR-100 plain data. + + def preprocess_example(self, example, mode, unused_hparams): + image = example["inputs"] + image = image_utils.resize_by_area(image, 8) + image = tf.image.per_image_standardization(image) + example["inputs"] = image + return example + + +@registry.register_problem +class Img2imgCifar100(ImageCifar100): + """CIFAR-100 rescaled to 8x8 for input and 32x32 for output.""" + + def dataset_filename(self): + return "image_cifar100_plain" # Reuse CIFAR-100 plain data. + + def preprocess_example(self, example, unused_mode, unused_hparams): + + inputs = example["inputs"] + # For Img2Img resize input and output images as desired. 
+ example["inputs"] = image_utils.resize_by_area(inputs, 8) + example["targets"] = image_utils.resize_by_area(inputs, 32) + return example + def hparams(self, defaults, unused_model_hparams): p = defaults p.input_modality = {"inputs": ("image:identity", 256)} diff --git a/tensor2tensor/data_generators/cnn_dailymail.py b/tensor2tensor/data_generators/cnn_dailymail.py index 2771f4b0c..a09d22c3c 100644 --- a/tensor2tensor/data_generators/cnn_dailymail.py +++ b/tensor2tensor/data_generators/cnn_dailymail.py @@ -30,17 +30,16 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry import tensorflow as tf - # Links to data from http://cs.nyu.edu/~kcho/DMQA/ _CNN_STORIES_DRIVE_URL = "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ" _DAILYMAIL_STORIES_DRIVE_URL = "https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs" - # Note: using See et al. (2017) as reference for data generation # For more info, use the links below @@ -49,17 +48,17 @@ _DEV_URLS = "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_val.txt" _TEST_URLS = "https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt" - # End-of-sentence marker. EOS = text_encoder.EOS_ID - # Techniques for data prep from See et al. (2017) dm_single_close_quote = u"\u2019" # unicode dm_double_close_quote = u"\u201d" # Acceptable ways to end a sentence. -END_TOKENS = [u".", u"!", u"?", u"...", u"'", u"`", u"\"", - dm_single_close_quote, dm_double_close_quote, u")"] +END_TOKENS = [ + u".", u"!", u"?", u"...", u"'", u"`", u"\"", dm_single_close_quote, + dm_double_close_quote, u")" +] def _maybe_download_corpora(tmp_dir, is_training): @@ -93,17 +92,18 @@ def _maybe_download_corpora(tmp_dir, is_training): all_files = cnn_files + dailymail_files if is_training: - urls_path = generator_utils.maybe_download( - tmp_dir, "all_train.txt", _TRAIN_URLS) + urls_path = generator_utils.maybe_download(tmp_dir, "all_train.txt", + _TRAIN_URLS) else: - urls_path = generator_utils.maybe_download( - tmp_dir, "all_val.txt", _DEV_URLS) + urls_path = generator_utils.maybe_download(tmp_dir, "all_val.txt", + _DEV_URLS) return all_files, urls_path def example_splits(url_file, all_files): """Generate splits of the data.""" + def generate_hash(inp): """Generate a sha1 hash to match the raw url to the filename extracted.""" h = hashlib.sha1() @@ -132,6 +132,7 @@ def generate_hash(inp): def example_generator(all_files, urls_path, sum_token): """Generate examples.""" + def fix_run_on_sents(line): if u"@highlight" in line: return line @@ -175,81 +176,54 @@ def _story_summary_split(story): split_str = u" " split_str_len = len(split_str) split_pos = story.find(split_str) - return story[:split_pos], story[split_pos+split_str_len:] # story, summary + return story[:split_pos], story[split_pos + split_str_len:] # story, summary -def write_raw_text_to_files(all_files, urls_path, data_dir, tmp_dir, - is_training): +def write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training): """Write text to files.""" - def write_to_file(all_files, urls_path, data_dir, filename): - with io.open(os.path.join(data_dir, filename+".source"), "w") as fstory: - with io.open(os.path.join(data_dir, filename+".target"), "w") as fsummary: + + def write_to_file(all_files, urls_path, tmp_dir, filename): + 
with io.open(os.path.join(tmp_dir, filename + ".source"), "w") as fstory: + with io.open(os.path.join(tmp_dir, filename + ".target"), + "w") as fsummary: for example in example_generator(all_files, urls_path, sum_token=True): story, summary = _story_summary_split(example) - fstory.write(story+"\n") - fsummary.write(summary+"\n") + fstory.write(story + "\n") + fsummary.write(summary + "\n") filename = "cnndm.train" if is_training else "cnndm.dev" tf.logging.info("Writing %s" % filename) - write_to_file(all_files, urls_path, data_dir, filename) + write_to_file(all_files, urls_path, tmp_dir, filename) if not is_training: - test_urls_path = generator_utils.maybe_download( - tmp_dir, "all_test.txt", _TEST_URLS) + test_urls_path = generator_utils.maybe_download(tmp_dir, "all_test.txt", + _TEST_URLS) filename = "cnndm.test" tf.logging.info("Writing %s" % filename) - write_to_file(all_files, test_urls_path, data_dir, filename) + write_to_file(all_files, test_urls_path, tmp_dir, filename) @registry.register_problem -class SummarizeCnnDailymail32k(problem.Text2TextProblem): +class SummarizeCnnDailymail32k(text_problems.Text2TextProblem): """Summarize CNN and Daily Mail articles to their summary highlights.""" @property - def is_character_level(self): - return False - - @property - def has_inputs(self): - return True - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.EN_TOK + def vocab_filename(self): + return "vocab.cnndailymail.%d" % self.approx_vocab_size - @property - def num_shards(self): - return 100 - - @property - def vocab_name(self): - return "vocab.cnndailymail" + def generate_text_for_vocab(self, data_dir, tmp_dir): + del data_dir + all_files, urls_path = _maybe_download_corpora(tmp_dir, True) + return example_generator(all_files, urls_path, sum_token=False) - @property - def use_subword_tokenizer(self): + def is_generate_per_split(self): return True - @property - def targeted_vocab_size(self): - return 2**15 # 32768 - - @property - def use_train_shards_for_dev(self): - return False - - def generator(self, data_dir, tmp_dir, is_training): + def generate_samples(self, data_dir, tmp_dir, dataset_split): + del data_dir + is_training = dataset_split == problem.DatasetSplit.TRAIN all_files, urls_path = _maybe_download_corpora(tmp_dir, is_training) - encoder = generator_utils.get_or_generate_vocab_inner( - data_dir, self.vocab_file, self.targeted_vocab_size, - example_generator(all_files, urls_path, sum_token=False)) - write_raw_text_to_files(all_files, urls_path, data_dir, tmp_dir, - is_training) + write_raw_text_to_files(all_files, urls_path, tmp_dir, is_training) for example in example_generator(all_files, urls_path, sum_token=True): story, summary = _story_summary_split(example) - encoded_summary = encoder.encode(summary) + [EOS] - encoded_story = encoder.encode(story) + [EOS] - yield {"inputs": encoded_story, "targets": encoded_summary} + yield {"inputs": story, "targets": summary} diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index 845cecd70..1e72746fb 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -30,6 +30,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry import 
tensorflow as tf @@ -75,28 +76,18 @@ CodingPbInfo = collections.namedtuple("CodingPbInfo", "desc_file, code_files") -class Desc2CodeProblem(problem.Text2TextProblem): +class Desc2CodeProblem(text_problems.Text2TextProblem): """Base class for Description2Code problems.""" @property - def is_character_level(self): - return False - - @property - def num_shards(self): - return 10 - - @property - def use_subword_tokenizer(self): - return True - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return self.pb_constants.target_space + def dataset_splits(self): + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 10, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }] @property def input_vocab_size(self): @@ -138,7 +129,11 @@ def feature_encoders(self, data_dir): "targets": target_token, } - def generator(self, data_dir, tmp_dir, train): + def is_generate_per_split(self): + return True + + def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN # Called twice: for train and test # Get the list of the training samples (coding challenge samples) diff --git a/tensor2tensor/data_generators/fsns.py b/tensor2tensor/data_generators/fsns.py index 8a9cfced4..ed7baaec6 100644 --- a/tensor2tensor/data_generators/fsns.py +++ b/tensor2tensor/data_generators/fsns.py @@ -68,7 +68,6 @@ def hparams(self, defaults, unused_model_hparams): vocab_size = self._encoders["targets"].vocab_size p.target_modality = (registry.Modalities.SYMBOL, vocab_size) p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 2 p.input_space_id = problem.SpaceID.IMAGE p.target_space_id = problem.SpaceID.EN_TOK diff --git a/tensor2tensor/data_generators/image_utils.py b/tensor2tensor/data_generators/image_utils.py index 3eb571865..7c3d946e1 100644 --- a/tensor2tensor/data_generators/image_utils.py +++ b/tensor2tensor/data_generators/image_utils.py @@ -116,7 +116,6 @@ def hparams(self, defaults, unused_model_hparams): p.input_modality = {"inputs": (registry.Modalities.IMAGE, 256)} p.target_modality = (registry.Modalities.CLASS_LABEL, self.num_classes) p.batch_size_multiplier = 4 if self.is_small else 256 - p.max_expected_batch_size_per_shard = 8 if self.is_small else 2 p.loss_multiplier = 3.0 if self.is_small else 1.0 if self._was_reversed: p.loss_multiplier = 1.0 @@ -229,7 +228,6 @@ def hparams(self, defaults, unused_model_hparams): encoder = self._encoders["targets"] p.target_modality = (registry.Modalities.SYMBOL, encoder.vocab_size) p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 4 p.loss_multiplier = 1.0 p.input_space_id = problem.SpaceID.IMAGE p.target_space_id = self.target_space_id diff --git a/tensor2tensor/data_generators/imagenet.py b/tensor2tensor/data_generators/imagenet.py index 8fb6679ff..a071c820b 100644 --- a/tensor2tensor/data_generators/imagenet.py +++ b/tensor2tensor/data_generators/imagenet.py @@ -155,7 +155,6 @@ def hparams(self, defaults, unused_model_hparams): p.input_modality = {"inputs": ("image:identity", 256)} p.target_modality = ("image:identity", 256) p.batch_size_multiplier = 256 - p.max_expected_batch_size_per_shard = 4 p.input_space_id = 1 p.target_space_id = 1 diff --git a/tensor2tensor/data_generators/imdb.py b/tensor2tensor/data_generators/imdb.py index 61587da6e..d0f1e5cac 100644 --- a/tensor2tensor/data_generators/imdb.py +++ b/tensor2tensor/data_generators/imdb.py @@ -26,36 +26,47 @@ from 
tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem -from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry import tensorflow as tf -# End-of-sentence marker. -EOS = text_encoder.EOS_ID - @registry.register_problem -class SentimentIMDB(problem.Problem): +class SentimentIMDB(text_problems.Text2ClassProblem): """IMDB sentiment classification.""" URL = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz" @property - def num_shards(self): - return 10 + def is_generate_per_split(self): + return True @property - def vocab_file(self): - return "sentiment_imdb.vocab" + def dataset_splits(self): + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 10, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }] @property - def batch_size_means_tokens(self): - return True + def vocab_filename(self): + return "sentiment_imdb.vocab.%d" % self.approx_vocab_size @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**13 # 8k vocab suffices for this small dataset. + @property + def num_classes(self): + return 2 + + def class_labels(self, data_dir): + del data_dir + return ["neg", "pos"] + def doc_generator(self, imdb_dir, dataset, include_label=False): dirs = [(os.path.join(imdb_dir, dataset, "pos"), True), (os.path.join( imdb_dir, dataset, "neg"), False)] @@ -69,7 +80,7 @@ def doc_generator(self, imdb_dir, dataset, include_label=False): else: yield doc - def generator(self, data_dir, tmp_dir, train): + def generate_samples(self, data_dir, tmp_dir, dataset_split): """Generate examples.""" # Download and extract compressed_filename = os.path.basename(self.URL) @@ -80,49 +91,11 @@ def generator(self, data_dir, tmp_dir, train): with tarfile.open(download_path, "r:gz") as tar: tar.extractall(tmp_dir) - # Generate vocab - encoder = generator_utils.get_or_generate_vocab_inner( - data_dir, self.vocab_file, self.targeted_vocab_size, - self.doc_generator(imdb_dir, "train")) - # Generate examples + train = dataset_split == problem.DatasetSplit.TRAIN dataset = "train" if train else "test" for doc, label in self.doc_generator(imdb_dir, dataset, include_label=True): yield { - "inputs": encoder.encode(doc) + [EOS], - "targets": [int(label)], + "inputs": doc, + "label": int(label), } - - def generate_data(self, data_dir, tmp_dir, task_id=-1): - train_paths = self.training_filepaths( - data_dir, self.num_shards, shuffled=False) - dev_paths = self.dev_filepaths(data_dir, 1, shuffled=False) - generator_utils.generate_dataset_and_shuffle( - self.generator(data_dir, tmp_dir, True), train_paths, - self.generator(data_dir, tmp_dir, False), dev_paths) - - def hparams(self, defaults, unused_model_hparams): - p = defaults - source_vocab_size = self._encoders["inputs"].vocab_size - p.input_modality = { - "inputs": (registry.Modalities.SYMBOL, source_vocab_size) - } - p.target_modality = (registry.Modalities.CLASS_LABEL, 2) - p.input_space_id = problem.SpaceID.EN_TOK - p.target_space_id = problem.SpaceID.GENERIC - - def feature_encoders(self, data_dir): - vocab_filename = os.path.join(data_dir, self.vocab_file) - encoder = text_encoder.SubwordTextEncoder(vocab_filename) - return { - "inputs": encoder, - "targets": text_encoder.ClassLabelEncoder(["neg", "pos"]), - } - - def example_reading_spec(self): - data_fields = { - "inputs": tf.VarLenFeature(tf.int64), - "targets": tf.FixedLenFeature([1], tf.int64), - } - 
data_items_to_decoders = None - return (data_fields, data_items_to_decoders) diff --git a/tensor2tensor/data_generators/librispeech.py b/tensor2tensor/data_generators/librispeech.py index d14cc8868..ab8376847 100644 --- a/tensor2tensor/data_generators/librispeech.py +++ b/tensor2tensor/data_generators/librispeech.py @@ -185,3 +185,10 @@ def add_librispeech_hparams(hparams): hparams.train_steps = 5000000 hparams.num_hidden_layers = 4 return hparams + + +def set_librispeech_length_hparams(hparams): + hparams.max_length = 1650 * 80 # this limits inputs[1] * inputs[2] + hparams.max_input_seq_length = 1650 + hparams.max_target_seq_length = 350 + return hparams diff --git a/tensor2tensor/data_generators/lm1b.py b/tensor2tensor/data_generators/lm1b.py index b08ec9e3f..fc4d0347e 100644 --- a/tensor2tensor/data_generators/lm1b.py +++ b/tensor2tensor/data_generators/lm1b.py @@ -31,15 +31,12 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import tokenizer from tensor2tensor.utils import registry import tensorflow as tf -# End-of-sentence marker (should correspond to the position of EOS in the -# RESERVED_TOKENS list in text_encoder.py) -EOS = 1 - def _original_vocab(tmp_dir): """Returns a set containing the original vocabulary. @@ -89,7 +86,7 @@ def _train_data_filenames(tmp_dir): ] -def _dev_data_filename(tmp_dir): +def _dev_data_filenames(tmp_dir): return os.path.join(tmp_dir, "1-billion-word-language-modeling-benchmark-r13output", "heldout-monolingual.tokenized.shuffled", @@ -112,9 +109,7 @@ def _maybe_download_corpus(tmp_dir): corpus_tar.extractall(tmp_dir) -def _get_or_build_subword_text_encoder(tmp_dir, - vocab_filepath, - target_size): +def _get_or_build_subword_text_encoder(tmp_dir, vocab_filepath, target_size): """Builds a SubwordTextEncoder based on the corpus. Args: @@ -140,7 +135,7 @@ def _get_or_build_subword_text_encoder(tmp_dir, line_count += 1 if line_count >= max_lines: break - if target_size == 2 ** 15: + if target_size == 2**15: # legacy behavior ret = text_encoder.SubwordTextEncoder() ret.build_from_token_counts(token_counts, min_count=5) @@ -152,75 +147,34 @@ def _get_or_build_subword_text_encoder(tmp_dir, @registry.register_problem -class LanguagemodelLm1b32k(problem.Text2TextProblem): +class LanguagemodelLm1b32k(text_problems.Text2TextProblem): """A language model on the 1B words corpus.""" - @property - def is_character_level(self): - return False - - @property - def has_inputs(self): - return False - - @property - def input_space_id(self): - # Ratio of dev tokens (including eos) to dev words (including eos) - # 176884 / 159658 = 1.107893; multiply ppx by this to compare results. - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def num_shards(self): - return 100 - @property def vocab_name(self): - return "vocab.lm1b.en" + return "vocab.lm1b.en.%d" % self.approx_vocab_size @property - def use_subword_tokenizer(self): - return True - - @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**15 # 32768 - @property - def use_train_shards_for_dev(self): - return False - - def generator(self, data_dir, tmp_dir, is_training): - """Generator for lm1b sentences. - - Args: - data_dir: data dir. - tmp_dir: tmp dir. - is_training: a boolean. 
+ def is_generate_per_split(self): + return True - Yields: - A dictionary {"inputs": [0], "targets": []} - """ + def generate_samples(self, data_dir, tmp_dir, dataset_split): + del data_dir + split_files = { + problem.DatasetSplit.TRAIN: _train_data_filenames(tmp_dir), + problem.DatasetSplit.EVAL: _dev_data_filenames(tmp_dir), + } _maybe_download_corpus(tmp_dir) original_vocab = _original_vocab(tmp_dir) - files = (_train_data_filenames(tmp_dir) - if is_training else [_dev_data_filename(tmp_dir)]) - if self.is_character_level: - encoder = text_encoder.ByteTextEncoder() - else: - vocab_filepath = os.path.join(data_dir, self.vocab_file) - encoder = _get_or_build_subword_text_encoder( - tmp_dir, vocab_filepath, self.targeted_vocab_size) + files = split_files[dataset_split] for filepath in files: tf.logging.info("filepath = %s", filepath) for line in tf.gfile.Open(filepath): - tokens = encoder.encode( - _replace_oov(original_vocab, text_encoder.native_to_unicode(line))) - tokens.append(EOS) - yield {"inputs": [0], "targets": tokens} + txt = _replace_oov(original_vocab, text_encoder.native_to_unicode(line)) + yield {"targets": txt} @registry.register_problem @@ -237,7 +191,7 @@ class LanguagemodelLm1b8kPacked(LanguagemodelLm1b32k): """ @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**13 # 8192 @property @@ -250,5 +204,5 @@ class LanguagemodelLm1bCharacters(LanguagemodelLm1b32k): """A language model on the 1B words corpus, character level.""" @property - def is_character_level(self): - return True + def vocab_type(self): + return text_problems.VocabType.CHARACTER diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index c5b253c3c..ebcc0697d 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -26,11 +26,16 @@ from tensor2tensor.data_generators import text_encoder from tensor2tensor.utils import data_reader from tensor2tensor.utils import metrics -from tensor2tensor.utils import registry import tensorflow as tf +class DatasetSplit(object): + TRAIN = tf.estimator.ModeKeys.TRAIN + EVAL = tf.estimator.ModeKeys.EVAL + TEST = "test" + + class SpaceID(object): """Input and target space ids. Add more as needed.""" # Generic / unknown output space (default) @@ -343,14 +348,14 @@ def filepattern(self, data_dir, mode, shard=None): """Get filepattern for data files for mode. Matches mode to a suffix. - * TRAIN: train - * EVAL: dev - * PREDICT: dev - * test: test + * DatasetSplit.TRAIN: train + * DatasetSplit.EVAL: dev + * DatasetSplit.TEST: test + * tf.estimator.ModeKeys.PREDICT: dev Args: data_dir: str, data directory. - mode: tf.estimator.ModeKeys or "test". + mode: DatasetSplit shard: int, if provided, will only read data from the specified shard. 
Returns: @@ -358,12 +363,12 @@ def filepattern(self, data_dir, mode, shard=None): """ path = os.path.join(data_dir, self.dataset_filename()) shard_str = "-%05d" % shard if shard is not None else "" - if mode == tf.estimator.ModeKeys.TRAIN: + if mode == DatasetSplit.TRAIN: suffix = "train" - elif mode in [tf.estimator.ModeKeys.EVAL, tf.estimator.ModeKeys.PREDICT]: + elif mode in [DatasetSplit.EVAL, tf.estimator.ModeKeys.PREDICT]: suffix = "dev" else: - assert mode == "test" + assert mode == DatasetSplit.TEST suffix = "test" return "%s-%s%s*" % (path, suffix, shard_str) @@ -432,7 +437,6 @@ def dataset(self, num_threads=None, output_buffer_size=None, shuffle_files=None, - repeat=None, hparams=None, preprocess=True, dataset_split=None, @@ -449,14 +453,12 @@ def dataset(self, output_buffer_size: int, how many elements to prefetch at end of pipeline. shuffle_files: whether to shuffle input files. Default behavior (i.e. when shuffle_files=None) is to shuffle if mode == TRAIN. - repeat: whether to repeat the Dataset. Default behavior is to repeat if - mode == TRAIN. hparams: tf.contrib.training.HParams; hparams to be passed to Problem.preprocess_example and Problem.hparams. If None, will use a default set that is a no-op. preprocess: bool, whether to map the Dataset through Problem.preprocess_example. - dataset_split: tf.estimator.ModeKeys + ["test"], which split to read data + dataset_split: DatasetSplit, which split to read data from (TRAIN:"-train", EVAL:"-dev", "test":"-test"). Defaults to mode. shard: int, if provided, will only read data from the specified shard. partition_id: integer - which partition of the dataset to read from @@ -469,7 +471,6 @@ def dataset(self, ValueError: if num_partitions is greater than the number of data files. """ is_training = mode == tf.estimator.ModeKeys.TRAIN - repeat = repeat or repeat is None and is_training shuffle_files = shuffle_files or shuffle_files is None and is_training dataset_split = dataset_split or mode @@ -526,8 +527,6 @@ def _maybe_reverse_and_copy(example): dataset = dataset.interleave(_load_records, cycle_length=8, block_length=16) - if repeat: - dataset = dataset.repeat() dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) if preprocess: if hasattr(tf.contrib.data, "parallel_interleave"): @@ -717,10 +716,15 @@ def define_shapes(example): }) dataset = self.dataset(**dataset_kwargs) + if is_training: + # Repeat and skip a random number of records + dataset = dataset.repeat() + data_files = tf.contrib.slim.parallel_reader.get_data_files( + self.filepattern(data_dir, mode)) + dataset = skip_random_fraction(dataset, data_files[0]) + dataset = dataset.map( data_reader.cast_int64_to_int32, num_parallel_calls=num_threads) - if is_training: - dataset = dataset.repeat(None) if self.batch_size_means_tokens: batch_size_means_tokens = True @@ -750,10 +754,8 @@ def define_shapes(example): else: # batch_size means tokens per datashard if config and config.use_tpu: - # On TPU, pad to max_length dataset = dataset.filter(tpu_valid_size) - padded_shapes = _fill_shape_nones( - dataset.output_shapes, none_filler=max_length) + padded_shapes = self._pad_for_tpu(dataset.output_shapes, hparams) # on TPU, we use params["batch_size"], which specifies the number of # examples across all datashards batch_size = params["batch_size"] @@ -825,6 +827,33 @@ def serving_input_fn(self, hparams): return tf.estimator.export.ServingInputReceiver( features=features, receiver_tensors=serialized_example) + def _pad_for_tpu(self, shapes_dict, hparams): + 
"""Pads unknown features' dimensions for TPU.""" + max_length = self.max_length(hparams) + padded_shapes = {} + + def get_filler(specified_max_length): + if not specified_max_length: + return max_length + return min(specified_max_length, max_length) + + inputs_none_filler = get_filler(hparams.max_input_seq_length) + targets_none_filler = get_filler(hparams.max_target_seq_length) + + def pad_one_shape(shape, none_filler): + return [ + (dim if dim is not None else none_filler) for dim in shape.as_list() + ] + + for key, shape in six.iteritems(shapes_dict): + if key == "inputs": + padded_shapes[key] = pad_one_shape(shape, inputs_none_filler) + elif key == "targets": + padded_shapes[key] = pad_one_shape(shape, targets_none_filler) + else: + padded_shapes[key] = pad_one_shape(shape, max_length) + return padded_shapes + class FeatureInfo(object): @@ -895,12 +924,6 @@ def _default_hparams(): # be used to judge the sequence length. batch_size_multiplier=1, - # To make queues of the right capacity, it's good to know the maximal - # expected batch size, as it can vary a lot. It only affects performance - # of input readers and memory use. The defaults should be safe and fast, - # but decrease if your reader uses a lot of memory and increase if slow. - max_expected_batch_size_per_shard=64, - # During inference for autoregressive problems, if the batch_size is 1, # the inference will stop when the model predict a text_encoder.EOS_ID # token. @@ -926,438 +949,6 @@ def _default_hparams(): target_space_id=SpaceID.GENERIC) -class Text2TextProblem(Problem): - """Base class for text-to-text problems.""" - - @property - def is_character_level(self): - """Whether the inputs and targets are sequences of characters.""" - raise NotImplementedError() - - @property - def targeted_vocab_size(self): - raise NotImplementedError() # Not needed if self.is_character_level. - - @property - def batch_size_means_tokens(self): - return True - - def generator(self, data_dir, tmp_dir, is_training): - """Generator for the training and evaluation data. - - Args: - data_dir: The directory in which to assets, e.g. the vocab file. - tmp_dir: A scratch directory (if needed). - is_training: A boolean indicating if we should generate training data - (True) or dev set data (False). - - Yields: - dicts with keys "inputs" and "targets", with values being lists of token - ids. - """ - raise NotImplementedError() - - @property - def packed_length(self): - """Pack multiple examples into a single example of constant length. - - This is useful for TPU training. See generator_utils.pack_examples(). 
- - Returns: - an optional integer - """ - return None - - def max_length(self, model_hparams): - """Maximum sequence length.""" - if self.packed_length: - return self.packed_length - return super(Text2TextProblem, self).max_length(model_hparams) - - @property - def use_train_shards_for_dev(self): - """If true, we only generate training data and hold out shards for dev.""" - return False - - @property - def input_space_id(self): - raise NotImplementedError() - - @property - def target_space_id(self): - raise NotImplementedError() - - @property - def num_shards(self): - raise NotImplementedError() - - @property - def num_dev_shards(self): - return 1 - - @property - def vocab_name(self): - raise NotImplementedError() - - @property - def vocab_file(self): - return "%s.%d" % (self.vocab_name, self.targeted_vocab_size) - - @property - def use_subword_tokenizer(self): - raise NotImplementedError() - - @property - def has_inputs(self): - return True # Set to False for language models. - - def _maybe_pack_examples(self, generator): - """Helper to generate_data().""" - if self.packed_length: - return generator_utils.pack_examples( - generator, - self.has_inputs, - self.packed_length, - chop_long_sequences=not self.has_inputs) - else: - return generator - - def generate_data(self, data_dir, tmp_dir, task_id=-1): - train_paths = self.training_filepaths( - data_dir, self.num_shards, shuffled=False) - dev_paths = self.dev_filepaths( - data_dir, self.num_dev_shards, shuffled=False) - if self.use_train_shards_for_dev: - all_paths = train_paths + dev_paths - generator_utils.generate_files( - self._maybe_pack_examples(self.generator(data_dir, tmp_dir, True)), - all_paths) - generator_utils.shuffle_dataset(all_paths) - else: - generator_utils.generate_dataset_and_shuffle( - self._maybe_pack_examples(self.generator(data_dir, tmp_dir, - True)), train_paths, - self._maybe_pack_examples(self.generator(data_dir, tmp_dir, False)), - dev_paths) - - def feature_encoders(self, data_dir): - if self.is_character_level: - encoder = text_encoder.ByteTextEncoder() - elif self.use_subword_tokenizer: - vocab_filename = os.path.join(data_dir, self.vocab_file) - encoder = text_encoder.SubwordTextEncoder(vocab_filename) - else: - vocab_filename = os.path.join(data_dir, self.vocab_file) - encoder = text_encoder.TokenTextEncoder(vocab_filename) - if self.has_inputs: - return {"inputs": encoder, "targets": encoder} - return {"targets": encoder} - - def hparams(self, defaults, unused_model_hparams): - p = defaults - p.stop_at_eos = int(True) - - if self.has_inputs: - source_vocab_size = self._encoders["inputs"].vocab_size - p.input_modality = { - "inputs": (registry.Modalities.SYMBOL, source_vocab_size) - } - target_vocab_size = self._encoders["targets"].vocab_size - p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) - if self.has_inputs: - p.input_space_id = self.input_space_id - p.target_space_id = self.target_space_id - if self.is_character_level: - p.loss_multiplier = 2.0 - if self.packed_length: - identity = (registry.Modalities.GENERIC, None) - if self.has_inputs: - p.input_modality["inputs_segmentation"] = identity - p.input_modality["inputs_position"] = identity - p.input_modality["targets_segmentation"] = identity - p.input_modality["targets_position"] = identity - - def example_reading_spec(self): - data_fields = {"targets": tf.VarLenFeature(tf.int64)} - if self.has_inputs: - data_fields["inputs"] = tf.VarLenFeature(tf.int64) - - if self.packed_length: - if self.has_inputs: - 
data_fields["inputs_segmentation"] = tf.VarLenFeature(tf.int64) - data_fields["inputs_position"] = tf.VarLenFeature(tf.int64) - data_fields["targets_segmentation"] = tf.VarLenFeature(tf.int64) - data_fields["targets_position"] = tf.VarLenFeature(tf.int64) - - data_items_to_decoders = None - return (data_fields, data_items_to_decoders) - - def eval_metrics(self): - return [ - metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, - metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU, metrics.Metrics.ROUGE_2_F, - metrics.Metrics.ROUGE_L_F - ] - - -class ChoppedTextProblem(Text2TextProblem): - """Tokenize and chop text files into fixed-length language-modeling examples. - - The input data is a set of text files, as specified by - self.train_text_filepaths() and self.dev_text_filepaths(). - - The text is tokenized using a SubwordTextEncoder, and - then split into examples, each of length self.sequence_length(). - """ - - def train_text_filepaths(self, tmp_dir): - """Local filepaths of text files containing training data. - - This function may want to download the files if they do not exist. - - Args: - tmp_dir: a string - Returns: - a list of strings. - """ - raise NotImplementedError() - - def dev_text_filepaths(self, tmp_dir): - """Local filepaths of text files containing dev data. - - This function may want to download the files if they do not exist. - - Args: - tmp_dir: a string - Returns: - a list of strings. - """ - raise NotImplementedError() - - @property - def sequence_length(self): - """Length of each example (in tokens).""" - raise NotImplementedError() - - def max_length(self, model_hparams): - return model_hparams.split_to_length or self.sequence_length - - @property - def is_character_level(self): - return False - - def text_filepaths_for_task(self, tmp_dir, task_id): - """List of input filepaths for a particular training or dev shard. - - Args: - tmp_dir: a string - task_id: an integer less than self.num_shards - Returns: - a list of tuples (filepath, start_pos, num_bytes) - """ - assert task_id >= 0 - assert task_id < self.num_train_shards + self.num_dev_shards - if task_id < self.num_train_shards: - return [ - f for i, f in enumerate(self.train_text_filepaths(tmp_dir)) - if i % self.num_train_shards == task_id - ] - else: - return [ - f for i, f in enumerate(self.dev_text_filepaths(tmp_dir)) - if i % self.num_dev_shards == task_id - self.num_train_shards - ] - - def filepath_to_unicode_strings(self, filepath): - """Read text out of an input file. - - The default just reads the text, converts to unicode and yields one - unicode string. - - Subclasses can override this function in order to preprocess, and can - yield any number of strings. - - Args: - filepath: a string - Yields: - unicode strings. - """ - f = tf.gfile.Open(filepath) - b = f.read() - yield to_unicode_ignore_erros(b) - - def file_generator(self, - filepaths, - max_chars_per_file=None, - max_chars_total=None): - """Read complete text of input files and yield unicode strings. - - By default, one unicode string is produced per file, but this is - not guaranteed, since subclasses can override - filepath_to_unicode_strings(). - - max_chars_per_file and max_chars_total can also be specified, in which - case some strings may be truncated or dropped to limit the total - amount of output. 
- - Args: - filepaths: a list of strings - max_chars_per_file: an optional integer - max_chars_total: an optional integer - Yields: - unicode strings - """ - chars_total = 0 - for fname in filepaths: - chars_this_file = 0 - tf.logging.info("reading file %s" % fname) - for text in self.filepath_to_unicode_strings(fname): - if (max_chars_per_file and - chars_this_file + len(text) > max_chars_per_file): - text = text[:max_chars_per_file - chars_this_file] - if max_chars_total and chars_total + len(text) > max_chars_total: - text = text[:max_chars_total - chars_total] - chars_total += len(text) - chars_this_file += len(text) - if text: - yield text - if max_chars_total and chars_total >= max_chars_total: - return - if max_chars_per_file and chars_this_file >= max_chars_per_file: - break - - def example_generator(self, encoder, tmp_dir, task_id): - """Generator for examples. - - Args: - encoder: a TextEncoder - tmp_dir: a string - task_id: an integer - Yields: - feature dictionaries - """ - filepaths = self.text_filepaths_for_task(tmp_dir, task_id) - if task_id >= self.num_train_shards: - # this is dev data - limit the total length. - max_chars_per_file = self.max_dev_chars // ( - self.num_dev_shards * len(filepaths)) - else: - max_chars_per_file = None - tokens = [] - for ftext in self.file_generator( - filepaths, max_chars_per_file=max_chars_per_file): - tokens.extend(encoder.encode(ftext)) - pos = 0 - while pos + self.sequence_length <= len(tokens): - yield {"inputs": [0], "targets": tokens[pos:pos + self.sequence_length]} - pos += self.sequence_length - if pos > 0: - tokens = tokens[pos:] - if self.remainder_policy == "pad": - if tokens: - targets = tokens + [0] * (self.sequence_length - len(tokens)) - yield {"inputs": [0], "targets": targets} - else: - assert self.remainder_policy == "drop" - - @property - def remainder_policy(self): - """What to do with leftover tokens. - - Returns: - a string - either "pad" or "drop". - """ - return "pad" - - def prepare_to_generate(self, data_dir, tmp_dir): - """Make sure that the data is prepared and the vocab is generated.""" - self.get_or_generate_vocab(data_dir, tmp_dir) - self.train_text_filepaths(tmp_dir) - self.dev_text_filepaths(tmp_dir) - - def get_or_generate_vocab(self, data_dir, tmp_dir): - return generator_utils.get_or_generate_vocab_inner( - data_dir, self.vocab_file, self.targeted_vocab_size, - self.file_generator( - self.train_text_filepaths(tmp_dir), - max_chars_total=self.max_chars_for_vocab)) - - def generate_data(self, data_dir, tmp_dir, task_id=-1): - """Generates training/dev data. - - Args: - data_dir: a string - tmp_dir: a string - task_id: an optional integer - Returns: - shard or shards for which data was generated. 
- """ - tf.logging.info("generate_data task_id=%s" % task_id) - encoder = self.get_or_generate_vocab(data_dir, tmp_dir) - assert task_id >= 0 and task_id < self.num_generate_tasks - if task_id < self.num_train_shards: - out_file = self.training_filepaths( - data_dir, self.num_train_shards, shuffled=False)[task_id] - else: - out_file = self.dev_filepaths( - data_dir, self.num_dev_shards, - shuffled=False)[task_id - self.num_train_shards] - generator_utils.generate_files( - self.example_generator(encoder, tmp_dir, task_id), [out_file]) - generator_utils.shuffle_dataset([out_file]) - - @property - def max_chars_for_vocab(self): - """Number of characters of training data to use for generating vocab.""" - return 10**7 - - @property - def target_space_id(self): - return SpaceID.EN_TOK - - @property - def num_train_shards(self): - return 100 - - @property - def num_dev_shards(self): - return 1 - - @property - def max_dev_chars(self): - """Limit dev set to at most this many characters (default 10M).""" - return 10**7 - - @property - def multiprocess_generate(self): - return True - - @property - def num_generate_tasks(self): - return self.num_train_shards + self.num_dev_shards - - @property - def vocab_name(self): - raise NotImplementedError() - - @property - def use_subword_tokenizer(self): - return True - - @property - def has_inputs(self): - return False - - def eval_metrics(self): - return [metrics.Metrics.ACC, metrics.Metrics.NEG_LOG_PERPLEXITY] - - -def to_unicode_ignore_erros(s): - return (unicode(s, "utf-8", errors="ignore") - if six.PY2 else s.decode("utf-8", "ignore")) - - def _are_shapes_fully_defined(shapes_dict): for shape in shapes_dict.values(): if not shape.is_fully_defined(): @@ -1365,15 +956,6 @@ def _are_shapes_fully_defined(shapes_dict): return True -def _fill_shape_nones(shapes_dict, none_filler=None): - padded_shapes = {} - for key, shape in six.iteritems(shapes_dict): - padded_shapes[key] = [ - (dim if dim is not None else none_filler) for dim in shape.as_list() - ] - return padded_shapes - - def _summarize_features(features, num_shards=1): with tf.name_scope("input_stats"): for (k, v) in six.iteritems(features): diff --git a/tensor2tensor/data_generators/ptb.py b/tensor2tensor/data_generators/ptb.py index 19f20c90a..15c72e9a7 100644 --- a/tensor2tensor/data_generators/ptb.py +++ b/tensor2tensor/data_generators/ptb.py @@ -29,6 +29,7 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry import tensorflow as tf @@ -76,36 +77,29 @@ def _get_token_encoder(vocab_dir, vocab_name, filename): return text_encoder.TokenTextEncoder(vocab_path) -class PTBProblem(problem.Text2TextProblem): - """A class for generating PTB data.""" - - @property - def has_inputs(self): - return False - - @property - def target_space_id(self): - if self.is_character_level: - return problem.SpaceID.EN_CHR - return problem.SpaceID.EN_TOK - - @property - def num_shards(self): - return 10 +@registry.register_problem +class LanguagemodelPtb10k(text_problems.Text2SelfProblem): + """PTB, 10k vocab.""" @property - def vocab_name(self): - return "vocab.lmptb_10k" + def dataset_splits(self): + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 10, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }] @property - def use_subword_tokenizer(self): - return False + def vocab_filename(self): + 
return "vocab.lmptb.10000" @property - def targeted_vocab_size(self): - return 10000 + def vocab_type(self): + return text_problems.VocabType.TOKEN - def generator(self, data_dir, tmp_dir, train): + def generate_samples(self, data_dir, tmp_dir, dataset_split): filename = os.path.basename(PTB_URL) compressed_filepath = generator_utils.maybe_download( tmp_dir, filename, PTB_URL) @@ -124,7 +118,7 @@ def generator(self, data_dir, tmp_dir, train): tgz.extractall(tmp_dir, members=files) - if self.is_character_level: + if self.vocab_type == text_problems.VocabType.CHARACTER: files = ptb_char_files else: files = ptb_files @@ -139,37 +133,21 @@ def generator(self, data_dir, tmp_dir, train): assert train_file, "Training file not found" assert valid_file, "Validation file not found" - if self.is_character_level: - encoder = text_encoder.ByteTextEncoder() - else: - encoder = _get_token_encoder(data_dir, self.vocab_file, train_file) + _get_token_encoder(data_dir, self.vocab_filename, train_file) - if train: - return self._generator(train_file, encoder) - return self._generator(valid_file, encoder) + train = dataset_split == problem.DatasetSplit.TRAIN + filepath = train_file if train else valid_file - def _generator(self, filename, encoder): - with tf.gfile.GFile(filename, "r") as f: + with tf.gfile.GFile(filepath, "r") as f: for line in f: line = " ".join(line.replace("\n", " %s " % EOS).split()) - tok = encoder.encode(line) - if tok: - yield {"inputs": [0], "targets": tok} - - -@registry.register_problem -class LanguagemodelPtb10k(PTBProblem): - """A class for generating PTB data, 10k vocab.""" - - @property - def is_character_level(self): - return False + yield {"targets": line} @registry.register_problem -class LanguagemodelPtbCharacters(PTBProblem): - """A class for generating PTB data, character-level.""" +class LanguagemodelPtbCharacters(LanguagemodelPtb10k): + """PTB, character-level.""" @property - def is_character_level(self): - return True + def vocab_type(self): + return text_problems.VocabType.CHARACTER diff --git a/tensor2tensor/data_generators/speech_recognition.py b/tensor2tensor/data_generators/speech_recognition.py index 6d37565b8..25cea7cc5 100644 --- a/tensor2tensor/data_generators/speech_recognition.py +++ b/tensor2tensor/data_generators/speech_recognition.py @@ -239,7 +239,6 @@ class SpeechRecognitionProblem(problem.Problem): def hparams(self, defaults, model_hparams): p = model_hparams - # Filterbank extraction # Filterbank extraction in bottom instead of preprocess_example is faster. p.add_hparam("audio_preproc_in_bottom", False) # The trainer seems to reserve memory for all members of the input dict @@ -312,10 +311,18 @@ def preprocess_example(self, example, mode, hparams): mel_fbanks = add_delta_deltas(mel_fbanks) fbank_size = common_layers.shape_list(mel_fbanks) assert fbank_size[0] == 1 + + # This replaces CMVN estimation on data + + mean = tf.reduce_mean(mel_fbanks, keepdims=True, axis=1) + variance = tf.reduce_mean((mel_fbanks-mean)**2, keepdims=True, axis=1) + mel_fbanks = (mel_fbanks - mean) / variance + # Later models like to flatten the two spatial dims. Instead, we add a # unit spatial dim and flatten the frequencies and channels. 
example["inputs"] = tf.reshape( - mel_fbanks, [fbank_size[1], 1, fbank_size[2] * fbank_size[3]]) + mel_fbanks, [fbank_size[1], fbank_size[2], fbank_size[3]]) + if not p.audio_keep_example_waveforms: del example["waveforms"] return super(SpeechRecognitionProblem, self @@ -364,36 +371,31 @@ def bottom(self, inputs): mel_fbanks = add_delta_deltas(mel_fbanks) x = tf.reshape(mel_fbanks, common_layers.shape_list(mel_fbanks)[:2] + - [1, num_mel_bins * num_channels]) + [num_mel_bins, num_channels]) + + nonpadding_mask = 1. - common_attention.embedding_to_padding(x) + num_of_nonpadding_elements = tf.reduce_sum( + nonpadding_mask) * num_mel_bins * num_channels + + # This replaces CMVN estimation on data + mean = tf.reduce_sum( + x, axis=[1], keepdims=True) / num_of_nonpadding_elements + variance = (num_of_nonpadding_elements * mean**2. - + 2. * mean * tf.reduce_sum(x, axis=[1], keepdims=True) + + tf.reduce_sum(x**2, axis=[1], keepdims=True) + ) / num_of_nonpadding_elements + x = (x - mean) / variance * tf.expand_dims(nonpadding_mask, -1) else: x = inputs # The convention is that the models are flattened along the spatial, # dimensions, thus the speech preprocessor treats frequencies and # channels as image colors (last axis) - x.set_shape([None, None, 1, num_mel_bins * num_channels]) - - xshape = common_layers.shape_list(x) - - nonpadding_mask = 1. - common_attention.embedding_to_padding(x) - num_of_nonpadding_elements = tf.reduce_sum( - nonpadding_mask) * num_mel_bins * num_channels - - # This replaces CMVN estimation on data - mean = tf.reduce_sum( - x, axis=[1, 2], keepdims=True) / num_of_nonpadding_elements - variance = (num_of_nonpadding_elements * mean**2. - - 2. * mean * tf.reduce_sum(x, axis=[1, 2], keepdims=True) + - tf.reduce_sum(x**2, axis=[1, 2], keepdims=True) - ) / num_of_nonpadding_elements - x = (x - mean) / variance * tf.expand_dims(nonpadding_mask, -1) - - # restore batch_size x time x frequency x channel layout - x = tf.reshape(x, [xshape[0], xshape[1], num_mel_bins, num_channels]) + x.set_shape([None, None, num_mel_bins, num_channels]) # TODO(chorowski): how to specify bottom's hparams and avoid hardcoding? + x = tf.pad(x, [[0, 0], [0, 8], [0, 0], [0, 0]]) for _ in range(2): - x = tf.pad(x, [[0, 0], [0, 2], [0, 0], [0, 0]]) x = tf.layers.conv2d( x, 128, (3, 3), (2, 2), use_bias=False) x = common_layers.layer_norm(x) diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index e974a523d..e71c3e37c 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ -75,6 +75,11 @@ def unicode_to_native(s): return s +def to_unicode_ignore_erros(s): + return (unicode(s, "utf-8", errors="ignore") + if six.PY2 else s.decode("utf-8", "ignore")) + + class TextEncoder(object): """Base class for converting from ints to/from human readable strings.""" @@ -421,6 +426,7 @@ def __init__(self, filename=None): vocab """ self._alphabet = set() + self.filename = filename if filename is not None: self._load_from_file(filename) super(SubwordTextEncoder, self).__init__(num_reserved_ids=None) diff --git a/tensor2tensor/data_generators/text_problems.py b/tensor2tensor/data_generators/text_problems.py new file mode 100644 index 000000000..854a8f9f5 --- /dev/null +++ b/tensor2tensor/data_generators/text_problems.py @@ -0,0 +1,733 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Base classes for text-based Problems. + +* Text2TextProblem: input=text, target=text. +* Text2ClassProblem: input=text, target=class. +* Text2SelfProblem (for language modeling): target=text + +The Text2TextTmpDir problem allows you to train without defining a problem. It +expects you to format your data in a particular way and put it in tmp_dir. See +its docstring. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +from tensor2tensor.data_generators import generator_utils +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.utils import metrics +from tensor2tensor.utils import registry + +import tensorflow as tf + + +class VocabType(object): + """Available text vocabularies.""" + CHARACTER = "character" + SUBWORD = "subwords" + TOKEN = "tokens" + + +class Text2TextProblem(problem.Problem): + """Base class for text-to-text problems. + + Subclasses only must override `generate_samples` and `is_generate_per_split`. + See the "Subclass interface" code block below to see what else subclasses can + override. + """ + + # START: Subclass interface + @property + def dataset_splits(self): + """Splits of data to produce and number of output shards for each.""" + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 100, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + }] + + @property + def is_generate_per_split(self): + """A single call to `generate_samples` generates for all `dataset_splits`. + + Set to True if you already have distinct subsets of data for each dataset + split specified in `self.dataset_splits`. `self.generate_samples` will be + called once for each split. + + Set to False if you have a unified dataset that you'd like to have split out + into training and evaluation data automatically. `self.generate_samples` + will be called only once and the data will be sharded across the dataset + splits specified in `self.dataset_splits`. + + Returns: + bool + """ + raise NotImplementedError() + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + """Generate samples of input text and target text pairs. + + Each yielded dict will be made into a single example. The values should be + raw text. The Problem will generate a vocabulary and encode the raw text as + integers as part of the data generation process. + + This method is typically called once per split in `self.dataset_splits` + unless `self.is_generate_per_split=False`. + + Args: + data_dir: final data directory. Typically only used in this method to copy + over user-supplied vocab files (for example, if vocab_type == + VocabType.TOKEN). + tmp_dir: temporary directory that you can use for downloading and scratch. + dataset_split: problem.DatasetSplit, which data split to generate samples + for (for example, training and evaluation). 
+ + Yields: + {"inputs": text, "targets": text} + """ + raise NotImplementedError() + + @property + def vocab_type(self): + """What kind of vocabulary to use. + + `VocabType`s: + * `SUBWORD`: `SubwordTextEncoder`, an invertible wordpiece vocabulary. + Must provide `self.approx_vocab_size`. Generates the vocabulary based on + the training data. To limit the number of samples the vocab generation + looks at, override `self.max_samples_for_vocab`. Recommended and + default. + * `CHARACTER`: `ByteTextEncoder`, encode raw bytes. + * `TOKEN`: `TokenTextEncoder`, vocabulary based on a file. Must provide a + vocabulary file yourself (`TokenTextEncoder.store_to_file`) because one + will not be generated for you. The vocab file should be stored in + `data_dir/` with the name specified by `self.vocab_filename`. + + Returns: + VocabType constant + """ + return VocabType.SUBWORD + + def approx_vocab_size(self): + """Approximate vocab size to generate. Only for VocabType.SUBWORD.""" + return 2**15 # ~32k + + @property + def max_samples_for_vocab(self): + """How many samples from `generate_samples` to look at for vocab generation. + + Only applies if self.vocab_type == VocabType.SUBWORD. + + If None, look at all training samples. + + Returns: + None or int. + """ + return None + + @property + def packed_length(self): + """Pack multiple examples into a single example of constant length. + + This is useful for TPU training to reduce the fraction of padding tokens. + See generator_utils.pack_examples. + + Returns: + None or int + """ + return None + + # END: Subclass interface + + @property + def has_inputs(self): + return True + + def max_length(self, model_hparams): + return (self.packed_length or + super(Text2TextProblem, self).max_length(model_hparams)) + + def feature_encoders(self, data_dir): + encoder = self.get_or_create_vocab(data_dir, None, force_get=True) + encoders = {"targets": encoder} + if self.has_inputs: + encoders["inputs"] = encoder + return encoders + + def generate_text_for_vocab(self, data_dir, tmp_dir): + for i, sample in enumerate( + self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN)): + if self.has_inputs: + yield sample["inputs"] + yield sample["targets"] + if self.max_samples_for_vocab and (i + 1) >= self.max_samples_for_vocab: + break + + @property + def vocab_filename(self): + if self.vocab_type == VocabType.SUBWORD: + return "vocab.%s.%d.%s" % (self.name, self.approx_vocab_size, + VocabType.SUBWORD) + else: + return "vocab.%s.%s" % (self.name, VocabType.TOKEN) + + def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False): + if self.vocab_type == VocabType.CHARACTER: + encoder = text_encoder.ByteTextEncoder() + elif self.vocab_type == VocabType.SUBWORD: + if force_get: + vocab_filepath = os.path.join(data_dir, self.vocab_filename) + encoder = text_encoder.SubwordTextEncoder(vocab_filepath) + else: + encoder = generator_utils.get_or_generate_vocab_inner( + data_dir, self.vocab_filename, self.approx_vocab_size, + self.generate_text_for_vocab(data_dir, tmp_dir)) + elif self.vocab_type == VocabType.TOKEN: + vocab_filename = os.path.join(data_dir, self.vocab_filename) + encoder = text_encoder.TokenTextEncoder(vocab_filename) + else: + raise ValueError("Unrecognized VocabType") + return encoder + + def _maybe_pack_examples(self, generator): + """Wraps generator with packer if self.packed_length.""" + if not self.packed_length: + return generator + return generator_utils.pack_examples( + generator, + self.has_inputs, + self.packed_length, + 
chop_long_sequences=not self.has_inputs) + + def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): + generator = self.generate_samples(data_dir, tmp_dir, dataset_split) + encoder = self.get_or_create_vocab(data_dir, tmp_dir) + for sample in generator: + targets = encoder.encode(sample["targets"]) + targets.append(text_encoder.EOS_ID) + encoded_sample = {"targets": targets} + if self.has_inputs: + inputs = encoder.encode(sample["inputs"]) + inputs.append(text_encoder.EOS_ID) + encoded_sample["inputs"] = inputs + yield encoded_sample + + @property + def batch_size_means_tokens(self): + return True + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + + filepath_fns = { + problem.DatasetSplit.TRAIN: self.training_filepaths, + problem.DatasetSplit.EVAL: self.dev_filepaths, + problem.DatasetSplit.TEST: self.test_filepaths, + } + + split_paths = dict([(split["split"], filepath_fns[split["split"]]( + data_dir, split["shards"], shuffled=False)) + for split in self.dataset_splits]) + all_paths = [] + for paths in split_paths.values(): + all_paths.extend(paths) + + if self.is_generate_per_split: + for split, paths in split_paths.items(): + generator_utils.generate_files( + self._maybe_pack_examples( + self.generate_encoded_samples(data_dir, tmp_dir, split)), paths) + else: + generator_utils.generate_files( + self._maybe_pack_examples( + self.generate_encoded_samples( + data_dir, tmp_dir, problem.DatasetSplit.TRAIN)), all_paths) + + generator_utils.shuffle_dataset(all_paths) + + def hparams(self, defaults, unused_model_hparams): + p = defaults + p.stop_at_eos = int(True) + + if self.has_inputs: + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = { + "inputs": (registry.Modalities.SYMBOL, source_vocab_size) + } + target_vocab_size = self._encoders["targets"].vocab_size + p.target_modality = (registry.Modalities.SYMBOL, target_vocab_size) + if self.vocab_type == VocabType.CHARACTER: + p.loss_multiplier = 2.0 + + if self.packed_length: + identity = (registry.Modalities.GENERIC, None) + if self.has_inputs: + p.input_modality["inputs_segmentation"] = identity + p.input_modality["inputs_position"] = identity + p.input_modality["targets_segmentation"] = identity + p.input_modality["targets_position"] = identity + + def example_reading_spec(self): + data_fields = {"targets": tf.VarLenFeature(tf.int64)} + if self.has_inputs: + data_fields["inputs"] = tf.VarLenFeature(tf.int64) + + if self.packed_length: + if self.has_inputs: + data_fields["inputs_segmentation"] = tf.VarLenFeature(tf.int64) + data_fields["inputs_position"] = tf.VarLenFeature(tf.int64) + data_fields["targets_segmentation"] = tf.VarLenFeature(tf.int64) + data_fields["targets_position"] = tf.VarLenFeature(tf.int64) + + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, + metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, + metrics.Metrics.APPROX_BLEU, metrics.Metrics.ROUGE_2_F, + metrics.Metrics.ROUGE_L_F + ] + + +class Text2SelfProblem(Text2TextProblem): + """Language modeling problems base class. + + See Text2TextProblem for subclass interface. + """ + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + """Generate samples of text. + + Args: + data_dir: final data directory. Typically only used in this method to copy + over user-supplied vocab files (for example, if vocab_type == + VocabType.TOKEN). 
+ tmp_dir: temporary directory that you can use for downloading and scratch. + dataset_split: problem.DatasetSplit, which data split to generate samples + for (for example, training and evaluation). + + Yields: + Sample: dict: for language modeling problems + (i.e. Text2SelfProblems), this generator should yield dicts with only + the "targets" key. + """ + raise NotImplementedError() + + @property + def has_inputs(self): + return False + + +class Text2ClassProblem(Text2TextProblem): + """Base class for text classification problems.""" + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + """Generate samples of text and label pairs. + + Each yielded dict will be a single example. The inputs should be raw text. + The label should be an int in [0, self.num_classes). + + Args: + data_dir: final data directory. Typically only used in this method to copy + over user-supplied vocab files (for example, if vocab_type == + VocabType.TOKEN). + tmp_dir: temporary directory that you can use for downloading and scratch. + dataset_split: problem.DatasetSplit, which data split to generate samples + for (for example, training and evaluation). + + Yields: + {"inputs": text, "label": int} + """ + raise NotImplementedError() + + # START: Additional subclass interface + @property + def num_classes(self): + """The number of classes.""" + raise NotImplementedError() + + def class_labels(self, data_dir): + """String representation of the classes.""" + del data_dir + return ["ID_%d" % i for i in range(self.num_classes)] + + # END: Additional subclass interface + + def generate_text_for_vocab(self, data_dir, tmp_dir): + for i, sample in enumerate( + self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN)): + yield sample["inputs"] + if self.max_samples_for_vocab and (i + 1) >= self.max_samples_for_vocab: + break + + def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): + generator = self.generate_samples(data_dir, tmp_dir, dataset_split) + encoder = self.get_or_create_vocab(data_dir, tmp_dir) + for sample in generator: + inputs = encoder.encode(sample["inputs"]) + inputs.append(text_encoder.EOS_ID) + label = sample["label"] + yield {"inputs": inputs, "targets": [label]} + + def feature_encoders(self, data_dir): + encoder = self.get_or_create_vocab(data_dir, None, force_get=True) + + return { + "inputs": encoder, + "targets": text_encoder.ClassLabelEncoder(self.class_labels) + } + + def hparams(self, defaults, unused_model_hparams): + p = defaults + source_vocab_size = self._encoders["inputs"].vocab_size + p.input_modality = { + "inputs": (registry.Modalities.SYMBOL, source_vocab_size) + } + p.target_modality = (registry.Modalities.CLASS_LABEL, self.num_classes) + + def example_reading_spec(self): + data_fields = { + "inputs": tf.VarLenFeature(tf.int64), + "targets": tf.FixedLenFeature([1], tf.int64), + } + data_items_to_decoders = None + return (data_fields, data_items_to_decoders) + + +def txt_line_iterator(txt_path): + """Iterate through lines of file.""" + with tf.gfile.Open(txt_path) as f: + readline = lambda: f.readline().strip() + for line in iter(readline, ""): + yield line + + +def text2text_txt_iterator(source_txt_path, target_txt_path): + """Yield dicts for Text2TextProblem.generate_samples from lines of files.""" + for inputs, targets in zip( + txt_line_iterator(source_txt_path), txt_line_iterator(target_txt_path)): + yield {"inputs": inputs, "targets": targets} + + +def text2self_txt_iterator(txt_path): + for line in txt_line_iterator(txt_path): + yield 
{"targets": line} + + +def text2class_txt_iterator(source_txt_path, label_txt_path, class_strs=None): + """Yield dicts for Text2ClassProblem.generate_samples from lines of files. + + Args: + source_txt_path: txt file with record per line. + label_txt_path: txt file with label per line, either as int or str. If + string, must provide class_strs. + class_strs: list of class label names. Must be in correct order (i.e. + ["a", "b", "c"] means that "a" will get class ID 0, "b" ID 1, etc.). + + Yields: + {"inputs": inputs, "label": label} + """ + if class_strs: + class_strs = dict([(s, i) for i, s in enumerate(class_strs)]) + for inputs, label in zip( + txt_line_iterator(source_txt_path), txt_line_iterator(label_txt_path)): + label = label.strip() + if class_strs: + label = class_strs[label] + else: + label = int(label) + yield {"inputs": inputs, "label": label} + + +def text2text_txt_tab_iterator(txt_path): + """Yield dicts for Text2TextProblem.generate_samples from lines of txt_path. + + Args: + txt_path: path to txt file with a record per line, source and target + are tab-separated. + + Yields: + {"inputs": inputs, "targets": targets} + """ + for line in txt_line_iterator(txt_path): + if line and "\t" in line: + parts = line.split("\t") + inputs, targets = parts[:2] + yield {"inputs": inputs.strip(), "targets": targets.strip()} + + +@registry.register_problem +class Text2textTmpdir(Text2TextProblem): + """Allows training a Text2TextProblem without defining a subclass. + + Put your training and evaluation data into the following files in tmp_dir, + with 1 record per line: + + * inputs.train.txt + * targets.train.txt + * inputs.eval.txt + * targets.eval.txt + """ + TRAIN_FILES = ("inputs.train.txt", "targets.train.txt") + EVAL_FILES = ("inputs.eval.txt", "targets.eval.txt") + + def is_generate_per_split(self): + return True + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + del data_dir + is_training = dataset_split == problem.DatasetSplit.TRAIN + files = self.TRAIN_FILES if is_training else self.EVAL_FILES + files = [os.path.join(tmp_dir, f) for f in files] + inputs_file, targets_file = files + return text2text_txt_iterator(inputs_file, targets_file) + + +class ChoppedTextProblem(Text2SelfProblem): + """Tokenize and chop text files into fixed-length language-modeling examples. + + The input data is a set of text files, as specified by + self.train_text_filepaths() and self.dev_text_filepaths(). + + The text is tokenized using a SubwordTextEncoder, and + then split into examples, each of length self.sequence_length(). + """ + + def train_text_filepaths(self, tmp_dir): + """Local filepaths of text files containing training data. + + This function may want to download the files if they do not exist. + + Args: + tmp_dir: a string + Returns: + a list of strings. + """ + raise NotImplementedError() + + def dev_text_filepaths(self, tmp_dir): + """Local filepaths of text files containing dev data. + + This function may want to download the files if they do not exist. + + Args: + tmp_dir: a string + Returns: + a list of strings. + """ + raise NotImplementedError() + + @property + def sequence_length(self): + """Length of each example (in tokens).""" + raise NotImplementedError() + + def max_length(self, model_hparams): + return model_hparams.split_to_length or self.sequence_length + + def text_filepaths_for_task(self, tmp_dir, task_id): + """List of input filepaths for a particular training or dev shard. 
+ + Args: + tmp_dir: a string + task_id: an integer less than self.num_shards + Returns: + a list of tuples (filepath, start_pos, num_bytes) + """ + assert task_id >= 0 + assert task_id < self.num_train_shards + self.num_dev_shards + if task_id < self.num_train_shards: + return [ + f for i, f in enumerate(self.train_text_filepaths(tmp_dir)) + if i % self.num_train_shards == task_id + ] + else: + return [ + f for i, f in enumerate(self.dev_text_filepaths(tmp_dir)) + if i % self.num_dev_shards == task_id - self.num_train_shards + ] + + def filepath_to_unicode_strings(self, filepath): + """Read text out of an input file. + + The default just reads the text, converts to unicode and yields one + unicode string. + + Subclasses can override this function in order to preprocess, and can + yield any number of strings. + + Args: + filepath: a string + Yields: + unicode strings. + """ + f = tf.gfile.Open(filepath) + b = f.read() + yield text_encoder.to_unicode_ignore_erros(b) + + def file_generator(self, + filepaths, + max_chars_per_file=None, + max_chars_total=None): + """Read complete text of input files and yield unicode strings. + + By default, one unicode string is produced per file, but this is + not guaranteed, since subclasses can override + filepath_to_unicode_strings(). + + max_chars_per_file and max_chars_total can also be specified, in which + case some strings may be truncated or dropped to limit the total + amount of output. + + Args: + filepaths: a list of strings + max_chars_per_file: an optional integer + max_chars_total: an optional integer + Yields: + unicode strings + """ + chars_total = 0 + for fname in filepaths: + chars_this_file = 0 + tf.logging.info("reading file %s" % fname) + for text in self.filepath_to_unicode_strings(fname): + if (max_chars_per_file and + chars_this_file + len(text) > max_chars_per_file): + text = text[:max_chars_per_file - chars_this_file] + if max_chars_total and chars_total + len(text) > max_chars_total: + text = text[:max_chars_total - chars_total] + chars_total += len(text) + chars_this_file += len(text) + if text: + yield text + if max_chars_total and chars_total >= max_chars_total: + return + if max_chars_per_file and chars_this_file >= max_chars_per_file: + break + + def example_generator(self, encoder, tmp_dir, task_id): + """Generator for examples. + + Args: + encoder: a TextEncoder + tmp_dir: a string + task_id: an integer + Yields: + feature dictionaries + """ + filepaths = self.text_filepaths_for_task(tmp_dir, task_id) + if task_id >= self.num_train_shards: + # this is dev data - limit the total length. + max_chars_per_file = self.max_dev_chars // ( + self.num_dev_shards * len(filepaths)) + else: + max_chars_per_file = None + tokens = [] + for ftext in self.file_generator( + filepaths, max_chars_per_file=max_chars_per_file): + tokens.extend(encoder.encode(ftext)) + pos = 0 + while pos + self.sequence_length <= len(tokens): + yield {"targets": tokens[pos:pos + self.sequence_length]} + pos += self.sequence_length + if pos > 0: + tokens = tokens[pos:] + if self.remainder_policy == "pad": + if tokens: + targets = tokens + [0] * (self.sequence_length - len(tokens)) + yield {"targets": targets} + else: + assert self.remainder_policy == "drop" + + @property + def remainder_policy(self): + """What to do with leftover tokens. + + Returns: + a string - either "pad" or "drop". 
+ """ + return "pad" + + def prepare_to_generate(self, data_dir, tmp_dir): + """Make sure that the data is prepared and the vocab is generated.""" + self.get_or_create_vocab(data_dir, tmp_dir) + self.train_text_filepaths(tmp_dir) + self.dev_text_filepaths(tmp_dir) + + def generate_text_for_vocab(self, data_dir, tmp_dir): + return self.file_generator( + self.train_text_filepaths(tmp_dir), + max_chars_total=self.max_chars_for_vocab) + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + """Generates training/dev data. + + Args: + data_dir: a string + tmp_dir: a string + task_id: an optional integer + Returns: + shard or shards for which data was generated. + """ + tf.logging.info("generate_data task_id=%s" % task_id) + encoder = self.get_or_create_vocab(data_dir, tmp_dir) + assert task_id >= 0 and task_id < self.num_generate_tasks + if task_id < self.num_train_shards: + out_file = self.training_filepaths( + data_dir, self.num_train_shards, shuffled=False)[task_id] + else: + out_file = self.dev_filepaths( + data_dir, self.num_dev_shards, + shuffled=False)[task_id - self.num_train_shards] + generator_utils.generate_files( + self.example_generator(encoder, tmp_dir, task_id), [out_file]) + generator_utils.shuffle_dataset([out_file]) + + @property + def max_chars_for_vocab(self): + """Number of characters of training data to use for generating vocab.""" + return 10**7 + + @property + def num_train_shards(self): + return self.dataset_splits[0]["shards"] + + @property + def num_dev_shards(self): + return self.dataset_splits[1]["shards"] + + @property + def max_dev_chars(self): + """Limit dev set to at most this many characters (default 10M).""" + return 10**7 + + @property + def multiprocess_generate(self): + return True + + @property + def num_generate_tasks(self): + return self.num_train_shards + self.num_dev_shards + + def eval_metrics(self): + return [metrics.Metrics.ACC, metrics.Metrics.NEG_LOG_PERPLEXITY] diff --git a/tensor2tensor/data_generators/text_problems_test.py b/tensor2tensor/data_generators/text_problems_test.py new file mode 100644 index 000000000..af39e35ef --- /dev/null +++ b/tensor2tensor/data_generators/text_problems_test.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Text problems test.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil + +# Dependency imports + +from tensor2tensor.data_generators import problem as problem_lib +from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems + +import tensorflow as tf + + +class Test1(text_problems.Text2textTmpdir): + + @property + def name(self): + # name is normally provided by register_problem, but this problem is not + # registered, so we provide one here to avoid inheriting the parent class's + # name. 
+ return "test1" + + @property + def approx_vocab_size(self): + return 3 + + @property + def dataset_splits(self): + return [{ + "split": problem_lib.DatasetSplit.TRAIN, + "shards": 1, + }, { + "split": problem_lib.DatasetSplit.EVAL, + "shards": 1, + }] + + +class TextProblems(tf.test.TestCase): + + @classmethod + def setUpClass(cls): + cls.tmp_dir = tf.test.get_temp_dir() + shutil.rmtree(cls.tmp_dir) + os.mkdir(cls.tmp_dir) + + cls.inputs = [ + "Hello world", + "Goodbye world", + ] + cls.targets = [ + "Hola mundo", + "Adios mundo", + ] + cls.labels = [2, 3] + cls.labels_strs = ["c", "d"] + + cls.inputs_file = os.path.join(cls.tmp_dir, "inputs.train.txt") + cls.targets_file = os.path.join(cls.tmp_dir, "targets.train.txt") + cls.labels_file = os.path.join(cls.tmp_dir, "labels.train.txt") + cls.labels_str_file = os.path.join(cls.tmp_dir, "labels_str.train.txt") + data = [(cls.inputs, cls.inputs_file), (cls.targets, cls.targets_file), + (cls.labels, cls.labels_file), (cls.labels_strs, + cls.labels_str_file)] + + for lines, filename in data: + with tf.gfile.Open(filename, "w") as f: + for line in lines: + f.write(str(line)) + f.write("\n") + + cls.tabbed_file = os.path.join(cls.tmp_dir, "tabbed.train.txt") + with tf.gfile.Open(cls.tabbed_file, "w") as f: + for inputs, targets in zip(cls.inputs, cls.targets): + f.write("%s\t%s\n" % (inputs, targets)) + + tf.gfile.Copy(cls.inputs_file, os.path.join(cls.tmp_dir, "inputs.eval.txt")) + tf.gfile.Copy(cls.targets_file, os.path.join(cls.tmp_dir, + "targets.eval.txt")) + + def testTxtLineIterator(self): + lines = [line for line in text_problems.txt_line_iterator(self.inputs_file)] + self.assertEqual(lines, self.inputs) + + def testText2TextTxtIterator(self): + inputs = [] + targets = [] + for entry in text_problems.text2text_txt_iterator(self.inputs_file, + self.targets_file): + inputs.append(entry["inputs"]) + targets.append(entry["targets"]) + self.assertEqual(inputs, self.inputs) + self.assertEqual(targets, self.targets) + + def testText2SelfTxtIterator(self): + targets = [ + entry["targets"] + for entry in text_problems.text2self_txt_iterator(self.targets_file) + ] + self.assertEqual(targets, self.targets) + + def testText2ClassTxtIterator(self): + inputs = [] + labels = [] + for entry in text_problems.text2class_txt_iterator(self.inputs_file, + self.labels_file): + inputs.append(entry["inputs"]) + labels.append(entry["label"]) + self.assertEqual(inputs, self.inputs) + self.assertEqual(labels, self.labels) + + def testText2ClassTxtIteratorWithStrs(self): + inputs = [] + labels = [] + for entry in text_problems.text2class_txt_iterator( + self.inputs_file, self.labels_str_file, class_strs=["a", "b", "c", + "d"]): + inputs.append(entry["inputs"]) + labels.append(entry["label"]) + self.assertEqual(inputs, self.inputs) + self.assertEqual(labels, self.labels) + + def testText2TextTxtTabIterator(self): + inputs = [] + targets = [] + for entry in text_problems.text2text_txt_tab_iterator(self.tabbed_file): + inputs.append(entry["inputs"]) + targets.append(entry["targets"]) + self.assertEqual(inputs, self.inputs) + self.assertEqual(targets, self.targets) + + def testText2TextTmpDir(self): + problem = Test1() + problem.generate_data(self.tmp_dir, self.tmp_dir) + vocab_file = os.path.join(self.tmp_dir, "vocab.test1.3.subwords") + train_file = os.path.join(self.tmp_dir, "test1-train-00000-of-00001") + eval_file = os.path.join(self.tmp_dir, "test1-dev-00000-of-00001") + self.assertTrue(tf.gfile.Exists(vocab_file)) + self.assertTrue(tf.gfile.Exists(train_file)) + 
self.assertTrue(tf.gfile.Exists(eval_file)) + + dataset = problem.dataset(tf.estimator.ModeKeys.TRAIN, self.tmp_dir) + features = dataset.make_one_shot_iterator().get_next() + + examples = [] + exhausted = False + with self.test_session() as sess: + examples.append(sess.run(features)) + examples.append(sess.run(features)) + try: + sess.run(features) + except tf.errors.OutOfRangeError: + exhausted = True + + self.assertTrue(exhausted) + self.assertEqual(2, len(examples)) + + self.assertNotEqual( + list(examples[0]["inputs"]), list(examples[1]["inputs"])) + + example = examples[0] + encoder = text_encoder.SubwordTextEncoder(vocab_file) + inputs_encoded = list(example["inputs"]) + inputs_encoded.pop() # rm EOS + self.assertTrue(encoder.decode(inputs_encoded) in self.inputs) + targets_encoded = list(example["targets"]) + targets_encoded.pop() # rm EOS + self.assertTrue(encoder.decode(targets_encoded) in self.targets) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/data_generators/translate.py b/tensor2tensor/data_generators/translate.py index 20c5081e7..0c6d32ca8 100644 --- a/tensor2tensor/data_generators/translate.py +++ b/tensor2tensor/data_generators/translate.py @@ -26,26 +26,44 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_problems import tensorflow as tf FLAGS = tf.flags.FLAGS -class TranslateProblem(problem.Text2TextProblem): +class TranslateProblem(text_problems.Text2TextProblem): """Base class for translation problems.""" - @property - def is_character_level(self): - return False + def is_generate_per_split(self): + return True - @property - def num_shards(self): - return 100 + def approx_vocab_size(self): + return 2**15 - @property - def use_subword_tokenizer(self): - return True + def source_data_files(self, dataset_split): + """Files to be passed to compile_data.""" + raise NotImplementedError() + + def vocab_data_files(self): + """Files to be passed to get_or_generate_vocab.""" + return self.source_data_files(problem.DatasetSplit.TRAIN) + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN + datasets = self.source_data_files(dataset_split) + tag = "train" if train else "dev" + data_path = compile_data(tmp_dir, datasets, "%s-compiled-%s" % (self.name, + tag)) + + if self.vocab_type == text_problems.VocabType.SUBWORD: + generator_utils.get_or_generate_vocab( + data_dir, tmp_dir, self.vocab_filename, self.approx_vocab_size, + self.vocab_data_files()) + + return text_problems.text2text_txt_iterator(data_path + ".lang1", + data_path + ".lang2") # Generic generators used later for multiple problems. 
diff --git a/tensor2tensor/data_generators/translate_encs.py b/tensor2tensor/data_generators/translate_encs.py index b5f72fc9a..3b6adc5aa 100644 --- a/tensor2tensor/data_generators/translate_encs.py +++ b/tensor2tensor/data_generators/translate_encs.py @@ -21,9 +21,9 @@ # Dependency imports -from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate from tensor2tensor.utils import registry @@ -65,43 +65,29 @@ class TranslateEncsWmt32k(translate.TranslateProblem): """Problem spec for WMT English-Czech translation.""" @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**15 # 32768 @property - def vocab_name(self): - return "vocab.encs" + def vocab_filename(self): + return "vocab.encs.%d" % self.approx_vocab_size - def generator(self, data_dir, tmp_dir, train): - datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS - tag = "train" if train else "dev" + def source_data_files(self, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN + return _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS + + def vocab_data_files(self): + datasets = self.source_data_files(problem.DatasetSplit.TRAIN) vocab_datasets = [] - data_path = translate.compile_data(tmp_dir, datasets, - "wmt_encs_tok_%s" % tag) - # CzEng contains 100 gz files with tab-separated columns, so let's expect - # it is the first dataset in datasets and use the newly created *.lang{1,2} - # files for vocab construction. if datasets[0][0].endswith("data-plaintext-format.tar"): vocab_datasets.append([ - datasets[0][0], - ["wmt_encs_tok_%s.lang1" % tag, - "wmt_encs_tok_%s.lang2" % tag] + datasets[0][0], [ + "%s-compiled-train.lang1" % self.name, + "%s-compiled-train.lang2" % self.name + ] ]) datasets = datasets[1:] vocab_datasets += [[item[0], [item[1][0], item[1][1]]] for item in datasets] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - vocab_datasets) - return translate.token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.CS_TOK @registry.register_problem @@ -109,22 +95,14 @@ class TranslateEncsWmtCharacters(translate.TranslateProblem): """Problem spec for WMT En-Cs character-based translation.""" @property - def is_character_level(self): - return True + def vocab_type(self): + return text_problems.VocabType.CHARACTER - def generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() + def generate_samples(self, data_dir, tmp_dir, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN datasets = _ENCS_TRAIN_DATASETS if train else _ENCS_TEST_DATASETS tag = "train" if train else "dev" data_path = translate.compile_data(tmp_dir, datasets, "wmt_encs_chr_%s" % tag) - return translate.character_generator( - data_path + ".lang1", data_path + ".lang2", character_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR - - @property - def target_space_id(self): - return problem.SpaceID.CS_CHR + return text_problems.text2text_txt_iterator(data_path + ".lang1", + data_path + ".lang2") diff --git a/tensor2tensor/data_generators/translate_ende.py 
b/tensor2tensor/data_generators/translate_ende.py index f9be14681..b493ec5c9 100644 --- a/tensor2tensor/data_generators/translate_ende.py +++ b/tensor2tensor/data_generators/translate_ende.py @@ -27,16 +27,12 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate from tensor2tensor.utils import registry import tensorflow as tf -FLAGS = tf.flags.FLAGS - -# End-of-sentence marker. -EOS = text_encoder.EOS_ID - _ENDE_TRAIN_DATASETS = [ [ "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz", # pylint: disable=line-too-long @@ -79,41 +75,38 @@ class TranslateEndeWmtBpe32k(translate.TranslateProblem): """Problem spec for WMT En-De translation, BPE version.""" @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 32000 @property - def vocab_name(self): - return "vocab.bpe" + def vocab_filename(self): + return "vocab.bpe.%d" % self.approx_vocab_size - def feature_encoders(self, data_dir): - vocab_filename = os.path.join(data_dir, self.vocab_file) - encoder = text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK") - return {"inputs": encoder, "targets": encoder} + def get_or_create_vocab(self, data_dir, tmp_dir, force_get=False): + vocab_filename = os.path.join(data_dir, self.vocab_filename) + if not tf.gfile.Exists(vocab_filename) and force_get: + raise ValueError("Vocab %s not found" % vocab_filename) + return text_encoder.TokenTextEncoder(vocab_filename, replace_oov="UNK") - def generator(self, data_dir, tmp_dir, train): + def generate_samples(self, data_dir, tmp_dir, dataset_split): """Instance of token generator for the WMT en->de task, training set.""" + train = dataset_split == problem.DatasetSplit.TRAIN dataset_path = ("train.tok.clean.bpe.32000" if train else "newstest2013.tok.bpe.32000") train_path = _get_wmt_ende_bpe_dataset(tmp_dir, dataset_path) - token_tmp_path = os.path.join(tmp_dir, self.vocab_file) - token_path = os.path.join(data_dir, self.vocab_file) - tf.gfile.Copy(token_tmp_path, token_path, overwrite=True) - with tf.gfile.GFile(token_path, mode="r") as f: - vocab_data = "\n\n" + f.read() + "UNK\n" - with tf.gfile.GFile(token_path, mode="w") as f: - f.write(vocab_data) - token_vocab = text_encoder.TokenTextEncoder(token_path, replace_oov="UNK") - return translate.token_generator(train_path + ".en", train_path + ".de", - token_vocab, EOS) - @property - def input_space_id(self): - return problem.SpaceID.EN_BPE_TOK + # Vocab + token_path = os.path.join(data_dir, self.vocab_filename) + if not tf.gfile.Exists(token_path): + token_tmp_path = os.path.join(tmp_dir, self.vocab_filename) + tf.gfile.Copy(token_tmp_path, token_path) + with tf.gfile.GFile(token_path, mode="r") as f: + vocab_data = "\n\n" + f.read() + "UNK\n" + with tf.gfile.GFile(token_path, mode="w") as f: + f.write(vocab_data) - @property - def target_space_id(self): - return problem.SpaceID.DE_BPE_TOK + return text_problems.text2text_txt_iterator(train_path + ".en", + train_path + ".de") @registry.register_problem @@ -121,38 +114,23 @@ class TranslateEndeWmt8k(translate.TranslateProblem): """Problem spec for WMT En-De translation.""" @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**13 # 8192 @property - def vocab_name(self): - return "vocab.ende" - - def generator(self, data_dir, tmp_dir, train): - 
symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - _ENDE_TRAIN_DATASETS) - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = translate.compile_data(tmp_dir, datasets, - "wmt_ende_tok_%s" % tag) - return translate.token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) + def vocab_filename(self): + return "vocab.ende.%d" % self.approx_vocab_size - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.DE_TOK + def source_data_files(self, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN + return _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS @registry.register_problem class TranslateEndeWmt32k(TranslateEndeWmt8k): @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**15 # 32768 @@ -169,26 +147,5 @@ class TranslateEndeWmtCharacters(translate.TranslateProblem): """Problem spec for WMT En-De translation.""" @property - def is_character_level(self): - return True - - @property - def vocab_name(self): - return "vocab.ende" - - def generator(self, _, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() - datasets = _ENDE_TRAIN_DATASETS if train else _ENDE_TEST_DATASETS - tag = "train" if train else "dev" - data_path = translate.compile_data(tmp_dir, datasets, - "wmt_ende_chr_%s" % tag) - return translate.character_generator( - data_path + ".lang1", data_path + ".lang2", character_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR - - @property - def target_space_id(self): - return problem.SpaceID.DE_CHR + def vocab_type(self): + return text_problems.VocabType.CHARACTER diff --git a/tensor2tensor/data_generators/translate_enfr.py b/tensor2tensor/data_generators/translate_enfr.py index 18e99d069..53b46b78a 100644 --- a/tensor2tensor/data_generators/translate_enfr.py +++ b/tensor2tensor/data_generators/translate_enfr.py @@ -21,9 +21,9 @@ # Dependency imports -from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder +from tensor2tensor.data_generators import text_problems from tensor2tensor.data_generators import translate from tensor2tensor.utils import registry @@ -85,45 +85,34 @@ class TranslateEnfrWmtSmall8k(translate.TranslateProblem): """Problem spec for WMT En-Fr translation.""" @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**13 # 8192 @property - def vocab_name(self): - return "vocab.enfr" + def vocab_filename(self): + return "vocab.enfr.%d" % self.approx_vocab_size @property def use_small_dataset(self): return True - def generator(self, data_dir, tmp_dir, train): - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - _ENFR_TRAIN_SMALL_DATA) + def source_data_files(self, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN if self.use_small_dataset: datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA else: datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA - tag = "train" if train else "dev" - data_path = translate.compile_data(tmp_dir, datasets, - "wmt_enfr_tok_%s" % tag) - return translate.token_generator(data_path + ".lang1", data_path + ".lang2", - symbolizer_vocab, EOS) + 
return datasets - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.FR_TOK + def vocab_data_files(self): + return _ENFR_TRAIN_SMALL_DATA @registry.register_problem class TranslateEnfrWmtSmall32k(TranslateEnfrWmtSmall8k): @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**15 # 32768 @@ -156,36 +145,24 @@ class TranslateEnfrWmtSmallCharacters(translate.TranslateProblem): """Problem spec for WMT En-Fr translation.""" @property - def is_character_level(self): - return True + def vocab_type(self): + return text_problems.VocabType.CHARACTER @property def use_small_dataset(self): return True @property - def vocab_name(self): - return "vocab.enfr" + def vocab_filename(self): + return "vocab.enfr.%d" % self.approx_vocab_size - def generator(self, data_dir, tmp_dir, train): - character_vocab = text_encoder.ByteTextEncoder() + def source_data_files(self, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN if self.use_small_dataset: datasets = _ENFR_TRAIN_SMALL_DATA if train else _ENFR_TEST_SMALL_DATA else: datasets = _ENFR_TRAIN_LARGE_DATA if train else _ENFR_TEST_LARGE_DATA - tag = "train" if train else "dev" - data_path = translate.compile_data(tmp_dir, datasets, - "wmt_enfr_chr_%s" % tag) - return translate.character_generator( - data_path + ".lang1", data_path + ".lang2", character_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.EN_CHR - - @property - def target_space_id(self): - return problem.SpaceID.FR_CHR + return datasets @registry.register_problem diff --git a/tensor2tensor/data_generators/translate_enmk.py b/tensor2tensor/data_generators/translate_enmk.py index 70630c444..a95489841 100644 --- a/tensor2tensor/data_generators/translate_enmk.py +++ b/tensor2tensor/data_generators/translate_enmk.py @@ -21,7 +21,6 @@ # Dependency imports -from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem from tensor2tensor.data_generators import text_encoder from tensor2tensor.data_generators import translate @@ -55,33 +54,16 @@ class TranslateEnmkSetimes32k(translate.TranslateProblem): """Problem spec for SETimes Mk-En translation.""" @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**15 # 32768 @property - def vocab_name(self): - return "vocab.mken" + def vocab_filename(self): + return "vocab.mken.%d" % self.approx_vocab_size - def generator(self, data_dir, tmp_dir, train): + def source_data_files(self, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN datasets = _MKEN_TRAIN_DATASETS if train else _MKEN_TEST_DATASETS source_datasets = [[item[0], [item[1][0]]] for item in datasets] target_datasets = [[item[0], [item[1][1]]] for item in datasets] - symbolizer_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.vocab_file, self.targeted_vocab_size, - source_datasets + target_datasets) - tag = "train" if train else "dev" - data_path = translate.compile_data(tmp_dir, datasets, - "setimes_mken_tok_%s" % tag) - # We generate English->X data by convention, to train reverse translation - # just add the "_rev" suffix to the problem name, e.g., like this. 
- # --problems=translate_enmk_setimes32k_rev - return translate.token_generator(data_path + ".lang2", data_path + ".lang1", - symbolizer_vocab, EOS) - - @property - def input_space_id(self): - return problem.SpaceID.MK_TOK - - @property - def target_space_id(self): - return problem.SpaceID.EN_TOK + return source_datasets + target_datasets diff --git a/tensor2tensor/data_generators/translate_enzh.py b/tensor2tensor/data_generators/translate_enzh.py index 27f621c85..01f9d8fc1 100644 --- a/tensor2tensor/data_generators/translate_enzh.py +++ b/tensor2tensor/data_generators/translate_enzh.py @@ -159,16 +159,16 @@ class TranslateEnzhWmt32k(translate.TranslateProblem): """ @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**15 # 32k @property def source_vocab_name(self): - return "vocab.enzh-en.%d" % self.targeted_vocab_size + return "vocab.enzh-en.%d" % self.approx_vocab_size @property def target_vocab_name(self): - return "vocab.enzh-zh.%d" % self.targeted_vocab_size + return "vocab.enzh-zh.%d" % self.approx_vocab_size def get_training_dataset(self, tmp_dir): """UN Parallel Corpus and CWMT Corpus need to be downloaded manually. @@ -192,32 +192,33 @@ def get_training_dataset(self, tmp_dir): "manually download %s" % filename) return full_dataset - def generator(self, data_dir, tmp_dir, train): + def generate_encoded_samples(self, data_dir, tmp_dir, dataset_split): + train = dataset_split == problem.DatasetSplit.TRAIN train_dataset = self.get_training_dataset(tmp_dir) datasets = train_dataset if train else _NC_TEST_DATASETS source_datasets = [[item[0], [item[1][0]]] for item in train_dataset] target_datasets = [[item[0], [item[1][1]]] for item in train_dataset] source_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size, - source_datasets, file_byte_budget=1e8) + data_dir, + tmp_dir, + self.source_vocab_name, + self.approx_vocab_size, + source_datasets, + file_byte_budget=1e8) target_vocab = generator_utils.get_or_generate_vocab( - data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size, - target_datasets, file_byte_budget=1e8) + data_dir, + tmp_dir, + self.target_vocab_name, + self.approx_vocab_size, + target_datasets, + file_byte_budget=1e8) tag = "train" if train else "dev" - filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag) + filename_base = "wmt_enzh_%sk_tok_%s" % (self.approx_vocab_size, tag) data_path = translate.compile_data(tmp_dir, datasets, filename_base) return translate.bi_vocabs_token_generator(data_path + ".lang1", data_path + ".lang2", source_vocab, target_vocab, EOS) - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.ZH_TOK - def feature_encoders(self, data_dir): source_vocab_filename = os.path.join(data_dir, self.source_vocab_name) target_vocab_filename = os.path.join(data_dir, self.target_vocab_name) @@ -237,12 +238,21 @@ class TranslateEnzhWmt8k(TranslateEnzhWmt32k): """ @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**13 # 8192 @property - def num_shards(self): - return 10 # This is a small dataset. 
+ def dataset_splits(self): + return [ + { + "split": problem.DatasetSplit.TRAIN, + "shards": 10, # this is a small dataset + }, + { + "split": problem.DatasetSplit.EVAL, + "shards": 1, + } + ] def get_training_dataset(self, tmp_dir): """Uses only News Commentary Dataset for training.""" diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index 4f6dd57bb..33a77b746 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -28,13 +28,14 @@ from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_problems from tensor2tensor.utils import registry import tensorflow as tf @registry.register_problem -class LanguagemodelWikiXmlV8kL1k(problem.ChoppedTextProblem): +class LanguagemodelWikiXmlV8kL1k(text_problems.ChoppedTextProblem): """A language model on English Wikipedia. XML dump is chopped arbitrarily into sequences of length 1024 tokens, @@ -88,11 +89,11 @@ def corpus_url(self): "enwiki-20171201-pages-articles.xml.bz2") @property - def vocab_name(self): - return "vocab.wiki_xml" + def vocab_filename(self): + return "vocab.wiki_xml.%d" % self.approx_vocab_size @property - def targeted_vocab_size(self): + def approx_vocab_size(self): return 2**13 # 8192 @property diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index 5badb031d..d567016a5 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -1432,7 +1432,61 @@ def dot_product_attention_relative(q, return _relative_attention_inner(weights, v, relations_values, False) -def masked_local_attention_1d(q, k, v, block_length=128, name=None): +def masked_within_block_local_attention_1d(q, k, v, block_length=64, name=None): + """Attention to the source and a neighborhood to the left within a block. + + The sequence is divided into blocks of length block_size. + Attention for a given query position can only see memory positions + less than or equal to the query position in the corresponding block + + Args: + q: a Tensor with shape [batch, heads, length, depth_k] + k: a Tensor with shape [batch, heads, length, depth_k] + v: a Tensor with shape [batch, heads, length, depth_v] + block_length: an integer + name: an optional string + + Returns: + a Tensor of shape [batch, heads, length, depth_v] + """ + with tf.variable_scope( + name, default_name="within_local_attention_1d", values=[q, k, v]): + v_shape = v.get_shape() + batch, heads, length, _ = common_layers.shape_list(q) + if isinstance(block_length, tf.Tensor): + const = tf.contrib.util.constant_value(block_length) + if const is not None: + block_length = int(const) + + depth_k = common_layers.shape_list(k)[3] + depth_v = common_layers.shape_list(v)[3] + original_length = length + padding_size = tf.mod(-length, block_length) + length += padding_size + padding = [[0, 0], [0, 0], [0, padding_size], [0, 0]] + q = tf.pad(q, padding) + k = tf.pad(k, padding) + v = tf.pad(v, padding) + num_blocks = tf.div(length, block_length) + # compute attention for all subsequent query blocks. 
+ q = tf.reshape(q, [batch, heads, num_blocks, block_length, depth_k]) + k = tf.reshape(k, [batch, heads, num_blocks, block_length, depth_k]) + v = tf.reshape(v, [batch, heads, num_blocks, block_length, depth_v]) + # attention shape: [batch, heads, num_blocks, block_length, block_length] + attention = tf.matmul(q, k, transpose_b=True) + attention += tf.reshape(attention_bias_lower_triangle(block_length), + [1, 1, 1, block_length, block_length]) + attention = tf.nn.softmax(attention) + # initial output shape: [batch, heads, num_blocks, block_length, depth_v] + output = tf.matmul(attention, v) + output = tf.reshape(output, [batch, heads, -1, depth_v]) + output = tf.slice(output, [0, 0, 0, 0], [-1, -1, original_length, -1]) + output.set_shape(v_shape) + return output + + +def masked_local_attention_1d(q, k, v, block_length=128, + make_image_summary=False, name=None): """Attention to the source position and a neighborhood to the left of it. The sequence is divided into blocks of length block_size. @@ -1448,6 +1502,7 @@ def masked_local_attention_1d(q, k, v, block_length=128, name=None): k: a Tensor with shape [batch, heads, length, depth_k] v: a Tensor with shape [batch, heads, length, depth_v] block_length: an integer + make_image_summary: a boolean, whether to make an attention image summary. name: an optional string Returns: @@ -1490,6 +1545,7 @@ def masked_local_attention_1d(q, k, v, block_length=128, name=None): first_k, first_v, attention_bias_lower_triangle(block_length), + make_image_summary=make_image_summary, name="fist_block") # compute attention for all subsequent query blocks. @@ -1802,7 +1858,7 @@ def masked_dilated_self_attention_1d(q, gap_size=2, num_memory_blocks=2, name=None): - """dilated self-attention. + """dilated self-attention. TODO(avaswani): Try it and write a paper on it. Args: q: a Tensor with shape [batch, heads, length, depth_k] @@ -2464,8 +2520,12 @@ def multihead_attention(query_antecedent, x = dot_product_attention_relative(q, k, v, bias, max_relative_position, dropout_rate, image_shapes, make_image_summary=make_image_summary) + elif attention_type == "local_within_block_mask_right": + x = masked_within_block_local_attention_1d(q, k, v, + block_length=block_length) elif attention_type == "local_mask_right": - x = masked_local_attention_1d(q, k, v, block_length=block_length) + x = masked_local_attention_1d(q, k, v, block_length=block_length, + make_image_summary=make_image_summary) elif attention_type == "local_unmasked": x = local_attention_1d( q, k, v, block_length=block_length, filter_width=block_width) diff --git a/tensor2tensor/layers/common_image_attention.py b/tensor2tensor/layers/common_image_attention.py index c8084cb71..f830ac977 100644 --- a/tensor2tensor/layers/common_image_attention.py +++ b/tensor2tensor/layers/common_image_attention.py @@ -14,11 +14,14 @@ # limitations under the License. 
"""Utils for attention mechanism for images.""" +# Dependency imports + +from six.moves import xrange # pylint: disable=redefined-builtin + from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_layers from tensor2tensor.utils import expert_utils -from six.moves import xrange # pylint: disable=redefined-builtin import tensorflow as tf @@ -27,6 +30,7 @@ class AttentionType(object): LOCAL_2D = "local_2d" GLOBAL = "global" GLOCAL = "global_local" + DILATED = "dilated" MOE_LOCAL_1D = "moe_local1d" @staticmethod @@ -37,6 +41,7 @@ def get_choices(): AttentionType.MOE_LOCAL_1D, AttentionType.LOCAL_1D, AttentionType.LOCAL_2D, + AttentionType.DILATED, ] @@ -95,6 +100,43 @@ def local_attention_1d(x, kv_padding=kv_padding, q_filter_width=hparams.q_filter_width, kv_filter_width=hparams.kv_filter_width, + make_image_summary=False, + name="self_attention") + if is_4d: + y = tf.reshape(y, x_shape) + y.set_shape([None, None, None, hparams.hidden_size]) + return y + + +def dilated_attention_1d(x, + self_attention_bias, + hparams, + attention_type="masked_dilated_1d", + q_padding="VALID", + kv_padding="VALID", + gap_size=2): + """Dilated 1d self attention.""" + # self-attention + x, x_shape, is_4d = maybe_reshape_4d_to_3d(x, hparams) + with tf.variable_scope("masked_dilated_1d"): + y = common_attention.multihead_attention( + x, + None, + self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout, + attention_type=attention_type, + block_width=hparams.block_width, + block_length=hparams.block_length, + q_padding=q_padding, + kv_padding=kv_padding, + q_filter_width=hparams.q_filter_width, + kv_filter_width=hparams.kv_filter_width, + gap_size=gap_size, + num_memory_blocks=hparams.num_memory_blocks, name="self_attention") if is_4d: y = tf.reshape(y, x_shape) @@ -209,6 +251,8 @@ def transformer_decoder_layers(inputs, """Multi layer transformer.""" x = inputs x = tf.nn.dropout(x, 1.0 - hparams.layer_prepostprocess_dropout) + if attention_type == AttentionType.DILATED: + assert len(hparams.gap_sizes) == num_layers for layer in xrange(num_layers): with tf.variable_scope("%s_layer_%d" % (name, layer)): # self-attention + skip connections @@ -225,11 +269,15 @@ def transformer_decoder_layers(inputs, y = local_global_attention(common_layers.layer_preprocess(x, hparams), bias, hparams, q_padding="LEFT", kv_padding="LEFT") + elif attention_type == AttentionType.DILATED: + y = dilated_attention_1d(common_layers.layer_preprocess(x, hparams), + bias, hparams, q_padding="LEFT", + kv_padding="LEFT", + gap_size=hparams.gap_sizes[layer]) elif attention_type == AttentionType.GLOBAL: y = full_self_attention(common_layers.layer_preprocess(x, hparams), bias, hparams, q_padding="LEFT", kv_padding="LEFT") - # TODO(nikip): Add support for dilated attention. 
x = common_layers.layer_postprocess(x, y, hparams) # enc-dec attention + skip connections if encoder_output is not None: diff --git a/tensor2tensor/models/__init__.py b/tensor2tensor/models/__init__.py index 58badcb4e..df7744ff9 100644 --- a/tensor2tensor/models/__init__.py +++ b/tensor2tensor/models/__init__.py @@ -25,6 +25,8 @@ from tensor2tensor.layers import modalities from tensor2tensor.models import basic from tensor2tensor.models import bytenet +from tensor2tensor.models import image_transformer +from tensor2tensor.models import image_transformer_2d from tensor2tensor.models import lstm from tensor2tensor.models import neural_gpu from tensor2tensor.models import resnet diff --git a/tensor2tensor/models/image_transformer.py b/tensor2tensor/models/image_transformer.py new file mode 100644 index 000000000..a7e00245f --- /dev/null +++ b/tensor2tensor/models/image_transformer.py @@ -0,0 +1,664 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""image generation with transformer (attention). + +encoder: [Self-Attention, Feed-forward] x n +decoder: [Self-Attention, Source-Target-Attention, Feed-forward] x n + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +# Dependency imports + +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_image_attention as cia +from tensor2tensor.layers import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class Imagetransformer(t2t_model.T2TModel): + """Conditional image generation with attention. See file docstring.""" + + def body(self, features): + hparams = copy.copy(self._hparams) + inputs = features["inputs"] + targets = features["targets"] + if not (tf.get_variable_scope().reuse or + hparams.mode == tf.contrib.learn.ModeKeys.INFER): + tf.summary.image("targets", targets, max_outputs=1) + + # Prepare decoder inputs and bias. + decoder_input, rows, cols, bias = cia.prepare_decoder(targets, hparams) + # Add class label to decoder input. 
+ if not hparams.unconditional: + decoder_input += tf.reshape( + inputs, + [common_layers.shape_list(targets)[0], 1, 1, hparams.hidden_size]) + decoder_output = cia.transformer_decoder_layers( + decoder_input, + None, + bias, + hparams.num_decoder_layers or hparams.num_hidden_layers, + hparams, + attention_type=hparams.dec_attention_type, + name="decoder") + output = cia.create_output(decoder_output, rows, cols, targets, hparams) + return output + + +@registry.register_model +class ImagetransformerMoe(t2t_model.T2TModel): + """Conditional image generation with attention and MoE.""" + + @property + def use_body_sharded(self): + return True + + def body_sharded(self, sharded_features): + dp = self._data_parallelism + hparams = copy.copy(self._hparams) + inputs = sharded_features["inputs"] + targets = sharded_features["targets"] + + # Determine attention type and padding from hparams. + q_padding, kv_padding = "VALID", "VALID" + if hparams.q_filter_width > 1: + q_padding = "LEFT" + if hparams.kv_filter_width > 1: + kv_padding = "LEFT" + + # Prepare decoder inputs and bias. + decoder_input, rows, cols, attention_bias = dp(cia.prepare_decoder_inputs, + inputs, targets, hparams) + + # Run decoder. + decoder_output, extra_loss = cia.transformer_layers_sharded( + dp, + self._ps_devices, + decoder_input, + hparams.num_hidden_layers, + hparams, + self_attention_bias=attention_bias, + enc_output=None, + attention_type=hparams.dec_attention_type, + q_padding=q_padding, + kv_padding=kv_padding, + name="decoder") + + output = dp(cia.create_output, decoder_output, rows, cols, targets, hparams) + return output, extra_loss + + +@registry.register_hparams +def image_transformer_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.hidden_size = 512 + hparams.batch_size = 1 + hparams.max_length = 3075 + hparams.dropout = 0.0 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer_adam_epsilon = 1e-9 + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 0.1 + hparams.learning_rate_warmup_steps = 4000 + hparams.initializer_gain = 0.2 + hparams.num_hidden_layers = 6 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + hparams.label_smoothing = 0.0 + hparams.target_modality = "image:identity" + hparams.norm_type = "layer" + hparams.layer_prepostprocess_dropout = 0.0 + hparams.add_hparam("filter_size", 512) # Add new ones like this. + + # attention-related flags + hparams.add_hparam("num_heads", 8) + hparams.add_hparam("attention_key_channels", 0) + hparams.add_hparam("attention_value_channels", 0) + hparams.add_hparam("ffn_layer", "conv_hidden_relu") + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
+ hparams.add_hparam("attention_dropout", 0.0) + hparams.add_hparam("relu_dropout", 0.0) + hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam("nbr_decoder_problems", 1) + hparams.add_hparam("num_output_layers", 3) + hparams.add_hparam("block_size", 1) + + # dilated attention based flags + hparams.add_hparam("gap_sizes", [2, 4, 8, 16, 32, 64, 2, 4, 8, 16, 32, 64]) + + # image size related flags + # assuming that the image has same height and width + hparams.add_hparam("img_len", 32) + hparams.add_hparam("num_channels", 3) + # Local attention params + hparams.add_hparam("local_and_global_att", False) + hparams.add_hparam("block_length", 256) + hparams.add_hparam("block_width", 128) + hparams.add_hparam("num_encoder_layers", 4) + hparams.add_hparam("num_decoder_layers", 12) + hparams.sep_rgb_embed = False + hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D) + hparams.add_hparam("block_rastor_scan", False) + + # multipos attention params + hparams.add_hparam("q_filter_width", 1) + hparams.add_hparam("kv_filter_width", 1) + + hparams.add_hparam("unconditional", False) # unconditional generation + + return hparams + + +@registry.register_hparams +def imagetransformer_base(): + hparams = image_transformer_base() + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels(): + """separate rgb embeddings.""" + hparams = imagetransformer_base() + hparams.num_heads = 4 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.hidden_size = 256 + hparams.filter_size = 512 + hparams.num_hidden_layers = 6 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_8l(): + """separate rgb embeddings.""" + hparams = imagetransformer_base() + hparams.num_heads = 4 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.hidden_size = 256 + hparams.filter_size = 256 + hparams.num_hidden_layers = 8 + hparams.sampling_method = "random" + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_8l_multipos3(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l() + hparams.q_filter_width = 3 + hparams.kv_filter_width = 3 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_output_channels_8l(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l() + hparams.sep_rgb_embed = True + hparams.sampling_method = "random" + return hparams + + +@registry.register_hparams +def imagetransformer_base_8l_8h_big_cond_dr03_dan(): + """big 1d model for conditional image generation.2.99 on cifar10.""" + hparams = imagetransformer_sep_channels_8l() + hparams.block_width = 256 + hparams.block_length = 256 + hparams.hidden_size = 512 + hparams.num_heads = 8 + hparams.filter_size = 2048 + hparams.batch_size = 4 + hparams.max_length = 3075 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.num_decoder_layers = 8 + hparams.layer_prepostprocess_dropout = 0.3 + return hparams + + +@registry.register_hparams +def imagetransformer_base_8l_8h_big_cond_dr03_dan_128(): + hparams = imagetransformer_base_8l_8h_big_cond_dr03_dan() + hparams.block_width = 128 + hparams.block_length = 128 + return hparams + + +@registry.register_hparams +def imagetransformer_base_10l_8h_big_cond_dr03_dan(): + """Best conditional Cifar10 gen param.""" + hparams = imagetransformer_base_8l_8h_big_cond_dr03_dan() + hparams.num_decoder_layers = 10 + return hparams + + 
+@registry.register_hparams +def imagetransformer_base_10l_8h_big_uncond_dr03_dan(): + """Best unconditional Cifar10 gen param.""" + hparams = imagetransformer_base_10l_8h_big_cond_dr03_dan() + hparams.num_decoder_layers = 10 + hparams.unconditional = True + return hparams + + +@registry.register_hparams +def imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated(): + """Dilated hparams.""" + hparams = imagetransformer_base_8l_8h_big_cond_dr03_dan() + hparams.gap_sizes = [0, 16, 64, 0, 16, 64, 128, 0] + hparams.dec_attention_type = cia.AttentionType.DILATED + hparams.block_length = 128 + hparams.block_width = 128 + hparams.add_hparam("num_memory_blocks", 1) + return hparams + + +@registry.register_hparams +def imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated_b(): + """Dilated hparams.""" + hparams = imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated() + hparams.block_width = 64 + hparams.num_memory_blocks = 2 + return hparams + + +@registry.register_hparams +def imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated_c(): + """Dilated hparams.""" + hparams = imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated() + hparams.block_width = 32 + hparams.num_memory_blocks = 4 + return hparams + + +@registry.register_hparams +def imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated_d(): + """Dilated hparams.""" + hparams = imagetransformer_base_8l_8h_big_cond_dr03_dan_dilated() + hparams.gap_sizes = [0, 16, 64, 16, 64, 128, 256, 0] + return hparams + + +@registry.register_hparams +def imagetransformer_base_12l_8h_big(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.filter_size = 1024 + hparams.num_decoder_layers = 12 + hparams.batch_size = 1 + hparams.hidden_size = 512 + hparams.learning_rate_warmup_steps = 4000 + hparams.sampling_method = "random" + hparams.beam_size = 1 + hparams.block_width = 256 + return hparams + + +@registry.register_hparams +def imagetransformer1d_base_8l_64by64(): + """hparams fo 12 layer big 1d model for imagenet 64x64.""" + hparams = image_transformer_base() + hparams.num_heads = 8 + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.num_decoder_layers = 8 + hparams.batch_size = 1 + hparams.block_length = 512 + hparams.block_width = 768 + hparams.layer_prepostprocess_dropout = 0.1 + hparams.max_length = 14000 + hparams.unconditional = int(False) + return hparams + + +@registry.register_hparams +def imagetransformer1d_base_12l_64by64(): + """hparams fo 12 layer big 1d model for imagenet 64x64.""" + hparams = image_transformer_base() + hparams.num_heads = 8 + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.num_decoder_layers = 12 + hparams.batch_size = 1 + hparams.block_length = 512 + hparams.block_width = 768 + hparams.layer_prepostprocess_dropout = 0.1 + hparams.max_length = 14000 + hparams.unconditional = int(False) + return hparams + + +@registry.register_hparams +def imagetransformer_base_14l_8h_big(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_12l_8h_big() + hparams.num_decoder_layers = 14 + return hparams + + +@registry.register_hparams +def imagetransformer_base_14l_8h_big_dr01(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_14l_8h_big() + hparams.layer_prepostprocess_dropout = 0.1 + return hparams + + +@registry.register_hparams +def imagetransformer_base_12l_8h_big_uncond(): + """big 1d model for conditional image generation.""" + hparams = 
imagetransformer_base_12l_8h_big() + hparams.unconditional = True + return hparams + + +@registry.register_hparams +def imagetransformer_base_14l_8h_big_uncond(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_12l_8h_big_uncond() + hparams.num_decoder_layers = 14 + return hparams + + +@registry.register_hparams +def imagetransformer_base_14l_8h_big_uncond_dr01(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_14l_8h_big_uncond() + hparams.layer_prepostprocess_dropout = 0.1 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_12l_16h_imagenet_large(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_hidden_layers = 12 + hparams.batch_size = 1 + hparams.filter_size = 2048 + hparams.num_heads = 16 + hparams.learning_rate_warmup_steps = 16000 + hparams.sampling_method = "random" + hparams.learning_rate = 0.1 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_16l_16h_imgnet_lrg_loc(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_12l_16h_imagenet_large() + hparams.num_hidden_layers = 16 + hparams.local_attention = True + hparams.batch_size = 1 + hparams.block_length = 256 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_16l_16h_imgnet_lrg_loc_128(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_12l_16h_imagenet_large() + hparams.num_hidden_layers = 16 + hparams.local_attention = True + hparams.batch_size = 1 + hparams.block_length = 128 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_output_channels_8l_local_and_global_att(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l() + hparams.sep_rgb_embed = True + hparams.sampling_method = "random" + hparams.local_and_global_att = True + return hparams + + +@registry.register_hparams +def imagetransformer_base_10l_16h_big_uncond_dr01_imgnet(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_14l_8h_big_uncond_dr01() + # num_hidden_layers + hparams.num_decoder_layers = 10 + hparams.num_heads = 16 + hparams.hidden_size = 1024 + hparams.filter_size = 4096 + hparams.batch_size = 1 + hparams.layer_prepostprocess_dropout = 0.1 + return hparams + + +@registry.register_hparams +def imagetransformer_base_10l_16h_big_dr01_imgnet(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_14l_8h_big_uncond_dr01() + # num_hidden_layers + hparams.num_decoder_layers = 10 + hparams.num_heads = 16 + hparams.hidden_size = 1024 + hparams.filter_size = 4096 + hparams.batch_size = 1 + hparams.unconditional = False + hparams.layer_prepostprocess_dropout = 0.1 + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_8l_8h(): + """separate rgb embeddings.""" + hparams = imagetransformer_base() + hparams.num_heads = 8 + hparams.batch_size = 1 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.hidden_size = 512 + hparams.filter_size = 512 + hparams.num_hidden_layers = 8 + hparams.sampling_method = "random" + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_10l_8h(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_hidden_layers = 8 + hparams.learning_rate_warmup_steps = 16000 + hparams.sampling_method = "random" + return hparams 
+ + +@registry.register_hparams +def imagetransformer_sep_channels_12l_8h(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_hidden_layers = 12 + hparams.batch_size = 2 + hparams.learning_rate_warmup_steps = 16000 + hparams.sampling_method = "random" + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_12l_8h_nda(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_hidden_layers = 12 + hparams.batch_size = 2 + hparams.learning_rate_warmup_steps = 16000 + hparams.sampling_method = "random" + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_12l_8h_4k(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_hidden_layers = 12 + hparams.batch_size = 2 + hparams.learning_rate_warmup_steps = 4000 + hparams.sampling_method = "random" + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_12l_8h_sep_rgb(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_hidden_layers = 12 + hparams.batch_size = 2 + hparams.learning_rate_warmup_steps = 16000 + hparams.sep_rgb_embed = True + hparams.sampling_method = "random" + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_8l_8h_local_and_global_att(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l_8h() + hparams.num_heads = 8 + hparams.batch_size = 1 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.hidden_size = 256 + hparams.filter_size = 256 + hparams.num_hidden_layers = 4 + hparams.sampling_method = "random" + hparams.local_and_global_att = True + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_8l_self_att_ffn(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l() + hparams.num_parts = 4 + hparams.ffn_layer = "self_attention_ffn" + hparams.share_kv = True + return hparams + + +@registry.register_hparams +def imagetransformer_sep_channels_8l_glu_ffn(): + """separate rgb embeddings.""" + hparams = imagetransformer_sep_channels_8l() + hparams.ffn_layer = "glu_ffn" + return hparams + + +@registry.register_hparams +def imagetransformer_bas8l_8h_big_uncond_dr03_imgnet(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_14l_8h_big_uncond_dr01() + # num_hidden_layers + hparams.num_decoder_layers = 8 + hparams.num_heads = 8 + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.layer_prepostprocess_dropout = 0.3 + return hparams + + +@registry.register_hparams +def imagetransformer_tiny(): + hparams = imagetransformer_base() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 64 + hparams.batch_size = 1 + return hparams + + +@registry.register_hparams +def imagetransformer_tiny_tpu(): + hparams = imagetransformer_base() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 16 + hparams.batch_size = 2 + hparams.num_heads = 2 + return hparams + + +@registry.register_hparams +def imagetransformer_base_10l_16h_big_dr01_moe_imgnet(): + """big 1d model for conditional image generation.""" + hparams = imagetransformer_base_10l_16h_big_dr01_imgnet() + hparams.initializer = "orthogonal" + hparams.learning_rate_warmup_steps = 16000 + hparams.add_hparam("moe_layers_decoder", "2,7") # Which layer is MoE. 
+ hparams.moe_hidden_sizes = "4096" # Hidden layer sizes (comma-separated). + hparams.moe_num_experts = 64 # Number of experts in each MoE layer. + hparams.moe_k = 4 # How many experts to use per batch element (try 2 or 4). + hparams.moe_loss_coef = 3e-2 # MoE loss coefficient (1e-2 is usually ok). + hparams.scheduled_sampling_prob = 0.1 + hparams.scheduled_sampling_warmup_steps = 200000 + return hparams + + +@registry.register_hparams +def imagetransformer_moe_tiny(): + """Set of hyperparameters for a very small imagetransformer with MoE.""" + hparams = imagetransformer_tiny() + hparams.hidden_size = 64 + hparams.batch_size = 1 + hparams.num_hidden_layers = 3 + hparams.dec_attention_type = cia.AttentionType.MOE_LOCAL_1D + hparams.add_hparam("moe_layers_decoder", "1") # Which layer is MoE. + hparams.moe_hidden_sizes = "1024" # Hidden layer sizes (comma-separated). + hparams.moe_num_experts = 16 # Number of experts in each MoE layer. + hparams.moe_k = 2 # How many experts to use per batch element (try 2 or 4). + hparams.moe_loss_coef = 1e-2 # MoE loss coefficient (1e-2 is usually ok). + return hparams + + +def update_hparams_for_tpu(hparams): + hparams.use_pad_remover = False # where op not supported + hparams.optimizer = "TrueAdam" + hparams.batch_size = 4 + + +@registry.register_hparams +def imagetransformer_sep_channels_8l_tpu(): + """Hparams for training imagetransformer on tpu.""" + hparams = imagetransformer_sep_channels_8l() + update_hparams_for_tpu(hparams) + hparams.batch_size = 1 + hparams.num_heads = 4 # heads are expensive on tpu + hparams.shared_embedding_and_softmax_weights = False + return hparams + + +@registry.register_hparams +def imagetransformer_bas8l_8h_big_uncond_dr03_imgnet_tpu(): + hparams = imagetransformer_bas8l_8h_big_uncond_dr03_imgnet() + update_hparams_for_tpu(hparams) + hparams.batch_size = 1 + hparams.num_heads = 8 # heads are expensive on tpu + return hparams diff --git a/tensor2tensor/models/image_transformer_2d.py b/tensor2tensor/models/image_transformer_2d.py new file mode 100644 index 000000000..5ab0d112b --- /dev/null +++ b/tensor2tensor/models/image_transformer_2d.py @@ -0,0 +1,517 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""image generation with transformer (attention). + +encoder: [Self-Attention, Feed-forward] x n +decoder: [Self-Attention, Source-Target-Attention, Feed-forward] x n + +""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import copy + +# Dependency imports + +from tensor2tensor.layers import common_hparams +from tensor2tensor.layers import common_image_attention as cia +from tensor2tensor.layers import common_layers +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +@registry.register_model +class Imagetransformer2d(t2t_model.T2TModel): + """Conditional image generation with attention. 
See file docstring.""" + + def body(self, features): + hparams = copy.copy(self._hparams) + inputs = features["inputs"] + targets = features["targets"] + targets_shape = common_layers.shape_list(targets) + if not (tf.get_variable_scope().reuse or + hparams.mode == tf.contrib.learn.ModeKeys.INFER): + tf.summary.image("targets", targets, max_outputs=1) + + decoder_input, rows, cols, bias = cia.prepare_decoder( + targets, hparams) + # Add class label to decoder input. + if not hparams.unconditional: + decoder_input += tf.reshape(inputs, + [targets_shape[0], 1, 1, hparams.hidden_size]) + + decoder_output = cia.transformer_decoder_layers( + decoder_input, None, bias, + hparams.num_decoder_layers, + hparams, + attention_type=hparams.dec_attention_type, + name="decoder") + + output = cia.create_output(decoder_output, rows, cols, targets, hparams) + return output + + +@registry.register_model +class Img2imgTransformer(t2t_model.T2TModel): + """Image 2 Image transformer net.""" + + def body(self, features): + hparams = copy.copy(self._hparams) + targets = features["targets"] + inputs = features["inputs"] + if not (tf.get_variable_scope().reuse or + hparams.mode == tf.contrib.learn.ModeKeys.INFER): + tf.summary.image("inputs", inputs, max_outputs=1) + tf.summary.image("targets", targets, max_outputs=1) + + encoder_input = cia.prepare_encoder(inputs, hparams) + encoder_output = cia.transformer_encoder_layers( + encoder_input, + hparams.num_encoder_layers, + hparams, + attention_type=hparams.enc_attention_type, + name="encoder") + decoder_input, rows, cols, bias = cia.prepare_decoder( + targets, hparams) + decoder_output = cia.transformer_decoder_layers( + decoder_input, + encoder_output, bias, + hparams.num_decoder_layers, + hparams, + attention_type=hparams.dec_attention_type, + name="decoder") + output = cia.create_output(decoder_output, rows, cols, targets, hparams) + return output + + +@registry.register_hparams +def image_transformer2d_base(): + """Set of hyperparameters.""" + hparams = common_hparams.basic_params1() + hparams.hidden_size = 512 + hparams.batch_size = 1 + hparams.max_length = 256 + hparams.dropout = 0.0 + hparams.clip_grad_norm = 0. # i.e. no gradient clipping + hparams.optimizer_adam_epsilon = 1e-9 + hparams.learning_rate_decay_scheme = "noam" + hparams.learning_rate = 0.1 + hparams.learning_rate_warmup_steps = 4000 + hparams.initializer_gain = 0.2 + hparams.initializer = "uniform_unit_scaling" + hparams.weight_decay = 0.0 + hparams.optimizer_adam_beta1 = 0.9 + hparams.optimizer_adam_beta2 = 0.98 + hparams.label_smoothing = 0.0 + hparams.target_modality = "image:identity" + hparams.norm_type = "layer" + hparams.layer_prepostprocess_dropout = 0.0 + hparams.add_hparam("filter_size", 512) # Add new ones like this. + + # attention-related flags + hparams.add_hparam("num_heads", 8) + hparams.add_hparam("attention_key_channels", 0) + hparams.add_hparam("attention_value_channels", 0) + hparams.add_hparam("ffn_layer", "conv_hidden_relu") + # All hyperparameters ending in "dropout" are automatically set to 0.0 + # when not in training mode. 
+ hparams.add_hparam("attention_dropout", 0.0) + hparams.add_hparam("relu_dropout", 0.0) + hparams.add_hparam("pos", "timing") # timing, none + hparams.add_hparam("nbr_decoder_problems", 1) + hparams.add_hparam("num_output_layers", 3) + hparams.add_hparam("block_size", 1) + + # image size related flags + # assuming that the image has same height and width + hparams.add_hparam("img_len", 32) + hparams.add_hparam("num_channels", 3) + # Local attention params + hparams.add_hparam("local_and_global_att", False) + hparams.add_hparam("block_length", 256) + hparams.add_hparam("block_width", 128) + # Local 2D attention params + hparams.add_hparam("query_shape", (16, 16)) + hparams.add_hparam("memory_flange", (16, 32)) + hparams.add_hparam("num_encoder_layers", 4) + hparams.add_hparam("num_decoder_layers", 8) + # attention type related params + hparams.add_hparam("enc_attention_type", cia.AttentionType.GLOBAL) + hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_2D) + hparams.add_hparam("block_rastor_scan", False) + + # multipos attention params + hparams.add_hparam("q_filter_width", 1) + hparams.add_hparam("kv_filter_width", 1) + + hparams.add_hparam("unconditional", False) # unconditional generation + return hparams + + +@registry.register_hparams +def imagetransformer2d_base(): + hparams = image_transformer2d_base() + hparams.dec_attention_type = cia.AttentionType.LOCAL_2D + hparams.block_rastor_scan = True + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_8l_8_16(): + hparams = image_transformer2d_base() + hparams.num_decoder_layers = 8 + hparams.batch_size = 1 + hparams.memory_flange = (8, 16) + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_8l_8_16_ls(): + hparams = image_transformer2d_base() + hparams.num_decoder_layers = 8 + hparams.label_smoothing = 0.05 + hparams.batch_size = 1 + hparams.memory_flange = (8, 16) + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_8l_8_16_big(): + hparams = image_transformer2d_base() + hparams.filter_size = 1024 + hparams.num_decoder_layers = 8 + hparams.batch_size = 1 + hparams.memory_flange = (8, 16) + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_12l_8_16_big(): + hparams = image_transformer2d_base() + hparams.filter_size = 1024 + hparams.num_decoder_layers = 12 + hparams.batch_size = 1 + hparams.memory_flange = (8, 16) + hparams.sampling_method = "random" + hparams.beam_size = 1 + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_8l_8_32_big(): + """hparams fo 8 layer big 2d model for cifar 10.""" + hparams = image_transformer2d_base() + hparams.num_heads = 16 + hparams.hidden_size = 1024 + hparams.filter_size = 2048 + hparams.num_decoder_layers = 8 + hparams.batch_size = 1 + hparams.layer_prepostprocess_dropout = 0.3 + hparams.query_shape = (8, 16) + hparams.memory_flange = (0, 32) + hparams.unconditional = int(False) + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_8l_8_64_64by64(): + """hparams fo 12 layer big 2d model for imagenet 64x64.""" + hparams = image_transformer2d_base() + hparams.num_heads = 8 + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.num_decoder_layers = 8 + hparams.batch_size = 1 + hparams.layer_prepostprocess_dropout = 0.1 + hparams.query_shape = (8, 64) + hparams.memory_flange = (4, 32) + hparams.unconditional = int(False) + hparams.max_length = 14000 + return hparams + + +@registry.register_hparams +def 
imagetransformer2d_base_12l_8_64_64by64(): + """hparams fo 12 layer big 2d model for imagenet 64x64.""" + hparams = image_transformer2d_base() + hparams.num_heads = 8 + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.num_decoder_layers = 12 + hparams.batch_size = 1 + hparams.layer_prepostprocess_dropout = 0.1 + hparams.query_shape = (8, 64) + hparams.memory_flange = (4, 32) + hparams.unconditional = int(False) + hparams.max_length = 14000 + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_14l_8_16_big(): + hparams = image_transformer2d_base() + hparams.filter_size = 1024 + hparams.num_decoder_layers = 14 + hparams.batch_size = 1 + hparams.memory_flange = (8, 16) + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_14l_8_16_big_uncond(): + hparams = imagetransformer2d_base_14l_8_16_big() + hparams.unconditional = True + return hparams + + +@registry.register_hparams +def imagetransformer2d_base_8l_8_16_big_16k(): + hparams = image_transformer2d_base() + hparams.filter_size = 1024 + hparams.num_decoder_layers = 8 + hparams.batch_size = 1 + hparams.memory_flange = (8, 16) + hparams.learning_rate_warmup_steps = 16000 + return hparams + + +@registry.register_hparams +def img2img_transformer2d_base(): + """Base params for img2img 2d attention.""" + hparams = image_transformer2d_base() + # learning related flags + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + # This version seems to benefit from a higher learning rate. + hparams.learning_rate = 0.2 + hparams.layer_prepostprocess_dropout = 0.1 + hparams.learning_rate_warmup_steps = 12000 + hparams.filter_size = 2048 + hparams.num_encoder_layers = 4 + hparams.num_decoder_layers = 8 + hparams.dec_attention_type = cia.AttentionType.LOCAL_2D + hparams.block_rastor_scan = True + return hparams + + +@registry.register_hparams +def img2img_transformer2d_q1(): + hparams = img2img_transformer2d_base() + hparams.batch_size = 2 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.query_shape = (16, 16) + hparams.memory_flange = (16, 64) + return hparams + + +@registry.register_hparams +def img2img_transformer2d_q2(): + hparams = img2img_transformer2d_q1() + hparams.batch_size = 2 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.query_shape = (16, 16) + hparams.memory_flange = (16, 32) + return hparams + + +@registry.register_hparams +def img2img_transformer2d_q3(): + """Current best hparams for local 2d.""" + hparams = img2img_transformer2d_q1() + hparams.batch_size = 2 + hparams.query_shape = (8, 16) + hparams.memory_flange = (8, 32) + return hparams + + +@registry.register_hparams +def img2img_transformer_base(): + """Base params for local1d attention.""" + hparams = image_transformer2d_base() + # learning related flags + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + # This version seems to benefit from a higher learning rate. 
+ hparams.learning_rate = 0.2 + hparams.layer_prepostprocess_dropout = 0.1 + hparams.learning_rate_warmup_steps = 12000 + hparams.filter_size = 2048 + hparams.num_encoder_layers = 4 + hparams.num_decoder_layers = 8 + hparams.block_length = 256 + hparams.block_width = 256 + hparams.dec_attention_type = cia.AttentionType.LOCAL_1D + hparams.block_rastor_scan = False + return hparams + + +@registry.register_hparams +def img2img_transformer_b1(): + hparams = img2img_transformer_base() + hparams.batch_size = 2 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.block_length = 512 + return hparams + + +@registry.register_hparams +def img2img_transformer_b2(): + hparams = img2img_transformer_base() + hparams.batch_size = 2 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.block_length = 256 + return hparams + + +@registry.register_hparams +def img2img_transformer_b3(): + """Current best hparams for local 1d.""" + hparams = img2img_transformer_base() + hparams.batch_size = 2 + hparams.layer_preprocess_sequence = "none" + hparams.layer_postprocess_sequence = "dan" + hparams.block_length = 128 + hparams.sampling_temp = 0.9 + return hparams + + +@registry.register_hparams +def img2img_transformer_dilated(): + """Try dilated.""" + hparams = img2img_transformer_base() + hparams.add_hparam("num_memory_blocks", 1) + hparams.num_heads = 8 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.num_decoder_layers = 8 + hparams.sampling_method = "random" + hparams.gap_sizes = [0, 16, 64, 0, 16, 64, 128, 0] + hparams.dec_attention_type = cia.AttentionType.DILATED + hparams.img_len = 64 + hparams.block_length = 128 + hparams.block_width = 128 + return hparams + + +@registry.register_hparams +def imagetransformer2d_tiny(): + hparams = imagetransformer2d_base() + hparams.num_decoder_layers = 2 + hparams.hidden_size = 64 + hparams.batch_size = 1 + return hparams + + +@registry.register_hparams +def img2img_transformer2d_n3(): + hparams = img2img_transformer2d_base() + hparams.batch_size = 1 + hparams.num_encoder_layers = 4 + hparams.num_decoder_layers = 12 + hparams.query_shape = (16, 32) + hparams.memory_flange = (16, 16) + hparams.layer_prepostprocess_dropout = 0.0 + return hparams + + +@registry.register_hparams +def img2img_transformer2d_n31(): + """Set of hyperparameters.""" + hparams = img2img_transformer2d_base() + hparams.batch_size = 1 + hparams.num_encoder_layers = 6 + hparams.num_decoder_layers = 12 + hparams.num_heads = 8 + hparams.query_shape = (16, 32) + hparams.memory_flange = (16, 32) + return hparams + + +@registry.register_hparams +def img2img_transformer2d_n24(): + hparams = img2img_transformer2d_base() + hparams.batch_size = 1 + hparams.hidden_size = 1024 + hparams.filter_size = 2048 + hparams.layer_prepostprocess_dropout = 0.2 + hparams.num_decoder_layers = 8 + hparams.query_shape = (8, 16) + hparams.memory_flange = (8, 32) + return hparams + + +@registry.register_hparams +def img2img_transformer2d_n44(): + hparams = img2img_transformer2d_base() + hparams.batch_size = 1 + hparams.num_decoder_layers = 8 + hparams.query_shape = (8, 16) + hparams.memory_flange = (8, 32) + hparams.layer_prepostprocess_dropout = 0.1 + return hparams + + +@registry.register_hparams +def img2img_transformer2d_n103(): + """Best config for img2img.""" + hparams = img2img_transformer2d_base() + hparams.batch_size = 1 + 
hparams.num_decoder_layers = 12 + hparams.num_encoder_layers = 6 + hparams.query_shape = (8, 32) + hparams.memory_flange = (8, 64) + hparams.layer_prepostprocess_dropout = 0.1 + return hparams + + +@registry.register_hparams +def img2img_transformer2d_tiny(): + """Tiny params.""" + hparams = img2img_transformer2d_base() + hparams.num_decoder_layers = 2 + hparams.hidden_size = 128 + hparams.batch_size = 4 + hparams.max_length = 128 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.filter_size = 128 + hparams.num_heads = 4 + hparams.pos = "timing" + hparams.img_len = 32 + return hparams + + +@registry.register_hparams +def img2img_transformer_tiny(): + """Tiny params.""" + hparams = img2img_transformer2d_base() + hparams.num_hidden_layers = 2 + hparams.hidden_size = 128 + hparams.batch_size = 4 + hparams.max_length = 128 + hparams.attention_key_channels = hparams.attention_value_channels = 0 + hparams.filter_size = 128 + hparams.num_heads = 1 + hparams.pos = "timing" + return hparams diff --git a/tensor2tensor/models/image_transformer_2d_test.py b/tensor2tensor/models/image_transformer_2d_test.py new file mode 100644 index 000000000..4098792a4 --- /dev/null +++ b/tensor2tensor/models/image_transformer_2d_test.py @@ -0,0 +1,88 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Transformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import celeba # pylint: disable=unused-import +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import image_transformer_2d +from tensor2tensor.utils import registry + +import tensorflow as tf + + +class Img2imgTransformerTest(tf.test.TestCase): + + def _testImg2imgTransformer(self, net): + batch_size = 3 + hparams = image_transformer_2d.img2img_transformer2d_tiny() + hparams.data_dir = "" + p_hparams = registry.problem("image_celeba").get_hparams(hparams) + inputs = np.random.random_integers(0, high=255, size=(3, 4, 4, 3)) + targets = np.random.random_integers(0, high=255, size=(3, 8, 8, 3)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = net(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, 8, 8, 3, 256)) + + def testImg2imgTransformer(self): + self._testImg2imgTransformer(image_transformer_2d.Img2imgTransformer) + + +class Imagetransformer2dTest(tf.test.TestCase): + + def _testImagetransformer2d(self, net): + batch_size = 3 + size = 7 + vocab_size = 256 + hparams = image_transformer_2d.imagetransformer2d_tiny() + p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) + inputs = -1 + np.random.random_integers( + vocab_size, size=(batch_size, 1, 1, 1)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, size, size, 3)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = net(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, size, size, 3, vocab_size)) + + def testImagetransformer2d(self): + self._testImagetransformer2d(image_transformer_2d.Imagetransformer2d) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/image_transformer_test.py b/tensor2tensor/models/image_transformer_test.py new file mode 100644 index 000000000..a997a6bc5 --- /dev/null +++ b/tensor2tensor/models/image_transformer_test.py @@ -0,0 +1,61 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Tests for Transformer.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np + +from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.models import image_transformer + +import tensorflow as tf + + +class ImagetransformerTest(tf.test.TestCase): + + def _testImagetransformer(self, net): + batch_size = 3 + size = 7 + vocab_size = 256 + hparams = image_transformer.imagetransformer_base() + p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) + inputs = -1 + np.random.random_integers( + vocab_size, size=(batch_size, 1, 1, 1)) + targets = -1 + np.random.random_integers( + vocab_size, size=(batch_size, size, size, 3)) + with self.test_session() as session: + features = { + "inputs": tf.constant(inputs, dtype=tf.int32), + "targets": tf.constant(targets, dtype=tf.int32), + "target_space_id": tf.constant(1, dtype=tf.int32), + } + model = net(hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (batch_size, size, size, 3, vocab_size)) + + def testImagetransformer(self): + self._testImagetransformer(image_transformer.Imagetransformer) + + +if __name__ == "__main__": + tf.test.main() diff --git a/tensor2tensor/models/lstm.py b/tensor2tensor/models/lstm.py index dd6f144ba..9c6144e9f 100644 --- a/tensor2tensor/models/lstm.py +++ b/tensor2tensor/models/lstm.py @@ -19,6 +19,8 @@ from __future__ import division from __future__ import print_function +import copy + # Dependency imports from tensor2tensor.layers import common_hparams @@ -131,6 +133,116 @@ def lstm_seq2seq_internal_attention(inputs, targets, hparams, train): return tf.expand_dims(decoder_outputs, axis=2) +def lstm_bid_encoder(inputs, hparams, train, name): + """Bidirectional LSTM for encoding inputs that are [batch x time x size].""" + + def dropout_lstm_cell(): + return tf.contrib.rnn.DropoutWrapper( + tf.contrib.rnn.BasicLSTMCell(hparams.hidden_size), + input_keep_prob=1.0 - hparams.dropout * tf.to_float(train)) + + with tf.variable_scope(name): + cell_fw = tf.contrib.rnn.MultiRNNCell( + [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)]) + + cell_bw = tf.contrib.rnn.MultiRNNCell( + [dropout_lstm_cell() for _ in range(hparams.num_hidden_layers)]) + + ((encoder_fw_outputs, encoder_bw_outputs), + (encoder_fw_state, encoder_bw_state)) = tf.nn.bidirectional_dynamic_rnn( + cell_fw=cell_fw, + cell_bw=cell_bw, + inputs=inputs, + dtype=tf.float32, + time_major=False) + + encoder_outputs = tf.concat((encoder_fw_outputs, encoder_bw_outputs), 2) + encoder_states = [] + + for i in range(hparams.num_hidden_layers): + if isinstance(encoder_fw_state[i], tf.contrib.rnn.LSTMStateTuple): + encoder_state_c = tf.concat( + values=(encoder_fw_state[i].c, encoder_bw_state[i].c), + axis=1, + name="encoder_fw_state_c") + encoder_state_h = tf.concat( + values=(encoder_fw_state[i].h, encoder_bw_state[i].h), + axis=1, + name="encoder_fw_state_h") + encoder_state = tf.contrib.rnn.LSTMStateTuple( + c=encoder_state_c, h=encoder_state_h) + elif isinstance(encoder_fw_state[i], tf.Tensor): + encoder_state = tf.concat( + values=(encoder_fw_state[i], encoder_bw_state[i]), + axis=1, + name="bidirectional_concat") + + encoder_states.append(encoder_state) + + encoder_states = tuple(encoder_states) + return encoder_outputs, encoder_states + + +def 
lstm_seq2seq_internal_bid_encoder(inputs, targets, hparams, train): + """The basic LSTM seq2seq model with bidirectional encoder.""" + with tf.variable_scope("lstm_seq2seq_bid_encoder"): + if inputs is not None: + # Flatten inputs. + inputs = common_layers.flatten4d3d(inputs) + # LSTM encoder. + _, final_encoder_state = lstm_bid_encoder( + tf.reverse(inputs, axis=[1]), hparams, train, "encoder") + else: + final_encoder_state = None + # LSTM decoder. + shifted_targets = common_layers.shift_right(targets) + hparams_decoder = copy.copy(hparams) + hparams_decoder.hidden_size = 2 * hparams.hidden_size + decoder_outputs, _ = lstm( + common_layers.flatten4d3d(shifted_targets), + hparams_decoder, + train, + "decoder", + initial_state=final_encoder_state) + return tf.expand_dims(decoder_outputs, axis=2) + + +def lstm_seq2seq_internal_attention_bid_encoder(inputs, targets, hparams, + train): + """LSTM seq2seq model with attention, main step used for training.""" + with tf.variable_scope("lstm_seq2seq_attention_bid_encoder"): + # Flatten inputs. + inputs = common_layers.flatten4d3d(inputs) + # LSTM encoder. + encoder_outputs, final_encoder_state = lstm_bid_encoder( + tf.reverse(inputs, axis=[1]), hparams, train, "encoder") + # LSTM decoder with attention + shifted_targets = common_layers.shift_right(targets) + hparams_decoder = copy.copy(hparams) + hparams_decoder.hidden_size = 2 * hparams.hidden_size + decoder_outputs, _ = lstm_attention_decoder( + common_layers.flatten4d3d(shifted_targets), hparams_decoder, train, + "decoder", final_encoder_state, encoder_outputs) + return tf.expand_dims(decoder_outputs, axis=2) + + +@registry.register_model +class LSTMEncoder(t2t_model.T2TModel): + """LSTM encoder only.""" + + def body(self, features): + if self._hparams.initializer == "orthogonal": + raise ValueError("LSTM models fail with orthogonal initializer.") + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN + inputs = features.get("inputs") + # Flatten inputs. + inputs = common_layers.flatten4d3d(inputs) + # LSTM encoder. + encoder_output, _ = lstm( + tf.reverse(inputs, axis=[1]), self._hparams, train, "encoder") + return tf.expand_dims(encoder_output, axis=2) + + @registry.register_model class LSTMSeq2seq(t2t_model.T2TModel): @@ -155,6 +267,30 @@ def body(self, features): features.get("inputs"), features["targets"], self._hparams, train) +@registry.register_model +class LSTMSeq2seqBidirectionalEncoder(t2t_model.T2TModel): + + def body(self, features): + # TODO(lukaszkaiser): investigate this issue and repair. + if self._hparams.initializer == "orthogonal": + raise ValueError("LSTM models fail with orthogonal initializer.") + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN + return lstm_seq2seq_internal_bid_encoder( + features.get("inputs"), features["targets"], self._hparams, train) + + +@registry.register_model +class LSTMSeq2seqAttentionBidirectionalEncoder(t2t_model.T2TModel): + + def body(self, features): + # TODO(lukaszkaiser): investigate this issue and repair. 
+ if self._hparams.initializer == "orthogonal": + raise ValueError("LSTM models fail with orthogonal initializer.") + train = self._hparams.mode == tf.estimator.ModeKeys.TRAIN + return lstm_seq2seq_internal_attention_bid_encoder( + features.get("inputs"), features["targets"], self._hparams, train) + + @registry.register_hparams def lstm_seq2seq(): """hparams for LSTM.""" diff --git a/tensor2tensor/models/lstm_test.py b/tensor2tensor/models/lstm_test.py index 4e35d34cb..c392f23fd 100644 --- a/tensor2tensor/models/lstm_test.py +++ b/tensor2tensor/models/lstm_test.py @@ -71,6 +71,46 @@ def testLSTMSeq2SeqAttention(self): res = session.run(logits) self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size)) + def testLSTMSeq2seqBidirectionalEncoder(self): + vocab_size = 9 + x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1)) + y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1)) + hparams = lstm.lstm_seq2seq() + p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) + with self.test_session() as session: + features = { + "inputs": tf.constant(x, dtype=tf.int32), + "targets": tf.constant(y, dtype=tf.int32), + } + model = lstm.LSTMSeq2seqBidirectionalEncoder( + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size)) + + def testLSTMSeq2seqAttentionBidirectionalEncoder(self): + vocab_size = 9 + x = np.random.random_integers(1, high=vocab_size - 1, size=(3, 5, 1, 1)) + y = np.random.random_integers(1, high=vocab_size - 1, size=(3, 6, 1, 1)) + hparams = lstm.lstm_attention() + + p_hparams = problem_hparams.test_problem_hparams(vocab_size, vocab_size) + x = tf.constant(x, dtype=tf.int32) + x = tf.placeholder_with_default(x, shape=[None, None, 1, 1]) + + with self.test_session() as session: + features = { + "inputs": x, + "targets": tf.constant(y, dtype=tf.int32), + } + model = lstm.LSTMSeq2seqAttentionBidirectionalEncoder( + hparams, tf.estimator.ModeKeys.TRAIN, p_hparams) + logits, _ = model(features) + session.run(tf.global_variables_initializer()) + res = session.run(logits) + self.assertEqual(res.shape, (3, 6, 1, 1, vocab_size)) + if __name__ == "__main__": tf.test.main() diff --git a/tensor2tensor/models/research/attention_lm.py b/tensor2tensor/models/research/attention_lm.py index bb847c1ec..30277d6f3 100644 --- a/tensor2tensor/models/research/attention_lm.py +++ b/tensor2tensor/models/research/attention_lm.py @@ -74,7 +74,7 @@ def attention_lm_prepare_decoder(targets, hparams): """ if hparams.prepend_mode == "prepend_inputs_full_attention": decoder_self_attention_bias = ( - common_attention.attention_bias_prepended( + common_attention.attention_bias_prepend_inputs_full_attention( common_attention.embedding_to_padding(targets))) else: decoder_self_attention_bias = ( diff --git a/tensor2tensor/models/research/attention_lm_moe.py b/tensor2tensor/models/research/attention_lm_moe.py index c931a4dbe..ea65496cb 100644 --- a/tensor2tensor/models/research/attention_lm_moe.py +++ b/tensor2tensor/models/research/attention_lm_moe.py @@ -380,7 +380,8 @@ def attention_lm_moe_prepare_decoder(targets, hparams): if hparams.prepend_mode == "prepend_inputs_full_attention": decoder_self_attention_bias = ( - common_attention.attention_bias_prepended(targets_pad_mask)) + common_attention.attention_bias_prepend_inputs_full_attention( + targets_pad_mask)) else: decoder_self_attention_bias = ( 
common_attention.attention_bias_lower_triangle(tf.shape(targets)[1])) diff --git a/tensor2tensor/models/research/rl.py b/tensor2tensor/models/research/rl.py index defb1d822..7433026b0 100644 --- a/tensor2tensor/models/research/rl.py +++ b/tensor2tensor/models/research/rl.py @@ -136,20 +136,25 @@ def feed_forward_categorical_fun(action_space, config, observations): def feed_forward_cnn_small_categorical_fun(action_space, config, observations): """Small cnn network with categorical output.""" + del config + obs_shape = observations.shape.as_list() - x = tf.reshape(observations, [-1]+ obs_shape[2:]) + x = tf.reshape(observations, [-1] + obs_shape[2:]) - with tf.variable_scope('policy'): - x = tf.to_float(x)/255.0 - x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2], activation_fn= tf.nn.relu, padding="SAME") - x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2], activation_fn=tf.nn.relu, padding="SAME") + with tf.variable_scope("policy"): + x = tf.to_float(x) / 255.0 + x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2], + activation_fn=tf.nn.relu, padding="SAME") + x = tf.contrib.layers.conv2d(x, 32, [5, 5], [2, 2], + activation_fn=tf.nn.relu, padding="SAME") - flat_x = tf.reshape(x, [ - tf.shape(observations)[0], tf.shape(observations)[1], - functools.reduce(operator.mul, x.shape.as_list()[1:], 1)]) + flat_x = tf.reshape( + x, [tf.shape(observations)[0], tf.shape(observations)[1], + functools.reduce(operator.mul, x.shape.as_list()[1:], 1)]) x = tf.contrib.layers.fully_connected(flat_x, 128, tf.nn.relu) - logits = tf.contrib.layers.fully_connected(x, action_space.n, activation_fn=None) + logits = tf.contrib.layers.fully_connected(x, action_space.n, + activation_fn=None) value = tf.contrib.layers.fully_connected(x, 1, activation_fn=None)[..., 0] policy = tf.contrib.distributions.Categorical(logits=logits) diff --git a/tensor2tensor/models/research/transformer_vae.py b/tensor2tensor/models/research/transformer_vae.py index ba66fc797..6ad4e19a5 100644 --- a/tensor2tensor/models/research/transformer_vae.py +++ b/tensor2tensor/models/research/transformer_vae.py @@ -670,7 +670,8 @@ def ae_transformer_internal(inputs, if hparams.mode != tf.estimator.ModeKeys.PREDICT: # Compress and bottleneck. 
latents_dense, latents_discrete, extra_loss, embed = bottleneck( - targets_c, hparams, 2 * 2048, "vc", means, ema_count, ema_means) + targets_c, hparams, + hparams.compress_filter_size, "vc", means, ema_count, ema_means) if _DO_SUMMARIES: tf.summary.histogram("b0", tf.reshape(latents_discrete[:, 0, :], [-1])) pc = common_layers.inverse_exp_decay(hparams.startup_steps) @@ -695,7 +696,8 @@ def ae_transformer_internal(inputs, losses["latent_pred"] = tf.reduce_mean((inputs_c - targets_c)**2) * 20 def bn_inputs(): with tf.variable_scope(tf.get_variable_scope(), reuse=True): - bn, _, _, _ = bottleneck(inputs_c, hparams, 2 * 2048, "vc", means, + bn, _, _, _ = bottleneck(inputs_c, hparams, + hparams.compress_filter_size, "vc", means, ema_count, ema_means) return bn pbn = 0.8 if hparams.mode == tf.estimator.ModeKeys.TRAIN else 1.0 @@ -708,11 +710,13 @@ def bn_inputs(): else: if hparams.bottleneck_kind in ["dense", "vae"]: inputs_c = decode_transformer(inputs, ed, targets_c, hparams, "dec_c") - latents_dense, _, _, _ = bottleneck(inputs_c, hparams, 2 * 2048, "vc", - means, ema_count, ema_means) + latents_dense, _, _, _ = bottleneck( + inputs_c, hparams, hparams.compress_filter_size, "vc", + means, ema_count, ema_means) else: latent_len = common_layers.shape_list(targets_c)[1] - _, _, _, embed = bottleneck(targets_c, hparams, 2 * 2048, "vc", means, + _, _, _, embed = bottleneck(targets_c, hparams, + hparams.compress_filter_size, "vc", means, ema_count, ema_means) latents_dense = tf.zeros_like(targets_c[:, :latent_len, :, :]) if cache is None: @@ -806,7 +810,7 @@ def __init__(self, *args, **kwargs): self._hparams.num_blocks, self._hparams.hidden_size, self._hparams.block_dim ], - initializer=tf.random_normal_initializer(), + initializer=tf.contrib.layers.xavier_initializer(), trainable=self._hparams.trainable_projections) self._hparams.reshape_fn = project_hidden elif self._hparams.reshape_method == "slice": @@ -922,6 +926,7 @@ def transformer_ae_small(): hparams.num_hidden_layers = 3 hparams.hidden_size = 384 hparams.filter_size = 2048 + hparams.add_hparam("compress_filter_size", 2048 * 2) hparams.label_smoothing = 0.0 hparams.optimizer = "Adam" # Can be unstable, maybe try Adam. 
hparams.optimizer_adam_epsilon = 1e-9 @@ -953,7 +958,6 @@ def transformer_ae_small(): hparams.add_hparam("kmeans_lr_factor", 0.002) hparams.add_hparam("z_dropout", 0.1) hparams.add_hparam("is_2d", 0) - hparams.add_hparam("use_gumbel_softmax", True) hparams.add_hparam("softmax_k", 0) hparams.add_hparam("decode_autoregressive", True) hparams.add_hparam("do_vae", True) @@ -1052,3 +1056,33 @@ def transformer_ae_base(): hparams.filter_size = 4096 hparams.num_hidden_layers = 6 return hparams + + +@registry.register_hparams +def transformer_ae_a3(): + """Set of hyperparameters.""" + hparams = transformer_ae_base() + hparams.batch_size = 4096 + hparams.layer_prepostprocess_dropout = 0.3 + hparams.optimizer = "Adafactor" + hparams.learning_rate = 0.25 + hparams.learning_rate_warmup_steps = 10000 + return hparams + + +@registry.register_hparams +def transformer_ae_a6(): + """Best hparams for transformer with semhash.""" + hparams = transformer_ae_a3() + hparams.optimizer = "Adam" + hparams.noise_dev = 0.5 + return hparams + + +@registry.register_hparams +def transformer_ae_a8(): + """Set of hyperparameters.""" + hparams = transformer_ae_a3() + hparams.optimizer = "Adafactor" + hparams.noise_dev = 0.5 + return hparams diff --git a/tensor2tensor/models/revnet.py b/tensor2tensor/models/revnet.py index 66e6a2abf..63ae19717 100644 --- a/tensor2tensor/models/revnet.py +++ b/tensor2tensor/models/revnet.py @@ -390,11 +390,10 @@ def revnet_cifar_base(): hparams.init_maxpool = False hparams.strides = [1, 2, 2] hparams.batch_size = 128 - hparams.weight_decay = 5e-3 + hparams.weight_decay = 1e-4 hparams.learning_rate = 0.1 - hparams.learning_rate_boundaries = [2000, 4000, 6000, 8000] - hparams.learning_rate_multiples = [0.1, 0.01, 0.001, 0.0001] + hparams.learning_rate_cosine_cycle_steps = 5000 return hparams @@ -404,6 +403,8 @@ def revnet_38_cifar(): hparams.bottleneck = False hparams.num_channels = [16, 32, 56] hparams.num_layers_per_block = [2, 2, 2] + hparams.initializer = 'normal_unit_scaling' + hparams.initializer_gain = 1.5 return hparams diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index 05795323e..11d446f5b 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -31,6 +31,7 @@ from six.moves import xrange # pylint: disable=redefined-builtin +from tensor2tensor.data_generators import librispeech from tensor2tensor.layers import common_attention from tensor2tensor.layers import common_hparams from tensor2tensor.layers import common_layers @@ -603,7 +604,7 @@ def transformer_prepare_decoder(targets, hparams, features=None): """ if hparams.prepend_mode == "prepend_inputs_full_attention": decoder_self_attention_bias = ( - common_attention.attention_bias_prepended( + common_attention.attention_bias_prepend_inputs_full_attention( common_attention.embedding_to_padding(targets))) else: decoder_self_attention_bias = ( @@ -1423,3 +1424,32 @@ def transformer_lm_tpu_1(): hparams.hidden_size = 2048 hparams.filter_size = 8192 return hparams + + +@registry.register_hparams +def transformer_librispeech(): + """Hparams for training ASR model on Librispeech.""" + hparams = transformer_base() + + hparams.num_heads = 4 + hparams.filter_size = 1024 + hparams.hidden_size = 256 + hparams.num_encoder_layers = 5 + hparams.num_decoder_layers = 3 + hparams.learning_rate = 0.15 + hparams.batch_size = 6000000 + + librispeech.set_librispeech_length_hparams(hparams) + return hparams + + +@registry.register_hparams +def transformer_librispeech_tpu(): + 
"""Hparams for training ASR model on Librispeech on TPU.""" + hparams = transformer_librispeech() + update_hparams_for_tpu(hparams) + + hparams.batch_size = 32 + librispeech.set_librispeech_length_hparams(hparams) + return hparams + diff --git a/tensor2tensor/rl/collect.py b/tensor2tensor/rl/collect.py index 354c40065..8e81dafa7 100644 --- a/tensor2tensor/rl/collect.py +++ b/tensor2tensor/rl/collect.py @@ -38,11 +38,13 @@ def define_collect(policy_factory, batch_env, hparams, eval_phase): trainable=False) should_reset_var = tf.Variable(True, trainable=False) + + def group(): + return tf.group(batch_env.reset(tf.range(len(batch_env))), + tf.assign(cumulative_rewards, tf.zeros(len(batch_env)))) reset_op = tf.cond( - tf.logical_or(should_reset_var, eval_phase), - lambda: tf.group(batch_env.reset(tf.range(len(batch_env))), - tf.assign(cumulative_rewards, tf.zeros(len(batch_env)))), - lambda: tf.no_op()) + tf.logical_or(should_reset_var, eval_phase), group, tf.no_op) + with tf.control_dependencies([reset_op]): reset_once_op = tf.assign(should_reset_var, False) @@ -50,7 +52,7 @@ def define_collect(policy_factory, batch_env, hparams, eval_phase): def step(index, scores_sum, scores_num): """Single step.""" - index = index % hparams.epoch_length # Only needed in eval runs. + index %= hparams.epoch_length # Only needed in eval runs. # Note - the only way to ensure making a copy of tensor is to run simple # operation. We are waiting for tf.copy: # https://github.com/tensorflow/tensorflow/issues/11186 @@ -88,9 +90,9 @@ def step(index, scores_sum, scores_num): scores_num + scores_num_delta] def stop_condition(i, _, resets): - return tf.cond(eval_phase, - lambda: resets < hparams.num_eval_agents, - lambda: i < hparams.epoch_length) + return tf.cond(eval_phase, + lambda: resets < hparams.num_eval_agents, + lambda: i < hparams.epoch_length) init = [tf.constant(0), tf.constant(0.0), tf.constant(0)] index, scores_sum, scores_num = tf.while_loop( diff --git a/tensor2tensor/rl/envs/utils.py b/tensor2tensor/rl/envs/utils.py index 68bff6229..59732fed0 100644 --- a/tensor2tensor/rl/envs/utils.py +++ b/tensor2tensor/rl/envs/utils.py @@ -20,7 +20,6 @@ # https://github.com/tensorflow/agents/blob/master/agents/scripts/utility.py import atexit -import gym import multiprocessing import os import random @@ -31,14 +30,15 @@ # Dependency imports +import gym + from tensor2tensor.rl.envs import batch_env from tensor2tensor.rl.envs import in_graph_batch_env import tensorflow as tf class EvalVideoWrapper(gym.Wrapper): - """ - Wrapper for recording videos during eval phase. + """Wrapper for recording videos during eval phase. This wrapper is designed to record videos via gym.wrappers.Monitor and simplifying its usage in t2t collect phase. @@ -52,6 +52,7 @@ class EvalVideoWrapper(gym.Wrapper): returns last seen observation. Videos are only generated during the active runs. """ + def __init__(self, env): super(EvalVideoWrapper, self).__init__(env) self._reset_counter = 0 @@ -61,7 +62,7 @@ def __init__(self, env): def _step(self, action): if self._active: self._last_returned = self.env.step(action) - if self._last_returned == None: + if self._last_returned is None: raise Exception("Environment stepped before proper reset.") return self._last_returned @@ -99,6 +100,7 @@ def __init__(self, constructor, xvfb): Args: constructor: Callable that creates and returns an OpenAI gym environment. + xvfb: Frame buffer. Attributes: observation_space: The cached observation space of the environment. 
@@ -109,25 +111,25 @@ def __init__(self, constructor, xvfb): server_id = random.randint(10000, 99999) auth_file_id = random.randint(10000, 99999999999) - xauthority_path = '/tmp/Xauthority_{}'.format(auth_file_id) + xauthority_path = "/tmp/Xauthority_{}".format(auth_file_id) - command = 'Xvfb :{} -screen 0 1400x900x24 -nolisten tcp -auth {}'.format( - server_id, xauthority_path) - with open(os.devnull, 'w') as devnull: + command = "Xvfb :{} -screen 0 1400x900x24 -nolisten tcp -auth {}".format( + server_id, xauthority_path) + with open(os.devnull, "w") as devnull: proc = subprocess.Popen(command.split(), shell=False, stdout=devnull, stderr=devnull) atexit.register(lambda: os.kill(proc.pid, signal.SIGKILL)) def constructor_using_xvfb(): - os.environ["DISPLAY"] = ":{}".format(server_id) - os.environ["XAUTHORITY"] = xauthority_path - return constructor() + os.environ["DISPLAY"] = ":{}".format(server_id) + os.environ["XAUTHORITY"] = xauthority_path + return constructor() self._process = multiprocessing.Process( target=self._worker, args=(constructor_using_xvfb, conn)) else: self._process = multiprocessing.Process( - target=self._worker, args=(constructor, conn)) + target=self._worker, args=(constructor, conn)) atexit.register(self.close) self._process.start() @@ -137,13 +139,13 @@ def constructor_using_xvfb(): @property def observation_space(self): if not self._observ_space: - self._observ_space = self.__getattr__('observation_space') + self._observ_space = self.__getattr__("observation_space") return self._observ_space @property def action_space(self): if not self._action_space: - self._action_space = self.__getattr__('action_space') + self._action_space = self.__getattr__("action_space") return self._action_space def __getattr__(self, name): @@ -197,7 +199,7 @@ def step(self, action, blocking=True): Transition tuple when blocking, otherwise callable that returns the transition tuple. """ - promise = self.call('step', action) + promise = self.call("step", action) if blocking: return promise() else: @@ -213,7 +215,7 @@ def reset(self, blocking=True): New observation when blocking, otherwise callable that returns the new observation. """ - promise = self.call('reset') + promise = self.call("reset") if blocking: return promise() else: @@ -236,7 +238,7 @@ def _receive(self): raise Exception(stacktrace) if message == self._RESULT: return payload - raise KeyError('Received message of unexpected type {}'.format(message)) + raise KeyError("Received message of unexpected type {}".format(message)) def _worker(self, constructor, conn): """The process waits for actions and sends back environment results. @@ -268,10 +270,10 @@ def _worker(self, constructor, conn): if message == self._CLOSE: assert payload is None break - raise KeyError('Received message of unknown type {}'.format(message)) + raise KeyError("Received message of unknown type {}".format(message)) except Exception: # pylint: disable=broad-except - stacktrace = ''.join(traceback.format_exception(*sys.exc_info())) - tf.logging.error('Error in environment process: {}'.format(stacktrace)) + stacktrace = "".join(traceback.format_exception(*sys.exc_info())) + tf.logging.error("Error in environment process: {}".format(stacktrace)) conn.send((self._EXCEPTION, stacktrace)) conn.close() @@ -282,12 +284,13 @@ def define_batch_env(constructor, num_agents, xvfb=False, env_processes=True): Args: constructor: Constructor of an OpenAI gym environment. num_agents: Number of environments to combine in the batch. + xvfb: Frame buffer. 
env_processes: Whether to step environment in external processes. Returns: In-graph environments object. """ - with tf.variable_scope('environments'): + with tf.variable_scope("environments"): if env_processes: envs = [ ExternalProcessEnv(constructor, xvfb) diff --git a/tensor2tensor/rl/ppo.py b/tensor2tensor/rl/ppo.py index 124ec4e49..706e3c6b4 100644 --- a/tensor2tensor/rl/ppo.py +++ b/tensor2tensor/rl/ppo.py @@ -21,21 +21,19 @@ import tensorflow as tf -def get_optimiser(config): - - if config.optimizer=='Adam': +def get_optimizer(config): + if config.optimizer == "Adam": return tf.train.AdamOptimizer(config.learning_rate) - return config.optimizer(config.learning_rate) def define_ppo_step(observation, action, reward, done, value, old_pdf, policy_factory, config): - + """Step of PPO.""" new_policy_dist, new_value, _ = policy_factory(observation) new_pdf = new_policy_dist.prob(action) - ratio = new_pdf/old_pdf + ratio = new_pdf / old_pdf clipped_ratio = tf.clip_by_value(ratio, 1 - config.clipping_coef, 1 + config.clipping_coef) @@ -52,13 +50,13 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf, policy_loss = -tf.reduce_mean(surrogate_objective) value_error = calculate_generalized_advantage_estimator( - reward, new_value, done, config.gae_gamma, config.gae_lambda) + reward, new_value, done, config.gae_gamma, config.gae_lambda) value_loss = config.value_loss_coef * tf.reduce_mean(value_error ** 2) entropy = new_policy_dist.entropy() entropy_loss = -config.entropy_loss_coef * tf.reduce_mean(entropy) - optimizer = get_optimiser(config) + optimizer = get_optimizer(config) losses = [policy_loss, value_loss, entropy_loss] gradients = [list(zip(*optimizer.compute_gradients(loss))) for loss in losses] @@ -68,13 +66,15 @@ def define_ppo_step(observation, action, reward, done, value, old_pdf, gradients_flat = sum([gradient[0] for gradient in gradients], ()) gradients_variables_flat = sum([gradient[1] for gradient in gradients], ()) - optimize_op = optimizer.apply_gradients(zip(gradients_flat, gradients_variables_flat)) + optimize_op = optimizer.apply_gradients(zip(gradients_flat, + gradients_variables_flat)) with tf.control_dependencies([optimize_op]): return [tf.identity(x) for x in losses + gradients_norms] def define_ppo_epoch(memory, policy_factory, config): + """PPO epoch.""" observation, reward, done, action, old_pdf, value = memory # This is to avoid propagating gradients though simulation of simulation @@ -86,8 +86,9 @@ def define_ppo_epoch(memory, policy_factory, config): old_pdf = tf.stop_gradient(old_pdf) ppo_step_rets = tf.scan( - lambda _1, _2: define_ppo_step(observation, action, reward, done, value, - old_pdf, policy_factory, config), + lambda _1, _2: define_ppo_step( # pylint: disable=g-long-lambda + observation, action, reward, done, value, + old_pdf, policy_factory, config), tf.range(config.optimization_epochs), [0., 0., 0., 0., 0., 0.], parallel_iterations=1) @@ -105,19 +106,23 @@ def define_ppo_epoch(memory, policy_factory, config): return losses_summary -def calculate_generalized_advantage_estimator(reward, value, done, gae_gamma, gae_lambda): - """Generalized advantage estimator""" - # Below is slight wierdness, we set the last reward to 0. +def calculate_generalized_advantage_estimator( + reward, value, done, gae_gamma, gae_lambda): + """Generalized advantage estimator.""" + + # Below is slight weirdness, we set the last reward to 0. 
# This makes the adventantage to be 0 in the last timestep - reward = tf.concat([reward[:-1,:], value[-1:,:]], axis=0) - next_value = tf.concat([value[1:,:], tf.zeros_like(value[-1:, :])], axis=0) - next_not_done = 1 - tf.cast(tf.concat([done[1:, :], tf.zeros_like(done[-1:, :])], axis=0), tf.float32) + reward = tf.concat([reward[:-1, :], value[-1:, :]], axis=0) + next_value = tf.concat([value[1:, :], tf.zeros_like(value[-1:, :])], axis=0) + next_not_done = 1 - tf.cast(tf.concat([done[1:, :], + tf.zeros_like(done[-1:, :])], axis=0), + tf.float32) delta = reward + gae_gamma * next_value * next_not_done - value return_ = tf.reverse(tf.scan( lambda agg, cur: cur[0] + cur[1] * gae_gamma * gae_lambda * agg, [tf.reverse(delta, [0]), tf.reverse(next_not_done, [0])], tf.zeros_like(delta[0, :]), - parallel_iterations=1), [0]) - return tf.check_numerics(return_, 'return') + parallel_iterations=1), [0]) + return tf.check_numerics(return_, "return") diff --git a/tensor2tensor/rl/rl_trainer_lib.py b/tensor2tensor/rl/rl_trainer_lib.py index d87a9d0fb..28ec7e22c 100644 --- a/tensor2tensor/rl/rl_trainer_lib.py +++ b/tensor2tensor/rl/rl_trainer_lib.py @@ -51,7 +51,7 @@ def define_train(hparams, environment_spec, event_dir): with tf.variable_scope("train"): memory, collect_summary = collect.define_collect( - policy_factory, batch_env, hparams, eval_phase=False) + policy_factory, batch_env, hparams, eval_phase=False) ppo_summary = ppo.define_ppo_epoch(memory, policy_factory, hparams) summary = tf.summary.merge([collect_summary, ppo_summary]) @@ -60,19 +60,20 @@ def define_train(hparams, environment_spec, event_dir): if event_dir and hparams.video_during_eval: # Some environments reset environments automatically, when reached done # state. For them we shall record only every second episode. 
- d = 2 if env_lambda().metadata.get('semantics.autoreset') else 1 - eval_env_lambda = lambda: gym.wrappers.Monitor( - env_lambda(), event_dir, video_callable=lambda i: i % d == 0) + d = 2 if env_lambda().metadata.get("semantics.autoreset") else 1 + eval_env_lambda = lambda: gym.wrappers.Monitor( # pylint: disable=g-long-lambda + env_lambda(), event_dir, video_callable=lambda i: i % d == 0) wrapped_eval_env_lambda = lambda: utils.EvalVideoWrapper(eval_env_lambda()) _, eval_summary = collect.define_collect( - policy_factory, - utils.define_batch_env(wrapped_eval_env_lambda, hparams.num_eval_agents, - xvfb=hparams.video_during_eval), - hparams, eval_phase=True) + policy_factory, + utils.define_batch_env(wrapped_eval_env_lambda, hparams.num_eval_agents, + xvfb=hparams.video_during_eval), + hparams, eval_phase=True) return summary, eval_summary def train(hparams, environment_spec, event_dir=None): + """Train.""" train_summary_op, eval_summary_op = define_train(hparams, environment_spec, event_dir) @@ -88,7 +89,8 @@ def train(hparams, environment_spec, event_dir=None): summary = sess.run(train_summary_op) if summary_writer: summary_writer.add_summary(summary, epoch_index) - if hparams.eval_every_epochs and epoch_index % hparams.eval_every_epochs == 0: + if (hparams.eval_every_epochs and + epoch_index % hparams.eval_every_epochs == 0): summary = sess.run(eval_summary_op) if summary_writer: summary_writer.add_summary(summary, epoch_index) diff --git a/tensor2tensor/rl/rl_trainer_lib_test.py b/tensor2tensor/rl/rl_trainer_lib_test.py index d9484f045..0f3aa2025 100644 --- a/tensor2tensor/rl/rl_trainer_lib_test.py +++ b/tensor2tensor/rl/rl_trainer_lib_test.py @@ -27,12 +27,12 @@ class TrainTest(tf.test.TestCase): def test_no_crash_pendulum(self): hparams = trainer_lib.create_hparams( - "continuous_action_base", "epochs_num=11,video_during_eval=False") + "continuous_action_base", "epochs_num=11,video_during_eval=False") rl_trainer_lib.train(hparams, "Pendulum-v0") def test_no_crash_cartpole(self): hparams = trainer_lib.create_hparams( - "discrete_action_base", "epochs_num=11,video_during_eval=False") + "discrete_action_base", "epochs_num=11,video_during_eval=False") rl_trainer_lib.train(hparams, "CartPole-v0") diff --git a/tensor2tensor/test_data/example_usr_dir/my_submodule.py b/tensor2tensor/test_data/example_usr_dir/my_submodule.py index 0872ce5ed..70929afbc 100644 --- a/tensor2tensor/test_data/example_usr_dir/my_submodule.py +++ b/tensor2tensor/test_data/example_usr_dir/my_submodule.py @@ -14,9 +14,17 @@ # limitations under the License. """Example registrations for T2T.""" +import re + +from tensor2tensor.data_generators import problem +from tensor2tensor.data_generators import text_problems from tensor2tensor.layers import common_hparams from tensor2tensor.utils import registry +# Use register_model for a new T2TModel +# Use register_problem for a new Problem +# Use register_hparams for a new hyperparameter set + @registry.register_hparams def my_very_own_hparams(): @@ -28,5 +36,64 @@ def my_very_own_hparams(): hp.add_hparam("filter_size", 2048) return hp -# Use register_model for a new T2TModel -# Use register_problem for a new Problem + +@registry.register_problem +class PoetryLines(text_problems.Text2TextProblem): + """Predict next line of poetry from the last line. From Gutenberg texts.""" + + @property + def approx_vocab_size(self): + return 2**13 # ~8k + + @property + def is_generate_per_split(self): + # generate_data will shard the data into TRAIN and EVAL for us. 
+ return False + + @property + def dataset_splits(self): + """Splits of data to produce and number of output shards for each.""" + # 10% evaluation data + return [{ + "split": problem.DatasetSplit.TRAIN, + "shards": 90, + }, { + "split": problem.DatasetSplit.EVAL, + "shards": 10, + }] + + def generate_samples(self, data_dir, tmp_dir, dataset_split): + del data_dir + del tmp_dir + del dataset_split + + # pylint: disable=g-import-not-at-top + from gutenberg import acquire + from gutenberg import cleanup + # pylint: enable=g-import-not-at-top + + books = [ + # bookid, skip N lines + (19221, 223), + (15553, 522), + ] + + for (book_id, toskip) in books: + text = cleanup.strip_headers(acquire.load_etext(book_id)).strip() + lines = text.split("\n")[toskip:] + prev_line = None + ex_count = 0 + for line in lines: + # Any line that is all upper case is a title or author name + if not line or line.upper() == line: + prev_line = None + continue + + line = re.sub("[^a-z]+", " ", line.strip().lower()) + if prev_line and line: + yield { + "inputs": prev_line, + "targets": line, + } + ex_count += 1 + prev_line = line diff --git a/tensor2tensor/test_data/example_usr_dir/setup.py b/tensor2tensor/test_data/example_usr_dir/setup.py new file mode 100644 index 000000000..ad3701bb2 --- /dev/null +++ b/tensor2tensor/test_data/example_usr_dir/setup.py @@ -0,0 +1,34 @@ +# coding=utf-8 +# Copyright 2018 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Example setup.py for a t2t_usr_dir launching on Cloud ML Engine. + +This is only necessary if you have additional required pip packages for the +import of your usr_dir, and only if you're launching t2t-trainer on Cloud ML +Engine with the --cloud_mlengine flag. + +Note that the call to setup uses find_packages() and that the location of this +file is alongside the __init__.py file that imports my_submodule. 
+""" +from setuptools import find_packages +from setuptools import setup +setup( + name='DummyUsrDirPackage', + version='0.1', + packages=find_packages(), + install_requires=[ + 'gutenberg', + ], +) diff --git a/tensor2tensor/utils/cloud_mlengine.py b/tensor2tensor/utils/cloud_mlengine.py index eb5949e08..1d9e1c591 100644 --- a/tensor2tensor/utils/cloud_mlengine.py +++ b/tensor2tensor/utils/cloud_mlengine.py @@ -47,12 +47,24 @@ """ +def job_dir(): + # The flag --job-dir is parsed differently before and after switching to absl + return getattr(FLAGS, 'job-dir', '') or getattr(FLAGS, 'job_dir', '') + + def flags_as_args(): """Convert FLAGS to list of args suitable for passing on cmd line.""" - args_dict = dict(FLAGS.__dict__['__flags']) + if hasattr(FLAGS, 'flag_values_dict'): + args_dict = FLAGS.flag_values_dict() + else: + args_dict = dict(FLAGS.__dict__['__flags']) del args_dict['cloud_mlengine'] # Configured later del args_dict['t2t_usr_dir'] + args_dict.pop('h', None) + args_dict.pop('helpfull', None) + args_dict.pop('helpshort', None) + args_dict.pop('help', None) args = [] for name, val in args_dict.items(): if val is None: @@ -223,7 +235,7 @@ def configure_usr_dir(job_spec, usr_tar): def launch(): """Launch t2t_trainer on Cloud ML Engine.""" assert not FLAGS.cloud_tpu - assert not FLAGS.job_dir + assert not job_dir() assert FLAGS.output_dir.startswith('gs://') assert FLAGS.data_dir.startswith('gs://') assert FLAGS.worker_replicas <= 1 diff --git a/tensor2tensor/utils/data_reader_test.py b/tensor2tensor/utils/data_reader_test.py index 457ece27a..db039b799 100644 --- a/tensor2tensor/utils/data_reader_test.py +++ b/tensor2tensor/utils/data_reader_test.py @@ -90,10 +90,10 @@ def tearDownClass(cls): os.remove(f) def testBasicExampleReading(self): - dataset = self.problem.dataset(tf.estimator.ModeKeys.TRAIN, - data_dir=self.data_dir, - repeat=False, - shuffle_files=False) + dataset = self.problem.dataset( + tf.estimator.ModeKeys.TRAIN, + data_dir=self.data_dir, + shuffle_files=False) examples = dataset.make_one_shot_iterator().get_next() with tf.train.MonitoredSession() as sess: # Check that there are multiple examples that have the right fields of the @@ -109,10 +109,10 @@ def testBasicExampleReading(self): self.assertGreater(len(field), 0) def testPreprocess(self): - dataset = self.problem.dataset(tf.estimator.ModeKeys.TRAIN, - data_dir=self.data_dir, - repeat=False, - shuffle_files=False) + dataset = self.problem.dataset( + tf.estimator.ModeKeys.TRAIN, + data_dir=self.data_dir, + shuffle_files=False) examples = dataset.make_one_shot_iterator().get_next() with tf.train.MonitoredSession() as sess: ex_val = sess.run(examples) @@ -121,10 +121,10 @@ def testPreprocess(self): def testLengthFilter(self): max_len = 15 - dataset = self.problem.dataset(tf.estimator.ModeKeys.TRAIN, - data_dir=self.data_dir, - repeat=False, - shuffle_files=False) + dataset = self.problem.dataset( + tf.estimator.ModeKeys.TRAIN, + data_dir=self.data_dir, + shuffle_files=False) dataset = dataset.filter( lambda ex: data_reader.example_valid_size(ex, 0, max_len)) examples = dataset.make_one_shot_iterator().get_next() @@ -216,10 +216,10 @@ def example_len(ex): boundaries = [10, 20, 30] batch_sizes = [10, 8, 4, 2] - dataset = self.problem.dataset(tf.estimator.ModeKeys.TRAIN, - data_dir=self.data_dir, - repeat=False, - shuffle_files=False) + dataset = self.problem.dataset( + tf.estimator.ModeKeys.TRAIN, + data_dir=self.data_dir, + shuffle_files=False) dataset = data_reader.bucket_by_sequence_length( dataset, example_len, 
boundaries, batch_sizes) batch = dataset.make_one_shot_iterator().get_next() diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 4e95c7e5a..3d18b4d10 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -85,9 +85,9 @@ def log_decode_results(inputs, decoded_targets = None decoded_outputs = None if identity_output: - decoded_outputs = "".join(map(str, outputs.flatten())) + decoded_outputs = " ".join(map(str, outputs.flatten())) if targets is not None: - decoded_targets = "".join(map(str, targets.flatten())) + decoded_targets = " ".join(map(str, targets.flatten())) else: decoded_outputs = targets_vocab.decode(_save_until_eos(outputs, is_image)) if targets is not None: diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index f21562eeb..1b4013fbc 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -103,7 +103,7 @@ def __init__(self, self._problem_hparams.target_modality): same_vocab_sizes = False if not same_vocab_sizes: - tf.logging.info("Unsetting shared_embedding_and_softmax_weights.") + log_info("Unsetting shared_embedding_and_softmax_weights.") hparams.shared_embedding_and_softmax_weights = 0 self._original_hparams = hparams self.set_mode(mode) @@ -135,7 +135,13 @@ def call(self, features): self._fill_problem_hparams_features(features) sharded_features = self._shard_features(features) sharded_logits, losses = self.model_fn_sharded(sharded_features) - return tf.concat(sharded_logits, 0), losses + if isinstance(sharded_logits, dict): + concat_logits = {} + for k, v in sharded_logits.iteritems(): + concat_logits[k] = tf.concat(v, 0) + return concat_logits, losses + else: + return tf.concat(sharded_logits, 0), losses @property def use_body_sharded(self): @@ -157,18 +163,36 @@ def model_fn_sharded(self, sharded_features): self._to_single_features_dict(transformed_features)) body_out, losses = self._normalize_body_output(body_out) if "training" in losses: - tf.logging.info("Skipping T2TModel top and loss because training loss " - "returned from body") + log_info("Skipping T2TModel top and loss because training loss " + "returned from body") sharded_logits = body_out else: - sharded_logits = dp(self.top, body_out, datashard_to_features) - sharded_losses = dp(self.loss, sharded_logits, datashard_to_features) - training_loss_dict = average_sharded_losses([{ - "training": loss - } for loss in sharded_losses]) - losses.update(training_loss_dict) + if isinstance(body_out, dict): + sharded_logits = {} + sharded_losses = {} + for k, v in body_out.iteritems(): + sharded_logits[k] = dp(self.top, v, datashard_to_features) + sharded_losses[k] = dp(self.loss, sharded_logits[k], + datashard_to_features) + training_loss_dict = average_sharded_losses([{ + "training": l + } for l in loss for loss in sharded_losses.values()]) + losses.update(training_loss_dict) + else: + sharded_logits = dp(self.top, body_out, datashard_to_features) + sharded_losses = dp(self.loss, sharded_logits, datashard_to_features) + training_loss_dict = average_sharded_losses([{ + "training": loss + } for loss in sharded_losses]) + losses.update(training_loss_dict) else: sharded_logits, sharded_losses = dp(self.model_fn, datashard_to_features) + if isinstance(sharded_logits[0], dict): + temp_dict = {k: [] for k, _ in sharded_logits[0].iteritems()} + for k, _ in sharded_logits[0].iteritems(): + for l in sharded_logits: + temp_dict[k].append(l[k]) + sharded_logits = temp_dict losses = 
average_sharded_losses(sharded_losses) # TODO(rsepassi): Reenable scheduled sampling @@ -189,13 +213,13 @@ def model_fn(self, features): transformed_features = self.bottom(features) with tf.variable_scope("body"): - tf.logging.info("Building model body") + log_info("Building model body") body_out = self.body(transformed_features) output, losses = self._normalize_body_output(body_out) if "training" in losses: - tf.logging.info("Skipping T2TModel top and loss because training loss " - "returned from body") + log_info("Skipping T2TModel top and loss because training loss " + "returned from body") logits = output else: logits = self.top(output, features) @@ -205,7 +229,7 @@ def model_fn(self, features): def bottom(self, features): """Transform features to feed into body.""" if not self._problem_hparams: - tf.logging.warn("Without a Problem, T2TModel.bottom is a passthrough.") + log_warn("Without a Problem, T2TModel.bottom is a passthrough.") return features transformed_features = {} @@ -216,16 +240,16 @@ def bottom(self, features): self._problem_hparams.input_modality): do_reuse = input_modality.name in all_previous_modalities with tf.variable_scope(input_modality.name, reuse=do_reuse): - tf.logging.info("Transforming feature '%s' with %s.bottom", key, - input_modality.name) + log_info("Transforming feature '%s' with %s.bottom", key, + input_modality.name) transformed_features[key] = input_modality.bottom(features[key]) all_previous_modalities.append(input_modality.name) # Transform the targets (for autoregressive models) target_modality = self._problem_hparams.target_modality with tf.variable_scope(target_modality.name): - tf.logging.info("Transforming 'targets' with %s.targets_bottom", - target_modality.name) + log_info("Transforming 'targets' with %s.targets_bottom", + target_modality.name) transformed_features["targets"] = target_modality.targets_bottom( features["targets"]) @@ -256,15 +280,14 @@ def body(self, features): """ raise NotImplementedError("Abstract Method") - def top(self, body_output, features): + def _top_single(self, body_output, features): if not self._problem_hparams: - tf.logging.warn("Without a Problem, T2TModel.top is a passthrough.") + log_warn("Without a Problem, T2TModel.top is a passthrough.") return body_output target_modality = self._problem_hparams.target_modality with tf.variable_scope(target_modality.name): - tf.logging.info("Transforming body output with %s.top", - target_modality.name) + log_info("Transforming body output with %s.top", target_modality.name) last_only = ( target_modality.top_is_pointwise and self.hparams.mode == tf.estimator.ModeKeys.PREDICT and @@ -281,9 +304,18 @@ def top(self, body_output, features): last_position_targets) return logits - def loss(self, logits, features): + def top(self, body_output, features): + if isinstance(body_output, dict): + logits = {} + for k, v in body_output.iteritems(): + logits[k] = self._top_single(v, features) + return logits + else: + return self._top_single(body_output, features) + + def _loss_single(self, logits, features): if not self._problem_hparams: - tf.logging.warn(_no_problem_err("loss")) + log_warn(_no_problem_err("loss")) return (tf.constant(0., dtype=tf.float32), tf.constant(1., dtype=tf.float32)) @@ -292,19 +324,28 @@ def loss(self, logits, features): loss_num *= self._problem_hparams.loss_multiplier return loss_num, loss_den + def loss(self, logits, features): + if isinstance(logits, dict): + losses = {} + for k, v in logits.iteritems(): + losses[k] = self._loss_single(v, features) + return 
tf.add_n([n / d for n, d in logits.values()]) + else: + return self._loss_single(logits, features) + def optimize(self, loss, num_async_replicas=1): """Return a training op minimizing loss.""" - tf.logging.info("Base learning rate: %f", self.hparams.learning_rate) + log_info("Base learning rate: %f", self.hparams.learning_rate) lr = self.hparams.learning_rate decay_rate = optimize.learning_rate_schedule(self.hparams) lr *= decay_rate if self.hparams.learning_rate_minimum: lr_min = float(self.hparams.learning_rate_minimum) - tf.logging.info("Applying learning rate minimum: %f", lr_min) + log_info("Applying learning rate minimum: %f", lr_min) lr = tf.max(lr, tf.to_float(lr_min)) if num_async_replicas > 1: - tf.logging.info("Dividing learning rate by num_async_replicas: %d", - num_async_replicas) + log_info("Dividing learning rate by num_async_replicas: %d", + num_async_replicas) lr /= math.sqrt(float(num_async_replicas)) train_op = optimize.optimize( loss, lr, self.hparams, use_tpu=common_layers.is_on_tpu()) @@ -312,14 +353,14 @@ def optimize(self, loss, num_async_replicas=1): def set_mode(self, mode): """Set hparams with the given mode.""" - tf.logging.info("Setting T2TModel mode to '%s'", mode) + log_info("Setting T2TModel mode to '%s'", mode) hparams = copy.copy(self._original_hparams) hparams.add_hparam("mode", mode) # When not in training mode, set all forms of dropout to zero. if mode != tf.estimator.ModeKeys.TRAIN: for key in hparams.values(): if key.endswith("dropout"): - tf.logging.info("Setting hparams.%s to 0.0", key) + log_info("Setting hparams.%s to 0.0", key) setattr(hparams, key, 0.0) self._hparams = hparams @@ -419,9 +460,9 @@ def infer(self, # (i.e. if the target modality is RealModality). self.prepare_features_for_infer(features) if not self.has_input and beam_size > 1: - tf.logging.warn("Beam searching for a model with no inputs.") + log_warn("Beam searching for a model with no inputs.") if not self.has_input and self.hparams.sampling_method != "random": - tf.logging.warn("Non-random sampling for a model with no inputs.") + log_warn("Non-random sampling for a model with no inputs.") self._fill_problem_hparams_features(features) if self._problem_hparams: @@ -429,12 +470,12 @@ def infer(self, if target_modality.is_class_modality: beam_size = 1 # No use to run beam-search for a single class. 
if beam_size == 1: - tf.logging.info("Greedy Decoding") + log_info("Greedy Decoding") results = self._greedy_infer(features, decode_length) else: - tf.logging.info("Beam Decoding with beam size %d" % beam_size) - results = self._beam_decode( - features, decode_length, beam_size, top_beams, alpha) + log_info("Beam Decoding with beam size %d" % beam_size) + results = self._beam_decode(features, decode_length, beam_size, + top_beams, alpha) return results @@ -840,12 +881,24 @@ def estimator_model_fn(cls, # Set known shapes if use_tpu: - shape = logits.get_shape().as_list() - if shape[0] is None: - shape[0] = params["batch_size"] - if shape[1] is None: - shape[1] = hparams.max_length - logits.set_shape(shape) + if isinstance(logits, dict): + for k, v in logits.iteritems(): + if "scalar/" in k: + continue + + shape = v.get_shape().as_list() + if shape[0] is None: + shape[0] = params["batch_size"] + if shape[1] is None: + shape[1] = hparams.max_length + v.set_shape(shape) + else: + shape = logits.get_shape().as_list() + if shape[0] is None: + shape[0] = params["batch_size"] + if shape[1] is None: + shape[1] = hparams.max_length + logits.set_shape(shape) assert "training" in losses_dict @@ -859,7 +912,8 @@ def estimator_model_fn(cls, # EVAL mode if mode == tf.estimator.ModeKeys.EVAL: - return model.estimator_spec_eval(features, logits, labels, loss) + return model.estimator_spec_eval(features, logits, labels, loss, + losses_dict) # TRAIN mode assert mode == tf.estimator.ModeKeys.TRAIN @@ -880,7 +934,7 @@ def estimator_spec_train(self, loss, num_async_replicas=1): return tf.estimator.EstimatorSpec( tf.estimator.ModeKeys.TRAIN, loss=loss, train_op=train_op) - def estimator_spec_eval(self, features, logits, labels, loss): + def estimator_spec_eval(self, features, logits, labels, loss, losses_dict): """Construct EstimatorSpec for EVAL mode.""" hparams = self.hparams @@ -891,10 +945,19 @@ def estimator_spec_eval(self, features, logits, labels, loss): if common_layers.is_on_tpu(): eval_metrics_fn = _create_tpu_eval_metrics_fn(problem, hparams) _remove_summaries() - return tf.contrib.tpu.TPUEstimatorSpec( - tf.estimator.ModeKeys.EVAL, - eval_metrics=(eval_metrics_fn, [logits, labels]), - loss=loss) + if isinstance(logits, dict): + # For TPU, logits dict will be passed as keyword arguments to + # eval_metrics_fn. Here we add the labels to those arguments. 
+ logits.update({"labels": labels}) + return tf.contrib.tpu.TPUEstimatorSpec( + tf.estimator.ModeKeys.EVAL, + eval_metrics=(eval_metrics_fn, logits), + loss=loss) + else: + return tf.contrib.tpu.TPUEstimatorSpec( + tf.estimator.ModeKeys.EVAL, + eval_metrics=(eval_metrics_fn, [logits, labels]), + loss=loss) else: eval_metrics_fns = metrics.create_evaluation_metrics([problem], hparams) eval_metrics = {} @@ -963,9 +1026,9 @@ def _warn_changed_modality_type(new_name, old_name, feature_name): new_type, new_name = registry.parse_modality_name(new_name) old_type, old_name = registry.parse_modality_name(old_name) if new_type != old_type: - tf.logging.warning("%s has a designated modality type %s (%s) but has been " - "overridden with a modality of type %s (%s).", - feature_name, old_type, old_name, new_type, new_name) + log_warn("%s has a designated modality type %s (%s) but has been " + "overridden with a modality of type %s (%s).", feature_name, + old_type, old_name, new_type, new_name) def _with_timing(fn, msg, silent=False): @@ -974,8 +1037,7 @@ def fn_with_timing(*args, **kwargs): start_time = time.time() res = fn(*args, **kwargs) if not silent: - tf.logging.info("Doing %s took %.3f sec." % (msg, - time.time() - start_time)) + log_info("Doing %s took %.3f sec." % (msg, time.time() - start_time)) return res return fn_with_timing @@ -1024,16 +1086,24 @@ def wrapped_metric_fn(logits, labels): for metric in eval_metrics: if metric in TPU_METRIC_BLACKLIST: - tf.logging.warn("Skipping eval metric %s in TPU_METRIC_BLACKLIST", metric) + log_warn("Skipping eval metric %s in TPU_METRIC_BLACKLIST", metric) continue name = "metrics-%s/%s" % (problem.name, metric) metric_fns.append((name, make_metric_fn(metrics.METRICS_FNS[metric]))) - def all_metrics_fn(logits, labels): + def all_metrics_fn(logits=None, labels=None, **kwargs): + """Construct metrics dictionary.""" metrics_dict = {} + if logits is None: + logits = kwargs + for name, fn in metric_fns: - metrics_dict[name] = fn(logits, labels) + if isinstance(logits, dict): + for k, v in logits.iteritems(): + metrics_dict["%s/%s" % (name, k)] = fn(v, labels) + else: + metrics_dict[name] = fn(logits, labels) return metrics_dict @@ -1155,3 +1225,21 @@ def summarize_features(features, num_shards=1): tf.summary.scalar("%s_nonpadding_tokens" % k, nonpadding_tokens) tf.summary.scalar("%s_nonpadding_fraction" % k, tf.reduce_mean(nonpadding)) + + +_already_logged = set() + + +def _eager_log(level, *args): + if context.in_eager_mode() and args in _already_logged: + return + _already_logged.add(args) + getattr(tf.logging, level)(*args) + + +def log_info(*args): + _eager_log("info", *args) + + +def log_warn(*args): + _eager_log("warn", *args) diff --git a/tensor2tensor/utils/trainer_lib.py b/tensor2tensor/utils/trainer_lib.py index 2b0543f37..dd1442517 100644 --- a/tensor2tensor/utils/trainer_lib.py +++ b/tensor2tensor/utils/trainer_lib.py @@ -70,15 +70,15 @@ def create_hparams(hparams_set, problem_name=None): """Create HParams with data_dir and problem hparams, if kwargs provided.""" hparams = registry.hparams(hparams_set)() + if data_dir: + hparams.add_hparam("data_dir", data_dir) + if problem_name: + add_problem_hparams(hparams, problem_name) if hparams_overrides_str: tf.logging.info("Overriding hparams in %s with %s", hparams_set, hparams_overrides_str) hparams = hparams.parse(hparams_overrides_str) - if data_dir: - hparams.add_hparam("data_dir", data_dir) - if problem_name: - add_problem_hparams(hparams, problem_name) return hparams