diff --git a/.travis.yml b/.travis.yml index 7841b0b7e..00fe35951 100644 --- a/.travis.yml +++ b/.travis.yml @@ -14,10 +14,11 @@ env: - T2T_DATA_DIR=/tmp/t2t-data - T2T_TRAIN_DIR=/tmp/t2t-train script: - - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/tpu/tpu_trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py + - pytest --ignore=tensor2tensor/utils/registry_test.py --ignore=tensor2tensor/problems_test.py --ignore=tensor2tensor/utils/trainer_lib_test.py --ignore=tensor2tensor/data_generators/algorithmic_math_test.py - pytest tensor2tensor/utils/registry_test.py - - pytest tensor2tensor/tpu/tpu_trainer_lib_test.py + - pytest tensor2tensor/utils/trainer_lib_test.py - t2t-datagen 2>&1 | grep translate && echo passed + - t2t-trainer --registry_help --t2t_usr_dir=./tensor2tensor/test_data/example_usr_dir 2>&1 | grep my_very_own_hparams && echo passed - python -c "from tensor2tensor.models import transformer; print(transformer.Transformer.__name__)" - t2t-trainer --registry_help - mkdir $T2T_DATA_DIR diff --git a/README.md b/README.md index de2951c53..06a15d1c8 100644 --- a/README.md +++ b/README.md @@ -296,36 +296,8 @@ specifying the `--t2t_usr_dir` flag in `t2t-trainer`. You can do so for models, hyperparameter sets, modalities, and problems. Please do submit a pull request if your component might be useful to others. -Here's an example with a new hyperparameter set: - -```python -# In ~/usr/t2t_usr/my_registrations.py - -from tensor2tensor.models import transformer -from tensor2tensor.utils import registry - -@registry.register_hparams -def transformer_my_very_own_hparams_set(): - hparams = transformer.transformer_base() - hparams.hidden_size = 1024 - ... -``` - -```python -# In ~/usr/t2t_usr/__init__.py -from . import my_registrations -``` - -``` -t2t-trainer --t2t_usr_dir=~/usr/t2t_usr --registry_help -``` - -You'll see under the registered HParams your -`transformer_my_very_own_hparams_set`, which you can directly use on the command -line with the `--hparams_set` flag. - -`t2t-datagen` also supports the `--t2t_usr_dir` flag for `Problem` -registrations. +See the [`example_usr_dir`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/test_data/example_usr_dir) +for an example user directory. ## Adding a dataset diff --git a/docs/cloud_tpu.md b/docs/cloud_tpu.md index 56bad4093..55144e69c 100644 --- a/docs/cloud_tpu.md +++ b/docs/cloud_tpu.md @@ -5,8 +5,10 @@ for ML training. Models and hparams that are known to work on TPU: * `transformer` with `transformer_tpu` -* `xception` with `xception_base` +* `transformer_encoder` with `transformer_tpu` +* `transformer_decoder` with `transformer_tpu` * `resnet50` with `resnet_base` +* `revnet104` with `revnet_base` To run on TPUs, you need to be part of the alpha program; if you're not, these commands won't work for you currently, but access will expand soon, so get @@ -34,6 +36,8 @@ gcloud compute instances create $USER-vm \ Launch the TPU instance; the Python program will connect to this to train on the TPU device. ``` +gcloud alpha compute tpus list +# Make an IP with structure 10.240.X.2 that’s unique in the list TPU_IP=10.240.0.2 gcloud alpha compute tpus create \ $USER-tpu \ @@ -41,9 +45,6 @@ gcloud alpha compute tpus create \ --version=nightly ``` -To see all TPU instances running: `gcloud alpha compute tpus list`. The -`TPU_IP` should be unique amongst the list and follow the format `10.240.i.2`. 
- SSH in with port forwarding for TensorBoard ``` gcloud compute ssh $USER-vm -- -L 6006:localhost:6006 @@ -52,7 +53,7 @@ gcloud compute ssh $USER-vm -- -L 6006:localhost:6006 Now that you're on the cloud instance, install T2T: ``` pip install tensor2tensor --user -# If your python bin dir isn't already in your path +# Add the python bin dir to your path export PATH=$HOME/.local/bin:$PATH ``` @@ -67,9 +68,9 @@ t2t-datagen --problem=translate_ende_wmt8k --data_dir=$DATA_DIR Setup some vars used below. `TPU_IP` and `DATA_DIR` should be the same as what was used above. Note that the `DATA_DIR` and `OUT_DIR` must be GCS buckets. ``` -TPU_IP= +TPU_IP=10.240.0.2 DATA_DIR=$GCS_BUCKET/t2t/data/ -OUT_DIR=$GCS_BUCKET/t2t/training/ +OUT_DIR=$GCS_BUCKET/t2t/training/transformer_ende_1 TPU_MASTER=grpc://$TPU_IP:8470 ``` diff --git a/docs/new_problem.md b/docs/new_problem.md index fd5f9d625..342d7abb1 100644 --- a/docs/new_problem.md +++ b/docs/new_problem.md @@ -264,16 +264,22 @@ t2t-datagen \ ``` Where: -* `PROBLEM` is the name of the class that was registered with `@registry.register_problem()`, but converted from `CamelCase` to `snake_case`. -* `PATH_TO_YOUR_PROBLEM_DIR` is a path to the directory of your python problem file. +* `PROBLEM` is the name of the class that was registered with + `@registry.register_problem()`, but converted from `CamelCase` to + `snake_case`. +* `PATH_TO_YOUR_PROBLEM_DIR` is a path to the directory of your python problem + file. -If you plan to contribute to the tensor2tensor repository, you can install the local cloned version in developer mode with `pip install -e .` from the tensor2tensor directory. You can also add your new problem file to [`all_problems.py`](https://github.com/tensorflow/tensor2tensor/blob/master/tensor2tensor/data_generators/all_problems.py). +If you plan to contribute to the tensor2tensor repository, you can install the +local cloned version in developer mode with `pip install -e .` from the +tensor2tensor directory. You can also add your new problem file to +[`all_problems.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/data_generators/all_problems.py). # Run the problem -Now that we've gotten our problem set up, let's train a model and generate definitions. +Now that we've gotten our problem set up, let's train a model and generate +definitions. To train, specify the problem name, the model, and hparams: - ```bash PROBLEM=word2def MODEL=transformer @@ -282,6 +288,7 @@ HPARAMS=word2def_hparams The rest of the steps are as given in the [walkthrough](walkthrough.md). -What if we wanted to train a model to generate words given definitions? In T2T, we can change the problem name to be `PROBLEM=word2def_rev`. +What if we wanted to train a model to generate words given definitions? In T2T, +we can change the problem name to be `PROBLEM=word2def_rev`. All done. Let us know what definitions your model generated. diff --git a/docs/overview.md b/docs/overview.md index fcc0aba5a..9ea87bc50 100644 --- a/docs/overview.md +++ b/docs/overview.md @@ -14,7 +14,7 @@ to training, evaluation, and decoding. 
Some key files and their functions: -* [`tpu_trainer.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/tpu/tpu_trainer.py) and [`tpu_trainer_lib.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/tpu/tpu_trainer_lib.py): +* [`t2t_trainer.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/bin/t2t_trainer.py) and [`trainer_lib.py`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/utils/trainer_lib.py): Main entrypoint for training and evaluation. Constructs and runs all the main components of the system (the `Problem`, the `HParams`, the `Estimator`, the `Experiment`, the `input_fn`s and `model_fn`). @@ -134,7 +134,7 @@ The default implementations of `bottom`, `top`, and `loss` depend on the The actual training loop and related services (checkpointing, summaries, continuous evaluation, etc.) are all handled by `Estimator` and `Experiment` -objects. `tpu_trainer.py` is the main entrypoint and uses `tpu_trainer_lib.py` +objects. `t2t_trainer.py` is the main entrypoint and uses `trainer_lib.py` to construct the various components. ## Decoding @@ -144,7 +144,7 @@ to construct the various components. ## System Overview for Train/Eval -See `tpu_trainer.py`. +See `t2t_trainer.py` and `trainer_lib.py`. * Create HParams * Create `RunConfig`, including `Parallelism` object (i.e. `data_parallelism`) diff --git a/docs/walkthrough.md b/docs/walkthrough.md index de2951c53..06a15d1c8 100644 --- a/docs/walkthrough.md +++ b/docs/walkthrough.md @@ -296,36 +296,8 @@ specifying the `--t2t_usr_dir` flag in `t2t-trainer`. You can do so for models, hyperparameter sets, modalities, and problems. Please do submit a pull request if your component might be useful to others. -Here's an example with a new hyperparameter set: - -```python -# In ~/usr/t2t_usr/my_registrations.py - -from tensor2tensor.models import transformer -from tensor2tensor.utils import registry - -@registry.register_hparams -def transformer_my_very_own_hparams_set(): - hparams = transformer.transformer_base() - hparams.hidden_size = 1024 - ... -``` - -```python -# In ~/usr/t2t_usr/__init__.py -from . import my_registrations -``` - -``` -t2t-trainer --t2t_usr_dir=~/usr/t2t_usr --registry_help -``` - -You'll see under the registered HParams your -`transformer_my_very_own_hparams_set`, which you can directly use on the command -line with the `--hparams_set` flag. - -`t2t-datagen` also supports the `--t2t_usr_dir` flag for `Problem` -registrations. +See the [`example_usr_dir`](https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/test_data/example_usr_dir) +for an example user directory. 
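For reference, the linked `example_usr_dir` registers a hyperparameter set along the lines of the example removed above. A minimal sketch of such a user directory follows; the module name is illustrative and this is not necessarily the exact contents of `example_usr_dir`:

```python
# my_registrations.py -- any module importable from the --t2t_usr_dir directory
from tensor2tensor.models import transformer
from tensor2tensor.utils import registry


@registry.register_hparams
def transformer_my_very_own_hparams_set():
  # Start from the base Transformer hyperparameters and override a few values.
  hparams = transformer.transformer_base()
  hparams.hidden_size = 1024
  return hparams
```

The directory's `__init__.py` imports the module (`from . import my_registrations`); running `t2t-trainer --t2t_usr_dir=<that directory> --registry_help` then lists `transformer_my_very_own_hparams_set`, which can be selected with `--hparams_set`. The new Travis check above greps the registry help for exactly this kind of registration.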
## Adding a dataset diff --git a/setup.py b/setup.py index fb2b6492d..ede08f6ae 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.4.1', + version='1.4.2', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', @@ -23,10 +23,19 @@ 'tensor2tensor/bin/t2t-datagen', 'tensor2tensor/bin/t2t-decoder', 'tensor2tensor/bin/t2t-make-tf-configs', + 'tensor2tensor/bin/t2t-exporter', + 'tensor2tensor/bin/t2t-query-server', + 'tensor2tensor/bin/t2t-insights-server', + 'tensor2tensor/bin/t2t-avg-all', + 'tensor2tensor/bin/t2t-bleu', + 'tensor2tensor/bin/t2t-translate-all', ], install_requires=[ 'bz2file', + 'flask', 'future', + 'gevent', + 'gunicorn', 'gym', 'numpy', 'requests', @@ -35,8 +44,8 @@ 'six', ], extras_require={ - 'tensorflow': ['tensorflow>=1.4.0'], - 'tensorflow_gpu': ['tensorflow-gpu>=1.4.0'], + 'tensorflow': ['tensorflow>=1.4.1'], + 'tensorflow_gpu': ['tensorflow-gpu>=1.4.1'], 'tests': ['pytest', 'h5py', 'mock'], }, classifiers=[ diff --git a/tensor2tensor/bin/t2t-avg-all b/tensor2tensor/bin/t2t-avg-all index 3b4d6211d..abef8b755 100755 --- a/tensor2tensor/bin/t2t-avg-all +++ b/tensor2tensor/bin/t2t-avg-all @@ -1,105 +1,15 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Script to continously average last N checkpoints in a given directory.""" +"""t2t-avg-all.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os -import logging - -# Dependency imports +from tensor2tensor.bin import t2t_avg_all -import numpy as np -import six -from six.moves import zip # pylint: disable=redefined-builtin -from collections import deque -import shutil import tensorflow as tf -from tensor2tensor.utils import bleu_hook - -flags = tf.flags -FLAGS = flags.FLAGS - -flags.DEFINE_string("model_dir", "", "Directory to load model checkpoints from.") -flags.DEFINE_string("output_dir", "avg/", "Directory to output the averaged checkpoints to.") -flags.DEFINE_integer("n", 8, "How many checkpoints should be averaged?") -flags.DEFINE_integer("min_steps", 0, "Ignore checkpoints with less steps.") -flags.DEFINE_integer("wait_minutes", 0, "Wait upto N minutes for a new checkpoint.") - - -def main(_): - tf.logging._handler.setFormatter(logging.Formatter("%(asctime)s:" + logging.BASIC_FORMAT, None)) - tf.logging.set_verbosity(tf.logging.INFO) - - model_dir = os.path.expanduser(FLAGS.model_dir) - output_dir = os.path.expanduser(FLAGS.output_dir) - out_base_file = os.path.join(output_dir, 'model.ckpt') - - # Copy flags.txt with the original time, so t2t-bleu can report correct relative time. 
- os.makedirs(FLAGS.output_dir, exist_ok=True) - if not os.path.exists(os.path.join(output_dir, 'flags.txt')): - shutil.copy2(os.path.join(model_dir, 'flags.txt'), os.path.join(output_dir, 'flags.txt')) - - models_processed = 0 - queue = deque() - for model in bleu_hook.stepfiles_iterator(model_dir, FLAGS.wait_minutes, FLAGS.min_steps): - if models_processed == 0: - var_list = tf.contrib.framework.list_variables(model.filename) - avg_values = {} - for (name, shape) in var_list: - if not name.startswith("global_step"): - avg_values[name] = np.zeros(shape) - models_processed += 1 - - tf.logging.info("Loading [%d]: %s" % (models_processed, model.filename)) - reader = tf.contrib.framework.load_checkpoint(model.filename) - for name in avg_values: - avg_values[name] += reader.get_tensor(name) / FLAGS.n - queue.append(model) - if len(queue) < FLAGS.n: - continue - - out_file = "%s-%d" % (out_base_file, model.steps) - tf_vars = [] - tf.logging.info("Averaging %s" % (out_file)) - for (name, value) in six.iteritems(avg_values): - tf_vars.append(tf.get_variable(name, shape=value.shape)) # TODO , dtype=var_dtypes[name] - placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] - assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] - - global_step = tf.Variable(model.steps, name="global_step", trainable=False, dtype=tf.int64) - saver = tf.train.Saver(tf.global_variables()) - - tf.logging.info("Running session for %s" % (out_file)) - with tf.Session() as sess: - sess.run(tf.global_variables_initializer()) - for p, assign_op, (name, value) in zip(placeholders, assign_ops, six.iteritems(avg_values)): - sess.run(assign_op, {p: value}) - tf.logging.info("Storing to %s" % out_file) - saver.save(sess, out_base_file, global_step=global_step) - os.utime(out_file + '.index', (model.mtime, model.mtime)) - - tf.reset_default_graph() - first_model = queue.popleft() - reader = tf.contrib.framework.load_checkpoint(first_model.filename) - for name in avg_values: - avg_values[name] -= reader.get_tensor(name) / FLAGS.n +def main(argv): + t2t_avg_all.main(argv) if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-bleu b/tensor2tensor/bin/t2t-bleu index cac2b9fc3..966f50a81 100755 --- a/tensor2tensor/bin/t2t-bleu +++ b/tensor2tensor/bin/t2t-bleu @@ -1,136 +1,16 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Evaluate BLEU score for all checkpoints/translations in a given directory. - -This script can be used in two ways. - -To evaluate one already translated file: -`t2t-bleu --translation=my-wmt13.de --reference=wmt13_deen.de` - -To evaluate all translations in a given directory (translated by t2t-translate-all): -`t2t-bleu - --translations_dir=my-translations - --reference=wmt13_deen.de - --event_dir=events` - -In addition to the above-mentioned compulsory parameters, -there are optional parameters: - - * bleu_variant: cased (case-sensitive), uncased, both (default). 
- * tag_suffix: Default="", so the tags will be BLEU_cased and BLEU_uncased. tag_suffix - can be used e.g. for different beam sizes if these should be plotted in different graphs. - * min_steps: Don't evaluate checkpoints with less steps. - Default=-1 means check the `last_evaluated_step.txt` file, which contains the number of steps - of the last successfully evaluated checkpoint. - * report_zero: Store BLEU=0 and guess its time based on the oldest file in the translations_dir. - Default=True. This is useful, so TensorBoard reports correct relative time for the remaining - checkpoints. This flag is set to False if min_steps is > 0. - * wait_minutes: Wait upto N minutes for a new translated file. Default=0. - This is useful for continuous evaluation of a running training, - in which case this should be equal to save_checkpoints_secs/60 plus time needed for translation - plus some reserve. -""" +"""t2t-bleu.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os -from tensor2tensor.utils import bleu_hook -import tensorflow as tf +from tensor2tensor.bin import t2t_bleu -flags = tf.flags -FLAGS = flags.FLAGS - -flags.DEFINE_string("source", None, "Path to the source-language file to be translated") -flags.DEFINE_string("reference", None, "Path to the reference translation file") -flags.DEFINE_string("translation", None, "Path to the MT system translation file") -flags.DEFINE_string("translations_dir", None, "Directory with translated files to be evaulated.") -flags.DEFINE_string("event_dir", None, "Where to store the event file.") - -flags.DEFINE_string("bleu_variant", "both", - "Possible values: cased(case-sensitive), uncased, both(default).") -flags.DEFINE_string("tag_suffix", "", - "What to add to BLEU_cased and BLEU_uncased tags. Default=''.") -flags.DEFINE_integer("min_steps", -1, "Don't evaluate checkpoints with less steps.") -flags.DEFINE_integer("wait_minutes", 0, - "Wait upto N minutes for a new checkpoint, cf. save_checkpoints_secs.") -flags.DEFINE_bool("report_zero", None, "Store BLEU=0 and guess its time based on the oldest file.") - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - if FLAGS.translation: - if FLAGS.translations_dir: - raise ValueError('Cannot specify both --translation and --translations_dir.') - if FLAGS.bleu_variant in ('uncased', 'both'): - bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, FLAGS.translation, case_sensitive=False) - print("BLEU_uncased = %6.2f" % bleu) - if FLAGS.bleu_variant in ('cased', 'both'): - bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, FLAGS.translation, case_sensitive=True) - print("BLEU_cased = %6.2f" % bleu) - return - - if not FLAGS.translations_dir: - raise ValueError('Either --translation or --translations_dir must be specified.') - transl_dir = os.path.expanduser(FLAGS.translations_dir) - - last_step_file = os.path.join(FLAGS.event_dir, 'last_evaluated_step.txt') - if FLAGS.min_steps == -1: - try: - with open(last_step_file) as ls_file: - FLAGS.min_steps = int(ls_file.read()) - except FileNotFoundError: - FLAGS.min_steps = 0 - if FLAGS.report_zero is None: - FLAGS.report_zero = FLAGS.min_steps == 0 +import tensorflow as tf - writer = tf.summary.FileWriter(FLAGS.event_dir) - for transl_file in bleu_hook.stepfiles_iterator(transl_dir, FLAGS.wait_minutes, - FLAGS.min_steps, path_suffix=''): - # report_zero handling must be inside the for-loop, - # so we are sure the transl_dir is already created. 
- if FLAGS.report_zero: - all_files = (os.path.join(transl_dir, f) for f in os.listdir(transl_dir)) - start_time = min(os.path.getmtime(f) for f in all_files if os.path.isfile(f)) - values = [] - if FLAGS.bleu_variant in ('uncased', 'both'): - values.append(tf.Summary.Value(tag='BLEU_uncased' + FLAGS.tag_suffix, simple_value=0)) - if FLAGS.bleu_variant in ('cased', 'both'): - values.append(tf.Summary.Value(tag='BLEU_cased' + FLAGS.tag_suffix, simple_value=0)) - writer.add_event(tf.summary.Event(summary=tf.Summary(value=values), - wall_time=start_time, step=0)) - FLAGS.report_zero = False +def main(argv): + t2t_bleu.main(argv) - filename = transl_file.filename - tf.logging.info("Evaluating " + filename) - values = [] - if FLAGS.bleu_variant in ('uncased', 'both'): - bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, filename, case_sensitive=False) - values.append(tf.Summary.Value(tag='BLEU_uncased' + FLAGS.tag_suffix, simple_value=bleu)) - tf.logging.info("%s: BLEU_uncased = %6.2f" % (filename, bleu)) - if FLAGS.bleu_variant in ('cased', 'both'): - bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, filename, case_sensitive=True) - values.append(tf.Summary.Value(tag='BLEU_cased' + FLAGS.tag_suffix, simple_value=bleu)) - tf.logging.info("%s: BLEU_cased = %6.2f" % (transl_file.filename, bleu)) - writer.add_event(tf.summary.Event(summary=tf.Summary(value=values), - wall_time=transl_file.mtime, step=transl_file.steps)) - writer.flush() - with open(last_step_file, 'w') as ls_file: - ls_file.write(str(transl_file.steps) + '\n') if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-datagen b/tensor2tensor/bin/t2t-datagen old mode 100644 new mode 100755 index 2ac0f0db2..4290365b6 --- a/tensor2tensor/bin/t2t-datagen +++ b/tensor2tensor/bin/t2t-datagen @@ -1,211 +1,15 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Produces the training and dev data for --problem into --data_dir. - -Produces sharded and shuffled TFRecord files of tensorflow.Example protocol -buffers for a variety of registered datasets. - -All Problems are registered with @registry.register_problem or are in -_SUPPORTED_PROBLEM_GENERATORS in this file. Each entry maps a string name -(selectable on the command-line with --problem) to a function that takes 2 -arguments - input_directory and mode (one of "train" or "dev") - and yields for -each training example a dictionary mapping string feature names to lists of -{string, int, float}. The generator will be run once for each mode. 
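To make the generator interface described in the docstring above concrete, here is a toy generator with that shape; the name and feature values are invented for illustration and do not correspond to an actual `_SUPPORTED_PROBLEM_GENERATORS` entry:

```python
# Illustrative only: a legacy-style generator taking (input_directory, mode)
# and yielding, per training example, a dict of feature name -> list of ints.
def toy_identity_generator(input_directory, mode):
  del input_directory  # A real generator would read raw data from here.
  num_cases = 1000 if mode == "train" else 100
  for i in range(num_cases):
    digits = [int(d) for d in str(i)]
    yield {"inputs": digits, "targets": digits}
```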
-""" +"""t2t-datagen.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os -import random -import tempfile - -# Dependency imports - -import numpy as np - -from tensor2tensor.data_generators import algorithmic_math -from tensor2tensor.data_generators import all_problems # pylint: disable=unused-import -from tensor2tensor.data_generators import audio -from tensor2tensor.data_generators import generator_utils -from tensor2tensor.data_generators import snli -from tensor2tensor.data_generators import wsj_parsing -from tensor2tensor.utils import registry -from tensor2tensor.utils import usr_dir +from tensor2tensor.bin import t2t_datagen import tensorflow as tf -flags = tf.flags -FLAGS = flags.FLAGS - -flags.DEFINE_string("data_dir", "", "Data directory.") -flags.DEFINE_string("tmp_dir", "/tmp/t2t_datagen", - "Temporary storage directory.") -flags.DEFINE_string("problem", "", - "The name of the problem to generate data for.") -flags.DEFINE_string("exclude_problems", "", - "Comma-separates list of problems to exclude.") -flags.DEFINE_integer("num_shards", 0, "How many shards to use. Ignored for " - "registered Problems.") -flags.DEFINE_integer("max_cases", 0, - "Maximum number of cases to generate (unbounded if 0).") -flags.DEFINE_bool("only_list", False, - "If true, we only list the problems that will be generated.") -flags.DEFINE_integer("random_seed", 429459, "Random seed to use.") -flags.DEFINE_integer("task_id", -1, "For distributed data generation.") -flags.DEFINE_string("t2t_usr_dir", "", - "Path to a Python module that will be imported. The " - "__init__.py file should include the necessary imports. " - "The imported files should contain registrations, " - "e.g. @registry.register_problem calls, that will then be " - "available to t2t-datagen.") - -# Mapping from problems that we can generate data for to their generators. 
-# pylint: disable=g-long-lambda -_SUPPORTED_PROBLEM_GENERATORS = { - "algorithmic_algebra_inverse": ( - lambda: algorithmic_math.algebra_inverse(26, 0, 2, 100000), - lambda: algorithmic_math.algebra_inverse(26, 3, 3, 10000)), - "parsing_english_ptb8k": ( - lambda: wsj_parsing.parsing_token_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 2**13, 2**9), - lambda: wsj_parsing.parsing_token_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 2**13, 2**9)), - "parsing_english_ptb16k": ( - lambda: wsj_parsing.parsing_token_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 2**14, 2**9), - lambda: wsj_parsing.parsing_token_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 2**14, 2**9)), - "inference_snli32k": ( - lambda: snli.snli_token_generator(FLAGS.tmp_dir, True, 2**15), - lambda: snli.snli_token_generator(FLAGS.tmp_dir, False, 2**15), - ), - "audio_timit_characters_test": ( - lambda: audio.timit_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 1718), - lambda: audio.timit_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 626)), - "audio_timit_tokens_8k_test": ( - lambda: audio.timit_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 1718, - vocab_filename="vocab.endefr.%d" % 2**13, vocab_size=2**13), - lambda: audio.timit_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 626, - vocab_filename="vocab.endefr.%d" % 2**13, vocab_size=2**13)), - "audio_timit_tokens_32k_test": ( - lambda: audio.timit_generator( - FLAGS.data_dir, FLAGS.tmp_dir, True, 1718, - vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15), - lambda: audio.timit_generator( - FLAGS.data_dir, FLAGS.tmp_dir, False, 626, - vocab_filename="vocab.endefr.%d" % 2**15, vocab_size=2**15)), -} - -# pylint: enable=g-long-lambda - - -def set_random_seed(): - """Set the random seed from flag everywhere.""" - tf.set_random_seed(FLAGS.random_seed) - random.seed(FLAGS.random_seed) - np.random.seed(FLAGS.random_seed) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) - - # Calculate the list of problems to generate. - problems = sorted( - list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems()) - for exclude in FLAGS.exclude_problems.split(","): - if exclude: - problems = [p for p in problems if exclude not in p] - if FLAGS.problem and FLAGS.problem[-1] == "*": - problems = [p for p in problems if p.startswith(FLAGS.problem[:-1])] - elif FLAGS.problem: - problems = [p for p in problems if p == FLAGS.problem] - else: - problems = [] - - # Remove TIMIT if paths are not given. - if not FLAGS.timit_paths: - problems = [p for p in problems if "timit" not in p] - # Remove parsing if paths are not given. - if not FLAGS.parsing_path: - problems = [p for p in problems if "parsing" not in p] - - if not problems: - problems_str = "\n * ".join( - sorted(list(_SUPPORTED_PROBLEM_GENERATORS) + registry.list_problems())) - error_msg = ("You must specify one of the supported problems to " - "generate data for:\n * " + problems_str + "\n") - error_msg += ("TIMIT and parsing need data_sets specified with " - "--timit_paths and --parsing_path.") - raise ValueError(error_msg) - - if not FLAGS.data_dir: - FLAGS.data_dir = tempfile.gettempdir() - tf.logging.warning("It is strongly recommended to specify --data_dir. 
" - "Data will be written to default data_dir=%s.", - FLAGS.data_dir) - - tf.logging.info("Generating problems:\n%s" - % registry.display_list_by_prefix(problems, - starting_spaces=4)) - if FLAGS.only_list: - return - for problem in problems: - set_random_seed() - - if problem in _SUPPORTED_PROBLEM_GENERATORS: - generate_data_for_problem(problem) - else: - generate_data_for_registered_problem(problem) - - -def generate_data_for_problem(problem): - """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS.""" - training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[problem] - - num_shards = FLAGS.num_shards or 10 - tf.logging.info("Generating training data for %s.", problem) - train_output_files = generator_utils.train_data_filenames( - problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards) - generator_utils.generate_files(training_gen(), train_output_files, - FLAGS.max_cases) - tf.logging.info("Generating development data for %s.", problem) - dev_output_files = generator_utils.dev_data_filenames( - problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, 1) - generator_utils.generate_files(dev_gen(), dev_output_files) - all_output_files = train_output_files + dev_output_files - generator_utils.shuffle_dataset(all_output_files) - - -def generate_data_for_registered_problem(problem_name): - tf.logging.info("Generating data for %s.", problem_name) - if FLAGS.num_shards: - raise ValueError("--num_shards should not be set for registered Problem.") - problem = registry.problem(problem_name) - task_id = None if FLAGS.task_id < 0 else FLAGS.task_id - problem.generate_data( - os.path.expanduser(FLAGS.data_dir), - os.path.expanduser(FLAGS.tmp_dir), - task_id=task_id) +def main(argv): + t2t_datagen.main(argv) if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-decoder b/tensor2tensor/bin/t2t-decoder old mode 100644 new mode 100755 index f453b01fd..612117c22 --- a/tensor2tensor/bin/t2t-decoder +++ b/tensor2tensor/bin/t2t-decoder @@ -1,109 +1,15 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Decode from trained T2T models. - -This binary performs inference using the Estimator API. - -Example usage to decode from dataset: - - t2t-decoder \ - --data_dir ~/data \ - --problems=algorithmic_identity_binary40 \ - --model=transformer - --hparams_set=transformer_base - -Set FLAGS.decode_interactive or FLAGS.decode_from_file for alternative decode -sources. 
-""" +"""t2t-decoder.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os - -# Dependency imports - -from tensor2tensor.tpu import tpu_trainer -from tensor2tensor.tpu import tpu_trainer_lib -from tensor2tensor.utils import decoding -from tensor2tensor.utils import usr_dir +from tensor2tensor.bin import t2t_decoder import tensorflow as tf -flags = tf.flags -FLAGS = flags.FLAGS - -# Additional flags in tpu/tpu_trainer.py and utils/flags.py -flags.DEFINE_string("decode_from_file", None, - "Path to the source file for decoding") -flags.DEFINE_string("decode_to_file", None, - "Path to the decoded (output) file") -flags.DEFINE_bool("decode_interactive", False, - "Interactive local inference mode.") -flags.DEFINE_integer("decode_shards", 1, "Number of decoding replicas.") - - -def create_hparams(): - return tpu_trainer_lib.create_hparams( - FLAGS.hparams_set, - FLAGS.hparams, - data_dir=os.path.expanduser(FLAGS.data_dir), - problem_name=FLAGS.problems) - - -def create_decode_hparams(): - decode_hp = decoding.decode_hparams(FLAGS.decode_hparams) - decode_hp.add_hparam("shards", FLAGS.decode_shards) - decode_hp.add_hparam("shard_id", FLAGS.worker_id) - return decode_hp - - -def decode(estimator, hparams, decode_hp): - if FLAGS.decode_interactive: - decoding.decode_interactively(estimator, hparams, decode_hp) - elif FLAGS.decode_from_file: - decoding.decode_from_file(estimator, FLAGS.decode_from_file, hparams, - decode_hp, FLAGS.decode_to_file) - else: - decoding.decode_from_dataset( - estimator, - FLAGS.problems.split("-"), - hparams, - decode_hp, - decode_to_file=FLAGS.decode_to_file, - dataset_split="test" if FLAGS.eval_use_test_set else None) - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) - FLAGS.use_tpu = False # decoding not supported on TPU - - hp = create_hparams() - decode_hp = create_decode_hparams() - - estimator = tpu_trainer_lib.create_estimator( - FLAGS.model, - hp, - tpu_trainer.create_run_config(hp), - decode_hparams=decode_hp, - use_tpu=False) - - decode(estimator, hp, decode_hp) +def main(argv): + t2t_decoder.main(argv) if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-exporter b/tensor2tensor/bin/t2t-exporter new file mode 100755 index 000000000..cfd4f5ff8 --- /dev/null +++ b/tensor2tensor/bin/t2t-exporter @@ -0,0 +1,16 @@ +#!/usr/bin/env python +"""t2t-exporter.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensor2tensor.serving import export + +import tensorflow as tf + +def main(argv): + export.main(argv) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/bin/t2t-insights-server b/tensor2tensor/bin/t2t-insights-server new file mode 100755 index 000000000..102202c9b --- /dev/null +++ b/tensor2tensor/bin/t2t-insights-server @@ -0,0 +1,16 @@ +#!/usr/bin/env python +"""t2t-insights-server.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensor2tensor.insights import server + +import tensorflow as tf + +def main(argv): + server.main(argv) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/bin/t2t-make-tf-configs b/tensor2tensor/bin/t2t-make-tf-configs old mode 100644 new mode 100755 index 0b656aba6..b481ea910 --- a/tensor2tensor/bin/t2t-make-tf-configs +++ b/tensor2tensor/bin/t2t-make-tf-configs @@ -1,86 +1,15 @@ #!/usr/bin/env python -# 
coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Output command line arguments and json-encoded TF_CONFIGs. - -Usage: - -`t2t-make-tf-configs --masters="server1:1234" --ps="server3:2134,server4:2334"` - -Outputs 1 line per job to stdout, first the masters, then the parameter servers. -Each line has the TF_CONFIG, then a tab, then the command line flags for that -job. - -If there is a single master, it will have the `--sync` flag. -""" +"""t2t-make-tf-configs.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import json - -# Dependency imports +from tensor2tensor.bin import make_tf_configs import tensorflow as tf -flags = tf.flags -FLAGS = flags.FLAGS - -flags.DEFINE_string("masters", "", "Comma-separated list of master addresses") -flags.DEFINE_string("ps", "", "Comma-separated list of ps addresses") - - -def main(_): - if not (FLAGS.masters and FLAGS.ps): - raise ValueError("Must provide --masters and --ps") - - masters = FLAGS.masters.split(",") - ps = FLAGS.ps.split(",") - - cluster = {"ps": ps, "master": masters} - - for task_type, jobs in (("master", masters), ("ps", ps)): - for idx, job in enumerate(jobs): - if task_type == "master": - cmd_line_flags = " ".join([ - "--master=grpc://%s" % job, - "--ps_replicas=%d" % len(ps), - "--worker_replicas=%d" % len(masters), - "--worker_gpu=1", - "--worker_id=%d" % idx, - "--worker_job='/job:master'", - "--ps_gpu=1", - "--schedule=train", - "--sync" if len(masters) == 1 else "", - ]) - else: - cmd_line_flags = " ".join([ - "--master=grpc://%s" % job, - "--schedule=run_std_server", - ]) - - tf_config = json.dumps({ - "cluster": cluster, - "task": { - "type": task_type, - "index": idx - }, - "environment": "cloud", - }) - print("'%s'\t%s" % (tf_config, cmd_line_flags)) +def main(argv): + make_tf_configs.main(argv) if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-query-server b/tensor2tensor/bin/t2t-query-server new file mode 100755 index 000000000..91ede7ce7 --- /dev/null +++ b/tensor2tensor/bin/t2t-query-server @@ -0,0 +1,16 @@ +#!/usr/bin/env python +"""t2t-query-server.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tensor2tensor.serving import query + +import tensorflow as tf + +def main(argv): + query.main(argv) + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer old mode 100644 new mode 100755 index 70435094a..77f1ec865 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -1,190 +1,15 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Train on TPU.""" +"""t2t-trainer.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import contextlib -import os -import sys - -# Dependency imports - -from tensor2tensor import models # pylint: disable=unused-import -from tensor2tensor import problems as problems_lib # pylint: disable=unused-import -from tensor2tensor.tpu import tpu_trainer_lib -from tensor2tensor.utils import decoding -from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import -from tensor2tensor.utils import registry -from tensor2tensor.utils import usr_dir +from tensor2tensor.bin import t2t_trainer import tensorflow as tf -flags = tf.flags -FLAGS = flags.FLAGS - -# See flags.py for additional command-line flags. -flags.DEFINE_string("t2t_usr_dir", "", - "Path to a Python module that will be imported. The " - "__init__.py file should include the necessary imports. " - "The imported files should contain registrations, " - "e.g. @registry.register_model calls, that will then be " - "available to the t2t-trainer.") -flags.DEFINE_integer("random_seed", 1234, "Random seed.") -flags.DEFINE_integer("tpu_num_shards", 8, "Number of tpu shards.") -flags.DEFINE_integer("iterations_per_loop", 1000, - "Number of iterations in a TPU training loop.") -flags.DEFINE_bool("use_tpu", False, "Whether to use TPU.") -flags.DEFINE_bool("generate_data", False, "Generate data before training?") -flags.DEFINE_string("tmp_dir", "/tmp/t2t_datagen", - "Temporary storage directory, used if --generate_data.") -flags.DEFINE_bool("profile", False, "Profile performance?") - -# To maintain compatibility with some internal libs, we guard against these flag -# definitions possibly erroring. Apologies for the ugliness. -try: - flags.DEFINE_string("master", "", "Address of TensorFlow master.") - flags.DEFINE_string("output_dir", "", "Base output directory for run.") - flags.DEFINE_string("schedule", "continuous_train_and_eval", - "Method of Experiment to run.") - flags.DEFINE_integer("eval_steps", 10000, - "Number of steps in evaluation. 
By default, eval will " - "stop after eval_steps or when it runs through the eval " - "dataset once in full, whichever comes first, so this " - "can be a very large number.") -except: # pylint: disable=bare-except - pass - - -def get_problem_name(): - problems = FLAGS.problems.split("-") - assert len(problems) == 1 - return problems[0] - - -def create_hparams(): - return tpu_trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams) - - -def create_experiment_fn(): - return tpu_trainer_lib.create_experiment_fn( - model_name=FLAGS.model, - problem_name=get_problem_name(), - data_dir=os.path.expanduser(FLAGS.data_dir), - train_steps=FLAGS.train_steps, - eval_steps=FLAGS.eval_steps, - min_eval_frequency=FLAGS.local_eval_frequency, - schedule=FLAGS.schedule, - export=FLAGS.export_saved_model, - decode_hparams=decoding.decode_hparams(FLAGS.decode_hparams), - use_tfdbg=FLAGS.tfdbg, - use_dbgprofile=FLAGS.dbgprofile, - eval_early_stopping_steps=FLAGS.eval_early_stopping_steps, - eval_early_stopping_metric=FLAGS.eval_early_stopping_metric, - eval_early_stopping_metric_delta=FLAGS.eval_early_stopping_metric_delta, - eval_early_stopping_metric_minimize=FLAGS. - eval_early_stopping_metric_minimize, - use_tpu=FLAGS.use_tpu) - - -def create_run_config(hp): - return tpu_trainer_lib.create_run_config( - model_dir=os.path.expanduser(FLAGS.output_dir), - master=FLAGS.master, - iterations_per_loop=FLAGS.iterations_per_loop, - num_shards=FLAGS.tpu_num_shards, - log_device_placement=FLAGS.log_device_placement, - save_checkpoints_steps=max(FLAGS.iterations_per_loop, - FLAGS.local_eval_frequency), - keep_checkpoint_max=FLAGS.keep_checkpoint_max, - keep_checkpoint_every_n_hours=FLAGS.keep_checkpoint_every_n_hours, - num_gpus=FLAGS.worker_gpu, - gpu_order=FLAGS.gpu_order, - shard_to_cpu=FLAGS.locally_shard_to_cpu, - num_async_replicas=FLAGS.worker_replicas, - gpu_mem_fraction=FLAGS.worker_gpu_memory_fraction, - enable_graph_rewriter=FLAGS.experimental_optimize_placement, - use_tpu=FLAGS.use_tpu, - schedule=FLAGS.schedule, - no_data_parallelism=hp.no_data_parallelism, - daisy_chain_variables=hp.daisy_chain_variables, - ps_replicas=FLAGS.ps_replicas, - ps_job=FLAGS.ps_job, - ps_gpu=FLAGS.ps_gpu, - sync=FLAGS.sync, - worker_id=FLAGS.worker_id, - worker_job=FLAGS.worker_job) - - -def generate_data(): - # Generate data if requested. 
- data_dir = os.path.expanduser(FLAGS.data_dir) - tmp_dir = os.path.expanduser(FLAGS.tmp_dir) - tf.gfile.MakeDirs(data_dir) - tf.gfile.MakeDirs(tmp_dir) - - problem_name = get_problem_name() - tf.logging.info("Generating data for %s" % problem_name) - registry.problem(problem_name).generate_data(data_dir, tmp_dir) - - -@contextlib.contextmanager -def profile_context(): - if FLAGS.profile: - with tf.contrib.tfprof.ProfileContext("t2tprof", - trace_steps=range(100), - dump_steps=range(100)) as pctx: - opts = tf.profiler.ProfileOptionBuilder.time_and_memory() - pctx.add_auto_profiling("op", opts, range(100)) - yield - else: - yield - - -def log_registry(): - if FLAGS.registry_help: - tf.logging.info(registry.help_string()) - sys.exit(0) - - -def execute_schedule(exp): - if not hasattr(exp, FLAGS.schedule): - raise ValueError( - "Experiment has no method %s, from --schedule" % FLAGS.schedule) - with profile_context(): - getattr(exp, FLAGS.schedule)() - - -def main(_): - tf.logging.set_verbosity(tf.logging.INFO) - tpu_trainer_lib.set_random_seed(FLAGS.random_seed) - usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) - log_registry() - - if FLAGS.generate_data: - generate_data() - - hparams = create_hparams() - run_config = create_run_config(hparams) - - exp_fn = create_experiment_fn() - exp = exp_fn(run_config, hparams) - execute_schedule(exp) +def main(argv): + t2t_trainer.main(argv) if __name__ == "__main__": diff --git a/tensor2tensor/bin/t2t-translate-all b/tensor2tensor/bin/t2t-translate-all index 1ee7e535f..fed5d3045 100755 --- a/tensor2tensor/bin/t2t-translate-all +++ b/tensor2tensor/bin/t2t-translate-all @@ -1,91 +1,17 @@ #!/usr/bin/env python -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Translate a file with all checkpoints in a given directory. - -t2t-decoder will be executed with these parameters: ---problems ---data_dir ---output_dir with the value of --model_dir ---decode_from_file with the value of --source ---decode_hparams with properly formated --beam_size and --alpha ---checkpoint_path automatically filled ---decode_to_file automatically filled -""" +"""t2t-translate-all.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function -import os -import shutil -import tensorflow as tf -from tensor2tensor.utils import bleu_hook - -flags = tf.flags - -# t2t-translate-all specific options -flags.DEFINE_string("decoder_command", "t2t-decoder {params}", - "Which command to execute instead t2t-decoder." - "{params} is replaced by the parameters. Useful e.g. 
for qsub wrapper.") -flags.DEFINE_string("model_dir", "", "Directory to load model checkpoints from.") -flags.DEFINE_string("source", None, "Path to the source-language file to be translated") -flags.DEFINE_string("translations_dir", "translations", "Where to store the translated files.") -flags.DEFINE_integer("min_steps", 0, "Ignore checkpoints with less steps.") -flags.DEFINE_integer("wait_minutes", 0, "Wait upto N minutes for a new checkpoint") - -# options derived from t2t-decoder -flags.DEFINE_integer("beam_size", 4, "Beam-search width.") -flags.DEFINE_float("alpha", 0.6, "Beam-search alpha.") -flags.DEFINE_string("model", "transformer", "see t2t-decoder") -flags.DEFINE_string("t2t_usr_dir", None, "see t2t-decoder") -flags.DEFINE_string("data_dir", None, "see t2t-decoder") -flags.DEFINE_string("problems", None, "see t2t-decoder") -flags.DEFINE_string("hparams_set", "transformer_big_single_gpu", "see t2t-decoder") +from tensor2tensor.bin import t2t_translate_all +import tensorflow as tf -def main(_): - FLAGS = flags.FLAGS - tf.logging.set_verbosity(tf.logging.INFO) - model_dir = os.path.expanduser(FLAGS.model_dir) - translations_dir = os.path.expanduser(FLAGS.translations_dir) - source = os.path.expanduser(FLAGS.source) - os.makedirs(translations_dir, exist_ok=True) - translated_base_file = os.path.join(translations_dir, FLAGS.problems) +def main(argv): + t2t_translate_all.main(argv) - # Copy flags.txt with the original time, so t2t-bleu can report correct relative time. - flags_path = os.path.join(translations_dir, FLAGS.problems + '-flags.txt') - if not os.path.exists(flags_path): - shutil.copy2(os.path.join(model_dir, 'flags.txt'), flags_path) - for model in bleu_hook.stepfiles_iterator(model_dir, FLAGS.wait_minutes, FLAGS.min_steps): - tf.logging.info("Translating " + model.filename) - out_file = translated_base_file + '-' + str(model.steps) - if os.path.exists(out_file): - tf.logging.info(out_file + " already exists, so skipping it.") - else: - tf.logging.info("Translating " + out_file) - params = ("--t2t_usr_dir={FLAGS.t2t_usr_dir} --output_dir={model_dir} " - "--data_dir={FLAGS.data_dir} --problems={FLAGS.problems} " - "--decode_hparams=beam_size={FLAGS.beam_size},alpha={FLAGS.alpha} " - "--model={FLAGS.model} --hparams_set={FLAGS.hparams_set} " - "--checkpoint_path={model.filename} --decode_from_file={source} " - "--decode_to_file={out_file}".format(**locals())) - command = FLAGS.decoder_command.format(**locals()) - tf.logging.info("Running:\n" + command) - os.system(command) if __name__ == "__main__": tf.app.run() diff --git a/tensor2tensor/bin/t2t_avg_all.py b/tensor2tensor/bin/t2t_avg_all.py new file mode 100644 index 000000000..6e2a3088e --- /dev/null +++ b/tensor2tensor/bin/t2t_avg_all.py @@ -0,0 +1,116 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Script to continously average last N checkpoints in a given directory.""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from collections import deque +import logging +import os +import shutil + +# Dependency imports + +import numpy as np +import six +from six.moves import zip # pylint: disable=redefined-builtin +from tensor2tensor.utils import bleu_hook +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string("model_dir", "", + "Directory to load model checkpoints from.") +flags.DEFINE_string("output_dir", "avg/", + "Directory to output the averaged checkpoints to.") +flags.DEFINE_integer("n", 8, "How many checkpoints should be averaged?") +flags.DEFINE_integer("min_steps", 0, "Ignore checkpoints with less steps.") +flags.DEFINE_integer("wait_minutes", 0, + "Wait upto N minutes for a new checkpoint.") + + +def main(_): + tf.logging._handler.setFormatter( # pylint: disable=protected-access + logging.Formatter("%(asctime)s:" + logging.BASIC_FORMAT, None)) + tf.logging.set_verbosity(tf.logging.INFO) + + model_dir = os.path.expanduser(FLAGS.model_dir) + output_dir = os.path.expanduser(FLAGS.output_dir) + out_base_file = os.path.join(output_dir, "model.ckpt") + + # Copy flags.txt with the original time, so t2t-bleu can report correct + # relative time. + tf.gfile.MakeDirs(FLAGS.output_dir) + if not os.path.exists(os.path.join(output_dir, "flags.txt")): + shutil.copy2(os.path.join(model_dir, "flags.txt"), + os.path.join(output_dir, "flags.txt")) + + models_processed = 0 + queue = deque() + for model in bleu_hook.stepfiles_iterator(model_dir, FLAGS.wait_minutes, + FLAGS.min_steps): + if models_processed == 0: + var_list = tf.contrib.framework.list_variables(model.filename) + avg_values = {} + for (name, shape) in var_list: + if not name.startswith("global_step"): + avg_values[name] = np.zeros(shape) + models_processed += 1 + + tf.logging.info("Loading [%d]: %s" % (models_processed, model.filename)) + reader = tf.contrib.framework.load_checkpoint(model.filename) + for name in avg_values: + avg_values[name] += reader.get_tensor(name) / FLAGS.n + queue.append(model) + if len(queue) < FLAGS.n: + continue + + out_file = "%s-%d" % (out_base_file, model.steps) + tf_vars = [] + tf.logging.info("Averaging %s" % (out_file)) + for (name, value) in six.iteritems(avg_values): + # TODO(martinpopel): dtype=var_dtypes[name] + tf_vars.append(tf.get_variable(name, shape=value.shape)) + placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars] + assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)] + + global_step = tf.get_variable( + "global_step", + initializer=tf.constant(model.steps, dtype=tf.int64), + trainable=False) + saver = tf.train.Saver(tf.global_variables()) + + tf.logging.info("Running session for %s" % (out_file)) + with tf.Session() as sess: + sess.run(tf.global_variables_initializer()) + for p, assign_op, (name, value) in zip( + placeholders, assign_ops, six.iteritems(avg_values)): + sess.run(assign_op, {p: value}) + tf.logging.info("Storing to %s" % out_file) + saver.save(sess, out_base_file, global_step=global_step) + os.utime(out_file + ".index", (model.mtime, model.mtime)) + + tf.reset_default_graph() + first_model = queue.popleft() + + reader = tf.contrib.framework.load_checkpoint(first_model.filename) + for name in avg_values: + avg_values[name] -= reader.get_tensor(name) / FLAGS.n + +if __name__ == "__main__": + tf.app.run() diff --git 
a/tensor2tensor/bin/t2t_bleu.py b/tensor2tensor/bin/t2t_bleu.py new file mode 100644 index 000000000..5db83965d --- /dev/null +++ b/tensor2tensor/bin/t2t_bleu.py @@ -0,0 +1,168 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Evaluate BLEU score for all checkpoints/translations in a given directory. + +This script can be used in two ways. + + +To evaluate one already translated file: + +``` +t2t-bleu --translation=my-wmt13.de --reference=wmt13_deen.de +``` + +To evaluate all translations in a given directory (translated by +`t2t-translate-all`): + +``` +t2t-bleu + --translations_dir=my-translations + --reference=wmt13_deen.de + --event_dir=events +``` + +In addition to the above-mentioned required parameters, +there are optional parameters: + * bleu_variant: cased (case-sensitive), uncased, both (default). + * tag_suffix: Default="", so the tags will be BLEU_cased and BLEU_uncased. + tag_suffix can be used e.g. for different beam sizes if these should be + plotted in different graphs. + * min_steps: Don't evaluate checkpoints with less steps. + Default=-1 means check the `last_evaluated_step.txt` file, which contains + the number of steps of the last successfully evaluated checkpoint. + * report_zero: Store BLEU=0 and guess its time based on the oldest file in the + translations_dir. Default=True. This is useful, so TensorBoard reports + correct relative time for the remaining checkpoints. This flag is set to + False if min_steps is > 0. + * wait_minutes: Wait upto N minutes for a new translated file. Default=0. + This is useful for continuous evaluation of a running training, in which case + this should be equal to save_checkpoints_secs/60 plus time needed for + translation plus some reserve. +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os + +# Dependency imports + +from tensor2tensor.utils import bleu_hook +import tensorflow as tf + + +flags = tf.flags +FLAGS = flags.FLAGS + +flags.DEFINE_string("source", None, + "Path to the source-language file to be translated") +flags.DEFINE_string("reference", None, "Path to the reference translation file") +flags.DEFINE_string("translation", None, + "Path to the MT system translation file") +flags.DEFINE_string("translations_dir", None, + "Directory with translated files to be evaulated.") +flags.DEFINE_string("event_dir", None, "Where to store the event file.") + +flags.DEFINE_string("bleu_variant", "both", + "Possible values: cased(case-sensitive), uncased, " + "both(default).") +flags.DEFINE_string("tag_suffix", "", + "What to add to BLEU_cased and BLEU_uncased tags.") +flags.DEFINE_integer("min_steps", -1, + "Don't evaluate checkpoints with less steps.") +flags.DEFINE_integer("wait_minutes", 0, + "Wait upto N minutes for a new checkpoint, cf. 
" + "save_checkpoints_secs.") +flags.DEFINE_bool("report_zero", None, + "Store BLEU=0 and guess its time based on the oldest file.") + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.translation: + if FLAGS.translations_dir: + raise ValueError( + "Cannot specify both --translation and --translations_dir.") + if FLAGS.bleu_variant in ("uncased", "both"): + bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, FLAGS.translation, + case_sensitive=False) + print("BLEU_uncased = %6.2f" % bleu) + if FLAGS.bleu_variant in ("cased", "both"): + bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, FLAGS.translation, + case_sensitive=True) + print("BLEU_cased = %6.2f" % bleu) + return + + if not FLAGS.translations_dir: + raise ValueError( + "Either --translation or --translations_dir must be specified.") + transl_dir = os.path.expanduser(FLAGS.translations_dir) + + last_step_file = os.path.join(FLAGS.event_dir, "last_evaluated_step.txt") + if FLAGS.min_steps == -1: + if tf.gfile.Exists(last_step_file): + with open(last_step_file) as ls_file: + FLAGS.min_steps = int(ls_file.read()) + else: + FLAGS.min_steps = 0 + if FLAGS.report_zero is None: + FLAGS.report_zero = FLAGS.min_steps == 0 + + writer = tf.summary.FileWriter(FLAGS.event_dir) + for transl_file in bleu_hook.stepfiles_iterator( + transl_dir, FLAGS.wait_minutes, FLAGS.min_steps, path_suffix=""): + # report_zero handling must be inside the for-loop, + # so we are sure the transl_dir is already created. + if FLAGS.report_zero: + all_files = (os.path.join(transl_dir, f) for f in os.listdir(transl_dir)) + start_time = min( + os.path.getmtime(f) for f in all_files if os.path.isfile(f)) + values = [] + if FLAGS.bleu_variant in ("uncased", "both"): + values.append(tf.Summary.Value( + tag="BLEU_uncased" + FLAGS.tag_suffix, simple_value=0)) + if FLAGS.bleu_variant in ("cased", "both"): + values.append(tf.Summary.Value( + tag="BLEU_cased" + FLAGS.tag_suffix, simple_value=0)) + writer.add_event(tf.summary.Event(summary=tf.Summary(value=values), + wall_time=start_time, step=0)) + FLAGS.report_zero = False + + filename = transl_file.filename + tf.logging.info("Evaluating " + filename) + values = [] + if FLAGS.bleu_variant in ("uncased", "both"): + bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, filename, + case_sensitive=False) + values.append(tf.Summary.Value(tag="BLEU_uncased" + FLAGS.tag_suffix, + simple_value=bleu)) + tf.logging.info("%s: BLEU_uncased = %6.2f" % (filename, bleu)) + if FLAGS.bleu_variant in ("cased", "both"): + bleu = 100 * bleu_hook.bleu_wrapper(FLAGS.reference, filename, + case_sensitive=True) + values.append(tf.Summary.Value(tag="BLEU_cased" + FLAGS.tag_suffix, + simple_value=bleu)) + tf.logging.info("%s: BLEU_cased = %6.2f" % (transl_file.filename, bleu)) + writer.add_event(tf.summary.Event( + summary=tf.Summary(value=values), + wall_time=transl_file.mtime, step=transl_file.steps)) + writer.flush() + with open(last_step_file, "w") as ls_file: + ls_file.write(str(transl_file.steps) + "\n") + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/bin/t2t_datagen.py b/tensor2tensor/bin/t2t_datagen.py index c83428bc2..451b99a3a 100644 --- a/tensor2tensor/bin/t2t_datagen.py +++ b/tensor2tensor/bin/t2t_datagen.py @@ -29,6 +29,7 @@ from __future__ import division from __future__ import print_function +import multiprocessing import os import random import tempfile @@ -66,6 +67,11 @@ "If true, we only list the problems that will be generated.") flags.DEFINE_integer("random_seed", 429459, "Random 
seed to use.") flags.DEFINE_integer("task_id", -1, "For distributed data generation.") +flags.DEFINE_integer("task_id_start", -1, "For distributed data generation.") +flags.DEFINE_integer("task_id_end", -1, "For distributed data generation.") +flags.DEFINE_integer( + "num_concurrent_processes", 10, + "Applies only to problems for which multiprocess_generate=True.") flags.DEFINE_string("t2t_usr_dir", "", "Path to a Python module that will be imported. The " "__init__.py file should include the necessary imports. " @@ -195,17 +201,35 @@ def generate_data_for_problem(problem): generator_utils.shuffle_dataset(all_output_files) +def generate_data_in_process(arg): + problem_name, data_dir, tmp_dir, task_id = arg + problem = registry.problem(problem_name) + problem.generate_data(data_dir, tmp_dir, task_id) + + def generate_data_for_registered_problem(problem_name): tf.logging.info("Generating data for %s.", problem_name) if FLAGS.num_shards: raise ValueError("--num_shards should not be set for registered Problem.") problem = registry.problem(problem_name) task_id = None if FLAGS.task_id < 0 else FLAGS.task_id - problem.generate_data( - os.path.expanduser(FLAGS.data_dir), - os.path.expanduser(FLAGS.tmp_dir), - task_id=task_id) - + data_dir = os.path.expanduser(FLAGS.data_dir) + tmp_dir = os.path.expanduser(FLAGS.tmp_dir) + if task_id is None and problem.multiprocess_generate: + if FLAGS.task_id_start != -1: + assert FLAGS.task_id_end != -1 + task_id_start = FLAGS.task_id_start + task_id_end = FLAGS.task_id_end + else: + task_id_start = 0 + task_id_end = problem.num_generate_tasks + pool = multiprocessing.Pool(processes=FLAGS.num_concurrent_processes) + problem.prepare_to_generate(data_dir, tmp_dir) + args = [(problem_name, data_dir, tmp_dir, task_id) + for task_id in range(task_id_start, task_id_end)] + pool.map(generate_data_in_process, args) + else: + problem.generate_data(data_dir, tmp_dir, task_id) if __name__ == "__main__": tf.app.run() diff --git a/tensor2tensor/bin/t2t_decoder.py b/tensor2tensor/bin/t2t_decoder.py index 25358739a..132dac0e4 100644 --- a/tensor2tensor/bin/t2t_decoder.py +++ b/tensor2tensor/bin/t2t_decoder.py @@ -36,9 +36,9 @@ # Dependency imports -from tensor2tensor.tpu import tpu_trainer -from tensor2tensor.tpu import tpu_trainer_lib +from tensor2tensor.bin import t2t_trainer from tensor2tensor.utils import decoding +from tensor2tensor.utils import trainer_lib from tensor2tensor.utils import usr_dir import tensorflow as tf @@ -46,7 +46,7 @@ flags = tf.flags FLAGS = flags.FLAGS -# Additional flags in tpu/tpu_trainer.py and utils/flags.py +# Additional flags in bin/t2t_trainer.py and utils/flags.py flags.DEFINE_string("decode_from_file", None, "Path to the source file for decoding") flags.DEFINE_string("decode_to_file", None, @@ -57,7 +57,7 @@ def create_hparams(): - return tpu_trainer_lib.create_hparams( + return trainer_lib.create_hparams( FLAGS.hparams_set, FLAGS.hparams, data_dir=os.path.expanduser(FLAGS.data_dir), @@ -95,10 +95,10 @@ def main(_): hp = create_hparams() decode_hp = create_decode_hparams() - estimator = tpu_trainer_lib.create_estimator( + estimator = trainer_lib.create_estimator( FLAGS.model, hp, - tpu_trainer.create_run_config(hp), + t2t_trainer.create_run_config(hp), decode_hparams=decode_hp, use_tpu=False) diff --git a/tensor2tensor/bin/t2t_trainer.py b/tensor2tensor/bin/t2t_trainer.py index 571a21839..a984ca9db 100644 --- a/tensor2tensor/bin/t2t_trainer.py +++ b/tensor2tensor/bin/t2t_trainer.py @@ -13,7 +13,7 @@ # See the License for the specific 
language governing permissions and # limitations under the License. -"""Train on TPU.""" +"""Train and evaluate.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -26,10 +26,10 @@ from tensor2tensor import models # pylint: disable=unused-import from tensor2tensor import problems as problems_lib # pylint: disable=unused-import -from tensor2tensor.tpu import tpu_trainer_lib from tensor2tensor.utils import decoding from tensor2tensor.utils import flags as t2t_flags # pylint: disable=unused-import from tensor2tensor.utils import registry +from tensor2tensor.utils import trainer_lib from tensor2tensor.utils import usr_dir import tensorflow as tf @@ -38,7 +38,7 @@ FLAGS = flags.FLAGS # See flags.py for additional command-line flags. -flags.DEFINE_string("t2t_usr_dir", "", +flags.DEFINE_string("t2t_usr_dir", None, "Path to a Python module that will be imported. The " "__init__.py file should include the necessary imports. " "The imported files should contain registrations, " @@ -49,6 +49,8 @@ flags.DEFINE_integer("iterations_per_loop", 1000, "Number of iterations in a TPU training loop.") flags.DEFINE_bool("use_tpu", False, "Whether to use TPU.") +flags.DEFINE_integer("tpu_infeed_sleep_secs", None, + "How long to sleep the infeed thread.") flags.DEFINE_bool("generate_data", False, "Generate data before training?") flags.DEFINE_string("tmp_dir", "/tmp/t2t_datagen", "Temporary storage directory, used if --generate_data.") @@ -77,11 +79,15 @@ def get_problem_name(): def create_hparams(): - return tpu_trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams) + if FLAGS.use_tpu and "tpu" not in FLAGS.hparams_set: + tf.logging.warn("Not all hyperparameter sets work on TPU. When available " + "for a given model, prefer hparams_sets with a '_tpu' " + "suffix, e.g. 
transformer_tpu.") + return trainer_lib.create_hparams(FLAGS.hparams_set, FLAGS.hparams) def create_experiment_fn(): - return tpu_trainer_lib.create_experiment_fn( + return trainer_lib.create_experiment_fn( model_name=FLAGS.model, problem_name=get_problem_name(), data_dir=os.path.expanduser(FLAGS.data_dir), @@ -102,7 +108,7 @@ def create_experiment_fn(): def create_run_config(hp): - return tpu_trainer_lib.create_run_config( + return trainer_lib.create_run_config( model_dir=os.path.expanduser(FLAGS.output_dir), master=FLAGS.master, iterations_per_loop=FLAGS.iterations_per_loop, @@ -127,7 +133,9 @@ def create_run_config(hp): ps_gpu=FLAGS.ps_gpu, sync=FLAGS.sync, worker_id=FLAGS.worker_id, - worker_job=FLAGS.worker_job) + worker_job=FLAGS.worker_job, + random_seed=FLAGS.random_seed, + tpu_infeed_sleep_secs=FLAGS.tpu_infeed_sleep_secs) def generate_data(): @@ -161,6 +169,46 @@ def log_registry(): sys.exit(0) +def is_chief(): + schedules = ["train", "train_and_evaluate", "continuous_train_and_eval"] + return FLAGS.worker_id == 0 and FLAGS.schedule in schedules + + +def save_metadata(hparams): + """Saves FLAGS and hparams to output_dir.""" + output_dir = os.path.expanduser(FLAGS.output_dir) + if not tf.gfile.Exists(output_dir): + tf.gfile.MakeDirs(output_dir) + + # Save FLAGS in txt file + if hasattr(FLAGS, "flags_into_string"): + flags_str = FLAGS.flags_into_string() + t2t_flags_str = "\n".join([ + "--%s=%s" % (f.name, f.value) + for f in FLAGS.flags_by_module_dict()[ + "tensor2tensor.utils.flags"] + ]) + else: + flags_dict = FLAGS.__dict__["__flags"] + flags_str = "\n".join( + ["--%s=%s" % (name, str(f)) for (name, f) in flags_dict.items()]) + t2t_flags_str = None + + flags_txt = os.path.join(output_dir, "flags.txt") + with tf.gfile.Open(flags_txt, "w") as f: + f.write(flags_str) + + if t2t_flags_str: + t2t_flags_txt = os.path.join(output_dir, "flags_t2t.txt") + with tf.gfile.Open(t2t_flags_txt, "w") as f: + f.write(t2t_flags_str) + + # Save hparams as hparams.json + hparams_fname = os.path.join(output_dir, "hparams.json") + with tf.gfile.Open(hparams_fname, "w") as f: + f.write(hparams.to_json()) + + def execute_schedule(exp): if not hasattr(exp, FLAGS.schedule): raise ValueError( @@ -171,7 +219,7 @@ def execute_schedule(exp): def main(_): tf.logging.set_verbosity(tf.logging.INFO) - tpu_trainer_lib.set_random_seed(FLAGS.random_seed) + trainer_lib.set_random_seed(FLAGS.random_seed) usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) log_registry() @@ -181,6 +229,9 @@ def main(_): hparams = create_hparams() run_config = create_run_config(hparams) + if is_chief(): + save_metadata(hparams) + exp_fn = create_experiment_fn() exp = exp_fn(run_config, hparams) execute_schedule(exp) diff --git a/tensor2tensor/bin/t2t_translate_all.py b/tensor2tensor/bin/t2t_translate_all.py new file mode 100644 index 000000000..9827705c3 --- /dev/null +++ b/tensor2tensor/bin/t2t_translate_all.py @@ -0,0 +1,107 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Translate a file with all checkpoints in a given directory. + +t2t-decoder will be executed with these parameters: +--problems +--data_dir +--output_dir with the value of --model_dir +--decode_from_file with the value of --source +--decode_hparams with properly formatted --beam_size and --alpha +--checkpoint_path automatically filled +--decode_to_file automatically filled +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import shutil + +# Dependency imports + +from tensor2tensor.utils import bleu_hook + +import tensorflow as tf + +flags = tf.flags +FLAGS = flags.FLAGS + +# t2t-translate-all specific options +flags.DEFINE_string("decoder_command", "t2t-decoder {params}", + "Which command to execute instead t2t-decoder. " + "{params} is replaced by the parameters. Useful e.g. for " + "qsub wrapper.") +flags.DEFINE_string("model_dir", "", + "Directory to load model checkpoints from.") +flags.DEFINE_string("source", None, + "Path to the source-language file to be translated") +flags.DEFINE_string("translations_dir", "translations", + "Where to store the translated files.") +flags.DEFINE_integer("min_steps", 0, "Ignore checkpoints with less steps.") +flags.DEFINE_integer("wait_minutes", 0, + "Wait upto N minutes for a new checkpoint") + +# options derived from t2t-decoder +flags.DEFINE_integer("beam_size", 4, "Beam-search width.") +flags.DEFINE_float("alpha", 0.6, "Beam-search alpha.") +flags.DEFINE_string("model", "transformer", "see t2t-decoder") +flags.DEFINE_string("t2t_usr_dir", None, "see t2t-decoder") +flags.DEFINE_string("data_dir", None, "see t2t-decoder") +flags.DEFINE_string("problems", None, "see t2t-decoder") +flags.DEFINE_string("hparams_set", "transformer_big_single_gpu", + "see t2t-decoder") + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + # pylint: disable=unused-variable + model_dir = os.path.expanduser(FLAGS.model_dir) + translations_dir = os.path.expanduser(FLAGS.translations_dir) + source = os.path.expanduser(FLAGS.source) + tf.gfile.MakeDirs(translations_dir) + translated_base_file = os.path.join(translations_dir, FLAGS.problems) + + # Copy flags.txt with the original time, so t2t-bleu can report correct + # relative time. 
+ flags_path = os.path.join(translations_dir, FLAGS.problems + "-flags.txt") + if not os.path.exists(flags_path): + shutil.copy2(os.path.join(model_dir, "flags.txt"), flags_path) + + for model in bleu_hook.stepfiles_iterator(model_dir, FLAGS.wait_minutes, + FLAGS.min_steps): + tf.logging.info("Translating " + model.filename) + out_file = translated_base_file + "-" + str(model.steps) + if os.path.exists(out_file): + tf.logging.info(out_file + " already exists, so skipping it.") + else: + tf.logging.info("Translating " + out_file) + params = ( + "--t2t_usr_dir={FLAGS.t2t_usr_dir} --output_dir={model_dir} " + "--data_dir={FLAGS.data_dir} --problems={FLAGS.problems} " + "--decode_hparams=beam_size={FLAGS.beam_size},alpha={FLAGS.alpha} " + "--model={FLAGS.model} --hparams_set={FLAGS.hparams_set} " + "--checkpoint_path={model.filename} --decode_from_file={source} " + "--decode_to_file={out_file}" + ).format(**locals()) + command = FLAGS.decoder_command.format(**locals()) + tf.logging.info("Running:\n" + command) + os.system(command) + # pylint: enable=unused-variable + + +if __name__ == "__main__": + tf.app.run() diff --git a/tensor2tensor/data_generators/generator_utils.py b/tensor2tensor/data_generators/generator_utils.py index c657a503f..b73b63d74 100644 --- a/tensor2tensor/data_generators/generator_utils.py +++ b/tensor2tensor/data_generators/generator_utils.py @@ -316,8 +316,7 @@ def get_or_generate_vocab_inner(data_dir, vocab_filename, vocab_size, def get_or_generate_vocab(data_dir, tmp_dir, vocab_filename, vocab_size, - sources, - _file_byte_budget=1e6): + sources, file_byte_budget=1e6): """Generate a vocabulary from the datasets in sources.""" def generate(): @@ -350,17 +349,17 @@ def generate(): # Use Tokenizer to count the word occurrences. with tf.gfile.GFile(filepath, mode="r") as source_file: - file_byte_budget = _file_byte_budget + file_byte_budget_ = file_byte_budget counter = 0 - countermax = int(source_file.size() / file_byte_budget / 2) + countermax = int(source_file.size() / file_byte_budget_ / 2) for line in source_file: if counter < countermax: counter += 1 else: - if file_byte_budget <= 0: + if file_byte_budget_ <= 0: break line = line.strip() - file_byte_budget -= len(line) + file_byte_budget_ -= len(line) counter = 0 yield line diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 52d7bdab2..53fa48740 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -102,6 +102,7 @@ def default_model_hparams(): max_input_seq_length=0, max_target_seq_length=0, prepend_mode="none", + split_to_length=0, data_dir=None) @@ -117,6 +118,12 @@ def preprocess_example_common(example, hparams, mode): else: example["targets"] = tf.concat( [example["inputs"], [0], example["targets"]], 0) + if hparams.split_to_length: + example["targets"] = tf.reshape( + example["targets"], [-1, hparams.split_to_length, 1, 1]) + if len(example) != 1: + raise ValueError("split_to_length only works for LM problems") + return tf.data.Dataset.from_tensor_slices(example) return example @@ -195,9 +202,67 @@ class Problem(object): def generate_data(self, data_dir, tmp_dir, task_id=-1): raise NotImplementedError() + @property + def multiprocess_generate(self): + """Whether to generate the data in multiple parallel processes.""" + return False + + @property + def num_generate_tasks(self): + """Needed if multiprocess_generate is True.""" + raise NotImplementedError() + + def prepare_to_generate(self, data_dir, tmp_dir): + 
"""Prepare to generate data in parallel on different processes. + + This function is called if multiprocess_generate is True. + + Some things that might need to be done once are downloading the data + if it is not yet downloaded, and building the vocabulary. + + Args: + data_dir: a string + tmp_dir: a string + """ + raise NotImplementedError() + def hparams(self, defaults, model_hparams): pass + def max_length(self, model_hparams): + """Maximum sequence length. + + Problems with fixed length should override. + + Args: + model_hparams: model hyperparameters + Returns: + an integer + """ + return ( + model_hparams.split_to_length or + model_hparams.max_length or + model_hparams.batch_size) + + @property + def batch_size_means_tokens(self): + """Do we specify hparams.batch_size in tokens per datashard per batch. + + This is generally done for text problems. + + If False, we assume that batch sizes are specified in examples per + datashard per batch. + + TODO(noam): we should be more explicit and replace the hyperparameter + batch size with two hyperparameters: + hparams.examples_per_batch_per_datashard + hparams.tokens_per_batch_per_datashard + + Returns: + a boolean + """ + return False + def dataset_filename(self): return self.name @@ -217,6 +282,19 @@ def example_reading_spec(self): return (data_fields, data_items_to_decoders) def preprocess_example(self, example, mode, hparams): + """Runtime preprocessing. + + Return a dict or a tf.Data.Datset.from_tensor_slices (if you want each + example to turn into multiple). + + Args: + example: dict, features + mode: tf.estimator.ModeKeys + hparams: HParams, model hyperparameters + + Returns: + dict or Dataset + """ return preprocess_example_common(example, hparams, mode) def eval_metrics(self): @@ -343,6 +421,7 @@ def dataset(self, num_threads=None, output_buffer_size=None, shuffle_files=None, + repeat=None, hparams=None, preprocess=True, dataset_split=None, @@ -358,6 +437,8 @@ def dataset(self, calls. shuffle_files: whether to shuffle input files. Default behavior (i.e. when shuffle_files=None) is to shuffle if mode == TRAIN. + repeat: whether to repeat the Dataset. Default behavior is to repeat if + mode == TRAIN. hparams: tf.contrib.training.HParams; hparams to be passed to Problem.preprocess_example and Problem.hparams. If None, will use a default set that is a no-op. @@ -370,6 +451,10 @@ def dataset(self, Returns: Dataset containing dict. """ + is_training = mode == tf.estimator.ModeKeys.TRAIN + repeat = repeat or repeat is None and is_training + shuffle_files = shuffle_files or shuffle_files is None and is_training + dataset_split = dataset_split or mode assert data_dir @@ -383,32 +468,56 @@ def dataset(self, # Construct the Problem's hparams so that items within it are accessible _ = self.get_hparams(hparams) - is_training = mode == tf.estimator.ModeKeys.TRAIN data_filepattern = self.filepattern(data_dir, dataset_split, shard=shard) tf.logging.info("Reading data files from %s", data_filepattern) - data_files = tf.contrib.slim.parallel_reader.get_data_files( - data_filepattern) - if shuffle_files or shuffle_files is None and is_training: - # In addition to shuffling the list of file names, we skip a random - # fraction of the first file. The skip is essential for synchronous - # highly-parallel training. Otherwise, we have multiple replicas - # reading the same shard in lock-step. 
- num_skip = random.randint(0, _file_num_records_cached(data_files[0])) - random.shuffle(data_files) - dataset = tf.data.TFRecordDataset(data_files).skip(num_skip) + dataset = tf.data.Dataset.list_files(data_filepattern) + + if shuffle_files: + dataset = dataset.shuffle(buffer_size=1024) + + def _load_records(filename): + return tf.data.TFRecordDataset(filename, buffer_size=16 * 1000 * 1000) + + if hasattr(tf.contrib.data, "parallel_interleave"): + interleave = lambda ds, fn: ds.apply( # pylint: disable=g-long-lambda + tf.contrib.data.parallel_interleave( + fn, sloppy=is_training, cycle_length=16)) else: - dataset = tf.data.TFRecordDataset(data_files) + interleave = lambda ds, fn: ds.interleave(fn, cycle_length=16) - def _preprocess(example): - example = self.preprocess_example(example, mode, hparams) + dataset = interleave(dataset, _load_records) + + if repeat: + dataset = dataset.repeat() + + if shuffle_files: + # Skip a random fraction at the beginning of the stream. The skip is + # essential for synchronous highly-parallel training to avoid multiple + # replicas reading the same data in lock-step. + data_files = tf.contrib.slim.parallel_reader.get_data_files( + data_filepattern) + num_skip = random.randint(0, _file_num_records_cached(data_files[0])) + dataset = dataset.skip(num_skip) + + def _maybe_reverse_and_copy(example): self.maybe_reverse_features(example) self.maybe_copy_features(example) return example + def _preprocess(example): + examples = self.preprocess_example(example, mode, hparams) + if not isinstance(examples, tf.data.Dataset): + examples = tf.data.Dataset.from_tensors(examples) + return examples + dataset = dataset.map(self.decode_example, num_parallel_calls=num_threads) if preprocess: - dataset = dataset.map(_preprocess, num_parallel_calls=num_threads) + dataset = interleave(dataset, _preprocess) + + dataset = dataset.map( + _maybe_reverse_and_copy, num_parallel_calls=num_threads) + if output_buffer_size: dataset = dataset.prefetch(output_buffer_size) @@ -507,18 +616,23 @@ def input_fn(self, mode, hparams, data_dir=None, params=None, config=None, (features_dict, Tensor targets) """ is_training = mode == tf.estimator.ModeKeys.TRAIN - num_threads = 4 if is_training else 1 + if config.use_tpu: + num_threads = 32 + else: + num_threads = 4 if is_training else 1 + + max_length = self.max_length(hparams) def tpu_valid_size(example): return data_reader.example_valid_size(example, hparams.min_length, - hparams.max_length) + max_length) def gpu_valid_size(example): drop_long_sequences = is_training or hparams.eval_drop_long_sequences return data_reader.example_valid_size( example, hparams.min_length, - hparams.max_length if drop_long_sequences else 10**9) + max_length if drop_long_sequences else 10**9) def define_shapes(example): batch_size = config and config.use_tpu and params["batch_size"] @@ -540,10 +654,24 @@ def define_shapes(example): if is_training: dataset = dataset.repeat(None) + if self.batch_size_means_tokens: + batch_size_means_tokens = True + else: + if _are_shapes_fully_defined(dataset.output_shapes): + batch_size_means_tokens = False + else: + tf.logging.warning( + "Shapes are not fully defined. Assuming batch_size means tokens. " + "You should probably override batch_size_means_tokens() " + "in your problem subclass") + batch_size_means_tokens = True + # Batching - if _are_shapes_fully_defined(dataset.output_shapes): - # Static shape features (e.g. images) + if not batch_size_means_tokens: + # Batch size means examples per datashard. 
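+      # This branch is reached when all feature shapes are fully defined
+      # (e.g. image problems) and the problem does not set
+      # batch_size_means_tokens, so whole examples are batched directly with
+      # no length bucketing or padding to max_length.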
if config and config.use_tpu: + # on TPU, we use params["batch_size"], which specifies the number of + # examples across all datashards tpu_batch_size = params["batch_size"] dataset = dataset.apply( tf.contrib.data.batch_and_drop_remainder(tpu_batch_size)) @@ -551,12 +679,14 @@ def define_shapes(example): num_shards = (config and config.data_parallelism.n) or 1 dataset = dataset.batch(hparams.batch_size * num_shards) else: - # Variable length features + # batch_size means tokens per datashard if config and config.use_tpu: - # On TPU, pad to hparams.max_length + # On TPU, pad to max_length dataset = dataset.filter(tpu_valid_size) padded_shapes = _fill_shape_nones( - dataset.output_shapes, none_filler=hparams.max_length) + dataset.output_shapes, none_filler=max_length) + # on TPU, we use params["batch_size"], which specifies the number of + # examples across all datashards dataset = dataset.apply( tf.contrib.data.padded_batch_and_drop_remainder( params["batch_size"], padded_shapes)) @@ -568,6 +698,7 @@ def define_shapes(example): shard_multiplier=(config and config.data_parallelism.n) or 1, length_multiplier=self.get_hparams().batch_size_multiplier) if hparams.use_fixed_batch_size: + # Here batch_size really means examples per datashard. batching_scheme["batch_sizes"] = [hparams.batch_size] batching_scheme["boundaries"] = [] dataset = data_reader.bucket_by_sequence_length( @@ -590,7 +721,7 @@ def _pad_batch(features): dataset = dataset.map(_pad_batch, num_parallel_calls=num_threads) dataset = dataset.map(define_shapes, num_parallel_calls=num_threads) - dataset = dataset.prefetch(1) + dataset = dataset.prefetch(2) features = dataset.make_one_shot_iterator().get_next() if not config or not config.use_tpu: _summarize_features(features, (config and config.data_parallelism.n) or 1) @@ -738,6 +869,10 @@ def is_character_level(self): def targeted_vocab_size(self): raise NotImplementedError() # Not needed if self.is_character_level. + @property + def batch_size_means_tokens(self): + return True + def generator(self, data_dir, tmp_dir, is_training): """Generator for the training and evaluation data. @@ -764,6 +899,12 @@ def packed_length(self): """ return None + def max_length(self, model_hparams): + """Maximum sequence length.""" + if self.packed_length: + return self.packed_length + return super(Text2TextProblem, self).max_length(model_hparams) + @property def use_train_shards_for_dev(self): """If true, we only generate training data and hold out shards for dev.""" @@ -891,6 +1032,261 @@ def eval_metrics(self): ] +class ChoppedTextProblem(Text2TextProblem): + """Tokenize and chop text files into fixed-length language-modeling examples. + + The input data is a set of text files, as specified by + self.train_text_filepaths() and self.dev_text_filepaths(). + + The text is tokenized using a SubwordTextEncoder, and + then split into examples, each of length self.sequence_length(). + """ + + def train_text_filepaths(self, tmp_dir): + """Local filepaths of text files containing training data. + + This function may want to download the files if they do not exist. + + Args: + tmp_dir: a string + Returns: + a list of strings. + """ + raise NotImplementedError() + + def dev_text_filepaths(self, tmp_dir): + """Local filepaths of text files containing dev data. + + This function may want to download the files if they do not exist. + + Args: + tmp_dir: a string + Returns: + a list of strings. 
+ """ + raise NotImplementedError() + + @property + def sequence_length(self): + """Length of each example (in tokens).""" + raise NotImplementedError() + + def max_length(self, model_hparams): + return model_hparams.split_to_length or self.sequence_length + + @property + def is_character_level(self): + return False + + def text_filepaths_for_task(self, tmp_dir, task_id): + """List of input filepaths for a particular training or dev shard. + + Args: + tmp_dir: a string + task_id: an integer less than self.num_shards + Returns: + a list of tuples (filepath, start_pos, num_bytes) + """ + assert task_id >= 0 + assert task_id < self.num_train_shards + self.num_dev_shards + if task_id < self.num_train_shards: + return [f for i, f in enumerate(self.train_text_filepaths(tmp_dir)) + if i % self.num_train_shards == task_id] + else: + return [f for i, f in enumerate(self.dev_text_filepaths(tmp_dir)) + if i % self.num_dev_shards == task_id - self.num_train_shards] + + def filepath_to_unicode_strings(self, filepath): + """Read text out of an input file. + + The default just reads the text, converts to unicode and yields one + unicode string. + + Subclasses can override this function in order to preprocess, and can + yield any number of strings. + + Args: + filepath: a string + Yields: + unicode strings. + """ + f = tf.gfile.Open(filepath) + b = f.read() + yield to_unicode_ignore_erros(b) + + def file_generator(self, + filepaths, + max_chars_per_file=None, + max_chars_total=None): + """Read complete text of input files and yield unicode strings. + + By default, one unicode string is produced per file, but this is + not guaranteed, since subclasses can override + filepath_to_unicode_strings(). + + max_chars_per_file and max_chars_total can also be specified, in which + case some strings may be truncated or dropped to limit the total + amount of output. + + Args: + filepaths: a list of strings + max_chars_per_file: an optional integer + max_chars_total: an optional integer + Yields: + unicode strings + """ + chars_total = 0 + for fname in filepaths: + chars_this_file = 0 + tf.logging.info("reading file %s" % fname) + for text in self.filepath_to_unicode_strings(fname): + if (max_chars_per_file and chars_this_file + len(text) + > max_chars_per_file): + text = text[:max_chars_per_file - chars_this_file] + if max_chars_total and chars_total + len(text) > max_chars_total: + text = text[:max_chars_total - chars_total] + chars_total += len(text) + chars_this_file += len(text) + if text: + yield text + if max_chars_total and chars_total >= max_chars_total: + return + if max_chars_per_file and chars_this_file >= max_chars_per_file: + break + + def example_generator(self, encoder, tmp_dir, task_id): + """Generator for examples. + + Args: + encoder: a TextEncoder + tmp_dir: a string + task_id: an integer + Yields: + feature dictionaries + """ + filepaths = self.text_filepaths_for_task(tmp_dir, task_id) + if task_id >= self.num_train_shards: + # this is dev data - limit the total length. 
+ max_chars_per_file = self.max_dev_chars // ( + self.num_dev_shards * len(filepaths)) + else: + max_chars_per_file = None + tokens = [] + for ftext in self.file_generator( + filepaths, max_chars_per_file=max_chars_per_file): + tokens.extend(encoder.encode(ftext)) + pos = 0 + while pos + self.sequence_length <= len(tokens): + yield {"inputs": [0], "targets": tokens[pos:pos + self.sequence_length]} + pos += self.sequence_length + if pos > 0: + tokens = tokens[pos:] + if self.remainder_policy == "pad": + if tokens: + targets = tokens + [0] * (self.sequence_length - len(tokens)) + yield {"inputs": [0], "targets": targets} + else: + assert self.remainder_policy == "drop" + + @property + def remainder_policy(self): + """What to do with leftover tokens. + + Returns: + a string - either "pad" or "drop". + """ + return "pad" + + def prepare_to_generate(self, data_dir, tmp_dir): + """Make sure that the data is prepared and the vocab is generated.""" + self.get_or_generate_vocab(data_dir, tmp_dir) + self.train_text_filepaths(tmp_dir) + self.dev_text_filepaths(tmp_dir) + + def get_or_generate_vocab(self, data_dir, tmp_dir): + return generator_utils.get_or_generate_vocab_inner( + data_dir, self.vocab_file, self.targeted_vocab_size, + self.file_generator( + self.train_text_filepaths(tmp_dir), + max_chars_total=self.max_chars_for_vocab)) + + def generate_data(self, data_dir, tmp_dir, task_id=-1): + """Generates training/dev data. + + Args: + data_dir: a string + tmp_dir: a string + task_id: an optional integer + Returns: + shard or shards for which data was generated. + """ + tf.logging.info("generate_data task_id=%s" % task_id) + encoder = self.get_or_generate_vocab(data_dir, tmp_dir) + assert task_id >= 0 and task_id < self.num_generate_tasks + if task_id < self.num_train_shards: + out_file = self.training_filepaths( + data_dir, self.num_train_shards, shuffled=False)[task_id] + else: + out_file = self.dev_filepaths( + data_dir, self.num_dev_shards, + shuffled=False)[task_id - self.num_train_shards] + generator_utils.generate_files( + self.example_generator(encoder, tmp_dir, task_id), [out_file]) + generator_utils.shuffle_dataset([out_file]) + + @property + def max_chars_for_vocab(self): + """Number of characters of training data to use for generating vocab.""" + return 10 ** 7 + + @property + def target_space_id(self): + return SpaceID.EN_TOK + + @property + def num_train_shards(self): + return 100 + + @property + def num_dev_shards(self): + return 1 + + @property + def max_dev_chars(self): + """Limit dev set to at most this many characters (default 10M).""" + return 10 ** 7 + + @property + def multiprocess_generate(self): + return True + + @property + def num_generate_tasks(self): + return self.num_train_shards + self.num_dev_shards + + @property + def vocab_name(self): + raise NotImplementedError() + + @property + def use_subword_tokenizer(self): + return True + + @property + def has_inputs(self): + return False + + def eval_metrics(self): + return [ + metrics.Metrics.ACC, metrics.Metrics.NEG_LOG_PERPLEXITY + ] + + +def to_unicode_ignore_erros(s): + return (unicode(s, "utf-8", errors="ignore") if six.PY2 else + s.decode("utf-8", "ignore")) + + def _are_shapes_fully_defined(shapes_dict): for shape in shapes_dict.values(): if not shape.is_fully_defined(): diff --git a/tensor2tensor/data_generators/text_encoder.py b/tensor2tensor/data_generators/text_encoder.py index 6930b205e..d43236945 100644 --- a/tensor2tensor/data_generators/text_encoder.py +++ b/tensor2tensor/data_generators/text_encoder.py @@ 
-436,6 +436,23 @@ def encode(self, raw_text): return self._tokens_to_subtoken_ids( tokenizer.encode(native_to_unicode(raw_text))) + def encode_without_tokenizing(self, token_text): + """Converts string to list of subtoken ids without calling tokenizer. + + This treats `token_text` as a single token and directly converts it + to subtoken ids. This may be useful when the default tokenizer doesn't + do what we want (e.g., when encoding text with tokens composed of lots of + nonalphanumeric characters). It is then up to the caller to make sure that + raw text is consistently converted into tokens. Only use this if you are + sure that `encode` doesn't suit your needs. + + Args: + token_text: A native string representation of a single token. + Returns: + A list of subword token ids; i.e., integers in the range [0, vocab_size). + """ + return self._tokens_to_subtoken_ids([native_to_unicode(token_text)]) + def decode(self, subtokens): """Converts a sequence of subtoken ids to a native string. @@ -559,6 +576,8 @@ def build_to_target_size(cls, token_counts, min_val, max_val, + max_subtoken_length=None, + reserved_tokens=None, num_iterations=4): """Builds a SubwordTextEncoder that has `vocab_size` near `target_size`. @@ -570,6 +589,13 @@ def build_to_target_size(cls, token_counts: A dictionary of token counts, mapping string to int. min_val: An integer; lower bound for the minimum token count. max_val: An integer; upper bound for the minimum token count. + max_subtoken_length: Maximum length of a subtoken. If this is not set, + then the runtime and memory use of creating the vocab is quadratic in + the length of the longest token. If this is set, then it is instead + O(max_subtoken_length * length of longest token). + reserved_tokens: List of reserved tokens. The global variable + `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this + argument is `None`, it will use `RESERVED_TOKENS`. num_iterations: An integer; how many iterations of refinement. Returns: @@ -584,13 +610,18 @@ def build_to_target_size(cls, if target_size < 1: raise ValueError("Target size must be positive.") + if reserved_tokens is None: + reserved_tokens = RESERVED_TOKENS + def bisect(min_val, max_val): """Bisection to find the right size.""" present_count = (max_val + min_val) // 2 tf.logging.info("Trying min_count %d" % present_count) subtokenizer = cls() - subtokenizer.build_from_token_counts(token_counts, present_count, - num_iterations) + subtokenizer.build_from_token_counts( + token_counts, present_count, num_iterations, + max_subtoken_length=max_subtoken_length, + reserved_tokens=reserved_tokens) # Being within 1% of the target size is ok. is_ok = abs(subtokenizer.vocab_size - target_size) * 100 < target_size @@ -617,36 +648,47 @@ def build_from_token_counts(self, token_counts, min_count, num_iterations=4, - num_reserved_ids=NUM_RESERVED_TOKENS): + reserved_tokens=None, + max_subtoken_length=None): """Train a SubwordTextEncoder based on a dictionary of word counts. Args: token_counts: a dictionary of Unicode strings to int. min_count: an integer - discard subtokens with lower counts. num_iterations: an integer. how many iterations of refinement. - num_reserved_ids: an integer. how many ids to reserve for special tokens. + reserved_tokens: List of reserved tokens. The global variable + `RESERVED_TOKENS` must be a prefix of `reserved_tokens`. If this + argument is `None`, it will use `RESERVED_TOKENS`. + max_subtoken_length: Maximum length of a subtoken. 
If this is not set, + then the runtime and memory use of creating the vocab is quadratic in + the length of the longest token. If this is set, then it is instead + O(max_subtoken_length * length of longest token). Raises: ValueError: if reserved is not 0 or len(RESERVED_TOKENS). In this case, it is not clear what the space is being reserved for, or when it will be filled in. """ + if reserved_tokens is None: + reserved_tokens = RESERVED_TOKENS + else: + # There is not complete freedom in replacing RESERVED_TOKENS. + for default, proposed in zip(RESERVED_TOKENS, reserved_tokens): + if default != proposed: + raise ValueError("RESERVED_TOKENS must be a prefix of " + "reserved_tokens.") + # Initialize the alphabet. Note, this must include reserved tokens or it can # result in encoding failures. - if num_reserved_ids == NUM_RESERVED_TOKENS: - alphabet_tokens = chain(six.iterkeys(token_counts), - [native_to_unicode(t) for t in RESERVED_TOKENS]) - elif num_reserved_ids == 0: - alphabet_tokens = six.iterkeys(token_counts) - else: - raise ValueError("Unexpected value for reserved. What is being reserved?") + alphabet_tokens = chain(six.iterkeys(token_counts), + [native_to_unicode(t) for t in reserved_tokens]) self._init_alphabet_from_tokens(alphabet_tokens) # Bootstrap the initial list of subtokens with the characters from the # alphabet plus the escaping characters. - self._init_subtokens_from_list( - list(self._alphabet), reserved=num_reserved_ids) + self._init_subtokens_from_list(list(self._alphabet), + reserved_tokens=reserved_tokens) # We build iteratively. On each iteration, we segment all the words, # then count the resulting potential subtokens, keeping the ones @@ -664,7 +706,11 @@ def build_from_token_counts(self, subtokens = self._escaped_token_to_subtoken_strings(escaped_token) start = 0 for subtoken in subtokens: - for end in xrange(start + 1, len(escaped_token) + 1): + last_position = len(escaped_token) + 1 + if max_subtoken_length is not None: + last_position = min(last_position, start + max_subtoken_length) + + for end in xrange(start + 1, last_position): new_subtoken = escaped_token[start:end] subtoken_counts[new_subtoken] += count start += len(subtoken) @@ -700,13 +746,9 @@ def build_from_token_counts(self, # Reinitialize to the candidate vocabulary. new_subtoken_strings = [subtoken for _, subtoken in new_subtoken_strings] - if num_reserved_ids == len(RESERVED_TOKENS): - new_subtoken_strings = RESERVED_TOKENS + new_subtoken_strings - elif num_reserved_ids == 0: - pass - else: - raise ValueError("num_reserved_ids must be 0 or %d but was %d" % - NUM_RESERVED_TOKENS, num_reserved_ids) + if reserved_tokens: + new_subtoken_strings = reserved_tokens + new_subtoken_strings + self._init_subtokens_from_list(new_subtoken_strings) tf.logging.info("vocab_size = %d" % self.vocab_size) @@ -721,32 +763,33 @@ def dump(self): print(u", ".join(u"{0} : '{1}'".format(i, s) for i, s in sorted(subtoken_strings))) - def _init_subtokens_from_list(self, subtoken_strings, reserved=0): + def _init_subtokens_from_list(self, subtoken_strings, reserved_tokens=None): """Initialize token information from a list of subtoken strings. Args: subtoken_strings: a list of subtokens - reserved: number of spaces to save at the beginning for reserved tokens + reserved_tokens: List of reserved tokens. We must have `reserved_tokens` + as None or the empty list, or else the global variable `RESERVED_TOKENS` + must be a prefix of `reserved_tokens`. Raises: ValueError: if reserved is not 0 or len(RESERVED_TOKENS). 
In this case, it is not clear what the space is being reserved for, or when it will be filled in. """ - if reserved == 0: - self._all_subtoken_strings = subtoken_strings - elif reserved == len(RESERVED_TOKENS): - self._all_subtoken_strings = RESERVED_TOKENS + subtoken_strings + if reserved_tokens is None: + reserved_tokens = [] + + if reserved_tokens: + self._all_subtoken_strings = reserved_tokens + subtoken_strings else: - # TODO(dtarlow): or should we fall back to the previous behavior and - # insert copies of "" for each reserved count? - raise ValueError("Unexpected value for reserved. What is being reserved?") + self._all_subtoken_strings = subtoken_strings # we remember the maximum length of any subtoken to avoid having to # check arbitrarily long strings. self._max_subtoken_len = max([len(s) for s in subtoken_strings]) self._subtoken_string_to_id = { - s: i + reserved + s: i + len(reserved_tokens) for i, s in enumerate(subtoken_strings) if s } # Initialize the cache to empty. @@ -817,8 +860,13 @@ def encode(self, s): Returns: ids: list of integers """ - # TODO(lukaszkaiser): implement this. - raise NotImplementedError + try: + import matplotlib.image as im # pylint: disable=g-import-not-at-top + except ImportError as e: + tf.logging.warning( + "Reading an image requires matplotlib to be installed: %s", e) + raise NotImplementedError("Image reading not implemented.") + return im.imread(s) def decode(self, ids): """Transform a sequence of int ids into an image file. diff --git a/tensor2tensor/data_generators/text_encoder_test.py b/tensor2tensor/data_generators/text_encoder_test.py index 8364afafd..273810684 100644 --- a/tensor2tensor/data_generators/text_encoder_test.py +++ b/tensor2tensor/data_generators/text_encoder_test.py @@ -23,11 +23,14 @@ import collections import io import os +import random import shutil +import string # Dependency imports import mock import six +from six.moves import xrange # pylint: disable=redefined-builtin from tensor2tensor.data_generators import text_encoder import tensorflow as tf @@ -120,7 +123,7 @@ def test_encode_decode(self): "to build a vocabulary. It will be used when strings are encoded " "with a TextEncoder subclass. The encoder was coded by a coder.") token_counts = collections.Counter(corpus.split(" ")) - alphabet = set(corpus) ^ {" "} + alphabet = set(corpus) - {" "} original = "This is a coded sentence encoded by the SubwordTextEncoder." token_counts.update(original.split(" ")) @@ -161,7 +164,7 @@ def test_unicode(self): def test_small_vocab(self): corpus = "The quick brown fox jumps over the lazy dog" token_counts = collections.Counter(corpus.split(" ")) - alphabet = set(corpus) ^ {" "} + alphabet = set(corpus) - {" "} encoder = text_encoder.SubwordTextEncoder.build_to_target_size( 10, token_counts, 2, 10) @@ -173,6 +176,63 @@ def test_small_vocab(self): for a in alphabet: self.assertIn(a, encoder.all_subtoken_strings) + def test_long_tokens(self): + """Subword tokenization should still run efficiently with long tokens. + + To make it run efficiently, we need to use the `max_subtoken_length` + argument when calling SubwordTextEncoder.build_to_target_size. + """ + token_length = 4000 + num_tokens = 50 + target_vocab_size = 600 + max_subtoken_length = 10 # Set this to `None` to get problems. + max_count = 500 + + # Generate some long random strings. 
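+    # Each token is token_length (4000) random uppercase characters. Without
+    # max_subtoken_length, build_to_target_size enumerates candidate subtokens
+    # quadratically in the length of the longest token, which is the blow-up
+    # this test guards against.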
+ random.seed(0) + long_tokens = [] + for _ in range(num_tokens): + long_token = "".join([random.choice(string.ascii_uppercase) + for _ in xrange(token_length)]) + long_tokens.append(long_token) + + corpus = " ".join(long_tokens) + token_counts = collections.Counter(corpus.split(" ")) + alphabet = set(corpus) - {" "} + + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + target_vocab_size, token_counts, 1, max_count, num_iterations=1, + max_subtoken_length=max_subtoken_length) + + # All vocabulary elements are in the alphabet and subtoken strings even + # if we requested a smaller vocabulary to assure all expected strings + # are encodable. + self.assertTrue(alphabet.issubset(encoder._alphabet)) + for a in alphabet: + self.assertIn(a, encoder.all_subtoken_strings) + + def test_custom_reserved_tokens(self): + """Test that we can pass custom reserved tokens to SubwordTextEncoder.""" + corpus = "The quick brown fox jumps over the lazy dog" + token_counts = collections.Counter(corpus.split(" ")) + + start_symbol = "" + end_symbol = "" + reserved_tokens = text_encoder.RESERVED_TOKENS + [start_symbol, + end_symbol] + encoder = text_encoder.SubwordTextEncoder.build_to_target_size( + 10, token_counts, 2, 10, reserved_tokens=reserved_tokens) + + # Make sure that reserved tokens appear in the right places. + start_id = encoder._subtoken_string_to_id[start_symbol] + end_id = encoder._subtoken_string_to_id[end_symbol] + self.assertEqual(start_id, 2) + self.assertEqual(end_id, 3) + + # Make sure that we haven't messed up the ability to reconstruct. + reconstructed_corpus = encoder.decode(encoder.encode(corpus)) + self.assertEqual(corpus, reconstructed_corpus) + def test_encodable_when_not_in_alphabet(self): corpus = "the quick brown fox jumps over the lazy dog" token_counts = collections.Counter(corpus.split(" ")) diff --git a/tensor2tensor/data_generators/translate_enzh.py b/tensor2tensor/data_generators/translate_enzh.py index d3ddd8d98..0a645b3bb 100644 --- a/tensor2tensor/data_generators/translate_enzh.py +++ b/tensor2tensor/data_generators/translate_enzh.py @@ -46,9 +46,10 @@ # News Commentary, around 220k lines # This dataset is only a small fraction of full WMT17 task _NC_TRAIN_DATASETS = [[ - "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12.tgz", - ["training/news-commentary-v12.zh-en.en", - "training/news-commentary-v12.zh-en.zh"]]] + "http://data.statmt.org/wmt17/translation-task/training-parallel-nc-v12" + ".tgz", + ["training/news-commentary-v12.zh-en.en", + "training/news-commentary-v12.zh-en.zh"]]] # Test set from News Commentary. 2000 lines _NC_TEST_DATASETS = [[ @@ -63,117 +64,96 @@ # NOTE: You need to register to download dataset from official source # place into tmp directory e.g. /tmp/t2t_datagen/dataset.tgz _UN_TRAIN_DATASETS = [[ - "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/UNv1.0.en-zh.tar.gz", - ["en-zh/UNv1.0.en-zh.en", - "en-zh/UNv1.0.en-zh.zh"]]] + "https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/UNv1.0.en-zh.tar" + ".gz", + ["en-zh/UNv1.0.en-zh.en", "en-zh/UNv1.0.en-zh.zh"]]] # CWMT corpus # Visit source website to download manually: -# http://nlp.nju.edu.cn/cwmt-wmt/ +# http://nlp.nju.edu.cn/cwmt-wmt/ # # casia2015: 1,050,000 lines # casict2015: 2,036,833 lines # datum2015: 1,000,003 lines # datum2017: 1,999,968 lines -# NEU2017: 2,000,000 lines +# NEU2017: 2,000,000 lines # # NOTE: You need to register to download dataset from official source # place into tmp directory e.g. 
/tmp/t2t_datagen/dataset.tgz _CWMT_TRAIN_DATASETS = [ ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/casia2015/casia2015_en.txt", - "cwmt/casia2015/casia2015_ch.txt"]], + ["cwmt/casia2015/casia2015_en.txt", "cwmt/casia2015/casia2015_ch.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/casict2015/casict2015_en.txt", - "cwmt/casict2015/casict2015_ch.txt"]], + ["cwmt/casict2015/casict2015_en.txt", + "cwmt/casict2015/casict2015_ch.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/neu2017/NEU_en.txt", - "cwmt/neu2017/NEU_cn.txt"]], + ["cwmt/neu2017/NEU_en.txt", "cwmt/neu2017/NEU_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2015/datum_en.txt", - "cwmt/datum2015/datum_ch.txt"]], + ["cwmt/datum2015/datum_en.txt", "cwmt/datum2015/datum_ch.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book1_en.txt", - "cwmt/datum2017/Book1_cn.txt"]], + ["cwmt/datum2017/Book1_en.txt", "cwmt/datum2017/Book1_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book2_en.txt", - "cwmt/datum2017/Book2_cn.txt"]], + ["cwmt/datum2017/Book2_en.txt", "cwmt/datum2017/Book2_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book3_en.txt", - "cwmt/datum2017/Book3_cn.txt"]], + ["cwmt/datum2017/Book3_en.txt", "cwmt/datum2017/Book3_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book4_en.txt", - "cwmt/datum2017/Book4_cn.txt"]], + ["cwmt/datum2017/Book4_en.txt", "cwmt/datum2017/Book4_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book5_en.txt", - "cwmt/datum2017/Book5_cn.txt"]], + ["cwmt/datum2017/Book5_en.txt", "cwmt/datum2017/Book5_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book6_en.txt", - "cwmt/datum2017/Book6_cn.txt"]], + ["cwmt/datum2017/Book6_en.txt", "cwmt/datum2017/Book6_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book7_en.txt", - "cwmt/datum2017/Book7_cn.txt"]], + ["cwmt/datum2017/Book7_en.txt", "cwmt/datum2017/Book7_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book8_en.txt", - "cwmt/datum2017/Book8_cn.txt"]], + ["cwmt/datum2017/Book8_en.txt", "cwmt/datum2017/Book8_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book9_en.txt", - "cwmt/datum2017/Book9_cn.txt"]], + ["cwmt/datum2017/Book9_en.txt", "cwmt/datum2017/Book9_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book10_en.txt", - "cwmt/datum2017/Book10_cn.txt"]], + ["cwmt/datum2017/Book10_en.txt", "cwmt/datum2017/Book10_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book11_en.txt", - "cwmt/datum2017/Book11_cn.txt"]], + ["cwmt/datum2017/Book11_en.txt", "cwmt/datum2017/Book11_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book12_en.txt", - "cwmt/datum2017/Book12_cn.txt"]], + ["cwmt/datum2017/Book12_en.txt", "cwmt/datum2017/Book12_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book13_en.txt", - "cwmt/datum2017/Book13_cn.txt"]], + 
["cwmt/datum2017/Book13_en.txt", "cwmt/datum2017/Book13_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book14_en.txt", - "cwmt/datum2017/Book14_cn.txt"]], + ["cwmt/datum2017/Book14_en.txt", "cwmt/datum2017/Book14_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book15_en.txt", - "cwmt/datum2017/Book15_cn.txt"]], + ["cwmt/datum2017/Book15_en.txt", "cwmt/datum2017/Book15_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book16_en.txt", - "cwmt/datum2017/Book16_cn.txt"]], + ["cwmt/datum2017/Book16_en.txt", "cwmt/datum2017/Book16_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book17_en.txt", - "cwmt/datum2017/Book17_cn.txt"]], + ["cwmt/datum2017/Book17_en.txt", "cwmt/datum2017/Book17_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book18_en.txt", - "cwmt/datum2017/Book18_cn.txt"]], + ["cwmt/datum2017/Book18_en.txt", "cwmt/datum2017/Book18_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book19_en.txt", - "cwmt/datum2017/Book19_cn.txt"]], + ["cwmt/datum2017/Book19_en.txt", "cwmt/datum2017/Book19_cn.txt"]], ["https://s3-us-west-2.amazonaws.com/twairball.wmt17.zh-en/cwmt.tgz", - ["cwmt/datum2017/Book20_en.txt", - "cwmt/datum2017/Book20_cn.txt"]] + ["cwmt/datum2017/Book20_en.txt", "cwmt/datum2017/Book20_cn.txt"]] ] def get_filename(dataset): - return dataset[0][0].split('/')[-1] + return dataset[0][0].split("/")[-1] + @registry.register_problem class TranslateEnzhWmt32k(translate.TranslateProblem): """Problem spec for WMT En-Zh translation. - Attempts to use full training dataset, which needs website + + Attempts to use full training dataset, which needs website registration and downloaded manually from official sources: - CWMT: + CWMT: - http://nlp.nju.edu.cn/cwmt-wmt/ - - Website contrains instructions for FTP server access. - - You'll need to download CASIA, CASICT, DATUM2015, DATUM2017, + - Website contrains instructions for FTP server access. + - You'll need to download CASIA, CASICT, DATUM2015, DATUM2017, NEU datasets - UN Parallel Corpus: + UN Parallel Corpus: - https://conferences.unite.un.org/UNCorpus - - You'll need to register your to download the dataset. + - You'll need to register your to download the dataset. NOTE: place into tmp directory e.g. /tmp/t2t_datagen/dataset.tgz """ @@ -189,32 +169,40 @@ def source_vocab_name(self): @property def target_vocab_name(self): return "vocab.enzh-zh.%d" % self.targeted_vocab_size - + def get_training_dataset(self, tmp_dir): """UN Parallel Corpus and CWMT Corpus need to be downloaded manually. + Append to training dataset if available + + Args: + tmp_dir: path to temporary dir with the data in it. 
+ + Returns: + paths """ full_dataset = _NC_TRAIN_DATASETS for dataset in [_CWMT_TRAIN_DATASETS, _UN_TRAIN_DATASETS]: filename = get_filename(dataset) tmp_filepath = os.path.join(tmp_dir, filename) if tf.gfile.Exists(tmp_filepath): - full_dataset = full_dataset + dataset + full_dataset += dataset else: - tf.logging.info("[TranslateEzhWmt] dataset incomplete, you need to manually download %s" % filename) + tf.logging.info("[TranslateEzhWmt] dataset incomplete, you need to " + "manually download %s" % filename) return full_dataset def generator(self, data_dir, tmp_dir, train): - TRAIN_DATASET = self.get_training_dataset(tmp_dir) - datasets = TRAIN_DATASET if train else _NC_TEST_DATASETS - source_datasets = [[item[0], [item[1][0]]] for item in TRAIN_DATASET] - target_datasets = [[item[0], [item[1][1]]] for item in TRAIN_DATASET] + train_dataset = self.get_training_dataset(tmp_dir) + datasets = train_dataset if train else _NC_TEST_DATASETS + source_datasets = [[item[0], [item[1][0]]] for item in train_dataset] + target_datasets = [[item[0], [item[1][1]]] for item in train_dataset] source_vocab = generator_utils.get_or_generate_vocab( data_dir, tmp_dir, self.source_vocab_name, self.targeted_vocab_size, - source_datasets, _file_byte_budget=1e8) + source_datasets, file_byte_budget=1e8) target_vocab = generator_utils.get_or_generate_vocab( data_dir, tmp_dir, self.target_vocab_name, self.targeted_vocab_size, - target_datasets, _file_byte_budget=1e8) + target_datasets, file_byte_budget=1e8) tag = "train" if train else "dev" filename_base = "wmt_enzh_%sk_tok_%s" % (self.targeted_vocab_size, tag) data_path = translate.compile_data(tmp_dir, datasets, filename_base) @@ -244,6 +232,7 @@ def feature_encoders(self, data_dir): @registry.register_problem class TranslateEnzhWmt8k(TranslateEnzhWmt32k): """Problem spec for WMT En-Zh translation. + This is far from being the real WMT17 task - only toyset here """ @@ -254,7 +243,7 @@ def targeted_vocab_size(self): @property def num_shards(self): return 10 # This is a small dataset. - + def get_training_dataset(self, tmp_dir): - """Uses only News Commentary Dataset for training""" + """Uses only News Commentary Dataset for training.""" return _NC_TRAIN_DATASETS diff --git a/tensor2tensor/data_generators/wiki.py b/tensor2tensor/data_generators/wiki.py index a1380c27f..828ef2246 100644 --- a/tensor2tensor/data_generators/wiki.py +++ b/tensor2tensor/data_generators/wiki.py @@ -20,122 +20,108 @@ from __future__ import print_function import os +import subprocess # Dependency imports -import bz2file - import numpy as np -import six from tensor2tensor.data_generators import generator_utils from tensor2tensor.data_generators import problem -from tensor2tensor.data_generators import text_encoder -from tensor2tensor.utils import metrics from tensor2tensor.utils import registry import tensorflow as tf -# End-of-sentence marker. -EOS = text_encoder.EOS_ID - -def _maybe_download_corpus(tmp_dir): - """Download corpus if necessary. - - Args: - tmp_dir: directory containing dataset. +@registry.register_problem +class LanguagemodelWikiXmlV8kL1k(problem.ChoppedTextProblem): + """A language model on English Wikipedia. - Returns: - filepath of the downloaded corpus file. + XML dump is chopped arbitrarily into sequences of length 1024 tokens, + without regard to article boundaries. 
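+
+  The subword vocabulary has 2**13 = 8192 tokens (see targeted_vocab_size).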
""" - corpus_url = ("https://dumps.wikimedia.org/enwiki/20170620/" - "enwiki-20170620-pages-articles-multistream.xml.bz2") - corpus_filename = os.path.basename(corpus_url) - corpus_filepath = os.path.join(tmp_dir, corpus_filename) - if not tf.gfile.Exists(corpus_filepath): - generator_utils.maybe_download(tmp_dir, corpus_filename, corpus_url) - return corpus_filepath - - -def page_generator(tmp_dir, max_docs=None): - doc = u"" - count = 0 - corpus_filepath = _maybe_download_corpus(tmp_dir) - for line in bz2file.BZ2File(corpus_filepath, "r", buffering=1000000): - line = unicode(line, "utf-8") if six.PY2 else line.decode("utf-8") - if not doc and line != u" \n": - continue - doc += line - if line == u" \n": - yield doc - doc = u"" - count += 1 - if max_docs and count >= max_docs: - break - - -def _page_title(page): - start_pos = page.find(u"") - end_pos = page.find(u"") - assert start_pos != -1 - assert end_pos != -1 - start_pos += len(u"") - return page[start_pos:end_pos] - -@registry.register_problem -class LanguagemodelWikiFull32k(problem.Text2TextProblem): - """A language model on full English Wikipedia.""" + def maybe_prepare_text(self, tmp_dir): + """Download corpus if necessary, decompress, split into multiple text files. + + Args: + tmp_dir: directory containing dataset. + + Returns: + list of filepaths for local text files. + """ + compressed_filename = os.path.basename(self.corpus_url) + compressed_filepath = os.path.join(tmp_dir, compressed_filename) + decompressed_filepath = compressed_filepath[:-4] + split_file_prefix = decompressed_filepath + "-part-" + split_filepattern = split_file_prefix + "?????" + split_files = sorted(tf.gfile.Glob(split_filepattern)) + if not split_files: + if not tf.gfile.Exists(decompressed_filepath): + if not tf.gfile.Exists(compressed_filepath): + generator_utils.maybe_download( + tmp_dir, compressed_filepath, self.corpus_url) + assert not subprocess.call(["bunzip2", compressed_filepath]) + assert tf.gfile.Exists(decompressed_filepath) + assert not subprocess.call([ + "split", "--line-bytes=4M", "--suffix-length=5", + "--numeric-suffixes", decompressed_filepath, split_file_prefix]) + split_files = sorted(tf.gfile.Glob(split_filepattern)) + assert split_files + return split_files + + def train_text_filepaths(self, tmp_dir): + all_files = self.maybe_prepare_text(tmp_dir) + return [f for i, f in enumerate(all_files) if i % self.dev_fraction != 0] + + def dev_text_filepaths(self, tmp_dir): + all_files = self.maybe_prepare_text(tmp_dir) + return [f for i, f in enumerate(all_files) if i % self.dev_fraction == 0] @property - def is_character_level(self): - return False + def dev_fraction(self): + return 5000 @property - def has_inputs(self): - return True + def corpus_url(self): + return ("https://archive.org/download/enwiki-20171201/" + "enwiki-20171201-pages-articles.xml.bz2") @property - def input_space_id(self): - return problem.SpaceID.EN_TOK + def vocab_name(self): + return "vocab.wiki_xml" @property - def target_space_id(self): - return problem.SpaceID.EN_TOK + def targeted_vocab_size(self): + return 2**13 # 8192 @property - def num_shards(self): - return 1000 + def sequence_length(self): + """Length of each example (in tokens).""" + return 1024 @property - def vocab_name(self): - return "vocab.wiki" + def max_chars_for_vocab(self): + """Number of characters of training data to use for generating vocab.""" + # magic number for backwards compatibility + return 41800829 - @property - def use_subword_tokenizer(self): - return True - @property - def 
targeted_vocab_size(self): - return 2**15 # 32768 +@registry.register_problem +class LanguagemodelWikiXmlV8kL4k(LanguagemodelWikiXmlV8kL1k): + """A language model on English Wikipedia. - @property - def use_train_shards_for_dev(self): - return True + XML dump is chopped arbitrarily into sequences of length 4096 tokens, + without regard to article boundaries. + """ - def generator(self, data_dir, tmp_dir, _): - encoder = generator_utils.get_or_generate_vocab_inner( - data_dir, self.vocab_file, self.targeted_vocab_size, - page_generator(tmp_dir, max_docs=10000)) - for page in page_generator(tmp_dir): - title = _page_title(page) - encoded = encoder.encode(page) + [EOS] - encoded_title = encoder.encode(title) + [EOS] - yield {"inputs": encoded_title, "targets": encoded} + @property + def sequence_length(self): + """Length of each example (in tokens).""" + return 4096 -class LanguagemodelWikiScramble(problem.Text2TextProblem): +class LanguagemodelWikiScramble(LanguagemodelWikiXmlV8kL1k): """Language modeling on English wikipedia. "targets" is a sequence of sequence_length tokens - a fragment of an article. @@ -146,18 +132,16 @@ class LanguagemodelWikiScramble(problem.Text2TextProblem): of the target sequence given the input sequence. """ - @property - def sequence_length(self): - raise NotImplementedError() + def example_generator(self, encoder, tmp_dir, task_id): + for x in super(LanguagemodelWikiScramble, self).example_generator( + encoder, tmp_dir, task_id): + x["inputs"] = self.scramble(x["targets"]) + yield x @property def scramble_fraction(self): raise NotImplementedError() - @property - def is_character_level(self): - return False - @property def has_inputs(self): return True @@ -166,33 +150,14 @@ def has_inputs(self): def input_space_id(self): return problem.SpaceID.EN_TOK - @property - def target_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def num_shards(self): - return 1000 - - @property - def vocab_name(self): - return "vocab.wiki" - - @property - def use_subword_tokenizer(self): - return True - @property def targeted_vocab_size(self): return 2**13 # 8192 @property - def use_train_shards_for_dev(self): - return True - - @property - def max_cases(self): - return (2 ** 30) / self.sequence_length + def remainder_policy(self): + """What to do with leftover tokens.""" + return "drop" def scramble(self, seq): seq = np.array(seq) @@ -207,30 +172,9 @@ def scramble(self, seq): seq = list(seq) return seq - def generator(self, data_dir, tmp_dir, _): - encoder = generator_utils.get_or_generate_vocab_inner( - data_dir, self.vocab_file, self.targeted_vocab_size, - page_generator(tmp_dir, max_docs=1000)) - case_num = 0 - for page in page_generator(tmp_dir): - encoded = encoder.encode(page) - for i in xrange(len(encoded) // self.sequence_length): - case_num += 1 - if self.max_cases and case_num > self.max_cases: - return - targets = encoded[ - i * self.sequence_length:(i + 1) * self.sequence_length] - inputs = self.scramble(targets) - yield {"inputs": inputs, "targets": targets} - - def eval_metrics(self): - return [ - metrics.Metrics.ACC, metrics.Metrics.NEG_LOG_PERPLEXITY - ] - @registry.register_problem -class LanguagemodelWikiScramble128(LanguagemodelWikiScramble): +class LanguagemodelWikiScrambleL128(LanguagemodelWikiScramble): """Sequence length 128, 50% scrambed.""" @property @@ -243,7 +187,7 @@ def scramble_fraction(self): @registry.register_problem -class LanguagemodelWikiScramble1k50(LanguagemodelWikiScramble): +class 
LanguagemodelWikiScrambleL1k(LanguagemodelWikiScramble): """Sequence length 1024, 50% scrambed.""" @property @@ -256,13 +200,192 @@ def scramble_fraction(self): @registry.register_problem -class LanguagemodelWikiScramble8k50(LanguagemodelWikiScramble): - """Sequence length 8192, 50% scrambed.""" +class LanguagemodelWikiNorefV8kL1k(LanguagemodelWikiXmlV8kL1k): + """A language model on English Wikipedia. + + References and internal links are removed from the raw XML. + + Special pages (non-articles) are dropped. + + This more closely resemples plain text, though there are still some xml + elements, like tables. + + Each article is prefixed by a line containing the title and length in + characters - e.g. + title: "Price of Tea in China" length: 12345 + During inference time, you can forward generate starting with such a header + in order to obtain a randomly generated article with a given title and + (approximate) length. + + Result is chopped arbitrarily into sequences of length 1024 tokens, + without regard to article boundaries. + """ @property - def sequence_length(self): - return 8192 + def vocab_name(self): + return "vocab.wiki_noref" + + def filepath_to_unicode_text(self, filepath): + """Overriddes the base class to clean up the xml dump before tokenizing.""" + dump = problem.to_unicode_ignore_erros(tf.gfile.Open(filepath).read()) + pages = _dump_to_pages(dump) + ret = u"" + for p in pages: + title = _page_to_title(p) + text = _page_to_text(p) + text = _remove_triple_quotes( + _remove_double_brackets(_remove_references(text))) + if u":" in title: + # not a regular article + continue + if len(text) <= 140: + # Probably a redirect or something like that. Skip it. + continue + ret += u"title: \"%s\" length: %d\n%s\n" % (title, len(text), text) + return ret @property - def scramble_fraction(self): - return 0.5 + def max_chars_for_vocab(self): + """Number of characters of training data to use for generating vocab.""" + # magic number for backwards compatibility + return 21240483 + + +def _dump_to_pages(dump): + """Extract pages from an xml dump. + + Args: + dump: a unicode string + Returns: + a list of unicode strings + """ + pos = 0 + ret = [] + start_tag = u"<page>\n" + end_tag = u"</page>\n" + while True: + start_pos = dump.find(start_tag, pos) + if start_pos == -1: + break + start_pos += len(start_tag) + end_pos = dump.find(end_tag, start_pos) + if end_pos == -1: + break + ret.append(dump[start_pos:end_pos]) + pos = end_pos + len(end_tag) + return ret + + +def _page_to_title(page): + """Extract the title from a page. + + Args: + page: a unicode string + Returns: + a unicode string + """ + # print("page=%s" % page) + start_tag = u"<title>" + end_tag = u"" + start_pos = page.find(start_tag) + end_pos = page.find(end_tag) + assert start_pos != -1 + assert end_pos != -1 + start_pos += len(start_tag) + return page[start_pos:end_pos] + + +def _page_to_text(page): + """Extract the text from a page. + + Args: + page: a unicode string + Returns: + a unicode string + """ + # text start tag looks like "" + start_pos = page.find(u"", start_pos) + assert end_tag_pos != -1 + end_tag_pos += len(u">") + end_pos = page.find(u"") + if end_pos == -1: + return u"" + return page[end_tag_pos:end_pos] + + +def _find_and_replace(text, start_string, end_string, replace_fn): + """Remove everything found between instances of start_string and end_string. + + Replace each such instance with replace_fn(removed_text) + + e.g. 
_find_and_replace(u"the [[fat]] cat [[sat]]", u"[[", u"]]", lambda x: x) + = u"the fat cat sat" + + Args: + text: a unicode string + start_string: a unicode string + end_string: a unicode string + replace_fn: a unary function from unicode string to unicode string + + Returns: + a string + """ + ret = u"" + current_pos = 0 + while True: + start_pos = text.find(start_string, current_pos) + if start_pos == -1: + ret += text[current_pos:] + break + ret += text[current_pos:start_pos] + end_pos = text.find(end_string, start_pos + len(start_string)) + if end_pos == -1: + break + ret += replace_fn(text[start_pos + len(start_string):end_pos]) + current_pos = end_pos + len(end_string) + return ret + + +def _remove_references(text): + """Strip out references from wikipedia xml.""" + return _find_and_replace(text, u"<ref", u"</ref>", lambda s: "") + + +def _remove_triple_quotes(text): + """Strip out triple quotes from wikipedia xml.""" + return _find_and_replace(text, u"'''", u"'''", lambda s: s) + + +def _remove_double_brackets(text): + """Remove double brackets (internal links) but leave the viewable text. + + Args: + text: a unicode string + Returns: + a unicode string + """ + def replacement_fn(s): + if u":" in s: + # this is probably a category or something like that. + return "" + # keep the part after the bar. + bar_pos = s.find(u"|") + if bar_pos == -1: + return s + return s[bar_pos + 1:] + return _find_and_replace(text, u"[[", u"]]", replacement_fn) + + +@registry.register_problem +class LanguagemodelWikiNorefV8kL16k(LanguagemodelWikiNorefV8kL1k): + """A language model on English Wikipedia. + + References removed. Chopped into segments of 16k tokens. + """ + + @property + def sequence_length(self): + """Length of each example (in tokens).""" + return 2**14 diff --git a/tensor2tensor/insights/README.md b/tensor2tensor/insights/README.md new file mode 100644 index 000000000..ebed255e1 --- /dev/null +++ b/tensor2tensor/insights/README.md @@ -0,0 +1,76 @@ +# Tensor2Tensor Insights + +The Insights packages provides an interactive webservice for understanding the +inner workings of a Tensor2Tensor model. It will provide a series of +visualizations extracted from a requested T2T model that informs model developers +and model users on how to improve or best utilize a model. + +## Dependencies + +Before using the Insights server, you must install [Bower](https://bower.io/) +which we use to manage our web component dependencies. You can easily install +this with the [Node Package Manager](https://www.npmjs.com/). + +## Setup Instructions + +After training a model, such as according to the Quick Start guide, you can run +the `t2t-insights-server` binary and begin querying it. + +First, prepare the bower dependencies by navigating into the +`tensor2tensor/insights/polymer` directory and running `bower install`: + +``` +pushd tensor2tensor/insights/polymer +bower install +popd +``` + +The models run by server is then configured by a JSON version of the +InsightsConfiguration protocol buffer. 
Using the model trained in the Quick +Start guide, a sample configuration would be: + +``` + { + "configuration": [{ + "source_language": "en", + "target_language": "de", + "label": "transformers_wmt32k", + "transformer": { + "model": "transformer", + "model_dir": "/tmp/t2t/train", + "data_dir": "/tmp/t2t/data", + "hparams": "", + "hparams_set": "transformer_base_single_gpu", + "problems": "translate_ende_wmt32k" + }, + }] + "language": [{ + "code": "en", + "name": "English", + },{ + "code": "de", + "name": "German", + }] + } +``` + +With that saved to `configuration.json`, run the following: + +``` +t2t-insights-server \ + --configuration=configuration.json \ + --static_path=`pwd`/tensor2tensor/insights/polymer +``` + +This will bring up a minimal [Flask](http://flask.pocoo.org/) REST service +served by a [GUnicorn](http://gunicorn.org/) HTTP Server. + +## Features to be developed + +This is a minimal web server. We are in the process of adding additional +exciting features that give insight into a model's behavior: + + * Integrating a multi-head attention visualization. + * Registering multiple models to compare their behavior. + * Indexing training data to find examples related to a current query. + * Tracking interesting query + translation pairs for deeper analysis. diff --git a/tensor2tensor/insights/insight_configuration.proto b/tensor2tensor/insights/insight_configuration.proto new file mode 100644 index 000000000..6a1656eac --- /dev/null +++ b/tensor2tensor/insights/insight_configuration.proto @@ -0,0 +1,55 @@ +syntax = "proto3"; + +package tensor2tensor; + +// Configures the Neural Machine Translation Insight Frontend with a set of +// supported query processors and languages. +message InsightConfiguration { + // Specifies zero or more models to inspect. + repeated QueryProcessorConfiguration configuration = 1; + + // Specifies language codes and display names. + repeated Language language = 2; +} + +// A displayable language name. +message Language { + // The BCP-47 Language code. + string code = 1; + // The language's display name. + string name = 2; +} + +// Configures a QueryProcessor and registers it with the Insight Frontend when +// responding to analysis queries. +message QueryProcessorConfiguration { + // The model's BCP-47 source language code. + string source_language = 1; + // The model's BCP-47 target language code. + string target_language = 2; + // A short label for the model. + string label = 3; + // The QueryProcessor to use. By default we just use the TransformerModel. + string query_processor = 4; + + // Configuration for the TransformerModel. + TransformerConfiguration transformer = 5; +} + +// Specifies the parameters for a trained Transformer model to inspect. These +// parameters match those in t2t-trainer and t2t-decoder. +message TransformerConfiguration { + // The model type. + string model = 1; + // The trained model directory. + string model_dir = 2; + // The data directory for the model. + string data_dir = 3; + + // The hyperparameter set for running the model. + string hparams_set = 4; + // Overriding hyperparameters. + string hparams = 5; + // The problem sets over which this model was trained and configured. + string problems = 6; +} diff --git a/tensor2tensor/insights/polymer/.bowerrc b/tensor2tensor/insights/polymer/.bowerrc new file mode 100644 index 000000000..b316080f0 --- /dev/null +++ b/tensor2tensor/insights/polymer/.bowerrc @@ -0,0 +1,3 @@ +{ + "directory": "." 
+} diff --git a/tensor2tensor/insights/polymer/attention_visualization/attention-visualization.js b/tensor2tensor/insights/polymer/attention_visualization/attention-visualization.js index b58d90905..e738c2629 100644 --- a/tensor2tensor/insights/polymer/attention_visualization/attention-visualization.js +++ b/tensor2tensor/insights/polymer/attention_visualization/attention-visualization.js @@ -15,8 +15,6 @@ * limitations under the License. */ -goog.module('t2t.AttentionVisualization'); - /** * `` presents a heatmap of input-output associations. * @@ -62,10 +60,16 @@ class AttentionVisualization extends Polymer.Element { this.zoom_ = undefined; } + /** + * @return {string} The component name. + */ static get is() { return 'attention-visualization'; } + /** + * @return {!Object} The component properties. + */ static get properties() { return { /** @@ -84,6 +88,9 @@ class AttentionVisualization extends Polymer.Element { }; } + /** + * @return {!Array} The component observers. + */ static get observers() { return [ 'zoomDepthChanged_(zoomDepth_)', @@ -308,5 +315,3 @@ class AttentionVisualization extends Polymer.Element { } customElements.define(AttentionVisualization.is, AttentionVisualization); - -exports = {AttentionVisualization}; diff --git a/tensor2tensor/insights/polymer/bower.json b/tensor2tensor/insights/polymer/bower.json new file mode 100644 index 000000000..da1f4aaed --- /dev/null +++ b/tensor2tensor/insights/polymer/bower.json @@ -0,0 +1,80 @@ +{ + "name": "tensor2tensor-insights", + "homepage": "https://github.com/tensorflow/tensor2tensor", + "description": "Components for analyzing tensor2tensor neural machine translation models.", + "main": "index.html", + "keywords": [ + "neural", + "machine", + "translation" + ], + "authors": [ + "kstevens@google.com" + ], + "license": "Apache 2.0", + "private": true, + "ignore": [ + "**/.*", + "node_modules", + "bower_components", + "test", + "tests" + ], + "dependencies": { + "app-layout": "PolymerElements/app-layout#2.0.4", + "app-route": "PolymerElements/app-route#2.0.3", + "d3": "d3#4.12.2", + "iron-a11y-keys": "PolymerElements/iron-a11y-keys#2.0.0", + "iron-ajax": "PolymerElements/iron-ajax#2.0.0", + "iron-flex-layout": "PolymerElements/iron-flex-layout#2.0.0", + "iron-icon": "PolymerElements/iron-icon#2.0.0", + "iron-icons": "PolymerElements/iron-icons#2.0.0", + "iron-list": "PolymerElements/iron-list#2.0.0", + "iron-pages": "PolymerElements/iron-pages#2.0.0", + "iron-selector": "PolymerElements/iron-selector#2.0.0", + "neon-animation": "PolymerElements/neon-animation#2.0.0", + "paper-button": "PolymerElements/paper-button#2.0.0", + "paper-card": "PolymerElements/paper-card#2.0.0", + "paper-dialog": "PolymerElements/paper-dialog#2.0.0", + "paper-dropdown-menu": "PolymerElements/paper-dropdown-menu#2.0.0", + "paper-icon-button": "PolymerElements/paper-icon-button#2.0.0", + "paper-input": "PolymerElements/paper-input#2.0.0", + "paper-item": "PolymerElements/paper-item#2.0.0", + "paper-listbox": "PolymerElements/paper-listbox#2.0.0", + "paper-slider": "PolymerElements/paper-slider#2.0.0", + "paper-tabs": "PolymerElements/paper-tabs#2.0.0", + "paper-toggle-button": "PolymerElements/paper-toggle-button#2.0.0", + "paper-tooltip": "PolymerElements/paper-tooltip#2.0.0", + "paper-progress": "PolymerElements/paper-progress#2.0.0", + "polymer": "polymer/polymer#v2.3.1" + }, + "resolutions": { + "webcomponentsjs": "^v1.0.19", + "polymer": "^v2.3.1", + "app-route": "^2.0.3", + "app-layout": "^2.0.4", + "iron-location": "1 - 2", + "iron-selector": 
"^2.0.0", + "neon-animation": "^2.0.0", + "iron-icon": "^2.0.0", + "iron-pages": "^2.0.0", + "iron-icons": "^2.0.0", + "paper-icon-button": "^2.0.0", + "paper-item": "^2.0.0", + "iron-flex-layout": "^2.0.0", + "paper-listbox": "^2.0.0", + "iron-a11y-keys": "^2.0.0", + "paper-dialog": "^2.0.0", + "iron-ajax": "^2.0.0", + "paper-progress": "^2.0.0", + "paper-dropdown-menu": "^2.0.0", + "paper-tabs": "^2.0.0", + "paper-input": "^2.0.0", + "paper-toggle-button": "^2.0.0", + "paper-slider": "^2.0.0", + "iron-list": "^2.0.0", + "paper-card": "^2.0.0", + "paper-tooltip": "^2.0.0", + "iron-overlay-behavior": "^2.2.0" + } +} diff --git a/tensor2tensor/insights/polymer/explore_view/explore-view.html b/tensor2tensor/insights/polymer/explore_view/explore-view.html index d0456211f..97fce423c 100644 --- a/tensor2tensor/insights/polymer/explore_view/explore-view.html +++ b/tensor2tensor/insights/polymer/explore_view/explore-view.html @@ -31,8 +31,8 @@ - - + +