From 8b622a4c177c9555955e9f24e47ec21d8134a6be Mon Sep 17 00:00:00 2001 From: T2T Team Date: Fri, 4 Aug 2017 16:27:46 -0700 Subject: [PATCH 01/17] Add rouge metrics (rouge-2 and rouge-l f1 scores) to wiki_sum problems. PiperOrigin-RevId: 164325588 --- tensor2tensor/data_generators/problem.py | 3 +- tensor2tensor/utils/metrics.py | 5 + tensor2tensor/utils/model_builder.py | 2 +- tensor2tensor/utils/rouge.py | 249 +++++++++++++++++++++++ tensor2tensor/utils/rouge_test.py | 120 +++++++++++ 5 files changed, 377 insertions(+), 2 deletions(-) create mode 100644 tensor2tensor/utils/rouge.py create mode 100644 tensor2tensor/utils/rouge_test.py diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index fb7e53cb7..2bbc88192 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -424,5 +424,6 @@ def eval_metrics(self): return [ metrics.Metrics.ACC, metrics.Metrics.ACC_TOP5, metrics.Metrics.ACC_PER_SEQ, metrics.Metrics.NEG_LOG_PERPLEXITY, - metrics.Metrics.APPROX_BLEU + metrics.Metrics.APPROX_BLEU, metrics.Metrics.ROUGE_2_F, + metrics.Metrics.ROUGE_L_F ] diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index db60e07c8..ea2187427 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -24,6 +24,7 @@ from tensor2tensor.layers import common_layers from tensor2tensor.utils import bleu_hook +from tensor2tensor.utils import rouge import tensorflow as tf @@ -37,6 +38,8 @@ class Metrics(object): NEG_LOG_PERPLEXITY = "neg_log_perplexity" APPROX_BLEU = "approx_bleu_score" RMSE = "rmse" + ROUGE_2_F = "rouge_2_fscore" + ROUGE_L_F = "rouge_L_fscore" def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): @@ -188,4 +191,6 @@ def problem_metric_fn(predictions, labels, weights): Metrics.NEG_LOG_PERPLEXITY: padded_neg_log_perplexity, Metrics.APPROX_BLEU: bleu_hook.bleu_score, Metrics.RMSE: padded_rmse, + Metrics.ROUGE_2_F: rouge.rouge_2_fscore, + Metrics.ROUGE_L_F: rouge.rouge_l_fscore, } diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 48d5dd7a0..a12aa1122 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -195,7 +195,7 @@ def nth_model(n): features, skip=(skipping_is_on and skip_this_one)) with tf.variable_scope("losses_avg", reuse=True): total_loss, ops = 0.0, [] - for loss_key, loss_value in six.iteritems(losses_dict): + for loss_key, loss_value in losses_dict.iteritems(): loss_moving_avg = tf.get_variable("problem_%d/%s_loss" % (n, loss_key)) ops.append( diff --git a/tensor2tensor/utils/rouge.py b/tensor2tensor/utils/rouge.py new file mode 100644 index 000000000..29c84729f --- /dev/null +++ b/tensor2tensor/utils/rouge.py @@ -0,0 +1,249 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- +# Copyright 2017 Google Inc. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ROUGE metric implementation.
+
+This is a modified and slightly extended version of
+https://github.com/miso-belica/sumy/blob/dev/sumy/evaluation/rouge.py.
+"""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import numpy as np
+
+import tensorflow as tf
+
+
+def _len_lcs(x, y):
+  """Returns the length of the Longest Common Subsequence between two seqs.
+
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: sequence of words
+    y: sequence of words
+
+  Returns:
+    integer: Length of LCS between x and y
+  """
+  table = _lcs(x, y)
+  n, m = len(x), len(y)
+  return table[n, m]
+
+
+def _lcs(x, y):
+  """Computes the length of the LCS between two seqs.
+
+  The implementation below uses a dynamic programming algorithm and runs
+  in O(nm) time where n = len(x) and m = len(y).
+  Source: http://www.algorithmist.com/index.php/Longest_Common_Subsequence
+
+  Args:
+    x: collection of words
+    y: collection of words
+
+  Returns:
+    Dictionary mapping each coord (i, j) to the LCS length of x[:i], y[:j]
+  """
+  n, m = len(x), len(y)
+  table = dict()
+  for i in range(n + 1):
+    for j in range(m + 1):
+      if i == 0 or j == 0:
+        table[i, j] = 0
+      elif x[i - 1] == y[j - 1]:
+        table[i, j] = table[i - 1, j - 1] + 1
+      else:
+        table[i, j] = max(table[i - 1, j], table[i, j - 1])
+  return table
+
+
+def _f_lcs(llcs, m, n):
+  """Computes the LCS-based F-measure score.
+
+  Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/
+  rouge-working-note-v1.3.1.pdf
+
+  Args:
+    llcs: Length of LCS
+    m: number of words in reference summary
+    n: number of words in candidate summary
+
+  Returns:
+    Float. LCS-based F-measure score
+  """
+  r_lcs = llcs / m
+  p_lcs = llcs / n
+  beta = p_lcs / (r_lcs + 1e-12)
+  num = (1 + (beta**2)) * r_lcs * p_lcs
+  denom = r_lcs + ((beta**2) * p_lcs)
+  f_lcs = num / (denom + 1e-12)
+  return f_lcs
+
+
+def rouge_l_sentence_level(eval_sentences, ref_sentences):
+  """Computes ROUGE-L (sentence level) of two collections of sentences.
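+
+  For example, if X = (1, 2, 3, 4) is the reference and Y = (1, 9, 2, 3, 7)
+  the candidate, their longest common subsequence is (1, 2, 3), so
+  LCS(X,Y) = 3, R_lcs = 3/4 and P_lcs = 3/5.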
+ + Source: http://research.microsoft.com/en-us/um/people/cyl/download/papers/ + rouge-working-note-v1.3.1.pdf + + Calculated according to: + R_lcs = LCS(X,Y)/m + P_lcs = LCS(X,Y)/n + F_lcs = ((1 + beta^2)*R_lcs*P_lcs) / (R_lcs + (beta^2) * P_lcs) + + where: + X = reference summary + Y = Candidate summary + m = length of reference summary + n = length of candidate summary + + Args: + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the referene set + + Returns: + A float: F_lcs + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + m = len(ref_sentence) + n = len(eval_sentence) + lcs = _len_lcs(eval_sentence, ref_sentence) + f1_scores.append(_f_lcs(lcs, m, n)) + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_l_fscore(predictions, labels, **unused_kwargs): + """ROUGE scores computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. + + Args: + predictions: tensor, model predicitons + labels: tensor, gold output. + + Returns: + rouge_l_fscore: approx rouge-l f1 score. + """ + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + # Convert the outputs and labels to a [batch_size, input_length] tensor. + outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) + rouge_l_f_score = tf.py_func(rouge_l_sentence_level, (labels, outputs), + tf.float32) + return rouge_l_f_score, tf.constant(1.0) + + +def _get_ngrams(n, text): + """Calcualtes n-grams. + + Args: + n: which n-grams to calculate + text: An array of tokens + + Returns: + A set of n-grams + """ + ngram_set = set() + text_length = len(text) + max_index_ngram_start = text_length - n + for i in range(max_index_ngram_start + 1): + ngram_set.add(tuple(text[i:i + n])) + return ngram_set + + +def rouge_n(eval_sentences, ref_sentences, n=2): + """Computes ROUGE-N f1 score of two text collections of sentences. + + Sourece: http://research.microsoft.com/en-us/um/people/cyl/download/ + papers/rouge-working-note-v1.3.1.pdf + + Args: + eval_sentences: The sentences that have been picked by the summarizer + ref_sentences: The sentences from the reference set + n: Size of ngram. Defaults to 2. + + Returns: + f1 score for ROUGE-N + """ + + f1_scores = [] + for eval_sentence, ref_sentence in zip(eval_sentences, ref_sentences): + eval_ngrams = _get_ngrams(n, eval_sentence) + ref_ngrams = _get_ngrams(n, ref_sentence) + ref_count = len(ref_ngrams) + eval_count = len(eval_ngrams) + + # Gets the overlapping ngrams between evaluated and reference + overlapping_ngrams = eval_ngrams.intersection(ref_ngrams) + overlapping_count = len(overlapping_ngrams) + + # Handle edge case. This isn't mathematically correct, but it's good enough + if eval_count == 0: + precision = 0.0 + else: + precision = overlapping_count / eval_count + + if ref_count == 0: + recall = 0.0 + else: + recall = overlapping_count / ref_count + + f1_scores.append(2.0 * ((precision * recall) / (precision + recall + 1e-8))) + + # return overlapping_count / reference_count + return np.mean(f1_scores, dtype=np.float32) + + +def rouge_2_fscore(predictions, labels, **unused_kwargs): + """ROUGE-2 F1 score computation between labels and predictions. + + This is an approximate ROUGE scoring method since we do not glue word pieces + or decode the ids and tokenize the output. 
+ + Args: + predictions: tensor, model predicitons + labels: tensor, gold output. + + Returns: + rouge2_fscore: approx rouge-2 f1 score. + """ + + outputs = tf.to_int32(tf.argmax(predictions, axis=-1)) + # Convert the outputs and labels to a [batch_size, input_length] tensor. + outputs = tf.squeeze(outputs, axis=[-1, -2]) + labels = tf.squeeze(labels, axis=[-1, -2]) + rouge_2_f_score = tf.py_func(rouge_n, (labels, outputs), tf.float32) + return rouge_2_f_score, tf.constant(1.0) diff --git a/tensor2tensor/utils/rouge_test.py b/tensor2tensor/utils/rouge_test.py new file mode 100644 index 000000000..2a8c260e2 --- /dev/null +++ b/tensor2tensor/utils/rouge_test.py @@ -0,0 +1,120 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for Rouge metric.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +import numpy as np +from tensor2tensor.utils import rouge + +import tensorflow as tf + + +class TestRouge2Metric(tf.test.TestCase): + """Tests the rouge-2 metric.""" + + def testRouge2Identical(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + self.assertAllClose(rouge.rouge_n(hypotheses, references), 1.0, atol=1e-03) + + def testRouge2Disjoint(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[8, 9, 10, 11, 12, 13, 14, 15, 16, 17], + [9, 10, 11, 12, 13, 14, 15, 16, 17, 0]]) + self.assertEqual(rouge.rouge_n(hypotheses, references), 0.0) + + def testRouge2PartialOverlap(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 9, 2, 3, 4, 5, 1, 10, 6, 7], + [1, 9, 2, 3, 4, 5, 1, 10, 6, 7]]) + self.assertAllClose(rouge.rouge_n(hypotheses, references), 0.53, atol=1e-03) + + +class TestRougeLMetric(tf.test.TestCase): + """Tests the rouge-l metric.""" + + def testRougeLIdentical(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + self.assertAllClose( + rouge.rouge_l_sentence_level(hypotheses, references), 1.0, atol=1e-03) + + def testRougeLDisjoint(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[8, 9, 10, 11, 12, 13, 14, 15, 16, 17], + [9, 10, 11, 12, 13, 14, 15, 16, 17, 0]]) + self.assertEqual(rouge.rouge_l_sentence_level(hypotheses, references), 0.0) + + def testRougeLPartialOverlap(self): + hypotheses = np.array([[1, 2, 3, 4, 5, 1, 6, 7, 0], + [1, 2, 3, 4, 5, 1, 6, 8, 7]]) + references = np.array([[1, 9, 2, 3, 4, 5, 1, 10, 6, 7], + [1, 9, 2, 3, 4, 5, 1, 10, 6, 7]]) + self.assertAllClose( + rouge.rouge_l_sentence_level(hypotheses, references), 0.837, atol=1e-03) + + +class 
TestRougeMetricsE2E(tf.test.TestCase): + """Tests the rouge metrics end-to-end.""" + + def testRouge2MetricE2E(self): + vocab_size = 4 + batch_size = 12 + seq_length = 12 + predictions = tf.one_hot( + np.random.randint(vocab_size, size=(batch_size, seq_length, 1, 1)), + depth=4, + dtype=tf.float32) + targets = np.random.randint(4, size=(12, 12, 1, 1)) + with self.test_session() as session: + scores, _ = rouge.rouge_2_fscore(predictions, + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + session.run(a) + + def testRougeLMetricE2E(self): + vocab_size = 4 + batch_size = 12 + seq_length = 12 + predictions = tf.one_hot( + np.random.randint(vocab_size, size=(batch_size, seq_length, 1, 1)), + depth=4, + dtype=tf.float32) + targets = np.random.randint(4, size=(12, 12, 1, 1)) + with self.test_session() as session: + scores, _ = rouge.rouge_l_fscore( + predictions, + tf.constant(targets, dtype=tf.int32)) + a = tf.reduce_mean(scores) + session.run(tf.global_variables_initializer()) + session.run(a) + + +if __name__ == "__main__": + tf.test.main() From 13ac3b45994e28a00284a28943ace21d57a14dc6 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 7 Aug 2017 11:39:43 -0700 Subject: [PATCH 02/17] Filter C++/java files from the des2code dataset and replace tabs by spaces PiperOrigin-RevId: 164487331 --- tensor2tensor/data_generators/desc2code.py | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index 98c93aacd..b6f0d38a2 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -43,6 +43,7 @@ _DESC_DIR_NAME = "description" _CODE_PY_DIR_NAME = "solutions_python" +_CODE_PY_FILTER_PATERNS = ["#include", "# include", "import java."] _VOCAB_EN_FILENAME = "vocab.endefr" _VOCAB_PY_FILENAME = "vocab.py" @@ -145,6 +146,7 @@ def generator_samples_content(get_source, get_target): for code_file in sample.code_files: with tf.gfile.GFile(code_file, mode="r") as target_file: target = target_file.read() + target = target.replace("\t", " ") yield source, target elif sample.code_files: # Only take the source if a target exists yield source, target @@ -224,10 +226,18 @@ def next_sample(subdir, dirs, files): # pylint: disable=unused-argument # pairs, the problem difficulty, the names of the algorithmic techniques # needed) desc_file = os.path.join(subdir, _DESC_DIR_NAME, "description.txt") - code_rootdir = os.path.join(subdir, _CODE_PY_DIR_NAME) - code_files = [ - f for f in tf.gfile.Glob(os.path.join(code_rootdir, "*.txt")) - ] + code_files = [] + # As the dataset is noisy, the program deduce the language from the file + # content. + code_pattern = os.path.join(subdir, _CODE_PY_DIR_NAME, "*.txt") + for f in tf.gfile.Glob(code_pattern): + with tf.gfile.GFile(f, mode="r") as target_file: + # Hack to filter C++/Java files. In theory some python comments could + # make the file be concidered as C++ but in practice the chance of + # getting a false negative is low. 
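+      # (A file is kept as a Python solution only if none of the
+      # _CODE_PY_FILTER_PATERNS substrings occur anywhere in its content.)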
+ content = target_file.read() + if not any(p in content for p in _CODE_PY_FILTER_PATERNS): + code_files.append(f) return CodingPbInfo( desc_file=desc_file, code_files=code_files @@ -239,4 +249,3 @@ def next_sample(subdir, dirs, files): # pylint: disable=unused-argument for w in tf.gfile.Walk(data_rootdir): if contains_samples(*w): yield next_sample(*w) - From 032dab0cdf5d74932ad451fef24435625183ddc0 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Mon, 7 Aug 2017 16:38:37 -0700 Subject: [PATCH 03/17] Add Desc2Cpp problem PiperOrigin-RevId: 164528773 --- tensor2tensor/data_generators/desc2code.py | 108 ++++++++++++++---- .../data_generators/desc2code_test.py | 64 +++++++++++ tensor2tensor/data_generators/problem.py | 2 + 3 files changed, 151 insertions(+), 23 deletions(-) create mode 100644 tensor2tensor/data_generators/desc2code_test.py diff --git a/tensor2tensor/data_generators/desc2code.py b/tensor2tensor/data_generators/desc2code.py index b6f0d38a2..6cef6db63 100644 --- a/tensor2tensor/data_generators/desc2code.py +++ b/tensor2tensor/data_generators/desc2code.py @@ -22,6 +22,7 @@ import collections import os import random +import re import zipfile # Dependency imports @@ -42,11 +43,32 @@ _DATASET_PB_PATH = "description2code_current/" _DESC_DIR_NAME = "description" -_CODE_PY_DIR_NAME = "solutions_python" -_CODE_PY_FILTER_PATERNS = ["#include", "# include", "import java."] _VOCAB_EN_FILENAME = "vocab.endefr" -_VOCAB_PY_FILENAME = "vocab.py" + +_RE_CPP_INLINE_COMMENT = re.compile("//.*?\n") # Compiled once + + +# Constant defined for a language problem +CodingPbConstants = collections.namedtuple("CodingPbConstants", [ + "code_dir_name", + "vocab_filename", + "filter_patterns", + "target_space", +]) + +PB_PY = CodingPbConstants( + code_dir_name="solutions_python", + vocab_filename="vocab.py", + filter_patterns=["#include", "# include", "import java."], + target_space=problem.SpaceID.PY_TOK, +) +PB_CPP = CodingPbConstants( + code_dir_name="solutions_c++", + vocab_filename="vocab.cpp", + filter_patterns=["import java."], + target_space=problem.SpaceID.CPP_TOK, +) # Struct containing a coding problem (contains the paths to the descriptions # and code files) @@ -68,6 +90,14 @@ def num_shards(self): def use_subword_tokenizer(self): return True + @property + def input_space_id(self): + return problem.SpaceID.EN_TOK + + @property + def target_space_id(self): + return self.pb_constants.target_space + @property def input_vocab_size(self): return 2**15 # 32k @@ -82,7 +112,21 @@ def vocab_input_filename(self): @property def vocab_target_filename(self): - return "{}.{}".format(_VOCAB_PY_FILENAME, self.target_vocab_size) + return "{}.{}".format( + self.pb_constants.vocab_filename, self.target_vocab_size) + + def preprocess_target(self, target): + """Apply some preprocessing to the target. + + For instance, remove space/tabs. 
+ + Args: + target (str): code source content + + Returns: + the pre-processed string content + """ + return target def feature_encoders(self, data_dir): source_vocab_filename = os.path.join(data_dir, self.vocab_input_filename) @@ -94,24 +138,11 @@ def feature_encoders(self, data_dir): "targets": target_token, } - -@registry.register_problem("desc2code_py") -class Desc2CodePyProblem(Desc2CodeProblem): - """Description2Code for python problem.""" - - @property - def input_space_id(self): - return problem.SpaceID.EN_TOK - - @property - def target_space_id(self): - return problem.SpaceID.PY_TOK - def train_generator(self, data_dir, tmp_dir, train): # Called twice: for train and test # Get the list of the training samples (coding challenge samples) - samples = list(generator_samples(tmp_dir)) + samples = list(generator_samples(tmp_dir, self.pb_constants)) # Split between train and dev # Suffle to get problems from diverse sources (CodeChef and CodeForces) and @@ -146,7 +177,7 @@ def generator_samples_content(get_source, get_target): for code_file in sample.code_files: with tf.gfile.GFile(code_file, mode="r") as target_file: target = target_file.read() - target = target.replace("\t", " ") + target = self.preprocess_target(target) yield source, target elif sample.code_files: # Only take the source if a target exists yield source, target @@ -178,16 +209,47 @@ def generator_target(): } +@registry.register_problem("desc2code_py") +class Desc2CodePyProblem(Desc2CodeProblem): + """Description2Code for python problem.""" + + @property + def pb_constants(self): + return PB_PY + + def preprocess_target(self, target): + """Simple tab to space replacement.""" + return target.replace("\t", " ") + + +@registry.register_problem("desc2code_cpp") +class Desc2CodeCppProblem(Desc2CodeProblem): + """Description2Code for C++ problem.""" + + @property + def pb_constants(self): + return PB_CPP + + def preprocess_target(self, target): + """Pre-process Cpp files.""" + target = re.sub(_RE_CPP_INLINE_COMMENT, " ", target) # Remove comments + # The regex rule is quite simple, So will fail if a // is inside a string, + # and don't remove /* */ comments + target = " ".join(target.split()) # Normalize all spaces + return target + + # Utils functions -def generator_samples(tmp_dir): +def generator_samples(tmp_dir, pb_cst): """Generator for the dataset samples. If not present, download and extract the dataset. Args: tmp_dir: path to the directory where to download the dataset. + pb_cst: CodingPbConstants object defining paths Yields: A CodingPbInfo object containing the next challenge informations. @@ -217,7 +279,7 @@ def contains_samples(subdir, dirs, files): # pylint: disable=unused-argument """Check that the folder contains a problem.""" return ( _DESC_DIR_NAME in dirs and - _CODE_PY_DIR_NAME in dirs + pb_cst.code_dir_name in dirs ) def next_sample(subdir, dirs, files): # pylint: disable=unused-argument @@ -229,14 +291,14 @@ def next_sample(subdir, dirs, files): # pylint: disable=unused-argument code_files = [] # As the dataset is noisy, the program deduce the language from the file # content. - code_pattern = os.path.join(subdir, _CODE_PY_DIR_NAME, "*.txt") + code_pattern = os.path.join(subdir, pb_cst.code_dir_name, "*.txt") for f in tf.gfile.Glob(code_pattern): with tf.gfile.GFile(f, mode="r") as target_file: # Hack to filter C++/Java files. In theory some python comments could # make the file be concidered as C++ but in practice the chance of # getting a false negative is low. 
content = target_file.read() - if not any(p in content for p in _CODE_PY_FILTER_PATERNS): + if not any(p in content for p in pb_cst.filter_patterns): code_files.append(f) return CodingPbInfo( desc_file=desc_file, diff --git a/tensor2tensor/data_generators/desc2code_test.py b/tensor2tensor/data_generators/desc2code_test.py new file mode 100644 index 000000000..0d10c7d6f --- /dev/null +++ b/tensor2tensor/data_generators/desc2code_test.py @@ -0,0 +1,64 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for google3.third_party.py.tensor2tensor.data_generators.desc2code.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports +from tensor2tensor.data_generators import desc2code +from google3.testing.pybase import googletest + + +CODE_CPP_IN = """ + #include + +void main() { // This comment will be removed + // This too. + // + /* Not this one */ +\t +\t + int a \t\n = 3;// +// +} + +""" + +CODE_CPP_OUT = "#include void main() { /* Not this one */ int a = 3; }" # pylint: disable=line-too-loong + + +class Desc2codeTest(googletest.TestCase): + + def testCppPreprocess(self): + """Check that the file correctly preprocess the code source.""" + cpp_pb = desc2code.Desc2CodeCppProblem() + + self.assertEqual( # Add space beween two lines + cpp_pb.preprocess_target("firstline//comm1\nsecondline//comm2\n"), + "firstline secondline" + ) + # Checking for boths comments and spaces + self.assertEqual(cpp_pb.preprocess_target(CODE_CPP_IN), CODE_CPP_OUT) + self.assertEqual( + cpp_pb.preprocess_target(" not removed //abcd "), + "not removed //abcd" + ) + + +if __name__ == "__main__": + googletest.main() diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 2bbc88192..66cd7df4f 100644 --- a/tensor2tensor/data_generators/problem.py +++ b/tensor2tensor/data_generators/problem.py @@ -88,6 +88,8 @@ class SpaceID(object): PEPTIDE = 26 # Python PY_TOK = 27 + # C++ + CPP_TOK = 28 class Problem(object): From 4517651aea3891584b65224093e7ce182752fecd Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Mon, 7 Aug 2017 19:09:39 -0700 Subject: [PATCH 04/17] Play more with VAE, small corrections in README and ByteNet and shared weights hparams. 
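A note on the desc2code preprocessing from the two preceding patches: the
comment stripping and whitespace normalization are easy to sanity-check in
isolation. A minimal sketch, reusing the same regex as desc2code.py; the
preprocess_cpp helper name is illustrative, not part of the patch.

import re

_RE_CPP_INLINE_COMMENT = re.compile("//.*?\n")  # same pattern as desc2code.py

def preprocess_cpp(target):
  # Drop // comments (the simple regex misses // inside strings and leaves
  # /* */ comments alone), then collapse all whitespace runs to single spaces.
  target = re.sub(_RE_CPP_INLINE_COMMENT, " ", target)
  return " ".join(target.split())

print(preprocess_cpp("int a;// gone\nint b; /* kept */\n"))
# -> int a; int b; /* kept */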
PiperOrigin-RevId: 164543518 --- tensor2tensor/bin/t2t-trainer | 17 ++- tensor2tensor/data_generators/cipher.py | 14 +-- tensor2tensor/models/bytenet.py | 6 +- tensor2tensor/models/transformer_vae.py | 154 ++++++++++++++++-------- tensor2tensor/utils/model_builder.py | 6 +- tensor2tensor/utils/t2t_model.py | 12 +- 6 files changed, 136 insertions(+), 73 deletions(-) diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer index 13dd7d355..6e0be3f23 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -30,6 +30,8 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function +import os + # Dependency imports from tensor2tensor.utils import registry @@ -57,22 +59,25 @@ def main(_): usr_dir.import_usr_dir(FLAGS.t2t_usr_dir) trainer_utils.log_registry() trainer_utils.validate_flags() - tf.gfile.MakeDirs(FLAGS.output_dir) + output_dir = os.path.expanduser(FLAGS.output_dir) + tmp_dir = os.path.expanduser(FLAGS.tmp_dir) + data_dir = os.path.expanduser(FLAGS.data_dir) + tf.gfile.MakeDir(output_dir) # Generate data if requested. if FLAGS.generate_data: - tf.gfile.MakeDirs(FLAGS.data_dir) - tf.gfile.MakeDirs(FLAGS.tmp_dir) + tf.gfile.MakeDirs(data_dir) + tf.gfile.MakeDirs(tmp_dir) for problem_name in FLAGS.problems.split("-"): tf.logging.info("Generating data for %s" % problem_name) problem = registry.problem(problem_name) - problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) + problem.generate_data(data_dir, tmp_dir) # Run the trainer. trainer_utils.run( - data_dir=FLAGS.data_dir, + data_dir=data_dir, model=FLAGS.model, - output_dir=FLAGS.output_dir, + output_dir=output_dir, train_steps=FLAGS.train_steps, eval_steps=FLAGS.eval_steps, schedule=FLAGS.schedule) diff --git a/tensor2tensor/data_generators/cipher.py b/tensor2tensor/data_generators/cipher.py index 3a743337a..41dcbd80e 100644 --- a/tensor2tensor/data_generators/cipher.py +++ b/tensor2tensor/data_generators/cipher.py @@ -56,8 +56,8 @@ def _gen(nbr_symbols, max_length, nbr_cases): for plain, code in zip(indices, codes): yield { - "X": plain, - "Y": code, + "inputs": plain, + "targets": code, } return _gen @@ -99,8 +99,8 @@ def _gen(nbr_symbols, max_length, nbr_cases): for plain, code in zip(indices, codes): yield { - "X": plain, - "Y": code, + "inputs": plain, + "targets": code, } return _gen @@ -148,7 +148,7 @@ def key(self): return [1, 3] -class Layer(object): +class ShiftEncryptionLayer(object): """A single layer for shift.""" def __init__(self, vocab, shift): @@ -211,7 +211,7 @@ def encipher_shift(plaintext, plain_vocab, shift): ciphertext (list of Strings): encrypted plain text. 
""" ciphertext = [] - cipher = Layer(plain_vocab, shift) + cipher = ShiftEncryptionLayer(plain_vocab, shift) for _, sentence in enumerate(plaintext): cipher_sentence = [] @@ -238,7 +238,7 @@ def encipher_vigenere(plaintext, plain_vocab, key): # generate Vigenere table layers = [] for i in range(len(plain_vocab)): - layers.append(Layer(plain_vocab, i)) + layers.append(ShiftEncryptionLayer(plain_vocab, i)) for i, sentence in enumerate(plaintext): cipher_sentence = [] diff --git a/tensor2tensor/models/bytenet.py b/tensor2tensor/models/bytenet.py index d9c4e29a9..e4537ef3f 100644 --- a/tensor2tensor/models/bytenet.py +++ b/tensor2tensor/models/bytenet.py @@ -40,13 +40,13 @@ def residual_dilated_conv(x, repeat, padding, name, hparams): for i in xrange(repeat): with tf.variable_scope("repeat_%d" % i): y = common_layers.conv_block( - x, + common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), hparams.hidden_size, dilations_and_kernels, padding=padding, name="residual_conv") - x = common_layers.layer_norm(x + y, hparams.hidden_size, name="lnorm") - x = tf.nn.dropout(x, hparams.dropout) + y = tf.nn.dropout(y, 1.0 - hparams.dropout) + x += y return x diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 31de7bd5f..f3d400045 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -31,13 +31,31 @@ import tensorflow as tf -def decompress(source, hparams, name): +def residual_conv(x, repeat, hparams, name): + """A stack of convolution blocks with residual connections.""" + with tf.variable_scope(name): + k = (3, 1) + dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] + for i in xrange(repeat): + with tf.variable_scope("repeat_%d" % i): + y = common_layers.conv_block( + common_layers.layer_norm(x, hparams.hidden_size, name="lnorm"), + hparams.hidden_size, + dilations_and_kernels, + padding="SAME", + name="residual_conv") + y = tf.nn.dropout(y, 1.0 - hparams.dropout) + x += y + return x + + +def decompress(source, hparams, first_relu, name): """Decompression function.""" with tf.variable_scope(name): shape = tf.shape(source) thicker = common_layers.conv_block( source, hparams.hidden_size * 2, [((1, 1), (1, 1))], - name="decompress_conv") + first_relu=first_relu, name="decompress_conv") return tf.reshape(thicker, [shape[0], shape[1] * 2, 1, hparams.hidden_size]) @@ -60,6 +78,7 @@ def compress_vae(inputs, hparams, name): # Run compression by strided convs. 
cur = tf.expand_dims(inputs, axis=2) for i in xrange(hparams.num_compress_steps): + cur = residual_conv(cur, 1, hparams, "compress_rc_%d" % i) cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (2, 1))], strides=(2, 1), name="compress_%d" % i) @@ -72,73 +91,78 @@ def compress_vae(inputs, hparams, name): return cur, kl_loss +def encode(x, x_space, hparams, name): + """Transformer preparations and encoder.""" + with tf.variable_scope(name): + (encoder_input, encoder_self_attention_bias, + _) = transformer.transformer_prepare_encoder(x, x_space, hparams) + residual_fn = transformer.get_residual_fn(hparams) + encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) + return transformer.transformer_encoder( + encoder_input, residual_fn, encoder_self_attention_bias, hparams) + + +def dropmask(targets, targets_dropout_max, is_training): + if not is_training: + return targets + targets_drop_prob = tf.random_uniform([]) * targets_dropout_max + drop_mask = tf.random_uniform(tf.shape(targets)[:-1]) + drop_mask = tf.to_float(tf.less(drop_mask, targets_drop_prob)) + keep_mask = tf.expand_dims(1.0 - drop_mask, axis=2) + return targets * keep_mask + + def vae_transformer_internal(inputs, targets, target_space, hparams): """VAE Transformer, main step used for training.""" with tf.variable_scope("vae_transformer"): is_training = hparams.mode == tf.contrib.learn.ModeKeys.TRAIN # Prepare inputs, targets, and k. inputs = common_layers.flatten4d3d(inputs) + input_len = tf.shape(inputs)[1] # Double input size to cover targets. + inputs = tf.pad(inputs, [[0, 0], [0, input_len], [0, 0]]) + inputs.set_shape([None, None, hparams.hidden_size]) targets = common_layers.flatten4d3d(targets) k = 2**hparams.num_compress_steps - _, targets = common_layers.pad_to_same_length( + inputs, targets = common_layers.pad_to_same_length( inputs, targets, final_length_divisible_by=k) + inputs = encode(inputs, target_space, hparams, "input_enc") + + # Dropout targets or swap for zeros 5% of the time. + max_prestep = 90000 + prob_targets = 0.95 if is_training else 1.0 + targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01 + targets = dropmask(targets, targets_dropout_max, is_training) + targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets), + lambda: targets, lambda: tf.zeros_like(targets)) + + # Join targets with inputs, run encoder. + # to_encode = common_layers.conv_block( + # tf.expand_dims(tf.concat([targets, inputs], axis=2), axis=2), + # hparams.hidden_size, [((1, 1), (1, 1))], + # first_relu=False, name="join_targets") + # to_compress = encode(tf.squeeze(to_encode, axis=2), + # target_space, hparams, "enc") + + # Compress and vae. + z, kl_loss = compress_vae(targets, hparams, "vae") - # Transformer preparations and encoder. - (encoder_input, encoder_self_attention_bias, - encoder_decoder_attention_bias) = transformer.transformer_prepare_encoder( - inputs, target_space, hparams) - residual_fn = transformer.get_residual_fn(hparams) - encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) - encoder_output = transformer.transformer_encoder( - encoder_input, residual_fn, encoder_self_attention_bias, hparams) - - def get_decoder_autoregressive(): - """Decoder input for autoregressive computation.""" - (a, b) = transformer.transformer_prepare_decoder(targets, hparams) - return (a, b, tf.constant(0.0)) - - # 10% of the time we compress all-zeros, as will be at decoding start. 
- prob_targets = 0.9 if is_training else 1.0 - to_compress = tf.cond(tf.less(tf.random_uniform([]), prob_targets), - lambda: targets, lambda: tf.zeros_like(targets)) - z, kl_loss = compress_vae(to_compress, hparams, "vae") # Decompress. for i in xrange(hparams.num_compress_steps): j = hparams.num_hidden_layers - i - 1 - z = decompress(z, hparams, "decompress_%d" % j) + z = residual_conv(z, 1, hparams, "dec_rc_%d" % j) + z = decompress(z, hparams, i > 0, "decompress_%d" % j) - def get_decoder_from_vae(): - """Decoder input computed by VAE.""" - # Return decoder stuff. - (a, b) = transformer.transformer_prepare_decoder( - tf.squeeze(z, axis=2), hparams) - return (a, b, kl_loss) + # Join z with inputs, run decoder. + to_decode = common_layers.conv_block( + tf.concat([z, tf.expand_dims(inputs, axis=2)], axis=3), + hparams.hidden_size, [((1, 1), (1, 1))], name="join_z") + ret = encode(tf.squeeze(to_decode, axis=2), target_space, hparams, "dec") + # to_decode = residual_conv(to_decode, 2, hparams, "dec_conv") + # ret = tf.squeeze(to_decode, axis=2) # Randomize decoder inputs.. - prob_do_vae = common_layers.inverse_exp_decay(40000) * 0.7 - step = tf.to_float(tf.contrib.framework.get_global_step()) - if not is_training: - prob_do_vae = tf.cond(tf.less(step, 40000.0), lambda: tf.constant(0.0), - lambda: tf.constant(1.0)) - (decoder_input, decoder_self_attention_bias, kl_loss2) = tf.cond( - tf.less(tf.random_uniform([]), prob_do_vae), - get_decoder_from_vae, get_decoder_autoregressive) - - # Transformer decoder. - decoder_output = transformer.transformer_decoder( - decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, - encoder_decoder_attention_bias, hparams) - decoder_output = tf.expand_dims(decoder_output, 2) - - cond_self = tf.cond(tf.less(step, 30000.0), lambda: tf.constant(1.0), - lambda: tf.constant(0.0)) - prob_self = 0.4 if is_training else cond_self - (ret, kl_loss) = tf.cond(tf.less(tf.random_uniform([]), prob_self), - lambda: (z, kl_loss), - lambda: (decoder_output, kl_loss2)) - - kl_loss *= common_layers.inverse_exp_decay(50000) * 2.0 - return ret, kl_loss + kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 3.0 + return tf.expand_dims(ret, axis=2), kl_loss @registry.register_model @@ -171,6 +195,15 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, features, False, last_position_only=last_position_only) sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) samples = tf.concat(sharded_samples, 0) + + # 2nd step. + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + features["targets"] = samples + sharded_logits, _ = self.model_fn( + features, False, last_position_only=last_position_only) + sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) + samples = tf.concat(sharded_samples, 0) + if inputs_old is not None: # Restore to not confuse Estimator. 
features["inputs"] = inputs_old return samples @@ -180,6 +213,21 @@ def infer(self, features=None, decode_length=50, beam_size=1, top_beams=1, def transformer_vae_small(): """Set of hyperparameters.""" hparams = transformer.transformer_small() + hparams.batch_size = 2048 hparams.add_hparam("z_size", 128) hparams.add_hparam("num_compress_steps", 4) return hparams + + +@registry.register_hparams +def transformer_vae_base(): + """Set of hyperparameters.""" + hparams = transformer_vae_small() + hparams.hidden_size = 512 + hparams.filter_size = 2048 + hparams.attention_dropout = 0.1 + hparams.relu_dropout = 0.1 + hparams.dropout = 0.1 + hparams.num_hidden_layers = 4 + hparams.z_size = 256 + return hparams diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index a12aa1122..01768e263 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -195,9 +195,9 @@ def nth_model(n): features, skip=(skipping_is_on and skip_this_one)) with tf.variable_scope("losses_avg", reuse=True): total_loss, ops = 0.0, [] - for loss_key, loss_value in losses_dict.iteritems(): - loss_moving_avg = tf.get_variable("problem_%d/%s_loss" % (n, - loss_key)) + for loss_key, loss_value in six.iteritems(losses_dict): + loss_moving_avg = tf.get_variable( + "problem_%d/%s_loss" % (n, loss_key)) ops.append( loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1)) total_loss += loss_value diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index 3af4f10c1..a63f5cc7f 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -85,11 +85,21 @@ def __init__(self, ps_devices = [""] hparams = copy.copy(hparams) hparams.add_hparam("mode", mode) - # when not in training mode, set all forms of dropout to zero. + # When not in training mode, set all forms of dropout to zero. if mode != tf.contrib.learn.ModeKeys.TRAIN: for key in hparams.values(): if key[-len("dropout"):] == "dropout": setattr(hparams, key, 0.0) + # If vocabularies differ, unset shared_embedding_and_softmax_weights. + if hparams.shared_embedding_and_softmax_weights: + same_vocab_sizes = True + for problem in hparams.problems: + if "inputs" in problem.input_modality: + if problem.input_modality["inputs"] != problem.target_modality: + same_vocab_sizes = False + if not same_vocab_sizes: + tf.logging.info("Unsetting shared_embedding_and_softmax_weights.") + hparams.shared_embedding_and_softmax_weights = 0 self._hparams = hparams self._data_parallelism = data_parallelism self._num_datashards = data_parallelism.n From ce59768a415c4ceeeb47872d28ad90a10026ac18 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 8 Aug 2017 09:17:50 -0700 Subject: [PATCH 05/17] Add reversible residual block (from RevNet) PiperOrigin-RevId: 164601350 --- tensor2tensor/layers/rev_block.py | 195 +++++++++++++++++++++++++ tensor2tensor/layers/rev_block_test.py | 92 ++++++++++++ 2 files changed, 287 insertions(+) create mode 100644 tensor2tensor/layers/rev_block.py create mode 100644 tensor2tensor/layers/rev_block_test.py diff --git a/tensor2tensor/layers/rev_block.py b/tensor2tensor/layers/rev_block.py new file mode 100644 index 000000000..1e1a7b848 --- /dev/null +++ b/tensor2tensor/layers/rev_block.py @@ -0,0 +1,195 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Reversible Residual Block. + +From +[The Reversible Residual Network: Backpropagation Without Storing +Activations](https://arxiv.org/abs/1707.04585). +""" +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import re + +# Dependency imports + +import tensorflow as tf +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import function + +LAYER_RE = re.compile(".*revlayer_([0-9]*)/([fg])/.*") + + +def _rev_layer_forward(xs, f, g): + """Forward for 1 reversible layer.""" + x1, x2 = xs + with tf.variable_scope("f"): + y1 = x1 + f(x2) + with tf.variable_scope("g"): + y2 = x2 + g(y1) + return (y1, y2) + + +def _rev_layer_backward(ys, grad_ys, f, g, f_vars, g_vars): + """Backprop for 1 layer.""" + y1, y2 = ys + grad_y1, grad_y2 = grad_ys + + # Reconstruct intermediates and inputs (x1, x2) + # stop_gradients required on y1 and x2 to prevent infinite recursion into this + # grad function on the calls to tf.gradients. + y1_stop = tf.stop_gradient(y1) + with tf.variable_scope("g"): + gy1 = g(y1_stop) + + x2 = y2 - gy1 + x2_stop = tf.stop_gradient(x2) + with tf.variable_scope("f"): + fx2 = f(x2_stop) + + x1 = y1 - fx2 + + # Compute gradients wrt to inputs + # dL/dy2 * dG(y1)/y1 + grad_gy1_y2 = tf.gradients(gy1, y1_stop, grad_y2)[0] + grad_x1 = grad_y1 + grad_gy1_y2 + grad_x2 = (tf.gradients(fx2, x2_stop, grad_y1)[0] + grad_y2 + tf.gradients( + fx2, x2_stop, grad_gy1_y2)[0]) + + # Compute gradients wrt to vars in f and g + grad_g_vars = tf.gradients(gy1, g_vars, grad_y2) + grad_f_y1 = tf.gradients(fx2, f_vars, grad_y1) + grad_f_y2 = tf.gradients(fx2, f_vars, grad_gy1_y2) + grad_f_vars = [tf.add_n(grads) for grads in zip(grad_f_y1, grad_f_y2)] + + return (x1, x2), (grad_x1, grad_x2), grad_f_vars, grad_g_vars + + +def _rev_block_forward(x, f, g, num_layers=1, layer_scopes=None, name=None): + """Forward for a series of reversible layers.""" + x1, x2 = tf.split(x, 2, axis=len(x.get_shape()) - 1) + out = (x1, x2) + with tf.variable_scope(name, default_name="revblock"): + for i in xrange(num_layers): + with tf.variable_scope("revlayer_%d" % i) as layer_vs: + if layer_scopes is not None: + layer_scopes.append(layer_vs) + out = _rev_layer_forward(out, f, g) + + y1, y2 = out + y = tf.concat([y1, y2], axis=-1) + return y + + +def rev_block(x, f, g, num_layers=1, is_training=True): + """A block of reversible residual layers. + + A reversible residual layer is defined as: + + ``` + x1, x2 = tf.split(x, 2, axis=-1) + y1 = x1 + f(x2) + y2 = x2 + g(y1) + y = tf.concat([y1, y2], axis=-1) + ``` + + Args: + x: a float Tensor, input, will be split evenly across the last dim. + f: a function, (Tensor) -> (Tensor). Should not change the shape of the + Tensor. May create variables. Should NOT close over any Tensor values. + g: a function, (Tensor) -> (Tensor). Should not change the shape of the + Tensor. May create variables. Should NOT close over any Tensor values. + num_layers: int, number of reversible residual layers. 
Each layer will + apply f and g according to the equations above, with new variables in each + layer. + is_training: bool, whether to actually use the efficient backprop codepath. + + Returns: + y: a float Tensor, output. + """ + layer_scopes = [] + + def rev_block_grad(op, grad_y): + """Custom gradient fn for a block of reversible residual layers.""" + y = op.outputs[0] + ys = tf.split(y, 2, axis=len(y.get_shape()) - 1) + grad_ys = tf.split(grad_y, 2, axis=len(y.get_shape()) - 1) + + # Find all variables from f and from g + # Keep track of their positions in all_vars + all_vars = op.inputs[1:] + f_vars = [[] for _ in range(num_layers)] + g_vars = [[] for _ in range(num_layers)] + f_vars_idxs = [[] for _ in range(num_layers)] + g_vars_idxs = [[] for _ in range(num_layers)] + + for i, v in enumerate(all_vars): + ref = v.op.inputs[0] + assert ref.dtype == dtypes.float32_ref + regex = LAYER_RE.match(v.name) + layer_no = int(regex.group(1)) + fn_name = regex.group(2) + if fn_name == "f": + f_vars[layer_no].append(ref) + f_vars_idxs[layer_no].append(i) + else: + assert fn_name == "g" + g_vars[layer_no].append(ref) + g_vars_idxs[layer_no].append(i) + + f_grads = [] + g_grads = [] + + # Reverse state containers to go backward + layer_scopes.reverse() + f_vars.reverse() + g_vars.reverse() + + for i in xrange(num_layers): + with tf.variable_scope(layer_scopes[i], reuse=True): + ys, grad_ys, grad_f_vars, grad_g_vars = _rev_layer_backward( + ys, grad_ys, f, g, f_vars[i], g_vars[i]) + f_grads.append(grad_f_vars) + g_grads.append(grad_g_vars) + + # Gradients were collected in reverse layer order + f_grads.reverse() + g_grads.reverse() + + # Reorder the gradients so they match the original order of all_vars + var_grads = [None] * len(all_vars) + for idxs, grads in zip(f_vars_idxs, f_grads) + zip(g_vars_idxs, g_grads): + for i, grad in zip(idxs, grads): + var_grads[i] = grad + + grad_x = tf.concat(grad_ys, axis=-1) + all_grads = [grad_x] + var_grads + return all_grads + + @function.Defun( + tf.float32, + python_grad_func=rev_block_grad, + shape_func=lambda _: [x.get_shape()]) + def rev_block_defun(inp): + inp.set_shape(x.get_shape()) + return _rev_block_forward( + inp, f, g, num_layers=num_layers, layer_scopes=layer_scopes) + + if is_training: + return rev_block_defun(x) + else: + return _rev_block_forward(x, f, g, num_layers=num_layers) diff --git a/tensor2tensor/layers/rev_block_test.py b/tensor2tensor/layers/rev_block_test.py new file mode 100644 index 000000000..bc4bcc6a4 --- /dev/null +++ b/tensor2tensor/layers/rev_block_test.py @@ -0,0 +1,92 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
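Before the tests, a minimal usage sketch of the rev_block API added above. It
mirrors the test setup below; f and g must preserve the half-channel shape,
and tf.layers.dense is the TF 1.x dense layer.

import tensorflow as tf
from tensor2tensor.layers import rev_block

channels = 8

def f(x):
  return tf.layers.dense(x, channels // 2, use_bias=True)

def g(x):
  return tf.layers.dense(x, channels // 2, use_bias=True)

x = tf.random_uniform([16, channels], dtype=tf.float32)
# x is split in half along the last axis; each layer computes
# y1 = x1 + f(x2), y2 = x2 + g(y1). With is_training=True the custom
# gradient recomputes activations during backprop instead of storing them.
y = rev_block.rev_block(x, f, g, num_layers=4, is_training=True)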
+ +"""Tests for RevBlock.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import rev_block + +import tensorflow as tf + + +class RevBlockTest(tf.test.TestCase): + + def testSmoke(self): + channels = 8 + num_layers = 4 + batch_size = 16 + use_defun = True + tf.set_random_seed(1234) + + def f(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + def g(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + x = tf.random_uniform([batch_size, channels], dtype=tf.float32) + y = rev_block.rev_block( + x, f, g, num_layers=num_layers, is_training=use_defun) + loss = tf.reduce_mean(y + 10.) + grads = tf.gradients(loss, [x] + tf.global_variables()) + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + _ = sess.run(grads) + + def testRevBlock(self): + channels = 8 + num_layers = 4 + batch_size = 16 + tf.set_random_seed(1234) + + def f(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + def g(x): + return tf.layers.dense(x, channels // 2, use_bias=True) + + x = tf.random_uniform([batch_size, channels], dtype=tf.float32) + + with tf.variable_scope("defun") as vs: + y_defun = rev_block.rev_block(x, f, g, num_layers=num_layers) + fg_vars = vs.trainable_variables() + + num_vars = len(tf.global_variables()) + with tf.variable_scope(vs, reuse=True): + y = rev_block.rev_block(x, f, g, num_layers=num_layers, is_training=False) + # Ensure no new vars were created - full reuse + assert len(tf.global_variables()) == num_vars + + loss_defun = tf.reduce_mean(y_defun + 10.) + loss = tf.reduce_mean(y + 10.) + + grads_defun = tf.gradients(loss_defun, [x] + fg_vars) + grads = tf.gradients(loss, [x] + fg_vars) + + with self.test_session() as sess: + sess.run(tf.global_variables_initializer()) + y_val, yd_val, gd_val, g_val = sess.run([y, y_defun, grads_defun, grads]) + self.assertAllClose(y_val, yd_val) + for g1, g2 in zip(gd_val, g_val): + self.assertAllClose(g1, g2) + + +if __name__ == "__main__": + tf.test.main() From 9c54d864b73bc12103883f86ad3dead9aa3d4ce7 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 8 Aug 2017 11:14:05 -0700 Subject: [PATCH 06/17] Real modalities PiperOrigin-RevId: 164617848 --- .../data_generators/gene_expression.py | 15 ++------ tensor2tensor/layers/common_layers.py | 18 +++++++-- tensor2tensor/layers/modalities.py | 38 +++++++++++++++++-- tensor2tensor/models/gene_expression.py | 8 ++-- tensor2tensor/utils/metrics.py | 38 +++++++++++++++++-- tensor2tensor/utils/registry.py | 14 +++++++ tensor2tensor/utils/trainer_utils.py | 5 ++- 7 files changed, 110 insertions(+), 26 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index d314cec59..ea3aa4cc4 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -146,17 +146,14 @@ def hparams(self, defaults, model_hparams): p = defaults vocab_size = self._encoders["inputs"].vocab_size p.input_modality = {"inputs": (registry.Modalities.SYMBOL, vocab_size)} - p.target_modality = ("%s:real" % registry.Modalities.GENERIC, + p.target_modality = ("%s:log_poisson_loss" % registry.Modalities.REAL, self.num_output_predictions) p.input_space_id = problem.SpaceID.DNA p.target_space_id = problem.SpaceID.REAL def example_reading_spec(self): - # TODO(rsepassi): propagate and apply targets_mask to output RealModality - # and to eval metrics 
(weights_fn?). data_fields = { "inputs": tf.VarLenFeature(tf.int64), - "targets_mask": tf.VarLenFeature(tf.float32), "targets": tf.VarLenFeature(tf.float32), } data_items_to_decoders = None @@ -168,18 +165,12 @@ def preprocess_examples(self, examples, mode, hparams): # Reshape targets examples["targets"] = tf.reshape(examples["targets"], - [-1, self.num_output_predictions]) - examples["targets_mask"] = tf.reshape(examples["targets_mask"], [-1, 1]) - - # Set masked targets to 0 (i.e. pad) so that loss and metrics ignore them. - # Add epsilon because some unmasked labels are actually 0. - examples["targets"] += 1e-6 - examples["targets"] *= examples["targets_mask"] + [-1, 1, self.num_output_predictions]) return examples def eval_metrics(self): - return [metrics.Metrics.RMSE] + return [metrics.Metrics.LOG_POISSON, metrics.Metrics.R2] @registry.register_problem("gene_expression_cage10") diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index ea18322e4..8054b27df 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -359,13 +359,23 @@ def conv2d_kernel(kernel_size_arg, name_suffix): return conv2d_kernel(kernel_size, "single") -def conv(inputs, filters, kernel_size, **kwargs): - return conv_internal(tf.layers.conv2d, inputs, filters, kernel_size, **kwargs) +def conv(inputs, filters, kernel_size, dilation_rate=1, **kwargs): + return conv_internal( + tf.layers.conv2d, + inputs, + filters, + kernel_size, + dilation_rate=dilation_rate, + **kwargs) -def conv1d(inputs, filters, kernel_size, **kwargs): +def conv1d(inputs, filters, kernel_size, dilation_rate=1, **kwargs): return tf.squeeze( - conv(tf.expand_dims(inputs, 2), filters, (kernel_size, 1), **kwargs), 2) + conv( + tf.expand_dims(inputs, 2), + filters, (kernel_size, 1), + dilation_rate=(dilation_rate, 1), + **kwargs), 2) def separable_conv(inputs, filters, kernel_size, **kwargs): diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 523c52fa8..5071a49ad 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -438,6 +438,7 @@ def __init__(self, model_hparams, vocab_size): @registry.register_image_modality("identity") @registry.register_symbol_modality("identity") @registry.register_class_label_modality("identity") +@registry.register_real_modality("identity") class IdentityModality(modality.Modality): """Does nothing.""" @@ -452,9 +453,12 @@ def top(self, body_output, _): return body_output -@registry.register_generic_modality("real") class RealModality(modality.Modality): - """Modality for real (i.e. float) vectors.""" + """Base class for real (i.e. float) vectors. + + * Bottom is a linear projection layer to hparams.hidden_size. + * Top is a linear projection layer to vocab_size. + """ def bottom(self, x): with tf.variable_scope("real"): @@ -464,7 +468,16 @@ def top(self, body_output, _): with tf.variable_scope("real"): return tf.layers.dense(body_output, self._vocab_size) - def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): + raise NotImplementedError() + + +@registry.register_real_modality("default") +@registry.register_real_modality("l2_loss") +class RealL2LossModality(RealModality): + """Modality for real (i.e. 
float) vectors with L2 (Gaussian) loss.""" + + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): predictions = top_out with tf.name_scope("l2"): weights = weights_fn(targets) @@ -472,6 +485,25 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_nonzero): return tf.reduce_sum(l2 * weights), tf.reduce_sum(weights) +@registry.register_real_modality("log_poisson_loss") +class RealLogPoissonLossModality(RealL2LossModality): + """Modality for real (i.e. float) vectors with log Poisson regression loss. + + * Top is a linear projection to vocab size followed by a log transform. + """ + + def top(self, body_output, _): + with tf.variable_scope("real"): + return tf.log(tf.layers.dense(body_output, self._vocab_size)) + + def loss(self, top_out, targets, weights_fn=common_layers.weights_all): + predictions = top_out + with tf.name_scope("log_possion"): + weights = weights_fn(targets) + lp_loss = tf.nn.log_poisson_loss(targets, predictions) + return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights) + + @registry.register_image_modality("identity_no_pad") class IdentityModalityNoPad(modality.Modality): """Does nothing except making sure that there is no padding in cross-ent.""" diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index af2d83158..ad15926ac 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -121,12 +121,14 @@ def fc_layer(x, num_out, dropout_rate, name="fc"): def gene_expression_conv_base(): """Hparams for GeneExpressionConv model.""" hparams = common_hparams.basic_params1() + hparams.max_length = 10000000 + hparams.batch_size = 1024 + hparams.dropout = 0.1 hparams.add_hparam("num_conv_layers", 4) hparams.add_hparam("num_dconv_layers", 7) hparams.add_hparam("pooling_windows", [2, 4, 4, 4]) - # TODO(rsepassi): Correct the values of these hyperparameters - hparams.hidden_size = 128 - hparams.kernel_width = 128 + hparams.hidden_size = 256 + hparams.kernel_width = 20 hparams.add_hparam("stride", 1) return hparams diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index ea2187427..b4d7360ca 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -38,11 +38,13 @@ class Metrics(object): NEG_LOG_PERPLEXITY = "neg_log_perplexity" APPROX_BLEU = "approx_bleu_score" RMSE = "rmse" + LOG_POISSON = "log_poisson" + R2 = "r_squared" ROUGE_2_F = "rouge_2_fscore" ROUGE_L_F = "rouge_L_fscore" -def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): +def padded_rmse(predictions, labels, weights_fn=common_layers.weights_all): predictions, labels = common_layers.pad_with_zeros(predictions, labels) targets = labels weights = weights_fn(targets) @@ -50,6 +52,33 @@ def padded_rmse(predictions, labels, weights_fn=common_layers.weights_nonzero): return tf.reduce_sum(error * weights), tf.reduce_sum(weights) +def padded_log_poisson(predictions, + labels, + weights_fn=common_layers.weights_all): + # Expects predictions to already be transformed into log space + predictions, labels = common_layers.pad_with_zeros(predictions, labels) + targets = labels + weights = weights_fn(targets) + + lp_loss = tf.nn.log_poisson_loss(targets, predictions, compute_full_loss=True) + return tf.reduce_sum(lp_loss * weights), tf.reduce_sum(weights) + + +def padded_variance_explained(predictions, + labels, + weights_fn=common_layers.weights_all): + # aka R^2 + predictions, labels = common_layers.pad_with_zeros(predictions, 
labels) + targets = labels + weights = weights_fn(targets) + + y_bar = tf.reduce_mean(weights * targets) + tot_ss = tf.reduce_sum(weights * tf.pow(targets - y_bar, 2)) + res_ss = tf.reduce_sum(weights * tf.pow(targets - predictions, 2)) + r2 = 1. - res_ss / tot_ss + return r2, tf.reduce_sum(weights) + + def padded_accuracy_topk(predictions, labels, k, @@ -165,8 +194,9 @@ def problem_metric_fn(predictions, labels, weights): (problem_name, metrics, METRICS_FNS.keys())) class_output = "image" in problem_name and "coco" not in problem_name - weights_fn = (common_layers.weights_all - if class_output else common_layers.weights_nonzero) + real_output = "gene_expression" in problem_name + weights_fn = (common_layers.weights_all if class_output or real_output else + common_layers.weights_nonzero) for metric in metrics: metric_fn = METRICS_FNS[metric] @@ -191,6 +221,8 @@ def problem_metric_fn(predictions, labels, weights): Metrics.NEG_LOG_PERPLEXITY: padded_neg_log_perplexity, Metrics.APPROX_BLEU: bleu_hook.bleu_score, Metrics.RMSE: padded_rmse, + Metrics.LOG_POISSON: padded_log_poisson, + Metrics.R2: padded_variance_explained, Metrics.ROUGE_2_F: rouge.rouge_2_fscore, Metrics.ROUGE_L_F: rouge.rouge_l_fscore, } diff --git a/tensor2tensor/utils/registry.py b/tensor2tensor/utils/registry.py index 5402e5bde..fea647b2b 100644 --- a/tensor2tensor/utils/registry.py +++ b/tensor2tensor/utils/registry.py @@ -64,6 +64,7 @@ class Modalities(object): AUDIO = "audio" CLASS_LABEL = "class_label" GENERIC = "generic" + REAL = "real" _MODALITIES = { @@ -72,6 +73,7 @@ class Modalities(object): Modalities.AUDIO: {}, Modalities.CLASS_LABEL: {}, Modalities.GENERIC: {}, + Modalities.REAL: {}, } # Camel case to snake case utils @@ -277,6 +279,11 @@ def class_label_modality(name=None): Modalities.CLASS_LABEL.capitalize()) +def real_modality(name=None): + return _internal_get_modality(name, _MODALITIES[Modalities.REAL], + Modalities.REAL.capitalize()) + + def _internal_register_modality(name, mod_collection, collection_str): """Register a modality into mod_collection.""" @@ -309,6 +316,12 @@ def register_generic_modality(name=None): Modalities.GENERIC.capitalize()) +def register_real_modality(name=None): + """Register a real modality. name defaults to class name snake-cased.""" + return _internal_register_modality(name, _MODALITIES[Modalities.REAL], + Modalities.REAL.capitalize()) + + def register_audio_modality(name=None): """Register an audio modality. 
name defaults to class name snake-cased."""
   return _internal_register_modality(name, _MODALITIES[Modalities.AUDIO],
@@ -366,6 +379,7 @@ def create_modality(modality_spec, model_hparams):
       Modalities.IMAGE: image_modality,
       Modalities.CLASS_LABEL: class_label_modality,
       Modalities.GENERIC: generic_modality,
+      Modalities.REAL: real_modality,
   }
 
   modality_full_name, vocab_size = modality_spec
diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py
index 9e869c15c..ebf58ee97 100644
--- a/tensor2tensor/utils/trainer_utils.py
+++ b/tensor2tensor/utils/trainer_utils.py
@@ -219,10 +219,13 @@ def add_problem_hparams(hparams, problems):
   for problem_name in problems.split("-"):
     try:
       problem = registry.problem(problem_name)
-      p_hparams = problem.internal_hparams(hparams)
     except ValueError:
       problem = None
+
+    if problem is None:
       p_hparams = problem_hparams.problem_hparams(problem_name, hparams)
+    else:
+      p_hparams = problem.internal_hparams(hparams)
 
     hparams.problem_instances.append(problem)
     hparams.problems.append(p_hparams)

From f5c9b17e617ea9179b7d84d36b1e8162cb369f25 Mon Sep 17 00:00:00 2001
From: Noam Shazeer
Date: Tue, 8 Aug 2017 12:42:38 -0700
Subject: [PATCH 07/17] Added options for configuring different types of
 processing on layer input and layer output (normalization, dropout,
 residuals).

These settings are configured by common_hparams, and should work across
many models.

Normalization on layer input instead of after the residual seems to help in
learning deep networks.

This change breaks current model checkpoints.

PiperOrigin-RevId: 164630450
---
 tensor2tensor/layers/common_hparams.py     |  17 +-
 tensor2tensor/layers/common_layers.py      | 146 ++++++++++++-----
 tensor2tensor/layers/common_layers_test.py |  55 ++-----
 tensor2tensor/models/attention_lm.py       |  54 +++----
 tensor2tensor/models/attention_lm_moe.py   |  27 ++--
 tensor2tensor/models/slicenet.py           |  10 +-
 tensor2tensor/models/transformer.py        | 176 ++++++++++++---------
 tensor2tensor/models/transformer_moe.py    |  39 +++--
 tensor2tensor/models/transformer_vae.py    |   3 +-
 9 files changed, 307 insertions(+), 220 deletions(-)

diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py
index 6ecb06fb4..498b5eb37 100644
--- a/tensor2tensor/layers/common_hparams.py
+++ b/tensor2tensor/layers/common_hparams.py
@@ -69,8 +69,23 @@ def basic_params1():
       sampling_method="argmax",  # "argmax" or "random"
       problem_choice="adaptive",  # "uniform", "adaptive", "distributed"
       multiply_embedding_mode="sqrt_depth",
+      # Sequences of operations to perform on layer input and layer output.
+      # Used by common_layers.layer_preprocess, common_layers.layer_postprocess
+      # Each character represents an operation:
+      #   d: apply dropout
+      #   n: apply normalization (see norm_type and norm_epsilon)
+      #   a: add layer input (residual connection - only during postprocess)
+      # TODO(noam): The current settings ("", "dan") are the published version
+      # of the transformer.  ("n", "da") seems better for harder-to-learn
+      # models, so it should probably be the default.
+      layer_preprocess_sequence="",
+      layer_postprocess_sequence="dan",
+      # dropout rate to use during layer_preprocess and layer_postprocess
+      layer_prepostprocess_dropout=0.1,
+      # What type of normalization to use
       norm_type="none",  # "batch", "layer", "noam", "none".
+      # epsilon parameter to normalization function
-      layer_norm_epsilon=1e-6,
+      norm_epsilon=1e-6,
       symbol_modality_num_shards=16,
       # setting the max length in a minibatch.
0 means default behavior, # max_length = hparams.batch_size * length_multiplier diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 8054b27df..31bc0bced 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -462,64 +462,136 @@ def layer_norm(x, filters=None, epsilon=1e-6, name=None, reuse=None): return result -def noam_norm(x, name=None): +def noam_norm(x, epsilon=1.0, name=None): """One version of layer normalization.""" with tf.name_scope(name, default_name="noam_norm", values=[x]): shape = x.get_shape() ndims = len(shape) - return (tf.nn.l2_normalize(x, ndims - 1, epsilon=1.0) * + return (tf.nn.l2_normalize(x, ndims - 1, epsilon=epsilon) * tf.sqrt(tf.to_float(shape[-1]))) -def get_norm(norm_type): - """Get the normalizer function.""" +def apply_norm(x, norm_type, depth, epsilon): + """Apply Normalization.""" if norm_type == "layer": - return lambda x, name, filters=None, epsilon=1e-6: layer_norm( # pylint: disable=g-long-lambda - x, filters=filters, epsilon=epsilon, name=name) + return layer_norm(x, filters=depth, epsilon=epsilon) if norm_type == "batch": - return tf.layers.batch_normalization + return tf.layers.batch_normalization(x, epsilon=epsilon) if norm_type == "noam": - return noam_norm + return noam_norm(x, epsilon) if norm_type == "none": - return lambda x, name: x + return x raise ValueError("Parameter normalizer_fn must be one of: 'layer', 'batch'," "'noam', 'none'.") -def residual_fn(x, - y, - norm_type, - residual_dropout, - filters=None, - epsilon=1e-16, - name=None, - reuse=None): - """Returns a function for combining layer input and layer output. +def layer_prepostprocess(previous_value, + x, + sequence, + dropout_rate, + norm_type, + depth, + epsilon, + name): + """Apply a sequence of functions to the input or output of a layer. + + The sequence is specified as a string which may contain the following + characters: + a: add previous_value + n: apply normalization + d: apply dropout - The returned function on x (layer input) and y (layer output) computes: - norm_function(x + dropout(y)) + For example, if sequence=="dna", then the output is + previous_value + normalize(dropout(x)) Args: - x: tensor, input layer - y: tensor, output layer - norm_type: string, type of normalizer function - residual_dropout: integer, dropout value for residual connection - filters: integer, dimension for layer norm, optional - epsilon: integer, value of layer norm epsilon - name: string, name - reuse: bool, whether to reuse + previous_value: A Tensor, to be added as a residual connection ('a') + x: A Tensor to be transformed. + sequence: a string. + dropout_rate: a float + norm_type: a string (see apply_norm()) + depth: an integer (size of last dimension of x). + epsilon: a float (parameter for normalization) + name: a string Returns: - residual layer output with applied norm_fn. 
+    a Tensor
+  """
+  with tf.variable_scope(name):
+    for c in sequence:
+      if c == "a":
+        x += previous_value
+      elif c == "n":
+        x = apply_norm(x, norm_type, depth, epsilon)
+      else:
+        assert c == "d", ("Unknown sequence step %s" % c)
+        x = tf.nn.dropout(x, 1.0 - dropout_rate)
+    return x
+
+
+def layer_preprocess(layer_input, hparams):
+  """Apply layer preprocessing.
+
+  See layer_prepostprocess() for details.
+
+  A hyperparameters object is passed for convenience.  The hyperparameters
+  that may be used are:
+
+    layer_preprocess_sequence
+    layer_prepostprocess_dropout
+    norm_type
+    hidden_size
+    norm_epsilon
+
+  Args:
+    layer_input: a Tensor
+    hparams: a hyperparameters object.
+
+  Returns:
+    a Tensor
+  """
+  assert "a" not in hparams.layer_preprocess_sequence, (
+      "No residual connections allowed in hparams.layer_preprocess_sequence")
+  return layer_prepostprocess(
+      None, layer_input,
+      sequence=hparams.layer_preprocess_sequence,
+      dropout_rate=hparams.layer_prepostprocess_dropout,
+      norm_type=hparams.norm_type,
+      depth=hparams.hidden_size,
+      epsilon=hparams.norm_epsilon,
+      name="layer_prepostprocess")
+
+
+def layer_postprocess(layer_input, layer_output, hparams):
+  """Apply layer postprocessing.
+
+  See layer_prepostprocess() for details.
+
+  A hyperparameters object is passed for convenience.  The hyperparameters
+  that may be used are:
+
+    layer_postprocess_sequence
+    layer_prepostprocess_dropout
+    norm_type
+    hidden_size
+    norm_epsilon
+
+  Args:
+    layer_input: a Tensor
+    layer_output: a Tensor
+    hparams: a hyperparameters object.
+ + Returns: + a Tensor + """ + return layer_prepostprocess( + layer_input, layer_output, + sequence=hparams.layer_postprocess_sequence, + dropout_rate=hparams.layer_prepostprocess_dropout, + norm_type=hparams.norm_type, + depth=hparams.hidden_size, + epsilon=hparams.norm_epsilon, + name="layer_postprocess") def conv_block_internal(conv_fn, diff --git a/tensor2tensor/layers/common_layers_test.py b/tensor2tensor/layers/common_layers_test.py index df3ccc68f..3cf3f3374 100644 --- a/tensor2tensor/layers/common_layers_test.py +++ b/tensor2tensor/layers/common_layers_test.py @@ -303,74 +303,43 @@ def testDeconvStride2MultiStep(self): actual = session.run(a) self.assertEqual(actual.shape, (5, 32, 1, 16)) - def testGetNormLayerFn(self): - norm_type = "layer" + def testApplyNormLayer(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="layer", filters=11) + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "layer", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGetNormNoamFn(self): - norm_type = "noam" + def testApplyNormNoam(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="noam") + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "noam", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGetNormBatchFn(self): - norm_type = "batch" + def testApplyNormBatch(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="batch") + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "batch", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGetNormNoneFn(self): - norm_type = "none" + def testApplyNormNone(self): with self.test_session() as session: - a = common_layers.get_norm(norm_type) x1 = np.random.rand(5, 2, 1, 11) - x2 = a(tf.constant(x1, dtype=tf.float32), name="none") + x2 = common_layers.apply_norm( + tf.constant(x1, dtype=tf.float32), "none", depth=11, epsilon=1e-6) session.run(tf.global_variables_initializer()) actual = session.run(x2) self.assertEqual(actual.shape, (5, 2, 1, 11)) self.assertAllClose(actual, x1, atol=1e-03) - def testResidualFn(self): - norm_type = "batch" - with self.test_session() as session: - x1 = np.random.rand(5, 2, 1, 11) - x2 = np.random.rand(5, 2, 1, 11) - x3 = common_layers.residual_fn( - tf.constant(x1, dtype=tf.float32), - tf.constant(x2, dtype=tf.float32), norm_type, 0.1) - session.run(tf.global_variables_initializer()) - actual = session.run(x3) - self.assertEqual(actual.shape, (5, 2, 1, 11)) - - def testResidualFnWithLayerNorm(self): - norm_type = "layer" - with self.test_session() as session: - x1 = np.random.rand(5, 2, 1, 11) - x2 = np.random.rand(5, 2, 1, 11) - x3 = common_layers.residual_fn( - tf.constant(x1, dtype=tf.float32), - tf.constant(x2, dtype=tf.float32), - norm_type, - 0.1, - epsilon=0.1) - session.run(tf.global_variables_initializer()) - actual = session.run(x3) - self.assertEqual(actual.shape, (5, 2, 1, 11)) - def testGlobalPool1d(self): x1 = np.random.rand(5, 
4, 11) no_mask = np.ones((5, 4)) diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 664bc9e21..495f25634 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -51,13 +51,10 @@ def model_fn_body(self, features): (decoder_input, decoder_self_attention_bias) = attention_lm_prepare_decoder( targets, hparams) - def residual_fn(x, y): - return common_layers.layer_norm(x + tf.nn.dropout( - y, 1.0 - hparams.residual_dropout)) - - decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - decoder_output = attention_lm_decoder(decoder_input, residual_fn, - decoder_self_attention_bias, hparams) + decoder_input = tf.nn.dropout( + decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + decoder_output = attention_lm_decoder( + decoder_input, decoder_self_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output @@ -84,7 +81,6 @@ def attention_lm_prepare_decoder(targets, hparams): def attention_lm_decoder(decoder_input, - residual_fn, decoder_self_attention_bias, hparams, name="decoder"): @@ -92,7 +88,6 @@ def attention_lm_decoder(decoder_input, Args: decoder_input: a Tensor - residual_fn: a function from (layer_input, layer_output) -> combined_output decoder_self_attention_bias: bias Tensor for self-attention (see common_attention.attention_bias()) hparams: hyperparameters for model @@ -105,25 +100,25 @@ def attention_lm_decoder(decoder_input, with tf.variable_scope(name): for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): - x = residual_fn( - x, - common_attention.multihead_attention( - x, - None, - decoder_self_attention_bias, - hparams.attention_key_channels or hparams.hidden_size, - hparams.attention_value_channels or hparams.hidden_size, - hparams.hidden_size, - hparams.num_heads, - hparams.attention_dropout, - name="decoder_self_attention")) - x = residual_fn(x, - common_layers.conv_hidden_relu( - x, - hparams.filter_size, - hparams.hidden_size, - dropout=hparams.relu_dropout)) - return x + with tf.variable_scope("self_attention"): + y = common_attention.multihead_attention( + common_layers.layer_preprocess(x, hparams), + None, + decoder_self_attention_bias, + hparams.attention_key_channels or hparams.hidden_size, + hparams.attention_value_channels or hparams.hidden_size, + hparams.hidden_size, + hparams.num_heads, + hparams.attention_dropout) + x = common_layers.layer_postprocess(x, y, hparams) + with tf.variable_scope("ffn"): + y = common_layers.conv_hidden_relu( + common_layers.layer_preprocess(x, hparams), + hparams.filter_size, + hparams.hidden_size, + dropout=hparams.relu_dropout) + x = common_layers.layer_postprocess(x, y, hparams) + return common_layers.layer_preprocess(x, hparams) @registry.register_hparams @@ -158,7 +153,6 @@ def attention_lm_base(): # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none return hparams @@ -178,5 +172,5 @@ def attention_lm_small(): hparams.num_hidden_layers = 4 hparams.hidden_size = 512 hparams.filter_size = 2048 - hparams.residual_dropout = 0.5 + hparams.layer_prepostprocess_dropout = 0.5 return hparams diff --git a/tensor2tensor/models/attention_lm_moe.py b/tensor2tensor/models/attention_lm_moe.py index 780478fec..1869eef66 100644 --- a/tensor2tensor/models/attention_lm_moe.py +++ b/tensor2tensor/models/attention_lm_moe.py @@ -49,21 +49,24 @@ def model_fn_body_sharded(self, sharded_features): targets = sharded_features["targets"] targets = dp(tf.squeeze, targets, 2) + def preprocess(x): + return dp(common_layers.layer_preprocess, x, hparams) + + def postprocess(x, y): + return dp(common_layers.layer_postprocess, x, y, hparams) + (decoder_input, decoder_self_attention_bias) = dp( attention_lm_moe_prepare_decoder, targets, hparams) - def residual_fn(x, y): - return common_layers.layer_norm(x + tf.nn.dropout( - y, 1.0 - hparams.residual_dropout)) - - x = dp(tf.nn.dropout, decoder_input, 1.0 - hparams.residual_dropout) + x = dp(tf.nn.dropout, decoder_input, + 1.0 - hparams.layer_prepostprocess_dropout) extra_loss = 0.0 for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("layer_%d" % layer): with tf.variable_scope("attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -72,11 +75,11 @@ def residual_fn(x, y): hparams.num_heads, hparams.attention_dropout, name="decoder_self_attention") - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, + dp, self._ps_devices, preprocess(x), hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) @@ -84,11 +87,12 @@ def residual_fn(x, y): else: y = dp( common_layers.conv_hidden_relu, - x, + preprocess(x), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) + x = preprocess(x) decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, extra_loss @@ -163,7 +167,6 @@ def attention_lm_moe_base(): # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none return hparams @@ -232,5 +235,5 @@ def attention_lm_moe_large(): hparams.filter_size = 4096 hparams.moe_hidden_size = 4096 hparams.moe_n1 = 128 - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 return hparams diff --git a/tensor2tensor/models/slicenet.py b/tensor2tensor/models/slicenet.py index 8900e6d11..1079659b5 100644 --- a/tensor2tensor/models/slicenet.py +++ b/tensor2tensor/models/slicenet.py @@ -111,7 +111,10 @@ def multi_conv_res(x, padding, name, layers, hparams, mask=None, source=None): hparams.separability - i for i in reversed(range(len(dilations_and_kernels2))) ] - norm_fn = common_layers.get_norm(hparams.norm_type) + def norm_fn(x, name): + with tf.variable_scope(name, default_name="norm"): + return common_layers.apply_norm( + x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) for layer in xrange(layers): with tf.variable_scope("layer_%d" % layer): y = common_layers.subseparable_conv_block( @@ -171,7 +174,10 @@ def similarity_cost(inputs_encoded, targets_encoded): def slicenet_middle(inputs_encoded, targets, target_space_emb, mask, hparams): """Middle part of slicenet, connecting encoder and decoder.""" - norm_fn = common_layers.get_norm(hparams.norm_type) + def norm_fn(x, name): + with tf.variable_scope(name, default_name="norm"): + return common_layers.apply_norm( + x, hparams.norm_type, hparams.hidden_size, hparams.norm_epsilon) # Flatten targets and embed target_space_id. targets_flat = tf.expand_dims(common_layers.flatten4d3d(targets), axis=2) diff --git a/tensor2tensor/models/transformer.py b/tensor2tensor/models/transformer.py index c9c87da07..caf8ab198 100644 --- a/tensor2tensor/models/transformer.py +++ b/tensor2tensor/models/transformer.py @@ -55,36 +55,22 @@ def model_fn_body(self, features): (decoder_input, decoder_self_attention_bias) = transformer_prepare_decoder( targets, hparams) - residual_fn = get_residual_fn(hparams) - - encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) - decoder_input = tf.nn.dropout(decoder_input, 1.0 - hparams.residual_dropout) - encoder_output = transformer_encoder(encoder_input, residual_fn, - encoder_self_attention_bias, hparams) + encoder_input = tf.nn.dropout( + encoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + decoder_input = tf.nn.dropout( + decoder_input, 1.0 - hparams.layer_prepostprocess_dropout) + encoder_output = transformer_encoder( + encoder_input, encoder_self_attention_bias, hparams) decoder_output = transformer_decoder( - decoder_input, encoder_output, residual_fn, decoder_self_attention_bias, + decoder_input, encoder_output, + decoder_self_attention_bias, encoder_decoder_attention_bias, hparams) decoder_output = tf.expand_dims(decoder_output, 2) return decoder_output -def get_residual_fn(hparams): - """Get residual_fn.""" - - def residual_fn(x, y): - return common_layers.residual_fn( - x, - y, - hparams.norm_type, - hparams.residual_dropout, - hparams.hidden_size, - epsilon=hparams.layer_norm_epsilon) - - return residual_fn - - def transformer_prepare_encoder(inputs, target_space, hparams): """Prepare one shard of the model for the encoder. 
@@ -143,7 +129,6 @@ def transformer_prepare_decoder(targets, hparams):
 
 
 def transformer_encoder(encoder_input,
-                        residual_fn,
                         encoder_self_attention_bias,
                         hparams,
                         name="encoder"):
@@ -151,7 +136,6 @@ def transformer_encoder(encoder_input,
 
   Args:
     encoder_input: a Tensor
-    residual_fn: a function from (layer_input, layer_output) -> combined_output
     encoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
     hparams: hyperparameters for model
@@ -164,25 +148,29 @@ def transformer_encoder(encoder_input,
   with tf.variable_scope(name):
     for layer in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
-        x = residual_fn(
-            x,
-            common_attention.multihead_attention(
-                x,
-                None,
-                encoder_self_attention_bias,
-                hparams.attention_key_channels or hparams.hidden_size,
-                hparams.attention_value_channels or hparams.hidden_size,
-                hparams.hidden_size,
-                hparams.num_heads,
-                hparams.attention_dropout,
-                name="encoder_self_attention"))
-        x = residual_fn(x, transformer_ffn_layer(x, hparams))
-  return x
+        with tf.variable_scope("self_attention"):
+          y = common_attention.multihead_attention(
+              common_layers.layer_preprocess(x, hparams),
+              None,
+              encoder_self_attention_bias,
+              hparams.attention_key_channels or hparams.hidden_size,
+              hparams.attention_value_channels or hparams.hidden_size,
+              hparams.hidden_size,
+              hparams.num_heads,
+              hparams.attention_dropout)
+          x = common_layers.layer_postprocess(x, y, hparams)
+        with tf.variable_scope("ffn"):
+          y = transformer_ffn_layer(
+              common_layers.layer_preprocess(x, hparams), hparams)
+          x = common_layers.layer_postprocess(x, y, hparams)
+  # if normalization is done in layer_preprocess, then it should also be done
+  # on the output, since the output can grow very large, being the sum of
+  # a whole stack of unnormalized layer outputs.
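+  # (With the default sequences ("", "dan"), layer_preprocess_sequence is
+  # empty and this final call is a no-op; it only takes effect for variants
+  # such as ("n", "da") that normalize on layer input.)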
+  return common_layers.layer_preprocess(x, hparams)
 
 
 def transformer_decoder(decoder_input,
                         encoder_output,
-                        residual_fn,
                         decoder_self_attention_bias,
                         encoder_decoder_attention_bias,
                         hparams,
@@ -192,7 +180,6 @@ def transformer_decoder(decoder_input,
   Args:
     decoder_input: a Tensor
     encoder_output: a Tensor
-    residual_fn: a function from (layer_input, layer_output) -> combined_output
     decoder_self_attention_bias: bias Tensor for self-attention
       (see common_attention.attention_bias())
     encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
@@ -207,32 +194,36 @@ def transformer_decoder(decoder_input,
   with tf.variable_scope(name):
     for layer in xrange(hparams.num_hidden_layers):
       with tf.variable_scope("layer_%d" % layer):
-        x = residual_fn(
-            x,
-            common_attention.multihead_attention(
-                x,
-                None,
-                decoder_self_attention_bias,
-                hparams.attention_key_channels or hparams.hidden_size,
-                hparams.attention_value_channels or hparams.hidden_size,
-                hparams.hidden_size,
-                hparams.num_heads,
-                hparams.attention_dropout,
-                name="decoder_self_attention"))
-        x = residual_fn(
-            x,
-            common_attention.multihead_attention(
-                x,
-                encoder_output,
-                encoder_decoder_attention_bias,
-                hparams.attention_key_channels or hparams.hidden_size,
-                hparams.attention_value_channels or hparams.hidden_size,
-                hparams.hidden_size,
-                hparams.num_heads,
-                hparams.attention_dropout,
-                name="encdec_attention"))
-        x = residual_fn(x, transformer_ffn_layer(x, hparams))
-  return x
+        with tf.variable_scope("self_attention"):
+          y = common_attention.multihead_attention(
+              common_layers.layer_preprocess(x, hparams),
+              None,
+              decoder_self_attention_bias,
+              hparams.attention_key_channels or hparams.hidden_size,
+              hparams.attention_value_channels or hparams.hidden_size,
+              hparams.hidden_size,
+              hparams.num_heads,
+              hparams.attention_dropout)
+          x = common_layers.layer_postprocess(x, y, hparams)
+        with tf.variable_scope("encdec_attention"):
+          y = common_attention.multihead_attention(
+              common_layers.layer_preprocess(x, hparams),
+              encoder_output,
+              encoder_decoder_attention_bias,
+              hparams.attention_key_channels or hparams.hidden_size,
+              hparams.attention_value_channels or hparams.hidden_size,
+              hparams.hidden_size,
+              hparams.num_heads,
+              hparams.attention_dropout)
+          x = common_layers.layer_postprocess(x, y, hparams)
+        with tf.variable_scope("ffn"):
+          y = transformer_ffn_layer(
+              common_layers.layer_preprocess(x, hparams), hparams)
+          x = common_layers.layer_postprocess(x, y, hparams)
+  # if normalization is done in layer_preprocess, then it should also be done
+  # on the output, since the output can grow very large, being the sum of
+  # a whole stack of unnormalized layer outputs.
+  return common_layers.layer_preprocess(x, hparams)
 
 
 def transformer_ffn_layer(x, hparams):
@@ -307,13 +298,39 @@ def transformer_base():
   # when not in training mode.
   hparams.add_hparam("attention_dropout", 0.0)
   hparams.add_hparam("relu_dropout", 0.0)
-  hparams.add_hparam("residual_dropout", 0.1)
   hparams.add_hparam("pos", "timing")  # timing, none
   hparams.add_hparam("nbr_decoder_problems", 1)
   hparams.add_hparam("proximity_bias", int(False))
   return hparams
 
 
+@registry.register_hparams
+def transformer_n_da():
+  """Normalize on layer input, instead of after residual connection.
+
+  This version seems to cure failure-to-learn bugs - for example, with very
+  deep networks or hard-to-learn mappings.
+
+  Probably this should become the default.
+
+  Returns:
+    a hyperparameters object.
+ """ + hparams = transformer_base() + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + # This version seems to benefit from a higher learning rate. + hparams.learning_rate = 0.4 + return hparams + + +@registry.register_hparams +def transformer_n_da_l10(): + hparams = transformer_n_da() + hparams.num_hidden_layers = 10 + return hparams + + @registry.register_hparams def transformer_big(): """HParams for transfomer big model on WMT.""" @@ -322,7 +339,7 @@ def transformer_big(): hparams.filter_size = 4096 hparams.num_heads = 16 hparams.batching_mantissa_bits = 2 - hparams.residual_dropout = 0.3 + hparams.layer_prepostprocess_dropout = 0.3 return hparams @@ -330,7 +347,7 @@ def transformer_big(): def transformer_big_single_gpu(): """HParams for transformer big model for single gpu.""" hparams = transformer_big() - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 hparams.learning_rate_warmup_steps = 16000 hparams.optimizer_adam_beta2 = 0.998 hparams.batching_mantissa_bits = 3 @@ -352,7 +369,7 @@ def transformer_parsing_base(): """Hparams for parsing on wsj only.""" hparams = transformer_base() hparams.attention_dropout = 0.2 - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 hparams.max_length = 512 hparams.learning_rate_warmup_steps = 16000 hparams.hidden_size = 1024 @@ -368,7 +385,7 @@ def transformer_parsing_big(): hparams.max_length = 512 hparams.shared_source_target_embedding = int(False) hparams.learning_rate_warmup_steps = 4000 - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 hparams.batch_size = 2048 hparams.learning_rate = 0.05 return hparams @@ -424,6 +441,13 @@ def transformer_l8(): return hparams +@registry.register_hparams +def transformer_l10(): + hparams = transformer_base() + hparams.num_hidden_layers = 10 + return hparams + + @registry.register_hparams def transformer_h1(): hparams = transformer_base() @@ -483,14 +507,14 @@ def transformer_ff4096(): @registry.register_hparams def transformer_dr0(): hparams = transformer_base() - hparams.residual_dropout = 0.0 + hparams.layer_prepostprocess_dropout = 0.0 return hparams @registry.register_hparams def transformer_dr2(): hparams = transformer_base() - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 return hparams @@ -528,7 +552,7 @@ def transformer_big_dr1(): hparams.hidden_size = 1024 hparams.filter_size = 4096 hparams.num_heads = 16 - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 hparams.batching_mantissa_bits = 2 return hparams @@ -538,14 +562,14 @@ def transformer_big_enfr(): hparams = transformer_big_dr1() hparams.shared_embedding_and_softmax_weights = int(False) hparams.filter_size = 8192 - hparams.residual_dropout = 0.1 + hparams.layer_prepostprocess_dropout = 0.1 return hparams @registry.register_hparams def transformer_big_dr2(): hparams = transformer_big_dr1() - hparams.residual_dropout = 0.2 + hparams.layer_prepostprocess_dropout = 0.2 return hparams diff --git a/tensor2tensor/models/transformer_moe.py b/tensor2tensor/models/transformer_moe.py index 8072f2cf8..6f01667d8 100644 --- a/tensor2tensor/models/transformer_moe.py +++ b/tensor2tensor/models/transformer_moe.py @@ -49,17 +49,22 @@ def model_fn_body_sharded(self, sharded_features): inputs = dp(common_layers.flatten4d3d, inputs) targets = dp(common_layers.flatten4d3d, targets) + def preprocess(x): + return dp(common_layers.layer_preprocess, x, hparams) + + def postprocess(x, y): + return 
dp(common_layers.layer_postprocess, x, y, hparams) + (encoder_input, encoder_self_attention_bias, encoder_decoder_attention_bias) = dp( transformer.transformer_prepare_encoder, inputs, target_space, hparams) (decoder_input, decoder_self_attention_bias) = dp( transformer.transformer_prepare_decoder, targets, hparams) - residual_fn = transformer.get_residual_fn(hparams) encoder_input = dp(tf.nn.dropout, encoder_input, - 1.0 - hparams.residual_dropout) + 1.0 - hparams.layer_prepostprocess_dropout) decoder_input = dp(tf.nn.dropout, decoder_input, - 1.0 - hparams.residual_dropout) + 1.0 - hparams.layer_prepostprocess_dropout) extra_loss = 0 x = encoder_input for layer in xrange(hparams.num_hidden_layers): @@ -67,7 +72,7 @@ def model_fn_body_sharded(self, sharded_features): with tf.variable_scope("encoder_self_attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), None, encoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -75,11 +80,11 @@ def model_fn_body_sharded(self, sharded_features): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers_encoder.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, + dp, self._ps_devices, preprocess(x), hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) @@ -87,19 +92,19 @@ def model_fn_body_sharded(self, sharded_features): else: y = dp( common_layers.conv_hidden_relu, - x, + preprocess(x), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) - x = dp(residual_fn, x, y) - encoder_output = x + x = postprocess(x, y) + encoder_output = preprocess(x) x = decoder_input for layer in xrange(hparams.num_hidden_layers): with tf.variable_scope("decoder_layer_%d" % layer): with tf.variable_scope("decoder_self_attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), None, decoder_self_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -107,11 +112,11 @@ def model_fn_body_sharded(self, sharded_features): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("encoder_decoder_attention"): y = dp( common_attention.multihead_attention, - x, + preprocess(x), encoder_output, encoder_decoder_attention_bias, hparams.attention_key_channels or hparams.hidden_size, @@ -119,11 +124,11 @@ def model_fn_body_sharded(self, sharded_features): hparams.hidden_size, hparams.num_heads, hparams.attention_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) with tf.variable_scope("ffn"): if str(layer) in hparams.moe_layers_decoder.split(","): y, loss = common_layers.moe_layer( - dp, self._ps_devices, x, + dp, self._ps_devices, preprocess(x), hparams.mode == tf.contrib.learn.ModeKeys.TRAIN, hparams.hidden_size, hparams.moe_hidden_size, hparams.moe_n1, hparams.moe_n2, hparams.moe_loss_coef) @@ -131,11 +136,12 @@ def model_fn_body_sharded(self, sharded_features): else: y = dp( common_layers.conv_hidden_relu, - x, + preprocess(x), hparams.filter_size, hparams.hidden_size, dropout=hparams.relu_dropout) - x = dp(residual_fn, x, y) + x = postprocess(x, y) + x = preprocess(x) decoder_output = dp(tf.expand_dims, x, 2) return decoder_output, extra_loss @@ -178,7 +184,6 @@ def transformer_moe_base(): # when not in training mode. 
hparams.add_hparam("attention_dropout", 0.0) hparams.add_hparam("relu_dropout", 0.0) - hparams.add_hparam("residual_dropout", 0.1) hparams.add_hparam("pos", "timing") # timing, none hparams.add_hparam("nbr_decoder_problems", 1) hparams.add_hparam("proximity_bias", int(False)) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index f3d400045..47fcacd51 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -96,10 +96,9 @@ def encode(x, x_space, hparams, name): with tf.variable_scope(name): (encoder_input, encoder_self_attention_bias, _) = transformer.transformer_prepare_encoder(x, x_space, hparams) - residual_fn = transformer.get_residual_fn(hparams) encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) return transformer.transformer_encoder( - encoder_input, residual_fn, encoder_self_attention_bias, hparams) + encoder_input, encoder_self_attention_bias, hparams) def dropmask(targets, targets_dropout_max, is_training): From 342e3090a47b0f625e2bba9d4f40ca995a8067c0 Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Tue, 8 Aug 2017 14:18:05 -0700 Subject: [PATCH 08/17] Use softplus instead of log for RealLogPoissonLossModality PiperOrigin-RevId: 164643866 --- .../data_generators/gene_expression.py | 20 +++++++++++----- tensor2tensor/layers/modalities.py | 5 ++-- tensor2tensor/models/gene_expression.py | 23 ++++++++++++++----- tensor2tensor/models/gene_expression_test.py | 2 +- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/tensor2tensor/data_generators/gene_expression.py b/tensor2tensor/data_generators/gene_expression.py index ea3aa4cc4..341a20c71 100644 --- a/tensor2tensor/data_generators/gene_expression.py +++ b/tensor2tensor/data_generators/gene_expression.py @@ -109,10 +109,10 @@ def generate_data(self, data_dir, tmp_dir, task_id=-1): # Collect created shard processes to start and join processes = [] - datasets = [ - (self.training_filepaths, self.num_shards, "train", num_train_examples), - (self.dev_filepaths, 10, "valid", num_dev_examples), - (self.test_filepaths, 10, "test", num_test_examples)] + datasets = [(self.training_filepaths, self.num_shards, "train", + num_train_examples), (self.dev_filepaths, 10, "valid", + num_dev_examples), + (self.test_filepaths, 10, "test", num_test_examples)] for fname_fn, nshards, key_prefix, num_examples in datasets: outfiles = fname_fn(data_dir, nshards, shuffled=False) all_filepaths.extend(outfiles) @@ -163,9 +163,12 @@ def preprocess_examples(self, examples, mode, hparams): del mode del hparams - # Reshape targets + # Reshape targets to contain num_output_predictions per output timestep examples["targets"] = tf.reshape(examples["targets"], [-1, 1, self.num_output_predictions]) + # Slice off EOS - not needed, and messes up the GeneExpressionConv model + # which expects the input length to be a multiple of the target length. + examples["inputs"] = examples["inputs"][:-1] return examples @@ -251,7 +254,12 @@ def dataset_generator(filepath, if i % 100 == 0: print("Generating example %d for %s" % (i, dataset)) inputs, mask, outputs = inp_data[i], mask_data[i], out_data[i] - yield to_example_dict(encoder, inputs, mask, outputs) + ex_dict = to_example_dict(encoder, inputs, mask, outputs) + # Original data has one output for every 128 input bases. Ensure that the + # ratio has been maintained given the chunk size and removing EOS. 
+ assert (len(ex_dict["inputs"]) - 1) == (( + 128 // chunk_size) * ex_dict["targets_shape"][0]) + yield ex_dict def to_example_dict(encoder, inputs, mask, outputs): diff --git a/tensor2tensor/layers/modalities.py b/tensor2tensor/layers/modalities.py index 5071a49ad..e44729041 100644 --- a/tensor2tensor/layers/modalities.py +++ b/tensor2tensor/layers/modalities.py @@ -489,12 +489,13 @@ def loss(self, top_out, targets, weights_fn=common_layers.weights_all): class RealLogPoissonLossModality(RealL2LossModality): """Modality for real (i.e. float) vectors with log Poisson regression loss. - * Top is a linear projection to vocab size followed by a log transform. + * Top is a linear projection to vocab size followed by a softplus + transform (log(exp(features) + 1)). """ def top(self, body_output, _): with tf.variable_scope("real"): - return tf.log(tf.layers.dense(body_output, self._vocab_size)) + return tf.nn.softplus(tf.layers.dense(body_output, self._vocab_size)) def loss(self, top_out, targets, weights_fn=common_layers.weights_all): predictions = top_out diff --git a/tensor2tensor/models/gene_expression.py b/tensor2tensor/models/gene_expression.py index ad15926ac..27aa631c6 100644 --- a/tensor2tensor/models/gene_expression.py +++ b/tensor2tensor/models/gene_expression.py @@ -38,6 +38,14 @@ class GeneExpressionConv(t2t_model.T2TModel): http://www.biorxiv.org/content/early/2017/07/10/161851 Uses layer_norm instead of batch_norm. + + Model expects that if targets are of length m, inputs are of length 32*m. The + original data expected that inputs would be of length 128*m, but the data has + been preprocessed to chunk every 4 bases into 1 ID (see + data_generators/gene_expression.py). + + The magnitude of the length reduction is controlled by the pooling sizes + (hparams.pooling_windows) at each conv layer (hparams.num_conv_layers). """ def model_fn_body(self, features): @@ -50,6 +58,7 @@ def model_fn_body(self, features): out = common_layers.flatten4d3d(out) # Conv layers + assert hp.num_conv_layers == len(hp.pooling_windows) for i in xrange(hp.num_conv_layers): out = conv_layer( out, @@ -58,7 +67,7 @@ def model_fn_body(self, features): hp.stride, hp.pooling_windows[i], hp.dropout, - 1, + dilation_rate=1, name="conv_%d" % (i + 1)) # Dense dilated conv layers @@ -68,10 +77,10 @@ def model_fn_body(self, features): out, hp.hidden_size, hp.kernel_width, - 1, - 0, - hp.dropout, - dilation_rate, + stride=1, + pooling_window=0, + dropout_rate=hp.dropout, + dilation_rate=dilation_rate, name="dconv_%d" % (i + 1)) out = tf.concat([out, dconv_out], axis=2) @@ -126,7 +135,9 @@ def gene_expression_conv_base(): hparams.dropout = 0.1 hparams.add_hparam("num_conv_layers", 4) hparams.add_hparam("num_dconv_layers", 7) - hparams.add_hparam("pooling_windows", [2, 4, 4, 4]) + # The product of these pooling windows should match + # input_length/target_length. 
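+  # For example, [2, 2, 2, 4] gives 2 * 2 * 2 * 4 == 32, matching inputs of
+  # length 32*m for targets of length m (see the GeneExpressionConv docstring).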
+ hparams.add_hparam("pooling_windows", [2, 2, 2, 4]) hparams.hidden_size = 256 hparams.kernel_width = 20 diff --git a/tensor2tensor/models/gene_expression_test.py b/tensor2tensor/models/gene_expression_test.py index 3b1dc6873..e2307f49f 100644 --- a/tensor2tensor/models/gene_expression_test.py +++ b/tensor2tensor/models/gene_expression_test.py @@ -42,7 +42,7 @@ def _testModel(self, hparams, model_cls): batch_size = 3 target_length = 6 target_out = 10 # GeneExpressionProblem.num_output_predictions - input_length = target_length * 128 + input_length = target_length * 128 // 4 # chunk_size=4 input_vocab_size = 5 inputs = np.random.random_integers( From daee057e764f008230cc9b88a61165d62a17891a Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 8 Aug 2017 16:37:04 -0700 Subject: [PATCH 09/17] Fix a bug in t2t_model for the case where the model returns a float PiperOrigin-RevId: 164662524 --- tensor2tensor/utils/t2t_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index a63f5cc7f..a33b0e0cd 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -425,7 +425,7 @@ def model_fn(self, features, skip=False, last_position_only=False): else: body_outputs, losses = self.model_fn_body_sharded( transformed_features) - if isinstance(losses, tf.Tensor): # If it's a single extra loss. + if not isinstance(losses, dict): # If it's a single extra loss. losses = {"extra": losses} with tf.variable_scope(target_modality.name, reuse=target_reuse): From 4bd21c2a7b4a7e7a4d260b5b630e769017e2ee4f Mon Sep 17 00:00:00 2001 From: Noam Shazeer Date: Tue, 8 Aug 2017 23:36:26 -0700 Subject: [PATCH 10/17] introduce a model hparam - prepend_inputs_to_targets, which transforms PiperOrigin-RevId: 164690710 --- .../data_generators/concatenate_examples.py | 180 ------------------ tensor2tensor/data_generators/problem.py | 18 +- tensor2tensor/layers/common_hparams.py | 8 +- tensor2tensor/layers/common_layers.py | 16 ++ tensor2tensor/models/attention_lm.py | 17 +- tensor2tensor/utils/data_reader.py | 2 + tensor2tensor/utils/metrics.py | 12 +- tensor2tensor/utils/trainer_utils.py | 2 +- 8 files changed, 63 insertions(+), 192 deletions(-) delete mode 100644 tensor2tensor/data_generators/concatenate_examples.py diff --git a/tensor2tensor/data_generators/concatenate_examples.py b/tensor2tensor/data_generators/concatenate_examples.py deleted file mode 100644 index 9d7678fc4..000000000 --- a/tensor2tensor/data_generators/concatenate_examples.py +++ /dev/null @@ -1,180 +0,0 @@ -# coding=utf-8 -# Copyright 2017 The Tensor2Tensor Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -r"""Convert seq-seq examples to "concatenated" examples. - -The concatenated example has no "inputs". -Instead the source is at the beginning of the target. - -We can now use a simple language model. 
- -Example: -seq-seq mode: -{ - "inputs": subtokenizer.encode("I love you.") + [1] - "targets": subtokenizer.encode("Je t'aime.") + [1] -} --> -concatenated mode: -{ - "inputs": [0] - "targets": (subtokenizer.encode("source English I love you.") + [1] - + subtokenizer.encode("target French Je t'aime.") + [1]) -} - -We add a dummy feature "inputs"=[0] for compatibility with seq-to-seq models. - -If FLAGS.combine_to_length is nonzero, then we combine multiple examples into -examples of a constant length, possibly with some padding at the end. - -""" - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -import random - -# Dependency imports - -from tensor2tensor.data_generators import generator_utils -from tensor2tensor.data_generators import text_encoder -import tensorflow as tf - -tf.flags.DEFINE_string("vocab_file", "", "SubwordTextEncoder vocabulary file") - -tf.flags.DEFINE_boolean( - "random_reverse", False, - "If true, write half of the example with source/target reversed") - -tf.flags.DEFINE_boolean( - "count_everything", False, - "If true, assign positive weights to designators, source and target. " - "If false, assign positive weights only to target.") - -tf.flags.DEFINE_string("source_domain_string", "English", "") -tf.flags.DEFINE_string("target_domain_string", "French", "") - -tf.flags.DEFINE_integer( - "combine_to_length", 0, - "If positive, concatenate examples to form examples with target length " - " equal to this value. Targets are padded with subtoken id=0.") - -tf.flags.DEFINE_string("in_file", "", "input filename") - -tf.flags.DEFINE_string( - "out_prefix", "/usr/local/google/tmp/concat", - "The output filename is equal to out_prefix plus " - "the last 15 characters of in_file. (e.g. 
-00001-of-00100)") - -FLAGS = tf.flags.FLAGS - - -def _make_example(ids, weights, raw_num_bytes): - if FLAGS.combine_to_length > 0: - ids += [0] * (FLAGS.combine_to_length - len(ids)) - return generator_utils.to_example({ - "targets": ids, - "target_weights": weights, - "inputs": [0], - "raw_num_bytes": [raw_num_bytes] - }).SerializeToString() - - -def main(_): - """Convert a file to examples.""" - subtokenizer = text_encoder.SubwordTextEncoder(FLAGS.vocab_file) - total_bytes = 0 - total_subtokens = 0 - total_examples = 0 - dropped_examples = 0 - - combined_subtokens = [] - combined_num_bytes = 0 - combined_weights = [] - - source_specifier = subtokenizer.encode("source " + FLAGS.source_domain_string) - target_specifier = subtokenizer.encode("target " + FLAGS.target_domain_string) - if FLAGS.random_reverse: - r_source_specifier = subtokenizer.encode("source " + - FLAGS.target_domain_string) - r_target_specifier = subtokenizer.encode("target " + - FLAGS.source_domain_string) - - reader = tf.python_io.tf_record_iterator(FLAGS.in_file) - - out_file = FLAGS.out_prefix + FLAGS.in_file[-15:] - writer = tf.python_io.TFRecordWriter(out_file) - - for record in reader: - total_examples += 1 - if total_examples % 1000 == 0: - tf.logging.info("total_examples: %d", total_examples) - x = tf.train.Example() - x.ParseFromString(record) - inputs = [i for i in x.features.feature["inputs"].int64_list.value] - targets = [i for i in x.features.feature["targets"].int64_list.value] - should_reverse = FLAGS.random_reverse and random.random() < 0.5 - source_bytes = len(subtokenizer.decode(inputs[:-1])) + 1 - target_bytes = len(subtokenizer.decode(targets[:-1])) + 1 - if not should_reverse: - subtokens = source_specifier + inputs + target_specifier + targets - weights = ([0.0] * - (len(source_specifier) + len(inputs) + len(target_specifier)) + - [1.0] * len(targets)) - num_bytes = target_bytes - else: - subtokens = r_source_specifier + targets + r_target_specifier + inputs - weights = ( - [0.0] * - (len(r_source_specifier) + len(targets) + len(r_target_specifier)) + - [1.0] * len(inputs)) - num_bytes = source_bytes - if FLAGS.count_everything: - weights = [1.0] * len(subtokens) - num_bytes = source_bytes + target_bytes - total_bytes += num_bytes - total_subtokens += sum(weights) - if FLAGS.combine_to_length: - if combined_subtokens and (len(combined_subtokens) + len(subtokens) > - FLAGS.combine_to_length): - writer.write( - _make_example(combined_subtokens, combined_weights, - combined_num_bytes)) - combined_subtokens = [] - combined_weights = [] - combined_num_bytes = 0 - if len(subtokens) <= FLAGS.combine_to_length: - combined_subtokens.extend(subtokens) - combined_weights.extend(weights) - combined_num_bytes += num_bytes - else: - dropped_examples += 1 - else: - writer.write(_make_example(subtokens, weights, num_bytes)) - if combined_subtokens: - writer.write( - _make_example(combined_subtokens, combined_weights, combined_num_bytes)) - writer.close() - - tf.logging.info("total bytes: %d", total_bytes) - tf.logging.info("total subtokens: %d", total_subtokens) - tf.logging.info("bytes per subtoken: %f", total_bytes / total_subtokens) - tf.logging.info("total documents: %d", total_examples) - tf.logging.info("dropped documents: %d", dropped_examples) - - -if __name__ == "__main__": - tf.app.run() diff --git a/tensor2tensor/data_generators/problem.py b/tensor2tensor/data_generators/problem.py index 66cd7df4f..07fafb492 100644 --- a/tensor2tensor/data_generators/problem.py +++ 
b/tensor2tensor/data_generators/problem.py @@ -92,6 +92,18 @@ class SpaceID(object): CPP_TOK = 28 +def preprocess_examples_common(examples, hparams): + """Preprocessing steps common to all models.""" + if hparams.max_input_seq_length > 0: + examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] + if hparams.max_target_seq_length > 0: + examples["targets"] = examples["targets"][:hparams.max_target_seq_length] + if hparams.prepend_inputs_to_targets: + examples["targets"] = tf.concat( + [examples["inputs"], [0], examples["targets"]], 0) + return examples + + class Problem(object): """Problem base class. Specifies a T2T problem. @@ -172,11 +184,7 @@ def example_reading_spec(self): def preprocess_examples(self, examples, mode, hparams): del mode - if hparams.max_input_seq_length > 0: - examples["inputs"] = examples["inputs"][:hparams.max_input_seq_length] - if hparams.max_target_seq_length > 0: - examples["targets"] = examples["targets"][:hparams.max_target_seq_length] - return examples + return preprocess_examples_common(examples, hparams) def eval_metrics(self): return [ diff --git a/tensor2tensor/layers/common_hparams.py b/tensor2tensor/layers/common_hparams.py index 498b5eb37..10b5e7e59 100644 --- a/tensor2tensor/layers/common_hparams.py +++ b/tensor2tensor/layers/common_hparams.py @@ -118,7 +118,13 @@ def basic_params1(): # mean there is no maximum or truncation. # You can change this behavior by overridding preprocess_examples() method # in your problem class. - max_target_seq_length=0) + max_target_seq_length=0, + # Treat a seq-to-seq problem as a language model by prepending the + # inputs to the targets. During training, the loss is on both the + # inputs and the targets. During eval, metrics are computed only on the + # target portion. + prepend_inputs_to_targets=int(False), + ) class RangedHParams(object): diff --git a/tensor2tensor/layers/common_layers.py b/tensor2tensor/layers/common_layers.py index 31bc0bced..a85430c1c 100644 --- a/tensor2tensor/layers/common_layers.py +++ b/tensor2tensor/layers/common_layers.py @@ -1426,6 +1426,22 @@ def weights_nonzero(labels): return tf.to_float(tf.not_equal(labels, 0)) +def weights_second_part(labels): + """Weights function for 'prepend_inputs_to_targets'. + + Weight 1.0 is assigned to all nonzero labels past the first zero. + + Args: + labels: A Tensor of int32s. + + Returns: + A Tensor of floats. 
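+
+  For example (an illustrative case): labels [3, 5, 0, 7, 8, 0] give weights
+  [0., 0., 0., 1., 1., 0.] - only nonzero labels after the first zero count.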
+ """ + past_first_zero = tf.cumsum(tf.to_float(tf.equal(labels, 0))) + nonzero = tf.to_float(labels) + return tf.to_float(tf.not_equal(past_first_zero * nonzero, 0)) + + def weights_all(labels): """Assign weight 1.0 to all labels.""" return tf.ones_like(labels, dtype=tf.float32) diff --git a/tensor2tensor/models/attention_lm.py b/tensor2tensor/models/attention_lm.py index 495f25634..19f1915e8 100644 --- a/tensor2tensor/models/attention_lm.py +++ b/tensor2tensor/models/attention_lm.py @@ -118,7 +118,7 @@ def attention_lm_decoder(decoder_input, hparams.hidden_size, dropout=hparams.relu_dropout) x = common_layers.layer_postprocess(x, y, hparams) - return common_layers.layer_preprocess(x, hparams) + return common_layers.layer_preprocess(x, hparams) @registry.register_hparams @@ -140,7 +140,6 @@ def attention_lm_base(): hparams.weight_decay = 0.0 hparams.optimizer_adam_beta1 = 0.9 hparams.optimizer_adam_beta2 = 0.98 - hparams.num_sampled_classes = 0 hparams.label_smoothing = 0.0 hparams.shared_embedding_and_softmax_weights = int(False) @@ -174,3 +173,17 @@ def attention_lm_small(): hparams.filter_size = 2048 hparams.layer_prepostprocess_dropout = 0.5 return hparams + + +@registry.register_hparams +def attention_lm_translation(): + """Version to use for seq2seq.""" + hparams = attention_lm_base() + hparams.layer_preprocess_sequence = "n" + hparams.layer_postprocess_sequence = "da" + hparams.learning_rate = 0.1 + hparams.prepend_inputs_to_targets = int(True) + hparams.max_length = 512 + hparams.label_smoothing = 0.1 + hparams.shared_embedding_and_softmax_weights = int(True) + return hparams diff --git a/tensor2tensor/utils/data_reader.py b/tensor2tensor/utils/data_reader.py index 05aa9bf26..5c7041014 100644 --- a/tensor2tensor/utils/data_reader.py +++ b/tensor2tensor/utils/data_reader.py @@ -27,6 +27,7 @@ from six.moves import zip # pylint: disable=redefined-builtin from tensor2tensor.data_generators import problem_hparams +from tensor2tensor.data_generators.problem import preprocess_examples_common from tensor2tensor.utils import registry import tensorflow as tf @@ -233,6 +234,7 @@ def input_pipeline(problem, data_file_pattern, capacity, mode, hparams): data_items_to_decoders=data_items_to_decoders) if problem is None: + examples = preprocess_examples_common(examples, hparams) examples = preprocessing(examples, data_file_pattern) else: examples = problem.preprocess_examples(examples, mode, hparams) diff --git a/tensor2tensor/utils/metrics.py b/tensor2tensor/utils/metrics.py index b4d7360ca..fd82adc30 100644 --- a/tensor2tensor/utils/metrics.py +++ b/tensor2tensor/utils/metrics.py @@ -144,11 +144,12 @@ def padded_accuracy(predictions, return tf.to_float(tf.equal(outputs, padded_labels)), weights -def create_evaluation_metrics(problems): +def create_evaluation_metrics(problems, model_hparams): """Creates the evaluation metrics for the model. Args: problems: List of tuples (problem name, problem instance). + model_hparams: a set of hparams. 
Returns: A dictionary with keys that are strings naming the evaluation @@ -195,8 +196,13 @@ def problem_metric_fn(predictions, labels, weights): class_output = "image" in problem_name and "coco" not in problem_name real_output = "gene_expression" in problem_name - weights_fn = (common_layers.weights_all if class_output or real_output else - common_layers.weights_nonzero) + if model_hparams.prepend_inputs_to_targets: + assert not class_output + weights_fn = common_layers.weights_second_part + elif class_output or real_output: + weights_fn = common_layers.weights_all + else: + weights_fn = common_layers.weights_nonzero for metric in metrics: metric_fn = METRICS_FNS[metric] diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index ebf58ee97..703bc5b2f 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -144,7 +144,7 @@ def create_experiment(output_dir, data_dir, model_name, train_steps, data_dir=data_dir, model_name=model_name) eval_metrics = metrics.create_evaluation_metrics( - zip(FLAGS.problems.split("-"), hparams.problem_instances)) + zip(FLAGS.problems.split("-"), hparams.problem_instances), hparams) if (hasattr(FLAGS, "autotune") and FLAGS.autotune and FLAGS.objective not in eval_metrics): raise ValueError("Tuning objective %s not among evaluation metrics %s" % From 331c6e783f4fda28e0092c8f8f9afc8d906a387c Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Wed, 9 Aug 2017 00:20:51 -0700 Subject: [PATCH 11/17] Allow multiple losses to play with GANs. PiperOrigin-RevId: 164693578 --- tensor2tensor/bin/t2t-trainer | 2 +- tensor2tensor/models/cycle_gan.py | 204 ++++++++++++++++++++++++ tensor2tensor/models/models.py | 1 + tensor2tensor/models/transformer_vae.py | 45 +++--- tensor2tensor/utils/input_fn_builder.py | 4 - tensor2tensor/utils/model_builder.py | 26 ++- tensor2tensor/utils/t2t_model.py | 10 +- 7 files changed, 258 insertions(+), 34 deletions(-) create mode 100644 tensor2tensor/models/cycle_gan.py diff --git a/tensor2tensor/bin/t2t-trainer b/tensor2tensor/bin/t2t-trainer index 6e0be3f23..7c7b48932 100644 --- a/tensor2tensor/bin/t2t-trainer +++ b/tensor2tensor/bin/t2t-trainer @@ -62,7 +62,7 @@ def main(_): output_dir = os.path.expanduser(FLAGS.output_dir) tmp_dir = os.path.expanduser(FLAGS.tmp_dir) data_dir = os.path.expanduser(FLAGS.data_dir) - tf.gfile.MakeDir(output_dir) + tf.gfile.MakeDirs(output_dir) # Generate data if requested. if FLAGS.generate_data: diff --git a/tensor2tensor/models/cycle_gan.py b/tensor2tensor/models/cycle_gan.py new file mode 100644 index 000000000..5fcf96266 --- /dev/null +++ b/tensor2tensor/models/cycle_gan.py @@ -0,0 +1,204 @@ +# coding=utf-8 +# Copyright 2017 The Tensor2Tensor Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""Cycle GAN.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +# Dependency imports + +from tensor2tensor.layers import common_layers +from tensor2tensor.models import transformer_vae +from tensor2tensor.utils import registry +from tensor2tensor.utils import t2t_model + +import tensorflow as tf + + +def reconstruct_loss(x, gt, hparams, reuse=None): + pred = tf.layers.dense(x, hparams.vocab_size, name="softmax", reuse=reuse) + xent, w = common_layers.padded_cross_entropy(pred, gt, 0.0) + return xent / w + + +def discriminator(x, compress, hparams, name, reuse=None): + with tf.variable_scope(name, reuse=reuse): + x = tf.stop_gradient(2 * x) - x # Reverse gradient. + if compress: + x = transformer_vae.compress(x, hparams, "compress") + else: + x = transformer_vae.residual_conv(x, 1, hparams, "compress_rc") + y = tf.reduce_mean(x, axis=1) + return tf.tanh(tf.layers.dense(y, 1, name="reduce")) + + +def discriminate_loss(x, y, compress, hparams, name): + with tf.variable_scope(name): + d1 = discriminator(x, compress, hparams, "discriminator") + d2 = discriminator(y, compress, hparams, "discriminator", reuse=True) + dloss = tf.reduce_mean(tf.abs(d1 - d2)) + return - dloss + + +def split_on_batch(x): + batch_size = tf.shape(x)[0] + i = batch_size // 2 + return x[:i, :, :, :], x[i:2*i, :, :, :] + + +def cycle_gan_internal(inputs, targets, _, hparams): + """Cycle GAN, main step used for training.""" + with tf.variable_scope("cycle_gan"): + # Embed inputs and targets. + inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets) + inputs = common_layers.embedding( + inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed") + targets = common_layers.embedding( + targets_orig, hparams.vocab_size, hparams.hidden_size, + "embed", reuse=True) + + # Split the batch into input-input and target-target parts. + inputs1, _ = split_on_batch(inputs) + _, targets2 = split_on_batch(targets) + + # Define F and G, called inp2tgt and tgt2inp here. + def inp2tgt(x, reuse=False): + return transformer_vae.residual_conv(x, 1, hparams, "inp2tgt", reuse) + def tgt2inp(x, reuse=False): + return transformer_vae.residual_conv(x, 1, hparams, "tgt2inp", reuse) + + # Input-input part. + inp1_tgt = inp2tgt(inputs1) + inp1_back = tgt2inp(inp1_tgt) + + # Target-target part. + tgt2_inp = tgt2inp(targets2, reuse=True) + tgt2_back = inp2tgt(tgt2_inp, reuse=True) + + # Reconstruction losses. + inp1_orig, _ = split_on_batch(inputs_orig) + _, tgt2_orig = split_on_batch(targets_orig) + inp1_loss = reconstruct_loss( + inp1_back, tf.squeeze(inp1_orig, axis=3), hparams) + tgt2_loss = reconstruct_loss( + tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True) + + # Discriminator losses. + dloss1 = discriminate_loss(inputs1, tgt2_inp, True, hparams, "inp_disc") + dloss2 = discriminate_loss(targets2, inp1_tgt, True, hparams, "tgt_disc") + + # Reconstruct targets from inputs. + tgt = inp2tgt(inputs, reuse=True) + tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True) + + # We use the reconstruction only for tracking progress, no gradients here! 
+ tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2)) + + losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss, + "target_target": hparams.cycle_loss_multiplier * tgt2_loss, + "input_disc": dloss1, + "target_disc": dloss2} + return tgt, losses + + +@registry.register_model +class CycleGAN(t2t_model.T2TModel): + + def model_fn_body(self, features): + return cycle_gan_internal( + features["inputs"], features["targets"], features["target_space_id"], + self._hparams) + + +def cycle_vae_gan_internal(inputs, targets, _, hparams): + """Cycle GAN, main step used for training.""" + with tf.variable_scope("cycle_vae_gan"): + # Embed inputs and targets. + inputs_orig, targets_orig = tf.to_int32(inputs), tf.to_int32(targets) + k = 2**hparams.num_compress_steps + inputs_orig, targets_orig = common_layers.pad_to_same_length( + inputs_orig, targets_orig, final_length_divisible_by=k) + inputs = common_layers.embedding( + inputs_orig, hparams.vocab_size, hparams.hidden_size, "embed") + targets = common_layers.embedding( + targets_orig, hparams.vocab_size, hparams.hidden_size, + "embed", reuse=True) + + # Split the batch into input-input and target-target parts. + inputs1, _ = split_on_batch(inputs) + _, targets2 = split_on_batch(targets) + + # Input-input part. + inp1_back, kl_loss1, inp1_mu, inp1_log_sigma = transformer_vae.vae_compress( + inputs1, hparams, "inp2hyp", "hyp2inp") + inp1_hyp = tf.concat([inp1_mu, inp1_log_sigma], axis=3) + + # Target-target part. + tgt2_back, kl_loss2, tgt2_mu, tgt2_log_sigma = transformer_vae.vae_compress( + targets2, hparams, "tgt2hyp", "hyp2tgt") + tgt2_hyp = tf.concat([tgt2_mu, tgt2_log_sigma], axis=3) + + # Reconstruction losses. + inp1_orig, _ = split_on_batch(inputs_orig) + _, tgt2_orig = split_on_batch(targets_orig) + inp1_loss = reconstruct_loss( + inp1_back, tf.squeeze(inp1_orig, axis=3), hparams) + tgt2_loss = reconstruct_loss( + tgt2_back, tf.squeeze(tgt2_orig, axis=3), hparams, reuse=True) + + # Discriminator loss. + dloss = discriminate_loss(inp1_hyp, tgt2_hyp, False, hparams, "dloss") + + # Reconstruct targets from inputs. + tgt, _, _, _ = transformer_vae.vae_compress( + inputs, hparams, "inp2hyp", "hyp2tgt", reuse=True) + tgt = tf.layers.dense(tgt, hparams.vocab_size, name="softmax", reuse=True) + # We use the reconstruction only for tracking progress, no gradients here! + tgt = tf.stop_gradient(tf.expand_dims(tgt, axis=2)) + + kl_rev_decay = common_layers.inverse_exp_decay(hparams.kl_warmup_steps) + losses = {"input_input": hparams.cycle_loss_multiplier * inp1_loss, + "target_target": hparams.cycle_loss_multiplier * tgt2_loss, + "input_kl": kl_loss1 * kl_rev_decay, + "target_kl": kl_loss2 * kl_rev_decay, + "discriminator": dloss} + return tgt, losses + + +@registry.register_model +class CycleVaeGAN(t2t_model.T2TModel): + + def model_fn_body(self, features): + return cycle_vae_gan_internal( + features["inputs"], features["targets"], features["target_space_id"], + self._hparams) + + +@registry.register_hparams +def cycle_gan_small(): + """Set of hyperparameters.""" + hparams = transformer_vae.transformer_vae_small() + hparams.batch_size = 2048 + hparams.input_modalities = "inputs:symbol:identity" + hparams.target_modality = "symbol:identity" + hparams.weight_decay = 3.0 + hparams.learning_rate = 0.005 + hparams.kl_warmup_steps = 5000 + hparams.learning_rate_warmup_steps = 3000 + hparams.add_hparam("vocab_size", 32) # Vocabulary size, need to set here. 
+ hparams.add_hparam("cycle_loss_multiplier", 2.0) + return hparams diff --git a/tensor2tensor/models/models.py b/tensor2tensor/models/models.py index 4b1355dba..cba779fc9 100644 --- a/tensor2tensor/models/models.py +++ b/tensor2tensor/models/models.py @@ -28,6 +28,7 @@ from tensor2tensor.models import attention_lm_moe from tensor2tensor.models import bluenet from tensor2tensor.models import bytenet +from tensor2tensor.models import cycle_gan from tensor2tensor.models import gene_expression from tensor2tensor.models import long_answer from tensor2tensor.models import lstm diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 47fcacd51..404d17783 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -31,9 +31,9 @@ import tensorflow as tf -def residual_conv(x, repeat, hparams, name): +def residual_conv(x, repeat, hparams, name, reuse=None): """A stack of convolution blocks with residual connections.""" - with tf.variable_scope(name): + with tf.variable_scope(name, reuse=reuse): k = (3, 1) dilations_and_kernels = [((1, 1), k) for _ in xrange(3)] for i in xrange(repeat): @@ -49,7 +49,7 @@ def residual_conv(x, repeat, hparams, name): return x -def decompress(source, hparams, first_relu, name): +def decompress_step(source, hparams, first_relu, name): """Decompression function.""" with tf.variable_scope(name): shape = tf.shape(source) @@ -66,29 +66,42 @@ def vae(x, hparams, name): shape = tf.shape(x) epsilon = tf.random_normal([shape[0], shape[1], 1, hparams.z_size]) z = mu + tf.exp(log_sigma / 2) * epsilon - dense = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") kl = 0.5 * tf.reduce_mean( tf.exp(log_sigma) + tf.square(mu) - 1. - log_sigma, axis=-1) - return dense, tf.reduce_mean(kl) + return z, tf.reduce_mean(kl), mu, log_sigma -def compress_vae(inputs, hparams, name): - """Compress, then VAE.""" +def compress(inputs, hparams, name): + """Compress.""" with tf.variable_scope(name): # Run compression by strided convs. - cur = tf.expand_dims(inputs, axis=2) + cur = inputs for i in xrange(hparams.num_compress_steps): cur = residual_conv(cur, 1, hparams, "compress_rc_%d" % i) cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (2, 1))], strides=(2, 1), name="compress_%d" % i) + return cur + +def vae_compress(inputs, hparams, compress_name, decompress_name, reuse=None): + """Compress, then VAE.""" + with tf.variable_scope(compress_name, reuse=reuse): + cur = compress(inputs, hparams, "compress") # Convolve and ReLu to get state. cur = common_layers.conv_block( cur, hparams.hidden_size, [((1, 1), (1, 1))], name="mid_conv") + z, kl_loss, mu, log_sigma = vae(cur, hparams, name="vae") + + with tf.variable_scope(decompress_name, reuse=reuse): + # Decompress. + z = tf.layers.dense(z, hparams.hidden_size, name="z_to_dense") - cur, kl_loss = vae(cur, hparams, name="vae") - return cur, kl_loss + for i in xrange(hparams.num_compress_steps): + j = hparams.num_compress_steps - i - 1 + z = residual_conv(z, 1, hparams, "decompress_rc_%d" % j) + z = decompress_step(z, hparams, i > 0, "decompress__step_%d" % j) + return z, kl_loss, mu, log_sigma def encode(x, x_space, hparams, name): @@ -127,7 +140,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): inputs = encode(inputs, target_space, hparams, "input_enc") # Dropout targets or swap for zeros 5% of the time. 
- max_prestep = 90000 + max_prestep = hparams.kl_warmup_steps prob_targets = 0.95 if is_training else 1.0 targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01 targets = dropmask(targets, targets_dropout_max, is_training) @@ -143,13 +156,8 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): # target_space, hparams, "enc") # Compress and vae. - z, kl_loss = compress_vae(targets, hparams, "vae") - - # Decompress. - for i in xrange(hparams.num_compress_steps): - j = hparams.num_hidden_layers - i - 1 - z = residual_conv(z, 1, hparams, "dec_rc_%d" % j) - z = decompress(z, hparams, i > 0, "decompress_%d" % j) + z, kl_loss, _, _ = vae_compress(tf.expand_dims(targets, axis=2), hparams, + "vae_compress", "vae_decompress") # Join z with inputs, run decoder. to_decode = common_layers.conv_block( @@ -215,6 +223,7 @@ def transformer_vae_small(): hparams.batch_size = 2048 hparams.add_hparam("z_size", 128) hparams.add_hparam("num_compress_steps", 4) + hparams.add_hparam("kl_warmup_steps", 50000) return hparams diff --git a/tensor2tensor/utils/input_fn_builder.py b/tensor2tensor/utils/input_fn_builder.py index 79a765ca2..d1b68aa02 100644 --- a/tensor2tensor/utils/input_fn_builder.py +++ b/tensor2tensor/utils/input_fn_builder.py @@ -137,10 +137,6 @@ def input_fn(): tf.get_variable( "problem_%d/total_loss" % n, initializer=100.0, trainable=False)) - tf.get_variable( - "problem_%d/training_loss" % n, initializer=100.0, trainable=False) - tf.get_variable( - "problem_%d/extra_loss" % n, initializer=100.0, trainable=False) if fixed_problem is None: if (hparams.problem_choice == "uniform" or mode != tf.contrib.learn.ModeKeys.TRAIN): diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 01768e263..34d062d45 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -166,6 +166,7 @@ def model_fn(features, targets, mode): train = mode == tf.contrib.learn.ModeKeys.TRAIN # Get multi-problem logits and loss based on features["problem_choice"]. + loss_variable_names = [] def nth_model(n): """Build the model for the n-th problem, plus some added variables.""" model_class = registry.model(model)( @@ -193,15 +194,19 @@ def nth_model(n): skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1) sharded_logits, losses_dict = model_class.model_fn( features, skip=(skipping_is_on and skip_this_one)) - with tf.variable_scope("losses_avg", reuse=True): + with tf.variable_scope("losses_avg"): total_loss, ops = 0.0, [] for loss_key, loss_value in six.iteritems(losses_dict): + loss_name = "problem_%d/%s_loss" % (n, loss_key) loss_moving_avg = tf.get_variable( - "problem_%d/%s_loss" % (n, loss_key)) + loss_name, initializer=100.0, trainable=False) + loss_variable_names.append(loss_name) ops.append( loss_moving_avg.assign(loss_moving_avg * 0.9 + loss_value * 0.1)) total_loss += loss_value - loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) + with tf.variable_scope(tf.get_variable_scope(), reuse=True): + # Total loss was already constructed on input. + loss_moving_avg = tf.get_variable("problem_%d/total_loss" % n) ops.append( loss_moving_avg.assign(loss_moving_avg * 0.9 + total_loss * 0.1)) with tf.variable_scope("train_stats"): # Count steps for this problem. 
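
For orientation: the losses_avg variables above keep a simple exponential moving
average, new_avg = 0.9 * old_avg + 0.1 * current_loss, with the average
initialized at 100.0 so that early values read as pessimistic rather than
spuriously good. A minimal self-contained sketch of the same bookkeeping
(plain Python, illustrative only; the patch itself uses TF variables):

    class LossMovingAverage(object):
      """Exponential moving average of a scalar loss, as in losses_avg."""

      def __init__(self, init=100.0, decay=0.9):
        self.value = init  # Mirrors initializer=100.0 above.
        self.decay = decay

      def update(self, loss):
        # new_avg = decay * old_avg + (1 - decay) * loss.
        self.value = self.decay * self.value + (1.0 - self.decay) * loss
        return self.value
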
@@ -256,13 +261,18 @@ def nth_model(n): tf.summary.scalar("learning_rate", learning_rate) global_step = tf.to_float(tf.contrib.framework.get_global_step()) for n in xrange(len(my_hp.problems)): + names_and_vars = [] with tf.variable_scope("losses_avg", reuse=True): total_loss_var = tf.get_variable("problem_%d/total_loss" % n) - training_loss_var = tf.get_variable("problem_%d/training_loss" % n) - extra_loss_var = tf.get_variable("problem_%d/extra_loss" % n) - tf.summary.scalar("loss_avg_%d/total_loss" % n, total_loss_var) - tf.summary.scalar("loss_avg_%d/training_loss" % n, training_loss_var) - tf.summary.scalar("loss_avg_%d/extra_loss" % n, extra_loss_var) + names_and_vars.append(("total_loss", total_loss_var)) + with tf.variable_scope("losses_avg", reuse=True): + for loss_name in loss_variable_names: + if loss_name.startswith("problem_%d/" % n): + loss_var = tf.get_variable(loss_name) + loss_suffix = loss_name[loss_name.index("/") + 1:] + names_and_vars.append((loss_suffix, loss_var)) + for (loss_name, loss_var) in names_and_vars: + tf.summary.scalar("loss_avg_%d/%s" % (n, loss_name), loss_var) with tf.variable_scope("train_stats", reuse=True): nth_steps = tf.get_variable("problem_%d_steps" % n, dtype=tf.int32) tf.summary.scalar("problem_%d_frequency" % n, diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py index a33b0e0cd..76e2164b1 100644 --- a/tensor2tensor/utils/t2t_model.py +++ b/tensor2tensor/utils/t2t_model.py @@ -479,10 +479,14 @@ def model_fn_body_sharded(self, sharded_features): _with_timing(self.model_fn_body, "model_fn_body"), datashard_to_features) if isinstance(output, tuple): - if isinstance(output[1], dict): - loss = output[1] + losses_sharded = output[1] + if isinstance(losses_sharded[0], dict): + loss = {} + for k in losses_sharded[0].keys(): + k_loss_sharded = [losses[k] for losses in losses_sharded] + loss[k] = tf.reduce_mean(k_loss_sharded) else: - loss = {"extra": tf.reduce_mean(output[1])} + loss = {"extra": tf.reduce_mean(losses_sharded)} output = output[0] else: loss = {"extra": 0.0} From d12cb9d641ce4d5d56a092e779e0c442924c741b Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 9 Aug 2017 11:53:17 -0700 Subject: [PATCH 12/17] v1.1.7 PiperOrigin-RevId: 164753007 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c62b3409c..5beeb1b3e 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setup( name='tensor2tensor', - version='1.1.6', + version='1.1.7', description='Tensor2Tensor', author='Google Inc.', author_email='no-reply@google.com', From 0eeb116aa0d6a42d421c4b20dc5e4b0f12f28c7c Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Wed, 9 Aug 2017 12:11:24 -0700 Subject: [PATCH 13/17] Evaluate auto-regressively in t2t. Currently, we use the actual output during eval. To use the predicted output in the previous step, extend the infer code to run eval auto-regressively. PiperOrigin-RevId: 164755091 --- tensor2tensor/utils/model_builder.py | 8 ++- tensor2tensor/utils/t2t_model.py | 77 ++++++++++++++++++++++------ tensor2tensor/utils/trainer_utils.py | 3 ++ 3 files changed, 71 insertions(+), 17 deletions(-) diff --git a/tensor2tensor/utils/model_builder.py b/tensor2tensor/utils/model_builder.py index 34d062d45..da33e1e40 100644 --- a/tensor2tensor/utils/model_builder.py +++ b/tensor2tensor/utils/model_builder.py @@ -192,8 +192,12 @@ def nth_model(n): # On worker 0 also build graph for problems <= 1. # TODO(lukaszkaiser): why is this hack needed for variables init? Repair. 
       skip_this_one = skip_this_one and (FLAGS.worker_id != 0 or n > 1)
-    sharded_logits, losses_dict = model_class.model_fn(
-        features, skip=(skipping_is_on and skip_this_one))
+    if (FLAGS.eval_run_autoregressive and
+        mode == tf.contrib.learn.ModeKeys.EVAL):
+      sharded_logits, losses_dict = model_class.eval_autoregressive(features)
+    else:
+      sharded_logits, losses_dict = model_class.model_fn(
+          features, skip=(skipping_is_on and skip_this_one))
     with tf.variable_scope("losses_avg"):
       total_loss, ops = 0.0, []
       for loss_key, loss_value in six.iteritems(losses_dict):
diff --git a/tensor2tensor/utils/t2t_model.py b/tensor2tensor/utils/t2t_model.py
index 76e2164b1..8fcf2482d 100644
--- a/tensor2tensor/utils/t2t_model.py
+++ b/tensor2tensor/utils/t2t_model.py
@@ -144,6 +144,30 @@ def _create_modalities(self, problem_hparams, hparams):
   def has_input(self):
     return self._problem_hparams.input_modality
 
+  def eval_autoregressive(self,
+                          features=None,
+                          decode_length=50,
+                          last_position_only=False):
+    """Autoregressive eval.
+
+    Quadratic time in decode_length.
+
+    Args:
+      features: a map of string to `Tensor`
+      decode_length: an integer. How many additional timesteps to decode.
+      last_position_only: a boolean, speed-up by computing last position only.
+
+    Returns:
+      sharded_logits: a list of `Tensor`s. Assumes one datashard.
+      losses: a dictionary: {loss-name (string): floating point `Scalar`}.
+          Contains a single key "training".
+    """
+    _, logits, losses = self._greedy_infer(
+        features,
+        decode_length=decode_length,
+        last_position_only=last_position_only)
+    return [logits], losses
+
   def infer(self,
             features=None,
             decode_length=50,
@@ -179,11 +203,13 @@ def infer(self,
       beam_size = 1  # No use to run beam-search for a single class.
     if beam_size == 1:
       tf.logging.info("Greedy Decoding")
-      return self._greedy_infer(features, decode_length, last_position_only)
+      samples, _, _ = self._greedy_infer(features, decode_length,
+                                         last_position_only)
     else:
       tf.logging.info("Beam Decoding with beam size %d" % beam_size)
-      return self._beam_decode(features, decode_length, beam_size, top_beams,
-                               last_position_only, alpha)
+      samples = self._beam_decode(features, decode_length, beam_size, top_beams,
+                                  last_position_only, alpha)
+    return samples
 
   def _beam_decode(self, features, decode_length, beam_size, top_beams,
                    last_position_only, alpha):
@@ -268,6 +294,8 @@ def _greedy_infer(self, features, decode_length, last_position_only):
 
     Returns:
        samples: an integer `Tensor`.
+       logits: `Tensor` of shape [batch_size, time, 1, 1, vocab_size].
+       losses: a dictionary: {loss-name (string): floating point `Scalar`}
     """
     if not features:
       features = {}
@@ -278,14 +306,15 @@ def _greedy_infer(self, features, decode_length, last_position_only):
     if not self.has_input:
       features["partial_targets"] = tf.to_int64(features["inputs"])
 
-    def infer_step(recent_output, _):
+    def infer_step(recent_output, recent_logits, unused_loss):
       """Inference step."""
       recent_output.set_shape([None, None, None, 1])
       padded = tf.pad(recent_output, [[0, 0], [0, 1], [0, 0], [0, 0]])
       features["targets"] = padded
       # This is inefficient in that it generates samples at all timesteps,
       # not just the last one, except if last_position_only is set (dangerous).
-      samples = self.sample(features, last_position_only=last_position_only)
+      samples, logits, losses = self.sample(
+          features, last_position_only=last_position_only)
       # Concatenate the already-generated recent_output with last timestep
       # of the newly-generated samples.
if last_position_only: @@ -295,7 +324,11 @@ def infer_step(recent_output, _): cur_sample = tf.to_int64(tf.expand_dims(cur_sample, axis=1)) samples = tf.concat([recent_output, cur_sample], axis=1) samples.set_shape([None, None, None, 1]) - return samples + + # Assuming we have one shard for logits. + logits = tf.concat([recent_logits, logits[0][:, -1:]], 1) + loss = sum(losses.values()) + return samples, logits, loss # Create an initial output tensor. This will be passed # to the infer_step, which adds one timestep at every iteration. @@ -308,20 +341,32 @@ def infer_step(recent_output, _): # input shape, so we confuse it about the input shape. initial_output = tf.slice(initial_output, [0, 0, 0, 0], tf.shape(initial_output)) - if _is_class_modality( - self._hparams.problems[self._problem_idx].target_modality): + target_modality = self._hparams.problems[self._problem_idx].target_modality + if _is_class_modality(target_modality): decode_length = 1 else: decode_length = tf.shape(features["inputs"])[1] + decode_length - result = tf.foldl( - infer_step, - tf.range(decode_length), - initializer=initial_output, + # Initial values of result, logits and loss. + result = initial_output + # tensor of shape [batch_size, time, 1, 1, vocab_size] + logits = tf.zeros((batch_size, 0, 1, 1, target_modality.top_dimensionality)) + logits.set_shape([None, None, None, None, None]) + loss = 0.0 + + result, logits, loss = tf.while_loop( + lambda result, logits, loss: tf.shape(result)[1] < decode_length, + infer_step, [result, logits, loss], + shape_invariants=[ + tf.TensorShape([None, None, None, None]), + tf.TensorShape([None, None, None, None, None]), + tf.TensorShape([]), + ], back_prop=False, parallel_iterations=1) if inputs_old is not None: # Restore to not confuse Estimator. features["inputs"] = inputs_old - return result + losses = {"training": loss} + return result, logits, losses def sample(self, features, last_position_only=False): """Run the model and extract samples. @@ -332,8 +377,10 @@ def sample(self, features, last_position_only=False): Returns: samples: an integer `Tensor`. + logits: a list of `Tensor`s, one per datashard. + losses: a dictionary: {loss-name (string): floating point `Scalar`}. 
""" - sharded_logits, _ = self.model_fn( + sharded_logits, losses = self.model_fn( features, False, last_position_only=last_position_only) if self._hparams.sampling_method == "argmax": sharded_samples = self._data_parallelism(tf.argmax, sharded_logits, 4) @@ -349,7 +396,7 @@ def _multinomial_squeeze(logits): sharded_samples = self._data_parallelism(_multinomial_squeeze, sharded_logits) - return tf.concat(sharded_samples, 0) + return tf.concat(sharded_samples, 0), sharded_logits, losses def _shard_features(self, features): # pylint: disable=missing-docstring sharded_features = dict() diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index 703bc5b2f..e72938867 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -63,6 +63,9 @@ "The number of steps to run training for.") flags.DEFINE_integer("eval_steps", 10, "Number of steps in evaluation.") flags.DEFINE_bool("eval_print", False, "Print eval logits and predictions.") +flags.DEFINE_bool("eval_run_autoregressive", False, + "Run eval autoregressively where we condition on previous" + "generated output instead of the actual target.") flags.DEFINE_integer("keep_checkpoint_max", 20, "How many recent checkpoints to keep.") flags.DEFINE_bool("experimental_optimize_placement", False, From a9826deb47ea061b597c128935a2a79ec7a67193 Mon Sep 17 00:00:00 2001 From: Niki Parmar Date: Wed, 9 Aug 2017 13:15:53 -0700 Subject: [PATCH 14/17] Extend decode_from_dataset to run decode iteratively for specified number of samples rather than one PiperOrigin-RevId: 164761976 --- tensor2tensor/utils/decoding.py | 41 ++++++++++++++++++---------- tensor2tensor/utils/trainer_utils.py | 3 ++ 2 files changed, 29 insertions(+), 15 deletions(-) diff --git a/tensor2tensor/utils/decoding.py b/tensor2tensor/utils/decoding.py index 12057d8e6..5e8f4d482 100644 --- a/tensor2tensor/utils/decoding.py +++ b/tensor2tensor/utils/decoding.py @@ -45,13 +45,13 @@ def decode_from_dataset(estimator): tf.logging.info("Performing local inference.") infer_problems_data = data_reader.get_data_filepatterns( FLAGS.problems, hparams.data_dir, tf.contrib.learn.ModeKeys.INFER) + infer_input_fn = input_fn_builder.build_input_fn( mode=tf.contrib.learn.ModeKeys.INFER, hparams=hparams, data_file_patterns=infer_problems_data, num_datashards=devices.data_parallelism().n, fixed_problem=i) - result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=False) def log_fn(inputs, targets, @@ -66,14 +66,21 @@ def log_fn(inputs, "%s_prediction_%d.jpg" % (problem, j)) show_and_save_image(inputs / 255., save_path) elif inputs_vocab: - decoded_inputs = inputs_vocab.decode(_save_until_eos(inputs.flatten())) + decoded_inputs = inputs_vocab.decode( + _save_until_eos(inputs.flatten())) tf.logging.info("Inference results INPUT: %s" % decoded_inputs) - decoded_outputs = targets_vocab.decode(_save_until_eos(outputs.flatten())) + if FLAGS.identity_output: + decoded_outputs = " ".join(map(str, outputs.flatten())) + decoded_targets = " ".join(map(str, targets.flatten())) + else: + decoded_outputs = targets_vocab.decode( + _save_until_eos(outputs.flatten())) + decoded_targets = targets_vocab.decode( + _save_until_eos(targets.flatten())) + tf.logging.info("Inference results OUTPUT: %s" % decoded_outputs) - decoded_targets = targets_vocab.decode(_save_until_eos(targets.flatten())) tf.logging.info("Inference results TARGET: %s" % decoded_targets) - if FLAGS.decode_to_file: output_filepath = FLAGS.decode_to_file + ".outputs." 
+ problem output_file = tf.gfile.Open(output_filepath, "a") @@ -81,21 +88,25 @@ def log_fn(inputs, target_filepath = FLAGS.decode_to_file + ".targets." + problem target_file = tf.gfile.Open(target_filepath, "a") target_file.write(decoded_targets + "\n") - - # The function predict() returns an iterable over the network's - # predictions from the test input. We use it to log inputs and decodes. - inputs_iter = result_iter["inputs"] - targets_iter = result_iter["targets"] - outputs_iter = result_iter["outputs"] - for j, result in enumerate(zip(inputs_iter, targets_iter, outputs_iter)): - inputs, targets, outputs = result + result_iter = estimator.predict(input_fn=infer_input_fn, as_iterable=True) + count = 0 + for result in result_iter: + # predictions from the test input. We use it to log inputs and decodes. + inputs = result["inputs"] + targets = result["targets"] + outputs = result["outputs"] if FLAGS.decode_return_beams: output_beams = np.split(outputs, FLAGS.decode_beam_size, axis=0) for k, beam in enumerate(output_beams): tf.logging.info("BEAM %d:" % k) - log_fn(inputs, targets, beam, problem, j) + log_fn(inputs, targets, beam, problem, count) else: - log_fn(inputs, targets, outputs, problem, j) + log_fn(inputs, targets, outputs, problem, count) + + count += 1 + if FLAGS.decode_num_samples != -1 and count >= FLAGS.decode_num_samples: + break + tf.logging.info("Completed inference on %d samples." % count) def decode_from_file(estimator, filename): diff --git a/tensor2tensor/utils/trainer_utils.py b/tensor2tensor/utils/trainer_utils.py index e72938867..22fd727f9 100644 --- a/tensor2tensor/utils/trainer_utils.py +++ b/tensor2tensor/utils/trainer_utils.py @@ -121,6 +121,9 @@ flags.DEFINE_integer("decode_max_input_size", -1, "Maximum number of ids in input. Or <= 0 for no max.") flags.DEFINE_bool("identity_output", False, "To print the output as identity") +flags.DEFINE_integer("decode_num_samples", -1, + "Number of samples to decode. Currently used in" + "decode_from_dataset. Use -1 for all.") def make_experiment_fn(data_dir, model_name, train_steps, eval_steps): From 4a36fb88638effd2262522f9eab93d02b339be95 Mon Sep 17 00:00:00 2001 From: Lukasz Kaiser Date: Wed, 9 Aug 2017 14:15:46 -0700 Subject: [PATCH 15/17] Fix for issue #215 on github, update transformer_vae. 
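
The kl_loss rescaling in this change leans on common_layers.inverse_exp_decay
to warm the KL term in over hparams.kl_warmup_steps. A rough standalone sketch
of that schedule (a NumPy approximation for intuition, assuming the usual
min_value=0.01 default; not the library implementation):

    import numpy as np

    def inverse_exp_decay(step, max_step, min_value=0.01):
      # Rises exponentially from min_value at step 0 to 1.0 at max_step,
      # then stays at 1.0, so the KL penalty only bites late in training.
      inv_base = np.exp(np.log(min_value) / float(max_step))
      return inv_base ** max(float(max_step) - step, 0.0)
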
PiperOrigin-RevId: 164771762 --- tensor2tensor/models/transformer_vae.py | 6 +++--- tensor2tensor/utils/devices.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tensor2tensor/models/transformer_vae.py b/tensor2tensor/models/transformer_vae.py index 404d17783..74f1e4c8f 100644 --- a/tensor2tensor/models/transformer_vae.py +++ b/tensor2tensor/models/transformer_vae.py @@ -109,7 +109,7 @@ def encode(x, x_space, hparams, name): with tf.variable_scope(name): (encoder_input, encoder_self_attention_bias, _) = transformer.transformer_prepare_encoder(x, x_space, hparams) - encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.residual_dropout) + encoder_input = tf.nn.dropout(encoder_input, 1.0 - hparams.dropout) return transformer.transformer_encoder( encoder_input, encoder_self_attention_bias, hparams) @@ -143,7 +143,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): max_prestep = hparams.kl_warmup_steps prob_targets = 0.95 if is_training else 1.0 targets_dropout_max = common_layers.inverse_lin_decay(max_prestep) - 0.01 - targets = dropmask(targets, targets_dropout_max, is_training) + targets = dropmask(targets, targets_dropout_max * 0.7, is_training) targets = tf.cond(tf.less(tf.random_uniform([]), prob_targets), lambda: targets, lambda: tf.zeros_like(targets)) @@ -168,7 +168,7 @@ def vae_transformer_internal(inputs, targets, target_space, hparams): # ret = tf.squeeze(to_decode, axis=2) # Randomize decoder inputs.. - kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 3.0 + kl_loss *= common_layers.inverse_exp_decay(max_prestep) * 10.0 return tf.expand_dims(ret, axis=2), kl_loss diff --git a/tensor2tensor/utils/devices.py b/tensor2tensor/utils/devices.py index 4f76367e9..d04b73563 100644 --- a/tensor2tensor/utils/devices.py +++ b/tensor2tensor/utils/devices.py @@ -112,7 +112,7 @@ def _replica_device_setter(worker_device): if FLAGS.schedule == "local_run": assert not FLAGS.sync datashard_devices = ["gpu:%d" % d for d in _gpu_order(FLAGS.worker_gpu)] - if FLAGS.locally_shard_to_cpu: + if FLAGS.locally_shard_to_cpu or FLAGS.worker_gpu < 1: datashard_devices += ["cpu:0"] caching_devices = None elif FLAGS.sync: From ae4919238bc1837f6c613ef8951a7c78322f5dda Mon Sep 17 00:00:00 2001 From: Ryan Sepassi Date: Wed, 9 Aug 2017 14:45:45 -0700 Subject: [PATCH 16/17] fix some open-source imports PiperOrigin-RevId: 164776330 --- tensor2tensor/data_generators/desc2code_test.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/tensor2tensor/data_generators/desc2code_test.py b/tensor2tensor/data_generators/desc2code_test.py index 0d10c7d6f..24b7568d0 100644 --- a/tensor2tensor/data_generators/desc2code_test.py +++ b/tensor2tensor/data_generators/desc2code_test.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Tests for google3.third_party.py.tensor2tensor.data_generators.desc2code.""" +"""Tests for desc2code.""" from __future__ import absolute_import from __future__ import division @@ -21,8 +21,8 @@ # Dependency imports from tensor2tensor.data_generators import desc2code -from google3.testing.pybase import googletest +import tensorflow as tf CODE_CPP_IN = """ #include @@ -39,10 +39,11 @@ """ -CODE_CPP_OUT = "#include void main() { /* Not this one */ int a = 3; }" # pylint: disable=line-too-loong +CODE_CPP_OUT = ("#include void main() { /* Not this one */ int a = " + "3; }") -class Desc2codeTest(googletest.TestCase): +class Desc2codeTest(tf.test.TestCase): def testCppPreprocess(self): """Check that the file correctly preprocess the code source.""" @@ -50,15 +51,13 @@ def testCppPreprocess(self): self.assertEqual( # Add space beween two lines cpp_pb.preprocess_target("firstline//comm1\nsecondline//comm2\n"), - "firstline secondline" - ) + "firstline secondline") # Checking for boths comments and spaces self.assertEqual(cpp_pb.preprocess_target(CODE_CPP_IN), CODE_CPP_OUT) self.assertEqual( cpp_pb.preprocess_target(" not removed //abcd "), - "not removed //abcd" - ) + "not removed //abcd") if __name__ == "__main__": - googletest.main() + tf.test.main() From af4f1e03b24cccfd56a2eb49ed50caf29f6cd361 Mon Sep 17 00:00:00 2001 From: T2T Team Date: Wed, 9 Aug 2017 14:54:45 -0700 Subject: [PATCH 17/17] Simplify calls to embedding_to_padding, we always end up converting the padding mask to a float tensor. PiperOrigin-RevId: 164777753 --- tensor2tensor/layers/common_attention.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/tensor2tensor/layers/common_attention.py b/tensor2tensor/layers/common_attention.py index a43afec47..2b1bd124f 100644 --- a/tensor2tensor/layers/common_attention.py +++ b/tensor2tensor/layers/common_attention.py @@ -166,17 +166,17 @@ def add_positional_embedding_nd(x, max_length, name): def embedding_to_padding(emb): - """Input embeddings -> is_padding. + """Calculates the padding mask based on which embeddings are all zero. We have hacked symbol_modality to return all-zero embeddings for padding. Args: emb: a Tensor with shape [..., depth]. Returns: - a boolean Tensor with shape [...]. + a float Tensor with shape [...]. """ emb_sum = tf.reduce_sum(tf.abs(emb), axis=-1) - return tf.equal(emb_sum, 0.0) + return tf.to_float(tf.equal(emb_sum, 0.0)) def attention_bias_lower_triangle(length): @@ -197,13 +197,13 @@ def attention_bias_ignore_padding(memory_padding): """Create an bias tensor to be added to attention logits. Args: - memory_padding: a boolean `Tensor` with shape [batch, memory_length]. + memory_padding: a float `Tensor` with shape [batch, memory_length]. Returns: a `Tensor` with shape [batch, 1, 1, memory_length]. """ - ret = tf.to_float(memory_padding) * -1e9 - return tf.expand_dims(tf.expand_dims(ret, 1), 1) + ret = memory_padding * -1e9 + return tf.expand_dims(tf.expand_dims(ret, axis=1), axis=1) def attention_bias_proximal(length): @@ -523,8 +523,7 @@ def pad_l_and_r(x, pad_length): # [batch, heads, blocks, block_length, dim] k_new = tf.transpose(k_new, [2, 3, 0, 1, 4]) - attention_bias = tf.expand_dims( - tf.to_float(embedding_to_padding(k_new)) * -1e9, axis=-2) + attention_bias = tf.expand_dims(embedding_to_padding(k_new) * -1e9, axis=-2) v_t = tf.transpose(v, [2, 0, 1, 3]) v_new = tf.gather(v_t, gather_indices)