diff --git a/README.md b/README.md
index aa00654..e99bfc5 100644
--- a/README.md
+++ b/README.md
@@ -2,19 +2,36 @@
 This repo contains the code used in [Encoder-Agnostic Adaptation for Conditional Language Generation](https://arxiv.org/abs/1908.06938), Zachary M. Ziegler, Luke Melas-Kyriazi, Sebastian Gehrmann and Alexander M. Rush. It extends [OpenNMT-py](https://github.com/OpenNMT/OpenNMT-py).
 
-This code was tested with `pytorch 1.0.1`. See requirements.txt for a complete list of dependencies.
+This code was tested with `pytorch 1.2.0`. See requirements.txt for a complete list of dependencies.
+Set up the environment as follows:
+
+1. Install Python 3.6.1 (3.7 also works fine).
+2. Upgrade to pip 21.0.1 via `pip install --upgrade pip`.
+3. Install the Python dependencies via `pip install -r requirements.txt`.
+
+## Download Datasets
+
+The CNN/DailyMail dataset is available [here](https://github.com/abisee/cnn-dailymail) (`Python 2`) or [here](https://github.com/saadz-khan/cnn-dailymail) (`Python 3`).
+
+1. Download the stories files.
+2. To process the raw stories from the above repositories, follow the `google-colab` notebook [here](https://github.com/saadz-khan/cnn-dailymail/blob/master/made_easy.ipynb) for easy use.
+3. Convert the format from `.bin` to `.txt.tgt` and `.txt.src` using [this](https://gist.github.com/jorgeramirez/15286b588dc2669ced95bbf6a6803420) script (each `.bin` file separately) or the above notebook.
+
 ## Download GPT2 weights
 
 `cd gpt2 && python download_model.py 124M`
+
+## Data Used
+
+The BPEized data used in the experiments in the paper can be found [here](https://drive.google.com/file/d/1Z6AdOr2MtWlN7sYRTMibzAcghBjSBzZK/view?usp=sharing). To run any of these models with your own data you should first BPEize it with `python gpt2/encode_text.py `. Before training, the raw data is preprocessed into binary data shards.
+
+
 ## General notes
 
 All experiments use gradient accumulation to mimic the large batch sizes these hyperparameter settings were optimized for by e.g. Facebook. If you run into GPU memory issues simply reduce the batch size and increase the `accum_count` to keep the effective batch size the same.
 
-## Data
-
-The BPEized data used in the experiments in the paper can be found [here](https://drive.google.com/file/d/1Z6AdOr2MtWlN7sYRTMibzAcghBjSBzZK/view?usp=sharing). To run any of these models with your own data you should first BPEize it with `python gpt2/encode_text.py `. Before training the raw data is preprocessed into binary data shards with the commands below.
 
 ## Class-conditional generation
@@ -22,7 +39,10 @@ The BPEized data used in the experiments in the paper can be found [here](https
 
 `python preprocess.py -train_src data/imdb/train.src.bpe -train_tgt data/imdb/train.tgt.bpe -valid_src data/imdb/valid.src.bpe -valid_tgt data/imdb/valid.tgt.bpe -save_data data/imdb/IMDB_BPETGT -tgt_seq_length_trunc 400 -tgt_vocab gpt2/vocab.txt -fixed_vocab -free_src`
 
-### Train
+### Train
+
+Update the config files for your setup: set `paths`, `gpu_ranks`, and `world_size`, and adjust `accum_count` so the effective batch size remains the same.
+
 **Baseline**: `python train.py -config config/imdb/transformer_imdb_cond.yml -run_name baseline`
 
 **Simple fusion**: `python train.py -config config/imdb/transformer_imdb_cond.yml -run_name simple_fusion -gpt2_params_path gpt2/models/124M/ -simple_fusion -dropout 0.1 -accum_count 30 -batch_size 1000 -valid_batch_size 16`
@@ -83,7 +103,3 @@ The default settings use 4 GPUs (see config files). If using more GPUs or fewer
 Generation is performed via top-k/random sampling.
 
 `python translate.py -beam_size 1 -random_sampling_topk 100 -random_sampling_temp 0.9 -model -src data/stories/test.wp_source.bpe -max_length 1000 -verbose`
-
-## Image captioning
-
-Coming soon...
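A note on the gradient accumulation mentioned in "General notes" above: the usual pattern is to scale each mini-batch loss by `accum_count` and step the optimizer only every `accum_count` batches, so the accumulated gradients match what one larger batch would produce. A minimal PyTorch sketch; `model`, `optimizer`, and `batches` are illustrative stand-ins, not this repo's API:

```python
# Minimal gradient-accumulation sketch (illustrative, not repo code).
import torch

model = torch.nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.CrossEntropyLoss()

accum_count = 4  # mimics a 4x larger batch on a single GPU
batches = [(torch.randn(8, 10), torch.randint(0, 2, (8,)))
           for _ in range(8)]

optimizer.zero_grad()
for i, (x, y) in enumerate(batches):
    loss = loss_fn(model(x), y) / accum_count  # scale so the sum averages out
    loss.backward()                            # gradients accumulate in .grad
    if (i + 1) % accum_count == 0:
        optimizer.step()                       # one update per accum_count batches
        optimizer.zero_grad()
```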
diff --git a/config/cnndm/transformer_cnndm_baseline.yml b/config/cnndm/transformer_cnndm_baseline.yml
index fafbdf3..b99b5a5 100644
--- a/config/cnndm/transformer_cnndm_baseline.yml
+++ b/config/cnndm/transformer_cnndm_baseline.yml
@@ -43,10 +43,12 @@ share_embeddings: 'true'
 copy_attn: 'true'
 
-world_size: 4
-gpu_ranks:
-- 0
-- 1
-- 2
-- 3
+# world_size must match the number of GPUs used.
+world_size: 1
+# CPU by default, since not everyone has a GPU. For GPU training, uncomment the following lines.
+#gpu_ranks:
+#- 0
+#- 1
+#- 2
+#- 3
 
 tensorboard: 'true'
diff --git a/config/cnndm/transformer_cnndm_ctxattn.yml b/config/cnndm/transformer_cnndm_ctxattn.yml
index a7e3d30..6888975 100644
--- a/config/cnndm/transformer_cnndm_ctxattn.yml
+++ b/config/cnndm/transformer_cnndm_ctxattn.yml
@@ -46,10 +46,13 @@ share_embeddings: 'true'
 copy_attn: 'true'
 
-world_size: 4
-gpu_ranks:
-- 0
-- 1
-- 2
-- 3
+# world_size must match the number of GPUs used.
+world_size: 1
+# CPU by default, since not everyone has a GPU. For GPU training, uncomment the following lines.
+#gpu_ranks:
+#- 0
+#- 1
+#- 2
+#- 3
+
 
 tensorboard: 'true'
diff --git a/config/cnndm/transformer_cnndm_psa.yml b/config/cnndm/transformer_cnndm_psa.yml
index 409113c..01b20a0 100644
--- a/config/cnndm/transformer_cnndm_psa.yml
+++ b/config/cnndm/transformer_cnndm_psa.yml
@@ -46,10 +46,12 @@ share_embeddings: 'true'
 copy_attn: 'true'
 
-world_size: 4
-gpu_ranks:
-- 0
-- 1
-- 2
-- 3
+# world_size must match the number of GPUs used.
+world_size: 1
+# CPU by default, since not everyone has a GPU. For GPU training, uncomment the following lines.
+#gpu_ranks:
+#- 0
+#- 1
+#- 2
+#- 3
 
 tensorboard: 'true'
diff --git a/config/imdb/transformer_imdb_cond.yml b/config/imdb/transformer_imdb_cond.yml
index 1378669..3e8b1fd 100644
--- a/config/imdb/transformer_imdb_cond.yml
+++ b/config/imdb/transformer_imdb_cond.yml
@@ -39,6 +39,6 @@ position_encoding_ctxsize: 1024
 share_decoder_embeddings: 'true'
 
 world_size: 1
-gpu_ranks:
-- 0
+#gpu_ranks:
+#- 0
 tensorboard: 'true'
diff --git a/config/imdb/transformer_imdb_ctxattn.yml b/config/imdb/transformer_imdb_ctxattn.yml
index de36684..b53fc33 100644
--- a/config/imdb/transformer_imdb_ctxattn.yml
+++ b/config/imdb/transformer_imdb_ctxattn.yml
@@ -42,6 +42,6 @@ position_encoding_ctxsize: 1024
 share_decoder_embeddings: 'true'
 
 world_size: 1
-gpu_ranks:
-- 0
+#gpu_ranks:
+#- 0
 tensorboard: 'true'
diff --git a/config/imdb/transformer_imdb_psa.yml b/config/imdb/transformer_imdb_psa.yml
index 6ae5508..d3779d6 100644
--- a/config/imdb/transformer_imdb_psa.yml
+++ b/config/imdb/transformer_imdb_psa.yml
@@ -42,6 +42,6 @@ position_encoding_ctxsize: 1024
 share_decoder_embeddings: 'true'
 
 world_size: 1
-gpu_ranks:
-- 0
+#gpu_ranks:
+#- 0
 tensorboard: 'true'
diff --git a/config/story_gen/transformer_story_baseline.yml b/config/story_gen/transformer_story_baseline.yml
index b1e9939..819b93b 100644
--- a/config/story_gen/transformer_story_baseline.yml
+++ b/config/story_gen/transformer_story_baseline.yml
@@ -43,10 +43,12 @@ position_encoding_ctxsize: 1024
 share_decoder_embeddings: 'true'
 share_embeddings: 'true' # This is not quite the same, but probably should only have positive effect?
 
-world_size: 4
-gpu_ranks:
-- 0
-- 1
-- 2
-- 3
+# world_size must match the number of GPUs used.
+world_size: 1
+# CPU by default, since not everyone has a GPU. For GPU training, uncomment the following lines.
+#gpu_ranks:
+#- 0
+#- 1
+#- 2
+#- 3
 tensorboard: 'true'
diff --git a/config/story_gen/transformer_story_ctxattn.yml b/config/story_gen/transformer_story_ctxattn.yml
index 2863448..6275af2 100644
--- a/config/story_gen/transformer_story_ctxattn.yml
+++ b/config/story_gen/transformer_story_ctxattn.yml
@@ -49,10 +49,12 @@ share_decoder_embeddings: 'true'
 share_embeddings: 'true' # This is not quite the same, but probably should only have positive effect?
 share_position_embeddings: 'true'
 
-world_size: 4
-gpu_ranks:
-- 0
-- 1
-- 2
-- 3
+# world_size must match the number of GPUs used.
+world_size: 1
+# CPU by default, since not everyone has a GPU. For GPU training, uncomment the following lines.
+#gpu_ranks:
+#- 0
+#- 1
+#- 2
+#- 3
 tensorboard: 'true'
diff --git a/config/story_gen/transformer_story_psa.yml b/config/story_gen/transformer_story_psa.yml
index cd55406..946b255 100644
--- a/config/story_gen/transformer_story_psa.yml
+++ b/config/story_gen/transformer_story_psa.yml
@@ -49,10 +49,12 @@ share_decoder_embeddings: 'true'
 share_embeddings: 'true' # This is not quite the same, but probably should only have positive effect?
 share_position_embeddings: 'true'
 
-world_size: 4
-gpu_ranks:
-- 0
-- 1
-- 2
-- 3
+# world_size must match the number of GPUs used.
+world_size: 1
+# CPU by default, since not everyone has a GPU. For GPU training, uncomment the following lines.
+#gpu_ranks:
+#- 0
+#- 1
+#- 2
+#- 3
 tensorboard: 'true'
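The configs above cut `world_size` from 4 to 1, so `accum_count` has to grow by the same factor to keep the effective batch size constant. A sketch of the bookkeeping, assuming the usual OpenNMT-py convention that effective batch size is `batch_size * accum_count * world_size`:

```python
# Effective-batch-size bookkeeping when changing the GPU count.
# Assumes effective batch = batch_size * accum_count * world_size,
# the usual OpenNMT-py convention.
def scaled_accum_count(accum_count, old_world_size, new_world_size):
    """Keep batch_size * accum_count * world_size constant."""
    scaled = accum_count * old_world_size / new_world_size
    assert scaled == int(scaled), "choose a world_size that divides evenly"
    return int(scaled)

# e.g. a config tuned for 4 GPUs with accum_count 30 needs 120 on one GPU
print(scaled_accum_count(30, old_world_size=4, new_world_size=1))  # -> 120
```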
diff --git a/data/data_convert.py b/data/data_convert.py
new file mode 100644
index 0000000..c577c2d
--- /dev/null
+++ b/data/data_convert.py
@@ -0,0 +1,115 @@
+# Copyright 2017 Google Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ==============================================================================
+
+"""Example of converting model data.
+Usage:
+python data_convert.py --command binary_to_text --in_file /path/to_bin/data.bin --out_file name_text_data.txt
+python data_convert.py --command text_to_binary --in_file /path/to_text/text_data.txt --out_file data/name_binary_data(.bin)
+"""
+
+"""
+For this script use Tensorflow 1.15.2.
+Modified version:
+I adapted the _binary_to_text function so that it outputs the format expected by NeuSum (https://github.com/magic282/NeuSum):
+    article  => out_file.txt.src.txt
+    abstract => out_file.txt.tgt.txt
+The next step is a crude workaround, but it works.
+Rename the files (necessary so that encode_text.py can find the data for BPEizing it):
+    article  => out_file.txt.src
+    abstract => out_file.txt.tgt
+"""
+
+
+import struct
+import sys
+import tensorflow as tf
+from tensorflow.core.example import example_pb2
+import os
+from nltk.tokenize import sent_tokenize, word_tokenize
+
+FLAGS = tf.app.flags.FLAGS
+tf.app.flags.DEFINE_string('command', 'binary_to_text',
+                           'Either binary_to_text or text_to_binary. '
+                           'Specify FLAGS.in_file accordingly.')
+tf.app.flags.DEFINE_string('in_file', '', 'path to file')
+tf.app.flags.DEFINE_string('out_file', '', 'path to file')
+
+
+def _binary_to_text_for_neusum():
+    reader = open(FLAGS.in_file, 'rb')
+    writer_src = open("%s.src.txt" % FLAGS.out_file, 'w')
+    writer_tgt = open("%s.tgt.txt" % FLAGS.out_file, 'w')
+    while True:
+        len_bytes = reader.read(8)
+        if not len_bytes:
+            sys.stderr.write('Done reading\n')
+            break
+        str_len = struct.unpack('q', len_bytes)[0]
+        tf_example_str = struct.unpack(
+            '%ds' % str_len, reader.read(str_len))[0]
+        tf_example = example_pb2.Example.FromString(tf_example_str)
+        src_sentences = sent_tokenize(
+            "%s" % tf_example.features.feature["article"].bytes_list.value[0])
+
+        # in this case we get rid of the <s> and </s> sentence tags, e.g.
+        # <s> harry potter star daniel radcliffe gets # 20m fortune as he turns 18 monday . </s> <s> young actor says he has no plans to fritter his cash away . </s> <s> radcliffe 's earnings from first five potter films have been held in trust fund . </s>
+        tgt_txt = "%s" % tf_example.features.feature["abstract"].bytes_list.value[0]
+        tgt_txt = tgt_txt.replace("</s> <s>", "##SENT##")
+        tgt_txt = tgt_txt.replace("<s> ", "")
+        tgt_txt = tgt_txt.replace(" </s>", "")
+
+        writer_src.write("##SENT##".join(src_sentences) + os.linesep)
+        writer_tgt.write(tgt_txt + os.linesep)
+
+        # examples = []
+        # for key in tf_example.features.feature:
+        #     examples.append('%s=%s' % (
+        #         key, tf_example.features.feature[key].bytes_list.value[0]))
+        # writer.write('%s\n' % '\t'.join(examples))
+    reader.close()
+    writer_src.close()
+    writer_tgt.close()
+
+
+def _text_to_binary():
+    inputs = open(FLAGS.in_file, 'r').readlines()
+    writer = open(FLAGS.out_file, 'wb')
+    for inp in inputs:
+        tf_example = example_pb2.Example()
+        for feature in inp.strip().split('\t'):
+            (k, v) = feature.split('=')
+            if k.startswith('"') and k.endswith('"'):
+                k = k[1:-1]
+            if v.startswith('"') and v.endswith('"'):
+                v = v[1:-1]
+            tf_example.features.feature[k].bytes_list.value.extend(
+                [v.encode('utf8')])
+        tf_example_str = tf_example.SerializeToString()
+        str_len = len(tf_example_str)
+        writer.write(struct.pack('q', str_len))
+        writer.write(struct.pack('%ds' % str_len, tf_example_str))
+    writer.close()
+
+
+def main(unused_argv):
+    assert FLAGS.command and FLAGS.in_file and FLAGS.out_file
+    if FLAGS.command == 'binary_to_text':
+        _binary_to_text_for_neusum()
+    elif FLAGS.command == 'text_to_binary':
+        _text_to_binary()
+
+
+if __name__ == '__main__':
+    tf.app.run()
\ No newline at end of file
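For reference, the `.bin` files `data_convert.py` parses are a flat sequence of length-prefixed records: an 8-byte native-endian length (`struct` format `q`) followed by that many bytes of serialized `tf.Example`. A self-contained round trip of just the framing, with opaque stand-in payloads instead of real protos:

```python
# Round trip of the length-prefixed .bin framing used above.
import io
import struct

def write_records(fh, payloads):
    for p in payloads:
        fh.write(struct.pack('q', len(p)))        # 8-byte length header
        fh.write(struct.pack('%ds' % len(p), p))  # raw serialized example

def read_records(fh):
    while True:
        header = fh.read(8)
        if not header:
            return
        (n,) = struct.unpack('q', header)
        yield struct.unpack('%ds' % n, fh.read(n))[0]

buf = io.BytesIO()
write_records(buf, [b'example-one', b'example-two'])
buf.seek(0)
print(list(read_records(buf)))  # [b'example-one', b'example-two']
```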
diff --git a/onmt/decoders/transformer.py b/onmt/decoders/transformer.py
index fe09ce2..e82b4f1 100644
--- a/onmt/decoders/transformer.py
+++ b/onmt/decoders/transformer.py
@@ -232,9 +232,9 @@ def forward(self, inputs, memory_bank, src_pad_mask, tgt_pad_mask,
             future_mask = torch.ones(
                 [tgt_len, tgt_len],
                 device=tgt_pad_mask.device,
-                dtype=torch.uint8)
+                dtype=torch.uint8).byte()
             future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)
-            dec_mask = torch.gt(tgt_pad_mask + future_mask, 0)
+            dec_mask = torch.gt(tgt_pad_mask.byte() + future_mask, 0)
 
         input_norm = self.layer_norm_1(inputs)
 
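The `.byte()` casts above work around the mask dtype changes in `torch 1.2`, where padding masks start coming through as `torch.bool`; adding a bool pad mask to a `uint8` future mask then fails, so both operands are cast to `uint8` first. A standalone sketch of the same masking logic with illustrative shapes (it assumes `tgt_pad_mask` arrives as a bool tensor of shape `[batch, 1, tgt_len]`):

```python
# Standalone sketch of the decoder mask construction above.
import torch

tgt_len = 4
# assumed shape [batch, 1, tgt_len]; bool under torch >= 1.2
tgt_pad_mask = torch.zeros(1, 1, tgt_len, dtype=torch.bool)

future_mask = torch.ones(
    [tgt_len, tgt_len], dtype=torch.uint8).byte()
future_mask = future_mask.triu_(1).view(1, tgt_len, tgt_len)

# under torch 1.2, bool + uint8 raises, so cast the pad mask first
dec_mask = torch.gt(tgt_pad_mask.byte() + future_mask, 0)
print(dec_mask.shape)  # torch.Size([1, 4, 4])
```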
diff --git a/onmt/inputters/text_dataset.py b/onmt/inputters/text_dataset.py
index fd7407e..bd6544d 100644
--- a/onmt/inputters/text_dataset.py
+++ b/onmt/inputters/text_dataset.py
@@ -182,7 +182,7 @@ def text_fields(**kwargs):
     truncate = kwargs.get("truncate", None)
     fields_ = []
     feat_delim = u"│" if n_feats > 0 else None
-    for i in range(n_feats + 1):
+    for i in range(n_feats):
         name = base_name + "_feat_" + str(i - 1) if i > 0 \
             else base_name
         truncated = [0]
         tokenize = partial(
diff --git a/onmt/train_single.py b/onmt/train_single.py
index 72645da..86d27ab 100755
--- a/onmt/train_single.py
+++ b/onmt/train_single.py
@@ -40,7 +40,10 @@ def _tally_parameters(model, only_trainable=False):
 
 def configure_process(opt, device_id):
     if device_id >= 0:
-        torch.cuda.set_device(device_id)
+        try:
+            torch.cuda.set_device(device_id)
+        except AttributeError:
+            print("Failed to set CUDA device, using CPU")
     set_random_seed(opt.seed, device_id >= 0)
 
 
diff --git a/onmt/trainer.py b/onmt/trainer.py
index 4cfc8b2..81bd000 100644
--- a/onmt/trainer.py
+++ b/onmt/trainer.py
@@ -12,6 +12,7 @@ from copy import deepcopy
 import itertools
 import torch
+import time
 
 import onmt.utils
 from onmt.utils.logging import logger
@@ -198,10 +199,9 @@ def train(self, train_iter,
                                      self.gpu_rank, None, self.n_gpu)
 
         #torch.cuda.synchronize()
-        #last_end_time = time.time()
 
         for i, (batches, normalization) in enumerate(
                 self._accum_batches(train_iter)):
-            #print('batch time: %0.5f' % (time.time() - last_end_time))
+            start_time = time.time()
             step = self.optim.training_step
             if self.gpu_verbose_level > 1:
@@ -257,7 +257,7 @@ def train(self,
                     break
 
             #torch.cuda.synchronize()
-            #last_end_time = time.time()
+            print(f"batch time: {time.time() - start_time} step {step}")
 
         if self.model_saver is not None:
             self.model_saver.save(step, moving_average=self.moving_average)
diff --git a/requirements.txt b/requirements.txt
index 5b6cfbf..5a19023 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,9 @@
-six
-tqdm==4.30.*
-torch==1.0.1
-git+https://github.com/pytorch/text.git@master#wheel=torchtext
-future
-configargparse
-PyYAML
-tensorflow
-tensorboardX
+six==1.15.0
+tqdm==4.30.0
+torch==1.2.0
+torchtext==0.4.0
+future==0.18.2
+configargparse==1.2.3
+PyYAML==5.3.1
+tensorflow==1.15.0
+tensorboardX==2.1
\ No newline at end of file
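With every dependency now pinned, it is easy to sanity-check an environment against the pins before training. A small sketch using `pkg_resources` (ships with setuptools); the dictionary below is a hand-picked subset of requirements.txt:

```python
# Check a few pinned dependencies against the installed versions.
import pkg_resources

pins = {'torch': '1.2.0', 'torchtext': '0.4.0', 'tensorflow': '1.15.0'}
for name, wanted in pins.items():
    try:
        found = pkg_resources.get_distribution(name).version
        status = 'OK' if found == wanted else 'MISMATCH (installed %s)' % found
    except pkg_resources.DistributionNotFound:
        status = 'MISSING'
    print('%-12s %-8s %s' % (name, wanted, status))
```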