diff --git a/README.md b/README.md index 78bec70..386c6b9 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ ForecastPFN is more accurate and faster compared to state-of-the-art forecasting The codebase has these parts: - `./src/` contains all code to replicate the ForecastPFN synthetic data generation and training procedure -- `./benchmark/` contains all the code to replicate the benchmark of ForecastPFN against the the other baselines. +- `./benchmark/` contains all the code to replicate the benchmark of ForecastPFN against the other baselines. # Table of contents 1. [Installation](#installation-) @@ -80,7 +80,7 @@ The arguments that are passed are: See how our model performs: ![alt text](img/fpfn_performance.png?raw=true) -The above figure shows analysis of performance vs. train budget, aggregated across datasets and prediction lengths. We plot the number of total MSE wins (left) where a higher value is better and mean MSE rank (right) where a lower values is better. Error bars show one standard deviation across training runs. ForecastPFN and Meta-N-BEATS are disadvantaged in these comparisons given that they see no training data for these series, only the length 36 input. +The above figure shows an analysis of performance vs. train budget, aggregated across datasets and prediction lengths. We plot the number of total MSE wins (left) where a higher value is better and mean MSE rank (right) where a lower value is better. Error bars show one standard deviation across training runs. ForecastPFN and Meta-N-BEATS are disadvantaged in these comparisons given that they see no training data for these series, only the length 36 input. # Synthetic Data Generation ForecastPFN is completely trained on synthetic data. diff --git a/benchmark/.DS_Store b/benchmark/.DS_Store deleted file mode 100644 index d6bded0..0000000 Binary files a/benchmark/.DS_Store and /dev/null differ diff --git a/benchmark/README.md b/benchmark/README.md index 7b051d3..6ec8da7 100644 --- a/benchmark/README.md +++ b/benchmark/README.md @@ -1,4 +1,4 @@ -This directory is for evaluation of ForecastPFN. We have evaluated ForecastPFN on seven real-world datasets which have been used in the literature. The datasets are in the `../academic_data` folder. The datasets include Illness, Exchange, ECL, ETTh1 and ETTh2, Weather and Traffic. +This directory is for the evaluation of ForecastPFN. We have evaluated ForecastPFN on seven real-world datasets that have been used in the literature. The datasets are in the `../academic_data` folder. The datasets include Illness, Exchange, ECL, ETTh1 and ETTh2, Weather and Traffic. The evaluation has been done against multiple baselines which include Arima, Prophet, Informer, Fedformer-w, Autoformer, Transformer and Metalearn, as well as more simple baselines Mean, Last, and NaiveSeasonal. @@ -24,12 +24,12 @@ The arguments that are passed are: - `root_path` : This denotes the parent directory which contains the required dataset. - `data_path` : This denotes the name of the file which contains the data. Look into the academic_data folder for information regarding other dataset files. - `model` : This is one of (ForecastPFN, Metalearn, Arima, Autoformer, Informer, Transformer, FEDformer-w, Prophet) -- `seq_len` : The length of input sequence to be used. In our default setting, we have this set to 96 for exchange and 36 for all other datasets. +- `seq_len` : The length of the input sequence to be used. In our default setting, we have this set to 96 for exchange and 36 for all other datasets. 
- `label_len` : In our default setting, we have this set to 48 for exchange and 18 for all other datasets. -- `pred_len` : This is the length of prediction to be made. We have evaluated our model with various prediction lengths. -- `train_budget` : This denotes the number of training examples that are available to the models which they can use for training. ForecastPFN and Metalearn use 0 examples since they are zero-shot. +- `pred_len` : This is the length of the prediction to be made. We have evaluated our model with various prediction lengths. +- `train_budget` : This denotes the number of training examples that are available to the models that they can use for training. ForecastPFN and Metalearn use 0 examples since they are zero-shot. - `itr` : Number of times evaluation should be repeated. This affects the transformer-based models since they are non-deterministic. All experiments that have been run for this paper can be found in `run.sh`. -Repliaction of the paper tables and plots can be found in the jupyter notebook `./analyze_results.ipynb`. \ No newline at end of file +Replication of the paper tables and plots can be found in the jupyter notebook `./analyze_results.ipynb`. \ No newline at end of file diff --git a/benchmark/data_provider/UnivariateTimeseriesSampler_WithStamps.py b/benchmark/data_provider/UnivariateTimeseriesSampler_WithStamps.py index cdbc5aa..e87ab68 100644 --- a/benchmark/data_provider/UnivariateTimeseriesSampler_WithStamps.py +++ b/benchmark/data_provider/UnivariateTimeseriesSampler_WithStamps.py @@ -1,17 +1,17 @@ import numpy as np -import pandas as pd -import datetime + class UnivariateTimeseriesSampler_WithStamps: - def __init__(self, - timeseries: np.ndarray, - time_stamps: np.ndarray, - insample_size: int, - outsample_size: int, - window_sampling_limit: int, - batch_size: int, - time_features, - ): + def __init__( + self, + timeseries: np.ndarray, + time_stamps: np.ndarray, + insample_size: int, + outsample_size: int, + window_sampling_limit: int, + batch_size: int, + time_features, + ): self.timeseries = [ts for ts in timeseries] self.time_stamps = [ts for ts in time_stamps] self.window_sampling_limit = window_sampling_limit @@ -20,7 +20,6 @@ def __init__(self, self.outsample_size = outsample_size self.time_features = time_features self.time_embedding_dim = self.time_features(self.time_stamps[0]).T.shape[0] - def __iter__(self): while True: @@ -28,47 +27,79 @@ def __iter__(self): insample_mask = np.zeros((self.batch_size, self.insample_size)) outsample = np.zeros((self.batch_size, self.outsample_size)) outsample_mask = np.zeros((self.batch_size, self.outsample_size)) - sampled_ts_indices = np.random.randint(len(self.timeseries), size=self.batch_size) + sampled_ts_indices = np.random.randint( + len(self.timeseries), size=self.batch_size + ) insample_time_stamps = np.zeros( - (self.batch_size, self.insample_size, self.time_embedding_dim), dtype=object) + (self.batch_size, self.insample_size, self.time_embedding_dim), + dtype=object, + ) outsample_time_stamps = np.zeros( - (self.batch_size, self.outsample_size, self.time_embedding_dim), dtype=object) + (self.batch_size, self.outsample_size, self.time_embedding_dim), + dtype=object, + ) for i, sampled_index in enumerate(sampled_ts_indices): sampled_timeseries = self.timeseries[sampled_index] - cut_point = np.random.randint(low=max(1, len(sampled_timeseries) - self.window_sampling_limit), - high=len(sampled_timeseries), - size=1)[0] + cut_point = np.random.randint( + low=max(1, len(sampled_timeseries) - 
self.window_sampling_limit), + high=len(sampled_timeseries), + size=1, + )[0] - insample_window = sampled_timeseries[max(0, cut_point - self.insample_size):cut_point] - insample[i, -len(insample_window):] = insample_window - insample_mask[i, -len(insample_window):] = 1.0 + insample_window = sampled_timeseries[ + max(0, cut_point - self.insample_size) : cut_point + ] + insample[i, -len(insample_window) :] = insample_window + insample_mask[i, -len(insample_window) :] = 1.0 outsample_window = sampled_timeseries[ - cut_point:min(len(sampled_timeseries), cut_point + self.outsample_size)] - outsample[i, :len(outsample_window)] = outsample_window - outsample_mask[i, :len(outsample_window)] = 1.0 + cut_point : min( + len(sampled_timeseries), cut_point + self.outsample_size + ) + ] + outsample[i, : len(outsample_window)] = outsample_window + outsample_mask[i, : len(outsample_window)] = 1.0 sampled_timestamps = self.time_stamps[sampled_index] - insample_window_time_stamps = sampled_timestamps[max(0, cut_point - self.insample_size):cut_point] - insample_time_stamps[i, -len(insample_window_time_stamps):] = self.time_features(insample_window_time_stamps) + insample_window_time_stamps = sampled_timestamps[ + max(0, cut_point - self.insample_size) : cut_point + ] + insample_time_stamps[ + i, -len(insample_window_time_stamps) : + ] = self.time_features(insample_window_time_stamps) outsample_window_timestamps = sampled_timestamps[ - cut_point:min(len(sampled_timestamps), cut_point + self.outsample_size)] - outsample_time_stamps[i, :len(outsample_window_timestamps)] = self.time_features(outsample_window_timestamps) - yield insample, insample_mask, outsample, outsample_mask, insample_time_stamps, outsample_time_stamps + cut_point : min( + len(sampled_timestamps), cut_point + self.outsample_size + ) + ] + outsample_time_stamps[ + i, : len(outsample_window_timestamps) + ] = self.time_features(outsample_window_timestamps) + yield ( + insample, + insample_mask, + outsample, + outsample_mask, + insample_time_stamps, + outsample_time_stamps, + ) def sequential_latest_insamples(self): batch_size = len(self.timeseries) insample = np.zeros((batch_size, self.insample_size)) insample_mask = np.zeros((batch_size, self.insample_size)) insample_time_stamps = np.zeros( - (batch_size, self.insample_size, self.time_embedding_dim), dtype=object) + (batch_size, self.insample_size, self.time_embedding_dim), dtype=object + ) for i, (ts, time_stamp) in enumerate(zip(self.timeseries, self.time_stamps)): - ts_last_window = ts[-self.insample_size:] - insample[i, -len(ts):] = ts_last_window - insample_mask[i, -len(ts):] = 1.0 + ts_last_window = ts[-self.insample_size :] + insample[i, -len(ts) :] = ts_last_window + insample_mask[i, -len(ts) :] = 1.0 sampled_timestamps = time_stamp - insample_window_time_stamps = sampled_timestamps[-self.insample_size:] - insample_time_stamps[i, -len(insample_window_time_stamps):] = self.time_features(insample_window_time_stamps) + insample_window_time_stamps = sampled_timestamps[-self.insample_size :] + insample_time_stamps[ + i, -len(insample_window_time_stamps) : + ] = self.time_features(insample_window_time_stamps) return insample, insample_mask, insample_time_stamps diff --git a/benchmark/data_provider/data_factory.py b/benchmark/data_provider/data_factory.py index 42f96ff..6ed4bc4 100644 --- a/benchmark/data_provider/data_factory.py +++ b/benchmark/data_provider/data_factory.py @@ -1,5 +1,7 @@ -from data_provider.data_loader import Dataset_Custom from torch.utils.data import DataLoader + +from 
data_provider.data_loader import Dataset_Custom + # from metalearned.resources.electricity.dataset import ElectricityDataset, ElectricityMeta # from metalearned.resources.m3.dataset import M3Dataset, M3Meta # from metalearned.resources.m4.dataset import M4Dataset, M4Meta @@ -74,5 +76,6 @@ def data_provider(args, flag): batch_size=batch_size, shuffle=shuffle_flag, num_workers=args.num_workers, - drop_last=drop_last) + drop_last=drop_last, + ) return data_set, data_loader diff --git a/benchmark/data_provider/data_loader.py b/benchmark/data_provider/data_loader.py index 616590c..566b74c 100644 --- a/benchmark/data_provider/data_loader.py +++ b/benchmark/data_provider/data_loader.py @@ -1,23 +1,33 @@ import os -import numpy as np +import warnings + import pandas as pd -import os -import torch -from torch.utils.data import Dataset, DataLoader -from utils.timefeatures import time_features from sklearn.preprocessing import StandardScaler -import warnings +from torch.utils.data import Dataset + +from utils.timefeatures import time_features warnings.filterwarnings('ignore') + class Dataset_Custom(Dataset): - def __init__(self, root_path, flag='train', size=None, - features='S', data_path='ETTh1.csv', - target='OT', scale=True, timeenc=0, freq='h', - scaler=StandardScaler(), train_budget=None): + def __init__( + self, + root_path, + flag='train', + size=None, + features='S', + data_path='ETTh1.csv', + target='OT', + scale=True, + timeenc=0, + freq='h', + scaler=StandardScaler(), + train_budget=None, + ): # size [seq_len, label_len, pred_len] # info - if size == None: + if size is None: self.seq_len = 24 * 4 * 4 self.label_len = 24 * 4 self.pred_len = 24 * 4 @@ -43,12 +53,11 @@ def __init__(self, root_path, flag='train', size=None, self.__read_data__() def __read_data__(self): - df_raw = pd.read_csv(os.path.join(self.root_path, - self.data_path)) + df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path)) - ''' + """ df_raw.columns: ['date', ...(other features), target feature] - ''' + """ cols = list(df_raw.columns) cols.remove(self.target) cols.remove('date') @@ -60,10 +69,13 @@ def __read_data__(self): train_start = 0 if self.train_budget: - train_start = max(train_start, num_train - - self.seq_len - self.train_budget) + train_start = max(train_start, num_train - self.seq_len - self.train_budget) - border1s = [train_start, num_train - self.seq_len, len(df_raw) - num_test - self.seq_len] + border1s = [ + train_start, + num_train - self.seq_len, + len(df_raw) - num_test - self.seq_len, + ] border2s = [num_train, num_train + num_vali, len(df_raw)] border1 = border1s[self.set_type] border2 = border2s[self.set_type] @@ -75,7 +87,7 @@ def __read_data__(self): df_data = df_raw[[self.target]] if self.scale: - train_data = df_data[0:border2s[0]] + train_data = df_data[0 : border2s[0]] self.scaler.fit(train_data.values) data = self.scaler.transform(df_data.values) else: @@ -91,7 +103,9 @@ def __read_data__(self): df_stamp['hour'] = df_stamp.date.apply(lambda row: row.hour, 1) data_stamp = df_stamp.drop(['date'], 1).values elif self.timeenc == 1: - data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) + data_stamp = time_features( + pd.to_datetime(df_stamp['date'].values), freq=self.freq + ) data_stamp = data_stamp.transpose(1, 0) self.data_x = data[border1:border2] @@ -108,10 +122,10 @@ def __getitem__(self, index): seq_y = self.data_y[r_begin:r_end] seq_x_mark = self.data_stamp[s_begin:s_end] seq_y_mark = self.data_stamp[r_begin:r_end] - seq_x_original = 
self.data_stamp_original['date'].values[s_begin:s_end] - seq_y_original = self.data_stamp_original['date'].values[r_begin:r_end] + # seq_x_original = self.data_stamp_original["date"].values[s_begin:s_end] + # seq_y_original = self.data_stamp_original["date"].values[r_begin:r_end] - return seq_x, seq_y, seq_x_mark, seq_y_mark#, seq_x_original, seq_y_original + return seq_x, seq_y, seq_x_mark, seq_y_mark # , seq_x_original, seq_y_original def __len__(self): return len(self.data_x) - self.seq_len - self.pred_len + 1 @@ -121,13 +135,24 @@ def inverse_transform(self, data): class Dataset_Pred(Dataset): - def __init__(self, root_path, flag='pred', size=None, - features='S', data_path='ETTh1.csv', - target='OT', scale=True, inverse=False, timeenc=0, freq='15min', cols=None, - scaler=StandardScaler()): + def __init__( + self, + root_path, + flag='pred', + size=None, + features='S', + data_path='ETTh1.csv', + target='OT', + scale=True, + inverse=False, + timeenc=0, + freq='15min', + cols=None, + scaler=StandardScaler(), + ): # size [seq_len, label_len, pred_len] # info - if size == None: + if size is None: self.seq_len = 24 * 4 * 4 self.label_len = 24 * 4 self.pred_len = 24 * 4 @@ -151,11 +176,10 @@ def __init__(self, root_path, flag='pred', size=None, self.__read_data__() def __read_data__(self): - df_raw = pd.read_csv(os.path.join(self.root_path, - self.data_path)) - ''' + df_raw = pd.read_csv(os.path.join(self.root_path, self.data_path)) + """ df_raw.columns: ['date', ...(other features), target feature] - ''' + """ if self.cols: cols = self.cols.copy() cols.remove(self.target) @@ -181,7 +205,9 @@ def __read_data__(self): tmp_stamp = df_raw[['date']][border1:border2] tmp_stamp['date'] = pd.to_datetime(tmp_stamp.date) - pred_dates = pd.date_range(tmp_stamp.date.values[-1], periods=self.pred_len + 1, freq=self.freq) + pred_dates = pd.date_range( + tmp_stamp.date.values[-1], periods=self.pred_len + 1, freq=self.freq + ) df_stamp = pd.DataFrame(columns=['date']) df_stamp.date = list(tmp_stamp.date.values) + list(pred_dates[1:]) @@ -194,7 +220,9 @@ def __read_data__(self): df_stamp['minute'] = df_stamp.minute.map(lambda x: x // 15) data_stamp = df_stamp.drop(['date'], 1).values elif self.timeenc == 1: - data_stamp = time_features(pd.to_datetime(df_stamp['date'].values), freq=self.freq) + data_stamp = time_features( + pd.to_datetime(df_stamp['date'].values), freq=self.freq + ) data_stamp = data_stamp.transpose(1, 0) self.data_x = data[border1:border2] @@ -212,9 +240,9 @@ def __getitem__(self, index): seq_x = self.data_x[s_begin:s_end] if self.inverse: - seq_y = self.data_x[r_begin:r_begin + self.label_len] + seq_y = self.data_x[r_begin : r_begin + self.label_len] else: - seq_y = self.data_y[r_begin:r_begin + self.label_len] + seq_y = self.data_y[r_begin : r_begin + self.label_len] seq_x_mark = self.data_stamp[s_begin:s_end] seq_y_mark = self.data_stamp[r_begin:r_end] diff --git a/benchmark/exp/exp_ForecastPFN.py b/benchmark/exp/exp_ForecastPFN.py index 2e17d7a..b880da7 100644 --- a/benchmark/exp/exp_ForecastPFN.py +++ b/benchmark/exp/exp_ForecastPFN.py @@ -1,25 +1,26 @@ +import datetime import os +import time import warnings + import numpy as np -import torch -import torch.nn as nn import pandas as pd -import datetime -import time -from data_provider.data_factory import data_provider +import tensorflow as tf +import torch +from sklearn.preprocessing import StandardScaler + from exp.exp_basic import Exp_Basic -from utils.metrics import metric from utils.metrics import smape -import tensorflow as tf 
-import tensorflow_io -from sklearn.preprocessing import StandardScaler, MinMaxScaler + gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: - try: - tf.config.experimental.set_virtual_device_configuration( - gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)]) - except RuntimeError as e: - print(e) + try: + tf.config.experimental.set_virtual_device_configuration( + gpus[0], + [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)], + ) + except RuntimeError as e: + print(e) warnings.filterwarnings('ignore') @@ -30,24 +31,25 @@ def __init__(self, args): super(Exp_ForecastPFN, self).__init__(args) def _build_model(self): - return + pass def train(self, setting): - return - + pass + def _ForecastPFN_time_features(self, ts: np.ndarray): if type(ts[0]) == datetime.datetime: year = [x.year for x in ts] month = [x.month for x in ts] day = [x.day for x in ts] - day_of_week = [x.weekday()+1 for x in ts] + day_of_week = [x.weekday() + 1 for x in ts] day_of_year = [x.timetuple().tm_yday for x in ts] return np.stack([year, month, day, day_of_week, day_of_year], axis=-1) ts = pd.to_datetime(ts) - return np.stack([ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) + return np.stack( + [ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) - def _process_tuple(self,x,x_mark,y_mark, - model, horizon): + def _process_tuple(self, x, x_mark, y_mark, model, horizon): """ x: tensor of shape (n, 1) x_mark: tensor of shape (n, d) @@ -74,7 +76,7 @@ def _process_tuple(self,x,x_mark,y_mark, history_std = np.nanstd(history[-6:]) # local scale, don't know why defined so - local_scale = (history_mean + history_std + 1e-4) + local_scale = history_mean + history_std + 1e-4 # change history based on local scale, to normalize it between 0 and 1 history = np.clip(history / local_scale, a_min=0, a_max=1) @@ -85,44 +87,60 @@ def _process_tuple(self,x,x_mark,y_mark, target = tf.convert_to_tensor(x_mark)[-100:, :] history = tf.convert_to_tensor(history)[-100:, :] else: - target = tf.pad(x_mark.cpu(), [[100-x.shape[0], 0], [0, 0]]) - history = tf.pad(history, [[100-x.shape[0], 0], [0, 0]]) + target = tf.pad(x_mark.cpu(), [[100 - x.shape[0], 0], [0, 0]]) + history = tf.pad(history, [[100 - x.shape[0], 0], [0, 0]]) - history = tf.repeat(tf.expand_dims(history, axis=0), [ - horizon], axis=0)[:, :, 0] - ts = tf.repeat(tf.expand_dims( - target, axis=0), [horizon], axis=0) + history = tf.repeat(tf.expand_dims(history, axis=0), [horizon], axis=0)[ + :, :, 0 + ] + ts = tf.repeat(tf.expand_dims(target, axis=0), [horizon], axis=0) else: - ts = tf.convert_to_tensor(x_mark.unsqueeze(0).repeat( - horizon, 1, 1), dtype=tf.int64) + ts = tf.convert_to_tensor( + x_mark.unsqueeze(0).repeat(horizon, 1, 1), dtype=tf.int64 + ) history = tf.convert_to_tensor(history, dtype=tf.float32) - task = tf.fill([horizon, ], 1) + task = tf.fill( + [ + horizon, + ], + 1, + ) target_ts = tf.convert_to_tensor( - y_mark.cpu()[-horizon:, :].unsqueeze(1), dtype=tf.int64) - - model_input = {'ts': ts, 'history': history, - 'target_ts': target_ts, 'task': task} + y_mark.cpu()[-horizon:, :].unsqueeze(1), dtype=tf.int64 + ) + + model_input = { + 'ts': ts, + 'history': history, + 'target_ts': target_ts, + 'task': task, + } t1 = time.time() pred_vals = model(model_input) time_diff = time.time() - t1 - scaled_vals = pred_vals['result'].numpy( - ).T.reshape(-1) * pred_vals['scale'].numpy().reshape(-1) + scaled_vals = pred_vals['result'].numpy().T.reshape(-1) * pred_vals[ + 
'scale' + ].numpy().reshape(-1) scaled_vals = scaler.inverse_transform([scaled_vals]) return scaled_vals, time_diff - - def _ForecastPFN_process_batch(self, model, batch_x, batch_y, batch_x_mark, batch_y_mark): + + def _ForecastPFN_process_batch( + self, model, batch_x, batch_y, batch_x_mark, batch_y_mark + ): preds = [] trues = [] - for idx, (x, y, x_mark, y_mark) in enumerate(zip(batch_x, batch_y, batch_x_mark, batch_y_mark)): - + for idx, (x, y, x_mark, y_mark) in enumerate( + zip(batch_x, batch_y, batch_x_mark, batch_y_mark) + ): pred, time_diff = self._process_tuple( - x, x_mark, y_mark, model, self.args.pred_len) + x, x_mark, y_mark, model, self.args.pred_len + ) - y = y[-self.args.pred_len:, :].to(self.device) + y = y[-self.args.pred_len :, :].to(self.device) true = y.detach().cpu().numpy() - + preds += [pred] trues += [true] return preds, trues, time_diff @@ -130,11 +148,13 @@ def _ForecastPFN_process_batch(self, model, batch_x, batch_y, batch_x_mark, batc def test(self, setting, test=0): test_data, test_loader = self._get_data(flag='test') test_data.data_stamp = self._ForecastPFN_time_features( - list(test_data.data_stamp_original['date'])) + list(test_data.data_stamp_original['date']) + ) if test: print('loading model') pretrained = tf.keras.models.load_model( - self.args.model_path, custom_objects={'smape': smape}) + self.args.model_path, custom_objects={'smape': smape} + ) preds = [] trues = [] @@ -145,7 +165,9 @@ def test(self, setting, test=0): self.test_timer.start_timer() timer = 0 with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + test_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float().to(self.device) @@ -153,9 +175,10 @@ def test(self, setting, test=0): batch_y_mark = batch_y_mark.float().to(self.device) pred, true, time = self._ForecastPFN_process_batch( - pretrained, batch_x, batch_y, batch_x_mark, batch_y_mark) + pretrained, batch_x, batch_y, batch_x_mark, batch_y_mark + ) timer += time - + preds.append(pred) trues.append(true) diff --git a/benchmark/exp/exp_arima.py b/benchmark/exp/exp_arima.py index d4e007b..03fdd97 100644 --- a/benchmark/exp/exp_arima.py +++ b/benchmark/exp/exp_arima.py @@ -1,15 +1,9 @@ -import os -import time import warnings -import numpy as np -import torch -import torch.nn as nn -from torch import optim -from data_provider.data_factory import data_provider -from exp.exp_basic import Exp_Basic -from utils.metrics import metric -import pmdarima + import pandas as pd +import pmdarima + +from exp.exp_basic import Exp_Basic warnings.filterwarnings('ignore') @@ -21,11 +15,14 @@ def __init__(self, args): def _build_model(self): return pmdarima.auto_arima - def train(self, setting): train_data, train_loader = self._get_data(flag='train') - train_df = pd.DataFrame({'y': train_data.data_y.T[0], 'ds': list( - pd.to_datetime(train_data.data_stamp_original['date']))}) + train_df = pd.DataFrame( + { + 'y': train_data.data_y.T[0], + 'ds': list(pd.to_datetime(train_data.data_stamp_original['date'])), + } + ) self.train_timer.start_timer() self.model = pmdarima.auto_arima(train_df.y.values) self.train_timer.end_timer() @@ -35,23 +32,29 @@ def test(self, setting, test=0): horizon = self.args.pred_len test_data, test_loader = self._get_data(flag='test') - test_df = pd.DataFrame({'y': test_data.data_y.T[0], 'ds': list( - pd.to_datetime(test_data.data_stamp_original['date']))}) - - cmp = pd.DataFrame({ - 'date': 
test_df['ds'].values, - 'y': test_df['y'].values, - 'yhat': self.model.predict(test_df.shape[0]) - }) + test_df = pd.DataFrame( + { + 'y': test_data.data_y.T[0], + 'ds': list(pd.to_datetime(test_data.data_stamp_original['date'])), + } + ) + + cmp = pd.DataFrame( + { + 'date': test_df['ds'].values, + 'y': test_df['y'].values, + 'yhat': self.model.predict(test_df.shape[0]), + } + ) preds, trues = [], [] self.test_timer.start_timer() - for i in range(self.args.seq_len, cmp.shape[0]-horizon+1): - pred = cmp[i:i+horizon]['yhat'].values - true = cmp[i:i+horizon]['y'].values + for i in range(self.args.seq_len, cmp.shape[0] - horizon + 1): + pred = cmp[i : i + horizon]['yhat'].values + true = cmp[i : i + horizon]['y'].values preds += [pred] trues += [true] self.test_timer.end_timer() - + return self._save_test_data(setting, preds, trues) diff --git a/benchmark/exp/exp_basic.py b/benchmark/exp/exp_basic.py index 81e526e..e9b2b09 100644 --- a/benchmark/exp/exp_basic.py +++ b/benchmark/exp/exp_basic.py @@ -1,9 +1,12 @@ import os -import torch + import numpy as np +import torch + from data_provider.data_factory import data_provider -from utils.tools import TimeBudget from utils.metrics import metric +from utils.tools import TimeBudget + class Exp_Basic(object): def __init__(self, args): @@ -20,8 +23,9 @@ def _build_model(self): def _acquire_device(self): if self.args.use_gpu: - os.environ["CUDA_VISIBLE_DEVICES"] = str( - self.args.gpu) if not self.args.use_multi_gpu else self.args.devices + os.environ['CUDA_VISIBLE_DEVICES'] = ( + str(self.args.gpu) if not self.args.use_multi_gpu else self.args.devices + ) device = torch.device('cuda:{}'.format(self.args.gpu)) print('Use GPU: cuda:{}'.format(self.args.gpu)) else: @@ -48,8 +52,8 @@ def _save_test_data(self, setting, preds, trues): mae, mse, rmse, mape, mspe = metric(preds, trues) print('mse:{}, mae:{}'.format(mse, mae)) - f = open("result.txt", 'a') - f.write(setting + " \n") + f = open('result.txt', 'a') + f.write(setting + ' \n') f.write('mse:{}, mae:{}'.format(mse, mae)) f.write('\n') f.write('\n') @@ -58,15 +62,15 @@ def _save_test_data(self, setting, preds, trues): output = { 'metrics': { 'mae': mae, - 'mse': mse, - 'rmse': rmse, - 'mape': mape, + 'mse': mse, + 'rmse': rmse, + 'mape': mape, 'mspe': mspe, }, 'train_timer': self.train_timer.total_time, 'vali_timer': self.vali_timer.total_time, 'test_timer': self.test_timer.total_time, - 'args': self.args + 'args': self.args, } print(output) diff --git a/benchmark/exp/exp_last.py b/benchmark/exp/exp_last.py index 8be4ec4..7d8311d 100644 --- a/benchmark/exp/exp_last.py +++ b/benchmark/exp/exp_last.py @@ -1,15 +1,9 @@ -import os -import time import warnings -import numpy as np + +import pmdarima import torch -import torch.nn as nn -from torch import optim -from data_provider.data_factory import data_provider + from exp.exp_basic import Exp_Basic -from utils.metrics import metric -import pmdarima -import pandas as pd warnings.filterwarnings('ignore') @@ -21,25 +15,30 @@ def __init__(self, args): def _build_model(self): return pmdarima.auto_arima - def train(self, setting): - return + pass def test(self, setting, test=0): - horizon = self.args.pred_len - test_data, test_loader = self._get_data(flag='test') preds, trues = [], [] self.test_timer.start_timer() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + test_loader + ): batch_x = batch_x.float().to(self.device) batch_y = 
batch_y.float().to(self.device) - true = batch_y[:, -self.args.pred_len:].detach().cpu().numpy() - pred = batch_x[:,-1,:].unsqueeze(1).repeat( - 1, true.shape[1], 1).detach().cpu().numpy() + true = batch_y[:, -self.args.pred_len :].detach().cpu().numpy() + pred = ( + batch_x[:, -1, :] + .unsqueeze(1) + .repeat(1, true.shape[1], 1) + .detach() + .cpu() + .numpy() + ) preds.append(pred) trues.append(true) diff --git a/benchmark/exp/exp_mean.py b/benchmark/exp/exp_mean.py index 326acd7..8546e13 100644 --- a/benchmark/exp/exp_mean.py +++ b/benchmark/exp/exp_mean.py @@ -1,15 +1,9 @@ -import os -import time import warnings -import numpy as np + +import pmdarima import torch -import torch.nn as nn -from torch import optim -from data_provider.data_factory import data_provider + from exp.exp_basic import Exp_Basic -from utils.metrics import metric -import pmdarima -import pandas as pd warnings.filterwarnings('ignore') @@ -21,25 +15,30 @@ def __init__(self, args): def _build_model(self): return pmdarima.auto_arima - def train(self, setting): - return + pass def test(self, setting, test=0): - horizon = self.args.pred_len - test_data, test_loader = self._get_data(flag='test') preds, trues = [], [] self.test_timer.start_timer() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + test_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float().to(self.device) - true = batch_y[:, -self.args.pred_len:].detach().cpu().numpy() - pred = batch_x.mean(1).unsqueeze(1).repeat( - 1, true.shape[1], 1).detach().cpu().numpy() + true = batch_y[:, -self.args.pred_len :].detach().cpu().numpy() + pred = ( + batch_x.mean(1) + .unsqueeze(1) + .repeat(1, true.shape[1], 1) + .detach() + .cpu() + .numpy() + ) preds.append(pred) trues.append(true) diff --git a/benchmark/exp/exp_metalearn.py b/benchmark/exp/exp_metalearn.py index 075056e..bb317e1 100644 --- a/benchmark/exp/exp_metalearn.py +++ b/benchmark/exp/exp_metalearn.py @@ -1,36 +1,30 @@ import os +import time import warnings -import numpy as np + +import tensorflow as tf import torch -import torch.nn as nn -import pandas as pd -import datetime -from data_provider.data_factory import data_provider + from exp.exp_basic import Exp_Basic -from utils.metrics import metric -from utils.metrics import smape -import tensorflow as tf -import tensorflow_io -import time -from sklearn.preprocessing import StandardScaler, MinMaxScaler + gpus = tf.config.experimental.list_physical_devices('GPU') if gpus: - try: - tf.config.experimental.set_virtual_device_configuration( - gpus[0], [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)]) - except RuntimeError as e: - print(e) + try: + tf.config.experimental.set_virtual_device_configuration( + gpus[0], + [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=5120)], + ) + except RuntimeError as e: + print(e) import sys + sys.path.append('metalearned') from metalearned.common.experiment import load_experiment_parameters -from metalearned.common.torch_utils import SnapshotManager, to_device, to_tensor, mase_loss, mape_loss, smape_2_loss +from metalearned.common.torch_utils import SnapshotManager from metalearned.models.nbeats_torch import nbeats_generic, nbeats_interpretable - - - warnings.filterwarnings('ignore') @@ -39,7 +33,6 @@ def __init__(self, args): super(Exp_Metalearn, self).__init__(args) def _build_model(self): - self.args.path = 
f'metalearned/experiments/tl/ForecastPFN/loss_name=MAPE,input_size={self.args.seq_len},horizon={self.args.pred_len}/' experiment_parameters = load_experiment_parameters(self.args.path) @@ -49,35 +42,39 @@ def _build_model(self): horizon = experiment_parameters['horizon'] if experiment_parameters['model_type'] == 'generic': - model = nbeats_generic(input_size=input_size, - output_size=horizon, - blocks=experiment_parameters['blocks'], - stacks=experiment_parameters['stacks'], - fc_layers=experiment_parameters['layers'], - fc_layers_size=experiment_parameters['width'], - scaling=experiment_parameters['scaling'], - mode=experiment_parameters['mode']) + model = nbeats_generic( + input_size=input_size, + output_size=horizon, + blocks=experiment_parameters['blocks'], + stacks=experiment_parameters['stacks'], + fc_layers=experiment_parameters['layers'], + fc_layers_size=experiment_parameters['width'], + scaling=experiment_parameters['scaling'], + mode=experiment_parameters['mode'], + ) else: - model = nbeats_interpretable(input_size=input_size, - output_size=horizon, - trend_blocks=experiment_parameters['trend_blocks'], - trend_fc_layers=experiment_parameters['layers'], - trend_fc_layers_size=experiment_parameters['trend_fc_layers_size'], - degree_of_polynomial=experiment_parameters['degree_of_polynomial'], - seasonality_blocks=experiment_parameters['seasonality_blocks'], - seasonality_fc_layers=experiment_parameters['layers'], - seasonality_fc_layers_size=experiment_parameters[ - 'seasonality_fc_layers_size'], - num_of_harmonics=experiment_parameters['num_of_harmonics'], - scaling=experiment_parameters['scaling'], - mode=experiment_parameters['mode']) + model = nbeats_interpretable( + input_size=input_size, + output_size=horizon, + trend_blocks=experiment_parameters['trend_blocks'], + trend_fc_layers=experiment_parameters['layers'], + trend_fc_layers_size=experiment_parameters['trend_fc_layers_size'], + degree_of_polynomial=experiment_parameters['degree_of_polynomial'], + seasonality_blocks=experiment_parameters['seasonality_blocks'], + seasonality_fc_layers=experiment_parameters['layers'], + seasonality_fc_layers_size=experiment_parameters[ + 'seasonality_fc_layers_size' + ], + num_of_harmonics=experiment_parameters['num_of_harmonics'], + scaling=experiment_parameters['scaling'], + mode=experiment_parameters['mode'], + ) return model.to(self.device) def train(self, setting): - return - - + pass + def test(self, setting, test=0): test_data, test_loader = self._get_data(flag='test') @@ -90,14 +87,15 @@ def test(self, setting, test=0): experiment_parameters = self.args.experiment_parameters snapshot_dir = os.path.join(path, 'snapshots', time_freq) - snapshot_manager = SnapshotManager(snapshot_dir=snapshot_dir, - logging_frequency=experiment_parameters['logging_frequency'], - snapshot_frequency=experiment_parameters['snapshot_frequency']) + snapshot_manager = SnapshotManager( + snapshot_dir=snapshot_dir, + logging_frequency=experiment_parameters['logging_frequency'], + snapshot_frequency=experiment_parameters['snapshot_frequency'], + ) self.model.load_state_dict(torch.load(snapshot_manager.model_snapshot_file)) self.model.to(self.device) - preds = [] trues = [] folder_path = './test_results/' + setting + '/' @@ -107,9 +105,11 @@ def test(self, setting, test=0): self.test_timer.start_timer() timer = 0 with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): - batch_x = batch_x.float().to(self.device)[:,:,0] - batch_y = 
batch_y.float().to(self.device)[:,:,0] + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + test_loader + ): + batch_x = batch_x.float().to(self.device)[:, :, 0] + batch_y = batch_y.float().to(self.device)[:, :, 0] print(batch_x.shape, batch_y.shape) @@ -118,10 +118,10 @@ def test(self, setting, test=0): t1 = time.time() pred = self.model(batch_x, torch.ones(batch_x.shape).to(self.device)) - timer += time.time()-t1 + timer += time.time() - t1 pred = pred.detach().cpu().numpy() - true = batch_y[:,-self.args.pred_len:].detach().cpu().numpy() + true = batch_y[:, -self.args.pred_len :].detach().cpu().numpy() preds.append(pred) trues.append(true) diff --git a/benchmark/exp/exp_prophet.py b/benchmark/exp/exp_prophet.py index a7e30fb..b0710b6 100644 --- a/benchmark/exp/exp_prophet.py +++ b/benchmark/exp/exp_prophet.py @@ -1,13 +1,9 @@ -import os -import time import warnings -import numpy as np -import torch -import torch.nn as nn -from torch import optim -from exp.exp_basic import Exp_Basic -import prophet + import pandas as pd +import prophet + +from exp.exp_basic import Exp_Basic warnings.filterwarnings('ignore') @@ -21,8 +17,12 @@ def _build_model(self): def train(self, setting): train_data, train_loader = self._get_data(flag='train') - train_df = pd.DataFrame({'y': train_data.data_y.T[0], 'ds': list( - pd.to_datetime(train_data.data_stamp_original['date']))}) + train_df = pd.DataFrame( + { + 'y': train_data.data_y.T[0], + 'ds': list(pd.to_datetime(train_data.data_stamp_original['date'])), + } + ) self.train_timer.start_timer() self.model.fit(train_df) self.train_timer.end_timer() @@ -32,24 +32,29 @@ def test(self, setting, test=0): horizon = self.args.pred_len test_data, test_loader = self._get_data(flag='test') - test_df = pd.DataFrame({'y': test_data.data_y.T[0], 'ds': list( - pd.to_datetime(test_data.data_stamp_original['date']))}) - predict_frame = self.model.make_future_dataframe( - test_data.data_x.shape[0]) + test_df = pd.DataFrame( + { + 'y': test_data.data_y.T[0], + 'ds': list(pd.to_datetime(test_data.data_stamp_original['date'])), + } + ) + self.model.make_future_dataframe(test_data.data_x.shape[0]) forecast = self.model.predict(test_df) - cmp = pd.DataFrame({ - 'date': test_df['ds'].values, - 'ds': forecast.ds.values, - 'y': test_df['y'].values, - 'yhat': forecast.yhat.values - }) + cmp = pd.DataFrame( + { + 'date': test_df['ds'].values, + 'ds': forecast.ds.values, + 'y': test_df['y'].values, + 'yhat': forecast.yhat.values, + } + ) preds, trues = [], [] self.test_timer.start_timer() - for i in range(self.args.seq_len, cmp.shape[0]-horizon+1): - pred = cmp[i:i+horizon]['yhat'].values - true = cmp[i:i+horizon]['y'].values + for i in range(self.args.seq_len, cmp.shape[0] - horizon + 1): + pred = cmp[i : i + horizon]['yhat'].values + true = cmp[i : i + horizon]['y'].values preds += [pred] trues += [true] diff --git a/benchmark/exp/exp_resolver.py b/benchmark/exp/exp_resolver.py index 6af7dba..e761189 100644 --- a/benchmark/exp/exp_resolver.py +++ b/benchmark/exp/exp_resolver.py @@ -1,14 +1,14 @@ import warnings -from exp.exp_transformer import Exp_Transformer -from exp.exp_transformer_metalearn import Exp_Transformer_Meta -from exp.exp_ForecastPFN import Exp_ForecastPFN + from exp.exp_arima import Exp_Arima -from exp.exp_prophet import Exp_Prophet -from exp.exp_metalearn import Exp_Metalearn -from exp.exp_mean import Exp_Mean +from exp.exp_ForecastPFN import Exp_ForecastPFN from exp.exp_last import Exp_Last +from exp.exp_mean import Exp_Mean +from exp.exp_metalearn 
import Exp_Metalearn +from exp.exp_prophet import Exp_Prophet from exp.exp_seasonalNaive import Exp_SeasonalNaive - +from exp.exp_transformer import Exp_Transformer +from exp.exp_transformer_metalearn import Exp_Transformer_Meta warnings.filterwarnings('ignore') diff --git a/benchmark/exp/exp_seasonalNaive.py b/benchmark/exp/exp_seasonalNaive.py index 8a06136..4668e9f 100644 --- a/benchmark/exp/exp_seasonalNaive.py +++ b/benchmark/exp/exp_seasonalNaive.py @@ -1,15 +1,9 @@ -import os -import time import warnings -import numpy as np + +import pmdarima import torch -import torch.nn as nn -from torch import optim -from data_provider.data_factory import data_provider + from exp.exp_basic import Exp_Basic -from utils.metrics import metric -import pmdarima -import pandas as pd warnings.filterwarnings('ignore') @@ -21,25 +15,29 @@ def __init__(self, args): def _build_model(self): return pmdarima.auto_arima - def train(self, setting): - return + pass def test(self, setting, test=0): - horizon = self.args.pred_len - test_data, test_loader = self._get_data(flag='test') preds, trues = [], [] self.test_timer.start_timer() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + test_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float().to(self.device) - true = batch_y[:, -self.args.pred_len:].detach().cpu().numpy() - pred = batch_x[:,-7:,:].repeat( - 1,int(true.shape[1]/7)+1,1)[:,:true.shape[1],:].detach().cpu().numpy() + true = batch_y[:, -self.args.pred_len :].detach().cpu().numpy() + pred = ( + batch_x[:, -7:, :] + .repeat(1, int(true.shape[1] / 7) + 1, 1)[:, : true.shape[1], :] + .detach() + .cpu() + .numpy() + ) preds.append(pred) trues.append(true) diff --git a/benchmark/exp/exp_transformer.py b/benchmark/exp/exp_transformer.py index 8281fb1..f53c411 100644 --- a/benchmark/exp/exp_transformer.py +++ b/benchmark/exp/exp_transformer.py @@ -1,16 +1,15 @@ import os import time import warnings + import numpy as np import torch import torch.nn as nn from torch import optim -from data_provider.data_factory import data_provider -from exp.exp_basic import Exp_Basic -from transformer_models.models import FEDformer, Autoformer, Informer, Transformer -from utils.tools import EarlyStopping, TimeBudget, adjust_learning_rate, visual -from utils.metrics import metric +from exp.exp_basic import Exp_Basic +from transformer_models.models import Autoformer, FEDformer, Informer, Transformer +from utils.tools import EarlyStopping, TimeBudget, adjust_learning_rate warnings.filterwarnings('ignore') @@ -38,8 +37,7 @@ def _build_model(self): return model.to(self.device) def _select_optimizer(self): - model_optim = optim.Adam( - self.model.parameters(), lr=self.args.learning_rate) + model_optim = optim.Adam(self.model.parameters(), lr=self.args.learning_rate) return model_optim def _select_criterion(self): @@ -51,7 +49,9 @@ def vali(self, vali_data, vali_loader, criterion): self.model.eval() self.vali_timer.start_timer() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(vali_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + vali_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float() @@ -59,29 +59,34 @@ def vali(self, vali_data, vali_loader, criterion): batch_y_mark = batch_y_mark.float().to(self.device) # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, 
:]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to(self.device) pred = outputs.detach().cpu() true = batch_y.detach().cpu() @@ -106,11 +111,10 @@ def train(self, setting): else: raise NotImplementedError - time_now = time.time() + time.time() train_steps = len(train_loader) - early_stopping = EarlyStopping( - patience=self.args.patience, verbose=False) + early_stopping = EarlyStopping(patience=self.args.patience, verbose=False) model_optim = self._select_optimizer() criterion = self._select_criterion() @@ -126,7 +130,9 @@ def train(self, setting): self.model.train() epoch_time = time.time() - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(train_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + train_loader + ): iter_count += 1 model_optim.zero_grad() batch_x = batch_x.float().to(self.device) @@ -136,37 +142,43 @@ def train(self, setting): batch_y_mark = batch_y_mark.float().to(self.device) # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to( + self.device + ) loss = criterion(outputs, batch_y) train_loss.append(loss.item()) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to(self.device) loss = criterion(outputs, 
batch_y) train_loss.append(loss.item()) @@ -190,17 +202,19 @@ def train(self, setting): return self.model - print("Epoch: {} cost time: {}".format( - epoch + 1, time.time() - epoch_time)) + print('Epoch: {} cost time: {}'.format(epoch + 1, time.time() - epoch_time)) train_loss = np.average(train_loss) vali_loss = self.vali(vali_data, vali_loader, criterion) test_loss = self.vali(test_data, test_loader, criterion) - print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format( - epoch + 1, train_steps, train_loss, vali_loss, test_loss)) + print( + 'Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}'.format( + epoch + 1, train_steps, train_loss, vali_loss, test_loss + ) + ) early_stopping(vali_loss, self.model, path) if early_stopping.early_stop: - print("Early stopping") + print('Early stopping') break adjust_learning_rate(model_optim, epoch + 1, self.args) @@ -217,11 +231,18 @@ def test(self, setting, test=0): if test: print('loading model') if self.args.use_gpu: - self.model.load_state_dict(torch.load(os.path.join( - './checkpoints/' + setting, 'checkpoint.pth'))) + self.model.load_state_dict( + torch.load( + os.path.join('./checkpoints/' + setting, 'checkpoint.pth') + ) + ) else: - self.model.load_state_dict(torch.load(os.path.join( - './checkpoints/' + setting, 'checkpoint.pth'), map_location=torch.device('cpu'))) + self.model.load_state_dict( + torch.load( + os.path.join('./checkpoints/' + setting, 'checkpoint.pth'), + map_location=torch.device('cpu'), + ) + ) preds = [] trues = [] @@ -230,10 +251,11 @@ def test(self, setting, test=0): os.makedirs(folder_path) self.model.eval() - j = 0 self.test_timer.start_timer() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(test_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + test_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float().to(self.device) @@ -241,32 +263,37 @@ def test(self, setting, test=0): batch_y_mark = batch_y_mark.float().to(self.device) # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to(self.device) outputs = outputs.detach().cpu().numpy() batch_y = batch_y.detach().cpu().numpy() @@ -300,33 +327,41 @@ def predict(self, setting, load=False): self.model.eval() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, 
batch_y_mark) in enumerate(pred_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + pred_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float() batch_x_mark = batch_x_mark.float().to(self.device) batch_y_mark = batch_y_mark.float().to(self.device) # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) pred = outputs.detach().cpu().numpy() # .squeeze() preds.append(pred) diff --git a/benchmark/exp/exp_transformer_metalearn.py b/benchmark/exp/exp_transformer_metalearn.py index 6081d12..1a62e89 100644 --- a/benchmark/exp/exp_transformer_metalearn.py +++ b/benchmark/exp/exp_transformer_metalearn.py @@ -1,33 +1,32 @@ +import sys import time + import yaml -import sys + sys.path.append('/home/ubuntu/ForecastPFN/academic_comparison/') import os -import time import warnings +from typing import Dict + import numpy as np import pandas as pd import torch import torch.nn as nn -from tqdm import tqdm from torch import optim -from typing import Dict +from tqdm import tqdm + from data_provider.data_factory import data_provider from exp.exp_basic import Exp_Basic from exp.torch_utils import * -from transformer_models.models import FEDformer, Autoformer, Informer, Transformer -from utils.tools import EarlyStopping, TimeBudget, adjust_learning_rate, visual -from utils.metrics import metric +from transformer_models.models import Autoformer, FEDformer, Informer, Transformer +from utils.tools import EarlyStopping, TimeBudget, adjust_learning_rate sys.path.append('/home/ubuntu/ForecastPFN/src/') sys.path.append('/home/ubuntu/ForecastPFN/src/training/') -from training.create_train_test_df import create_train_test_df import tensorflow as tf - - from training.config_variables import Config -from training.constants import PADDING, HISTORY_LEN, TARGET_LEN, TRIM_LEN, TARGET_INDEX +from training.constants import HISTORY_LEN, PADDING from training.prepare_dataset import filter_unusable_points from training.utils import load_tf_dataset @@ -61,12 +60,9 @@ def _build_model(self): return model.to(self.device) def _get_data(self, flag): - - TARGET_LEN = self.args.label_len + self.args.pred_len TRIM_LEN = self.args.label_len + self.args.pred_len - TARGET_INDEX = 2*TRIM_LEN - + TARGET_INDEX = 2 * TRIM_LEN def compute_time_features(ts: np.ndarray): """ @@ -76,24 +72,26 @@ def compute_time_features(ts: np.ndarray): """ ts = pd.to_datetime(ts) if Config.is_sub_day: - return np.stack([ts.minute, ts.hour, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) + return np.stack( + [ts.minute, ts.hour, ts.day, 
ts.day_of_week + 1, ts.day_of_year], + axis=-1, + ) return np.stack([ts.month, ts.day, ts.day_of_week, ts.hour], axis=-1) - def build_frames(r: Dict[str, tf.Tensor]): raw_date_info = tf.numpy_function( - compute_time_features, inp=[r['ts']], Tout=tf.int64) + compute_time_features, inp=[r['ts']], Tout=tf.int64 + ) date_info = tf.signal.frame( - tf.pad(raw_date_info, [[PADDING, 0], [0, 0]]), - HISTORY_LEN, - 1, - axis=0 + tf.pad(raw_date_info, [[PADDING, 0], [0, 0]]), HISTORY_LEN, 1, axis=0 ) history = tf.signal.frame( - tf.pad(r['y'], [[PADDING, 0]]), HISTORY_LEN, 1, axis=-1) + tf.pad(r['y'], [[PADDING, 0]]), HISTORY_LEN, 1, axis=-1 + ) noise = tf.signal.frame( - tf.pad(r['noise'], [[PADDING, 0]]), HISTORY_LEN, 1, axis=-1) + tf.pad(r['noise'], [[PADDING, 0]]), HISTORY_LEN, 1, axis=-1 + ) target_dates = tf.signal.frame(raw_date_info, TARGET_LEN, 1, axis=0) target_values = tf.signal.frame(r['y'], TARGET_LEN, 1, axis=-1) @@ -107,7 +105,7 @@ def build_frames(r: Dict[str, tf.Tensor]): noise[-start_index:-TARGET_LEN], target_dates[TARGET_INDEX:], target_values[TARGET_INDEX:], - target_noise[TARGET_INDEX:] + target_noise[TARGET_INDEX:], ) @tf.function @@ -117,15 +115,14 @@ def gen_random_single_point( noise: tf.Tensor, target_dates: tf.Tensor, target_values: tf.Tensor, - target_noise: tf.Tensor + target_noise: tf.Tensor, ): - return dict( ts=date_info, - history=history*noise, + history=history * noise, noise=noise, target_ts=target_dates, - target_noise=target_noise + target_noise=target_noise, ), target_values @tf.function @@ -135,25 +132,24 @@ def gen_random_single_point_no_noise( noise: tf.Tensor, target_dates: tf.Tensor, target_values: tf.Tensor, - target_noise: tf.Tensor + target_noise: tf.Tensor, ): - return dict( ts=date_info, history=history, noise=noise, target_ts=target_dates, - target_noise=target_noise + target_noise=target_noise, ), target_values - def remove_noise(x, y): return ( { 'ts': x['ts'], 'history': x['history'], 'target_ts': x['target_ts'], - }, y + }, + y, ) def create_train_test_df(combined_ds, test_noise=False): @@ -166,9 +162,14 @@ def create_train_test_df(combined_ds, test_noise=False): base_train_df.map(func, num_parallel_calls=tf.data.AUTOTUNE) for func in task_map.values() ] - train_df = tf.data.Dataset.choose_from_datasets( - train_tasks_dfs, tf.data.Dataset.range(len(train_tasks_dfs)).repeat() - ).unbatch().filter(filter_unusable_points) + train_df = ( + tf.data.Dataset.choose_from_datasets( + train_tasks_dfs, + tf.data.Dataset.range(len(train_tasks_dfs)).repeat(), + ) + .unbatch() + .filter(filter_unusable_points) + ) task_map_test = { 'point': gen_random_single_point_no_noise, @@ -185,21 +186,24 @@ def create_train_test_df(combined_ds, test_noise=False): for func in task_map_test.values() ] - test_df = tf.data.Dataset.choose_from_datasets( - test_tasks_dfs, tf.data.Dataset.range(len(test_tasks_dfs)).repeat() - ).unbatch().filter(filter_unusable_points) + test_df = ( + tf.data.Dataset.choose_from_datasets( + test_tasks_dfs, tf.data.Dataset.range(len(test_tasks_dfs)).repeat() + ) + .unbatch() + .filter(filter_unusable_points) + ) test_df = test_df.map(remove_noise) return train_df, test_df - def get_combined_ds(config): - version = config["version"] + version = config['version'] datasets = [ # load_tf_dataset(config["prefix"] + f"{version}/minute.tfrecords"), # load_tf_dataset(config["prefix"] + f"{version}/hourly.tfrecords"), - load_tf_dataset(config["prefix"] + f"{version}/daily.tfrecords"), + load_tf_dataset(config['prefix'] + f'{version}/daily.tfrecords'), # 
load_tf_dataset(config["prefix"] + f"{version}/weekly.tfrecords"), # load_tf_dataset(config["prefix"] + f"{version}/monthly.tfrecords"), ] @@ -209,36 +213,36 @@ def get_combined_ds(config): return combined_ds - - if flag == 'test': data_set, data_loader = data_provider(self.args, flag) elif flag == 'train': - with open('/home/ubuntu/ForecastPFN/src/training/config_mf_replicate_testnoiseF.yaml') as config_file: + with open( + '/home/ubuntu/ForecastPFN/src/training/config_mf_replicate_testnoiseF.yaml' + ) as config_file: config = yaml.load(config_file, yaml.loader.SafeLoader) combined_ds = get_combined_ds(config) - train_df, vali_df = create_train_test_df( - combined_ds, config["test_noise"]) + train_df, vali_df = create_train_test_df(combined_ds, config['test_noise']) data_loader = TFRecordDataLoader( - train_df, self.args.batch_size, True, 10_000) + train_df, self.args.batch_size, True, 10_000 + ) data_set = None elif flag == 'val': - with open('/home/ubuntu/ForecastPFN/src/training/config_mf_replicate_testnoiseF.yaml') as config_file: + with open( + '/home/ubuntu/ForecastPFN/src/training/config_mf_replicate_testnoiseF.yaml' + ) as config_file: config = yaml.load(config_file, yaml.loader.SafeLoader) combined_ds = get_combined_ds(config) - train_df, vali_df = create_train_test_df( - combined_ds, config["test_noise"]) + train_df, vali_df = create_train_test_df(combined_ds, config['test_noise']) data_set = None data_loader = TFRecordDataLoader( - vali_df, self.args.batch_size, True, 10_000) + vali_df, self.args.batch_size, True, 10_000 + ) return data_set, data_loader - def _select_optimizer(self): - model_optim = optim.Adam( - self.model.parameters(), lr=self.args.learning_rate) + model_optim = optim.Adam(self.model.parameters(), lr=self.args.learning_rate) return model_optim def _select_criterion(self): @@ -255,37 +259,41 @@ def vali(self, vali_data, vali_loader, criterion): X_batch = numpy_to_torch(batch_data[0], self.device) y_batch = torch.from_numpy(batch_data[1]).to(self.device) - batch_x = X_batch['history'].float().to( - self.device).unsqueeze(2) + batch_x = X_batch['history'].float().to(self.device).unsqueeze(2) batch_y = y_batch.float().to(self.device).unsqueeze(2) batch_x_mark = X_batch['ts'].float().to(self.device) batch_y_mark = X_batch['target_ts'].float().to(self.device) # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to(self.device) 
pred = outputs.detach().cpu() true = batch_y.detach().cpu() @@ -304,7 +312,9 @@ def test(self, vali_data, vali_loader, criterion): self.model.eval() self.vali_timer.start_timer() with torch.no_grad(): - for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate(vali_loader): + for i, (batch_x, batch_y, batch_x_mark, batch_y_mark) in enumerate( + vali_loader + ): batch_x = batch_x.float().to(self.device) batch_y = batch_y.float() @@ -312,29 +322,34 @@ def test(self, vali_data, vali_loader, criterion): batch_y_mark = batch_y_mark.float().to(self.device) # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to(self.device) pred = outputs.detach().cpu() true = batch_y.detach().cpu() @@ -346,7 +361,7 @@ def test(self, vali_data, vali_loader, criterion): self.model.train() self.vali_timer.end_timer() return total_loss - + def train(self, setting): print(setting) @@ -360,11 +375,10 @@ def train(self, setting): else: raise NotImplementedError - time_now = time.time() + time.time() train_steps = -1 - early_stopping = EarlyStopping( - patience=self.args.patience, verbose=False) + early_stopping = EarlyStopping(patience=self.args.patience, verbose=False) model_optim = self._select_optimizer() criterion = self._select_criterion() @@ -387,7 +401,7 @@ def train(self, setting): batch_x = X_batch['history'].float().to(self.device).unsqueeze(2) batch_y = y_batch.float().to(self.device).unsqueeze(2) - + batch_x_mark = X_batch['ts'].float().to(self.device) batch_y_mark = X_batch['target_ts'].float().to(self.device) @@ -395,37 +409,43 @@ def train(self, setting): model_optim.zero_grad() # decoder input - dec_inp = torch.zeros_like( - batch_y[:, -self.args.pred_len:, :]).float() - dec_inp = torch.cat( - [batch_y[:, :self.args.label_len, :], dec_inp], dim=1).float().to(self.device) + dec_inp = torch.zeros_like(batch_y[:, -self.args.pred_len :, :]).float() + dec_inp = ( + torch.cat([batch_y[:, : self.args.label_len, :], dec_inp], dim=1) + .float() + .to(self.device) + ) # encoder - decoder if self.args.use_amp: with torch.cuda.amp.autocast(): if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 
- batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to( + self.device + ) loss = criterion(outputs, batch_y) train_loss.append(loss.item()) else: if self.args.output_attention: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark)[0] + batch_x, batch_x_mark, dec_inp, batch_y_mark + )[0] else: outputs = self.model( - batch_x, batch_x_mark, dec_inp, batch_y_mark) + batch_x, batch_x_mark, dec_inp, batch_y_mark + ) f_dim = -1 if self.args.features == 'MS' else 0 - batch_y = batch_y[:, -self.args.pred_len:, - f_dim:].to(self.device) + batch_y = batch_y[:, -self.args.pred_len :, f_dim:].to(self.device) loss = criterion(outputs, batch_y) train_loss.append(loss.item()) @@ -448,18 +468,20 @@ def train(self, setting): self.model.load_state_dict(torch.load(best_model_path)) return self.model - + if batch_i >= 1_000: break - print("Epoch: {} cost time: {}".format( - epoch + 1, time.time() - epoch_time)) + print('Epoch: {} cost time: {}'.format(epoch + 1, time.time() - epoch_time)) train_loss = np.average(train_loss) vali_loss = self.vali(vali_data, vali_loader, criterion) test_loss = self.test(test_data, test_loader, criterion) - print("Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}".format( - epoch + 1, train_steps, train_loss, vali_loss, test_loss)) + print( + 'Epoch: {0}, Steps: {1} | Train Loss: {2:.7f} Vali Loss: {3:.7f} Test Loss: {4:.7f}'.format( + epoch + 1, train_steps, train_loss, vali_loss, test_loss + ) + ) early_stopping(vali_loss, self.model, path) # if early_stopping.early_stop: # print("Early stopping") @@ -473,4 +495,3 @@ def train(self, setting): self.model.load_state_dict(torch.load(best_model_path)) return self.model - diff --git a/benchmark/exp/torch_utils.py b/benchmark/exp/torch_utils.py index ca30a23..aae5e6a 100644 --- a/benchmark/exp/torch_utils.py +++ b/benchmark/exp/torch_utils.py @@ -1,18 +1,21 @@ -import torch import os from collections import OrderedDict from functools import partial + +import tensorflow as tf +import tensorflow_datasets as tfds +import torch from torch.nn import MSELoss from torch.optim import Adam -import tensorflow_datasets as tfds -import tensorflow as tf from tqdm import tqdm DEFAULT_LOSS = MSELoss() DEFAULT_OPTIMIZER = partial(Adam, lr=0.001) -load_dir = 'tensorboard/mf_replicate_testnoiseT_shuffle5Millilon.20230714-133237/models/51' +load_dir = ( + 'tensorboard/mf_replicate_testnoiseT_shuffle5Millilon.20230714-133237/models/51' +) def numpy_to_torch(X, device): @@ -63,12 +66,16 @@ def __next__(self): class AdditionalValidationSets: - def __init__(self, validation_sets, batch_size=1, metrics=[], loss=DEFAULT_LOSS, device=None): + def __init__( + self, validation_sets, batch_size=1, metrics=[], loss=DEFAULT_LOSS, device=None + ): self.validation_sets = [] for validation_set in validation_sets: if len(validation_set) not in [2]: raise ValueError() - self.validation_sets.append([tfds.as_numpy(validation_set[0]), validation_set[1]]) + self.validation_sets.append( + [tfds.as_numpy(validation_set[0]), validation_set[1]] + ) self.epoch = [] self.metrics = metrics self.loss = loss @@ -88,7 +95,16 @@ def on_epoch_end(self, model, epoch, tbCallback=None): validation_data, validation_set_name = validation_set else: raise ValueError() - results = add_metrics_to_log(model, validation_data, self.loss, self.metrics, tbCallback, f'add_valid/{validation_set_name}/', epoch, self.device) + results = add_metrics_to_log( + model, + 
validation_data, + self.loss, + self.metrics, + tbCallback, + f'add_valid/{validation_set_name}/', + epoch, + self.device, + ) log.update(results) self.logs[epoch] = log return log @@ -106,13 +122,17 @@ def predict(model, data, device, steps_per_epoch=None): y_batch_pred = model(X_batch) y_batch_pred, y_batch = model.transform_output(y_batch_pred, y_batch) y_true = y_batch if y_true is None else torch.concat([y_true, y_batch]) - y_pred = y_batch_pred if y_pred is None else torch.concat([y_pred, y_batch_pred]) + y_pred = ( + y_batch_pred if y_pred is None else torch.concat([y_pred, y_batch_pred]) + ) if steps_per_epoch is not None and batch_i >= steps_per_epoch: break return y_true, y_pred -def add_metrics_to_log(model, data, loss, metrics, writer, prefix, epoch, device, steps_per_epoch=None): +def add_metrics_to_log( + model, data, loss, metrics, writer, prefix, epoch, device, steps_per_epoch=None +): with torch.no_grad(): y_true, y_pred = predict(model, data, device, steps_per_epoch) y_true = y_true.reshape(-1) @@ -130,23 +150,25 @@ def add_metrics_to_log(model, data, loss, metrics, writer, prefix, epoch, device return log -def fit(model, - train_df, - batch_size=1024, - epochs=1, - verbose=1, - valid_df=None, - shuffle=0, - initial_epoch=0, - seed=None, - loss=DEFAULT_LOSS, - optimizer=DEFAULT_OPTIMIZER, - metrics=None, - writer=None, - device='cpu', - steps_per_epoch=None, - logdir=None, - additional_validation_sets=[]): +def fit( + model, + train_df, + batch_size=1024, + epochs=1, + verbose=1, + valid_df=None, + shuffle=0, + initial_epoch=0, + seed=None, + loss=DEFAULT_LOSS, + optimizer=DEFAULT_OPTIMIZER, + metrics=None, + writer=None, + device='cpu', + steps_per_epoch=None, + logdir=None, + additional_validation_sets=[], +): """Trains the model similar to Keras' .fit(...) 
method # Arguments @@ -183,7 +205,9 @@ def fit(model, # Build DataLoaders valid_data = TFRecordDataLoader(valid_df, batch_size) - additional_valid_data = AdditionalValidationSets(additional_validation_sets, metrics=metrics, loss=loss, device=device) + additional_valid_data = AdditionalValidationSets( + additional_validation_sets, metrics=metrics, loss=loss, device=device + ) # Compile optimizer opt = optimizer(model.parameters()) # load = torch.load(load_dir) @@ -192,11 +216,11 @@ def fit(model, # Run training loop logs = [] for t in tqdm(range(initial_epoch, epochs)): - logfile.write(f"Epoch: {t+1}\n") + logfile.write(f'Epoch: {t+1}\n') train_data = TFRecordDataLoader(train_df, batch_size, True, shuffle) model.train() if verbose and t % 10 == 0: - print("Epoch {0} / {1}".format(t + 1, epochs)) + print('Epoch {0} / {1}'.format(t + 1, epochs)) log = OrderedDict() epoch_loss = 0.0 # Run batches @@ -222,18 +246,27 @@ def fit(model, # train_metric_log = add_metrics_to_log(model, train_data, loss, metrics, writer, prefix='train/metrics/', epoch=t, device=device, steps_per_epoch=steps_per_epoch) # log.update(train_metric_log) if valid_data is not None: - val_metric_log = add_metrics_to_log(model, valid_data, loss, metrics, writer, prefix='valid/metrics/', epoch=t, device=device) + val_metric_log = add_metrics_to_log( + model, + valid_data, + loss, + metrics, + writer, + prefix='valid/metrics/', + epoch=t, + device=device, + ) log.update(val_metric_log) # Additional validation set if t % 10 == 0: add_log = additional_valid_data.on_epoch_end(model, t, writer) - logfile.write(str(add_log)+'\n') + logfile.write(str(add_log) + '\n') to_save = { - "model": model.state_dict(), - "optimizer": opt.state_dict(), + 'model': model.state_dict(), + 'optimizer': opt.state_dict(), } torch.save(to_save, logdir + f'/models/{t+1}') - logfile.write(str(log)+'\n') + logfile.write(str(log) + '\n') logfile.flush() logs.append(log) diff --git a/benchmark/layers/AutoCorrelation.py b/benchmark/layers/AutoCorrelation.py index 2fda13a..6fb6ec6 100644 --- a/benchmark/layers/AutoCorrelation.py +++ b/benchmark/layers/AutoCorrelation.py @@ -1,8 +1,8 @@ +import math import time + import torch import torch.nn as nn -import numpy as np -import math from torch.nn.functional import interpolate @@ -13,6 +13,7 @@ def func2(*args, **kw): t = time.time() - now print('call <{}>, time={}'.format(func.__name__, t)) return y + return func2 @@ -23,7 +24,16 @@ class AutoCorrelation(nn.Module): (2) time delay aggregation This block can replace the self-attention family mechanism seamlessly. 
""" - def __init__(self, mask_flag=True, factor=1, scale=None, attention_dropout=0.1, output_attention=False, configs=None): + + def __init__( + self, + mask_flag=True, + factor=1, + scale=None, + attention_dropout=0.1, + output_attention=False, + configs=None, + ): super(AutoCorrelation, self).__init__() print('Autocorrelation used !') self.factor = factor @@ -55,8 +65,13 @@ def time_delay_agg_training(self, values, corr): delays_agg = torch.zeros_like(values).float() for i in range(top_k): pattern = torch.roll(tmp_values, -int(index[i]), -1) - delays_agg = delays_agg + pattern * \ - (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length)) + delays_agg = delays_agg + pattern * ( + tmp_corr[:, i] + .unsqueeze(1) + .unsqueeze(1) + .unsqueeze(1) + .repeat(1, head, channel, length) + ) return delays_agg # size=[B, H, d, S] def time_delay_agg_inference(self, values, corr): @@ -69,7 +84,14 @@ def time_delay_agg_inference(self, values, corr): channel = values.shape[2] length = values.shape[3] # index init - init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(batch, head, channel, 1).cuda() + init_index = ( + torch.arange(length) + .unsqueeze(0) + .unsqueeze(0) + .unsqueeze(0) + .repeat(batch, head, channel, 1) + .cuda() + ) # find top k top_k = int(self.factor * math.log(length)) mean_value = torch.mean(torch.mean(corr, dim=1), dim=1) @@ -81,10 +103,17 @@ def time_delay_agg_inference(self, values, corr): tmp_values = values.repeat(1, 1, 1, 2) delays_agg = torch.zeros_like(values).float() for i in range(top_k): - tmp_delay = init_index + delay[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length) + tmp_delay = init_index + delay[:, i].unsqueeze(1).unsqueeze(1).unsqueeze( + 1 + ).repeat(1, head, channel, length) pattern = torch.gather(tmp_values, dim=-1, index=tmp_delay) - delays_agg = delays_agg + pattern * \ - (tmp_corr[:, i].unsqueeze(1).unsqueeze(1).unsqueeze(1).repeat(1, head, channel, length)) + delays_agg = delays_agg + pattern * ( + tmp_corr[:, i] + .unsqueeze(1) + .unsqueeze(1) + .unsqueeze(1) + .repeat(1, head, channel, length) + ) return delays_agg def time_delay_agg_full(self, values, corr): @@ -96,7 +125,14 @@ def time_delay_agg_full(self, values, corr): channel = values.shape[2] length = values.shape[3] # index init - init_index = torch.arange(length).unsqueeze(0).unsqueeze(0).unsqueeze(0).repeat(batch, head, channel, 1).cuda() + init_index = ( + torch.arange(length) + .unsqueeze(0) + .unsqueeze(0) + .unsqueeze(0) + .repeat(batch, head, channel, 1) + .cuda() + ) # find top k top_k = int(self.factor * math.log(length)) weights = torch.topk(corr, top_k, dim=-1)[0] @@ -116,7 +152,7 @@ def forward(self, queries, keys, values, attn_mask): B, L, H, E = queries.shape _, S, _, D = values.shape if L > S: - zeros = torch.zeros_like(queries[:, :(L - S), :]).float() + zeros = torch.zeros_like(queries[:, : (L - S), :]).float() values = torch.cat([values, zeros], dim=1) keys = torch.cat([keys, zeros], dim=1) else: @@ -138,20 +174,34 @@ def forward(self, queries, keys, values, attn_mask): for q, k, j in zip(qs, ks, j_list): q_list += [interpolate(q, scale_factor=j, mode='linear')[:, :, -L:]] k_list += [interpolate(k, scale_factor=j, mode='linear')[:, :, -L:]] - queries = torch.stack([i.reshape([B, H, E, L]) for i in q_list], dim=3).reshape([B, H, -1, L]).permute(0, 3, 1, 2) - keys = torch.stack([i.reshape([B, H, E, L]) for i in k_list], dim=3).reshape([B, H, -1, L]).permute(0, 3, 1, 2) + queries = ( + 
torch.stack([i.reshape([B, H, E, L]) for i in q_list], dim=3) + .reshape([B, H, -1, L]) + .permute(0, 3, 1, 2) + ) + keys = ( + torch.stack([i.reshape([B, H, E, L]) for i in k_list], dim=3) + .reshape([B, H, -1, L]) + .permute(0, 3, 1, 2) + ) else: pass - q_fft = torch.fft.rfft(queries.permute(0, 2, 3, 1).contiguous(), dim=-1) # size=[B, H, E, L] + q_fft = torch.fft.rfft( + queries.permute(0, 2, 3, 1).contiguous(), dim=-1 + ) # size=[B, H, E, L] k_fft = torch.fft.rfft(keys.permute(0, 2, 3, 1).contiguous(), dim=-1) res = q_fft * torch.conj(k_fft) - corr = torch.fft.irfft(res, dim=-1) # size=[B, H, E, L] + corr = torch.fft.irfft(res, dim=-1) # size=[B, H, E, L] # time delay agg if self.training: - V = self.time_delay_agg_training(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2) # [B, L, H, E], [B, H, E, L] -> [B, L, H, E] + V = self.time_delay_agg_training( + values.permute(0, 2, 3, 1).contiguous(), corr + ).permute(0, 3, 1, 2) # [B, L, H, E], [B, H, E, L] -> [B, L, H, E] else: - V = self.time_delay_agg_inference(values.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2) + V = self.time_delay_agg_inference( + values.permute(0, 2, 3, 1).contiguous(), corr + ).permute(0, 3, 1, 2) else: V_list = [] queries = queries.reshape([B, L, -1]) @@ -172,12 +222,16 @@ def forward(self, queries, keys, values, attn_mask): res = q_fft * torch.conj(k_fft) corr = torch.fft.irfft(res, dim=-1) # [B, H, E, L] if self.training: - V = self.time_delay_agg_training(v.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2) + V = self.time_delay_agg_training( + v.permute(0, 2, 3, 1).contiguous(), corr + ).permute(0, 3, 1, 2) else: - V = self.time_delay_agg_inference(v.permute(0, 2, 3, 1).contiguous(), corr).permute(0, 3, 1, 2) + V = self.time_delay_agg_inference( + v.permute(0, 2, 3, 1).contiguous(), corr + ).permute(0, 3, 1, 2) V_list += [V] - Vl = V_list[-1].reshape([B, -1, H*E]).transpose(1, 2) - Vh_list = [i.reshape([B, -1, H*E]).transpose(1, 2) for i in V_list[:-1]] + Vl = V_list[-1].reshape([B, -1, H * E]).transpose(1, 2) + Vh_list = [i.reshape([B, -1, H * E]).transpose(1, 2) for i in V_list[:-1]] V = self.dwt1div((Vl, Vh_list)).reshape([B, H, E, -1]).permute(0, 3, 1, 2) # corr = self.dwt1div((V_list[-1], V_list[:-1])) @@ -188,8 +242,7 @@ def forward(self, queries, keys, values, attn_mask): class AutoCorrelationLayer(nn.Module): - def __init__(self, correlation, d_model, n_heads, d_keys=None, - d_values=None): + def __init__(self, correlation, d_model, n_heads, d_keys=None, d_values=None): super(AutoCorrelationLayer, self).__init__() d_keys = d_keys or (d_model // n_heads) @@ -211,12 +264,7 @@ def forward(self, queries, keys, values, attn_mask): keys = self.key_projection(keys).view(B, S, H, -1) values = self.value_projection(values).view(B, S, H, -1) - out, attn = self.inner_correlation( - queries, - keys, - values, - attn_mask - ) + out, attn = self.inner_correlation(queries, keys, values, attn_mask) out = out.view(B, L, -1) - return self.out_projection(out), attn \ No newline at end of file + return self.out_projection(out), attn diff --git a/benchmark/layers/Autoformer_EncDec.py b/benchmark/layers/Autoformer_EncDec.py index 5bcae4e..d97c916 100644 --- a/benchmark/layers/Autoformer_EncDec.py +++ b/benchmark/layers/Autoformer_EncDec.py @@ -1,14 +1,15 @@ +import math + import torch import torch.nn as nn import torch.nn.functional as F -import math -from layers.SelfAttention_Family import FullAttention class my_Layernorm(nn.Module): """ Special designed layernorm for the seasonal 
part """ + def __init__(self, channels): super(my_Layernorm, self).__init__() self.layernorm = nn.LayerNorm(channels) @@ -23,6 +24,7 @@ class moving_avg(nn.Module): """ Moving average block to highlight the trend of time series """ + def __init__(self, kernel_size, stride): super(moving_avg, self).__init__() if type(kernel_size) == list: @@ -32,7 +34,9 @@ def __init__(self, kernel_size, stride): def forward(self, x): # padding on the both ends of time series - front = x[:, 0:1, :].repeat(1, self.kernel_size - 1-math.floor((self.kernel_size - 1) // 2), 1) + front = x[:, 0:1, :].repeat( + 1, self.kernel_size - 1 - math.floor((self.kernel_size - 1) // 2), 1 + ) end = x[:, -1:, :].repeat(1, math.floor((self.kernel_size - 1) // 2), 1) x = torch.cat([front, x, end], dim=1) x = self.avg(x.permute(0, 2, 1)) @@ -44,6 +48,7 @@ class series_decomp(nn.Module): """ Series decomposition block """ + def __init__(self, kernel_size): super(series_decomp, self).__init__() self.moving_avg = moving_avg(kernel_size, stride=1) @@ -58,20 +63,23 @@ class series_decomp_multi(nn.Module): """ Series decomposition block """ + def __init__(self, kernel_size): super(series_decomp_multi, self).__init__() self.moving_avg = [moving_avg(kernel, stride=1) for kernel in kernel_size] self.layer = torch.nn.Linear(1, len(kernel_size)) def forward(self, x): - moving_mean=[] + moving_mean = [] for func in self.moving_avg: moving_avg = func(x) moving_mean.append(moving_avg.unsqueeze(-1)) - moving_mean=torch.cat(moving_mean,dim=-1) - moving_mean = torch.sum(moving_mean*nn.Softmax(-1)(self.layer(x.unsqueeze(-1))),dim=-1) + moving_mean = torch.cat(moving_mean, dim=-1) + moving_mean = torch.sum( + moving_mean * nn.Softmax(-1)(self.layer(x.unsqueeze(-1))), dim=-1 + ) res = x - moving_mean - return res, moving_mean + return res, moving_mean class FourierDecomp(nn.Module): @@ -80,19 +88,32 @@ def __init__(self): pass def forward(self, x): - x_ft = torch.fft.rfft(x, dim=-1) + torch.fft.rfft(x, dim=-1) class EncoderLayer(nn.Module): """ Autoformer encoder layer with the progressive decomposition architecture """ - def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, activation="relu"): + + def __init__( + self, + attention, + d_model, + d_ff=None, + moving_avg=25, + dropout=0.1, + activation='relu', + ): super(EncoderLayer, self).__init__() d_ff = d_ff or 4 * d_model self.attention = attention - self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) - self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.conv1 = nn.Conv1d( + in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False + ) + self.conv2 = nn.Conv1d( + in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False + ) if isinstance(moving_avg, list): self.decomp1 = series_decomp_multi(moving_avg) @@ -102,13 +123,10 @@ def __init__(self, attention, d_model, d_ff=None, moving_avg=25, dropout=0.1, ac self.decomp2 = series_decomp(moving_avg) self.dropout = nn.Dropout(dropout) - self.activation = F.relu if activation == "relu" else F.gelu + self.activation = F.relu if activation == 'relu' else F.gelu def forward(self, x, attn_mask=None): - new_x, attn = self.attention( - x, x, x, - attn_mask=attn_mask - ) + new_x, attn = self.attention(x, x, x, attn_mask=attn_mask) x = x + self.dropout(new_x) x, _ = self.decomp1(x) y = x @@ -122,10 +140,13 @@ class Encoder(nn.Module): """ Autoformer encoder """ + def __init__(self, attn_layers, conv_layers=None, norm_layer=None): 
super(Encoder, self).__init__() self.attn_layers = nn.ModuleList(attn_layers) - self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.conv_layers = ( + nn.ModuleList(conv_layers) if conv_layers is not None else None + ) self.norm = norm_layer def forward(self, x, attn_mask=None): @@ -152,14 +173,28 @@ class DecoderLayer(nn.Module): """ Autoformer decoder layer with the progressive decomposition architecture """ - def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, - moving_avg=25, dropout=0.1, activation="relu"): + + def __init__( + self, + self_attention, + cross_attention, + d_model, + c_out, + d_ff=None, + moving_avg=25, + dropout=0.1, + activation='relu', + ): super(DecoderLayer, self).__init__() d_ff = d_ff or 4 * d_model self.self_attention = self_attention self.cross_attention = cross_attention - self.conv1 = nn.Conv1d(in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False) - self.conv2 = nn.Conv1d(in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False) + self.conv1 = nn.Conv1d( + in_channels=d_model, out_channels=d_ff, kernel_size=1, bias=False + ) + self.conv2 = nn.Conv1d( + in_channels=d_ff, out_channels=d_model, kernel_size=1, bias=False + ) if isinstance(moving_avg, list): self.decomp1 = series_decomp_multi(moving_avg) @@ -171,21 +206,24 @@ def __init__(self, self_attention, cross_attention, d_model, c_out, d_ff=None, self.decomp3 = series_decomp(moving_avg) self.dropout = nn.Dropout(dropout) - self.projection = nn.Conv1d(in_channels=d_model, out_channels=c_out, kernel_size=3, stride=1, padding=1, - padding_mode='circular', bias=False) - self.activation = F.relu if activation == "relu" else F.gelu + self.projection = nn.Conv1d( + in_channels=d_model, + out_channels=c_out, + kernel_size=3, + stride=1, + padding=1, + padding_mode='circular', + bias=False, + ) + self.activation = F.relu if activation == 'relu' else F.gelu def forward(self, x, cross, x_mask=None, cross_mask=None): - x = x + self.dropout(self.self_attention( - x, x, x, - attn_mask=x_mask - )[0]) + x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0]) x, trend1 = self.decomp1(x) - x = x + self.dropout(self.cross_attention( - x, cross, cross, - attn_mask=cross_mask - )[0]) + x = x + self.dropout( + self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0] + ) x, trend2 = self.decomp2(x) y = x @@ -194,7 +232,9 @@ def forward(self, x, cross, x_mask=None, cross_mask=None): x, trend3 = self.decomp3(x + y) residual_trend = trend1 + trend2 + trend3 - residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + residual_trend = self.projection(residual_trend.permute(0, 2, 1)).transpose( + 1, 2 + ) return x, residual_trend @@ -202,6 +242,7 @@ class Decoder(nn.Module): """ Autoformer encoder """ + def __init__(self, layers, norm_layer=None, projection=None): super(Decoder, self).__init__() self.layers = nn.ModuleList(layers) diff --git a/benchmark/layers/Embed.py b/benchmark/layers/Embed.py index 1cc5034..1921d66 100644 --- a/benchmark/layers/Embed.py +++ b/benchmark/layers/Embed.py @@ -1,8 +1,7 @@ +import math + import torch import torch.nn as nn -import torch.nn.functional as F -from torch.nn.utils import weight_norm -import math class PositionalEmbedding(nn.Module): @@ -13,7 +12,9 @@ def __init__(self, d_model, max_len=5000): pe.require_grad = False position = torch.arange(0, max_len).float().unsqueeze(1) - div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + 
div_term = ( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ).exp() pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) @@ -22,18 +23,26 @@ def __init__(self, d_model, max_len=5000): self.register_buffer('pe', pe) def forward(self, x): - return self.pe[:, :x.size(1)] + return self.pe[:, : x.size(1)] class TokenEmbedding(nn.Module): def __init__(self, c_in, d_model): super(TokenEmbedding, self).__init__() padding = 1 if torch.__version__ >= '1.5.0' else 2 - self.tokenConv = nn.Conv1d(in_channels=c_in, out_channels=d_model, - kernel_size=3, padding=padding, padding_mode='circular', bias=False) + self.tokenConv = nn.Conv1d( + in_channels=c_in, + out_channels=d_model, + kernel_size=3, + padding=padding, + padding_mode='circular', + bias=False, + ) for m in self.modules(): if isinstance(m, nn.Conv1d): - nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='leaky_relu') + nn.init.kaiming_normal_( + m.weight, mode='fan_in', nonlinearity='leaky_relu' + ) def forward(self, x): x = self.tokenConv(x.permute(0, 2, 1)).transpose(1, 2) @@ -48,7 +57,9 @@ def __init__(self, c_in, d_model): w.require_grad = False position = torch.arange(0, c_in).float().unsqueeze(1) - div_term = (torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model)).exp() + div_term = ( + torch.arange(0, d_model, 2).float() * -(math.log(10000.0) / d_model) + ).exp() w[:, 0::2] = torch.sin(position * div_term) w[:, 1::2] = torch.cos(position * div_term) @@ -81,7 +92,9 @@ def __init__(self, d_model, embed_type='fixed', freq='h'): def forward(self, x): x = x.long() - minute_x = self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0. + minute_x = ( + self.minute_embed(x[:, :, 4]) if hasattr(self, 'minute_embed') else 0.0 + ) hour_x = self.hour_embed(x[:, :, 3]) weekday_x = self.weekday_embed(x[:, :, 2]) day_x = self.day_embed(x[:, :, 1]) @@ -108,15 +121,22 @@ def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) self.position_embedding = PositionalEmbedding(d_model=d_model) - self.temporal_embedding = TemporalEmbedding(d_model=d_model, embed_type=embed_type, - freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( - d_model=d_model, embed_type=embed_type, freq=freq) + self.temporal_embedding = ( + TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + if embed_type != 'timeF' + else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + ) self.dropout = nn.Dropout(p=dropout) def forward(self, x, x_mark): - x = self.value_embedding(x) + self.temporal_embedding(x_mark) + self.position_embedding(x) + x = ( + self.value_embedding(x) + + self.temporal_embedding(x_mark) + + self.position_embedding(x) + ) return self.dropout(x) + class DataEmbedding_onlypos(nn.Module): def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): super(DataEmbedding_onlypos, self).__init__() @@ -128,16 +148,19 @@ def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): def forward(self, x, x_mark): x = self.value_embedding(x) + self.position_embedding(x) return self.dropout(x) - + + class DataEmbedding_wo_pos(nn.Module): def __init__(self, c_in, d_model, embed_type='fixed', freq='h', dropout=0.1): super(DataEmbedding_wo_pos, self).__init__() self.value_embedding = TokenEmbedding(c_in=c_in, d_model=d_model) self.position_embedding = PositionalEmbedding(d_model=d_model) - self.temporal_embedding = 
TemporalEmbedding(d_model=d_model, embed_type=embed_type, - freq=freq) if embed_type != 'timeF' else TimeFeatureEmbedding( - d_model=d_model, embed_type=embed_type, freq=freq) + self.temporal_embedding = ( + TemporalEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + if embed_type != 'timeF' + else TimeFeatureEmbedding(d_model=d_model, embed_type=embed_type, freq=freq) + ) self.dropout = nn.Dropout(p=dropout) def forward(self, x, x_mark): diff --git a/benchmark/layers/FourierCorrelation.py b/benchmark/layers/FourierCorrelation.py index 12892cd..9567d55 100644 --- a/benchmark/layers/FourierCorrelation.py +++ b/benchmark/layers/FourierCorrelation.py @@ -13,7 +13,7 @@ def get_frequency_modes(seq_len, modes=64, mode_select_method='random'): 'random' means sampling randomly; 'else' means sampling the lowest modes; """ - modes = min(modes, seq_len//2) + modes = min(modes, seq_len // 2) if mode_select_method == 'random': index = list(range(0, seq_len // 2)) np.random.shuffle(index) @@ -26,7 +26,9 @@ def get_frequency_modes(seq_len, modes=64, mode_select_method='random'): # ########## fourier layer ############# class FourierBlock(nn.Module): - def __init__(self, in_channels, out_channels, seq_len, modes=0, mode_select_method='random'): + def __init__( + self, in_channels, out_channels, seq_len, modes=0, mode_select_method='random' + ): super(FourierBlock, self).__init__() print('fourier enhanced block used!') """ @@ -34,17 +36,27 @@ def __init__(self, in_channels, out_channels, seq_len, modes=0, mode_select_meth it does FFT, linear transform, and Inverse FFT. """ # get modes on frequency domain - self.index = get_frequency_modes(seq_len, modes=modes, mode_select_method=mode_select_method) + self.index = get_frequency_modes( + seq_len, modes=modes, mode_select_method=mode_select_method + ) print('modes={}, index={}'.format(modes, self.index)) - self.scale = (1 / (in_channels * out_channels)) + self.scale = 1 / (in_channels * out_channels) self.weights1 = nn.Parameter( - self.scale * torch.rand(8, in_channels // 8, out_channels // 8, len(self.index), dtype=torch.cfloat)) + self.scale + * torch.rand( + 8, + in_channels // 8, + out_channels // 8, + len(self.index), + dtype=torch.cfloat, + ) + ) # Complex multiplication def compl_mul1d(self, input, weights): # (batch, in_channel, x ), (in_channel, out_channel, x) -> (batch, out_channel, x) - return torch.einsum("bhi,hio->bho", input, weights) + return torch.einsum('bhi,hio->bho', input, weights) def forward(self, q, k, v, mask): # size = [B, L, H, E] @@ -55,7 +67,9 @@ def forward(self, q, k, v, mask): # Perform Fourier neural operations out_ft = torch.zeros(B, H, E, L // 2 + 1, device=x.device, dtype=torch.cfloat) for wi, i in enumerate(self.index): - out_ft[:, :, :, wi] = self.compl_mul1d(x_ft[:, :, :, i], self.weights1[:, :, :, wi]) + out_ft[:, :, :, wi] = self.compl_mul1d( + x_ft[:, :, :, i], self.weights1[:, :, :, wi] + ) # Return to time domain x = torch.fft.irfft(out_ft, n=x.size(-1)) return (x, None) @@ -63,8 +77,17 @@ def forward(self, q, k, v, mask): # ########## Fourier Cross Former #################### class FourierCrossAttention(nn.Module): - def __init__(self, in_channels, out_channels, seq_len_q, seq_len_kv, modes=64, mode_select_method='random', - activation='tanh', policy=0): + def __init__( + self, + in_channels, + out_channels, + seq_len_q, + seq_len_kv, + modes=64, + mode_select_method='random', + activation='tanh', + policy=0, + ): super(FourierCrossAttention, self).__init__() print(' fourier enhanced cross attention 
used!') """ @@ -74,56 +97,72 @@ def __init__(self, in_channels, out_channels, seq_len_q, seq_len_kv, modes=64, m self.in_channels = in_channels self.out_channels = out_channels # get modes for queries and keys (& values) on frequency domain - self.index_q = get_frequency_modes(seq_len_q, modes=modes, mode_select_method=mode_select_method) - self.index_kv = get_frequency_modes(seq_len_kv, modes=modes, mode_select_method=mode_select_method) + self.index_q = get_frequency_modes( + seq_len_q, modes=modes, mode_select_method=mode_select_method + ) + self.index_kv = get_frequency_modes( + seq_len_kv, modes=modes, mode_select_method=mode_select_method + ) print('modes_q={}, index_q={}'.format(len(self.index_q), self.index_q)) print('modes_kv={}, index_kv={}'.format(len(self.index_kv), self.index_kv)) - self.scale = (1 / (in_channels * out_channels)) + self.scale = 1 / (in_channels * out_channels) self.weights1 = nn.Parameter( - self.scale * torch.rand(8, in_channels // 8, out_channels // 8, len(self.index_q), dtype=torch.cfloat)) + self.scale + * torch.rand( + 8, + in_channels // 8, + out_channels // 8, + len(self.index_q), + dtype=torch.cfloat, + ) + ) # Complex multiplication def compl_mul1d(self, input, weights): # (batch, in_channel, x ), (in_channel, out_channel, x) -> (batch, out_channel, x) - return torch.einsum("bhi,hio->bho", input, weights) + return torch.einsum('bhi,hio->bho', input, weights) def forward(self, q, k, v, mask): # size = [B, L, H, E] B, L, H, E = q.shape xq = q.permute(0, 2, 3, 1) # size = [B, H, E, L] xk = k.permute(0, 2, 3, 1) - xv = v.permute(0, 2, 3, 1) + v.permute(0, 2, 3, 1) # Compute Fourier coefficients - xq_ft_ = torch.zeros(B, H, E, len(self.index_q), device=xq.device, dtype=torch.cfloat) + xq_ft_ = torch.zeros( + B, H, E, len(self.index_q), device=xq.device, dtype=torch.cfloat + ) xq_ft = torch.fft.rfft(xq, dim=-1) for i, j in enumerate(self.index_q): xq_ft_[:, :, :, i] = xq_ft[:, :, :, j] - xk_ft_ = torch.zeros(B, H, E, len(self.index_kv), device=xq.device, dtype=torch.cfloat) + xk_ft_ = torch.zeros( + B, H, E, len(self.index_kv), device=xq.device, dtype=torch.cfloat + ) xk_ft = torch.fft.rfft(xk, dim=-1) for i, j in enumerate(self.index_kv): xk_ft_[:, :, :, i] = xk_ft[:, :, :, j] # perform attention mechanism on frequency domain - xqk_ft = (torch.einsum("bhex,bhey->bhxy", xq_ft_, xk_ft_)) + xqk_ft = torch.einsum('bhex,bhey->bhxy', xq_ft_, xk_ft_) if self.activation == 'tanh': xqk_ft = xqk_ft.tanh() elif self.activation == 'softmax': xqk_ft = torch.softmax(abs(xqk_ft), dim=-1) xqk_ft = torch.complex(xqk_ft, torch.zeros_like(xqk_ft)) else: - raise Exception('{} actiation function is not implemented'.format(self.activation)) - xqkv_ft = torch.einsum("bhxy,bhey->bhex", xqk_ft, xk_ft_) - xqkvw = torch.einsum("bhex,heox->bhox", xqkv_ft, self.weights1) + raise Exception( + '{} actiation function is not implemented'.format(self.activation) + ) + xqkv_ft = torch.einsum('bhxy,bhey->bhex', xqk_ft, xk_ft_) + xqkvw = torch.einsum('bhex,heox->bhox', xqkv_ft, self.weights1) out_ft = torch.zeros(B, H, E, L // 2 + 1, device=xq.device, dtype=torch.cfloat) for i, j in enumerate(self.index_q): out_ft[:, :, :, j] = xqkvw[:, :, :, i] # Return to time domain - out = torch.fft.irfft(out_ft / self.in_channels / self.out_channels, n=xq.size(-1)) + out = torch.fft.irfft( + out_ft / self.in_channels / self.out_channels, n=xq.size(-1) + ) return (out, None) - - - - diff --git a/benchmark/layers/MultiWaveletCorrelation.py b/benchmark/layers/MultiWaveletCorrelation.py index 
5d8d7fc..71bfde2 100644 --- a/benchmark/layers/MultiWaveletCorrelation.py +++ b/benchmark/layers/MultiWaveletCorrelation.py @@ -1,20 +1,15 @@ -import torch +import math +from typing import List, Tuple + import numpy as np +import torch import torch.nn as nn import torch.nn.functional as F from torch import Tensor -from typing import List, Tuple -import math -from functools import partial -from einops import rearrange, reduce, repeat -from torch import nn, einsum, diagonal -from math import log2, ceil -import pdb -from utils.masking import LocalMask from layers.utils import get_filter -device = torch.device("cuda" if torch.cuda.is_available() else "cpu") +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') class MultiWaveletTransform(nn.Module): @@ -22,8 +17,17 @@ class MultiWaveletTransform(nn.Module): 1D multiwavelet block. """ - def __init__(self, ich=1, k=8, alpha=16, c=128, - nCZ=1, L=0, base='legendre', attention_dropout=0.1): + def __init__( + self, + ich=1, + k=8, + alpha=16, + c=128, + nCZ=1, + L=0, + base='legendre', + attention_dropout=0.1, + ): super(MultiWaveletTransform, self).__init__() print('base', base) self.k = k @@ -39,7 +43,7 @@ def forward(self, queries, keys, values, attn_mask): B, L, H, E = queries.shape _, S, _, D = values.shape if L > S: - zeros = torch.zeros_like(queries[:, :(L - S), :]).float() + zeros = torch.zeros_like(queries[:, : (L - S), :]).float() values = torch.cat([values, zeros], dim=1) keys = torch.cat([keys, zeros], dim=1) else: @@ -63,13 +67,23 @@ class MultiWaveletCross(nn.Module): 1D Multiwavelet Cross Attention layer. """ - def __init__(self, in_channels, out_channels, seq_len_q, seq_len_kv, modes, c=64, - k=8, ich=512, - L=0, - base='legendre', - mode_select_method='random', - initializer=None, activation='tanh', - **kwargs): + def __init__( + self, + in_channels, + out_channels, + seq_len_q, + seq_len_kv, + modes, + c=64, + k=8, + ich=512, + L=0, + base='legendre', + mode_select_method='random', + initializer=None, + activation='tanh', + **kwargs, + ): super(MultiWaveletCross, self).__init__() print('base', base) @@ -88,28 +102,48 @@ def __init__(self, in_channels, out_channels, seq_len_q, seq_len_kv, modes, c=64 G1r[np.abs(G1r) < 1e-8] = 0 self.max_item = 3 - self.attn1 = FourierCrossAttentionW(in_channels=in_channels, out_channels=out_channels, seq_len_q=seq_len_q, - seq_len_kv=seq_len_kv, modes=modes, activation=activation, - mode_select_method=mode_select_method) - self.attn2 = FourierCrossAttentionW(in_channels=in_channels, out_channels=out_channels, seq_len_q=seq_len_q, - seq_len_kv=seq_len_kv, modes=modes, activation=activation, - mode_select_method=mode_select_method) - self.attn3 = FourierCrossAttentionW(in_channels=in_channels, out_channels=out_channels, seq_len_q=seq_len_q, - seq_len_kv=seq_len_kv, modes=modes, activation=activation, - mode_select_method=mode_select_method) - self.attn4 = FourierCrossAttentionW(in_channels=in_channels, out_channels=out_channels, seq_len_q=seq_len_q, - seq_len_kv=seq_len_kv, modes=modes, activation=activation, - mode_select_method=mode_select_method) + self.attn1 = FourierCrossAttentionW( + in_channels=in_channels, + out_channels=out_channels, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + modes=modes, + activation=activation, + mode_select_method=mode_select_method, + ) + self.attn2 = FourierCrossAttentionW( + in_channels=in_channels, + out_channels=out_channels, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + modes=modes, + activation=activation, + 
mode_select_method=mode_select_method, + ) + self.attn3 = FourierCrossAttentionW( + in_channels=in_channels, + out_channels=out_channels, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + modes=modes, + activation=activation, + mode_select_method=mode_select_method, + ) + self.attn4 = FourierCrossAttentionW( + in_channels=in_channels, + out_channels=out_channels, + seq_len_q=seq_len_q, + seq_len_kv=seq_len_kv, + modes=modes, + activation=activation, + mode_select_method=mode_select_method, + ) self.T0 = nn.Linear(k, k) - self.register_buffer('ec_s', torch.Tensor( - np.concatenate((H0.T, H1.T), axis=0))) - self.register_buffer('ec_d', torch.Tensor( - np.concatenate((G0.T, G1.T), axis=0))) + self.register_buffer('ec_s', torch.Tensor(np.concatenate((H0.T, H1.T), axis=0))) + self.register_buffer('ec_d', torch.Tensor(np.concatenate((G0.T, G1.T), axis=0))) - self.register_buffer('rc_e', torch.Tensor( - np.concatenate((H0r, G0r), axis=0))) - self.register_buffer('rc_o', torch.Tensor( - np.concatenate((H1r, G1r), axis=0))) + self.register_buffer('rc_e', torch.Tensor(np.concatenate((H0r, G0r), axis=0))) + self.register_buffer('rc_o', torch.Tensor(np.concatenate((H1r, G1r), axis=0))) self.Lk = nn.Linear(ich, c * k) self.Lq = nn.Linear(ich, c * k) @@ -132,7 +166,7 @@ def forward(self, q, k, v, mask=None): v = v.view(v.shape[0], v.shape[1], self.c, self.k) if N > S: - zeros = torch.zeros_like(q[:, :(N - S), :]).float() + zeros = torch.zeros_like(q[:, : (N - S), :]).float() v = torch.cat([v, zeros], dim=1) k = torch.cat([k, zeros], dim=1) else: @@ -141,9 +175,9 @@ def forward(self, q, k, v, mask=None): ns = math.floor(np.log2(N)) nl = pow(2, math.ceil(np.log2(N))) - extra_q = q[:, 0:nl - N, :, :] - extra_k = k[:, 0:nl - N, :, :] - extra_v = v[:, 0:nl - N, :, :] + extra_q = q[:, 0 : nl - N, :, :] + extra_k = k[:, 0 : nl - N, :, :] + extra_v = v[:, 0 : nl - N, :, :] q = torch.cat([q, extra_q], 1) k = torch.cat([k, extra_k], 1) v = torch.cat([v, extra_v], 1) @@ -177,7 +211,10 @@ def forward(self, q, k, v, mask=None): dk, sk = Ud_k[i], Us_k[i] dq, sq = Ud_q[i], Us_q[i] dv, sv = Ud_v[i], Us_v[i] - Ud += [self.attn1(dq[0], dk[0], dv[0], mask)[0] + self.attn2(dq[1], dk[1], dv[1], mask)[0]] + Ud += [ + self.attn1(dq[0], dk[0], dv[0], mask)[0] + + self.attn2(dq[1], dk[1], dv[1], mask)[0] + ] Us += [self.attn3(sq, sk, sv, mask)[0]] v = self.attn4(q, k, v, mask)[0] @@ -190,9 +227,13 @@ def forward(self, q, k, v, mask=None): return (v.contiguous(), None) def wavelet_transform(self, x): - xa = torch.cat([x[:, ::2, :, :], - x[:, 1::2, :, :], - ], -1) + xa = torch.cat( + [ + x[:, ::2, :, :], + x[:, 1::2, :, :], + ], + -1, + ) d = torch.matmul(xa, self.ec_d) s = torch.matmul(xa, self.ec_s) return d, s @@ -203,16 +244,23 @@ def evenOdd(self, x): x_e = torch.matmul(x, self.rc_e) x_o = torch.matmul(x, self.rc_o) - x = torch.zeros(B, N * 2, c, self.k, - device=x.device) + x = torch.zeros(B, N * 2, c, self.k, device=x.device) x[..., ::2, :, :] = x_e x[..., 1::2, :, :] = x_o return x class FourierCrossAttentionW(nn.Module): - def __init__(self, in_channels, out_channels, seq_len_q, seq_len_kv, modes=16, activation='tanh', - mode_select_method='random'): + def __init__( + self, + in_channels, + out_channels, + seq_len_q, + seq_len_kv, + modes=16, + activation='tanh', + mode_select_method='random', + ): super(FourierCrossAttentionW, self).__init__() print('corss fourier correlation used!') self.in_channels = in_channels @@ -230,52 +278,58 @@ def forward(self, q, k, v, mask): self.index_k_v = list(range(0, min(int(xv.shape[3] 
// 2), self.modes1))) # Compute Fourier coefficients - xq_ft_ = torch.zeros(B, H, E, len(self.index_q), device=xq.device, dtype=torch.cfloat) + xq_ft_ = torch.zeros( + B, H, E, len(self.index_q), device=xq.device, dtype=torch.cfloat + ) xq_ft = torch.fft.rfft(xq, dim=-1) for i, j in enumerate(self.index_q): xq_ft_[:, :, :, i] = xq_ft[:, :, :, j] - xk_ft_ = torch.zeros(B, H, E, len(self.index_k_v), device=xq.device, dtype=torch.cfloat) + xk_ft_ = torch.zeros( + B, H, E, len(self.index_k_v), device=xq.device, dtype=torch.cfloat + ) xk_ft = torch.fft.rfft(xk, dim=-1) for i, j in enumerate(self.index_k_v): xk_ft_[:, :, :, i] = xk_ft[:, :, :, j] - xqk_ft = (torch.einsum("bhex,bhey->bhxy", xq_ft_, xk_ft_)) + xqk_ft = torch.einsum('bhex,bhey->bhxy', xq_ft_, xk_ft_) if self.activation == 'tanh': xqk_ft = xqk_ft.tanh() elif self.activation == 'softmax': xqk_ft = torch.softmax(abs(xqk_ft), dim=-1) xqk_ft = torch.complex(xqk_ft, torch.zeros_like(xqk_ft)) else: - raise Exception('{} actiation function is not implemented'.format(self.activation)) - xqkv_ft = torch.einsum("bhxy,bhey->bhex", xqk_ft, xk_ft_) + raise Exception( + '{} actiation function is not implemented'.format(self.activation) + ) + xqkv_ft = torch.einsum('bhxy,bhey->bhex', xqk_ft, xk_ft_) xqkvw = xqkv_ft out_ft = torch.zeros(B, H, E, L // 2 + 1, device=xq.device, dtype=torch.cfloat) for i, j in enumerate(self.index_q): out_ft[:, :, :, j] = xqkvw[:, :, :, i] - out = torch.fft.irfft(out_ft / self.in_channels / self.out_channels, n=xq.size(-1)).permute(0, 3, 2, 1) + out = torch.fft.irfft( + out_ft / self.in_channels / self.out_channels, n=xq.size(-1) + ).permute(0, 3, 2, 1) # size = [B, L, H, E] return (out, None) class sparseKernelFT1d(nn.Module): - def __init__(self, - k, alpha, c=1, - nl=1, - initializer=None, - **kwargs): + def __init__(self, k, alpha, c=1, nl=1, initializer=None, **kwargs): super(sparseKernelFT1d, self).__init__() self.modes1 = alpha - self.scale = (1 / (c * k * c * k)) - self.weights1 = nn.Parameter(self.scale * torch.rand(c * k, c * k, self.modes1, dtype=torch.cfloat)) + self.scale = 1 / (c * k * c * k) + self.weights1 = nn.Parameter( + self.scale * torch.rand(c * k, c * k, self.modes1, dtype=torch.cfloat) + ) self.weights1.requires_grad = True self.k = k def compl_mul1d(self, x, weights): # (batch, in_channel, x ), (in_channel, out_channel, x) -> (batch, out_channel, x) - return torch.einsum("bix,iox->box", x, weights) + return torch.einsum('bix,iox->box', x, weights) def forward(self, x): B, N, c, k = x.shape # (B, N, c, k) @@ -295,12 +349,9 @@ def forward(self, x): # ## class MWT_CZ1d(nn.Module): - def __init__(self, - k=3, alpha=64, - L=0, c=1, - base='legendre', - initializer=None, - **kwargs): + def __init__( + self, k=3, alpha=64, L=0, c=1, base='legendre', initializer=None, **kwargs + ): super(MWT_CZ1d, self).__init__() self.k = k @@ -323,21 +374,17 @@ def __init__(self, self.T0 = nn.Linear(k, k) - self.register_buffer('ec_s', torch.Tensor( - np.concatenate((H0.T, H1.T), axis=0))) - self.register_buffer('ec_d', torch.Tensor( - np.concatenate((G0.T, G1.T), axis=0))) + self.register_buffer('ec_s', torch.Tensor(np.concatenate((H0.T, H1.T), axis=0))) + self.register_buffer('ec_d', torch.Tensor(np.concatenate((G0.T, G1.T), axis=0))) - self.register_buffer('rc_e', torch.Tensor( - np.concatenate((H0r, G0r), axis=0))) - self.register_buffer('rc_o', torch.Tensor( - np.concatenate((H1r, G1r), axis=0))) + self.register_buffer('rc_e', torch.Tensor(np.concatenate((H0r, G0r), axis=0))) + self.register_buffer('rc_o', 
torch.Tensor(np.concatenate((H1r, G1r), axis=0))) def forward(self, x): B, N, c, k = x.shape # (B, N, k) ns = math.floor(np.log2(N)) nl = pow(2, math.ceil(np.log2(N))) - extra_x = x[:, 0:nl - N, :, :] + extra_x = x[:, 0 : nl - N, :, :] x = torch.cat([x, extra_x], 1) Ud = torch.jit.annotate(List[Tensor], []) Us = torch.jit.annotate(List[Tensor], []) @@ -359,22 +406,24 @@ def forward(self, x): return x def wavelet_transform(self, x): - xa = torch.cat([x[:, ::2, :, :], - x[:, 1::2, :, :], - ], -1) + xa = torch.cat( + [ + x[:, ::2, :, :], + x[:, 1::2, :, :], + ], + -1, + ) d = torch.matmul(xa, self.ec_d) s = torch.matmul(xa, self.ec_s) return d, s def evenOdd(self, x): - B, N, c, ich = x.shape # (B, N, c, k) assert ich == 2 * self.k x_e = torch.matmul(x, self.rc_e) x_o = torch.matmul(x, self.rc_o) - x = torch.zeros(B, N * 2, c, self.k, - device=x.device) + x = torch.zeros(B, N * 2, c, self.k, device=x.device) x[..., ::2, :, :] = x_e x[..., 1::2, :, :] = x_o return x diff --git a/benchmark/layers/SelfAttention_Family.py b/benchmark/layers/SelfAttention_Family.py index c8138e2..8e1c820 100644 --- a/benchmark/layers/SelfAttention_Family.py +++ b/benchmark/layers/SelfAttention_Family.py @@ -1,18 +1,21 @@ +from math import sqrt + +import numpy as np import torch import torch.nn as nn -import torch.nn.functional as F - -import matplotlib.pyplot as plt -import numpy as np -import math -from math import sqrt -from utils.masking import TriangularCausalMask, ProbMask -import os +from utils.masking import ProbMask, TriangularCausalMask class FullAttention(nn.Module): - def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + def __init__( + self, + mask_flag=True, + factor=5, + scale=None, + attention_dropout=0.1, + output_attention=False, + ): super(FullAttention, self).__init__() self.scale = scale self.mask_flag = mask_flag @@ -22,9 +25,9 @@ def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, def forward(self, queries, keys, values, attn_mask): B, L, H, E = queries.shape _, S, _, D = values.shape - scale = self.scale or 1. 
/ sqrt(E) + scale = self.scale or 1.0 / sqrt(E) - scores = torch.einsum("blhe,bshe->bhls", queries, keys) + scores = torch.einsum('blhe,bshe->bhls', queries, keys) if self.mask_flag: if attn_mask is None: @@ -33,7 +36,7 @@ def forward(self, queries, keys, values, attn_mask): scores.masked_fill_(attn_mask.mask, -np.inf) A = self.dropout(torch.softmax(scale * scores, dim=-1)) - V = torch.einsum("bhls,bshd->blhd", A, values) + V = torch.einsum('bhls,bshd->blhd', A, values) if self.output_attention: return (V.contiguous(), A) @@ -42,7 +45,14 @@ def forward(self, queries, keys, values, attn_mask): class ProbAttention(nn.Module): - def __init__(self, mask_flag=True, factor=5, scale=None, attention_dropout=0.1, output_attention=False): + def __init__( + self, + mask_flag=True, + factor=5, + scale=None, + attention_dropout=0.1, + output_attention=False, + ): super(ProbAttention, self).__init__() self.factor = factor self.scale = scale @@ -57,18 +67,20 @@ def _prob_QK(self, Q, K, sample_k, n_top): # n_top: c*ln(L_q) # calculate the sampled Q_K K_expand = K.unsqueeze(-3).expand(B, H, L_Q, L_K, E) - index_sample = torch.randint(L_K, (L_Q, sample_k)) # real U = U_part(factor*ln(L_k))*L_q + index_sample = torch.randint( + L_K, (L_Q, sample_k) + ) # real U = U_part(factor*ln(L_k))*L_q K_sample = K_expand[:, :, torch.arange(L_Q).unsqueeze(1), index_sample, :] Q_K_sample = torch.matmul(Q.unsqueeze(-2), K_sample.transpose(-2, -1)).squeeze() - # find the Top_k query with sparisty measurement + # find the Top_k query with sparsity measurement M = Q_K_sample.max(-1)[0] - torch.div(Q_K_sample.sum(-1), L_K) M_top = M.topk(n_top, sorted=False)[1] # use the reduced Q to calculate Q_K - Q_reduce = Q[torch.arange(B)[:, None, None], - torch.arange(H)[None, :, None], - M_top, :] # factor*ln(L_q) + Q_reduce = Q[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], M_top, : + ] # factor*ln(L_q) Q_K = torch.matmul(Q_reduce, K.transpose(-2, -1)) # factor*ln(L_q)*L_k return Q_K, M_top @@ -80,7 +92,7 @@ def _get_initial_context(self, V, L_Q): V_sum = V.mean(dim=-2) contex = V_sum.unsqueeze(-2).expand(B, H, L_Q, V_sum.shape[-1]).clone() else: # use mask - assert (L_Q == L_V) # requires that L_Q == L_V, i.e. for self-attention only + assert L_Q == L_V # requires that L_Q == L_V, i.e. for self-attention only contex = V.cumsum(dim=-2) return contex @@ -93,12 +105,14 @@ def _update_context(self, context_in, V, scores, index, L_Q, attn_mask): attn = torch.softmax(scores, dim=-1) # nn.Softmax(dim=-1)(scores) - context_in[torch.arange(B)[:, None, None], - torch.arange(H)[None, :, None], - index, :] = torch.matmul(attn, V).type_as(context_in) + context_in[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, : + ] = torch.matmul(attn, V).type_as(context_in) if self.output_attention: attns = (torch.ones([B, H, L_V, L_V]) / L_V).type_as(attn).to(attn.device) - attns[torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, :] = attn + attns[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, : + ] = attn return (context_in, attns) else: return (context_in, None) @@ -120,20 +134,21 @@ def forward(self, queries, keys, values, attn_mask): scores_top, index = self._prob_QK(queries, keys, sample_k=U_part, n_top=u) # add scale factor - scale = self.scale or 1. 
/ sqrt(D) + scale = self.scale or 1.0 / sqrt(D) if scale is not None: scores_top = scores_top * scale # get the context context = self._get_initial_context(values, L_Q) # update the context with selected top_k queries - context, attn = self._update_context(context, values, scores_top, index, L_Q, attn_mask) + context, attn = self._update_context( + context, values, scores_top, index, L_Q, attn_mask + ) return context.contiguous(), attn class AttentionLayer(nn.Module): - def __init__(self, attention, d_model, n_heads, d_keys=None, - d_values=None): + def __init__(self, attention, d_model, n_heads, d_keys=None, d_values=None): super(AttentionLayer, self).__init__() d_keys = d_keys or (d_model // n_heads) @@ -155,12 +170,7 @@ def forward(self, queries, keys, values, attn_mask): keys = self.key_projection(keys).view(B, S, H, -1) values = self.value_projection(values).view(B, S, H, -1) - out, attn = self.inner_attention( - queries, - keys, - values, - attn_mask - ) + out, attn = self.inner_attention(queries, keys, values, attn_mask) out = out.view(B, L, -1) return self.out_projection(out), attn diff --git a/benchmark/layers/Transformer_EncDec.py b/benchmark/layers/Transformer_EncDec.py index c0c5789..2e614b5 100644 --- a/benchmark/layers/Transformer_EncDec.py +++ b/benchmark/layers/Transformer_EncDec.py @@ -1,4 +1,3 @@ -import torch import torch.nn as nn import torch.nn.functional as F @@ -6,11 +5,13 @@ class ConvLayer(nn.Module): def __init__(self, c_in): super(ConvLayer, self).__init__() - self.downConv = nn.Conv1d(in_channels=c_in, - out_channels=c_in, - kernel_size=3, - padding=2, - padding_mode='circular') + self.downConv = nn.Conv1d( + in_channels=c_in, + out_channels=c_in, + kernel_size=3, + padding=2, + padding_mode='circular', + ) self.norm = nn.BatchNorm1d(c_in) self.activation = nn.ELU() self.maxPool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1) @@ -25,7 +26,7 @@ def forward(self, x): class EncoderLayer(nn.Module): - def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu"): + def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation='relu'): super(EncoderLayer, self).__init__() d_ff = d_ff or 4 * d_model self.attention = attention @@ -34,13 +35,10 @@ def __init__(self, attention, d_model, d_ff=None, dropout=0.1, activation="relu" self.norm1 = nn.LayerNorm(d_model) self.norm2 = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) - self.activation = F.relu if activation == "relu" else F.gelu + self.activation = F.relu if activation == 'relu' else F.gelu def forward(self, x, attn_mask=None): - new_x, attn = self.attention( - x, x, x, - attn_mask=attn_mask - ) + new_x, attn = self.attention(x, x, x, attn_mask=attn_mask) x = x + self.dropout(new_x) y = x = self.norm1(x) @@ -54,7 +52,9 @@ class Encoder(nn.Module): def __init__(self, attn_layers, conv_layers=None, norm_layer=None): super(Encoder, self).__init__() self.attn_layers = nn.ModuleList(attn_layers) - self.conv_layers = nn.ModuleList(conv_layers) if conv_layers is not None else None + self.conv_layers = ( + nn.ModuleList(conv_layers) if conv_layers is not None else None + ) self.norm = norm_layer def forward(self, x, attn_mask=None): @@ -79,8 +79,15 @@ def forward(self, x, attn_mask=None): class DecoderLayer(nn.Module): - def __init__(self, self_attention, cross_attention, d_model, d_ff=None, - dropout=0.1, activation="relu"): + def __init__( + self, + self_attention, + cross_attention, + d_model, + d_ff=None, + dropout=0.1, + activation='relu', + ): super(DecoderLayer, 
self).__init__() d_ff = d_ff or 4 * d_model self.self_attention = self_attention @@ -91,19 +98,15 @@ def __init__(self, self_attention, cross_attention, d_model, d_ff=None, self.norm2 = nn.LayerNorm(d_model) self.norm3 = nn.LayerNorm(d_model) self.dropout = nn.Dropout(dropout) - self.activation = F.relu if activation == "relu" else F.gelu + self.activation = F.relu if activation == 'relu' else F.gelu def forward(self, x, cross, x_mask=None, cross_mask=None): - x = x + self.dropout(self.self_attention( - x, x, x, - attn_mask=x_mask - )[0]) + x = x + self.dropout(self.self_attention(x, x, x, attn_mask=x_mask)[0]) x = self.norm1(x) - x = x + self.dropout(self.cross_attention( - x, cross, cross, - attn_mask=cross_mask - )[0]) + x = x + self.dropout( + self.cross_attention(x, cross, cross, attn_mask=cross_mask)[0] + ) y = x = self.norm2(x) y = self.dropout(self.activation(self.conv1(y.transpose(-1, 1)))) diff --git a/benchmark/layers/utils.py b/benchmark/layers/utils.py index abad383..4c8673c 100644 --- a/benchmark/layers/utils.py +++ b/benchmark/layers/utils.py @@ -1,100 +1,130 @@ -import torch -import torch.nn as nn - -import numpy as np from functools import partial +import numpy as np +import torch +import torch.nn as nn from scipy.special import eval_legendre -from sympy import Poly, legendre, Symbol, chebyshevt +from sympy import Poly, Symbol, chebyshevt, legendre def legendreDer(k, x): def _legendre(k, x): - return (2*k+1) * eval_legendre(k, x) + return (2 * k + 1) * eval_legendre(k, x) + out = 0 - for i in np.arange(k-1,-1,-2): + for i in np.arange(k - 1, -1, -2): out += _legendre(i, x) return out -def phi_(phi_c, x, lb = 0, ub = 1): - mask = np.logical_or(xub) * 1.0 - return np.polynomial.polynomial.Polynomial(phi_c)(x) * (1-mask) +def phi_(phi_c, x, lb=0, ub=1): + mask = np.logical_or(x < lb, x > ub) * 1.0 + return np.polynomial.polynomial.Polynomial(phi_c)(x) * (1 - mask) def get_phi_psi(k, base): - x = Symbol('x') - phi_coeff = np.zeros((k,k)) - phi_2x_coeff = np.zeros((k,k)) + phi_coeff = np.zeros((k, k)) + phi_2x_coeff = np.zeros((k, k)) if base == 'legendre': for ki in range(k): - coeff_ = Poly(legendre(ki, 2*x-1), x).all_coeffs() - phi_coeff[ki,:ki+1] = np.flip(np.sqrt(2*ki+1) * np.array(coeff_).astype(np.float64)) - coeff_ = Poly(legendre(ki, 4*x-1), x).all_coeffs() - phi_2x_coeff[ki,:ki+1] = np.flip(np.sqrt(2) * np.sqrt(2*ki+1) * np.array(coeff_).astype(np.float64)) - + coeff_ = Poly(legendre(ki, 2 * x - 1), x).all_coeffs() + phi_coeff[ki, : ki + 1] = np.flip( + np.sqrt(2 * ki + 1) * np.array(coeff_).astype(np.float64) + ) + coeff_ = Poly(legendre(ki, 4 * x - 1), x).all_coeffs() + phi_2x_coeff[ki, : ki + 1] = np.flip( + np.sqrt(2) * np.sqrt(2 * ki + 1) * np.array(coeff_).astype(np.float64) + ) + psi1_coeff = np.zeros((k, k)) psi2_coeff = np.zeros((k, k)) for ki in range(k): - psi1_coeff[ki,:] = phi_2x_coeff[ki,:] + psi1_coeff[ki, :] = phi_2x_coeff[ki, :] for i in range(k): - a = phi_2x_coeff[ki,:ki+1] - b = phi_coeff[i, :i+1] + a = phi_2x_coeff[ki, : ki + 1] + b = phi_coeff[i, : i + 1] prod_ = np.convolve(a, b) - prod_[np.abs(prod_)<1e-8] = 0 - proj_ = (prod_ * 1/(np.arange(len(prod_))+1) * np.power(0.5, 1+np.arange(len(prod_)))).sum() - psi1_coeff[ki,:] -= proj_ * phi_coeff[i,:] - psi2_coeff[ki,:] -= proj_ * phi_coeff[i,:] + prod_[np.abs(prod_) < 1e-8] = 0 + proj_ = ( + prod_ + * 1 + / (np.arange(len(prod_)) + 1) + * np.power(0.5, 1 + np.arange(len(prod_))) + ).sum() + psi1_coeff[ki, :] -= proj_ * phi_coeff[i, :] + psi2_coeff[ki, :] -= proj_ * phi_coeff[i, :] for j in 
range(ki): - a = phi_2x_coeff[ki,:ki+1] + a = phi_2x_coeff[ki, : ki + 1] b = psi1_coeff[j, :] prod_ = np.convolve(a, b) - prod_[np.abs(prod_)<1e-8] = 0 - proj_ = (prod_ * 1/(np.arange(len(prod_))+1) * np.power(0.5, 1+np.arange(len(prod_)))).sum() - psi1_coeff[ki,:] -= proj_ * psi1_coeff[j,:] - psi2_coeff[ki,:] -= proj_ * psi2_coeff[j,:] - - a = psi1_coeff[ki,:] + prod_[np.abs(prod_) < 1e-8] = 0 + proj_ = ( + prod_ + * 1 + / (np.arange(len(prod_)) + 1) + * np.power(0.5, 1 + np.arange(len(prod_))) + ).sum() + psi1_coeff[ki, :] -= proj_ * psi1_coeff[j, :] + psi2_coeff[ki, :] -= proj_ * psi2_coeff[j, :] + + a = psi1_coeff[ki, :] prod_ = np.convolve(a, a) - prod_[np.abs(prod_)<1e-8] = 0 - norm1 = (prod_ * 1/(np.arange(len(prod_))+1) * np.power(0.5, 1+np.arange(len(prod_)))).sum() - - a = psi2_coeff[ki,:] + prod_[np.abs(prod_) < 1e-8] = 0 + norm1 = ( + prod_ + * 1 + / (np.arange(len(prod_)) + 1) + * np.power(0.5, 1 + np.arange(len(prod_))) + ).sum() + + a = psi2_coeff[ki, :] prod_ = np.convolve(a, a) - prod_[np.abs(prod_)<1e-8] = 0 - norm2 = (prod_ * 1/(np.arange(len(prod_))+1) * (1-np.power(0.5, 1+np.arange(len(prod_))))).sum() + prod_[np.abs(prod_) < 1e-8] = 0 + norm2 = ( + prod_ + * 1 + / (np.arange(len(prod_)) + 1) + * (1 - np.power(0.5, 1 + np.arange(len(prod_)))) + ).sum() norm_ = np.sqrt(norm1 + norm2) - psi1_coeff[ki,:] /= norm_ - psi2_coeff[ki,:] /= norm_ - psi1_coeff[np.abs(psi1_coeff)<1e-8] = 0 - psi2_coeff[np.abs(psi2_coeff)<1e-8] = 0 - - phi = [np.poly1d(np.flip(phi_coeff[i,:])) for i in range(k)] - psi1 = [np.poly1d(np.flip(psi1_coeff[i,:])) for i in range(k)] - psi2 = [np.poly1d(np.flip(psi2_coeff[i,:])) for i in range(k)] - + psi1_coeff[ki, :] /= norm_ + psi2_coeff[ki, :] /= norm_ + psi1_coeff[np.abs(psi1_coeff) < 1e-8] = 0 + psi2_coeff[np.abs(psi2_coeff) < 1e-8] = 0 + + phi = [np.poly1d(np.flip(phi_coeff[i, :])) for i in range(k)] + psi1 = [np.poly1d(np.flip(psi1_coeff[i, :])) for i in range(k)] + psi2 = [np.poly1d(np.flip(psi2_coeff[i, :])) for i in range(k)] + elif base == 'chebyshev': for ki in range(k): if ki == 0: - phi_coeff[ki,:ki+1] = np.sqrt(2/np.pi) - phi_2x_coeff[ki,:ki+1] = np.sqrt(2/np.pi) * np.sqrt(2) + phi_coeff[ki, : ki + 1] = np.sqrt(2 / np.pi) + phi_2x_coeff[ki, : ki + 1] = np.sqrt(2 / np.pi) * np.sqrt(2) else: - coeff_ = Poly(chebyshevt(ki, 2*x-1), x).all_coeffs() - phi_coeff[ki,:ki+1] = np.flip(2/np.sqrt(np.pi) * np.array(coeff_).astype(np.float64)) - coeff_ = Poly(chebyshevt(ki, 4*x-1), x).all_coeffs() - phi_2x_coeff[ki,:ki+1] = np.flip(np.sqrt(2) * 2 / np.sqrt(np.pi) * np.array(coeff_).astype(np.float64)) - - phi = [partial(phi_, phi_coeff[i,:]) for i in range(k)] - + coeff_ = Poly(chebyshevt(ki, 2 * x - 1), x).all_coeffs() + phi_coeff[ki, : ki + 1] = np.flip( + 2 / np.sqrt(np.pi) * np.array(coeff_).astype(np.float64) + ) + coeff_ = Poly(chebyshevt(ki, 4 * x - 1), x).all_coeffs() + phi_2x_coeff[ki, : ki + 1] = np.flip( + np.sqrt(2) + * 2 + / np.sqrt(np.pi) + * np.array(coeff_).astype(np.float64) + ) + + phi = [partial(phi_, phi_coeff[i, :]) for i in range(k)] + x = Symbol('x') - kUse = 2*k - roots = Poly(chebyshevt(kUse, 2*x-1)).all_roots() + kUse = 2 * k + roots = Poly(chebyshevt(kUse, 2 * x - 1)).all_roots() x_m = np.array([rt.evalf(20) for rt in roots]).astype(np.float64) # x_m[x_m==0.5] = 0.5 + 1e-8 # add small noise to avoid the case of 0.5 belonging to both phi(2x) and phi(2x-1) # not needed for our purpose here, we use even k always to avoid wm = np.pi / kUse / 2 - + psi1_coeff = np.zeros((k, k)) psi2_coeff = np.zeros((k, k)) @@ -102,71 +132,82 @@ 
def get_phi_psi(k, base): psi2 = [[] for _ in range(k)] for ki in range(k): - psi1_coeff[ki,:] = phi_2x_coeff[ki,:] + psi1_coeff[ki, :] = phi_2x_coeff[ki, :] for i in range(k): - proj_ = (wm * phi[i](x_m) * np.sqrt(2)* phi[ki](2*x_m)).sum() - psi1_coeff[ki,:] -= proj_ * phi_coeff[i,:] - psi2_coeff[ki,:] -= proj_ * phi_coeff[i,:] + proj_ = (wm * phi[i](x_m) * np.sqrt(2) * phi[ki](2 * x_m)).sum() + psi1_coeff[ki, :] -= proj_ * phi_coeff[i, :] + psi2_coeff[ki, :] -= proj_ * phi_coeff[i, :] for j in range(ki): - proj_ = (wm * psi1[j](x_m) * np.sqrt(2) * phi[ki](2*x_m)).sum() - psi1_coeff[ki,:] -= proj_ * psi1_coeff[j,:] - psi2_coeff[ki,:] -= proj_ * psi2_coeff[j,:] + proj_ = (wm * psi1[j](x_m) * np.sqrt(2) * phi[ki](2 * x_m)).sum() + psi1_coeff[ki, :] -= proj_ * psi1_coeff[j, :] + psi2_coeff[ki, :] -= proj_ * psi2_coeff[j, :] - psi1[ki] = partial(phi_, psi1_coeff[ki,:], lb = 0, ub = 0.5) - psi2[ki] = partial(phi_, psi2_coeff[ki,:], lb = 0.5, ub = 1) + psi1[ki] = partial(phi_, psi1_coeff[ki, :], lb=0, ub=0.5) + psi2[ki] = partial(phi_, psi2_coeff[ki, :], lb=0.5, ub=1) norm1 = (wm * psi1[ki](x_m) * psi1[ki](x_m)).sum() norm2 = (wm * psi2[ki](x_m) * psi2[ki](x_m)).sum() norm_ = np.sqrt(norm1 + norm2) - psi1_coeff[ki,:] /= norm_ - psi2_coeff[ki,:] /= norm_ - psi1_coeff[np.abs(psi1_coeff)<1e-8] = 0 - psi2_coeff[np.abs(psi2_coeff)<1e-8] = 0 - - psi1[ki] = partial(phi_, psi1_coeff[ki,:], lb = 0, ub = 0.5+1e-16) - psi2[ki] = partial(phi_, psi2_coeff[ki,:], lb = 0.5+1e-16, ub = 1) - + psi1_coeff[ki, :] /= norm_ + psi2_coeff[ki, :] /= norm_ + psi1_coeff[np.abs(psi1_coeff) < 1e-8] = 0 + psi2_coeff[np.abs(psi2_coeff) < 1e-8] = 0 + + psi1[ki] = partial(phi_, psi1_coeff[ki, :], lb=0, ub=0.5 + 1e-16) + psi2[ki] = partial(phi_, psi2_coeff[ki, :], lb=0.5 + 1e-16, ub=1) + return phi, psi1, psi2 def get_filter(base, k): - def psi(psi1, psi2, i, inp): - mask = (inp<=0.5) * 1.0 - return psi1[i](inp) * mask + psi2[i](inp) * (1-mask) - + mask = (inp <= 0.5) * 1.0 + return psi1[i](inp) * mask + psi2[i](inp) * (1 - mask) + if base not in ['legendre', 'chebyshev']: raise Exception('Base not supported') - + x = Symbol('x') - H0 = np.zeros((k,k)) - H1 = np.zeros((k,k)) - G0 = np.zeros((k,k)) - G1 = np.zeros((k,k)) - PHI0 = np.zeros((k,k)) - PHI1 = np.zeros((k,k)) + H0 = np.zeros((k, k)) + H1 = np.zeros((k, k)) + G0 = np.zeros((k, k)) + G1 = np.zeros((k, k)) + PHI0 = np.zeros((k, k)) + PHI1 = np.zeros((k, k)) phi, psi1, psi2 = get_phi_psi(k, base) if base == 'legendre': - roots = Poly(legendre(k, 2*x-1)).all_roots() + roots = Poly(legendre(k, 2 * x - 1)).all_roots() x_m = np.array([rt.evalf(20) for rt in roots]).astype(np.float64) - wm = 1/k/legendreDer(k,2*x_m-1)/eval_legendre(k-1,2*x_m-1) - + wm = 1 / k / legendreDer(k, 2 * x_m - 1) / eval_legendre(k - 1, 2 * x_m - 1) + for ki in range(k): for kpi in range(k): - H0[ki, kpi] = 1/np.sqrt(2) * (wm * phi[ki](x_m/2) * phi[kpi](x_m)).sum() - G0[ki, kpi] = 1/np.sqrt(2) * (wm * psi(psi1, psi2, ki, x_m/2) * phi[kpi](x_m)).sum() - H1[ki, kpi] = 1/np.sqrt(2) * (wm * phi[ki]((x_m+1)/2) * phi[kpi](x_m)).sum() - G1[ki, kpi] = 1/np.sqrt(2) * (wm * psi(psi1, psi2, ki, (x_m+1)/2) * phi[kpi](x_m)).sum() - + H0[ki, kpi] = ( + 1 / np.sqrt(2) * (wm * phi[ki](x_m / 2) * phi[kpi](x_m)).sum() + ) + G0[ki, kpi] = ( + 1 + / np.sqrt(2) + * (wm * psi(psi1, psi2, ki, x_m / 2) * phi[kpi](x_m)).sum() + ) + H1[ki, kpi] = ( + 1 / np.sqrt(2) * (wm * phi[ki]((x_m + 1) / 2) * phi[kpi](x_m)).sum() + ) + G1[ki, kpi] = ( + 1 + / np.sqrt(2) + * (wm * psi(psi1, psi2, ki, (x_m + 1) / 2) * phi[kpi](x_m)).sum() + 
) + PHI0 = np.eye(k) PHI1 = np.eye(k) - + elif base == 'chebyshev': x = Symbol('x') - kUse = 2*k - roots = Poly(chebyshevt(kUse, 2*x-1)).all_roots() + kUse = 2 * k + roots = Poly(chebyshevt(kUse, 2 * x - 1)).all_roots() x_m = np.array([rt.evalf(20) for rt in roots]).astype(np.float64) # x_m[x_m==0.5] = 0.5 + 1e-8 # add small noise to avoid the case of 0.5 belonging to both phi(2x) and phi(2x-1) # not needed for our purpose here, we use even k always to avoid @@ -174,72 +215,97 @@ def psi(psi1, psi2, i, inp): for ki in range(k): for kpi in range(k): - H0[ki, kpi] = 1/np.sqrt(2) * (wm * phi[ki](x_m/2) * phi[kpi](x_m)).sum() - G0[ki, kpi] = 1/np.sqrt(2) * (wm * psi(psi1, psi2, ki, x_m/2) * phi[kpi](x_m)).sum() - H1[ki, kpi] = 1/np.sqrt(2) * (wm * phi[ki]((x_m+1)/2) * phi[kpi](x_m)).sum() - G1[ki, kpi] = 1/np.sqrt(2) * (wm * psi(psi1, psi2, ki, (x_m+1)/2) * phi[kpi](x_m)).sum() - - PHI0[ki, kpi] = (wm * phi[ki](2*x_m) * phi[kpi](2*x_m)).sum() * 2 - PHI1[ki, kpi] = (wm * phi[ki](2*x_m-1) * phi[kpi](2*x_m-1)).sum() * 2 - - PHI0[np.abs(PHI0)<1e-8] = 0 - PHI1[np.abs(PHI1)<1e-8] = 0 - - H0[np.abs(H0)<1e-8] = 0 - H1[np.abs(H1)<1e-8] = 0 - G0[np.abs(G0)<1e-8] = 0 - G1[np.abs(G1)<1e-8] = 0 - + H0[ki, kpi] = ( + 1 / np.sqrt(2) * (wm * phi[ki](x_m / 2) * phi[kpi](x_m)).sum() + ) + G0[ki, kpi] = ( + 1 + / np.sqrt(2) + * (wm * psi(psi1, psi2, ki, x_m / 2) * phi[kpi](x_m)).sum() + ) + H1[ki, kpi] = ( + 1 / np.sqrt(2) * (wm * phi[ki]((x_m + 1) / 2) * phi[kpi](x_m)).sum() + ) + G1[ki, kpi] = ( + 1 + / np.sqrt(2) + * (wm * psi(psi1, psi2, ki, (x_m + 1) / 2) * phi[kpi](x_m)).sum() + ) + + PHI0[ki, kpi] = (wm * phi[ki](2 * x_m) * phi[kpi](2 * x_m)).sum() * 2 + PHI1[ki, kpi] = ( + wm * phi[ki](2 * x_m - 1) * phi[kpi](2 * x_m - 1) + ).sum() * 2 + + PHI0[np.abs(PHI0) < 1e-8] = 0 + PHI1[np.abs(PHI1) < 1e-8] = 0 + + H0[np.abs(H0) < 1e-8] = 0 + H1[np.abs(H1) < 1e-8] = 0 + G0[np.abs(G0) < 1e-8] = 0 + G1[np.abs(G1) < 1e-8] = 0 + return H0, H1, G0, G1, PHI0, PHI1 -def train(model, train_loader, optimizer, epoch, device, verbose = 0, - lossFn = None, lr_schedule=None, - post_proc = lambda args: args): - +def train( + model, + train_loader, + optimizer, + epoch, + device, + verbose=0, + lossFn=None, + lr_schedule=None, + post_proc=lambda args: args, +): if lossFn is None: lossFn = nn.MSELoss() model.train() - - total_loss = 0. + + total_loss = 0.0 for batch_idx, (data, target) in enumerate(train_loader): - bs = len(data) data, target = data.to(device), target.to(device) optimizer.zero_grad() - + output = model(data) - + target = post_proc(target) output = post_proc(output) loss = lossFn(output.view(bs, -1), target.view(bs, -1)) - + loss.backward() optimizer.step() total_loss += loss.sum().item() - if lr_schedule is not None: lr_schedule.step() - - if verbose>0: - print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( - epoch, batch_idx * len(data), len(train_loader.dataset), - 100. 
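A note on the multiwavelet helpers being reformatted here: `get_filter(base, k)` assembles the six `k × k` filter matrices (`H0`, `H1`, `G0`, `G1`, `PHI0`, `PHI1`) from the Legendre or Chebyshev basis built by `get_phi_psi`. One caveat: this hunk appears to drop the `x = Symbol('x')` line at the top of `get_phi_psi` while both basis branches still reference `x`, which would raise a `NameError`; the sketch below assumes that symbol definition is retained in the final file. The import path is likewise an assumption about how `benchmark/layers/utils.py` sits on `sys.path`.

```python
# Illustrative sketch only, under the assumptions stated above.
import numpy as np
from layers.utils import get_filter  # assumed import path for benchmark/layers/utils.py

# k x k analysis filters for a Legendre multiwavelet basis.
H0, H1, G0, G1, PHI0, PHI1 = get_filter('legendre', k=4)

for name, mat in [('H0', H0), ('H1', H1), ('G0', G0), ('G1', G1)]:
    print(name, mat.shape)  # each is (4, 4); entries below ~1e-8 are zeroed inside get_filter

print(np.allclose(PHI0, np.eye(4)))  # True -- PHI0/PHI1 are identity for the Legendre base
```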
* batch_idx / len(train_loader), loss.item())) - - return total_loss/len(train_loader.dataset) - - -def test(model, test_loader, device, verbose=0, lossFn=None, - post_proc = lambda args: args): - + if lr_schedule is not None: + lr_schedule.step() + + if verbose > 0: + print( + 'Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, + batch_idx * len(data), + len(train_loader.dataset), + 100.0 * batch_idx / len(train_loader), + loss.item(), + ) + ) + + return total_loss / len(train_loader.dataset) + + +def test( + model, test_loader, device, verbose=0, lossFn=None, post_proc=lambda args: args +): model.eval() if lossFn is None: lossFn = nn.MSELoss() - - - total_loss = 0. - predictions = [] - + + total_loss = 0.0 + with torch.no_grad(): for data, target in test_loader: bs = len(data) @@ -247,17 +313,18 @@ def test(model, test_loader, device, verbose=0, lossFn=None, data, target = data.to(device), target.to(device) output = model(data) output = post_proc(output) - + loss = lossFn(output.view(bs, -1), target.view(bs, -1)) total_loss += loss.sum().item() - - return total_loss/len(test_loader.dataset) + + return total_loss / len(test_loader.dataset) # Till EoF # taken from FNO paper: # https://github.com/zongyi-li/fourier_neural_operator + # normalization, pointwise gaussian class UnitGaussianNormalizer(object): def __init__(self, x, eps=0.00001): @@ -274,15 +341,15 @@ def encode(self, x): def decode(self, x, sample_idx=None): if sample_idx is None: - std = self.std + self.eps # n + std = self.std + self.eps # n mean = self.mean else: if len(self.mean.shape) == len(sample_idx[0].shape): std = self.std[sample_idx] + self.eps # batch*n mean = self.mean[sample_idx] if len(self.mean.shape) > len(sample_idx[0].shape): - std = self.std[:,sample_idx]+ self.eps # T*batch*n - mean = self.mean[:,sample_idx] + std = self.std[:, sample_idx] + self.eps # T*batch*n + mean = self.mean[:, sample_idx] # x is in shape of batch*n or T*batch*n x = (x * std) + mean @@ -330,28 +397,29 @@ def __init__(self, x, low=0.0, high=1.0): mymin = torch.min(x, 0)[0].view(-1) mymax = torch.max(x, 0)[0].view(-1) - self.a = (high - low)/(mymax - mymin) - self.b = -self.a*mymax + high + self.a = (high - low) / (mymax - mymin) + self.b = -self.a * mymax + high def encode(self, x): s = x.size() x = x.view(s[0], -1) - x = self.a*x + self.b + x = self.a * x + self.b x = x.view(s) return x def decode(self, x): s = x.size() x = x.view(s[0], -1) - x = (x - self.b)/self.a + x = (x - self.b) / self.a x = x.view(s) return x - + + class LpLoss(object): def __init__(self, d=2, p=2, size_average=True, reduction=True): super(LpLoss, self).__init__() - #Dimension and Lp-norm type are postive + # Dimension and Lp-norm type are postive assert d > 0 and p > 0 self.d = d @@ -365,7 +433,9 @@ def abs(self, x, y): # Assume uniform mesh h = 1.0 / (x.size()[1] - 1.0) - all_norms = (h**(self.d/self.p))*torch.norm(x.view(num_examples,-1) - y.view(num_examples,-1), self.p, 1) + all_norms = (h ** (self.d / self.p)) * torch.norm( + x.view(num_examples, -1) - y.view(num_examples, -1), self.p, 1 + ) if self.reduction: if self.size_average: @@ -378,16 +448,18 @@ def abs(self, x, y): def rel(self, x, y): num_examples = x.size()[0] - diff_norms = torch.norm(x.reshape(num_examples,-1) - y.reshape(num_examples,-1), self.p, 1) - y_norms = torch.norm(y.reshape(num_examples,-1), self.p, 1) + diff_norms = torch.norm( + x.reshape(num_examples, -1) - y.reshape(num_examples, -1), self.p, 1 + ) + y_norms = torch.norm(y.reshape(num_examples, -1), self.p, 1) if 
self.reduction: if self.size_average: - return torch.mean(diff_norms/y_norms) + return torch.mean(diff_norms / y_norms) else: - return torch.sum(diff_norms/y_norms) + return torch.sum(diff_norms / y_norms) - return diff_norms/y_norms + return diff_norms / y_norms def __call__(self, x, y): - return self.rel(x, y) \ No newline at end of file + return self.rel(x, y) diff --git a/benchmark/metalearned/common/evaluator.py b/benchmark/metalearned/common/evaluator.py index 6ed209e..bb092b0 100644 --- a/benchmark/metalearned/common/evaluator.py +++ b/benchmark/metalearned/common/evaluator.py @@ -1,10 +1,13 @@ from dataclasses import dataclass + from common.timeseries import TimeseriesBundle + @dataclass class Evaluator: test_set: TimeseriesBundle + @dataclass class EvaluationResult: - test_set: TimeseriesBundle \ No newline at end of file + test_set: TimeseriesBundle diff --git a/benchmark/metalearned/common/experiment.py b/benchmark/metalearned/common/experiment.py index 21c28ff..80235d7 100644 --- a/benchmark/metalearned/common/experiment.py +++ b/benchmark/metalearned/common/experiment.py @@ -12,10 +12,12 @@ command_file_name = 'experiment.cmd' -def create_experiment(experiment_path: str, - parameters: Dict, - command: Callable[[str, Dict], str], - callback: Callable[[str, Dict], None] = lambda path, params: None) -> None: +def create_experiment( + experiment_path: str, + parameters: Dict, + command: Callable[[str, Dict], str], + callback: Callable[[str, Dict], None] = lambda path, params: None, +) -> None: """ Create experiment. If parameters contain keys with multiple values, then multiple sub-experiments will be created. @@ -40,8 +42,13 @@ def create_experiment(experiment_path: str, logging.info('Generating experiments ...') for variables_instance in tqdm(product(*experiment_variables)): sub_experiment_name = ','.join( - ['%s=%.4g' % (name, value) if isinstance(value, float) else '%s=%s' % (name, str(value).replace(' ', '_')) - for name, value in dict(variables_instance).items()]) + [ + '%s=%.4g' % (name, value) + if isinstance(value, float) + else '%s=%s' % (name, str(value).replace(' ', '_')) + for name, value in dict(variables_instance).items() + ] + ) sub_experiment_path = os.path.join(experiment_path, sub_experiment_name) Path(sub_experiment_path).mkdir(parents=True, exist_ok=False) @@ -51,7 +58,9 @@ def create_experiment(experiment_path: str, # write command file with open(os.path.join(sub_experiment_path, command_file_name), 'w') as f: f.write(command(sub_experiment_path, dict(variables_instance))) - callback(sub_experiment_path, dict(**{**parameters, **dict(variables_instance)})) + callback( + sub_experiment_path, dict(**{**parameters, **dict(variables_instance)}) + ) def load_experiment_parameters(experiment_path: str) -> Dict: diff --git a/benchmark/metalearned/common/metrics.py b/benchmark/metalearned/common/metrics.py index 93402b5..c6e7d37 100644 --- a/benchmark/metalearned/common/metrics.py +++ b/benchmark/metalearned/common/metrics.py @@ -4,7 +4,9 @@ Target = np.ndarray -def mase(forecast: Forecast, insample: np.ndarray, outsample: Target, frequency: int) -> np.ndarray: +def mase( + forecast: Forecast, insample: np.ndarray, outsample: Target, frequency: int +) -> np.ndarray: """ Calculate MASE of each point for each timeseries. 
https://en.wikipedia.org/wiki/Mean_absolute_scaled_error @@ -15,7 +17,9 @@ def mase(forecast: Forecast, insample: np.ndarray, outsample: Target, frequency: :param frequency: :return: """ - return np.mean(np.abs(forecast - outsample)) / np.mean(np.abs(insample[:-frequency] - insample[frequency:])) + return np.mean(np.abs(forecast - outsample)) / np.mean( + np.abs(insample[:-frequency] - insample[frequency:]) + ) def nd(forecast: Forecast, target: Target) -> float: @@ -37,7 +41,9 @@ def nrmse(forecast: Forecast, target: Target) -> float: :param target: :return: """ - return np.sqrt(np.mean(np.power((forecast - target), 2))) / (np.mean(np.abs(target))) + return np.sqrt(np.mean(np.power((forecast - target), 2))) / ( + np.mean(np.abs(target)) + ) def mape(forecast: Forecast, target: Target) -> np.ndarray: @@ -77,30 +83,29 @@ def smape_2(forecast: Forecast, target: Target) -> np.ndarray: :return: Same shape array with sMAPE calculated for each time step of each timeseries. """ denom = np.abs(target) + np.abs(forecast) - denom[denom == 0.0] = 1.0 # divide by 1.0 instead of 0.0, in case when denom is zero the enum will be 0.0 anyways. + denom[ + denom == 0.0 + ] = 1.0 # divide by 1.0 instead of 0.0, in case when denom is zero the enum will be 0.0 anyways. return 200 * np.abs(forecast - target) / denom - - import tensorflow as tf from keras import backend def smape(y_true, y_pred): - """ Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. - `loss = 200 * mean(abs((y_true - y_pred) / (y_true + y_pred), axis=-1)` - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - Returns: - Symmetric mean absolute percentage error values. shape = `[batch_size, d0, .. - dN-1]`. - """ + """Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. + `loss = 200 * mean(abs((y_true - y_pred) / (y_true + y_pred), axis=-1)` + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Returns: + Symmetric mean absolute percentage error values. shape = `[batch_size, d0, .. + dN-1]`. 
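To make the metric definitions in `common/metrics.py` concrete: MASE divides the forecast MAE by the in-sample MAE of a seasonal naive forecast at lag `frequency`, and `smape_2` guards its denominator against zeros before scaling by 200. A small NumPy sketch with made-up numbers (the arrays are purely illustrative):

```python
# Toy re-derivation of the MASE and smape_2 formulas shown above.
import numpy as np

insample  = np.array([10., 12., 11., 13., 12., 14.])  # history seen by the model
outsample = np.array([13., 15.])                       # ground truth over the horizon
forecast  = np.array([12., 16.])                       # model output
frequency = 1                                          # seasonal lag used by MASE

mase = np.mean(np.abs(forecast - outsample)) / np.mean(
    np.abs(insample[:-frequency] - insample[frequency:])
)

denom = np.abs(outsample) + np.abs(forecast)
denom[denom == 0.0] = 1.0            # numerator is 0 wherever the denominator was 0
smape_2 = 200 * np.abs(forecast - outsample) / denom

print(round(float(mase), 3))         # 0.625
print(np.round(smape_2, 2))          # [8.   6.45]
```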
+ """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) diff = tf.abs( - (y_true - y_pred) / - backend.maximum(y_true + y_pred, backend.epsilon()) + (y_true - y_pred) / backend.maximum(y_true + y_pred, backend.epsilon()) ) return 200.0 * backend.mean(diff, axis=-1) diff --git a/benchmark/metalearned/common/samplers.py b/benchmark/metalearned/common/samplers.py index ebb2306..498a62f 100644 --- a/benchmark/metalearned/common/samplers.py +++ b/benchmark/metalearned/common/samplers.py @@ -2,12 +2,14 @@ class UnivariateTimeseriesSampler: - def __init__(self, - timeseries: np.ndarray, - insample_size: int, - outsample_size: int, - window_sampling_limit: int, - batch_size: int): + def __init__( + self, + timeseries: np.ndarray, + insample_size: int, + outsample_size: int, + window_sampling_limit: int, + batch_size: int, + ): self.timeseries = [ts for ts in timeseries] self.window_sampling_limit = window_sampling_limit self.batch_size = batch_size @@ -20,20 +22,29 @@ def __iter__(self): insample_mask = np.zeros((self.batch_size, self.insample_size)) outsample = np.zeros((self.batch_size, self.outsample_size)) outsample_mask = np.zeros((self.batch_size, self.outsample_size)) - sampled_ts_indices = np.random.randint(len(self.timeseries), size=self.batch_size) + sampled_ts_indices = np.random.randint( + len(self.timeseries), size=self.batch_size + ) for i, sampled_index in enumerate(sampled_ts_indices): sampled_timeseries = self.timeseries[sampled_index] - cut_point = np.random.randint(low=max(1, len(sampled_timeseries) - self.window_sampling_limit), - high=len(sampled_timeseries), - size=1)[0] + cut_point = np.random.randint( + low=max(1, len(sampled_timeseries) - self.window_sampling_limit), + high=len(sampled_timeseries), + size=1, + )[0] - insample_window = sampled_timeseries[max(0, cut_point - self.insample_size):cut_point] - insample[i, -len(insample_window):] = insample_window - insample_mask[i, -len(insample_window):] = 1.0 + insample_window = sampled_timeseries[ + max(0, cut_point - self.insample_size) : cut_point + ] + insample[i, -len(insample_window) :] = insample_window + insample_mask[i, -len(insample_window) :] = 1.0 outsample_window = sampled_timeseries[ - cut_point:min(len(sampled_timeseries), cut_point + self.outsample_size)] - outsample[i, :len(outsample_window)] = outsample_window - outsample_mask[i, :len(outsample_window)] = 1.0 + cut_point : min( + len(sampled_timeseries), cut_point + self.outsample_size + ) + ] + outsample[i, : len(outsample_window)] = outsample_window + outsample_mask[i, : len(outsample_window)] = 1.0 yield insample, insample_mask, outsample, outsample_mask def sequential_latest_insamples(self): @@ -41,7 +52,7 @@ def sequential_latest_insamples(self): insample = np.zeros((batch_size, self.insample_size)) insample_mask = np.zeros((batch_size, self.insample_size)) for i, ts in enumerate(self.timeseries): - ts_last_window = ts[-self.insample_size:] - insample[i, -len(ts):] = ts_last_window - insample_mask[i, -len(ts):] = 1.0 + ts_last_window = ts[-self.insample_size :] + insample[i, -len(ts) :] = ts_last_window + insample_mask[i, -len(ts) :] = 1.0 return insample, insample_mask diff --git a/benchmark/metalearned/common/summary_utils.py b/benchmark/metalearned/common/summary_utils.py index 704f16b..a784ad3 100644 --- a/benchmark/metalearned/common/summary_utils.py +++ b/benchmark/metalearned/common/summary_utils.py @@ -42,40 +42,69 @@ def __init__(self, filter_path: str, evaluator): self.groups[parameter_key] = {} if parameter_value not 
in self.groups[parameter_key]: self.groups[parameter_key][parameter_value] = [] - self.groups[parameter_key][parameter_value].append(len(self.predictions) - 1) + self.groups[parameter_key][parameter_value].append( + len(self.predictions) - 1 + ) self.group_names = ', '.join(self.groups.keys()) logging.debug(f'Loaded {len(self.predictions)} predictions') logging.debug(f'Parameters: {self.group_names}') - def bootstrap(self, - ensemble_keys: List[str], - bootstrap_key: str, - bootstrap_size: int, - number_of_samples: int): + def bootstrap( + self, + ensemble_keys: List[str], + bootstrap_key: str, + bootstrap_size: int, + number_of_samples: int, + ): group_keys = self.groups.keys() - set(ensemble_keys) - group_values = list(itertools.product(*map(lambda g: self.groups[g].keys(), group_keys))) + group_values = list( + itertools.product(*map(lambda g: self.groups[g].keys(), group_keys)) + ) results = [] for group_instance in tqdm(group_values): - group_ids = [set(self.groups[group_key][group_value]) for group_key, group_value in - list(zip(group_keys, group_instance))] + group_ids = [ + set(self.groups[group_key][group_value]) + for group_key, group_value in list(zip(group_keys, group_instance)) + ] group_filter = set.intersection(*group_ids) if len(group_ids) > 0 else None - if group_instance != () and (group_filter is None or len(group_filter) == 0): + if group_instance != () and ( + group_filter is None or len(group_filter) == 0 + ): continue for _ in range(number_of_samples): sampled_ids = set( - itertools.chain(*random.sample(list(self.groups[bootstrap_key].values()), k=bootstrap_size))) - ensemble_ids = sampled_ids.intersection(group_filter) if group_filter is not None else sampled_ids + itertools.chain( + *random.sample( + list(self.groups[bootstrap_key].values()), k=bootstrap_size + ) + ) + ) + ensemble_ids = ( + sampled_ids.intersection(group_filter) + if group_filter is not None + else sampled_ids + ) if ensemble_ids is None or len(ensemble_ids) == 0: continue - ensemble_predictions = pd.concat([self.predictions[i] - for i in ensemble_ids], - sort=False).groupby(level='id', sort=False).median() + ensemble_predictions = ( + pd.concat([self.predictions[i] for i in ensemble_ids], sort=False) + .groupby(level='id', sort=False) + .median() + ) group_columns = dict(zip(group_keys, group_instance)) - evaluation_results = self.evaluator.evaluate(ensemble_predictions.values) + evaluation_results = self.evaluator.evaluate( + ensemble_predictions.values + ) for evaluation_key, evaluation_value in evaluation_results.items(): - results.append(pd.DataFrame({ - 'metric': evaluation_value, - 'evaluation_key': evaluation_key, - **group_columns}, index=[0])) + results.append( + pd.DataFrame( + { + 'metric': evaluation_value, + 'evaluation_key': evaluation_key, + **group_columns, + }, + index=[0], + ) + ) return pd.concat(results, sort=False).reset_index() diff --git a/benchmark/metalearned/common/timeseries.py b/benchmark/metalearned/common/timeseries.py index 33428e4..e65c907 100644 --- a/benchmark/metalearned/common/timeseries.py +++ b/benchmark/metalearned/common/timeseries.py @@ -5,9 +5,9 @@ from pathlib import Path from typing import Any, Callable, Dict, List, Tuple +import dill import numpy as np from dateutil.relativedelta import relativedelta -import dill class TimeUnit(ABC): @@ -98,25 +98,33 @@ class Timeseries: meta: Dict[str, Any] def copy(self, start_date: datetime, values: np.ndarray) -> 'Timeseries': - return Timeseries(id=self.id, - start_date=start_date, - time_unit=self.time_unit, - 
frequency=self.frequency, - period=self.period, - values=values, - meta=self.meta) + return Timeseries( + id=self.id, + start_date=start_date, + time_unit=self.time_unit, + frequency=self.frequency, + period=self.period, + values=values, + meta=self.meta, + ) def future_values(self, values: np.ndarray) -> 'Timeseries': - return self.copy(start_date=self.time_unit.add(self.start_date, len(self.values)), values=values) + return self.copy( + start_date=self.time_unit.add(self.start_date, len(self.values)), + values=values, + ) def split(self, n: int) -> TimeseriesSplit: time_shift = n if n >= 0 else len(self.values) + n split_time = self.time_unit.add(self.start_date, time_shift * self.frequency) - return self.copy(start_date=self.start_date, values=self.values[:n]), self.copy(start_date=split_time, - values=self.values[n:]) + return self.copy(start_date=self.start_date, values=self.values[:n]), self.copy( + start_date=split_time, values=self.values[n:] + ) def split_by_time(self, split_date: datetime) -> TimeseriesSplit: - points_to_include = int(self.time_unit.delta(split_date, self.start_date) // self.frequency) + points_to_include = int( + self.time_unit.delta(split_date, self.start_date) // self.frequency + ) if points_to_include < 0: before = self.copy(split_date, np.empty(0)) on_and_after = self @@ -135,8 +143,12 @@ def values(self) -> List[np.ndarray]: def time_stamps(self) -> List[np.ndarray]: def _make_time_stamps(ts): - return np.array([ts.time_unit.add(ts.start_date, ts.frequency*i) - for i in range(len(ts.values))]) + return np.array( + [ + ts.time_unit.add(ts.start_date, ts.frequency * i) + for i in range(len(ts.values)) + ] + ) return list(map(_make_time_stamps, self.timeseries)) @@ -152,7 +164,9 @@ def filter(self, f: Callable[[Timeseries], bool]) -> 'TimeseriesBundle': def map(self, f: Callable[[Timeseries], Timeseries]) -> 'TimeseriesBundle': return TimeseriesBundle(list(map(f, self.timeseries))) - def split(self, f: Callable[[Timeseries], TimeseriesSplit]) -> Tuple['TimeseriesBundle', 'TimeseriesBundle']: + def split( + self, f: Callable[[Timeseries], TimeseriesSplit] + ) -> Tuple['TimeseriesBundle', 'TimeseriesBundle']: bucket_1 = [] bucket_2 = [] for timeseries in self.timeseries: @@ -161,14 +175,20 @@ def split(self, f: Callable[[Timeseries], TimeseriesSplit]) -> Tuple['Timeseries bucket_2.append(part_2) return TimeseriesBundle(bucket_1), TimeseriesBundle(bucket_2) - def intersection_by_id(self, bundle: 'TimeseriesBundle') -> Tuple['TimeseriesBundle', 'TimeseriesBundle']: + def intersection_by_id( + self, bundle: 'TimeseriesBundle' + ) -> Tuple['TimeseriesBundle', 'TimeseriesBundle']: bundle_ids = bundle.ids() ids = [ts_id for ts_id in self.ids() if ts_id in bundle_ids] - return self.filter(lambda ts: ts.id in ids), bundle.filter(lambda ts: ts.id in ids) + return self.filter(lambda ts: ts.id in ids), bundle.filter( + lambda ts: ts.id in ids + ) def future_values(self, values: np.array) -> 'TimeseriesBundle': assert len(values) == len(self.timeseries) - return TimeseriesBundle([ts.future_values(values[i]) for i, ts in enumerate(self.timeseries)]) + return TimeseriesBundle( + [ts.future_values(values[i]) for i, ts in enumerate(self.timeseries)] + ) class TimeseriesLoader(ABC): @@ -192,4 +212,3 @@ def download(self) -> TimeseriesBundle: :return: Training and test splits. 
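For the `Timeseries` container in `common/timeseries.py`, `split(n)` cuts the value array at position `n` and shifts the start date of the second piece by the corresponding number of time units, which is how a trailing horizon is held out. A toy example (the series itself is invented):

```python
# Hypothetical daily series illustrating Timeseries.split.
from datetime import datetime
import numpy as np
from common.timeseries import Day, Timeseries

ts = Timeseries(
    id='toy-series',
    start_date=datetime(2020, 1, 1),
    time_unit=Day(),
    frequency=1,
    period=7,                      # weekly seasonality
    values=np.arange(30, dtype=float),
    meta={},
)

history, holdout = ts.split(-7)    # hold out the last 7 observations
print(len(history.values), len(holdout.values))  # 23 7
print(holdout.start_date)                        # 2020-01-24 00:00:00
```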
""" pass - diff --git a/benchmark/metalearned/common/torch_utils.py b/benchmark/metalearned/common/torch_utils.py index cdaf534..1ed055e 100644 --- a/benchmark/metalearned/common/torch_utils.py +++ b/benchmark/metalearned/common/torch_utils.py @@ -22,8 +22,8 @@ def to_device(module: t.nn.Module, use_cuda: bool = True): def div_no_nan(a, b): result = a / b - result[result != result] = .0 - result[result == np.inf] = .0 + result[result != result] = 0.0 + result[result == np.inf] = 0.0 return result @@ -33,11 +33,16 @@ def mape_loss(forecast, target, mask): def smape_1_loss(forecast, target, mask): - return 200 * t.mean(div_no_nan(t.abs(forecast - target), forecast.data + target.data) * mask) + return 200 * t.mean( + div_no_nan(t.abs(forecast - target), forecast.data + target.data) * mask + ) def smape_2_loss(forecast, target, mask): - return 200 * t.mean(div_no_nan(t.abs(forecast - target), t.abs(forecast.data) + t.abs(target.data)) * mask) + return 200 * t.mean( + div_no_nan(t.abs(forecast - target), t.abs(forecast.data) + t.abs(target.data)) + * mask + ) def mase_loss(insample, freq, forecast, target, mask): @@ -47,7 +52,9 @@ def mase_loss(insample, freq, forecast, target, mask): class SnapshotManager: - def __init__(self, snapshot_dir: str, logging_frequency: int, snapshot_frequency: int): + def __init__( + self, snapshot_dir: str, logging_frequency: int, snapshot_frequency: int + ): self.model_snapshot_file = os.path.join(snapshot_dir, 'model') self.optimizer_snapshot_file = os.path.join(snapshot_dir, 'optimizer') self.losses_file = os.path.join(snapshot_dir, 'losses') @@ -59,16 +66,26 @@ def __init__(self, snapshot_dir: str, logging_frequency: int, snapshot_frequency self.losses = {'training': {}, 'validation': {}} self.time_track = {} - def restore(self, model: Optional[t.nn.Module], optimizer: Optional[t.optim.Optimizer]) -> int: + def restore( + self, model: Optional[t.nn.Module], optimizer: Optional[t.optim.Optimizer] + ) -> int: if model is not None and os.path.isfile(self.model_snapshot_file): model.load_state_dict(t.load(self.model_snapshot_file)) if optimizer is not None and os.path.isfile(self.optimizer_snapshot_file): optimizer.load_state_dict(t.load(self.optimizer_snapshot_file)) - iteration = t.load(self.iteration_file)['iteration'] if os.path.isfile(self.iteration_file) else 0 + iteration = ( + t.load(self.iteration_file)['iteration'] + if os.path.isfile(self.iteration_file) + else 0 + ) if os.path.isfile(self.losses_file): losses = t.load(self.losses_file) - training_losses = {k: v for k, v in losses['training'].items() if k <= iteration} - validation_losses = {k: v for k, v in losses['validation'].items() if k <= iteration} + training_losses = { + k: v for k, v in losses['training'].items() if k <= iteration + } + validation_losses = { + k: v for k, v in losses['validation'].items() if k <= iteration + } # when restoring remove losses which were after the last snapshot self.losses = {'training': training_losses, 'validation': validation_losses} self.snapshot(self.losses_file, self.losses) @@ -86,12 +103,14 @@ def load_training_losses(self) -> pd.DataFrame: def enable_time_tracking(self): self.start_time = time.time() - def register(self, - iteration: int, - training_loss: float, - validation_loss: float, - model: t.nn.Module, - optimizer: Optional[t.optim.Optimizer]) -> None: + def register( + self, + iteration: int, + training_loss: float, + validation_loss: float, + model: t.nn.Module, + optimizer: Optional[t.optim.Optimizer], + ) -> None: if iteration == 1 or iteration 
% self.logging_frequency == 0: self.losses['training'][iteration] = training_loss self.losses['validation'][iteration] = validation_loss diff --git a/benchmark/metalearned/common/utils.py b/benchmark/metalearned/common/utils.py index c456c46..2da7bdf 100644 --- a/benchmark/metalearned/common/utils.py +++ b/benchmark/metalearned/common/utils.py @@ -4,15 +4,15 @@ import pathlib import sys import urllib -from decimal import Decimal, ROUND_HALF_UP +from decimal import ROUND_HALF_UP, Decimal from glob import glob from itertools import dropwhile, takewhile +from math import pow from typing import Any, Callable, List from urllib import request import numpy as np import pandas as pd -from math import pow from tqdm import tqdm @@ -24,17 +24,32 @@ def get_module_path(): def round_half_up(n, precision): - return int(Decimal(n * pow(10, precision)).to_integral_value(rounding=ROUND_HALF_UP)) / pow(10, precision) - - -def median_ensemble(experiment_path: str, - summary_filter: str = '**', - forecast_file: str = 'forecast.csv', - group_by: str = 'id'): - return pd.concat([pd.read_csv(file) - for file in - tqdm(glob(os.path.join(experiment_path, summary_filter, forecast_file)))], sort=False) \ - .set_index(group_by).groupby(level=group_by, sort=False).median().values + return int( + Decimal(n * pow(10, precision)).to_integral_value(rounding=ROUND_HALF_UP) + ) / pow(10, precision) + + +def median_ensemble( + experiment_path: str, + summary_filter: str = '**', + forecast_file: str = 'forecast.csv', + group_by: str = 'id', +): + return ( + pd.concat( + [ + pd.read_csv(file) + for file in tqdm( + glob(os.path.join(experiment_path, summary_filter, forecast_file)) + ) + ], + sort=False, + ) + .set_index(group_by) + .groupby(level=group_by, sort=False) + .median() + .values + ) def group_values(values: np.ndarray, groups: np.ndarray, group_name: str): @@ -50,8 +65,11 @@ def download_url(url: str, file_path: str) -> None: """ def progress(count, block_size, total_size): - sys.stdout.write('\rDownloading {} from {} {:.1f}%'.format(file_path, url, float(count * block_size) / float( - total_size) * 100.0)) + sys.stdout.write( + '\rDownloading {} from {} {:.1f}%'.format( + file_path, url, float(count * block_size) / float(total_size) * 100.0 + ) + ) sys.stdout.flush() if not os.path.isfile(file_path): @@ -63,7 +81,9 @@ def progress(count, block_size, total_size): sys.stdout.write('\n') sys.stdout.flush() file_info = os.stat(f) - logging.info(f'Successfully downloaded {os.path.basename(file_path)} {file_info.st_size} bytes.') + logging.info( + f'Successfully downloaded {os.path.basename(file_path)} {file_info.st_size} bytes.' + ) else: file_info = os.stat(file_path) logging.info(f'File already exists: {file_path} {file_info.st_size} bytes.') @@ -107,5 +127,8 @@ def ordered_insert(ordered_stack: List, value, f: Callable[[Any, Any], bool]): (and truncated if necessary). :return: New instance of stack with inserted element. 
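One detail worth keeping in mind for `common/torch_utils.py` above: the sMAPE losses route their division through `div_no_nan`, which zeroes out NaN and inf results so that all-zero targets or empty mask positions cannot poison the batch loss. A quick behaviour check (toy tensors, import path assumed):

```python
# div_no_nan turns 0/0 and x/0 into 0.0 instead of nan/inf.
import torch as t
from common.torch_utils import div_no_nan

a = t.tensor([1.0, 0.0, 2.0])
b = t.tensor([0.0, 0.0, 4.0])
print(div_no_nan(a, b))   # tensor([0.0000, 0.0000, 0.5000])
```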
""" - return (list(takewhile(lambda x: f(x, value), ordered_stack)) + [value] + - list(dropwhile(lambda x: f(x, value), ordered_stack)))[:len(ordered_stack)] + return ( + list(takewhile(lambda x: f(x, value), ordered_stack)) + + [value] + + list(dropwhile(lambda x: f(x, value), ordered_stack)) + )[: len(ordered_stack)] diff --git a/benchmark/metalearned/dataset.py b/benchmark/metalearned/dataset.py index 6b41842..d3aad4c 100644 --- a/benchmark/metalearned/dataset.py +++ b/benchmark/metalearned/dataset.py @@ -7,11 +7,18 @@ import numpy as np import pandas as pd import patoolib -from tqdm import tqdm - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Year, Month, Day, Hour +from common.timeseries import ( + Day, + Hour, + Month, + Timeseries, + TimeseriesBundle, + TimeseriesLoader, + Year, +) from common.utils import download_url +from tqdm import tqdm @dataclass(frozen=True) @@ -31,29 +38,37 @@ def period_map(self): class M4Dataset(TimeseriesLoader): def download(self) -> TimeseriesBundle: url_template = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/{}/{}-{}.csv' - m4_info_url = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/M4-info.csv' + m4_info_url = ( + 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/M4-info.csv' + ) m4_info_path = os.path.join(self.path, 'M4info.csv') ssl._create_default_https_context = ssl._create_unverified_context download_url(m4_info_url, m4_info_path) for sp in M4Meta.seasonal_patterns: - training_url = url_template.format("Train", sp, "train") - download_url(training_url, os.path.join(M4Meta.dataset_path, f'{sp}-train.csv')) - test_url = url_template.format("Test", sp, "test") + training_url = url_template.format('Train', sp, 'train') + download_url( + training_url, os.path.join(M4Meta.dataset_path, f'{sp}-train.csv') + ) + test_url = url_template.format('Test', sp, 'test') download_url(test_url, os.path.join(M4Meta.dataset_path, f'{sp}-test.csv')) # Download naive2 forecasts, needed for OWA metric m4_naive2_archive = os.path.join(self.path, 'naive2.rar') - download_url('https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-Naive2.rar', - m4_naive2_archive) + download_url( + 'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-Naive2.rar', + m4_naive2_archive, + ) patoolib.extract_archive(m4_naive2_archive, outdir=self.path) os.remove(m4_naive2_archive) # Download m4 competition winner predictions, for summary testing purposes only m4_winner_archive = os.path.join(self.path, 'submission-118.rar') - download_url('https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-118.rar', - m4_winner_archive) + download_url( + 'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-118.rar', + m4_winner_archive, + ) patoolib.extract_archive(m4_winner_archive, outdir=self.path) os.remove(m4_winner_archive) @@ -66,12 +81,14 @@ def download(self) -> TimeseriesBundle: 'Monthly': (Month(), 1), 'Weekly': (Day(), 7), 'Daily': (Day(), 1), - 'Hourly': (Hour(), 1) + 'Hourly': (Hour(), 1), } all_timeseries = [] for sp in M4Meta.seasonal_patterns: - training_set = pd.read_csv(os.path.join(M4Meta.dataset_path, f'{sp}-train.csv')) + training_set = pd.read_csv( + os.path.join(M4Meta.dataset_path, f'{sp}-train.csv') + ) test_set = pd.read_csv(os.path.join(M4Meta.dataset_path, f'{sp}-test.csv')) time_unit, frequency = time_units_mapping[sp] @@ -89,11 
+106,15 @@ def download(self) -> TimeseriesBundle: parsed_date = None for parsing_format in parsing_formats: try: - parsed_date = datetime.strptime(timeseries_info.StartingDate, parsing_format) + parsed_date = datetime.strptime( + timeseries_info.StartingDate, parsing_format + ) except Exception: continue if parsed_date is None: - raise ValueError(f'Could not parse {timeseries_info.StartingDate} for {timeseries_id}') + raise ValueError( + f'Could not parse {timeseries_info.StartingDate} for {timeseries_id}' + ) # all M4 years are in the 1900s or 1800s if parsed_date.year > 2000: parsed_date = parsed_date.replace(year=parsed_date.year - 100) @@ -101,14 +122,15 @@ def download(self) -> TimeseriesBundle: if parsed_date.year > 2000: print('over') - timeseries = Timeseries(id=timeseries_id, - start_date=parsed_date, - time_unit=time_unit, - frequency=frequency, - period=int(timeseries_info.Frequency), - values=np.concatenate([training_values, test_values]), - meta={'seasonal_pattern': sp} - ) + timeseries = Timeseries( + id=timeseries_id, + start_date=parsed_date, + time_unit=time_unit, + frequency=frequency, + period=int(timeseries_info.Frequency), + values=np.concatenate([training_values, test_values]), + meta={'seasonal_pattern': sp}, + ) all_timeseries.append(timeseries) return TimeseriesBundle(all_timeseries) @@ -116,7 +138,9 @@ def download(self) -> TimeseriesBundle: def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: bundle = self.load_cache() horizons_map = M4Meta().horizons_map() - return bundle.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) + return bundle.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) @staticmethod def filter(bundle: TimeseriesBundle, seasonal_pattern: str) -> TimeseriesBundle: diff --git a/benchmark/metalearned/experiments/tl/main.py b/benchmark/metalearned/experiments/tl/main.py index 04173ec..1249558 100644 --- a/benchmark/metalearned/experiments/tl/main.py +++ b/benchmark/metalearned/experiments/tl/main.py @@ -4,18 +4,21 @@ import numpy as np import pandas as pd import torch as t -from fire import Fire -from scipy.interpolate import interp1d -from torch import optim - -from common.experiment import create_experiment -from common.experiment import load_experiment_parameters +from common.experiment import create_experiment, load_experiment_parameters from common.samplers import UnivariateTimeseriesSampler from common.settings import experiment_path from common.timeseries import TimeseriesBundle -from common.torch_utils import SnapshotManager, to_device, to_tensor, mase_loss, mape_loss, smape_2_loss +from common.torch_utils import ( + SnapshotManager, + mape_loss, + mase_loss, + smape_2_loss, + to_device, + to_tensor, +) from common.utils import get_module_path from experiments.tl.parameters import parameters +from fire import Fire from models.nbeats_torch import nbeats_generic, nbeats_interpretable from resources.electricity.dataset import ElectricityDataset, ElectricityMeta from resources.fred.dataset import FredDataset, FredMeta @@ -23,19 +26,27 @@ from resources.m4.dataset import M4Dataset, M4Meta from resources.tourism.dataset import TourismDataset, TourismMeta from resources.traffic.dataset import TrafficDataset, TrafficMeta +from scipy.interpolate import interp1d +from torch import optim module_path = get_module_path() def init(name: str): - create_experiment(experiment_path=experiment_path(module_path, name), - parameters=parameters[name], - command=lambda path, params: f'python 
{module_path}/main.py run --path={path}') + create_experiment( + experiment_path=experiment_path(module_path, name), + parameters=parameters[name], + command=lambda path, params: f'python {module_path}/main.py run --path={path}', + ) def run(path: str): experiment_parameters = load_experiment_parameters(path) - source_dataset_name = experiment_parameters['source_dataset'] if 'source_dataset' in experiment_parameters else 'M4' + source_dataset_name = ( + experiment_parameters['source_dataset'] + if 'source_dataset' in experiment_parameters + else 'M4' + ) loss_name = experiment_parameters['loss_name'] model_horizons = { @@ -95,67 +106,90 @@ def run(path: str): tl_models = {} for model_name, horizon in model_horizons.items(): sp = model_sps[model_name] - training_subset = source_dataset.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + training_subset = source_dataset.filter( + lambda ts: ts.meta['seasonal_pattern'] == sp + ) training_values = np.array(training_subset.values()) if source_dataset_name == 'FRED': # interpolate monthly data if model_name == 'H24': training_values = [] for values in training_subset.values(): - interpolation_fn = interp1d(x=np.array(range(len(values))), y=values, kind='linear') - training_values.append(interpolation_fn(np.arange(0, len(values) - 0.5, 0.5))) + interpolation_fn = interp1d( + x=np.array(range(len(values))), y=values, kind='linear' + ) + training_values.append( + interpolation_fn(np.arange(0, len(values) - 0.5, 0.5)) + ) training_values = np.array(training_values) elif model_name == 'H48': training_values = [] for values in training_subset.values(): - interpolation_fn = interp1d(x=np.array(range(len(values))), y=values, kind='linear') - training_values.append(interpolation_fn(np.arange(0, len(values) - 0.75, 0.25))) + interpolation_fn = interp1d( + x=np.array(range(len(values))), y=values, kind='linear' + ) + training_values.append( + interpolation_fn(np.arange(0, len(values) - 0.75, 0.25)) + ) training_values = np.array(training_values) input_size = experiment_parameters['lookback_period'] * horizon - training_dataset = UnivariateTimeseriesSampler(timeseries=training_values, - insample_size=input_size, - outsample_size=horizon, - window_sampling_limit=int( - experiment_parameters['history_horizons'] * horizon), - batch_size=experiment_parameters['batch_size']) + training_dataset = UnivariateTimeseriesSampler( + timeseries=training_values, + insample_size=input_size, + outsample_size=horizon, + window_sampling_limit=int( + experiment_parameters['history_horizons'] * horizon + ), + batch_size=experiment_parameters['batch_size'], + ) # # Training # snapshot_dir = os.path.join(path, 'snapshots', model_name) - snapshot_manager = SnapshotManager(snapshot_dir=snapshot_dir, - logging_frequency=experiment_parameters['logging_frequency'], - snapshot_frequency=experiment_parameters['snapshot_frequency']) + snapshot_manager = SnapshotManager( + snapshot_dir=snapshot_dir, + logging_frequency=experiment_parameters['logging_frequency'], + snapshot_frequency=experiment_parameters['snapshot_frequency'], + ) if experiment_parameters['model_type'] == 'generic': - model = nbeats_generic(input_size=input_size, - output_size=horizon, - blocks=experiment_parameters['blocks'], - stacks=experiment_parameters['stacks'], - fc_layers=experiment_parameters['layers'], - fc_layers_size=experiment_parameters['width'], - scaling=experiment_parameters['scaling'], - mode=experiment_parameters['mode']) + model = nbeats_generic( + input_size=input_size, + output_size=horizon, + 
blocks=experiment_parameters['blocks'], + stacks=experiment_parameters['stacks'], + fc_layers=experiment_parameters['layers'], + fc_layers_size=experiment_parameters['width'], + scaling=experiment_parameters['scaling'], + mode=experiment_parameters['mode'], + ) else: - model = nbeats_interpretable(input_size=input_size, - output_size=horizon, - trend_blocks=experiment_parameters['trend_blocks'], - trend_fc_layers=experiment_parameters['layers'], - trend_fc_layers_size=experiment_parameters['trend_fc_layers_size'], - degree_of_polynomial=experiment_parameters['degree_of_polynomial'], - seasonality_blocks=experiment_parameters['seasonality_blocks'], - seasonality_fc_layers=experiment_parameters['layers'], - seasonality_fc_layers_size=experiment_parameters['seasonality_fc_layers_size'], - num_of_harmonics=experiment_parameters['num_of_harmonics'], - scaling=experiment_parameters['scaling'], - mode=experiment_parameters['mode']) + model = nbeats_interpretable( + input_size=input_size, + output_size=horizon, + trend_blocks=experiment_parameters['trend_blocks'], + trend_fc_layers=experiment_parameters['layers'], + trend_fc_layers_size=experiment_parameters['trend_fc_layers_size'], + degree_of_polynomial=experiment_parameters['degree_of_polynomial'], + seasonality_blocks=experiment_parameters['seasonality_blocks'], + seasonality_fc_layers=experiment_parameters['layers'], + seasonality_fc_layers_size=experiment_parameters[ + 'seasonality_fc_layers_size' + ], + num_of_harmonics=experiment_parameters['num_of_harmonics'], + scaling=experiment_parameters['scaling'], + mode=experiment_parameters['mode'], + ) model = to_device(model) - optimizer = optim.Adam(model.parameters(), - lr=experiment_parameters['learning_rate'], - weight_decay=0.0) + optimizer = optim.Adam( + model.parameters(), + lr=experiment_parameters['learning_rate'], + weight_decay=0.0, + ) lr_decay_step = experiment_parameters['iterations'] // 3 if lr_decay_step == 0: @@ -176,7 +210,9 @@ def run(path: str): if loss_name == 'MAPE': training_loss = mape_loss(forecast, y, y_mask) elif loss_name == 'MASE': - training_loss = mase_loss(x, training_subset.timeseries[0].period, forecast, y, y_mask) + training_loss = mase_loss( + x, training_subset.timeseries[0].period, forecast, y, y_mask + ) elif loss_name == 'SMAPE': training_loss = smape_2_loss(forecast, y, y_mask) else: @@ -190,13 +226,22 @@ def run(path: str): optimizer.step() for param_group in optimizer.param_groups: - param_group['lr'] = experiment_parameters['learning_rate'] * 0.5 ** (i // lr_decay_step) - - snapshot_manager.register(iteration=i, - training_loss=float(training_loss), - validation_loss=np.nan, model=model, - optimizer=optimizer) - tl_models[model_name] = {'p_model': model, 'p_input_size': input_size, 'p_horizon': horizon} + param_group['lr'] = experiment_parameters['learning_rate'] * 0.5 ** ( + i // lr_decay_step + ) + + snapshot_manager.register( + iteration=i, + training_loss=float(training_loss), + validation_loss=np.nan, + model=model, + optimizer=optimizer, + ) + tl_models[model_name] = { + 'p_model': model, + 'p_input_size': input_size, + 'p_horizon': horizon, + } # # Predictions @@ -205,113 +250,233 @@ def run(path: str): def forecast(bundle: TimeseriesBundle, p_model, p_input_size, p_horizon): forecasts = [] input_set = np.array(bundle.values()) - input_set = UnivariateTimeseriesSampler(timeseries=input_set, - insample_size=p_input_size, - outsample_size=0, - window_sampling_limit=1, - batch_size=1) + input_set = UnivariateTimeseriesSampler( + 
timeseries=input_set, + insample_size=p_input_size, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + ) p_x, p_x_mask = map(to_tensor, input_set.sequential_latest_insamples()) p_model.eval() with t.no_grad(): forecasts.extend(p_model(p_x, p_x_mask).cpu().detach().numpy()) - forecasts_df = pd.DataFrame(forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon)]) + forecasts_df = pd.DataFrame( + forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon)] + ) forecasts_df.index = bundle.ids() forecasts_df.index.name = 'id' return forecasts_df - def rolling_daily_forecast(base_insample: TimeseriesBundle, rolling_insample: TimeseriesBundle, - p_model, p_input_size, p_horizon): + def rolling_daily_forecast( + base_insample: TimeseriesBundle, + rolling_insample: TimeseriesBundle, + p_model, + p_input_size, + p_horizon, + ): forecasts = [] base_insample_values = np.array(base_insample.values()) rolling_insample_values = np.array(rolling_insample.values()) for window_id in range(7): - insample = np.concatenate([base_insample_values, rolling_insample_values[:, :window_id * p_horizon]], - axis=1) - input_set = UnivariateTimeseriesSampler(timeseries=insample, - insample_size=p_input_size, - outsample_size=0, - window_sampling_limit=1, - batch_size=1) + insample = np.concatenate( + [ + base_insample_values, + rolling_insample_values[:, : window_id * p_horizon], + ], + axis=1, + ) + input_set = UnivariateTimeseriesSampler( + timeseries=insample, + insample_size=p_input_size, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + ) p_x, p_x_mask = map(to_tensor, input_set.sequential_latest_insamples()) p_model.eval() with t.no_grad(): window_forecast = p_model(p_x, p_x_mask).cpu().detach().numpy() - forecasts = window_forecast if len(forecasts) == 0 else np.concatenate([forecasts, window_forecast], - axis=1) - - forecasts_df = pd.DataFrame(forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon * 7)]) + forecasts = ( + window_forecast + if len(forecasts) == 0 + else np.concatenate([forecasts, window_forecast], axis=1) + ) + + forecasts_df = pd.DataFrame( + forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon * 7)] + ) forecasts_df.index = base_insample.ids() forecasts_df.index.name = 'id' - forecasts_df.columns = [f'V{i}' for i in range(1, len(forecasts_df.columns) + 1)] + forecasts_df.columns = [ + f'V{i}' for i in range(1, len(forecasts_df.columns) + 1) + ] return forecasts_df # M4 target_input, _ = M4Dataset(M4Meta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), **tl_models['Y6']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), **tl_models['M18']) - weekly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Weekly'), **tl_models['W13']) - daily = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Daily'), **tl_models['D14']) - hourly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Hourly'), **tl_models['H48']) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), + **tl_models['Y6'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), + 
**tl_models['M18'], + ) + weekly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Weekly'), + **tl_models['W13'], + ) + daily = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Daily'), + **tl_models['D14'], + ) + hourly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Hourly'), + **tl_models['H48'], + ) pd.concat([yearly, quarterly, monthly, weekly, daily, hourly], sort=False).to_csv( - os.path.join(os.path.join(path, 'M4.csv'))) + os.path.join(os.path.join(path, 'M4.csv')) + ) # M3 target_input, _ = M3Dataset(M3Meta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Year'), **tl_models['Y6']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Quart'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Month'), **tl_models['M18']) - others = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Other'), **tl_models['Q8']) - pd.concat([yearly, quarterly, monthly, others], sort=False).to_csv(os.path.join(os.path.join(path, 'M3.csv'))) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Year'), + **tl_models['Y6'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Quart'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Month'), + **tl_models['M18'], + ) + others = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Other'), + **tl_models['Q8'], + ) + pd.concat([yearly, quarterly, monthly, others], sort=False).to_csv( + os.path.join(os.path.join(path, 'M3.csv')) + ) # Tourism target_input, _ = TourismDataset(TourismMeta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), **tl_models['Y4']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), **tl_models['M24']) - pd.concat([yearly, quarterly, monthly], sort=False).to_csv(os.path.join(os.path.join(path, 'tourism.csv'))) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), + **tl_models['Y4'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), + **tl_models['M24'], + ) + pd.concat([yearly, quarterly, monthly], sort=False).to_csv( + os.path.join(os.path.join(path, 'tourism.csv')) + ) # Electricity - target_input, rolling_target_input = ElectricityDataset(ElectricityMeta.dataset_path). \ - load_cache().split(lambda ts: ts.split(-24 * 7)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'electricity_last_window.csv'))) - - target_input, rolling_target_input = ElectricityDataset(ElectricityMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(ElectricityMeta.deepar_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). 
\ - to_csv(os.path.join(os.path.join(path, 'electricity_deepar.csv'))) - - target_input, rolling_target_input = ElectricityDataset(ElectricityMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(ElectricityMeta.deepfact_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'electricity_deepfactors.csv'))) + target_input, rolling_target_input = ( + ElectricityDataset(ElectricityMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split(-24 * 7)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'electricity_last_window.csv'))) + + target_input, rolling_target_input = ( + ElectricityDataset(ElectricityMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(ElectricityMeta.deepar_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'electricity_deepar.csv'))) + + target_input, rolling_target_input = ( + ElectricityDataset(ElectricityMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(ElectricityMeta.deepfact_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'electricity_deepfactors.csv'))) # Traffic - target_input, rolling_target_input = TrafficDataset(TrafficMeta.dataset_path).load_cache().\ - split(lambda ts: ts.split(-24 * 7)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'traffic_last_window.csv'))) - - target_input, rolling_target_input = TrafficDataset(TrafficMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(TrafficMeta.deepar_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'traffic_deepar.csv'))) - - target_input, rolling_target_input = TrafficDataset(TrafficMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(TrafficMeta.deepfact_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). 
\ - to_csv(os.path.join(os.path.join(path, 'traffic_deepfactors.csv'))) + target_input, rolling_target_input = ( + TrafficDataset(TrafficMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split(-24 * 7)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'traffic_last_window.csv'))) + + target_input, rolling_target_input = ( + TrafficDataset(TrafficMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(TrafficMeta.deepar_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'traffic_deepar.csv'))) + + target_input, rolling_target_input = ( + TrafficDataset(TrafficMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(TrafficMeta.deepfact_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'traffic_deepfactors.csv'))) # FRED target_input, _ = FredDataset(FredMeta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), **tl_models['Y6']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), **tl_models['M18']) - weekly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Weekly'), **tl_models['W13']) - daily = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Daily'), **tl_models['D14']) - pd.concat([yearly, quarterly, monthly, weekly, daily]).to_csv(os.path.join(os.path.join(path, 'fred.csv'))) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), + **tl_models['Y6'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), + **tl_models['M18'], + ) + weekly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Weekly'), + **tl_models['W13'], + ) + daily = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Daily'), + **tl_models['D14'], + ) + pd.concat([yearly, quarterly, monthly, weekly, daily]).to_csv( + os.path.join(os.path.join(path, 'fred.csv')) + ) def evaluate(name: str, summary_filter: str, validation_mode: bool = False): diff --git a/benchmark/metalearned/experiments/tl/parameters.py b/benchmark/metalearned/experiments/tl/parameters.py index 789b624..6e70d26 100644 --- a/benchmark/metalearned/experiments/tl/parameters.py +++ b/benchmark/metalearned/experiments/tl/parameters.py @@ -1,31 +1,24 @@ common = { 'repeats': list(range(10)), - 'lookback_period': list(range(2, 8)), 'loss_name': 'MASE', 'scaling': 'max', 'iterations': 15000, 'history_horizons': 10, - 'batch_size': 1024, 'learning_rate': 0.001, - 'mode': 'dress', - 'width': 512, 'layers': 4, 'blocks': 10, 'stacks': 1, - # interpretable 'trend_blocks': 3, 'trend_fc_layers_size': 256, 'degree_of_polynomial': 3, - 'seasonality_blocks': 3, 'seasonality_fc_layers_size': 2048, 'num_of_harmonics': 1, - 'logging_frequency': 500, 'snapshot_frequency': 5000, } @@ -39,7 +32,7 @@ 'blocks': 30, 'stacks': 1, 'iterations': [5000, 
15000], - 'loss_name': ['MASE', 'MAPE', 'SMAPE'] + 'loss_name': ['MASE', 'MAPE', 'SMAPE'], }, 'shared_grid': { **common, @@ -76,5 +69,5 @@ 'blocks': 1, 'stacks': 30, 'mode': 'dress', - } + }, } diff --git a/benchmark/metalearned/main.py b/benchmark/metalearned/main.py index dd45654..db0a1eb 100644 --- a/benchmark/metalearned/main.py +++ b/benchmark/metalearned/main.py @@ -4,18 +4,21 @@ import numpy as np import pandas as pd import torch as t -from fire import Fire -from scipy.interpolate import interp1d -from torch import optim - -from common.experiment import create_experiment -from common.experiment import load_experiment_parameters +from common.experiment import create_experiment, load_experiment_parameters from common.samplers import UnivariateTimeseriesSampler from common.settings import experiment_path from common.timeseries import TimeseriesBundle -from common.torch_utils import SnapshotManager, to_device, to_tensor, mase_loss, mape_loss, smape_2_loss +from common.torch_utils import ( + SnapshotManager, + mape_loss, + mase_loss, + smape_2_loss, + to_device, + to_tensor, +) from common.utils import get_module_path from experiments.tl.parameters import parameters +from fire import Fire from models.nbeats_torch import nbeats_generic, nbeats_interpretable from resources.electricity.dataset import ElectricityDataset, ElectricityMeta from resources.fred.dataset import FredDataset, FredMeta @@ -23,19 +26,27 @@ from resources.m4.dataset import M4Dataset, M4Meta from resources.tourism.dataset import TourismDataset, TourismMeta from resources.traffic.dataset import TrafficDataset, TrafficMeta +from scipy.interpolate import interp1d +from torch import optim module_path = get_module_path() def init(name: str): - create_experiment(experiment_path=experiment_path(module_path, name), - parameters=parameters[name], - command=lambda path, params: f'python {module_path}/main.py run --path={path}') + create_experiment( + experiment_path=experiment_path(module_path, name), + parameters=parameters[name], + command=lambda path, params: f'python {module_path}/main.py run --path={path}', + ) def run(path: str): experiment_parameters = load_experiment_parameters(path) - source_dataset_name = experiment_parameters['source_dataset'] if 'source_dataset' in experiment_parameters else 'M4' + source_dataset_name = ( + experiment_parameters['source_dataset'] + if 'source_dataset' in experiment_parameters + else 'M4' + ) loss_name = experiment_parameters['loss_name'] model_horizons = { @@ -95,67 +106,90 @@ def run(path: str): tl_models = {} for model_name, horizon in model_horizons.items(): sp = model_sps[model_name] - training_subset = source_dataset.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + training_subset = source_dataset.filter( + lambda ts: ts.meta['seasonal_pattern'] == sp + ) training_values = np.array(training_subset.values()) if source_dataset_name == 'FRED': # interpolate monthly data if model_name == 'H24': training_values = [] for values in training_subset.values(): - interpolation_fn = interp1d(x=np.array(range(len(values))), y=values, kind='linear') - training_values.append(interpolation_fn(np.arange(0, len(values) - 0.5, 0.5))) + interpolation_fn = interp1d( + x=np.array(range(len(values))), y=values, kind='linear' + ) + training_values.append( + interpolation_fn(np.arange(0, len(values) - 0.5, 0.5)) + ) training_values = np.array(training_values) elif model_name == 'H48': training_values = [] for values in training_subset.values(): - interpolation_fn = 
interp1d(x=np.array(range(len(values))), y=values, kind='linear') - training_values.append(interpolation_fn(np.arange(0, len(values) - 0.75, 0.25))) + interpolation_fn = interp1d( + x=np.array(range(len(values))), y=values, kind='linear' + ) + training_values.append( + interpolation_fn(np.arange(0, len(values) - 0.75, 0.25)) + ) training_values = np.array(training_values) input_size = experiment_parameters['lookback_period'] * horizon - training_dataset = UnivariateTimeseriesSampler(timeseries=training_values, - insample_size=input_size, - outsample_size=horizon, - window_sampling_limit=int( - experiment_parameters['history_horizons'] * horizon), - batch_size=experiment_parameters['batch_size']) + training_dataset = UnivariateTimeseriesSampler( + timeseries=training_values, + insample_size=input_size, + outsample_size=horizon, + window_sampling_limit=int( + experiment_parameters['history_horizons'] * horizon + ), + batch_size=experiment_parameters['batch_size'], + ) # # Training # snapshot_dir = os.path.join(path, 'snapshots', model_name) - snapshot_manager = SnapshotManager(snapshot_dir=snapshot_dir, - logging_frequency=experiment_parameters['logging_frequency'], - snapshot_frequency=experiment_parameters['snapshot_frequency']) + snapshot_manager = SnapshotManager( + snapshot_dir=snapshot_dir, + logging_frequency=experiment_parameters['logging_frequency'], + snapshot_frequency=experiment_parameters['snapshot_frequency'], + ) if experiment_parameters['model_type'] == 'generic': - model = nbeats_generic(input_size=input_size, - output_size=horizon, - blocks=experiment_parameters['blocks'], - stacks=experiment_parameters['stacks'], - fc_layers=experiment_parameters['layers'], - fc_layers_size=experiment_parameters['width'], - scaling=experiment_parameters['scaling'], - mode=experiment_parameters['mode']) + model = nbeats_generic( + input_size=input_size, + output_size=horizon, + blocks=experiment_parameters['blocks'], + stacks=experiment_parameters['stacks'], + fc_layers=experiment_parameters['layers'], + fc_layers_size=experiment_parameters['width'], + scaling=experiment_parameters['scaling'], + mode=experiment_parameters['mode'], + ) else: - model = nbeats_interpretable(input_size=input_size, - output_size=horizon, - trend_blocks=experiment_parameters['trend_blocks'], - trend_fc_layers=experiment_parameters['layers'], - trend_fc_layers_size=experiment_parameters['trend_fc_layers_size'], - degree_of_polynomial=experiment_parameters['degree_of_polynomial'], - seasonality_blocks=experiment_parameters['seasonality_blocks'], - seasonality_fc_layers=experiment_parameters['layers'], - seasonality_fc_layers_size=experiment_parameters['seasonality_fc_layers_size'], - num_of_harmonics=experiment_parameters['num_of_harmonics'], - scaling=experiment_parameters['scaling'], - mode=experiment_parameters['mode']) + model = nbeats_interpretable( + input_size=input_size, + output_size=horizon, + trend_blocks=experiment_parameters['trend_blocks'], + trend_fc_layers=experiment_parameters['layers'], + trend_fc_layers_size=experiment_parameters['trend_fc_layers_size'], + degree_of_polynomial=experiment_parameters['degree_of_polynomial'], + seasonality_blocks=experiment_parameters['seasonality_blocks'], + seasonality_fc_layers=experiment_parameters['layers'], + seasonality_fc_layers_size=experiment_parameters[ + 'seasonality_fc_layers_size' + ], + num_of_harmonics=experiment_parameters['num_of_harmonics'], + scaling=experiment_parameters['scaling'], + mode=experiment_parameters['mode'], + ) model = 
to_device(model) - optimizer = optim.Adam(model.parameters(), - lr=experiment_parameters['learning_rate'], - weight_decay=0.0) + optimizer = optim.Adam( + model.parameters(), + lr=experiment_parameters['learning_rate'], + weight_decay=0.0, + ) lr_decay_step = experiment_parameters['iterations'] // 3 if lr_decay_step == 0: @@ -176,7 +210,9 @@ def run(path: str): if loss_name == 'MAPE': training_loss = mape_loss(forecast, y, y_mask) elif loss_name == 'MASE': - training_loss = mase_loss(x, training_subset.timeseries[0].period, forecast, y, y_mask) + training_loss = mase_loss( + x, training_subset.timeseries[0].period, forecast, y, y_mask + ) elif loss_name == 'SMAPE': training_loss = smape_2_loss(forecast, y, y_mask) else: @@ -190,13 +226,22 @@ def run(path: str): optimizer.step() for param_group in optimizer.param_groups: - param_group['lr'] = experiment_parameters['learning_rate'] * 0.5 ** (i // lr_decay_step) - - snapshot_manager.register(iteration=i, - training_loss=float(training_loss), - validation_loss=np.nan, model=model, - optimizer=optimizer) - tl_models[model_name] = {'p_model': model, 'p_input_size': input_size, 'p_horizon': horizon} + param_group['lr'] = experiment_parameters['learning_rate'] * 0.5 ** ( + i // lr_decay_step + ) + + snapshot_manager.register( + iteration=i, + training_loss=float(training_loss), + validation_loss=np.nan, + model=model, + optimizer=optimizer, + ) + tl_models[model_name] = { + 'p_model': model, + 'p_input_size': input_size, + 'p_horizon': horizon, + } # # Predictions @@ -205,104 +250,207 @@ def run(path: str): def forecast(bundle: TimeseriesBundle, p_model, p_input_size, p_horizon): forecasts = [] input_set = np.array(bundle.values()) - input_set = UnivariateTimeseriesSampler(timeseries=input_set, - insample_size=p_input_size, - outsample_size=0, - window_sampling_limit=1, - batch_size=1) + input_set = UnivariateTimeseriesSampler( + timeseries=input_set, + insample_size=p_input_size, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + ) p_x, p_x_mask = map(to_tensor, input_set.sequential_latest_insamples()) p_model.eval() with t.no_grad(): forecasts.extend(p_model(p_x, p_x_mask).cpu().detach().numpy()) - forecasts_df = pd.DataFrame(forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon)]) + forecasts_df = pd.DataFrame( + forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon)] + ) forecasts_df.index = bundle.ids() forecasts_df.index.name = 'id' return forecasts_df - def rolling_daily_forecast(base_insample: TimeseriesBundle, rolling_insample: TimeseriesBundle, - p_model, p_input_size, p_horizon): + def rolling_daily_forecast( + base_insample: TimeseriesBundle, + rolling_insample: TimeseriesBundle, + p_model, + p_input_size, + p_horizon, + ): forecasts = [] base_insample_values = np.array(base_insample.values()) rolling_insample_values = np.array(rolling_insample.values()) for window_id in range(7): - insample = np.concatenate([base_insample_values, rolling_insample_values[:, :window_id * p_horizon]], - axis=1) - input_set = UnivariateTimeseriesSampler(timeseries=insample, - insample_size=p_input_size, - outsample_size=0, - window_sampling_limit=1, - batch_size=1) + insample = np.concatenate( + [ + base_insample_values, + rolling_insample_values[:, : window_id * p_horizon], + ], + axis=1, + ) + input_set = UnivariateTimeseriesSampler( + timeseries=insample, + insample_size=p_input_size, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + ) p_x, p_x_mask = map(to_tensor, input_set.sequential_latest_insamples()) 
p_model.eval() with t.no_grad(): window_forecast = p_model(p_x, p_x_mask).cpu().detach().numpy() - forecasts = window_forecast if len(forecasts) == 0 else np.concatenate([forecasts, window_forecast], - axis=1) - - forecasts_df = pd.DataFrame(forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon * 7)]) + forecasts = ( + window_forecast + if len(forecasts) == 0 + else np.concatenate([forecasts, window_forecast], axis=1) + ) + + forecasts_df = pd.DataFrame( + forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon * 7)] + ) forecasts_df.index = base_insample.ids() forecasts_df.index.name = 'id' - forecasts_df.columns = [f'V{i}' for i in range(1, len(forecasts_df.columns) + 1)] + forecasts_df.columns = [ + f'V{i}' for i in range(1, len(forecasts_df.columns) + 1) + ] return forecasts_df # M4 target_input, _ = M4Dataset(M4Meta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), **tl_models['Y6']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), **tl_models['M18']) - weekly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Weekly'), **tl_models['W13']) - daily = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Daily'), **tl_models['D14']) - hourly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Hourly'), **tl_models['H48']) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), + **tl_models['Y6'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), + **tl_models['M18'], + ) + weekly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Weekly'), + **tl_models['W13'], + ) + daily = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Daily'), + **tl_models['D14'], + ) + hourly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Hourly'), + **tl_models['H48'], + ) pd.concat([yearly, quarterly, monthly, weekly, daily, hourly], sort=False).to_csv( - os.path.join(os.path.join(path, 'M4.csv'))) + os.path.join(os.path.join(path, 'M4.csv')) + ) # M3 target_input, _ = M3Dataset(M3Meta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Year'), **tl_models['Y6']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Quart'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Month'), **tl_models['M18']) - others = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Other'), **tl_models['Q8']) - pd.concat([yearly, quarterly, monthly, others], sort=False).to_csv(os.path.join(os.path.join(path, 'M3.csv'))) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Year'), + **tl_models['Y6'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Quart'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'M3Month'), + **tl_models['M18'], + ) + others = forecast( + target_input.filter(lambda ts: 
ts.meta['seasonal_pattern'] == 'M3Other'), + **tl_models['Q8'], + ) + pd.concat([yearly, quarterly, monthly, others], sort=False).to_csv( + os.path.join(os.path.join(path, 'M3.csv')) + ) # Tourism target_input, _ = TourismDataset(TourismMeta.dataset_path).standard_split() - yearly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), **tl_models['Y4']) - quarterly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), **tl_models['Q8']) - monthly = forecast(target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), **tl_models['M24']) - pd.concat([yearly, quarterly, monthly], sort=False).to_csv(os.path.join(os.path.join(path, 'tourism.csv'))) + yearly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Yearly'), + **tl_models['Y4'], + ) + quarterly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Quarterly'), + **tl_models['Q8'], + ) + monthly = forecast( + target_input.filter(lambda ts: ts.meta['seasonal_pattern'] == 'Monthly'), + **tl_models['M24'], + ) + pd.concat([yearly, quarterly, monthly], sort=False).to_csv( + os.path.join(os.path.join(path, 'tourism.csv')) + ) # Electricity - target_input, rolling_target_input = ElectricityDataset(ElectricityMeta.dataset_path). \ - load_cache().split(lambda ts: ts.split(-24 * 7)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'electricity_last_window.csv'))) - - target_input, rolling_target_input = ElectricityDataset(ElectricityMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(ElectricityMeta.deepar_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'electricity_deepar.csv'))) - - target_input, rolling_target_input = ElectricityDataset(ElectricityMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(ElectricityMeta.deepfact_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). 
\ - to_csv(os.path.join(os.path.join(path, 'electricity_deepfactors.csv'))) + target_input, rolling_target_input = ( + ElectricityDataset(ElectricityMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split(-24 * 7)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'electricity_last_window.csv'))) + + target_input, rolling_target_input = ( + ElectricityDataset(ElectricityMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(ElectricityMeta.deepar_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'electricity_deepar.csv'))) + + target_input, rolling_target_input = ( + ElectricityDataset(ElectricityMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(ElectricityMeta.deepfact_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'electricity_deepfactors.csv'))) # Traffic - target_input, rolling_target_input = TrafficDataset(TrafficMeta.dataset_path).load_cache().\ - split(lambda ts: ts.split(-24 * 7)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'traffic_last_window.csv'))) - - target_input, rolling_target_input = TrafficDataset(TrafficMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(TrafficMeta.deepar_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). \ - to_csv(os.path.join(os.path.join(path, 'traffic_deepar.csv'))) - - target_input, rolling_target_input = TrafficDataset(TrafficMeta.dataset_path).load_cache(). \ - split(lambda ts: ts.split_by_time(TrafficMeta.deepfact_split)) - rolling_daily_forecast(base_insample=target_input, rolling_insample=rolling_target_input, **tl_models['H24']). 
\ - to_csv(os.path.join(os.path.join(path, 'traffic_deepfactors.csv'))) + target_input, rolling_target_input = ( + TrafficDataset(TrafficMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split(-24 * 7)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'traffic_last_window.csv'))) + + target_input, rolling_target_input = ( + TrafficDataset(TrafficMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(TrafficMeta.deepar_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'traffic_deepar.csv'))) + + target_input, rolling_target_input = ( + TrafficDataset(TrafficMeta.dataset_path) + .load_cache() + .split(lambda ts: ts.split_by_time(TrafficMeta.deepfact_split)) + ) + rolling_daily_forecast( + base_insample=target_input, + rolling_insample=rolling_target_input, + **tl_models['H24'], + ).to_csv(os.path.join(os.path.join(path, 'traffic_deepfactors.csv'))) # FRED # target_input, _ = FredDataset(FredMeta.dataset_path).standard_split() diff --git a/benchmark/metalearned/main_ForecastPFN.py b/benchmark/metalearned/main_ForecastPFN.py index 0bd365e..ad51509 100644 --- a/benchmark/metalearned/main_ForecastPFN.py +++ b/benchmark/metalearned/main_ForecastPFN.py @@ -1,42 +1,40 @@ -from tqdm import tqdm import pathlib import sys + +from tqdm import tqdm + sys.path.append('..') import logging import os -import numpy as np import pandas as pd -import torch as t import tensorflow as tf -import tensorflow_io -from fire import Fire -from scipy.interpolate import interp1d -from torch import optim - -from common.experiment import create_experiment -from common.experiment import load_experiment_parameters -from data_provider.UnivariateTimeseriesSampler_WithStamps import UnivariateTimeseriesSampler_WithStamps -from exp.exp_ForecastPFN import Exp_ForecastPFN +from common.experiment import create_experiment, load_experiment_parameters +from common.metrics import smape from common.settings import experiment_path from common.timeseries import TimeseriesBundle -from common.torch_utils import SnapshotManager, to_device, to_tensor, mase_loss, mape_loss, smape_2_loss +from common.torch_utils import to_tensor from common.utils import get_module_path -from common.metrics import smape from experiments.tl.parameters import parameters -from models.nbeats_torch import nbeats_generic, nbeats_interpretable +from fire import Fire from resources.m3.dataset import M3Dataset, M3Meta -from resources.m4.dataset import M4Dataset, M4Meta from resources.tourism.dataset import TourismDataset, TourismMeta +from data_provider.UnivariateTimeseriesSampler_WithStamps import ( + UnivariateTimeseriesSampler_WithStamps, +) +from exp.exp_ForecastPFN import Exp_ForecastPFN + module_path = get_module_path() def init(name: str): - create_experiment(experiment_path=experiment_path(module_path, name), - parameters=parameters[name], - command=lambda path, params: f'python {module_path}/main.py run --path={path}') + create_experiment( + experiment_path=experiment_path(module_path, name), + parameters=parameters[name], + command=lambda path, params: f'python {module_path}/main.py run --path={path}', + ) def run(path: str): @@ -59,65 +57,79 @@ def run(path: str): input_size = experiment_parameters['lookback_period'] * horizon model = Exp_ForecastPFN(None) - tl_models[model_name] = {'p_model': model, 
'p_input_size': input_size, 'p_horizon': horizon} + tl_models[model_name] = { + 'p_model': model, + 'p_input_size': input_size, + 'p_horizon': horizon, + } # # Predictions # - def forecast(in_bundle: TimeseriesBundle, out_bundle: TimeseriesBundle, - sp: str, - p_model, p_input_size, p_horizon): + def forecast( + in_bundle: TimeseriesBundle, + out_bundle: TimeseriesBundle, + sp: str, + p_model, + p_input_size, + p_horizon, + ): forecasts = [] - in_bundle = in_bundle.filter( - lambda ts: ts.meta['seasonal_pattern'] == sp) - out_bundle = out_bundle.filter( - lambda ts: ts.meta['seasonal_pattern'] == sp) + in_bundle = in_bundle.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + out_bundle = out_bundle.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) input_set = in_bundle.values() input_timestamps = in_bundle.time_stamps() - input_set = UnivariateTimeseriesSampler_WithStamps(timeseries=input_set, - time_stamps=input_timestamps, - insample_size=p_input_size, - outsample_size=0, - window_sampling_limit=1, - batch_size=1, - time_features=p_model._ForecastPFN_time_features, - ) + input_set = UnivariateTimeseriesSampler_WithStamps( + timeseries=input_set, + time_stamps=input_timestamps, + insample_size=p_input_size, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + time_features=p_model._ForecastPFN_time_features, + ) p_x, p_x_mask, p_x_timestamps = input_set.sequential_latest_insamples() output_set = out_bundle.values() output_timestamps = out_bundle.time_stamps() - output_set = UnivariateTimeseriesSampler_WithStamps(timeseries=output_set, - time_stamps=output_timestamps, - insample_size=p_horizon, - outsample_size=0, - window_sampling_limit=1, - batch_size=1, - time_features=p_model._ForecastPFN_time_features, - ) + output_set = UnivariateTimeseriesSampler_WithStamps( + timeseries=output_set, + time_stamps=output_timestamps, + insample_size=p_horizon, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + time_features=p_model._ForecastPFN_time_features, + ) p_y, p_y_mask, p_y_timestamps = output_set.sequential_latest_insamples() x, x_mark, y, y_mark = p_x, p_x_timestamps, p_y, p_y_timestamps batch_x, batch_y = to_tensor(x)[:, :, None], to_tensor(y)[:, :, None] - batch_x_mark, batch_y_mark = to_tensor( - x_mark.astype(int)), to_tensor(y_mark.astype(int)) - - + batch_x_mark, batch_y_mark = ( + to_tensor(x_mark.astype(int)), + to_tensor(y_mark.astype(int)), + ) + model = tf.keras.models.load_model( - str(pathlib.Path(path).parent) + '/ckpts/', custom_objects={'smape': smape}) - for idx, (x, y, x_mark, y_mark) in tqdm(enumerate(zip(batch_x, batch_y, batch_x_mark, batch_y_mark))): + str(pathlib.Path(path).parent) + '/ckpts/', custom_objects={'smape': smape} + ) + for idx, (x, y, x_mark, y_mark) in tqdm( + enumerate(zip(batch_x, batch_y, batch_x_mark, batch_y_mark)) + ): pred = p_model._process_tuple(x, x_mark, y_mark, model, p_horizon) forecasts.extend(pred) - forecasts_df = pd.DataFrame(forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon)]) + forecasts_df = pd.DataFrame( + forecasts, columns=[f'V{idx + 1}' for idx in range(p_horizon)] + ) forecasts_df.index = in_bundle.ids() forecasts_df.index.name = 'id' return forecasts_df - # M4 # target_input, target_output = M4Dataset( # M4Meta.dataset_path).standard_split() @@ -131,20 +143,26 @@ def forecast(in_bundle: TimeseriesBundle, out_bundle: TimeseriesBundle, # os.path.join(os.path.join(path, 'M4.csv'))) # M3 - target_input, target_output = M3Dataset( - M3Meta.dataset_path).standard_split() + target_input, 
target_output = M3Dataset(M3Meta.dataset_path).standard_split() yearly = forecast(target_input, target_output, 'M3Year', **tl_models['Y6']) quarterly = forecast(target_input, target_output, 'M3Quart', **tl_models['Q8']) monthly = forecast(target_input, target_output, 'M3Month', **tl_models['M18']) others = forecast(target_input, target_output, 'M3Other', **tl_models['Q8']) - pd.concat([yearly, quarterly, monthly, others], sort=False).to_csv(os.path.join(os.path.join(path, 'M3.csv'))) + pd.concat([yearly, quarterly, monthly, others], sort=False).to_csv( + os.path.join(os.path.join(path, 'M3.csv')) + ) # Tourism - target_input, target_output = TourismDataset(TourismMeta.dataset_path).standard_split() + target_input, target_output = TourismDataset( + TourismMeta.dataset_path + ).standard_split() yearly = forecast(target_input, target_output, 'Yearly', **tl_models['Y4']) quarterly = forecast(target_input, target_output, 'Quarterly', **tl_models['Q8']) monthly = forecast(target_input, target_output, 'Monthly', **tl_models['M24']) - pd.concat([yearly, quarterly, monthly], sort=False).to_csv(os.path.join(os.path.join(path, 'tourism.csv'))) + pd.concat([yearly, quarterly, monthly], sort=False).to_csv( + os.path.join(os.path.join(path, 'tourism.csv')) + ) + def evaluate(name: str, summary_filter: str, validation_mode: bool = False): pass diff --git a/benchmark/metalearned/models/nbeats_torch.py b/benchmark/metalearned/models/nbeats_torch.py index 419b969..9569665 100644 --- a/benchmark/metalearned/models/nbeats_torch.py +++ b/benchmark/metalearned/models/nbeats_torch.py @@ -1,21 +1,21 @@ +from itertools import chain from typing import Tuple import numpy as np import torch as t -from itertools import chain - from common.torch_utils import div_no_nan class NBeatsFC(t.nn.Module): - def __init__(self, - input_size: int, - fc_layers: int, - output_size: int): + def __init__(self, input_size: int, fc_layers: int, output_size: int): super().__init__() - self.fc_layers = t.nn.ModuleList([t.nn.Linear(in_features=input_size, out_features=output_size)] + - [t.nn.Linear(in_features=output_size, out_features=output_size) - for _ in range(fc_layers - 1)]) + self.fc_layers = t.nn.ModuleList( + [t.nn.Linear(in_features=input_size, out_features=output_size)] + + [ + t.nn.Linear(in_features=output_size, out_features=output_size) + for _ in range(fc_layers - 1) + ] + ) def forward(self, x: t.Tensor) -> t.Tensor: output = x @@ -25,16 +25,16 @@ def forward(self, x: t.Tensor) -> t.Tensor: class NBeatsGenericBlock(t.nn.Module): - def __init__(self, - input_size: int, - fc_layers: int, - fc_layers_size: int, - output_size: int): + def __init__( + self, input_size: int, fc_layers: int, fc_layers_size: int, output_size: int + ): super().__init__() - self.fc = NBeatsFC(input_size=input_size, - fc_layers=fc_layers, - output_size=fc_layers_size) - self.basis = t.nn.Linear(in_features=fc_layers_size, out_features=input_size + output_size) + self.fc = NBeatsFC( + input_size=input_size, fc_layers=fc_layers, output_size=fc_layers_size + ) + self.basis = t.nn.Linear( + in_features=fc_layers_size, out_features=input_size + output_size + ) self.output_size = output_size self.backcast_dump = None @@ -51,55 +51,95 @@ def forward(self, x: t.Tensor) -> Tuple[t.Tensor, t.Tensor]: class NBeatsTrendBlock(t.nn.Module): - def __init__(self, - input_size: int, - fc_layers: int, - fc_layers_size: int, - degree_of_polynomial: int, - output_size: int): + def __init__( + self, + input_size: int, + fc_layers: int, + fc_layers_size: int, + 
degree_of_polynomial: int, + output_size: int, + ): super().__init__() - self.polynomial_size = degree_of_polynomial + 1 # degree of polynomial with constant term - self.fc = NBeatsFC(input_size=input_size, - fc_layers=fc_layers, - output_size=fc_layers_size) - self.basis = t.nn.Linear(in_features=fc_layers_size, out_features=2 * self.polynomial_size) + self.polynomial_size = ( + degree_of_polynomial + 1 + ) # degree of polynomial with constant term + self.fc = NBeatsFC( + input_size=input_size, fc_layers=fc_layers, output_size=fc_layers_size + ) + self.basis = t.nn.Linear( + in_features=fc_layers_size, out_features=2 * self.polynomial_size + ) self.output_size = output_size - self.backcast_time = np.concatenate([np.power(np.arange(input_size, dtype=np.float) / input_size, i)[None, :] - for i in range(self.polynomial_size)]) - self.forecast_time = np.concatenate([np.power(np.arange(output_size, dtype=np.float) / output_size, i)[None, :] - for i in range(self.polynomial_size)]) + self.backcast_time = np.concatenate( + [ + np.power(np.arange(input_size, dtype=np.float) / input_size, i)[None, :] + for i in range(self.polynomial_size) + ] + ) + self.forecast_time = np.concatenate( + [ + np.power(np.arange(output_size, dtype=np.float) / output_size, i)[ + None, : + ] + for i in range(self.polynomial_size) + ] + ) self.backcast_dump = None self.forecast_dump = None def forward(self, x: t.Tensor): thetas = self.basis(self.fc(x)) - backcast = t.einsum('bp,pt->bt', thetas[:, self.polynomial_size:], x.new(self.backcast_time)) - forecast = t.einsum('bp,pt->bt', thetas[:, :self.polynomial_size], x.new(self.forecast_time)) + backcast = t.einsum( + 'bp,pt->bt', thetas[:, self.polynomial_size :], x.new(self.backcast_time) + ) + forecast = t.einsum( + 'bp,pt->bt', thetas[:, : self.polynomial_size], x.new(self.forecast_time) + ) self.backcast_dump = backcast self.forecast_dump = forecast return backcast, forecast class NBeatsSeasonalityBlock(t.nn.Module): - def __init__(self, - input_size: int, - fc_layers: int, - fc_layers_size: int, - num_of_harmonics: int, - output_size: int): + def __init__( + self, + input_size: int, + fc_layers: int, + fc_layers_size: int, + num_of_harmonics: int, + output_size: int, + ): super().__init__() - self.basis_parameters = int(np.ceil(num_of_harmonics / 2 * output_size) - (num_of_harmonics - 1)) - - self.fc = NBeatsFC(input_size=input_size, - fc_layers=fc_layers, - output_size=fc_layers_size) - self.basis = t.nn.Linear(in_features=fc_layers_size, out_features=4 * self.basis_parameters) - - frequency = np.append(np.zeros(1, dtype=np.float32), - np.arange(num_of_harmonics, num_of_harmonics / 2 * output_size, - dtype=np.float32) / num_of_harmonics)[None, :] - backcast_grid = -2 * np.pi * (np.arange(input_size, dtype=np.float32)[:, None] / output_size) * frequency - forecast_grid = 2 * np.pi * (np.arange(output_size, dtype=np.float32)[:, None] / output_size) * frequency + self.basis_parameters = int( + np.ceil(num_of_harmonics / 2 * output_size) - (num_of_harmonics - 1) + ) + + self.fc = NBeatsFC( + input_size=input_size, fc_layers=fc_layers, output_size=fc_layers_size + ) + self.basis = t.nn.Linear( + in_features=fc_layers_size, out_features=4 * self.basis_parameters + ) + + frequency = np.append( + np.zeros(1, dtype=np.float32), + np.arange( + num_of_harmonics, num_of_harmonics / 2 * output_size, dtype=np.float32 + ) + / num_of_harmonics, + )[None, :] + backcast_grid = ( + -2 + * np.pi + * (np.arange(input_size, dtype=np.float32)[:, None] / output_size) + * frequency + ) + 
forecast_grid = ( + 2 + * np.pi + * (np.arange(output_size, dtype=np.float32)[:, None] / output_size) + * frequency + ) self.backcast_cos_template = np.transpose(np.cos(backcast_grid)) self.backcast_sin_template = np.transpose(np.sin(backcast_grid)) self.forecast_cos_template = np.transpose(np.cos(forecast_grid)) @@ -111,20 +151,28 @@ def __init__(self, def forward(self, x: t.Tensor): harmonics_weights = self.basis(self.fc(x)) - backcast_harmonics_cos = t.einsum('bp,pt->bt', - harmonics_weights[:, 2 * self.basis_parameters:3 * self.basis_parameters], - x.new(self.backcast_cos_template)) - backcast_harmonics_sin = t.einsum('bp,pt->bt', - harmonics_weights[:, 3 * self.basis_parameters:], - x.new(self.backcast_sin_template)) + backcast_harmonics_cos = t.einsum( + 'bp,pt->bt', + harmonics_weights[:, 2 * self.basis_parameters : 3 * self.basis_parameters], + x.new(self.backcast_cos_template), + ) + backcast_harmonics_sin = t.einsum( + 'bp,pt->bt', + harmonics_weights[:, 3 * self.basis_parameters :], + x.new(self.backcast_sin_template), + ) backcast = backcast_harmonics_sin + backcast_harmonics_cos - forecast_harmonics_cos = t.einsum('bp,pt->bt', - harmonics_weights[:, :self.basis_parameters], - x.new(self.forecast_cos_template)) - forecast_harmonics_sin = t.einsum('bp,pt->bt', - harmonics_weights[:, self.basis_parameters:2 * self.basis_parameters], - x.new(self.forecast_sin_template)) + forecast_harmonics_cos = t.einsum( + 'bp,pt->bt', + harmonics_weights[:, : self.basis_parameters], + x.new(self.forecast_cos_template), + ) + forecast_harmonics_sin = t.einsum( + 'bp,pt->bt', + harmonics_weights[:, self.basis_parameters : 2 * self.basis_parameters], + x.new(self.forecast_sin_template), + ) forecast = forecast_harmonics_sin + forecast_harmonics_cos self.backcast_dump = backcast @@ -189,41 +237,67 @@ def forward(self, x: t.Tensor, input_mask: t.Tensor) -> t.Tensor: return forecast -def nbeats_generic(input_size: int, output_size: int, - blocks: int = 1, stacks: int = 30, - fc_layers: int = 4, fc_layers_size: int = 512, - scaling: str = None, mode: str = 'dress'): - modules = [[NBeatsGenericBlock(input_size=input_size, - fc_layers=fc_layers, - fc_layers_size=fc_layers_size, - output_size=output_size)] * blocks for _ in range(stacks)] - - return NBeats(t.nn.ModuleList(list(chain.from_iterable(modules))), - scaling=scaling, - mode=mode) - - -def nbeats_interpretable(input_size: int, output_size: int, - trend_blocks: int = 3, - trend_fc_layers: int = 4, - trend_fc_layers_size: int = 256, - degree_of_polynomial: int = 3, - seasonality_blocks: int = 3, - seasonality_fc_layers: int = 4, - seasonality_fc_layers_size: int = 2048, - num_of_harmonics: int = 1, - scaling: str = None, - mode: str = 'dress'): - trend_block = NBeatsTrendBlock(input_size=input_size, - fc_layers=trend_fc_layers, - fc_layers_size=trend_fc_layers_size, - degree_of_polynomial=degree_of_polynomial, - output_size=output_size) - seasonality_block = NBeatsSeasonalityBlock(input_size=input_size, - fc_layers=seasonality_fc_layers, - fc_layers_size=seasonality_fc_layers_size, - num_of_harmonics=num_of_harmonics, - output_size=output_size) - return NBeats(t.nn.ModuleList( - [trend_block for _ in range(trend_blocks)] + [seasonality_block for _ in range(seasonality_blocks)]), - scaling=scaling, mode=mode) +def nbeats_generic( + input_size: int, + output_size: int, + blocks: int = 1, + stacks: int = 30, + fc_layers: int = 4, + fc_layers_size: int = 512, + scaling: str = None, + mode: str = 'dress', +): + modules = [ + [ + NBeatsGenericBlock( + 
input_size=input_size, + fc_layers=fc_layers, + fc_layers_size=fc_layers_size, + output_size=output_size, + ) + ] + * blocks + for _ in range(stacks) + ] + + return NBeats( + t.nn.ModuleList(list(chain.from_iterable(modules))), scaling=scaling, mode=mode + ) + + +def nbeats_interpretable( + input_size: int, + output_size: int, + trend_blocks: int = 3, + trend_fc_layers: int = 4, + trend_fc_layers_size: int = 256, + degree_of_polynomial: int = 3, + seasonality_blocks: int = 3, + seasonality_fc_layers: int = 4, + seasonality_fc_layers_size: int = 2048, + num_of_harmonics: int = 1, + scaling: str = None, + mode: str = 'dress', +): + trend_block = NBeatsTrendBlock( + input_size=input_size, + fc_layers=trend_fc_layers, + fc_layers_size=trend_fc_layers_size, + degree_of_polynomial=degree_of_polynomial, + output_size=output_size, + ) + seasonality_block = NBeatsSeasonalityBlock( + input_size=input_size, + fc_layers=seasonality_fc_layers, + fc_layers_size=seasonality_fc_layers_size, + num_of_harmonics=num_of_harmonics, + output_size=output_size, + ) + return NBeats( + t.nn.ModuleList( + [trend_block for _ in range(trend_blocks)] + + [seasonality_block for _ in range(seasonality_blocks)] + ), + scaling=scaling, + mode=mode, + ) diff --git a/benchmark/metalearned/resources/electricity/dataset.py b/benchmark/metalearned/resources/electricity/dataset.py index f6b5282..7336021 100644 --- a/benchmark/metalearned/resources/electricity/dataset.py +++ b/benchmark/metalearned/resources/electricity/dataset.py @@ -5,11 +5,10 @@ import numpy as np import patoolib -from tqdm import tqdm - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Hour +from common.timeseries import Hour, Timeseries, TimeseriesBundle, TimeseriesLoader from common.utils import download_url +from tqdm import tqdm """ Hourly aggregated dataset from https://archive.ics.uci.edu/ml/datasets/ElectricityLoadDiagrams20112014 @@ -39,20 +38,29 @@ class ElectricityDataset(TimeseriesLoader): def download(self) -> TimeseriesBundle: archive_file = os.path.join(self.path, 'dataset.zip') raw_file = os.path.join(self.path, 'LD2011_2014.txt') - download_url('https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip', - archive_file) + download_url( + 'https://archive.ics.uci.edu/ml/machine-learning-databases/00321/LD2011_2014.txt.zip', + archive_file, + ) patoolib.extract_archive(archive_file, outdir=self.path) with open(raw_file, 'r') as f: raw = f.readlines() - parsed_values = np.array(list(map( - lambda raw_line: np.array(raw_line.replace(',', '.').strip().split(';')[1:]).astype(np.float), tqdm(raw[1:]) - ))) + parsed_values = np.array( + list( + map( + lambda raw_line: np.array( + raw_line.replace(',', '.').strip().split(';')[1:] + ).astype(np.float), + tqdm(raw[1:]), + ) + ) + ) aggregated = [] for i in tqdm(range(0, parsed_values.shape[0], 4)): - aggregated.append(parsed_values[i:i + 4, :].sum(axis=0)) + aggregated.append(parsed_values[i : i + 4, :].sum(axis=0)) aggregated = np.array(aggregated) # regarding time labels, in dataset description authors specify @@ -62,19 +70,25 @@ def download(self) -> TimeseriesBundle: # neither for "2012-03-25 01:45:00", thus it's not clear how to deal with daylight saving time change in this # dataset. Taking into account this uncertainty the starting date is treated as UTC (without time changes). - start_date = datetime(2011, 1, 1, 1, 0, 0) # aggregated towards next hour instead of current hour. 
+ start_date = datetime( + 2011, 1, 1, 1, 0, 0 + ) # aggregated towards next hour instead of current hour. dataset = aggregated.T # use time step as second dimension. timeseries = [] for i, values in enumerate(dataset): - timeseries.append(Timeseries(id=str(i), - start_date=start_date, - time_unit=Hour(), - frequency=1, - period=ElectricityMeta.period, - values=values, - meta={})) + timeseries.append( + Timeseries( + id=str(i), + start_date=start_date, + time_unit=Hour(), + frequency=1, + period=ElectricityMeta.period, + values=values, + meta={}, + ) + ) return TimeseriesBundle(timeseries) def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: diff --git a/benchmark/metalearned/resources/electricity/evaluator.py b/benchmark/metalearned/resources/electricity/evaluator.py index 99e53c9..8f3561d 100644 --- a/benchmark/metalearned/resources/electricity/evaluator.py +++ b/benchmark/metalearned/resources/electricity/evaluator.py @@ -2,8 +2,7 @@ from typing import Callable import numpy as np - -from common.evaluator import Evaluator, EvaluationResult +from common.evaluator import EvaluationResult, Evaluator from common.metrics import nd from common.timeseries import TimeseriesBundle from common.utils import round_half_up @@ -15,6 +14,11 @@ class ElectricityEvaluator(Evaluator): precision: int = 2 def evaluate(self, forecasts: TimeseriesBundle) -> EvaluationResult: - return {'metric': round_half_up(self.metric_fn(np.array(forecasts.values()), - np.array(self.test_set.values())), - self.precision)} + return { + 'metric': round_half_up( + self.metric_fn( + np.array(forecasts.values()), np.array(self.test_set.values()) + ), + self.precision, + ) + } diff --git a/benchmark/metalearned/resources/fred/api.py b/benchmark/metalearned/resources/fred/api.py index d811291..cc654d0 100644 --- a/benchmark/metalearned/resources/fred/api.py +++ b/benchmark/metalearned/resources/fred/api.py @@ -30,8 +30,10 @@ def __init__(self, dataset_path): self.dataset_path = dataset_path if not os.path.isfile(key_path): - raise Exception(f'Cannot find FRED key file. Create an API key and place it in {key_path}. ' - 'https://research.stlouisfed.org/docs/api/api_key.html') + raise Exception( + f'Cannot find FRED key file. Create an API key and place it in {key_path}. ' + 'https://research.stlouisfed.org/docs/api/api_key.html' + ) with open(key_path, 'r') as f: key = f.readline().strip() @@ -42,7 +44,7 @@ def call(self, api_fn: Callable[[Fred], A], attempt=1) -> A: raise Exception('Maximum retries exceeded') try: return api_fn(self.api) - except Exception as e: + except Exception: # logging.info(f'API Error: {str(e)}. Waiting {self.wait_delay} seconds to retry. 
Attempt: {attempt}') time.sleep(self.wait_delay) return self.call(api_fn=api_fn, attempt=attempt + 1) @@ -57,14 +59,18 @@ def fetch_categories(self, parent=0): def fetch_observation(self, timeseries_id: str): try: - values = self.api.series.observations(timeseries_id, params={'output_type': 1, - 'realtime_start': '1776-07-04'}) + values = self.api.series.observations( + timeseries_id, params={'output_type': 1, 'realtime_start': '1776-07-04'} + ) values = values.groupby('date').head(1) values = values.set_index('date')['value'] except Exception as e: - if 'The series does not exist in ALFRED but may exist in FRED' in str(e) \ - or 'this exceeds the maximum number of vintage dates allowed' in str(e).lower() \ - or 'bad request' in str(e).lower(): + if ( + 'The series does not exist in ALFRED but may exist in FRED' in str(e) + or 'this exceeds the maximum number of vintage dates allowed' + in str(e).lower() + or 'bad request' in str(e).lower() + ): # There are a couple of situations where ALFRED (vintage data) # would not work properly values = self.api.series.observations(timeseries_id) @@ -88,7 +94,7 @@ def fetch_all(self): categories = pickle.load(f) logging.info(f'Loaded {len(categories)} categories') else: - logging.info(f'Fetching categories') + logging.info('Fetching categories') categories = self.fetch_categories() logging.info(f'Fetched {len(categories)} categories') with open(categories_cache_path, 'wb') as f: @@ -97,7 +103,7 @@ def fetch_all(self): # # Fetch timeseries # - logging.info(f'Fetching timeseries') + logging.info('Fetching timeseries') dataset_file_path = os.path.join(self.dataset_path, 'dataset.pickle') dataset = {'processed_categories': [], 'data': {}} @@ -105,38 +111,46 @@ def fetch_all(self): with open(dataset_file_path, 'rb') as cache_file_name: dataset = pickle.load(cache_file_name) - categories_to_process = [c for c in categories if c not in dataset['processed_categories']] + categories_to_process = [ + c for c in categories if c not in dataset['processed_categories'] + ] limit = 1000 for category_id in tqdm(categories_to_process): offset = 0 while True: - timeseries_meta = self.call(lambda api: api.category.series(category_id, params={'limit': limit, - 'offset': offset})) + timeseries_meta = self.call( + lambda api: api.category.series( + category_id, params={'limit': limit, 'offset': offset} + ) + ) if len(timeseries_meta) == 0: break for _, ts_meta in timeseries_meta.iterrows(): ts_id = str(ts_meta.id) - start_date = datetime.datetime.strptime(str(ts_meta.observation_start), '%Y-%m-%d %H:%M:%S') + start_date = datetime.datetime.strptime( + str(ts_meta.observation_start), '%Y-%m-%d %H:%M:%S' + ) time_unit = str(ts_meta.frequency) if ts_id not in dataset['data']: dataset['data'][ts_id] = { 'start_date': start_date, 'time_unit': time_unit, - 'meta': { - 'categories': [category_id] - }, - 'values': self.call(lambda api: self.fetch_observation(ts_id)) + 'meta': {'categories': [category_id]}, + 'values': self.call( + lambda api: self.fetch_observation(ts_id) + ), } else: dataset['data'][ts_id]['meta']['categories'].append(category_id) offset += 1 dataset['processed_categories'].append(category_id) - temp_file = tempfile.NamedTemporaryFile(dir=self.dataset_path, delete=False, mode='wb') + temp_file = tempfile.NamedTemporaryFile( + dir=self.dataset_path, delete=False, mode='wb' + ) pickle.dump(dataset, temp_file, protocol=pickle.HIGHEST_PROTOCOL) temp_file.flush() os.fsync(temp_file.fileno()) os.rename(temp_file.name, dataset_file_path) - diff --git 
a/benchmark/metalearned/resources/fred/dataset.py b/benchmark/metalearned/resources/fred/dataset.py index f419063..1fbdda6 100644 --- a/benchmark/metalearned/resources/fred/dataset.py +++ b/benchmark/metalearned/resources/fred/dataset.py @@ -4,11 +4,17 @@ from dataclasses import dataclass from typing import Tuple -from tqdm import tqdm - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Year, Month, Day +from common.timeseries import ( + Day, + Month, + Timeseries, + TimeseriesBundle, + TimeseriesLoader, + Year, +) from resources.fred.api import FredAPI +from tqdm import tqdm @dataclass(frozen=True) @@ -36,7 +42,7 @@ def download(self) -> TimeseriesBundle: 'Quarterly': (Month(), 3), 'Monthly': (Month(), 1), 'Weekly': (Day(), 7), - 'Daily': (Day(), 1) + 'Daily': (Day(), 1), } period_map = FredMeta().period_map() @@ -44,26 +50,33 @@ def download(self) -> TimeseriesBundle: timeseries = [] for ts_id, record in tqdm(raw_data.items()): sp = record['time_unit'] - frequency = [frequency_map[s] for s in frequency_map.keys() if sp.startswith(s)] + frequency = [ + frequency_map[s] for s in frequency_map.keys() if sp.startswith(s) + ] period = [period_map[s] for s in period_map.keys() if sp.startswith(s)] if len(frequency) > 0: frequency = frequency[0] else: - raise Exception(f"Cannot match frequency for: {sp}") + raise Exception(f'Cannot match frequency for: {sp}') if len(period) > 0: period = period[0] else: - raise Exception(f"Cannot match frequency for: {sp}") - timeseries.append(Timeseries(id=ts_id, - start_date=record['start_date'], - time_unit=frequency[0], - frequency=frequency[1], - period=period, - values=record['values'], - meta={'seasonal_pattern': sp} - )) - grouped_timeseries = [list(filter(lambda ts: ts.meta['seasonal_pattern'] == sp, timeseries)) - for sp in FredMeta.seasonal_patterns] + raise Exception(f'Cannot match frequency for: {sp}') + timeseries.append( + Timeseries( + id=ts_id, + start_date=record['start_date'], + time_unit=frequency[0], + frequency=frequency[1], + period=period, + values=record['values'], + meta={'seasonal_pattern': sp}, + ) + ) + grouped_timeseries = [ + list(filter(lambda ts: ts.meta['seasonal_pattern'] == sp, timeseries)) + for sp in FredMeta.seasonal_patterns + ] grouped_timeseries = [ts for sp_ts in grouped_timeseries for ts in sp_ts] return TimeseriesBundle(grouped_timeseries) @@ -71,7 +84,9 @@ def download(self) -> TimeseriesBundle: def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: bundle = self.load_cache() horizons_map = FredMeta().horizons_map() - return bundle.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) + return bundle.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) if __name__ == '__main__': diff --git a/benchmark/metalearned/resources/fred/evaluator.py b/benchmark/metalearned/resources/fred/evaluator.py index 7942721..69a48af 100644 --- a/benchmark/metalearned/resources/fred/evaluator.py +++ b/benchmark/metalearned/resources/fred/evaluator.py @@ -1,10 +1,8 @@ from collections import OrderedDict -from collections import OrderedDict from dataclasses import dataclass import numpy as np - -from common.evaluator import Evaluator, EvaluationResult +from common.evaluator import EvaluationResult, Evaluator from common.metrics import smape_2 from common.timeseries import TimeseriesBundle from common.utils import round_half_up @@ -19,11 +17,19 @@ def evaluate(self, forecast: TimeseriesBundle) -> 
EvaluationResult: insamples, _ = FredDataset(FredMeta.dataset_path).standard_split() if self.validation: horizons_map = FredMeta().horizons_map() - insamples, _ = insamples.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) + insamples, _ = insamples.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) - grouped_smapes = {sp: np.mean(smape_2(forecast=np.array(self.filter_by_sp(forecast, sp).values()), - target=np.array(self.filter_by_sp(self.test_set, sp).values()))) - for sp in FredMeta.seasonal_patterns} + grouped_smapes = { + sp: np.mean( + smape_2( + forecast=np.array(self.filter_by_sp(forecast, sp).values()), + target=np.array(self.filter_by_sp(self.test_set, sp).values()), + ) + ) + for sp in FredMeta.seasonal_patterns + } grouped_smapes = self.summarize_groups(grouped_smapes) @@ -34,7 +40,9 @@ def summarize_groups(self, scores): weighted_score = {} for sp in ['Yearly', 'Quarterly', 'Monthly', 'Weekly', 'Daily']: - weighted_score[sp] = scores[sp] * len(self.filter_by_sp(self.test_set, sp).timeseries) + weighted_score[sp] = scores[sp] * len( + self.filter_by_sp(self.test_set, sp).timeseries + ) scores_summary[sp] = scores[sp] average = np.sum(list(weighted_score.values())) / len(self.test_set.timeseries) @@ -43,7 +51,9 @@ def summarize_groups(self, scores): return scores_summary @staticmethod - def filter_by_sp(bundle: TimeseriesBundle, seasonal_pattern: str) -> TimeseriesBundle: + def filter_by_sp( + bundle: TimeseriesBundle, seasonal_pattern: str + ) -> TimeseriesBundle: return bundle.filter(lambda ts: ts.meta['seasonal_pattern'] == seasonal_pattern) @staticmethod diff --git a/benchmark/metalearned/resources/m3/dataset.py b/benchmark/metalearned/resources/m3/dataset.py index 0904d3c..24e104c 100644 --- a/benchmark/metalearned/resources/m3/dataset.py +++ b/benchmark/metalearned/resources/m3/dataset.py @@ -5,11 +5,17 @@ import numpy as np import pandas as pd -from tqdm import tqdm - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Unknown, Year, Month +from common.timeseries import ( + Month, + Timeseries, + TimeseriesBundle, + TimeseriesLoader, + Unknown, + Year, +) from common.utils import download_url +from tqdm import tqdm @dataclass(frozen=True) @@ -19,9 +25,30 @@ class M3Meta: seasonal_patterns = ['M3Year', 'M3Quart', 'M3Month', 'M3Other'] horizons = [6, 8, 18, 8] frequency = [1, 4, 12, 1] - models = ['NAIVE2', 'SINGLE', 'HOLT', 'DAMPEN', 'WINTER', 'COMB S-H-D', 'B-J auto', 'AutoBox1', 'AutoBox2', - 'AutoBox3', 'ROBUST-Trend', 'ARARMA', 'Auto-ANN', 'Flors-Pearc1', 'Flors-Pearc2', 'PP-Autocast', - 'ForecastPro', 'SMARTFCS', 'THETAsm', 'THETA', 'RBF', 'ForcX'] + models = [ + 'NAIVE2', + 'SINGLE', + 'HOLT', + 'DAMPEN', + 'WINTER', + 'COMB S-H-D', + 'B-J auto', + 'AutoBox1', + 'AutoBox2', + 'AutoBox3', + 'ROBUST-Trend', + 'ARARMA', + 'Auto-ANN', + 'Flors-Pearc1', + 'Flors-Pearc2', + 'PP-Autocast', + 'ForecastPro', + 'SMARTFCS', + 'THETAsm', + 'THETA', + 'RBF', + 'ForcX', + ] def horizons_map(self): return dict(zip(self.seasonal_patterns, self.horizons)) @@ -67,37 +94,48 @@ def download(self) -> TimeseriesBundle: time_unit = Unknown() pass - timeseries.append(Timeseries(id=str(row['Series']), - start_date=starting_date, - time_unit=time_unit, - frequency=frequency, - period=1, - values=row.T[6:row.N + 6].values.astype(np.float32), - meta={'seasonal_pattern': sp} - )) + timeseries.append( + Timeseries( + id=str(row['Series']), + start_date=starting_date, + 
time_unit=time_unit, + frequency=frequency, + period=1, + values=row.T[6 : row.N + 6].values.astype(np.float32), + meta={'seasonal_pattern': sp}, + ) + ) return TimeseriesBundle(timeseries) def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: bundle = self.load_cache() horizons_map = M3Meta().horizons_map() - return bundle.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) + return bundle.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) class M3Forecasts(TimeseriesLoader): def download(self) -> TimeseriesBundle: raw_file_path = os.path.join(M3Meta.forecasts_path, 'M3Forecast.xls') - download_url('https://forecasters.org/data/m3comp/M3Forecast.xls', raw_file_path) + download_url( + 'https://forecasters.org/data/m3comp/M3Forecast.xls', raw_file_path + ) original_timeseries = M3Dataset(M3Meta().dataset_path).load_cache() horizon_mapping = M3Meta().horizons_map() - training_set, _ = original_timeseries.split(lambda t: t.split(-horizon_mapping[t.meta['seasonal_pattern']])) + training_set, _ = original_timeseries.split( + lambda t: t.split(-horizon_mapping[t.meta['seasonal_pattern']]) + ) training_timeseries = training_set.timeseries models_forecasts = [] for model_name in tqdm(M3Meta.models): forecast = pd.read_excel(raw_file_path, sheet_name=model_name, header=None) for i, row in forecast.iterrows(): - ts = training_timeseries[i].future_values(row.T[2:row[1] + 2].values.astype(np.float32)) + ts = training_timeseries[i].future_values( + row.T[2 : row[1] + 2].values.astype(np.float32) + ) ts.meta = {**ts.meta, 'model': model_name} models_forecasts.append(ts) return TimeseriesBundle(models_forecasts) diff --git a/benchmark/metalearned/resources/m3/evaluator.py b/benchmark/metalearned/resources/m3/evaluator.py index 00cf895..f9d8243 100644 --- a/benchmark/metalearned/resources/m3/evaluator.py +++ b/benchmark/metalearned/resources/m3/evaluator.py @@ -2,8 +2,7 @@ from dataclasses import dataclass import numpy as np - -from common.evaluator import Evaluator, EvaluationResult +from common.evaluator import EvaluationResult, Evaluator from common.metrics import smape_1, smape_2 from common.timeseries import TimeseriesBundle from resources.m3.dataset import M3Meta @@ -22,7 +21,9 @@ def evaluate(self, forecasts: TimeseriesBundle) -> EvaluationResult: evaluation_function = smape_1 if self.smape_1 else smape_2 for sp in M3Meta.seasonal_patterns: - target_sp = self.test_set.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + target_sp = self.test_set.filter( + lambda ts: ts.meta['seasonal_pattern'] == sp + ) forecast_sp = forecasts.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) target, forecast = target_sp.intersection_by_id(forecast_sp) diff --git a/benchmark/metalearned/resources/m3/metrics.py b/benchmark/metalearned/resources/m3/metrics.py index edc7578..0419a62 100644 --- a/benchmark/metalearned/resources/m3/metrics.py +++ b/benchmark/metalearned/resources/m3/metrics.py @@ -32,7 +32,9 @@ def smape_m3_dataset_horizon(target_dataset, forecast_dataset, horizon): i = 0 for prediction, target in zip(forecast_dataset.values, target_dataset.values): if target_dataset.horizons[i] >= horizon: - smape_cum += smape_m3(prediction[horizon-1], target[-len(prediction)+horizon-1]).sum() + smape_cum += smape_m3( + prediction[horizon - 1], target[-len(prediction) + horizon - 1] + ).sum() smape_n_points += 1 i += 1 @@ -57,9 +59,12 @@ def smape_m3_dataset_horizon_avg(target_dataset, forecast_dataset, horizon): for prediction, target in 
zip(forecast_dataset.values, target_dataset.values): horizon_clamped = min(target_dataset.horizons[i], horizon) if horizon_clamped == target_dataset.horizons[i]: - target_clamped = target[-target_dataset.horizons[i]:] + target_clamped = target[-target_dataset.horizons[i] :] else: - target_clamped = target[-target_dataset.horizons[i]:-target_dataset.horizons[i]+horizon_clamped] + target_clamped = target[ + -target_dataset.horizons[i] : -target_dataset.horizons[i] + + horizon_clamped + ] smape_cum += smape_m3(prediction[:horizon_clamped], target_clamped).sum() smape_n_points += len(target_clamped) i += 1 @@ -107,7 +112,9 @@ def smape_m3_dataset_horizon(target_dataset, forecast_dataset, horizon): i = 0 for prediction, target in zip(forecast_dataset.values, target_dataset.values): if target_dataset.horizons[i] >= horizon: - smape_cum += smape_m3(prediction[horizon-1], target[-len(prediction)+horizon-1]).sum() + smape_cum += smape_m3( + prediction[horizon - 1], target[-len(prediction) + horizon - 1] + ).sum() smape_n_points += 1 i += 1 @@ -132,12 +139,14 @@ def smape_m3_dataset_horizon_avg(target_dataset, forecast_dataset, horizon): for prediction, target in zip(forecast_dataset.values, target_dataset.values): horizon_clamped = min(target_dataset.horizons[i], horizon) if horizon_clamped == target_dataset.horizons[i]: - target_clamped = target[-target_dataset.horizons[i]:] + target_clamped = target[-target_dataset.horizons[i] :] else: - target_clamped = target[-target_dataset.horizons[i]:-target_dataset.horizons[i]+horizon_clamped] + target_clamped = target[ + -target_dataset.horizons[i] : -target_dataset.horizons[i] + + horizon_clamped + ] smape_cum += smape_m3(prediction[:horizon_clamped], target_clamped).sum() smape_n_points += len(target_clamped) i += 1 return smape_cum / smape_n_points - diff --git a/benchmark/metalearned/resources/m4/dataset.py b/benchmark/metalearned/resources/m4/dataset.py index a919b1d..199e82e 100644 --- a/benchmark/metalearned/resources/m4/dataset.py +++ b/benchmark/metalearned/resources/m4/dataset.py @@ -7,11 +7,18 @@ import numpy as np import pandas as pd import patoolib -from tqdm import tqdm - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Year, Month, Day, Hour +from common.timeseries import ( + Day, + Hour, + Month, + Timeseries, + TimeseriesBundle, + TimeseriesLoader, + Year, +) from common.utils import download_url +from tqdm import tqdm @dataclass(frozen=True) @@ -31,29 +38,37 @@ def period_map(self): class M4Dataset(TimeseriesLoader): def download(self) -> TimeseriesBundle: url_template = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/{}/{}-{}.csv' - m4_info_url = 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/M4-info.csv' + m4_info_url = ( + 'https://github.com/Mcompetitions/M4-methods/raw/master/Dataset/M4-info.csv' + ) m4_info_path = os.path.join(self.path, 'M4info.csv') ssl._create_default_https_context = ssl._create_unverified_context download_url(m4_info_url, m4_info_path) for sp in M4Meta.seasonal_patterns: - training_url = url_template.format("Train", sp, "train") - download_url(training_url, os.path.join(M4Meta.dataset_path, f'{sp}-train.csv')) - test_url = url_template.format("Test", sp, "test") + training_url = url_template.format('Train', sp, 'train') + download_url( + training_url, os.path.join(M4Meta.dataset_path, f'{sp}-train.csv') + ) + test_url = url_template.format('Test', sp, 'test') download_url(test_url, 
os.path.join(M4Meta.dataset_path, f'{sp}-test.csv')) # Download naive2 forecasts, needed for OWA metric m4_naive2_archive = os.path.join(self.path, 'naive2.rar') - download_url('https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-Naive2.rar', - m4_naive2_archive) + download_url( + 'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-Naive2.rar', + m4_naive2_archive, + ) patoolib.extract_archive(m4_naive2_archive, outdir=self.path) os.remove(m4_naive2_archive) # Download m4 competition winner predictions, for summary testing purposes only m4_winner_archive = os.path.join(self.path, 'submission-118.rar') - download_url('https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-118.rar', - m4_winner_archive) + download_url( + 'https://github.com/M4Competition/M4-methods/raw/master/Point%20Forecasts/submission-118.rar', + m4_winner_archive, + ) patoolib.extract_archive(m4_winner_archive, outdir=self.path) os.remove(m4_winner_archive) @@ -66,12 +81,14 @@ def download(self) -> TimeseriesBundle: 'Monthly': (Month(), 1), 'Weekly': (Day(), 7), 'Daily': (Day(), 1), - 'Hourly': (Hour(), 1) + 'Hourly': (Hour(), 1), } all_timeseries = [] for sp in M4Meta.seasonal_patterns: - training_set = pd.read_csv(os.path.join(M4Meta.dataset_path, f'{sp}-train.csv')) + training_set = pd.read_csv( + os.path.join(M4Meta.dataset_path, f'{sp}-train.csv') + ) test_set = pd.read_csv(os.path.join(M4Meta.dataset_path, f'{sp}-test.csv')) time_unit, frequency = time_units_mapping[sp] @@ -89,23 +106,28 @@ def download(self) -> TimeseriesBundle: parsed_date = None for parsing_format in parsing_formats: try: - parsed_date = datetime.strptime(timeseries_info.StartingDate, parsing_format) + parsed_date = datetime.strptime( + timeseries_info.StartingDate, parsing_format + ) except Exception: continue if parsed_date is None: - raise ValueError(f'Could not parse {timeseries_info.StartingDate} for {timeseries_id}') + raise ValueError( + f'Could not parse {timeseries_info.StartingDate} for {timeseries_id}' + ) # all M4 years are in the 1900s or 1800s if parsed_date.year > 2000: parsed_date = parsed_date.replace(year=parsed_date.year - 100) - timeseries = Timeseries(id=timeseries_id, - start_date=parsed_date, - time_unit=time_unit, - frequency=frequency, - period=int(timeseries_info.Frequency), - values=np.concatenate([training_values, test_values]), - meta={'seasonal_pattern': sp} - ) + timeseries = Timeseries( + id=timeseries_id, + start_date=parsed_date, + time_unit=time_unit, + frequency=frequency, + period=int(timeseries_info.Frequency), + values=np.concatenate([training_values, test_values]), + meta={'seasonal_pattern': sp}, + ) all_timeseries.append(timeseries) return TimeseriesBundle(all_timeseries) @@ -113,7 +135,9 @@ def download(self) -> TimeseriesBundle: def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: bundle = self.load_cache() horizons_map = M4Meta().horizons_map() - return bundle.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) + return bundle.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) @staticmethod def filter(bundle: TimeseriesBundle, seasonal_pattern: str) -> TimeseriesBundle: diff --git a/benchmark/metalearned/resources/m4/evaluator.py b/benchmark/metalearned/resources/m4/evaluator.py index 194f65f..7225056 100644 --- a/benchmark/metalearned/resources/m4/evaluator.py +++ b/benchmark/metalearned/resources/m4/evaluator.py @@ -4,8 +4,7 @@ import numpy as 
np import pandas as pd - -from common.evaluator import Evaluator, EvaluationResult +from common.evaluator import EvaluationResult, Evaluator from common.metrics import mase, smape_2 from common.timeseries import TimeseriesBundle from common.utils import clean_nans, round_half_up @@ -21,11 +20,19 @@ def evaluate(self, forecast: TimeseriesBundle) -> EvaluationResult: insamples, _ = M4Dataset(M4Meta.dataset_path).standard_split() if self.validation: horizons_map = M4Meta().horizons_map() - insamples, _ = insamples.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) - - grouped_smapes = {sp: np.mean(smape_2(forecast=np.array(M4Dataset.filter(forecast, sp).values()), - target=np.array(M4Dataset.filter(self.test_set, sp).values()))) - for sp in M4Meta.seasonal_patterns} + insamples, _ = insamples.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) + + grouped_smapes = { + sp: np.mean( + smape_2( + forecast=np.array(M4Dataset.filter(forecast, sp).values()), + target=np.array(M4Dataset.filter(self.test_set, sp).values()), + ) + ) + for sp in M4Meta.seasonal_patterns + } grouped_smapes = self.summarize_groups(grouped_smapes) @@ -33,7 +40,8 @@ def evaluate(self, forecast: TimeseriesBundle) -> EvaluationResult: grouped_owa = OrderedDict() if not self.validation: naive2_forecasts = pd.read_csv( - os.path.join(M4Meta.dataset_path, 'submission-Naive2.csv')) + os.path.join(M4Meta.dataset_path, 'submission-Naive2.csv') + ) naive2_forecasts.set_index(keys='id', inplace=True) model_mases = {} @@ -41,32 +49,56 @@ def evaluate(self, forecast: TimeseriesBundle) -> EvaluationResult: naive2_mases = {} for sp in M4Meta.seasonal_patterns: model_forecasts = M4Dataset.filter(forecast, sp) - naive2_forecast = clean_nans(naive2_forecasts.loc[model_forecasts.ids()].values) + naive2_forecast = clean_nans( + naive2_forecasts.loc[model_forecasts.ids()].values + ) model_forecast_values = model_forecasts.values() - target = self.test_set.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + target = self.test_set.filter( + lambda ts: ts.meta['seasonal_pattern'] == sp + ) target_values = np.array(target.values()) # all timeseries within group have same frequency period = target.period()[0] insample = M4Dataset.filter(insamples, sp).values() - model_mases[sp] = np.mean([mase(forecast=model_forecast_values[i], - insample=insample[i], - outsample=target_values[i], - frequency=period) for i in range(len(model_forecast_values))]) - naive2_mases[sp] = np.mean([mase(forecast=naive2_forecast[i], - insample=insample[i], - outsample=target_values[i], - frequency=period) for i in range(len(model_forecast_values))]) + model_mases[sp] = np.mean( + [ + mase( + forecast=model_forecast_values[i], + insample=insample[i], + outsample=target_values[i], + frequency=period, + ) + for i in range(len(model_forecast_values)) + ] + ) + naive2_mases[sp] = np.mean( + [ + mase( + forecast=naive2_forecast[i], + insample=insample[i], + outsample=target_values[i], + frequency=period, + ) + for i in range(len(model_forecast_values)) + ] + ) naive2_smapes[sp] = np.mean(smape_2(naive2_forecast, target_values)) grouped_model_mases = self.summarize_groups(model_mases) grouped_naive2_smapes = self.summarize_groups(naive2_smapes) grouped_naive2_mases = self.summarize_groups(naive2_mases) for k in grouped_model_mases.keys(): - grouped_owa[k] = round_half_up((grouped_model_mases[k] / grouped_naive2_mases[k] + - grouped_smapes[k] / grouped_naive2_smapes[k]) / 2, 3) + grouped_owa[k] = round_half_up( + ( + 
grouped_model_mases[k] / grouped_naive2_mases[k] + + grouped_smapes[k] / grouped_naive2_smapes[k] + ) + / 2, + 3, + ) return self.round_values(grouped_owa) else: return self.round_values(grouped_smapes) @@ -76,7 +108,9 @@ def summarize_groups(self, scores): weighted_score = {} for sp in ['Yearly', 'Quarterly', 'Monthly', 'Weekly', 'Daily', 'Hourly']: - weighted_score[sp] = scores[sp] * len(M4Dataset.filter(self.test_set, sp).timeseries) + weighted_score[sp] = scores[sp] * len( + M4Dataset.filter(self.test_set, sp).timeseries + ) scores_summary[sp] = scores[sp] others_score = 0 diff --git a/benchmark/metalearned/resources/tourism/dataset.py b/benchmark/metalearned/resources/tourism/dataset.py index 14864a9..f96fc00 100644 --- a/benchmark/metalearned/resources/tourism/dataset.py +++ b/benchmark/metalearned/resources/tourism/dataset.py @@ -6,9 +6,14 @@ import numpy as np import pandas as pd import patoolib - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Year, Month +from common.timeseries import ( + Month, + Timeseries, + TimeseriesBundle, + TimeseriesLoader, + Year, +) from common.utils import download_url @@ -29,74 +34,106 @@ def period_map(self): class TourismDataset(TimeseriesLoader): def download(self) -> TimeseriesBundle: archive_file = os.path.join(self.path, 'm3.zip') - download_url('https://robjhyndman.com/data/27-3-Athanasopoulos1.zip', archive_file) + download_url( + 'https://robjhyndman.com/data/27-3-Athanasopoulos1.zip', archive_file + ) patoolib.extract_archive(archive_file, outdir=self.path) timeseries = [] # Yearly - insample = pd.read_csv(os.path.join(TourismMeta.dataset_path, f'yearly_in.csv'), header=0) - outsample = pd.read_csv(os.path.join(TourismMeta.dataset_path, f'yearly_oos.csv'), header=0) + insample = pd.read_csv( + os.path.join(TourismMeta.dataset_path, 'yearly_in.csv'), header=0 + ) + outsample = pd.read_csv( + os.path.join(TourismMeta.dataset_path, 'yearly_oos.csv'), header=0 + ) outsampleT = outsample.T for timeseries_id, ts_row in insample.T.iterrows(): outsample_row = outsampleT.loc[timeseries_id].values start_date = datetime.strptime(str(int(ts_row[[1]])), '%Y') - insample_values = ts_row.values[2:2 + int(ts_row[[0]])] - outsample_values = outsample_row[2:2 + int(outsample_row[0])] + insample_values = ts_row.values[2 : 2 + int(ts_row[[0]])] + outsample_values = outsample_row[2 : 2 + int(outsample_row[0])] values = np.concatenate([insample_values, outsample_values]) - timeseries.append(Timeseries(id=timeseries_id, - start_date=start_date, - time_unit=Year(), - frequency=1, - period=1, - values=values, - meta={'seasonal_pattern': 'Yearly'})) + timeseries.append( + Timeseries( + id=timeseries_id, + start_date=start_date, + time_unit=Year(), + frequency=1, + period=1, + values=values, + meta={'seasonal_pattern': 'Yearly'}, + ) + ) # Quarterly - insample = pd.read_csv(os.path.join(TourismMeta.dataset_path, f'quarterly_in.csv'), header=0) - outsample = pd.read_csv(os.path.join(TourismMeta.dataset_path, f'quarterly_oos.csv'), header=0) + insample = pd.read_csv( + os.path.join(TourismMeta.dataset_path, 'quarterly_in.csv'), header=0 + ) + outsample = pd.read_csv( + os.path.join(TourismMeta.dataset_path, 'quarterly_oos.csv'), header=0 + ) outsampleT = outsample.T for timeseries_id, ts_row in insample.T.iterrows(): outsample_row = outsampleT.loc[timeseries_id].values - start_date = datetime.strptime(f'{str(int(ts_row[[1]]))}-{str((int(ts_row[[2]]) - 1) * 3)}', '%Y-%M') - insample_values = 
ts_row.values[3:3 + int(ts_row[[0]])] - outsample_values = outsample_row[3:3 + int(outsample_row[0])] + start_date = datetime.strptime( + f'{str(int(ts_row[[1]]))}-{str((int(ts_row[[2]]) - 1) * 3)}', '%Y-%M' + ) + insample_values = ts_row.values[3 : 3 + int(ts_row[[0]])] + outsample_values = outsample_row[3 : 3 + int(outsample_row[0])] values = np.concatenate([insample_values, outsample_values]) - timeseries.append(Timeseries(id=timeseries_id, - start_date=start_date, - time_unit=Month(), - frequency=3, - period=1, - values=values, - meta={'seasonal_pattern': 'Quarterly'})) + timeseries.append( + Timeseries( + id=timeseries_id, + start_date=start_date, + time_unit=Month(), + frequency=3, + period=1, + values=values, + meta={'seasonal_pattern': 'Quarterly'}, + ) + ) # Monthly - insample = pd.read_csv(os.path.join(TourismMeta.dataset_path, f'monthly_in.csv'), header=0) - outsample = pd.read_csv(os.path.join(TourismMeta.dataset_path, f'monthly_oos.csv'), header=0) + insample = pd.read_csv( + os.path.join(TourismMeta.dataset_path, 'monthly_in.csv'), header=0 + ) + outsample = pd.read_csv( + os.path.join(TourismMeta.dataset_path, 'monthly_oos.csv'), header=0 + ) outsampleT = outsample.T for timeseries_id, ts_row in insample.T.iterrows(): outsample_row = outsampleT.loc[timeseries_id].values - start_date = datetime.strptime(f'{str(int(ts_row[[1]]))}-{str(int(ts_row[[2]]))}', '%Y-%M') - insample_values = ts_row.values[3:3 + int(ts_row[[0]])] - outsample_values = outsample_row[3:3 + int(outsample_row[0])] + start_date = datetime.strptime( + f'{str(int(ts_row[[1]]))}-{str(int(ts_row[[2]]))}', '%Y-%M' + ) + insample_values = ts_row.values[3 : 3 + int(ts_row[[0]])] + outsample_values = outsample_row[3 : 3 + int(outsample_row[0])] values = np.concatenate([insample_values, outsample_values]) - timeseries.append(Timeseries(id=timeseries_id, - start_date=start_date, - time_unit=Month(), - frequency=1, - period=1, - values=values, - meta={'seasonal_pattern': 'Monthly'})) + timeseries.append( + Timeseries( + id=timeseries_id, + start_date=start_date, + time_unit=Month(), + frequency=1, + period=1, + values=values, + meta={'seasonal_pattern': 'Monthly'}, + ) + ) return TimeseriesBundle(timeseries) def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: bundle = self.load_cache() horizons_map = TourismMeta().horizons_map() - return bundle.split(lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']])) + return bundle.split( + lambda ts: ts.split(-horizons_map[ts.meta['seasonal_pattern']]) + ) if __name__ == '__main__': diff --git a/benchmark/metalearned/resources/tourism/evaluator.py b/benchmark/metalearned/resources/tourism/evaluator.py index 1f0df89..0763cbb 100644 --- a/benchmark/metalearned/resources/tourism/evaluator.py +++ b/benchmark/metalearned/resources/tourism/evaluator.py @@ -3,8 +3,7 @@ from typing import Callable import numpy as np - -from common.evaluator import Evaluator, EvaluationResult +from common.evaluator import EvaluationResult, Evaluator from common.metrics import mape from common.timeseries import TimeseriesBundle from resources.tourism.dataset import TourismMeta @@ -22,8 +21,12 @@ def evaluate(self, forecasts: TimeseriesBundle) -> EvaluationResult: offset = 0 for sp in TourismMeta.seasonal_patterns: - target_for_sp = self.test_set.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) - forecast_for_sp = forecasts.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + target_for_sp = self.test_set.filter( + lambda ts: ts.meta['seasonal_pattern'] == sp + ) + 
forecast_for_sp = forecasts.filter( + lambda ts: ts.meta['seasonal_pattern'] == sp + ) target = np.array(target_for_sp.values()) forecast = np.array(forecast_for_sp.values()) @@ -39,5 +42,7 @@ def evaluate(self, forecasts: TimeseriesBundle) -> EvaluationResult: results[sp] = round(float(np.mean(score)), self.precision) offset += len(target) - results['Average'] = round(cumulative_metrics / cumulative_points, self.precision) + results['Average'] = round( + cumulative_metrics / cumulative_points, self.precision + ) return results diff --git a/benchmark/metalearned/resources/traffic/dataset.py b/benchmark/metalearned/resources/traffic/dataset.py index 66709ab..1c4ad5b 100644 --- a/benchmark/metalearned/resources/traffic/dataset.py +++ b/benchmark/metalearned/resources/traffic/dataset.py @@ -5,12 +5,10 @@ import numpy as np import patoolib -from tqdm import tqdm - from common.settings import RESOURCES_DIR -from common.timeseries import Timeseries, TimeseriesBundle, TimeseriesLoader, Hour +from common.timeseries import Hour, Timeseries, TimeseriesBundle, TimeseriesLoader from common.utils import download_url - +from tqdm import tqdm """ Hourly aggregated dataset from https://archive.ics.uci.edu/ml/datasets/PEMS-SF @@ -19,6 +17,8 @@ Dataset was also compared with the one built by the TRMF paper's author: https://github.com/rofuyu/exp-trmf-nips16/blob/master/python/exp-scripts/datasets/download-data.sh """ + + @dataclass(frozen=True) class TrafficMeta: dataset_path = os.path.join(RESOURCES_DIR, 'traffic') @@ -38,8 +38,10 @@ def download(self) -> TimeseriesBundle: train_raw_file = os.path.join(self.path, 'PEMS_train') test_raw_file = os.path.join(self.path, 'PEMS_test') perm_raw_file = os.path.join(self.path, 'randperm') - download_url('https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip', - archive_file) + download_url( + 'https://archive.ics.uci.edu/ml/machine-learning-databases/00204/PEMS-SF.zip', + archive_file, + ) patoolib.extract_archive(archive_file, outdir=self.path) with open(train_raw_file, 'r') as f: train_raw_data = f.readlines() @@ -47,7 +49,9 @@ def download(self) -> TimeseriesBundle: test_raw_data = f.readlines() with open(perm_raw_file, 'r') as f: permutations = f.readlines() - permutations = np.array(permutations[0].rstrip()[1:-1].split(' ')).astype(np.int) + permutations = np.array(permutations[0].rstrip()[1:-1].split(' ')).astype( + np.int + ) raw_data = train_raw_data + test_raw_data @@ -77,7 +81,9 @@ def download(self) -> TimeseriesBundle: # - Mar. 8, 2009 - Anomaly # ------------------------------------------ # Thus 455 - 15 = 440 days from 2008-01-01 to 2008-03-30 (incl.) - start_date = datetime.strptime('2008-01-02', '%Y-%m-%d') # 2008-01-01 is a holiday + start_date = datetime.strptime( + '2008-01-02', '%Y-%m-%d' + ) # 2008-01-01 is a holiday current_date = start_date excluded_dates = [ datetime.strptime('2008-01-21', '%Y-%m-%d'), @@ -110,18 +116,27 @@ def download(self) -> TimeseriesBundle: values = np.concatenate([values, daily], axis=1) else: # should never be in the first 24*7 records. # fill gaps with same day of previous week. 
- values = np.concatenate([values, values[:, -24 * 7 * 6:-24 * 6 * 6]], axis=1) + values = np.concatenate( + [values, values[:, -24 * 7 * 6 : -24 * 6 * 6]], axis=1 + ) current_date += timedelta(days=1) # aggregate 10 minutes events to hourly - hourly = np.array([list(map(np.mean, zip(*(iter(lane),) * 6))) for lane in tqdm(values)]) - timeseries = [Timeseries(id=str(i), - start_date=start_date, - time_unit=Hour(), - frequency=1, - period=24 * 7, - values=values, - meta={}) for i, values in enumerate(hourly)] + hourly = np.array( + [list(map(np.mean, zip(*(iter(lane),) * 6))) for lane in tqdm(values)] + ) + timeseries = [ + Timeseries( + id=str(i), + start_date=start_date, + time_unit=Hour(), + frequency=1, + period=24 * 7, + values=values, + meta={}, + ) + for i, values in enumerate(hourly) + ] return TimeseriesBundle(timeseries=timeseries) def standard_split(self) -> Tuple[TimeseriesBundle, TimeseriesBundle]: diff --git a/benchmark/metalearned/resources/traffic/evaluator.py b/benchmark/metalearned/resources/traffic/evaluator.py index 08e5ea8..f936f84 100644 --- a/benchmark/metalearned/resources/traffic/evaluator.py +++ b/benchmark/metalearned/resources/traffic/evaluator.py @@ -2,8 +2,7 @@ from typing import Callable import numpy as np - -from common.evaluator import Evaluator, EvaluationResult +from common.evaluator import EvaluationResult, Evaluator from common.metrics import nd from common.timeseries import TimeseriesBundle from common.utils import round_half_up @@ -15,6 +14,11 @@ class TrafficEvaluator(Evaluator): precision: int = 2 def evaluate(self, forecasts: TimeseriesBundle) -> EvaluationResult: - return {'metric': round_half_up(self.metric_fn(np.array(forecasts.values()), - np.array(self.test_set.values())), - self.precision)} + return { + 'metric': round_half_up( + self.metric_fn( + np.array(forecasts.values()), np.array(self.test_set.values()) + ), + self.precision, + ) + } diff --git a/benchmark/run.py b/benchmark/run.py index 9212084..9db5158 100644 --- a/benchmark/run.py +++ b/benchmark/run.py @@ -1,60 +1,89 @@ -import numpy as np +import argparse import random -from exp.exp_resolver import resolve_experiment +import sys + +import numpy as np import torch -import argparse -from utils.arg_resolver import resolve_transformer_args, _model_is_transformer, setting_string, resolve_args -import sys -sys.path.append("metalearned") +from exp.exp_resolver import resolve_experiment +from utils.arg_resolver import ( + _model_is_transformer, + resolve_args, + resolve_transformer_args, + setting_string, +) -def parse(): +sys.path.append('metalearned') + +def parse(): parser = argparse.ArgumentParser( - description='Comparing performance of ForecastPFN to other Time Series Benchmarks') + description='Comparing performance of ForecastPFN to other Time Series Benchmarks' + ) parser.add_argument('--is_training', type=int, default=1, help='status') parser.add_argument('--use_gpu', type=bool, default=True, help='status') parser.add_argument('--itr', type=int, default=1, help='status') # model settings - parser.add_argument('--model', type=str, default='ForecastPFN', - help='model name, options: [ForecastPFN, FEDformer, Autoformer, Informer, Transformer, Arima, Prophet]') + parser.add_argument( + '--model', + type=str, + default='ForecastPFN', + help='model name, options: [ForecastPFN, FEDformer, Autoformer, Informer, Transformer, Arima, Prophet]', + ) # forecasting task - parser.add_argument('--seq_len', type=int, default=96, - help='input sequence length') - parser.add_argument('--label_len', 
type=int, - default=48, help='start token length') - parser.add_argument('--pred_len', type=int, default=96, - help='prediction sequence length') - - parser.add_argument('--time_budget', type=int, - help='amount of time budget to train the model') - parser.add_argument('--train_budget', type=int, - help='length of training sequence') + parser.add_argument('--seq_len', type=int, default=96, help='input sequence length') + parser.add_argument('--label_len', type=int, default=48, help='start token length') + parser.add_argument( + '--pred_len', type=int, default=96, help='prediction sequence length' + ) + + parser.add_argument( + '--time_budget', type=int, help='amount of time budget to train the model' + ) + parser.add_argument('--train_budget', type=int, help='length of training sequence') # data loader - parser.add_argument('--data', type=str, - default='ETTh1', help='dataset type') - parser.add_argument('--root_path', type=str, - default='./dataset/ETT/', help='root path of the data file') - parser.add_argument('--data_path', type=str, - default='ETTh1.csv', help='data file') - parser.add_argument('--target', type=str, - default='OT', help='name of target column') - parser.add_argument('--scale', type=bool, default=True, - help='scale the time series with sklearn.StandardScale()') + parser.add_argument('--data', type=str, default='ETTh1', help='dataset type') + parser.add_argument( + '--root_path', + type=str, + default='./dataset/ETT/', + help='root path of the data file', + ) + parser.add_argument('--data_path', type=str, default='ETTh1.csv', help='data file') + parser.add_argument( + '--target', type=str, default='OT', help='name of target column' + ) + parser.add_argument( + '--scale', + type=bool, + default=True, + help='scale the time series with sklearn.StandardScale()', + ) # ForecastPFN - parser.add_argument('--model_path', type=str, default='s3://realityengines.datasets/forecasting/pretrained/gurnoor/models/20230202-025828/ckpts', - help='encoder input size') - parser.add_argument('--scaler', type=str, default='standard', - help='scale the test series with sklearn.StandardScale()') + parser.add_argument( + '--model_path', + type=str, + default='s3://realityengines.datasets/forecasting/pretrained/gurnoor/models/20230202-025828/ckpts', + help='encoder input size', + ) + parser.add_argument( + '--scaler', + type=str, + default='standard', + help='scale the test series with sklearn.StandardScale()', + ) # Metalearn - parser.add_argument('--metalearn_freq', type=str, - help='which type of model should be used for the Metalearn model. Typically M, W, or D.') + parser.add_argument( + '--metalearn_freq', + type=str, + help='which type of model should be used for the Metalearn model. 
Typically M, W, or D.', + ) return parser @@ -71,7 +100,7 @@ def main(): args = resolve_args(args) if _model_is_transformer(args.model): args = resolve_transformer_args(args) - + if args.model != 'ForecastPFN': args.model_name = None else: @@ -85,7 +114,6 @@ def main(): args.device_ids = [int(id_) for id_ in device_ids] args.gpu = args.device_ids[0] - print('Args in experiment:') print(args) @@ -96,10 +124,14 @@ def main(): # setting record of experiments setting = setting_string(args, ii) - print('>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting)) + print( + '>>>>>>>start training : {}>>>>>>>>>>>>>>>>>>>>>>>>>>'.format(setting) + ) exp.train(setting) - print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting)) + print( + '>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting) + ) exp.test(setting) torch.cuda.empty_cache() @@ -107,12 +139,11 @@ def main(): else: ii = 0 setting = setting_string(args, ii) - print('>>>>>>>testing : {}<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<'.format(setting)) exp.test(setting, test=1) torch.cuda.empty_cache() -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/benchmark/transformer_models/model_resolver.py b/benchmark/transformer_models/model_resolver.py index bf5e9a2..4a6f4d3 100644 --- a/benchmark/transformer_models/model_resolver.py +++ b/benchmark/transformer_models/model_resolver.py @@ -1,17 +1,15 @@ -import pandas as pd -import numpy as np -import prophet import pmdarima +import prophet -from transformer_models.models import FEDformer, Autoformer, Informer, Transformer +from transformer_models.models import Autoformer, FEDformer, Informer, Transformer -class Arima(): +class Arima: def __init__(self) -> None: self.model = pmdarima.auto_arima -class Prophet(): +class Prophet: def __init__(self) -> None: self.model = prophet.Prophet() diff --git a/benchmark/transformer_models/models/Autoformer.py b/benchmark/transformer_models/models/Autoformer.py index 352ef14..2cc4f94 100644 --- a/benchmark/transformer_models/models/Autoformer.py +++ b/benchmark/transformer_models/models/Autoformer.py @@ -5,10 +5,17 @@ import torch import torch.nn as nn -import torch.nn.functional as F -from layers.Embed import DataEmbedding, DataEmbedding_wo_pos + from layers.AutoCorrelation import AutoCorrelation, AutoCorrelationLayer -from layers.Autoformer_EncDec import Encoder, Decoder, EncoderLayer, DecoderLayer, my_Layernorm, series_decomp +from layers.Autoformer_EncDec import ( + Decoder, + DecoderLayer, + Encoder, + EncoderLayer, + my_Layernorm, + series_decomp, +) +from layers.Embed import DataEmbedding_wo_pos class Model(nn.Module): @@ -16,6 +23,7 @@ class Model(nn.Module): Autoformer is the first method to achieve the series-wise connection, with inherent O(LlogL) complexity """ + def __init__(self, configs): super(Model, self).__init__() self.seq_len = configs.seq_len @@ -30,40 +38,69 @@ def __init__(self, configs): # Embedding # The series-wise connection inherently contains the sequential information. # Thus, we can discard the position embedding of transformers. 
- self.enc_embedding = DataEmbedding_wo_pos(configs.enc_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) - self.dec_embedding = DataEmbedding_wo_pos(configs.dec_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) + self.enc_embedding = DataEmbedding_wo_pos( + configs.enc_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) + self.dec_embedding = DataEmbedding_wo_pos( + configs.dec_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) # Encoder self.encoder = Encoder( [ EncoderLayer( AutoCorrelationLayer( - AutoCorrelation(False, configs.factor, attention_dropout=configs.dropout, - output_attention=configs.output_attention), - configs.d_model, configs.n_heads), + AutoCorrelation( + False, + configs.factor, + attention_dropout=configs.dropout, + output_attention=configs.output_attention, + ), + configs.d_model, + configs.n_heads, + ), configs.d_model, configs.d_ff, moving_avg=configs.moving_avg, dropout=configs.dropout, - activation=configs.activation - ) for l in range(configs.e_layers) + activation=configs.activation, + ) + for l in range(configs.e_layers) ], - norm_layer=my_Layernorm(configs.d_model) + norm_layer=my_Layernorm(configs.d_model), ) # Decoder self.decoder = Decoder( [ DecoderLayer( AutoCorrelationLayer( - AutoCorrelation(True, configs.factor, attention_dropout=configs.dropout, - output_attention=False), - configs.d_model, configs.n_heads), + AutoCorrelation( + True, + configs.factor, + attention_dropout=configs.dropout, + output_attention=False, + ), + configs.d_model, + configs.n_heads, + ), AutoCorrelationLayer( - AutoCorrelation(False, configs.factor, attention_dropout=configs.dropout, - output_attention=False), - configs.d_model, configs.n_heads), + AutoCorrelation( + False, + configs.factor, + attention_dropout=configs.dropout, + output_attention=False, + ), + configs.d_model, + configs.n_heads, + ), configs.d_model, configs.c_out, configs.d_ff, @@ -74,29 +111,46 @@ def __init__(self, configs): for l in range(configs.d_layers) ], norm_layer=my_Layernorm(configs.d_model), - projection=nn.Linear(configs.d_model, configs.c_out, bias=True) + projection=nn.Linear(configs.d_model, configs.c_out, bias=True), ) - def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, - enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None): + def forward( + self, + x_enc, + x_mark_enc, + x_dec, + x_mark_dec, + enc_self_mask=None, + dec_self_mask=None, + dec_enc_mask=None, + ): # decomp init mean = torch.mean(x_enc, dim=1).unsqueeze(1).repeat(1, self.pred_len, 1) - zeros = torch.zeros([x_dec.shape[0], self.pred_len, x_dec.shape[2]], device=x_enc.device) + zeros = torch.zeros( + [x_dec.shape[0], self.pred_len, x_dec.shape[2]], device=x_enc.device + ) seasonal_init, trend_init = self.decomp(x_enc) # decoder input - trend_init = torch.cat([trend_init[:, -self.label_len:, :], mean], dim=1) - seasonal_init = torch.cat([seasonal_init[:, -self.label_len:, :], zeros], dim=1) + trend_init = torch.cat([trend_init[:, -self.label_len :, :], mean], dim=1) + seasonal_init = torch.cat( + [seasonal_init[:, -self.label_len :, :], zeros], dim=1 + ) # enc enc_out = self.enc_embedding(x_enc, x_mark_enc) enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask) # dec dec_out = self.dec_embedding(seasonal_init, x_mark_dec) - seasonal_part, trend_part = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask, - trend=trend_init) + seasonal_part, trend_part = self.decoder( + dec_out, + enc_out, + 
x_mask=dec_self_mask, + cross_mask=dec_enc_mask, + trend=trend_init, + ) # final dec_out = trend_part + seasonal_part if self.output_attention: - return dec_out[:, -self.pred_len:, :], attns + return dec_out[:, -self.pred_len :, :], attns else: - return dec_out[:, -self.pred_len:, :] \ No newline at end of file + return dec_out[:, -self.pred_len :, :] diff --git a/benchmark/transformer_models/models/FEDformer.py b/benchmark/transformer_models/models/FEDformer.py index 21c11ce..baf281d 100644 --- a/benchmark/transformer_models/models/FEDformer.py +++ b/benchmark/transformer_models/models/FEDformer.py @@ -1,23 +1,29 @@ import torch import torch.nn as nn import torch.nn.functional as F -from layers.Embed import DataEmbedding, DataEmbedding_wo_pos -from layers.AutoCorrelation import AutoCorrelation, AutoCorrelationLayer + +from layers.AutoCorrelation import AutoCorrelationLayer +from layers.Autoformer_EncDec import ( + Decoder, + DecoderLayer, + Encoder, + EncoderLayer, + my_Layernorm, + series_decomp, + series_decomp_multi, +) +from layers.Embed import DataEmbedding_wo_pos from layers.FourierCorrelation import FourierBlock, FourierCrossAttention from layers.MultiWaveletCorrelation import MultiWaveletCross, MultiWaveletTransform -from layers.SelfAttention_Family import FullAttention, ProbAttention -from layers.Autoformer_EncDec import Encoder, Decoder, EncoderLayer, DecoderLayer, my_Layernorm, series_decomp, series_decomp_multi -import math -import numpy as np - -device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") +device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') class Model(nn.Module): """ FEDformer performs the attention mechanism on frequency domain and achieved O(N) complexity """ + def __init__(self, configs): super(Model, self).__init__() self.version = configs.version @@ -38,70 +44,94 @@ def __init__(self, configs): # Embedding # The series-wise connection inherently contains the sequential information. # Thus, we can discard the position embedding of transformers. 
- self.enc_embedding = DataEmbedding_wo_pos(configs.enc_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) - self.dec_embedding = DataEmbedding_wo_pos(configs.dec_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) + self.enc_embedding = DataEmbedding_wo_pos( + configs.enc_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) + self.dec_embedding = DataEmbedding_wo_pos( + configs.dec_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) if configs.version == 'Wavelets': - encoder_self_att = MultiWaveletTransform(ich=configs.d_model, L=configs.L, base=configs.base) - decoder_self_att = MultiWaveletTransform(ich=configs.d_model, L=configs.L, base=configs.base) - decoder_cross_att = MultiWaveletCross(in_channels=configs.d_model, - out_channels=configs.d_model, - seq_len_q=self.seq_len // 2 + self.pred_len, - seq_len_kv=self.seq_len, - modes=configs.modes, - ich=configs.d_model, - base=configs.base, - activation=configs.cross_activation) + encoder_self_att = MultiWaveletTransform( + ich=configs.d_model, L=configs.L, base=configs.base + ) + decoder_self_att = MultiWaveletTransform( + ich=configs.d_model, L=configs.L, base=configs.base + ) + decoder_cross_att = MultiWaveletCross( + in_channels=configs.d_model, + out_channels=configs.d_model, + seq_len_q=self.seq_len // 2 + self.pred_len, + seq_len_kv=self.seq_len, + modes=configs.modes, + ich=configs.d_model, + base=configs.base, + activation=configs.cross_activation, + ) else: - encoder_self_att = FourierBlock(in_channels=configs.d_model, - out_channels=configs.d_model, - seq_len=self.seq_len, - modes=configs.modes, - mode_select_method=configs.mode_select) - decoder_self_att = FourierBlock(in_channels=configs.d_model, - out_channels=configs.d_model, - seq_len=self.seq_len//2+self.pred_len, - modes=configs.modes, - mode_select_method=configs.mode_select) - decoder_cross_att = FourierCrossAttention(in_channels=configs.d_model, - out_channels=configs.d_model, - seq_len_q=self.seq_len//2+self.pred_len, - seq_len_kv=self.seq_len, - modes=configs.modes, - mode_select_method=configs.mode_select) + encoder_self_att = FourierBlock( + in_channels=configs.d_model, + out_channels=configs.d_model, + seq_len=self.seq_len, + modes=configs.modes, + mode_select_method=configs.mode_select, + ) + decoder_self_att = FourierBlock( + in_channels=configs.d_model, + out_channels=configs.d_model, + seq_len=self.seq_len // 2 + self.pred_len, + modes=configs.modes, + mode_select_method=configs.mode_select, + ) + decoder_cross_att = FourierCrossAttention( + in_channels=configs.d_model, + out_channels=configs.d_model, + seq_len_q=self.seq_len // 2 + self.pred_len, + seq_len_kv=self.seq_len, + modes=configs.modes, + mode_select_method=configs.mode_select, + ) # Encoder - enc_modes = int(min(configs.modes, configs.seq_len//2)) - dec_modes = int(min(configs.modes, (configs.seq_len//2+configs.pred_len)//2)) + enc_modes = int(min(configs.modes, configs.seq_len // 2)) + dec_modes = int( + min(configs.modes, (configs.seq_len // 2 + configs.pred_len) // 2) + ) print('enc_modes: {}, dec_modes: {}'.format(enc_modes, dec_modes)) self.encoder = Encoder( [ EncoderLayer( AutoCorrelationLayer( - encoder_self_att, - configs.d_model, configs.n_heads), - + encoder_self_att, configs.d_model, configs.n_heads + ), configs.d_model, configs.d_ff, moving_avg=configs.moving_avg, dropout=configs.dropout, - activation=configs.activation - ) for l in range(configs.e_layers) + activation=configs.activation, + ) + for 
l in range(configs.e_layers) ], - norm_layer=my_Layernorm(configs.d_model) + norm_layer=my_Layernorm(configs.d_model), ) # Decoder self.decoder = Decoder( [ DecoderLayer( AutoCorrelationLayer( - decoder_self_att, - configs.d_model, configs.n_heads), + decoder_self_att, configs.d_model, configs.n_heads + ), AutoCorrelationLayer( - decoder_cross_att, - configs.d_model, configs.n_heads), + decoder_cross_att, configs.d_model, configs.n_heads + ), configs.d_model, configs.c_out, configs.d_ff, @@ -112,35 +142,53 @@ def __init__(self, configs): for l in range(configs.d_layers) ], norm_layer=my_Layernorm(configs.d_model), - projection=nn.Linear(configs.d_model, configs.c_out, bias=True) + projection=nn.Linear(configs.d_model, configs.c_out, bias=True), ) - def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, - enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None): + def forward( + self, + x_enc, + x_mark_enc, + x_dec, + x_mark_dec, + enc_self_mask=None, + dec_self_mask=None, + dec_enc_mask=None, + ): # decomp init mean = torch.mean(x_enc, dim=1).unsqueeze(1).repeat(1, self.pred_len, 1) - zeros = torch.zeros([x_dec.shape[0], self.pred_len, x_dec.shape[2]]).to(device) # cuda() + torch.zeros([x_dec.shape[0], self.pred_len, x_dec.shape[2]]).to( + device + ) # cuda() seasonal_init, trend_init = self.decomp(x_enc) # decoder input - trend_init = torch.cat([trend_init[:, -self.label_len:, :], mean], dim=1) - seasonal_init = F.pad(seasonal_init[:, -self.label_len:, :], (0, 0, 0, self.pred_len)) + trend_init = torch.cat([trend_init[:, -self.label_len :, :], mean], dim=1) + seasonal_init = F.pad( + seasonal_init[:, -self.label_len :, :], (0, 0, 0, self.pred_len) + ) # enc enc_out = self.enc_embedding(x_enc, x_mark_enc) enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask) # dec dec_out = self.dec_embedding(seasonal_init, x_mark_dec) - seasonal_part, trend_part = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask, - trend=trend_init) + seasonal_part, trend_part = self.decoder( + dec_out, + enc_out, + x_mask=dec_self_mask, + cross_mask=dec_enc_mask, + trend=trend_init, + ) # final dec_out = trend_part + seasonal_part if self.output_attention: - return dec_out[:, -self.pred_len:, :], attns + return dec_out[:, -self.pred_len :, :], attns else: - return dec_out[:, -self.pred_len:, :] # [B, L, D] + return dec_out[:, -self.pred_len :, :] # [B, L, D] if __name__ == '__main__': + class Configs(object): ab = 0 modes = 32 @@ -177,7 +225,7 @@ class Configs(object): enc = torch.randn([3, configs.seq_len, 7]) enc_mark = torch.randn([3, configs.seq_len, 4]) - dec = torch.randn([3, configs.seq_len//2+configs.pred_len, 7]) - dec_mark = torch.randn([3, configs.seq_len//2+configs.pred_len, 4]) + dec = torch.randn([3, configs.seq_len // 2 + configs.pred_len, 7]) + dec_mark = torch.randn([3, configs.seq_len // 2 + configs.pred_len, 4]) out = model.forward(enc, enc_mark, dec, dec_mark) print(out) diff --git a/benchmark/transformer_models/models/Informer.py b/benchmark/transformer_models/models/Informer.py index 9dfecfe..5d19385 100644 --- a/benchmark/transformer_models/models/Informer.py +++ b/benchmark/transformer_models/models/Informer.py @@ -1,59 +1,93 @@ import torch import torch.nn as nn -import torch.nn.functional as F -from utils.masking import TriangularCausalMask, ProbMask -from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer -from layers.SelfAttention_Family import FullAttention, ProbAttention, AttentionLayer + from layers.Embed import 
DataEmbedding -import numpy as np +from layers.SelfAttention_Family import AttentionLayer, ProbAttention +from layers.Transformer_EncDec import ( + ConvLayer, + Decoder, + DecoderLayer, + Encoder, + EncoderLayer, +) class Model(nn.Module): """ Informer with Propspare attention in O(LlogL) complexity """ + def __init__(self, configs): super(Model, self).__init__() self.pred_len = configs.pred_len self.output_attention = configs.output_attention # Embedding - self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) - self.dec_embedding = DataEmbedding(configs.dec_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) + self.enc_embedding = DataEmbedding( + configs.enc_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) + self.dec_embedding = DataEmbedding( + configs.dec_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) # Encoder self.encoder = Encoder( [ EncoderLayer( AttentionLayer( - ProbAttention(False, configs.factor, attention_dropout=configs.dropout, - output_attention=configs.output_attention), - configs.d_model, configs.n_heads), + ProbAttention( + False, + configs.factor, + attention_dropout=configs.dropout, + output_attention=configs.output_attention, + ), + configs.d_model, + configs.n_heads, + ), configs.d_model, configs.d_ff, dropout=configs.dropout, - activation=configs.activation - ) for l in range(configs.e_layers) + activation=configs.activation, + ) + for l in range(configs.e_layers) ], - [ - ConvLayer( - configs.d_model - ) for l in range(configs.e_layers - 1) - ] if configs.distil else None, - norm_layer=torch.nn.LayerNorm(configs.d_model) + [ConvLayer(configs.d_model) for l in range(configs.e_layers - 1)] + if configs.distil + else None, + norm_layer=torch.nn.LayerNorm(configs.d_model), ) # Decoder self.decoder = Decoder( [ DecoderLayer( AttentionLayer( - ProbAttention(True, configs.factor, attention_dropout=configs.dropout, output_attention=False), - configs.d_model, configs.n_heads), + ProbAttention( + True, + configs.factor, + attention_dropout=configs.dropout, + output_attention=False, + ), + configs.d_model, + configs.n_heads, + ), AttentionLayer( - ProbAttention(False, configs.factor, attention_dropout=configs.dropout, output_attention=False), - configs.d_model, configs.n_heads), + ProbAttention( + False, + configs.factor, + attention_dropout=configs.dropout, + output_attention=False, + ), + configs.d_model, + configs.n_heads, + ), configs.d_model, configs.d_ff, dropout=configs.dropout, @@ -62,19 +96,28 @@ def __init__(self, configs): for l in range(configs.d_layers) ], norm_layer=torch.nn.LayerNorm(configs.d_model), - projection=nn.Linear(configs.d_model, configs.c_out, bias=True) + projection=nn.Linear(configs.d_model, configs.c_out, bias=True), ) - def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, - enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None): - + def forward( + self, + x_enc, + x_mark_enc, + x_dec, + x_mark_dec, + enc_self_mask=None, + dec_self_mask=None, + dec_enc_mask=None, + ): enc_out = self.enc_embedding(x_enc, x_mark_enc) enc_out, attns = self.encoder(enc_out, attn_mask=enc_self_mask) dec_out = self.dec_embedding(x_dec, x_mark_dec) - dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask) + dec_out = self.decoder( + dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask + ) if self.output_attention: - return dec_out[:, -self.pred_len:, :], attns + return dec_out[:, 
-self.pred_len :, :], attns else: - return dec_out[:, -self.pred_len:, :] + return dec_out[:, -self.pred_len :, :] diff --git a/benchmark/transformer_models/models/Transformer.py b/benchmark/transformer_models/models/Transformer.py index 88c4a88..0d1a48c 100644 --- a/benchmark/transformer_models/models/Transformer.py +++ b/benchmark/transformer_models/models/Transformer.py @@ -1,50 +1,83 @@ import torch import torch.nn as nn -import torch.nn.functional as F -from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer, ConvLayer -from layers.SelfAttention_Family import FullAttention, AttentionLayer + from layers.Embed import DataEmbedding +from layers.SelfAttention_Family import AttentionLayer, FullAttention +from layers.Transformer_EncDec import Decoder, DecoderLayer, Encoder, EncoderLayer class Model(nn.Module): """ Vanilla Transformer with O(L^2) complexity """ + def __init__(self, configs): super(Model, self).__init__() self.pred_len = configs.pred_len self.output_attention = configs.output_attention # Embedding - self.enc_embedding = DataEmbedding(configs.enc_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) - self.dec_embedding = DataEmbedding(configs.dec_in, configs.d_model, configs.embed, configs.freq, - configs.dropout) + self.enc_embedding = DataEmbedding( + configs.enc_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) + self.dec_embedding = DataEmbedding( + configs.dec_in, + configs.d_model, + configs.embed, + configs.freq, + configs.dropout, + ) # Encoder self.encoder = Encoder( [ EncoderLayer( AttentionLayer( - FullAttention(False, configs.factor, attention_dropout=configs.dropout, - output_attention=configs.output_attention), configs.d_model, configs.n_heads), + FullAttention( + False, + configs.factor, + attention_dropout=configs.dropout, + output_attention=configs.output_attention, + ), + configs.d_model, + configs.n_heads, + ), configs.d_model, configs.d_ff, dropout=configs.dropout, - activation=configs.activation - ) for l in range(configs.e_layers) + activation=configs.activation, + ) + for l in range(configs.e_layers) ], - norm_layer=torch.nn.LayerNorm(configs.d_model) + norm_layer=torch.nn.LayerNorm(configs.d_model), ) # Decoder self.decoder = Decoder( [ DecoderLayer( AttentionLayer( - FullAttention(True, configs.factor, attention_dropout=configs.dropout, output_attention=False), - configs.d_model, configs.n_heads), + FullAttention( + True, + configs.factor, + attention_dropout=configs.dropout, + output_attention=False, + ), + configs.d_model, + configs.n_heads, + ), AttentionLayer( - FullAttention(False, configs.factor, attention_dropout=configs.dropout, output_attention=False), - configs.d_model, configs.n_heads), + FullAttention( + False, + configs.factor, + attention_dropout=configs.dropout, + output_attention=False, + ), + configs.d_model, + configs.n_heads, + ), configs.d_model, configs.d_ff, dropout=configs.dropout, @@ -53,19 +86,28 @@ def __init__(self, configs): for l in range(configs.d_layers) ], norm_layer=torch.nn.LayerNorm(configs.d_model), - projection=nn.Linear(configs.d_model, configs.c_out, bias=True) + projection=nn.Linear(configs.d_model, configs.c_out, bias=True), ) - def forward(self, x_enc, x_mark_enc, x_dec, x_mark_dec, - enc_self_mask=None, dec_self_mask=None, dec_enc_mask=None): - + def forward( + self, + x_enc, + x_mark_enc, + x_dec, + x_mark_dec, + enc_self_mask=None, + dec_self_mask=None, + dec_enc_mask=None, + ): enc_out = self.enc_embedding(x_enc, x_mark_enc) enc_out, 
attns = self.encoder(enc_out, attn_mask=enc_self_mask) dec_out = self.dec_embedding(x_dec, x_mark_dec) - dec_out = self.decoder(dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask) + dec_out = self.decoder( + dec_out, enc_out, x_mask=dec_self_mask, cross_mask=dec_enc_mask + ) if self.output_attention: - return dec_out[:, -self.pred_len:, :], attns + return dec_out[:, -self.pred_len :, :], attns else: - return dec_out[:, -self.pred_len:, :] + return dec_out[:, -self.pred_len :, :] diff --git a/benchmark/utils/arg_resolver.py b/benchmark/utils/arg_resolver.py index 3b73ca1..32e1ad2 100644 --- a/benchmark/utils/arg_resolver.py +++ b/benchmark/utils/arg_resolver.py @@ -1,21 +1,34 @@ -from sklearn.preprocessing import StandardScaler, MinMaxScaler +from sklearn.preprocessing import MinMaxScaler, StandardScaler + def _model_is_transformer(model): - if model in ['FEDformer', 'FEDformer-f', 'FEDformer-w', 'FEDformer_Meta', 'Autoformer', 'Informer', 'Transformer']: + if model in [ + 'FEDformer', + 'FEDformer-f', + 'FEDformer-w', + 'FEDformer_Meta', + 'Autoformer', + 'Informer', + 'Transformer', + ]: return True return False + def setting_string(args, ii): - setting = '{}_{}_sl{}_ll{}_pl{}_timebudget_{}_trainbudget_{}_model-path_{}_itr_{}'.format( - args.model, - args.data, - args.seq_len, - args.label_len, - args.pred_len, - args.time_budget, - args.train_budget, - args.model_name, - ii) + setting = ( + '{}_{}_sl{}_ll{}_pl{}_timebudget_{}_trainbudget_{}_model-path_{}_itr_{}'.format( + args.model, + args.data, + args.seq_len, + args.label_len, + args.pred_len, + args.time_budget, + args.train_budget, + args.model_name, + ii, + ) + ) return setting @@ -36,7 +49,6 @@ def resolve_args(args): return args - def resolve_transformer_args(args): args.mode_select = 'random' args.modes = 64 diff --git a/benchmark/utils/masking.py b/benchmark/utils/masking.py index 4f768bd..6d68561 100644 --- a/benchmark/utils/masking.py +++ b/benchmark/utils/masking.py @@ -1,39 +1,49 @@ -import torch -import numpy as np import math -class TriangularCausalMask(): - def __init__(self, B, L, device="cpu"): +import numpy as np +import torch + + +class TriangularCausalMask: + def __init__(self, B, L, device='cpu'): mask_shape = [B, 1, L, L] with torch.no_grad(): - self._mask = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device) + self._mask = torch.triu( + torch.ones(mask_shape, dtype=torch.bool), diagonal=1 + ).to(device) @property def mask(self): return self._mask -class ProbMask(): - def __init__(self, B, H, L, index, scores, device="cpu"): +class ProbMask: + def __init__(self, B, H, L, index, scores, device='cpu'): _mask = torch.ones(L, scores.shape[-1], dtype=torch.bool).to(device).triu(1) _mask_ex = _mask[None, None, :].expand(B, H, L, scores.shape[-1]) - indicator = _mask_ex[torch.arange(B)[:, None, None], - torch.arange(H)[None, :, None], - index, :].to(device) + indicator = _mask_ex[ + torch.arange(B)[:, None, None], torch.arange(H)[None, :, None], index, : + ].to(device) self._mask = indicator.view(scores.shape).to(device) @property def mask(self): return self._mask -class LocalMask(): - def __init__(self, B, L,S,device="cpu"): + +class LocalMask: + def __init__(self, B, L, S, device='cpu'): mask_shape = [B, 1, L, S] with torch.no_grad(): self.len = math.ceil(np.log2(L)) - self._mask1 = torch.triu(torch.ones(mask_shape, dtype=torch.bool), diagonal=1).to(device) - self._mask2 = ~torch.triu(torch.ones(mask_shape,dtype=torch.bool),diagonal=-self.len).to(device) - self._mask = 
self._mask1+self._mask2 + self._mask1 = torch.triu( + torch.ones(mask_shape, dtype=torch.bool), diagonal=1 + ).to(device) + self._mask2 = ~torch.triu( + torch.ones(mask_shape, dtype=torch.bool), diagonal=-self.len + ).to(device) + self._mask = self._mask1 + self._mask2 + @property def mask(self): - return self._mask \ No newline at end of file + return self._mask diff --git a/benchmark/utils/metrics.py b/benchmark/utils/metrics.py index 3ab2e13..2befaa7 100644 --- a/benchmark/utils/metrics.py +++ b/benchmark/utils/metrics.py @@ -4,7 +4,9 @@ def RSE(pred, true): - return np.sqrt(np.sum((true - pred) ** 2)) / np.sqrt(np.sum((true - true.mean()) ** 2)) + return np.sqrt(np.sum((true - pred) ** 2)) / np.sqrt( + np.sum((true - true.mean()) ** 2) + ) def CORR(pred, true): @@ -42,20 +44,20 @@ def metric(pred, true): return mae, mse, rmse, mape, mspe + def smape(y_true, y_pred): - """ Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. - `loss = 200 * mean(abs((y_true - y_pred) / (y_true + y_pred), axis=-1)` - Args: - y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. - y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. - Returns: - Symmetric mean absolute percentage error values. shape = `[batch_size, d0, .. - dN-1]`. - """ + """Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. + `loss = 200 * mean(abs((y_true - y_pred) / (y_true + y_pred), axis=-1)` + Args: + y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. + y_pred: The predicted values. shape = `[batch_size, d0, .. dN]`. + Returns: + Symmetric mean absolute percentage error values. shape = `[batch_size, d0, .. + dN-1]`. + """ y_pred = tf.convert_to_tensor(y_pred) y_true = tf.cast(y_true, y_pred.dtype) diff = tf.abs( - (y_true - y_pred) / - backend.maximum(y_true + y_pred, backend.epsilon()) + (y_true - y_pred) / backend.maximum(y_true + y_pred, backend.epsilon()) ) return 200.0 * backend.mean(diff, axis=-1) diff --git a/benchmark/utils/timefeatures.py b/benchmark/utils/timefeatures.py index 0e93870..165df2d 100644 --- a/benchmark/utils/timefeatures.py +++ b/benchmark/utils/timefeatures.py @@ -14,63 +14,63 @@ def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: pass def __repr__(self): - return self.__class__.__name__ + "()" + return self.__class__.__name__ + '()' class SecondOfMinute(TimeFeature): """Minute of hour encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [x.second / 59.0 - 0.5 if x!=0 else 0 for x in index] + return [x.second / 59.0 - 0.5 if x != 0 else 0 for x in index] class MinuteOfHour(TimeFeature): """Minute of hour encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [x.minute / 59.0 - 0.5 if x!=0 else 0 for x in index] + return [x.minute / 59.0 - 0.5 if x != 0 else 0 for x in index] class HourOfDay(TimeFeature): """Hour of day encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [x.hour / 23.0 - 0.5 if x!=0 else 0 for x in index] + return [x.hour / 23.0 - 0.5 if x != 0 else 0 for x in index] class DayOfWeek(TimeFeature): """Hour of day encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [x.dayofweek / 6.0 - 0.5 if x!=0 else 0 for x in index] + return [x.dayofweek / 6.0 - 0.5 if x != 0 else 0 for x in index] class DayOfMonth(TimeFeature): """Day of month encoded as value between [-0.5, 0.5]""" def 
__call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [(x.day - 1) / 30.0 - 0.5 if x!=0 else 0 for x in index] + return [(x.day - 1) / 30.0 - 0.5 if x != 0 else 0 for x in index] class DayOfYear(TimeFeature): """Day of year encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [(x.dayofyear - 1) / 365.0 - 0.5 if x!=0 else 0 for x in index] + return [(x.dayofyear - 1) / 365.0 - 0.5 if x != 0 else 0 for x in index] class MonthOfYear(TimeFeature): """Month of year encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [(x.month - 1) / 11.0 - 0.5 if x!=0 else 0 for x in index] + return [(x.month - 1) / 11.0 - 0.5 if x != 0 else 0 for x in index] class WeekOfYear(TimeFeature): """Week of year encoded as value between [-0.5, 0.5]""" def __call__(self, index: pd.DatetimeIndex) -> np.ndarray: - return [(x.isocalendar().week - 1) / 52.0 - 0.5 if x!=0 else 0 for x in index] + return [(x.isocalendar().week - 1) / 52.0 - 0.5 if x != 0 else 0 for x in index] def time_features_from_frequency_str(freq_str: str) -> List[TimeFeature]: diff --git a/benchmark/utils/tools.py b/benchmark/utils/tools.py index e74b399..d3e018b 100644 --- a/benchmark/utils/tools.py +++ b/benchmark/utils/tools.py @@ -1,7 +1,8 @@ +import time + +import matplotlib.pyplot as plt import numpy as np import torch -import matplotlib.pyplot as plt -import time plt.switch_backend('agg') @@ -11,11 +12,8 @@ def adjust_learning_rate(optimizer, epoch, args): if args.lradj == 'type1': lr_adjust = {epoch: args.learning_rate * (0.5 ** ((epoch - 1) // 1))} elif args.lradj == 'type2': - lr_adjust = { - 2: 5e-5, 4: 1e-5, 6: 5e-6, 8: 1e-6, - 10: 5e-7, 15: 1e-7, 20: 5e-8 - } - elif args.lradj =='type3': + lr_adjust = {2: 5e-5, 4: 1e-5, 6: 5e-6, 8: 1e-6, 10: 5e-7, 15: 1e-7, 20: 5e-8} + elif args.lradj == 'type3': lr_adjust = {epoch: args.learning_rate} elif args.lradj == 'type4': lr_adjust = {epoch: args.learning_rate * (0.9 ** ((epoch - 1) // 1))} @@ -53,13 +51,16 @@ def __call__(self, val_loss, model, path): def save_checkpoint(self, val_loss, model, path, epoch=None): if self.verbose: - print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') + print( + f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...' 
+ ) if epoch: torch.save(model.state_dict(), path + '/' + f'checkpoint_{epoch}.pth') else: torch.save(model.state_dict(), path + '/' + 'checkpoint.pth') self.val_loss_min = val_loss + class TimeBudget: def __init__(self, budget): self.budget = budget @@ -83,14 +84,16 @@ def step(self): self.end_timer() return + class dotdict(dict): """dot.notation access to dictionary attributes""" + __getattr__ = dict.get __setattr__ = dict.__setitem__ __delattr__ = dict.__delitem__ -class StandardScaler(): +class StandardScaler: def __init__(self, mean, std): self.mean = mean self.std = std diff --git a/src/build_validation_dataset.py b/src/build_validation_dataset.py index c365865..8e648ca 100644 --- a/src/build_validation_dataset.py +++ b/src/build_validation_dataset.py @@ -2,22 +2,27 @@ Module to transform different real world datasets into format used for our synthetic dataset """ -import pandas as pd -import numpy as np -import tensorflow as tf import csv from datetime import datetime +from functools import reduce + +import numpy as np +import pandas as pd +import tensorflow as tf from dateutil.relativedelta import relativedelta from tqdm import trange -from functools import reduce HISTORY = 100 HORIZON = 10 NUM_TASKS = 3 + def compute_time_features(ts: np.ndarray): ts = pd.to_datetime(ts) - return np.stack([ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) + return np.stack( + [ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) + def build_input(ts, target_full, task=1): # horizon should be fixed as defined in model @@ -28,11 +33,12 @@ def build_input(ts, target_full, task=1): target = target_full[:-HORIZON] target_to_predict = target_full[-HORIZON:] - if task == 2: target_to_predict = np.cumsum(target_to_predict) / (1 + np.arange(HORIZON)) elif task == 3: - target_to_predict = [np.std(target_to_predict[:i+1]) for i in range(len(target_to_predict))] + target_to_predict = [ + np.std(target_to_predict[: i + 1]) for i in range(len(target_to_predict)) + ] # this is the target value of the data before the horizon target = tf.convert_to_tensor(target, dtype=tf.float32) @@ -49,15 +55,20 @@ def build_input(ts, target_full, task=1): return { 'ts': tf.repeat(tf.expand_dims(date_tensor, axis=0), [HORIZON], axis=0), - # repeat the before horizon values horizon number of times, # so that for each of the predictions for each target_ts, you # have an available set of features 'history': tf.repeat(tf.expand_dims(target, axis=0), [HORIZON], axis=0), 'target_ts': tf.expand_dims(target_dates, axis=1), - 'task': tf.fill([HORIZON,], task) + 'task': tf.fill( + [ + HORIZON, + ], + task, + ), }, tf.expand_dims(tf.convert_to_tensor(target_to_predict, dtype=tf.float32), axis=1) + def read_timeseries_file(filename): """ Function to read the standard datasets for time series. 
@@ -72,6 +83,7 @@ def read_timeseries_file(filename): return lines + def get_dates(num_days, freq): dates = [] @@ -84,14 +96,15 @@ def get_dates(num_days, freq): for _ in range(num_days): dates.append(pd.to_datetime(current_date)) if freq == 'daily': - current_date += relativedelta(days = 1) + current_date += relativedelta(days=1) elif freq == 'weekly': - current_date += relativedelta(weeks = 1) + current_date += relativedelta(weeks=1) elif freq == 'monthly': - current_date += relativedelta(months = 1) + current_date += relativedelta(months=1) return dates + def split_dataset(dataset): """ If the size of dataset is n * (HISTORY + HORIZON), we split it @@ -105,8 +118,8 @@ def split_dataset(dataset): # otherwise, slide a window starting from the first point # with a stride of HISTORY // 3 until the elements in # window are less than HISTORY + HORIZON - while i == 0 or i+HISTORY+HORIZON < len(dataset): - mini_datasets.append(dataset[i:i+HISTORY+HORIZON]) + while i == 0 or i + HISTORY + HORIZON < len(dataset): + mini_datasets.append(dataset[i : i + HISTORY + HORIZON]) i += HISTORY // 3 return mini_datasets @@ -122,11 +135,11 @@ def build_dataset(dataset, freq): # TODO: change it from 2 # keeping it 2 for testing, as dataset creation takes time for i in trange(100): - # for i in trange(len(dataset)): + # for i in trange(len(dataset)): for X in split_dataset(dataset[i]): dates = get_dates(len(X), freq) - for task in range(1, NUM_TASKS+1): + for task in range(1, NUM_TASKS + 1): built_input, output = build_input(dates, X, task=task) ts_list += [ts for ts in built_input['ts']] @@ -136,15 +149,21 @@ def build_dataset(dataset, freq): outputs += [y for y in output] - dataset_frame = tf.data.Dataset.from_tensor_slices(({ - 'ts': ts_list, - 'history': history_list, - 'target_ts': target_ts_list, - 'task': task_list - }, outputs)) + dataset_frame = tf.data.Dataset.from_tensor_slices( + ( + { + 'ts': ts_list, + 'history': history_list, + 'target_ts': target_ts_list, + 'task': task_list, + }, + outputs, + ) + ) return dataset_frame + def construct_dataframe(train_dataset_and_freq): """ Function to construct the dataframe in accordance with the training format @@ -155,22 +174,29 @@ def construct_dataframe(train_dataset_and_freq): return reduce(lambda df1, df2: df1.concatenate(df2), dfs) + def get_validation_dataset(): """ Function to read data from various sources and feed them as input to build a dataframe for getting the validation dataset """ - wikiweb_train = read_timeseries_file("/home/ubuntu/notebooks/forecasting/pretraining/wikiweb_train.csv") - tourism_train = read_timeseries_file("/home/ubuntu/notebooks/forecasting/pretraining/tourism_train.csv") - exchange_rate_train = read_timeseries_file("/home/ubuntu/notebooks/forecasting/pretraining/exchange_rate_train.csv") - m3_train = read_timeseries_file("/home/ubuntu/notebooks/forecasting/pretraining/m3_train.csv") + wikiweb_train = read_timeseries_file( + '/home/ubuntu/notebooks/forecasting/pretraining/wikiweb_train.csv' + ) + tourism_train = read_timeseries_file( + '/home/ubuntu/notebooks/forecasting/pretraining/tourism_train.csv' + ) + read_timeseries_file( + '/home/ubuntu/notebooks/forecasting/pretraining/exchange_rate_train.csv' + ) + read_timeseries_file('/home/ubuntu/notebooks/forecasting/pretraining/m3_train.csv') # add different datasets and their frequency here # TODO: addition of monthly dataset shoots up # validation loss to ~40k. 
Need to see how to fix that train_dataset_and_freq = [ - (wikiweb_train, "daily"), - (tourism_train, "monthly"), + (wikiweb_train, 'daily'), + (tourism_train, 'monthly'), # (exchange_rate_train, "daily"), # (m3_train, "monthly") ] @@ -179,8 +205,10 @@ def get_validation_dataset(): # print(len(list(constructed_dataframe))) return constructed_dataframe + def main(): get_validation_dataset() + if __name__ == '__main__': main() diff --git a/src/evaluate_public_datasets/evaluate.py b/src/evaluate_public_datasets/evaluate.py index cf3055d..e7252d6 100644 --- a/src/evaluate_public_datasets/evaluate.py +++ b/src/evaluate_public_datasets/evaluate.py @@ -1,26 +1,29 @@ """ Module to evaluate the model on real world datasets """ -import yaml import argparse -import tensorflow as tf -import tensorflow_io -import pandas as pd + import numpy as np +import pandas as pd +import tensorflow as tf +import yaml from process_data import read_timeseries_file -from tqdm import trange from scipy.stats.mstats import winsorize -from sklearn.metrics import mean_squared_error, mean_absolute_error -from sklearn.preprocessing import StandardScaler, MinMaxScaler - +from sklearn.metrics import mean_absolute_error, mean_squared_error +from sklearn.preprocessing import MinMaxScaler +from tqdm import trange HISTORY = 100 + def compute_time_features(ts: np.ndarray): ts = pd.to_datetime(ts) - return np.stack([ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) + return np.stack( + [ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) # return np.stack([ts.minute, ts.hour, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) + def build_input(ts, target, task=1): horizon = len(ts) - len(target) all_dates = tf.numpy_function(compute_time_features, inp=[ts], Tout=tf.int64) @@ -41,31 +44,39 @@ def build_input(ts, target, task=1): target = target[-HISTORY:] return { 'ts': tf.repeat(tf.expand_dims(date_tensor, axis=0), [horizon], axis=0), - # repeat the before horizon values horizon number of times, # so that for each of the predictions for each target_ts, you # have an available set of features 'history': tf.repeat(tf.expand_dims(target, axis=0), [horizon], axis=0), 'target_ts': tf.expand_dims(target_dates, axis=1), - 'task': tf.fill([horizon,], task), + 'task': tf.fill( + [ + horizon, + ], + task, + ), } + def evaluate_model(config, train_data, test_data, freq, name): pretrained = tf.keras.models.load_model(config['model_path']) BATCH_SIZE = 100 - item_id, pred_start, actual, pred = [], [], [], [] + _item_id, _pred_start, actual, pred = [], [], [], [] stds = [] wapes = [] for i in trange(0, len(train_data), BATCH_SIZE): - test_points = train_data[i:(i+BATCH_SIZE)] + test_points = train_data[i : (i + BATCH_SIZE)] for idx, current_point in enumerate(test_points): - # timestamps of history - history_ts = pd.date_range(start='2010-01-01', periods=len(train_data[i+idx] + test_data[i+idx]), freq=freq) + history_ts = pd.date_range( + start='2010-01-01', + periods=len(train_data[i + idx] + test_data[i + idx]), + freq=freq, + ) # values of history - history = train_data[i+idx] + history = train_data[i + idx] # mean of history's last 6 values history_mean = np.nanmean(history[-6:]) @@ -74,7 +85,7 @@ def evaluate_model(config, train_data, test_data, freq, name): history_std = np.nanstd(history[-6:]) # local scale, don't know why defined so - local_scale = (history_mean + history_std + 1e-4) + local_scale = history_mean + history_std + 1e-4 # change history based on local scale, to normalize it 
between 0 and 1 history = np.clip(history / local_scale, a_min=0, a_max=1) @@ -87,19 +98,26 @@ def evaluate_model(config, train_data, test_data, freq, name): pred_vals = pretrained(build_input(history_ts, history, task=1)) # get scaled mean based on the given history - scaled_vals = (pred_vals['result'].numpy().reshape(-1) * pred_vals['scale'].numpy().reshape(-1)) * local_scale - - if np.mean(np.array(test_data[i+idx])): - wape = np.mean(np.abs(scaled_vals - np.array(test_data[i+idx]))) / np.mean(np.array(test_data[i+idx])) + scaled_vals = ( + pred_vals['result'].numpy().reshape(-1) + * pred_vals['scale'].numpy().reshape(-1) + ) * local_scale + + if np.mean(np.array(test_data[i + idx])): + wape = np.mean( + np.abs(scaled_vals - np.array(test_data[i + idx])) + ) / np.mean(np.array(test_data[i + idx])) wapes.append(wape) - assert len(scaled_vals) == len(test_data[i+idx]) + assert len(scaled_vals) == len(test_data[i + idx]) scaler = MinMaxScaler() - scaler.fit(np.array(train_data[i+idx]).reshape(-1, 1)) + scaler.fit(np.array(train_data[i + idx]).reshape(-1, 1)) predicted_scaled = scaler.transform(np.array(scaled_vals).reshape(-1, 1)) - actual_scaled = scaler.transform(np.array(test_data[i+idx]).reshape(-1, 1)) + actual_scaled = scaler.transform( + np.array(test_data[i + idx]).reshape(-1, 1) + ) stds.append(np.std(actual_scaled)) for pred_val, actual_val in zip(predicted_scaled, actual_scaled): @@ -108,31 +126,29 @@ def evaluate_model(config, train_data, test_data, freq, name): pred.append(pred_val) actual.append(actual_val) - - - eval_clipped_df = pd.DataFrame(dict( - actual=actual, - pred=pred - )) + eval_clipped_df = pd.DataFrame(dict(actual=actual, pred=pred)) eval_clipped_df = eval_clipped_df.assign( cmape=lambda df: np.abs(df.actual - df.pred) / df.actual ).assign( winsorized_cmape=lambda df: winsorize(df.cmape, (0.01, 0.01)), - squashed_cmape=lambda df: np.where(df.cmape > 1, 1 + np.log(df.cmape), df.cmape) + squashed_cmape=lambda df: np.where( + df.cmape > 1, 1 + np.log(df.cmape), df.cmape + ), ) print(eval_clipped_df[(eval_clipped_df.actual > 0)].describe()) # print(wapes) # print(np.nanmean(wapes)) - print("MAE:", mean_absolute_error(actual, pred)) - print("MSE:", mean_squared_error(actual, pred)) + print('MAE:', mean_absolute_error(actual, pred)) + print('MSE:', mean_squared_error(actual, pred)) print(np.mean(stds)) + def main(): parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", required=True, help="Path to config file") + parser.add_argument('-c', '--config', required=True, help='Path to config file') args = parser.parse_args() with open(args.config) as config_file: @@ -141,10 +157,8 @@ def main(): train_data = read_timeseries_file(config['train_file']) test_data = read_timeseries_file(config['test_file']) - evaluate_model(config, train_data, test_data, config['freq'], config['name']) - if __name__ == '__main__': main() diff --git a/src/evaluate_public_datasets/process_data.py b/src/evaluate_public_datasets/process_data.py index 043c75d..fbf58b7 100644 --- a/src/evaluate_public_datasets/process_data.py +++ b/src/evaluate_public_datasets/process_data.py @@ -3,10 +3,10 @@ them as a tfrecords file """ import csv -import tensorflow as tf -import numpy as np + import pandas as pd + def read_timeseries_file(filename): """ Function to read the standard datasets for time series. 
@@ -27,4 +27,3 @@ def generate_tf_test_examples(name, train_data, test_data, freq): dates = pd.date_range(start='2010-01-01', periods=len_data, freq=freq) return name, dates, train_data, test_data - diff --git a/src/evaluation/evaluate.py b/src/evaluation/evaluate.py index b3158cd..d337d60 100644 --- a/src/evaluation/evaluate.py +++ b/src/evaluation/evaluate.py @@ -2,17 +2,17 @@ Module to evaluate on the customer dataset """ -import yaml import argparse -import pandas as pd + import numpy as np +import pandas as pd +import reainternal.mllibs.pipelinelib as PL import tensorflow as tf -import tensorflow_io +import yaml +from prepare_dataset import build_input from reainternal import environment -import reainternal.mllibs.pipelinelib as PL from scipy.stats.mstats import winsorize from tqdm import trange -from prepare_dataset import build_input def evaluate(config): @@ -29,12 +29,19 @@ def evaluate(config): BATCH_SIZE = 100 item_id, pred_start, actual, pred = [], [], [], [] for i in trange(0, len(record_index), BATCH_SIZE): - test_points = list(model_info.prepared_dataset_instance.get_prediction_records(record_index[i:(i + BATCH_SIZE)])) + test_points = list( + model_info.prepared_dataset_instance.get_prediction_records( + record_index[i : (i + BATCH_SIZE)] + ) + ) for current_point in test_points: # contains the history of available values and the targets - prediction_record, _ = model_info.serving_dataset_instance.dataset_class.prepare_data_for_prediction( - model_info.serving_dataset_instance, - current_point.model_input) + ( + prediction_record, + _, + ) = model_info.serving_dataset_instance.dataset_class.prepare_data_for_prediction( + model_info.serving_dataset_instance, current_point.model_input + ) # timestamps of history history_ts = prediction_record[ts_col] @@ -49,7 +56,7 @@ def evaluate(config): history_std = np.nanstd(history[-6:]) # local scale, don't know why defined so - local_scale = (history_mean + history_std + 1e-4) + local_scale = history_mean + history_std + 1e-4 # change history based on local scale, to normalize it between 0 and 1 history = np.clip(history / local_scale, a_min=0, a_max=1) @@ -58,34 +65,38 @@ def evaluate(config): pred_mean = pretrained(build_input(history_ts, history, task=2)) # get scaled mean based on the given history - scaled_mean = (pred_mean['result'].numpy().reshape(-1) * pred_mean['scale'].numpy().reshape(-1)) * local_scale + scaled_mean = ( + pred_mean['result'].numpy().reshape(-1) + * pred_mean['scale'].numpy().reshape(-1) + ) * local_scale item_id.append(current_point.test_info[0]) pred_start.append(current_point.test_info[1]) actual.append(np.mean(current_point.actual[target_col])) pred.append(scaled_mean[-1]) - eval_clipped_df = pd.DataFrame(dict( - item_id=item_id, - pred_start=pred_start, - actual=actual, - pred=pred - )) + eval_clipped_df = pd.DataFrame( + dict(item_id=item_id, pred_start=pred_start, actual=actual, pred=pred) + ) eval_clipped_df = eval_clipped_df.assign( cmape=lambda df: np.abs(df.actual - df.pred) / df.actual ).assign( winsorized_cmape=lambda df: winsorize(df.cmape, (0.01, 0.01)), - squashed_cmape=lambda df: np.where(df.cmape > 1, 1 + np.log(df.cmape), df.cmape) + squashed_cmape=lambda df: np.where( + df.cmape > 1, 1 + np.log(df.cmape), df.cmape + ), ) - return eval_clipped_df[(eval_clipped_df.actual > 0) & (eval_clipped_df.pred_start == '2021-06-30T00:00:00')].describe() - + return eval_clipped_df[ + (eval_clipped_df.actual > 0) + & (eval_clipped_df.pred_start == '2021-06-30T00:00:00') + ].describe() def main(): parser 
= argparse.ArgumentParser() - parser.add_argument("-c", "--config", required=True, help="Path to config file") + parser.add_argument('-c', '--config', required=True, help='Path to config file') args = parser.parse_args() with open(args.config) as config_file: @@ -96,5 +107,6 @@ def main(): results_df = evaluate(config) print(results_df) + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/evaluation/prepare_dataset.py b/src/evaluation/prepare_dataset.py index e69f5e9..80de27f 100644 --- a/src/evaluation/prepare_dataset.py +++ b/src/evaluation/prepare_dataset.py @@ -1,16 +1,19 @@ """ Module to prepare customer dataset for evaluation """ -import pandas as pd import numpy as np +import pandas as pd import tensorflow as tf -import tensorflow_io HISTORY = 100 + def compute_time_features(ts: np.ndarray): ts = pd.to_datetime(ts) - return np.stack([ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) + return np.stack( + [ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) + def build_input(ts, target, task=1): horizon = len(ts) - len(target) @@ -31,11 +34,15 @@ def build_input(ts, target, task=1): date_tensor = date_tensor[-HISTORY:] return { 'ts': tf.repeat(tf.expand_dims(date_tensor, axis=0), [horizon], axis=0), - # repeat the before horizon values horizon number of times, # so that for each of the predictions for each target_ts, you # have an available set of features 'history': tf.repeat(tf.expand_dims(target, axis=0), [horizon], axis=0), 'target_ts': tf.expand_dims(target_dates, axis=1), - 'task': tf.fill([horizon,], task), + 'task': tf.fill( + [ + horizon, + ], + task, + ), } diff --git a/src/prepare_public_datasets/constants.py b/src/prepare_public_datasets/constants.py index c14c76e..d81da53 100644 --- a/src/prepare_public_datasets/constants.py +++ b/src/prepare_public_datasets/constants.py @@ -1,2 +1,2 @@ CONTEXT_LENGTH = 200 -WINDOW_STRIDE = 30 \ No newline at end of file +WINDOW_STRIDE = 30 diff --git a/src/prepare_public_datasets/prepare.py b/src/prepare_public_datasets/prepare.py index bca22d7..eb30b10 100644 --- a/src/prepare_public_datasets/prepare.py +++ b/src/prepare_public_datasets/prepare.py @@ -2,16 +2,18 @@ Module to prepare public datasets for training """ -import csv -import yaml import argparse +import csv +from tempfile import NamedTemporaryFile + import numpy as np import pandas as pd import tensorflow as tf -from tempfile import NamedTemporaryFile +import yaml +from constants import CONTEXT_LENGTH, WINDOW_STRIDE from reainternal.cloud import CloudLocation from tqdm import tqdm -from constants import * + def read_timeseries_file(filename): """ @@ -27,6 +29,7 @@ def read_timeseries_file(filename): return lines + def generate_tf_train_examples(name, train_data_list, freq): """ Method to generate the examples from train data @@ -36,7 +39,7 @@ def generate_tf_train_examples(name, train_data_list, freq): i = len(train_data) while i > 0: - train_data = train_data[max(i-CONTEXT_LENGTH, 0):i] + train_data = train_data[max(i - CONTEXT_LENGTH, 0) : i] if len(train_data) < CONTEXT_LENGTH: train_data = [0] * (CONTEXT_LENGTH - len(train_data)) + train_data @@ -53,23 +56,24 @@ def generate_tf_train_examples(name, train_data_list, freq): print(train_data) yield tf.train.Example( - features=tf.train.Features( - feature={ - "id": tf.train.Feature(bytes_list=tf.train.BytesList(value=[name.encode()])), - "ts": tf.train.Feature( - int64_list=tf.train.Int64List( - value=dates.astype(np.int64) - ) - ), - "y": 
tf.train.Feature( - float_list=tf.train.FloatList(value=train_data) - ), - "noise": tf.train.Feature( - float_list=tf.train.FloatList(value=noise) - ), - } - ) + features=tf.train.Features( + feature={ + 'id': tf.train.Feature( + bytes_list=tf.train.BytesList(value=[name.encode()]) + ), + 'ts': tf.train.Feature( + int64_list=tf.train.Int64List(value=dates.astype(np.int64)) + ), + 'y': tf.train.Feature( + float_list=tf.train.FloatList(value=train_data) + ), + 'noise': tf.train.Feature( + float_list=tf.train.FloatList(value=noise) + ), + } ) + ) + def save_tf_records(prefix: str, dest: str, it): """ @@ -77,35 +81,40 @@ def save_tf_records(prefix: str, dest: str, it): """ with NamedTemporaryFile() as tfile: with tf.io.TFRecordWriter( - tfile.name, options=tf.io.TFRecordOptions(compression_type="GZIP") + tfile.name, options=tf.io.TFRecordOptions(compression_type='GZIP') ) as writer: for record in tqdm(it): writer.write(record.SerializeToString()) tfile.seek(0) CloudLocation(prefix + dest).copy_from_file(tfile) + def save_tf_dataset(prefix: str, dataset_name: str, data: list, freq: str): """ Generate dataset and save as tf records """ save_tf_records( prefix, - f"{dataset_name}.tfrecords", - generate_tf_train_examples(dataset_name, data, freq) + f'{dataset_name}.tfrecords', + generate_tf_train_examples(dataset_name, data, freq), ) - print(f"Written to file {dataset_name}.tfrecords") + print(f'Written to file {dataset_name}.tfrecords') + def main(): parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", required=True, help="Path to config file") + parser.add_argument('-c', '--config', required=True, help='Path to config file') args = parser.parse_args() with open(args.config) as config_file: config = yaml.load(config_file, yaml.loader.SafeLoader) train_data = read_timeseries_file(config['train_path']) - save_tf_dataset(config['prefix'], config['dataset_name'], train_data, config['freq']) + save_tf_dataset( + config['prefix'], config['dataset_name'], train_data, config['freq'] + ) + if __name__ == '__main__': - main() \ No newline at end of file + main() diff --git a/src/synthetic_generation/config_variables.py b/src/synthetic_generation/config_variables.py index 9e8b280..55382d8 100644 --- a/src/synthetic_generation/config_variables.py +++ b/src/synthetic_generation/config_variables.py @@ -2,6 +2,7 @@ Module containing configuration setting for the script """ + class Config: frequencies = None frequency_names = None @@ -11,14 +12,35 @@ class Config: @classmethod def set_freq_variables(cls, is_sub_day): if is_sub_day: - cls.frequencies = [("min", 1/1440), ("H", 1/24), ("D", 1), ("W", 7), ("MS", 30), ("Y", 12)] - cls.frequency_names = ["minute", "hourly", "daily", "weekly", "monthly", "yearly"] - cls.freq_and_index = (("minute", 0), ("hourly", 1), ("daily", 2), ("weekly", 3), ("monthly", 4), ("yearly", 5)) + cls.frequencies = [ + ('min', 1 / 1440), + ('H', 1 / 24), + ('D', 1), + ('W', 7), + ('MS', 30), + ('Y', 12), + ] + cls.frequency_names = [ + 'minute', + 'hourly', + 'daily', + 'weekly', + 'monthly', + 'yearly', + ] + cls.freq_and_index = ( + ('minute', 0), + ('hourly', 1), + ('daily', 2), + ('weekly', 3), + ('monthly', 4), + ('yearly', 5), + ) else: - cls.frequencies = [("D", 1), ("W", 7), ("MS", 30)] - cls.frequency_names = ["daily", "weekly", "monthly"] - cls.freq_and_index = (("daily", 0), ("weekly", 1), ("monthly", 2)) + cls.frequencies = [('D', 1), ('W', 7), ('MS', 30)] + cls.frequency_names = ['daily', 'weekly', 'monthly'] + cls.freq_and_index = (('daily', 0), 
('weekly', 1), ('monthly', 2)) @classmethod def set_transition(cls, transition): - cls.transition = transition \ No newline at end of file + cls.transition = transition diff --git a/src/synthetic_generation/constants.py b/src/synthetic_generation/constants.py index 505c9d7..1c9712e 100644 --- a/src/synthetic_generation/constants.py +++ b/src/synthetic_generation/constants.py @@ -3,28 +3,29 @@ """ from datetime import date + import tensorflow as tf -BASE_START = date.fromisoformat("1885-01-01").toordinal() -BASE_END = date.fromisoformat("2023-12-31").toordinal() + 1 +BASE_START = date.fromisoformat('1885-01-01').toordinal() +BASE_END = date.fromisoformat('2023-12-31').toordinal() + 1 PRODUCT_SCHEMA = { - "doc": "Timeseries sample", - "name": "TimeseriesSample", - "type": "record", - "fields": [ - {"name": "id", "type": "string"}, - {"name": "ts", "type": {"type": "int", "logicalType": "date"}}, - {"name": "y", "type": ["null", "float"]}, - {"name": "noise", "type": ["float"]} + 'doc': 'Timeseries sample', + 'name': 'TimeseriesSample', + 'type': 'record', + 'fields': [ + {'name': 'id', 'type': 'string'}, + {'name': 'ts', 'type': {'type': 'int', 'logicalType': 'date'}}, + {'name': 'y', 'type': ['null', 'float']}, + {'name': 'noise', 'type': ['float']}, ], } CONTEXT_LENGTH = 1_000 TF_SCHEMA = { - "id": tf.io.FixedLenFeature([], dtype=tf.string), - "ts": tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.int64), - "y": tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32), - "noise": tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32) + 'id': tf.io.FixedLenFeature([], dtype=tf.string), + 'ts': tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.int64), + 'y': tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32), + 'noise': tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32), } diff --git a/src/synthetic_generation/generate_series.py b/src/synthetic_generation/generate_series.py index 7f6b6f8..39cab33 100644 --- a/src/synthetic_generation/generate_series.py +++ b/src/synthetic_generation/generate_series.py @@ -2,19 +2,21 @@ Module to generate synthetic series """ +from datetime import date + import numpy as np import pandas as pd -from datetime import date -from pandas.tseries.frequencies import to_offset -from constants import * from config_variables import Config +from constants import BASE_END, BASE_START, CONTEXT_LENGTH from generate_series_components import make_series -from utils import sample_scale, get_transition_coefficients -from series_config import ComponentScale, SeriesConfig, ComponentNoise +from pandas.tseries.frequencies import to_offset from scipy.stats import beta +from series_config import ComponentNoise, ComponentScale, SeriesConfig +from utils import get_transition_coefficients, sample_scale + def __generate( - n = 100, + n=100, freq_index: int = None, start: pd.Timestamp = None, options: dict = {}, @@ -31,22 +33,22 @@ def __generate( # annual, monthly, weekly, hourly and minutely components a, m, w, h, minute = 0.0, 0.0, 0.0, 0.0, 0.0 - if freq == "min": + if freq == 'min': minute = np.random.uniform(0.0, 1.0) h = np.random.uniform(0.0, 0.2) - elif freq == "H": + elif freq == 'H': minute = np.random.uniform(0.0, 0.2) h = np.random.uniform(0.0, 1) - elif freq == "D": + elif freq == 'D': w = np.random.uniform(0.0, 1.0) m = np.random.uniform(0.0, 0.2) - elif freq == "W": + elif freq == 'W': m = np.random.uniform(0.0, 0.3) a = np.random.uniform(0.0, 0.3) - elif freq == "MS": + elif freq == 'MS': w = np.random.uniform(0.0, 0.1) a = np.random.uniform(0.0, 0.5) - 
elif freq == "Y": + elif freq == 'Y': w = np.random.uniform(0.0, 0.2) a = np.random.uniform(0.0, 1) else: @@ -54,7 +56,9 @@ def __generate( if start is None: # start = pd.Timestamp(date.fromordinal(np.random.randint(BASE_START, BASE_END))) - start = pd.Timestamp(date.fromordinal(int((BASE_START - BASE_END)*beta.rvs(5,1)+BASE_START))) + start = pd.Timestamp( + date.fromordinal(int((BASE_START - BASE_END) * beta.rvs(5, 1) + BASE_START)) + ) scale_config = ComponentScale( 1.0, @@ -64,7 +68,7 @@ def __generate( m=m, w=w, minute=minute, - h=h + h=h, ) offset_config = ComponentScale( @@ -77,17 +81,16 @@ def __generate( ) noise_config = ComponentNoise( - k=np.random.uniform(1, 5), - median=1, - scale=sample_scale() + k=np.random.uniform(1, 5), median=1, scale=sample_scale() ) cfg = SeriesConfig(scale_config, offset_config, noise_config) return cfg, make_series(cfg, to_offset(freq), n, start, options, random_walk) + def generate( - n = 100, + n=100, freq_index: int = None, start: pd.Timestamp = None, options: dict = {}, @@ -106,10 +109,8 @@ def generate( else: values = series1['values'] - dataframe_data = { - 'series_values': values, - 'noise': series1['noise'] - } - - return cfg1, pd.DataFrame(data=dataframe_data, index=series1['dates'])#.clip(lower=0.0) + dataframe_data = {'series_values': values, 'noise': series1['noise']} + return cfg1, pd.DataFrame( + data=dataframe_data, index=series1['dates'] + ) # .clip(lower=0.0) diff --git a/src/synthetic_generation/generate_series_components.py b/src/synthetic_generation/generate_series_components.py index 7bc88b2..900d5e2 100644 --- a/src/synthetic_generation/generate_series_components.py +++ b/src/synthetic_generation/generate_series_components.py @@ -1,12 +1,13 @@ """ Module to generate trend and seasonal components of series """ +from collections import defaultdict + import numpy as np import pandas as pd -from constants import * from series_config import SeriesConfig -from utils import shift_axis, weibull_noise, get_random_walk_series -from collections import defaultdict +from utils import get_random_walk_series, shift_axis, weibull_noise + def make_series_trend(series: SeriesConfig, dates: pd.DatetimeIndex): """ @@ -25,6 +26,7 @@ def make_series_trend(series: SeriesConfig, dates: pd.DatetimeIndex): return values + def get_freq_component(dates_feature: pd.Index, n_harmonics: int, n_total: int): """ Method to get systematic movement of values across time @@ -38,7 +40,7 @@ def get_freq_component(dates_feature: pd.Index, n_harmonics: int, n_total: int): :return: numpy array of shape dates_feature.shape containing sinusoidal value for a given point in time """ - harmonics = list(range(1, n_harmonics+1)) + harmonics = list(range(1, n_harmonics + 1)) # initialize sin and cosine coefficients with 0 sin_coef = np.zeros(n_harmonics) @@ -46,8 +48,8 @@ def get_freq_component(dates_feature: pd.Index, n_harmonics: int, n_total: int): # choose coefficients inversely proportional to the harmonic for idx, harmonic in enumerate(harmonics): - sin_coef[idx] = np.random.normal(scale = 1 / harmonic) - cos_coef[idx] = np.random.normal(scale = 1 / harmonic) + sin_coef[idx] = np.random.normal(scale=1 / harmonic) + cos_coef[idx] = np.random.normal(scale=1 / harmonic) # normalize the coefficients such that their sum of squares is 1 coef_sq_sum = np.sqrt(np.sum(np.square(sin_coef)) + np.sum(np.square(cos_coef))) @@ -58,8 +60,12 @@ def get_freq_component(dates_feature: pd.Index, n_harmonics: int, n_total: int): # comprises of patterns of varying frequency return_val = 0 for 
idx, harmonic in enumerate(harmonics): - return_val += sin_coef[idx] * np.sin(2 * np.pi * harmonic * dates_feature / n_total) - return_val += cos_coef[idx] * np.cos(2 * np.pi * harmonic * dates_feature / n_total) + return_val += sin_coef[idx] * np.sin( + 2 * np.pi * harmonic * dates_feature / n_total + ) + return_val += cos_coef[idx] * np.cos( + 2 * np.pi * harmonic * dates_feature / n_total + ) return return_val @@ -75,24 +81,35 @@ def make_series_seasonal(series: SeriesConfig, dates: pd.DatetimeIndex): seasonal_components = defaultdict(lambda: 1) if series.scale.minute is not None: - seasonal_components['minute'] = 1 + series.scale.minute * get_freq_component(dates.minute, 10, 60) + seasonal_components['minute'] = 1 + series.scale.minute * get_freq_component( + dates.minute, 10, 60 + ) seasonal *= seasonal_components['minute'] if series.scale.h is not None: - seasonal_components['h'] = 1 + series.scale.h * get_freq_component(dates.hour, 10, 24) + seasonal_components['h'] = 1 + series.scale.h * get_freq_component( + dates.hour, 10, 24 + ) seasonal *= seasonal_components['h'] if series.scale.a is not None: - seasonal_components['a'] = 1 + series.scale.a * get_freq_component(dates.month, 6, 12) + seasonal_components['a'] = 1 + series.scale.a * get_freq_component( + dates.month, 6, 12 + ) seasonal *= seasonal_components['a'] if series.scale.m is not None: - seasonal_components['m'] = 1 + series.scale.m * get_freq_component(dates.day, 10, 30.5) + seasonal_components['m'] = 1 + series.scale.m * get_freq_component( + dates.day, 10, 30.5 + ) seasonal *= seasonal_components['m'] if series.scale.w is not None: - seasonal_components['w'] = 1 + series.scale.w * get_freq_component(dates.dayofweek, 4, 7) + seasonal_components['w'] = 1 + series.scale.w * get_freq_component( + dates.dayofweek, 4, 7 + ) seasonal *= seasonal_components['w'] seasonal_components['seasonal'] = seasonal return seasonal_components + def make_series( series: SeriesConfig, freq: pd.DateOffset, @@ -120,7 +137,7 @@ def make_series( weibull_noise_term = weibull_noise( k=series.noise_config.k, median=series.noise_config.median, - length=len(values) + length=len(values), ) # approximating estimated value from median @@ -128,13 +145,15 @@ def make_series( # expected value of this term is 0 # for no noise, scale is set to 0 - scaled_noise_term = series.noise_config.scale * (weibull_noise_term - noise_expected_val) + scaled_noise_term = series.noise_config.scale * ( + weibull_noise_term - noise_expected_val + ) dataframe_data = { **values_seasonal, 'values': values, 'noise': 1 + scaled_noise_term, - 'dates': dates + 'dates': dates, } return dataframe_data diff --git a/src/synthetic_generation/main.py b/src/synthetic_generation/main.py index df34c78..ee35e21 100644 --- a/src/synthetic_generation/main.py +++ b/src/synthetic_generation/main.py @@ -2,17 +2,17 @@ Module to generate synthetic dataset for pre training a time series forecasting model """ -import yaml import argparse -import pandas as pd + +import yaml +from config_variables import Config from tf_generate_series import ( - save_tf_records, - tf_generate_n, convert_tf_to_rows, - load_tf_dataset, generate_product_input, + load_tf_dataset, + save_tf_records, + tf_generate_n, ) -from config_variables import Config def save_tf_dataset(prefix: str, version: str, options: dict, num_series: int = 10_000): @@ -20,10 +20,10 @@ def save_tf_dataset(prefix: str, version: str, options: dict, num_series: int = Generate dataset and save as tf records """ for freq, freq_index in 
Config.freq_and_index: - print("Frequency: " + freq) + print('Frequency: ' + freq) save_tf_records( prefix, - f"{version}/{freq}.tfrecords", + f'{version}/{freq}.tfrecords', tf_generate_n( N=num_series, freq_index=freq_index, @@ -38,32 +38,35 @@ def generate_product_input_dataset(prefix, version): Load dataset from tf records and save as avro files """ for freq in Config.frequency_names: - print("Frequency: " + freq) + print('Frequency: ' + freq) generate_product_input( prefix, - f"{version}/{freq}.avro", + f'{version}/{freq}.avro', convert_tf_to_rows( - load_tf_dataset(prefix, f"{version}/{freq}.tfrecords").as_numpy_iterator() + load_tf_dataset( + prefix, f'{version}/{freq}.tfrecords' + ).as_numpy_iterator() ), ) def main(): parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", required=True, help="Path to config file") + parser.add_argument('-c', '--config', required=True, help='Path to config file') args = parser.parse_args() with open(args.config) as config_file: config = yaml.load(config_file, yaml.loader.SafeLoader) - Config.set_freq_variables(config["sub_day"]) - if "transition" in config: - Config.set_transition(config["transition"]) - + Config.set_freq_variables(config['sub_day']) + if 'transition' in config: + Config.set_transition(config['transition']) - save_tf_dataset(config["prefix"], config["version"], config["options"], config["num_series"]) - generate_product_input_dataset(config["prefix"], config["version"]) + save_tf_dataset( + config['prefix'], config['version'], config['options'], config['num_series'] + ) + generate_product_input_dataset(config['prefix'], config['version']) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/src/synthetic_generation/series_config.py b/src/synthetic_generation/series_config.py index 09c0d06..33b2691 100644 --- a/src/synthetic_generation/series_config.py +++ b/src/synthetic_generation/series_config.py @@ -1,9 +1,10 @@ """ Module containing dataclasses for synthetic data generator """ -import numpy as np from dataclasses import dataclass +import numpy as np + @dataclass class ComponentScale: @@ -17,6 +18,7 @@ class ComponentScale: h: np.ndarray = None minute: np.ndarray = None + @dataclass class ComponentNoise: # shape parameter for the weibull distribution @@ -28,6 +30,7 @@ class ComponentNoise: # no noise can be represented by scale = 0 scale: float + @dataclass class SeriesConfig: scale: ComponentScale @@ -35,4 +38,4 @@ class SeriesConfig: noise_config: ComponentNoise def __str__(self): - return f"L{1000*self.scale.linear:+02.0f}E{10000*(self.scale.exp - 1):+02.0f}A{100*self.scale.a:02.0f}M{100*self.scale.m:02.0f}W{100*self.scale.w:02.0f}" + return f'L{1000*self.scale.linear:+02.0f}E{10000*(self.scale.exp - 1):+02.0f}A{100*self.scale.a:02.0f}M{100*self.scale.m:02.0f}W{100*self.scale.w:02.0f}' diff --git a/src/synthetic_generation/tf_generate_series.py b/src/synthetic_generation/tf_generate_series.py index b66e1aa..511847d 100644 --- a/src/synthetic_generation/tf_generate_series.py +++ b/src/synthetic_generation/tf_generate_series.py @@ -2,17 +2,17 @@ Module to convert process synthetic series using tensorflow """ +from datetime import date +from tempfile import NamedTemporaryFile + import fastavro -import tensorflow_io import numpy as np import pandas as pd import tensorflow as tf -from datetime import date -from tempfile import NamedTemporaryFile -from reainternal.cloud import CloudLocation +from constants import CONTEXT_LENGTH from generate_series import generate -from constants import * 
-from series_config import * +from reainternal.cloud import CloudLocation +from series_config import PRODUCT_SCHEMA, TF_SCHEMA def tf_generate_n( @@ -28,9 +28,9 @@ def tf_generate_n( for i in range(N): if i % 1000 == 0: - print(f"Completed: {i}") + print(f'Completed: {i}') - if i < N * options.get("linear_random_walk_frac", 0): + if i < N * options.get('linear_random_walk_frac', 0): cfg, sample = generate( size, freq_index=freq_index, @@ -43,7 +43,7 @@ def tf_generate_n( size, freq_index=freq_index, start=start, options=options ) # cfg is the name of the time series - # sample is a pandas dataframe where + # sample is a pandas dataframe where # the index is the datetime object # columns `series_value` and `noise` @@ -51,16 +51,16 @@ def tf_generate_n( yield tf.train.Example( features=tf.train.Features( feature={ - "id": tf.train.Feature(bytes_list=tf.train.BytesList(value=[id_])), - "ts": tf.train.Feature( + 'id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[id_])), + 'ts': tf.train.Feature( int64_list=tf.train.Int64List( value=sample.index.astype(np.int64) ) ), - "y": tf.train.Feature( + 'y': tf.train.Feature( float_list=tf.train.FloatList(value=sample.series_values.values) ), - "noise": tf.train.Feature( + 'noise': tf.train.Feature( float_list=tf.train.FloatList(value=sample.noise.values) ), } @@ -74,7 +74,7 @@ def save_tf_records(prefix: str, dest: str, it): """ with NamedTemporaryFile() as tfile: with tf.io.TFRecordWriter( - tfile.name, options=tf.io.TFRecordOptions(compression_type="GZIP") + tfile.name, options=tf.io.TFRecordOptions(compression_type='GZIP') ) as writer: for record in it: writer.write(record.SerializeToString()) @@ -87,27 +87,25 @@ def decode_fn(record_bytes): def load_tf_dataset(prefix: str, src: str): - return tf.data.TFRecordDataset(prefix + src, compression_type="GZIP").map( - decode_fn - ) + return tf.data.TFRecordDataset(prefix + src, compression_type='GZIP').map(decode_fn) def convert_tf_to_rows(records): for i, r in enumerate(records): if i % 1000 == 0: - print(f"Completed: {i}") - id_ = r["id"].decode() + print(f'Completed: {i}') + id_ = r['id'].decode() for ts, y, noise in zip( - (date.fromtimestamp(v / 1_000_000_000) for v in r["ts"]), - (float(v) for v in r["y"]), - (float(_noise) for _noise in r["noise"]) + (date.fromtimestamp(v / 1_000_000_000) for v in r['ts']), + (float(v) for v in r['y']), + (float(_noise) for _noise in r['noise']), ): - yield {"id": id_, "ts": ts, "y": y, "noise": noise} + yield {'id': id_, 'ts': ts, 'y': y, 'noise': noise} def generate_product_input(prefix: str, dest: str, it): """ Write generated dataset into avro files """ - with CloudLocation(prefix + dest).open(mode="wb") as file: - fastavro.writer(file, PRODUCT_SCHEMA, it, codec="deflate") + with CloudLocation(prefix + dest).open(mode='wb') as file: + fastavro.writer(file, PRODUCT_SCHEMA, it, codec='deflate') diff --git a/src/synthetic_generation/utils.py b/src/synthetic_generation/utils.py index 1cf00b2..d14fc27 100644 --- a/src/synthetic_generation/utils.py +++ b/src/synthetic_generation/utils.py @@ -66,5 +66,5 @@ def get_transition_coefficients(context_length): m = (a + b) / 2 k = 1 / (a - m) * np.log(f_a / (1 - f_a)) - coeff = 1 / (1 + np.exp(-k * (np.arange(1, context_length+1) - m))) + coeff = 1 / (1 + np.exp(-k * (np.arange(1, context_length + 1) - m))) return coeff diff --git a/src/training/config_variables.py b/src/training/config_variables.py index 0140cd2..36841e0 100644 --- a/src/training/config_variables.py +++ b/src/training/config_variables.py @@ -2,6 
+2,7 @@ Module containing configuration setting for the script """ + class Config: is_sub_day = False diff --git a/src/training/constants.py b/src/training/constants.py index e535604..6f7f730 100644 --- a/src/training/constants.py +++ b/src/training/constants.py @@ -21,14 +21,14 @@ CONTEXT_LENGTH = 500 TF_SCHEMA = { - "id": tf.io.FixedLenFeature([], dtype=tf.string), - "ts": tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.int64), - "y": tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32), - "noise": tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32) + 'id': tf.io.FixedLenFeature([], dtype=tf.string), + 'ts': tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.int64), + 'y': tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32), + 'noise': tf.io.FixedLenFeature([CONTEXT_LENGTH], dtype=tf.float32), } # constant to reference where the academic_comparison and metalearning folders are # will not be needed for training without validating on these datasets ACADEMIC_HOME = '/home/ubuntu/ForecastPFN/academic_comparison/' -METALEARNED_HOME = ACADEMIC_HOME + 'metalearned/' \ No newline at end of file +METALEARNED_HOME = ACADEMIC_HOME + 'metalearned/' diff --git a/src/training/create_train_test_df.py b/src/training/create_train_test_df.py index 52a53c0..79dac75 100644 --- a/src/training/create_train_test_df.py +++ b/src/training/create_train_test_df.py @@ -2,10 +2,16 @@ Module to create train and test dfs """ import tensorflow as tf -import tensorflow_io -from prepare_dataset import gen_random_single_point, gen_mean_to_random_date, \ - gen_std_to_random_date, filter_unusable_points, build_frames, gen_random_single_point_no_noise, \ - gen_mean_to_random_date_no_noise, gen_std_to_random_date_no_noise +from prepare_dataset import ( + build_frames, + filter_unusable_points, + gen_mean_to_random_date, + gen_mean_to_random_date_no_noise, + gen_random_single_point, + gen_random_single_point_no_noise, + gen_std_to_random_date, + gen_std_to_random_date_no_noise, +) def remove_noise(x, y): @@ -20,10 +26,12 @@ def remove_noise(x, y): 'ts': x['ts'], 'history': x['history'], 'target_ts': x['target_ts'], - 'task': x['task'] - }, y + 'task': x['task'], + }, + y, ) + def create_train_test_df(combined_ds, test_noise=False): """ Method to create a train/test split from the combined_ds @@ -36,20 +44,24 @@ def create_train_test_df(combined_ds, test_noise=False): task_map = { 'point': gen_random_single_point, 'mean': gen_mean_to_random_date, - 'stdev': gen_std_to_random_date + 'stdev': gen_std_to_random_date, } train_tasks_dfs = [ base_train_df.map(func, num_parallel_calls=tf.data.AUTOTUNE) for func in task_map.values() ] - train_df = tf.data.Dataset.choose_from_datasets( - train_tasks_dfs, tf.data.Dataset.range(len(train_tasks_dfs)).repeat() - ).unbatch().filter(filter_unusable_points) + train_df = ( + tf.data.Dataset.choose_from_datasets( + train_tasks_dfs, tf.data.Dataset.range(len(train_tasks_dfs)).repeat() + ) + .unbatch() + .filter(filter_unusable_points) + ) task_map_test = { 'point': gen_random_single_point_no_noise, 'mean': gen_mean_to_random_date_no_noise, - 'stdev': gen_std_to_random_date_no_noise + 'stdev': gen_std_to_random_date_no_noise, } if test_noise: @@ -63,9 +75,13 @@ def create_train_test_df(combined_ds, test_noise=False): for func in task_map_test.values() ] - test_df = tf.data.Dataset.choose_from_datasets( - test_tasks_dfs, tf.data.Dataset.range(len(test_tasks_dfs)).repeat() - ).unbatch().filter(filter_unusable_points) + test_df = ( + tf.data.Dataset.choose_from_datasets( + 
test_tasks_dfs, tf.data.Dataset.range(len(test_tasks_dfs)).repeat() + ) + .unbatch() + .filter(filter_unusable_points) + ) # remove noise and target_noise from train and test df as they are now useless # train_df = train_df.map(remove_noise) diff --git a/src/training/metalearned_validation.py b/src/training/metalearned_validation.py index 3c97e24..164027f 100644 --- a/src/training/metalearned_validation.py +++ b/src/training/metalearned_validation.py @@ -1,62 +1,69 @@ +import datetime import sys -ACADEMIC_HOME = '/home/ubuntu/notebooks/ForecastPFN/academic_comparison/' -METALEARNED_HOME = ACADEMIC_HOME + 'metalearned/' -sys.path.append(ACADEMIC_HOME) -sys.path.append(METALEARNED_HOME) -import datetime import numpy as np import pandas as pd import tensorflow as tf -from data_provider.UnivariateTimeseriesSampler_WithStamps import UnivariateTimeseriesSampler_WithStamps -from resources.tourism.dataset import TourismDataset, TourismMeta +from data_provider.UnivariateTimeseriesSampler_WithStamps import ( + UnivariateTimeseriesSampler_WithStamps, +) from resources.m3.dataset import M3Dataset, M3Meta +from resources.tourism.dataset import TourismDataset, TourismMeta + +ACADEMIC_HOME = '/home/ubuntu/notebooks/ForecastPFN/academic_comparison/' +METALEARNED_HOME = ACADEMIC_HOME + 'metalearned/' +sys.path.append(ACADEMIC_HOME) +sys.path.append(METALEARNED_HOME) + def _ForecastPFN_time_features(ts: np.ndarray): if type(ts[0]) == datetime.datetime: year = [x.year for x in ts] month = [x.month for x in ts] day = [x.day for x in ts] - day_of_week = [x.weekday()+1 for x in ts] + day_of_week = [x.weekday() + 1 for x in ts] day_of_year = [x.timetuple().tm_yday for x in ts] return np.stack([year, month, day, day_of_week, day_of_year], axis=-1) ts = pd.to_datetime(ts) - return np.stack([ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) - + return np.stack( + [ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) -def prepare_metalearned_test(metaleanredDataset, metalearnedMeta, sp, p_input_size, p_horizon) -> tf.data.Dataset: +def prepare_metalearned_test( + metaleanredDataset, metalearnedMeta, sp, p_input_size, p_horizon +) -> tf.data.Dataset: target_input, target_output = metaleanredDataset( - METALEARNED_HOME+metalearnedMeta.dataset_path).standard_split() + METALEARNED_HOME + metalearnedMeta.dataset_path + ).standard_split() in_bundle, out_bundle, sp = target_input, target_output, sp - in_bundle = in_bundle.filter( - lambda ts: ts.meta['seasonal_pattern'] == sp) - out_bundle = out_bundle.filter( - lambda ts: ts.meta['seasonal_pattern'] == sp) - + in_bundle = in_bundle.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) + out_bundle = out_bundle.filter(lambda ts: ts.meta['seasonal_pattern'] == sp) input_set = in_bundle.values() input_timestamps = in_bundle.time_stamps() - input_set = UnivariateTimeseriesSampler_WithStamps(timeseries=input_set, - time_stamps=input_timestamps, - insample_size=p_input_size, - outsample_size=0, - window_sampling_limit=1, - batch_size=1, - time_features=_ForecastPFN_time_features, - ) + input_set = UnivariateTimeseriesSampler_WithStamps( + timeseries=input_set, + time_stamps=input_timestamps, + insample_size=p_input_size, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + time_features=_ForecastPFN_time_features, + ) p_x, p_x_mask, p_x_timestamps = input_set.sequential_latest_insamples() output_set = out_bundle.values() output_timestamps = out_bundle.time_stamps() - output_set = 
UnivariateTimeseriesSampler_WithStamps(timeseries=output_set, - time_stamps=output_timestamps, - insample_size=p_horizon, - outsample_size=0, - window_sampling_limit=1, - batch_size=1, - time_features=_ForecastPFN_time_features, - ) + output_set = UnivariateTimeseriesSampler_WithStamps( + timeseries=output_set, + time_stamps=output_timestamps, + insample_size=p_horizon, + outsample_size=0, + window_sampling_limit=1, + batch_size=1, + time_features=_ForecastPFN_time_features, + ) p_y, p_y_mask, p_y_timestamps = output_set.sequential_latest_insamples() x, x_mark, y, y_mark = p_x, p_x_timestamps, p_y, p_y_timestamps @@ -67,9 +74,11 @@ def prepare_metalearned_test(metaleanredDataset, metalearnedMeta, sp, p_input_si task = [] y_out = [] for x, y, x_mark, y_mark in zip(p_x, p_y, p_x_timestamps, p_y_timestamps): - for yi, yi_mark in zip(y,y_mark): + for yi, yi_mark in zip(y, y_mark): if sum(yi_mark): - ts.append(np.append(np.zeros((100 - x_mark.shape[0],5)), x_mark, axis=0)) + ts.append( + np.append(np.zeros((100 - x_mark.shape[0], 5)), x_mark, axis=0) + ) history.append(np.append(np.zeros(100 - x_mark.shape[0]), x)) target_ts.append(np.array([yi_mark])) task.append(1) @@ -77,34 +86,30 @@ def prepare_metalearned_test(metaleanredDataset, metalearnedMeta, sp, p_input_si ts = tf.convert_to_tensor(np.array(ts), dtype=np.int64, name='ts') history = tf.convert_to_tensor(np.array(history), dtype=np.float32, name='history') - target_ts = tf.convert_to_tensor(np.array(target_ts), dtype=np.int64, name='target_ts') + target_ts = tf.convert_to_tensor( + np.array(target_ts), dtype=np.int64, name='target_ts' + ) task = tf.convert_to_tensor(np.array(task), dtype=np.int64, name='task') y = tf.convert_to_tensor(np.array(y_out), dtype=np.float32) - ds = { - 'ts': ts, - 'history': history, - 'target_ts': target_ts, - 'task': task - }, y + ds = {'ts': ts, 'history': history, 'target_ts': target_ts, 'task': task}, y return tf.data.Dataset.from_tensor_slices(ds) # Tourism tourism_yearly_test_df = prepare_metalearned_test( - TourismDataset, TourismMeta, 'Yearly', 8, 4) + TourismDataset, TourismMeta, 'Yearly', 8, 4 +) tourism_quarterly_test_df = prepare_metalearned_test( - TourismDataset, TourismMeta, 'Quarterly', 16, 8) + TourismDataset, TourismMeta, 'Quarterly', 16, 8 +) tourism_monthly_test_df = prepare_metalearned_test( - TourismDataset, TourismMeta, 'Monthly', 48, 24) + TourismDataset, TourismMeta, 'Monthly', 48, 24 +) # M3 -m3_yearly_test_df = prepare_metalearned_test( - M3Dataset, M3Meta, 'M3Year', 12, 6) -m3_quarterly_test_df = prepare_metalearned_test( - M3Dataset, M3Meta, 'M3Quart', 16, 8) -m3_monthly_test_df = prepare_metalearned_test( - M3Dataset, M3Meta, 'M3Month', 36, 18) -m3_others_test_df = prepare_metalearned_test( - M3Dataset, M3Meta, 'M3Other', 16, 8) +m3_yearly_test_df = prepare_metalearned_test(M3Dataset, M3Meta, 'M3Year', 12, 6) +m3_quarterly_test_df = prepare_metalearned_test(M3Dataset, M3Meta, 'M3Quart', 16, 8) +m3_monthly_test_df = prepare_metalearned_test(M3Dataset, M3Meta, 'M3Month', 36, 18) +m3_others_test_df = prepare_metalearned_test(M3Dataset, M3Meta, 'M3Other', 16, 8) diff --git a/src/training/models.py b/src/training/models.py index d45bc3e..4727df1 100644 --- a/src/training/models.py +++ b/src/training/models.py @@ -1,10 +1,11 @@ from typing import Dict + import tensorflow as tf -import tensorflow_io -from tensorflow.keras import layers, Model, Input -from constants import * +from constants import DAY, DOW, MONTH, NUM_TASKS, YEAR from prepare_dataset import position_encoding -from 
scalers import robust_scaler, max_scaling +from scalers import max_scaling, robust_scaler +from tensorflow.keras import layers + class CustomScaling(layers.Layer): def __init__(self, name): @@ -14,10 +15,10 @@ def __init__(self, name): elif name == 'robust': self.scaler = robust_scaler - def call(self, history_channels, epsilon): return self.scaler(history_channels, epsilon) + class PositionExpansion(layers.Layer): def __init__(self, periods: int, freqs: int, **kwargs): super().__init__(**kwargs) @@ -32,14 +33,23 @@ def call(self, tc): out_shape = tf.shape(tc) return tf.reshape(embedded, [out_shape[0], out_shape[1], self.channels]) + class TransformerBlock(layers.Layer): def __init__(self, key_dim, heads=4, value_dim=None, residual=False, **kwargs): super().__init__(**kwargs) self.attention = layers.MultiHeadAttention( - num_heads=heads, key_dim=key_dim, value_dim=value_dim, name=f'{self.name}_attention') + num_heads=heads, + key_dim=key_dim, + value_dim=value_dim, + name=f'{self.name}_attention', + ) value_dim = value_dim or key_dim - self.ff1 = layers.Dense(4 * heads * value_dim, activation='gelu', name=f'{self.name}_ff1') - self.ff2 = layers.Dense(heads * value_dim, activation='gelu', name=f'{self.name}_ff2') + self.ff1 = layers.Dense( + 4 * heads * value_dim, activation='gelu', name=f'{self.name}_ff1' + ) + self.ff2 = layers.Dense( + heads * value_dim, activation='gelu', name=f'{self.name}_ff2' + ) self.residual = residual if self.residual: self.attn_norm = layers.LayerNormalization(name=f'{self.name}_attn_norm') @@ -52,8 +62,9 @@ def call(self, x, mask): a = self.attention(x, x, attention_mask=mask) a = self.ff1(a) return self.ff2(a) - #na = self.attn_norm(a + x) - #return self.ff_norm(self.ff(na) + na) + # na = self.attn_norm(a + x) + # return self.ff_norm(self.ff(na) + na) + class BaseModel(tf.keras.Model): def __init__(self, epsilon=1e-4, scaler='robust', **kwargs): @@ -64,9 +75,16 @@ def __init__(self, epsilon=1e-4, scaler='robust', **kwargs): self.pos_day = PositionExpansion(31, 6) self.pos_dow = PositionExpansion(7, 4) self.robust_scaler = CustomScaling(scaler) - self.embed_size = sum(emb.channels for emb in (self.pos_year, self.pos_month, self.pos_day, self.pos_dow)) - self.expand_target_nopos = layers.Dense(self.embed_size, name='NoPosEnc', activation='relu') - self.expand_target_forpos = layers.Dense(self.embed_size, name='ForPosEnc', activation='relu') + self.embed_size = sum( + emb.channels + for emb in (self.pos_year, self.pos_month, self.pos_day, self.pos_dow) + ) + self.expand_target_nopos = layers.Dense( + self.embed_size, name='NoPosEnc', activation='relu' + ) + self.expand_target_forpos = layers.Dense( + self.embed_size, name='ForPosEnc', activation='relu' + ) self.concat_pos = layers.Concatenate(axis=-1, name='ConcatPos') self.concat_embed = layers.Concatenate(axis=-1, name='ConcatEmbed') # Will be an embedding when we have different tasks. 
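For context on the `PositionExpansion` layers configured in `BaseModel.__init__` above: `models.py` imports `position_encoding` from `prepare_dataset.py` (its definition appears later in this diff), which builds a fixed sine/cosine lookup table with `2 * freqs` channels per position. How exactly `PositionExpansion.__init__` consumes that table is not visible in these hunks, so the snippet below is only an illustrative sketch that reproduces the helper in plain NumPy and shows the table shape for the day-of-month embedding `PositionExpansion(31, 6)`.

```python
import numpy as np


def position_encoding(periods: int, freqs: int) -> np.ndarray:
    # Same formula as the helper in src/training/prepare_dataset.py:
    # row i holds sin/cos features at frequencies 2**j for j < freqs.
    return np.hstack(
        [
            np.fromfunction(
                lambda i, j: np.sin(np.pi / periods * (2**j) * (i - 1)),
                (periods + 1, freqs),
            ),
            np.fromfunction(
                lambda i, j: np.cos(np.pi / periods * (2**j) * (i - 1)),
                (periods + 1, freqs),
            ),
        ]
    )


# Day-of-month embedding as configured in BaseModel: 31 periods, 6 frequencies.
day_table = position_encoding(31, 6)
print(day_table.shape)  # (32, 12) -> 12 channels per day value
```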
@@ -83,32 +101,40 @@ def call(self, x: Dict[str, tf.Tensor]): # Build position encodings year = self.tc(ts, YEAR) delta_year = tf.clip_by_value(year[:, -1:] - year, 0, self.pos_year.periods) - pos_embedding = self.concat_pos([ - self.pos_year(delta_year), - self.pos_month(self.tc(ts, MONTH)), - self.pos_day(self.tc(ts, DAY)), - self.pos_dow(self.tc(ts, DOW)), - ]) + pos_embedding = self.concat_pos( + [ + self.pos_year(delta_year), + self.pos_month(self.tc(ts, MONTH)), + self.pos_day(self.tc(ts, DAY)), + self.pos_dow(self.tc(ts, DOW)), + ] + ) mask = year > 0 # Embed history history_channels = tf.expand_dims(history, axis=-1) -# scale = self.max_scaling(history_channels) + self.epsilon -# scaled = history_channels / scale + # scale = self.max_scaling(history_channels) + self.epsilon + # scaled = history_channels / scale scale, scaled = self.robust_scaler(history_channels, self.epsilon) embed_nopos = self.expand_target_nopos(scaled) embed_pos = self.expand_target_forpos(scaled) + pos_embedding embedded = self.concat_embed([embed_nopos, embed_pos]) - # Embed target - target_year = tf.clip_by_value(year[:, -1:] - self.tc(target_ts, YEAR), 0, self.pos_year.periods) - target_pos_embed = tf.squeeze(self.concat_pos([ - self.pos_year(target_year), - self.pos_month(self.tc(target_ts, MONTH)), - self.pos_day(self.tc(target_ts, DAY)), - self.pos_dow(self.tc(target_ts, DOW)), - ]), axis=1) + target_year = tf.clip_by_value( + year[:, -1:] - self.tc(target_ts, YEAR), 0, self.pos_year.periods + ) + target_pos_embed = tf.squeeze( + self.concat_pos( + [ + self.pos_year(target_year), + self.pos_month(self.tc(target_ts, MONTH)), + self.pos_day(self.tc(target_ts, DAY)), + self.pos_dow(self.tc(target_ts, DOW)), + ] + ), + axis=1, + ) task_embed = self.target_marker(task) target = self.concat_embed([task_embed, task_embed + target_pos_embed]) @@ -120,11 +146,21 @@ def call(self, x: Dict[str, tf.Tensor]): def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): # return super().compute_loss(x, y, y_pred['result'], sample_weight) scale = y_pred['scale'] - return super().compute_loss(x, y / scale, y_pred['result'] / scale, sample_weight) - - def forecast(self, ts: tf.Tensor, mask: tf.Tensor, scale: tf.Tensor, embedded: tf.Tensor, target: tf.Tensor): + return super().compute_loss( + x, y / scale, y_pred['result'] / scale, sample_weight + ) + + def forecast( + self, + ts: tf.Tensor, + mask: tf.Tensor, + scale: tf.Tensor, + embedded: tf.Tensor, + target: tf.Tensor, + ): return NotImplemented + class LSTMModel(BaseModel): def __init__(self, unit=30, **kwargs): super().__init__(**kwargs) @@ -133,7 +169,14 @@ def __init__(self, unit=30, **kwargs): self.combine_target = layers.Concatenate(name='AppendTarget', axis=-1) self.cont_output = layers.Dense(1, name='Output', activation='relu') - def forecast(self, ts: tf.Tensor, mask: tf.Tensor, scale: tf.Tensor, embedded: tf.Tensor, target: tf.Tensor): + def forecast( + self, + ts: tf.Tensor, + mask: tf.Tensor, + scale: tf.Tensor, + embedded: tf.Tensor, + target: tf.Tensor, + ): lstm_out = self.lstm(embedded, mask=mask) with_target = self.combine_target([lstm_out, target]) return self.cont_output(with_target) @@ -160,13 +203,17 @@ def __init__(self, tx_layers=2, **kwargs): # self.encoder2 = TransformerBlock(key_dim=(self.embed_size * 2)) # self.final_output = layers.Dense(1, name='FinalOutput', activation='relu') - def forecast(self, ts: tf.Tensor, mask: tf.Tensor, scale: tf.Tensor, embedded: tf.Tensor, target: tf.Tensor): + def forecast( + self, + ts: tf.Tensor, 
+ mask: tf.Tensor, + scale: tf.Tensor, + embedded: tf.Tensor, + target: tf.Tensor, + ): mask = tf.pad(mask, [[0, 0], [0, 1]], constant_values=True) mask = tf.math.logical_and(tf.expand_dims(mask, 1), tf.expand_dims(mask, -1)) - x = self.concat_target([ - embedded, - tf.expand_dims(target, axis=1) - ]) + x = self.concat_target([embedded, tf.expand_dims(target, axis=1)]) x = self.encoder1(x, mask) x = self.encoder2(x, mask) # x = self.encoder3(x, mask) diff --git a/src/training/noise_ablation.py b/src/training/noise_ablation.py index b8692e3..eeb82fd 100644 --- a/src/training/noise_ablation.py +++ b/src/training/noise_ablation.py @@ -2,34 +2,43 @@ Module to train the model """ -from keras import backend -import yaml -import datetime import argparse -import tensorflow as tf -from tensorflow import keras +import datetime + import numpy as np -import tensorflow_io -from utils import load_tf_dataset -from models import TransformerModel -from create_train_test_df import create_train_test_df +import tensorflow as tf +import yaml from config_variables import Config +from create_train_test_df import create_train_test_df +from keras import backend +from metalearned_validation import ( + m3_monthly_test_df, + m3_others_test_df, + m3_quarterly_test_df, + m3_yearly_test_df, + tourism_monthly_test_df, + tourism_quarterly_test_df, + tourism_yearly_test_df, +) +from models import TransformerModel +from train import AdditionalValidationSets +from utils import load_tf_dataset def get_combined_ds(config): - version = config["version"] + version = config['version'] # all the datasets we have. Ideally we use only 3 of these for trainig # adjust the values in this list accordingly datasets = [ # load_tf_dataset(config["prefix"] + f"{version}/minute.tfrecords"), # load_tf_dataset(config["prefix"] + f"{version}/hourly.tfrecords"), - load_tf_dataset(config["prefix"] + f"{version}/daily.tfrecords"), - load_tf_dataset(config["prefix"] + f"{version}/weekly.tfrecords"), - load_tf_dataset(config["prefix"] + f"{version}/monthly.tfrecords"), + load_tf_dataset(config['prefix'] + f'{version}/daily.tfrecords'), + load_tf_dataset(config['prefix'] + f'{version}/weekly.tfrecords'), + load_tf_dataset(config['prefix'] + f'{version}/monthly.tfrecords'), ] - # # ucomment these lines to use the real world datasets in training + # # uncomment these lines to use the real world datasets in training # tourism_ds = load_tf_dataset(config['prefix'] + 'tourism.tfrecords') # wikiweb_ds = load_tf_dataset(config['prefix'] + 'wikiweb.tfrecords') @@ -45,24 +54,21 @@ def main(): np.random.seed(42) parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", required=True, help="Path to config file") + parser.add_argument('-c', '--config', required=True, help='Path to config file') args = parser.parse_args() with open(args.config) as config_file: config = yaml.load(config_file, yaml.loader.SafeLoader) - Config.set_sub_day(config["sub_day"]) + Config.set_sub_day(config['sub_day']) combined_ds = get_combined_ds(config) - train_df, test_df = create_train_test_df(combined_ds, config["test_noise"]) - - + train_df, test_df = create_train_test_df(combined_ds, config['test_noise']) model = TransformerModel(scaler=config['scaler']) - def smape(y_true, y_pred): - """ Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. + """Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. `loss = 200 * mean(abs((y_true - y_pred) / (y_true + y_pred), axis=-1)` Args: y_true: Ground truth values. 
shape = `[batch_size, d0, .. dN]`. @@ -84,55 +90,58 @@ def smape(y_true, y_pred): # need these two lines, else fit gives error batch_X, batch_y = next(iter(train_df.batch(2).take(1))) - pred_y = model(batch_X) - + model(batch_X) model.compile( optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss=tf.keras.losses.MeanSquaredError(), - metrics=[tf.keras.metrics.MeanAbsolutePercentageError(name='mape'), - tf.keras.metrics.MeanSquaredError(name='mse'), - smape, - ] + metrics=[ + tf.keras.metrics.MeanAbsolutePercentageError(name='mape'), + tf.keras.metrics.MeanSquaredError(name='mse'), + smape, + ], ) + fit_id = '.'.join( + [config['model_save_name'], datetime.datetime.now().strftime('%Y%m%d-%H%M%S')] + ) - fit_id = '.'.join([config["model_save_name"], - datetime.datetime.now().strftime("%Y%m%d-%H%M%S")]) - - logdir = f"/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}" + logdir = f'/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}' tbCallback = tf.keras.callbacks.TensorBoard(logdir) - tbCallback._val_dir = logdir+'/validation' + tbCallback._val_dir = logdir + '/validation' callbacks = tf.keras.callbacks.CallbackList( callbacks=[ tf.keras.callbacks.ModelCheckpoint( - config["prefix"] + f"models/{fit_id}/ckpts", monitor="loss", verbose=1 + config['prefix'] + f'models/{fit_id}/ckpts', monitor='loss', verbose=1 ), tf.keras.callbacks.TensorBoard( - f"/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}" + f'/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}' ), # tf.keras.callbacks.LearningRateScheduler( # lambda epoch, lr: min(0.001, lr * (epoch + 1)) # ) - AdditionalValidationSets([(tourism_yearly_test_df, 'tourism_yearly'), - (tourism_quarterly_test_df,'tourism_quarterly'), - (tourism_monthly_test_df,'tourism_monthly'), - (m3_yearly_test_df, 'm3_yearly'), - (m3_quarterly_test_df, 'm3_quarterly'), - (m3_monthly_test_df, 'm3_monthly'), - (m3_others_test_df, 'm3_others'), - ], - tbCallback) + AdditionalValidationSets( + [ + (tourism_yearly_test_df, 'tourism_yearly'), + (tourism_quarterly_test_df, 'tourism_quarterly'), + (tourism_monthly_test_df, 'tourism_monthly'), + (m3_yearly_test_df, 'm3_yearly'), + (m3_quarterly_test_df, 'm3_quarterly'), + (m3_monthly_test_df, 'm3_monthly'), + (m3_others_test_df, 'm3_others'), + ], + tbCallback, + ), ], add_history=True, add_progbar=True, model=model, ) - model.fit( - train_df.shuffle(5_000_000, reshuffle_each_iteration=True).batch( - 1024).prefetch(tf.data.AUTOTUNE), + train_df.shuffle(5_000_000, reshuffle_each_iteration=True) + .batch(1024) + .prefetch(tf.data.AUTOTUNE), # train_df.take(1000_000).cache().shuffle(100_000).batch(1024).prefetch(tf.data.AUTOTUNE), validation_data=test_df.batch(1024, drop_remainder=False).cache(), epochs=700, @@ -140,8 +149,8 @@ def smape(y_true, y_pred): callbacks=callbacks, ) - model.save(config["prefix"] + 'models/'+ config["model_save_name"]) + model.save(config['prefix'] + 'models/' + config['model_save_name']) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/src/training/prepare_dataset.py b/src/training/prepare_dataset.py index 4d5aeba..d0fc867 100644 --- a/src/training/prepare_dataset.py +++ b/src/training/prepare_dataset.py @@ -4,13 +4,21 @@ """ from typing import Dict -import pandas as pd + import numpy as np +import pandas as pd import tensorflow as tf -import tensorflow_io from config_variables import Config -from constants import PADDING, HISTORY_LEN, TARGET_LEN, TRIM_LEN, TARGET_INDEX, \ - SINGLE_POINT, MEAN_TO_DATE, STDEV_TO_DATE +from constants import ( + HISTORY_LEN, 
+ MEAN_TO_DATE, + PADDING, + SINGLE_POINT, + STDEV_TO_DATE, + TARGET_INDEX, + TARGET_LEN, + TRIM_LEN, +) def compute_time_features(ts: np.ndarray): @@ -21,21 +29,22 @@ def compute_time_features(ts: np.ndarray): """ ts = pd.to_datetime(ts) if Config.is_sub_day: - return np.stack([ts.minute, ts.hour, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) - return np.stack([ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1) - + return np.stack( + [ts.minute, ts.hour, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) + return np.stack( + [ts.year, ts.month, ts.day, ts.day_of_week + 1, ts.day_of_year], axis=-1 + ) @tf.function def build_frames(r: Dict[str, tf.Tensor]): - raw_date_info = tf.numpy_function(compute_time_features, inp=[r['ts']], Tout=tf.int64) + raw_date_info = tf.numpy_function( + compute_time_features, inp=[r['ts']], Tout=tf.int64 + ) date_info = tf.signal.frame( - tf.pad(raw_date_info, [[PADDING, 0], [0, 0]]), - HISTORY_LEN, - 1, - axis=0 - ) - + tf.pad(raw_date_info, [[PADDING, 0], [0, 0]]), HISTORY_LEN, 1, axis=0 + ) history = tf.signal.frame(tf.pad(r['y'], [[PADDING, 0]]), HISTORY_LEN, 1, axis=-1) noise = tf.signal.frame(tf.pad(r['noise'], [[PADDING, 0]]), HISTORY_LEN, 1, axis=-1) @@ -45,180 +54,220 @@ def build_frames(r: Dict[str, tf.Tensor]): target_noise = tf.signal.frame(r['noise'], TARGET_LEN, 1, axis=-1) start_index = target_values.shape[0] - TRIM_LEN - batch_size = start_index - TARGET_LEN + start_index - TARGET_LEN return ( - date_info[-start_index:-TARGET_LEN], - history[-start_index:-TARGET_LEN], - noise[-start_index:-TARGET_LEN], - target_dates[TARGET_INDEX:], - target_values[TARGET_INDEX:], - target_noise[TARGET_INDEX:] - ) + date_info[-start_index:-TARGET_LEN], + history[-start_index:-TARGET_LEN], + noise[-start_index:-TARGET_LEN], + target_dates[TARGET_INDEX:], + target_values[TARGET_INDEX:], + target_noise[TARGET_INDEX:], + ) @tf.function def gen_random_single_point( - date_info: tf.Tensor, - history: tf.Tensor, - noise: tf.Tensor, - target_dates: tf.Tensor, - target_values: tf.Tensor, - target_noise: tf.Tensor - ): - - + date_info: tf.Tensor, + history: tf.Tensor, + noise: tf.Tensor, + target_dates: tf.Tensor, + target_values: tf.Tensor, + target_noise: tf.Tensor, +): # To limit to a single date batch_size = tf.shape(target_dates)[0] - targets = tf.random.uniform(shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32) + targets = tf.random.uniform( + shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32 + ) target_date = tf.gather(target_dates, targets, axis=1, batch_dims=1) target_value = tf.gather(target_values, targets, axis=1, batch_dims=1) return dict( ts=date_info, - history=history*noise, + history=history * noise, noise=noise, target_ts=target_date, - task=tf.fill([batch_size,], SINGLE_POINT), - target_noise=target_noise + task=tf.fill( + [ + batch_size, + ], + SINGLE_POINT, + ), + target_noise=target_noise, ), target_value @tf.function def gen_mean_to_random_date( - date_info: tf.Tensor, - history: tf.Tensor, - noise: tf.Tensor, - target_dates: tf.Tensor, - target_values: tf.Tensor, - target_noise: tf.Tensor - ): + date_info: tf.Tensor, + history: tf.Tensor, + noise: tf.Tensor, + target_dates: tf.Tensor, + target_values: tf.Tensor, + target_noise: tf.Tensor, +): # To limit to a single date batch_size = tf.shape(target_dates)[0] - targets = tf.random.uniform(shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32) + targets = tf.random.uniform( + shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32 + ) target_date 
= tf.gather(target_dates, targets, axis=1, batch_dims=1) target_value = tf.math.reduce_mean( - tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), - keepdims=True, - axis=-1 - ) + tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), + keepdims=True, + axis=-1, + ) return dict( ts=date_info, - history=history*noise*.75, + history=history * noise * 0.75, noise=noise, target_ts=target_date, - task=tf.fill([batch_size,], MEAN_TO_DATE), - target_noise=target_noise + task=tf.fill( + [ + batch_size, + ], + MEAN_TO_DATE, + ), + target_noise=target_noise, ), target_value @tf.function def gen_std_to_random_date( - date_info: tf.Tensor, - history: tf.Tensor, - noise: tf.Tensor, - target_dates: tf.Tensor, - target_values: tf.Tensor, - target_noise: tf.Tensor - ): + date_info: tf.Tensor, + history: tf.Tensor, + noise: tf.Tensor, + target_dates: tf.Tensor, + target_values: tf.Tensor, + target_noise: tf.Tensor, +): # To limit to a single date batch_size = tf.shape(target_dates)[0] - targets = tf.random.uniform(shape=[batch_size, 1], minval=(TARGET_LEN // 2), maxval=TARGET_LEN, dtype=tf.int32) + targets = tf.random.uniform( + shape=[batch_size, 1], + minval=(TARGET_LEN // 2), + maxval=TARGET_LEN, + dtype=tf.int32, + ) target_date = tf.gather(target_dates, targets, axis=1, batch_dims=1) target_value = tf.math.reduce_std( - tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), - keepdims=True, - axis=-1 - ) + tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), + keepdims=True, + axis=-1, + ) target_noise_std = tf.math.reduce_std( - tf.RaggedTensor.from_tensor(target_noise, lengths=(targets[:, 0] + 1)), - keepdims=True, - axis=-1 - ) + tf.RaggedTensor.from_tensor(target_noise, lengths=(targets[:, 0] + 1)), + keepdims=True, + axis=-1, + ) target_value = tf.math.sqrt(target_value**2 + target_noise_std**2) return dict( ts=date_info, - history=history*noise, + history=history * noise, noise=noise, target_ts=target_date, - task=tf.fill([batch_size,], STDEV_TO_DATE), - target_noise=target_noise + task=tf.fill( + [ + batch_size, + ], + STDEV_TO_DATE, + ), + target_noise=target_noise, ), target_value + @tf.function def gen_random_single_point_no_noise( - date_info: tf.Tensor, - history: tf.Tensor, - noise: tf.Tensor, - target_dates: tf.Tensor, - target_values: tf.Tensor, - target_noise: tf.Tensor - ): - - + date_info: tf.Tensor, + history: tf.Tensor, + noise: tf.Tensor, + target_dates: tf.Tensor, + target_values: tf.Tensor, + target_noise: tf.Tensor, +): # To limit to a single date batch_size = tf.shape(target_dates)[0] - targets = tf.random.uniform(shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32) + targets = tf.random.uniform( + shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32 + ) target_date = tf.gather(target_dates, targets, axis=1, batch_dims=1) target_value = tf.gather(target_values, targets, axis=1, batch_dims=1) return dict( ts=date_info, history=history, target_ts=target_date, - task=tf.fill([batch_size,], SINGLE_POINT), + task=tf.fill( + [ + batch_size, + ], + SINGLE_POINT, + ), ), target_value @tf.function def gen_mean_to_random_date_no_noise( - date_info: tf.Tensor, - history: tf.Tensor, - noise: tf.Tensor, - target_dates: tf.Tensor, - target_values: tf.Tensor, - target_noise: tf.Tensor - ): + date_info: tf.Tensor, + history: tf.Tensor, + noise: tf.Tensor, + target_dates: tf.Tensor, + target_values: tf.Tensor, + target_noise: tf.Tensor, +): # To limit to a single date batch_size = tf.shape(target_dates)[0] - 
targets = tf.random.uniform(shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32) + targets = tf.random.uniform( + shape=[batch_size, 1], maxval=TARGET_LEN, dtype=tf.int32 + ) target_date = tf.gather(target_dates, targets, axis=1, batch_dims=1) target_value = tf.math.reduce_mean( - tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), - keepdims=True, - axis=-1 - ) + tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), + keepdims=True, + axis=-1, + ) return dict( ts=date_info, history=history, target_ts=target_date, - task=tf.fill([batch_size,], MEAN_TO_DATE), + task=tf.fill( + [ + batch_size, + ], + MEAN_TO_DATE, + ), ), target_value @tf.function def gen_std_to_random_date_no_noise( - date_info: tf.Tensor, - history: tf.Tensor, - noise: tf.Tensor, - target_dates: tf.Tensor, - target_values: tf.Tensor, - target_noise: tf.Tensor - ): + date_info: tf.Tensor, + history: tf.Tensor, + noise: tf.Tensor, + target_dates: tf.Tensor, + target_values: tf.Tensor, + target_noise: tf.Tensor, +): # To limit to a single date batch_size = tf.shape(target_dates)[0] - targets = tf.random.uniform(shape=[batch_size, 1], minval=(TARGET_LEN // 2), maxval=TARGET_LEN, dtype=tf.int32) + targets = tf.random.uniform( + shape=[batch_size, 1], + minval=(TARGET_LEN // 2), + maxval=TARGET_LEN, + dtype=tf.int32, + ) target_date = tf.gather(target_dates, targets, axis=1, batch_dims=1) target_value = tf.math.reduce_std( - tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), - keepdims=True, - axis=-1 - ) + tf.RaggedTensor.from_tensor(target_values, lengths=(targets[:, 0] + 1)), + keepdims=True, + axis=-1, + ) target_noise_std = tf.math.reduce_std( - tf.RaggedTensor.from_tensor(target_noise, lengths=(targets[:, 0] + 1)), - keepdims=True, - axis=-1 - ) + tf.RaggedTensor.from_tensor(target_noise, lengths=(targets[:, 0] + 1)), + keepdims=True, + axis=-1, + ) target_value = tf.math.sqrt(target_value**2 + target_noise_std**2) @@ -226,9 +275,15 @@ def gen_std_to_random_date_no_noise( ts=date_info, history=history, target_ts=target_date, - task=tf.fill([batch_size,], STDEV_TO_DATE), + task=tf.fill( + [ + batch_size, + ], + STDEV_TO_DATE, + ), ), target_value + @tf.function def filter_unusable_points(X: Dict[str, tf.Tensor], y: tf.Tensor): """ @@ -236,8 +291,17 @@ def filter_unusable_points(X: Dict[str, tf.Tensor], y: tf.Tensor): """ return tf.logical_and(tf.reduce_max(X['history']) > 0.1, tf.math.is_finite(y))[0] + def position_encoding(periods: int, freqs: int): - return np.hstack([ - np.fromfunction(lambda i, j: np.sin(np.pi / periods * (2**j) * (i-1)), (periods + 1, freqs)), - np.fromfunction(lambda i, j: np.cos(np.pi / periods * (2**j) * (i-1)), (periods + 1, freqs)) - ]) \ No newline at end of file + return np.hstack( + [ + np.fromfunction( + lambda i, j: np.sin(np.pi / periods * (2**j) * (i - 1)), + (periods + 1, freqs), + ), + np.fromfunction( + lambda i, j: np.cos(np.pi / periods * (2**j) * (i - 1)), + (periods + 1, freqs), + ), + ] + ) diff --git a/src/training/scalers.py b/src/training/scalers.py index 5eed7f0..ba5c510 100644 --- a/src/training/scalers.py +++ b/src/training/scalers.py @@ -2,9 +2,8 @@ Module containing different scaler functions """ import tensorflow as tf -import tensorflow_io from tensorflow.keras import layers -import numpy as np + def robust_scaler(inputs, epsilon): # inputs.shape = (batch_size, history_len, 1) @@ -29,14 +28,10 @@ def robust_scaler(inputs, epsilon): # calculate mean and std of clipped data clipped_mean = 
tf.math.reduce_mean( - clipped_and_masked, - axis=1, - keepdims=True + clipped_and_masked, axis=1, keepdims=True ).to_tensor() clipped_std = tf.math.reduce_std( - clipped_and_masked, - axis=1, - keepdims=True + clipped_and_masked, axis=1, keepdims=True ).to_tensor() # scale is of shape (batch_size,1,1) @@ -51,10 +46,8 @@ def robust_scaler(inputs, epsilon): def max_scaling(inputs, epsilon): - scaler = layers.GlobalMaxPooling1D(name='MaxScaling', keepdims=1) scale = scaler(inputs) + epsilon output = inputs / scale return scale, output - diff --git a/src/training/train.py b/src/training/train.py index 3271f6e..eb46e7b 100644 --- a/src/training/train.py +++ b/src/training/train.py @@ -2,30 +2,28 @@ Module to train the model """ -from keras import backend -import yaml -import datetime import argparse -import tensorflow as tf -from tensorflow import keras +import datetime + import numpy as np -import tensorflow_io -from utils import load_tf_dataset -from models import TransformerModel -from create_train_test_df import create_train_test_df +import tensorflow as tf +import yaml from config_variables import Config -from metalearned_validation import prepare_metalearned_test +from create_train_test_df import create_train_test_df +from keras import backend +from models import TransformerModel +from utils import load_tf_dataset def get_combined_ds(config): - version = config["version"] + version = config['version'] # all the datasets we have. Ideally we use only 3 of these for trainig # adjust the values in this list accordingly datasets = [ - load_tf_dataset(config["prefix"] + f"{version}/daily.tfrecords"), - load_tf_dataset(config["prefix"] + f"{version}/weekly.tfrecords"), - load_tf_dataset(config["prefix"] + f"{version}/monthly.tfrecords"), + load_tf_dataset(config['prefix'] + f'{version}/daily.tfrecords'), + load_tf_dataset(config['prefix'] + f'{version}/weekly.tfrecords'), + load_tf_dataset(config['prefix'] + f'{version}/monthly.tfrecords'), ] combined_ds = tf.data.Dataset.choose_from_datasets( @@ -40,21 +38,21 @@ def main(): np.random.seed(42) parser = argparse.ArgumentParser() - parser.add_argument("-c", "--config", required=True, help="Path to config file") + parser.add_argument('-c', '--config', required=True, help='Path to config file') args = parser.parse_args() with open(args.config) as config_file: config = yaml.load(config_file, yaml.loader.SafeLoader) - Config.set_sub_day(config["sub_day"]) + Config.set_sub_day(config['sub_day']) combined_ds = get_combined_ds(config) - train_df, test_df = create_train_test_df(combined_ds, config["test_noise"]) + train_df, test_df = create_train_test_df(combined_ds, config['test_noise']) model = TransformerModel(scaler=config['scaler']) def smape(y_true, y_pred): - """ Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. + """Calculate Armstrong's original definition of sMAPE between `y_true` & `y_pred`. `loss = 200 * mean(abs((y_true - y_pred) / (y_true + y_pred), axis=-1)` Args: y_true: Ground truth values. shape = `[batch_size, d0, .. dN]`. 
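The `smape` metric compiled into the model in `train.py` (and `noise_ablation.py`) appears in these hunks only through its docstring, which quotes Armstrong's original definition, `200 * mean(abs(y_true - y_pred) / (y_true + y_pred), axis=-1)`. The repository's actual implementation is not visible here, so the following is only a minimal TensorFlow sketch of that quoted formula for reference.

```python
import tensorflow as tf


def smape(y_true: tf.Tensor, y_pred: tf.Tensor) -> tf.Tensor:
    # Armstrong's sMAPE as quoted in the docstring:
    # 200 * mean(|y_true - y_pred| / (y_true + y_pred), axis=-1)
    ratio = tf.abs(y_true - y_pred) / (y_true + y_pred)
    return 200.0 * tf.reduce_mean(ratio, axis=-1)


# Example: a constant 10% over-forecast gives roughly 9.52.
y_true = tf.constant([[100.0, 200.0, 300.0]])
y_pred = y_true * 1.1
print(smape(y_true, y_pred).numpy())  # ~[9.52]
```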
@@ -72,20 +70,19 @@ def smape(y_true, y_pred): # need these two lines, else fit gives error batch_X, batch_y = next(iter(train_df.batch(2).take(1))) - pred_y = model(batch_X) - + model(batch_X) model.compile( optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001), loss=tf.keras.losses.MeanSquaredError(), - metrics=[tf.keras.metrics.MeanAbsolutePercentageError(name='mape'), - tf.keras.metrics.MeanSquaredError(name='mse'), - smape, - ] + metrics=[ + tf.keras.metrics.MeanAbsolutePercentageError(name='mape'), + tf.keras.metrics.MeanSquaredError(name='mse'), + smape, + ], ) class AdditionalValidationSets(tf.keras.callbacks.Callback): - def __init__(self, validation_sets, tbCallback, verbose=1, batch_size=1): """ :param validation_sets: @@ -126,35 +123,35 @@ def on_epoch_end(self, epoch, logs=None): sample_weights = None else: raise ValueError() - + print(validation_set_name) - results = self.model.evaluate(x=validation_data, - verbose=self.verbose, - sample_weight=sample_weights, - batch_size=self.batch_size) + results = self.model.evaluate( + x=validation_data, + verbose=self.verbose, + sample_weight=sample_weights, + batch_size=self.batch_size, + ) for metric, result in zip(self.model.metrics_names, results): valuename = validation_set_name + '_' + metric self.history.setdefault(valuename, []).append(result) with self.tbCallback._val_writer.as_default(step=epoch): - tf.summary.scalar(valuename, result) - - + tf.summary.scalar(valuename, result) + fit_id = '.'.join( + [config['model_save_name'], datetime.datetime.now().strftime('%Y%m%d-%H%M%S')] + ) - fit_id = '.'.join([config["model_save_name"], - datetime.datetime.now().strftime("%Y%m%d-%H%M%S")]) - - logdir = f"/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}" + logdir = f'/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}' tbCallback = tf.keras.callbacks.TensorBoard(logdir) - tbCallback._val_dir = logdir+'/validation' + tbCallback._val_dir = logdir + '/validation' callbacks = tf.keras.callbacks.CallbackList( callbacks=[ tf.keras.callbacks.ModelCheckpoint( - config["prefix"] + f"models/{fit_id}/ckpts", monitor="loss", verbose=1 + config['prefix'] + f'models/{fit_id}/ckpts', monitor='loss', verbose=1 ), tf.keras.callbacks.TensorBoard( - f"/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}" + f'/home/ubuntu/tensorboard/notebook/pretrained/{fit_id}' ), ], add_history=True, @@ -162,10 +159,10 @@ def on_epoch_end(self, epoch, logs=None): model=model, ) - model.fit( - train_df.shuffle(5_000, reshuffle_each_iteration=True).batch( - 1024).prefetch(tf.data.AUTOTUNE), + train_df.shuffle(5_000, reshuffle_each_iteration=True) + .batch(1024) + .prefetch(tf.data.AUTOTUNE), # train_df.take(1000_000).cache().shuffle(100_000).batch(1024).prefetch(tf.data.AUTOTUNE), validation_data=test_df.batch(1024, drop_remainder=False).cache(), epochs=700, @@ -173,8 +170,8 @@ def on_epoch_end(self, epoch, logs=None): callbacks=callbacks, ) - model.save(config["prefix"] + 'models/'+ config["model_save_name"]) + model.save(config['prefix'] + 'models/' + config['model_save_name']) -if __name__ == "__main__": +if __name__ == '__main__': main() diff --git a/src/training/utils.py b/src/training/utils.py index e34a4d7..3445d5b 100644 --- a/src/training/utils.py +++ b/src/training/utils.py @@ -2,9 +2,9 @@ Utility functions for training script """ import tensorflow as tf -import tensorflow_io from constants import TF_SCHEMA + def decode_fn(record_bytes): """ Method to process bytes from tfrecord files