From b7c72b568eb184631cf6b1901d3cf0e0d15fdef3 Mon Sep 17 00:00:00 2001
From: Danial Dervovic <dannydervovic@hotmail.com>
Date: Thu, 9 Jul 2020 10:19:56 +0100
Subject: [PATCH] Stylized facts plot update. (#21)

* Stylized facts plot update.

Following changes:

  1. Asset return stylized facts interface is streamlined. Interface for
preprocessed WRDS data removed and ABIDES interface simplified.
  2. Order flow stylized facts processing bugfixes.
  3. Import code for sim data preprocessing refactored.
  4. Efficiency improvement via fewer disk reads from pandas.

Co-authored-by: Danial Dervovic <ddervs@googlemail.com>
---
 realism/asset_returns_stylized_facts.py       | 101 ++++++--------
 realism/market_impact/abm_market_impact.py    | 110 +++++++++++++++
 .../marketreplay_market_impact.py             | 128 ++++++++++++++++++
 realism/metrics/aggregation_normality.py      |   4 +-
 realism/metrics/autocorrelation.py            |  10 +-
 realism/metrics/kurtosis.py                   |   5 +-
 realism/metrics/metric.py                     |  39 ++----
 realism/metrics/minutely_returns.py           |   5 +-
 .../metrics/returns_volatility_correlation.py |   5 +-
 realism/metrics/volatility_clustering.py      |   8 +-
 .../metrics/volume_volatility_correlation.py  |   5 +-
 realism/order_flow_stylized_facts.py          |  16 ++-
 realism/realism_utils.py                      |   2 +-
 util/formatting/mid_price_from_orderbook.py   |  10 +-
 .../prepare_abides_data_for_plotting.py       |   4 +-
 .../prepare_dow_data_for_plotting.py          |   2 +-
 16 files changed, 341 insertions(+), 113 deletions(-)
 create mode 100644 realism/market_impact/abm_market_impact.py
 create mode 100644 realism/market_impact/marketreplay_market_impact.py

diff --git a/realism/asset_returns_stylized_facts.py b/realism/asset_returns_stylized_facts.py
index 40d5aba55..06b6e8b51 100644
--- a/realism/asset_returns_stylized_facts.py
+++ b/realism/asset_returns_stylized_facts.py
@@ -10,8 +10,11 @@
 from glob import glob
 from pathlib import Path
 import argparse
-sys.path.append('..')
-from order_flow_stylized_facts import get_plot_colors
+from tqdm import tqdm
+
+p = str(Path(__file__).resolve().parents[1])  # directory one level up from this file
+sys.path.append(p)
+from realism_utils import get_plot_colors
 from util.formatting.convert_order_stream import dir_path
 
 # Create cache folder if it does not exist
@@ -29,78 +32,65 @@
 all_metrics = [MinutelyReturns, AggregationNormality, Autocorrelation, VolatilityClustering, Kurtosis, VolumeVolatilityCorrelation, ReturnsVolatilityCorrelation]
 
 
-def get_sims(sim_dir, n, my_metric):
+def get_sims(sim_dir, my_metric, ohlcv_dict):
     sims = []
     exchanges = [a for a in Path(sim_dir).rglob('*.bz2') if "exchange" in str(a).lower()]
-    mult = math.ceil(n * samples_per_day / len(exchanges))
 
     for exchange in exchanges:
-        ohlcv = get_trades(exchange)
-        sims += my_metric.compute(ohlcv)
+        ohlcv = ohlcv_dict[exchange]
+        if ohlcv is not None:
+            sims += my_metric.compute(ohlcv)
 
-    sims = sims * mult  # Duplicate data to match size of historical data
     random.shuffle(sims)
+
     return sims
 
 
-def plot_metrics(samples_per_day, real_dir, sim_dirs, sim_colors, recompute):
+def get_ohlcvs(sim_dirs, recompute):
+    print("Loading simulation data...")
+    exchanges = []
+    for sim_dir in sim_dirs:
+        exchanges += [a for a in Path(sim_dir).rglob('*.bz2') if "exchange" in str(a).lower()]
+
+    pickled_ohclv = "cache/{}_ohclv.pickle".format("_".join(sim_dirs).replace("/", ""))
+    if (not os.path.exists(pickled_ohclv)) or recompute:  # Pickled simulated metric not found in cache.
+        ohclv_dict = dict()
+        for exchange in tqdm(exchanges, desc="Files loaded"):
+            ohclv_dict.update(
+                {exchange: get_trades(exchange)}
+            )
+        pickle.dump(ohclv_dict, open(pickled_ohclv, "wb"))
+    else:  # Pickled ohclv found in cache.
+        ohclv_dict = pickle.load(open(pickled_ohclv, "rb"))
+
+    return ohclv_dict
+
+
+def plot_metrics(sim_dirs, sim_colors, output_dir, ohclv_dict, recompute):
     # Loop through all metrics
     for my_metric in all_metrics:
         print(my_metric)
         my_metric = my_metric()
-
-        # Calculate metrics for real data
-        real_data = sorted(os.listdir(real_dir))
-        n = len(real_data)
-        pickled_real = "cache/{}_real.pickle".format(my_metric.__class__.__name__)
-
-        # HISTORICAL METRIC
-        if (not os.path.exists(pickled_real)) or recompute: # Pickled historical metric not found in cache.
-            reals = []
-            for f in real_data: # For each historical trading day...
-                print(f)
-                f = os.path.join(real_dir, f)
-                df = pd.read_pickle(f, compression="bz2")
-                df.reset_index(level=-1, inplace=True)
-                df.level_1 = pd.to_datetime(df.level_1)
-                symbols = df.index.unique().tolist()
-                random.shuffle(symbols)
-                symbols = symbols[:samples_per_day] # ...sample `samples_per_day` symbols.
-                df = df[df.index.isin(symbols)] # Only keep data for sampled symbols.
-                for sym in symbols:
-                    select = df[df.index == sym].set_index("level_1") # Set timestamp as index.
-                    vol = select["volume"]
-                    select = select.drop("volume", axis=1)
-                    select = (np.round((1000*select/select.iloc[0]).dropna())*100).astype("int")
-                    select["volume"] = vol
-                    reals += my_metric.compute(select) # Compute metric on sampled data.
-            pickle.dump(reals, open(pickled_real, "wb"))
-        else: # Pickled historical metric found in cache.
-            reals = pickle.load(open(pickled_real, "rb"))
-
-        # SIMULATED METRIC
-        first = True if real_dir is None else False
+        result = dict()
         for i, sim_dir in enumerate(sim_dirs):
             # Calculate metrics for simulated data (via sampling)
             pickled_sim = "cache/{}_{}.pickle".format(my_metric.__class__.__name__, sim_dir.replace("/",""))
             if (not os.path.exists(pickled_sim)) or recompute: # Pickled simulated metric not found in cache.
-                sims = get_sims(sim_dir, n, my_metric)
-                sims = sims[:len(reals)] # Ensure length of simulated and historical data matches
+                sims = get_sims(sim_dir, my_metric, ohclv_dict)
                 pickle.dump(sims, open(pickled_sim, "wb"))
             else: # Pickled simulated metric found in cache.
                 sims = pickle.load(open(pickled_sim, "rb"))
 
             sim_name = sim_dir.rstrip('/').split("/")[-1]
-            result = {(sim_name, sim_colors[i]): sims}
+            result.update({(sim_name, sim_colors[i]): sims})
 
-            # Create plot for each config and metric
-            my_metric.visualize(result, reals, plot_real=not first)
-            first = True
+        # Create plot for each config and metric
+        my_metric.visualize(result)
 
         plt.title(plt.gca().title.get_text())
-        try: os.mkdir("visualizations")
+        try: os.mkdir(output_dir)
         except: pass
-        plt.savefig("visualizations/{}_{}.png".format(my_metric.__class__.__name__, sim_name),bbox_inches='tight')
+        plt.savefig("{}/{}.png".format(output_dir, my_metric.__class__.__name__), bbox_inches='tight')
         plt.clf()
 
 
@@ -108,22 +98,19 @@ def plot_metrics(samples_per_day, real_dir, sim_dirs, sim_colors, recompute):
 
     parser = argparse.ArgumentParser(description='Processes historical data and simulated stream files and produce plots'
                                                  ' of stylized fact metrics for asset return distributions.')
-    parser.add_argument('-r', '--historical-data-dir', type=dir_path, required=False, help="Directory containing preprocessed"
-                                                                                    "historical data for asset return"
-                                                                                    "stylized facts")
-    parser.add_argument('-s', '--simulated-data-dir', type=dir_path, action='append', required=False,
+    parser.add_argument('-s', '--simulated-data-dir', type=dir_path, action='append', required=True,
                         help="Directory containing .bz2 output log files from ABIDES Exchange Agent. Note that the "
                              "filenames MUST contain the word 'Exchange' in any case. One can add many simulated data "
                              "directories")
     parser.add_argument('-z', '--recompute', action="store_true", help="Rerun computations without caching.")
-
+    parser.add_argument('-o', '--output-dir', default='visualizations', help='Path to output directory', type=dir_path)
 
     args, remaining_args = parser.parse_known_args()
 
     # Settings
-    samples_per_day = 30
-    real_dir = args.historical_data_dir
     sim_dirs = args.simulated_data_dir
-    sim_colors = get_plot_colors(sim_dirs, start_idx=1)
+    sim_dirs.sort()
+    sim_colors = get_plot_colors(sim_dirs)
+    ohclv_dict = get_ohlcvs(sim_dirs, args.recompute)
 
-    plot_metrics(samples_per_day, real_dir, sim_dirs, sim_colors, args.recompute)
+    plot_metrics(sim_dirs, sim_colors, args.output_dir, ohclv_dict, args.recompute)
diff --git a/realism/market_impact/abm_market_impact.py b/realism/market_impact/abm_market_impact.py
new file mode 100644
index 000000000..54a324464
--- /dev/null
+++ b/realism/market_impact/abm_market_impact.py
@@ -0,0 +1,110 @@
+import argparse
+import pandas as pd
+import numpy as np
+import sys
+p = str(Path(__file__).resolve().parents[2])  # directory two levels up from this file
+sys.path.append(p)
+
+from realism.realism_utils import make_orderbook_for_analysis
+
+
+def create_orderbooks(exchange_path, ob_path):
+    MID_PRICE_CUTOFF = 10000
+    processed_orderbook = make_orderbook_for_analysis(exchange_path, ob_path, num_levels=1,
+                                                      hide_liquidity_collapse=False)
+    cleaned_orderbook = processed_orderbook[(processed_orderbook['MID_PRICE'] > - MID_PRICE_CUTOFF) &
+                                            (processed_orderbook['MID_PRICE'] < MID_PRICE_CUTOFF)]
+    transacted_orders = cleaned_orderbook.loc[cleaned_orderbook.TYPE == "ORDER_EXECUTED"]
+
+    transacted_orders = transacted_orders.reset_index()
+    transacted_orders = transacted_orders.sort_values(by=['index', 'ORDER_ID']).iloc[1::2]
+    transacted_orders.set_index('index', inplace=True)
+    return processed_orderbook, transacted_orders, cleaned_orderbook
+
+
+def calculate_market_impact(orders_df, ob_df, start_time, end_time, tao):
+
+    def create_bins(tao, start_time, end_time, orders_df, is_buy):
+        bins = pd.interval_range(start=start_time, end=end_time, freq=pd.DateOffset(seconds=tao))
+        binned = pd.cut(orders_df.loc[orders_df.BUY_SELL_FLAG == is_buy].index, bins=bins)
+        binned_volume = orders_df.loc[orders_df.BUY_SELL_FLAG == is_buy].groupby(binned).SIZE.agg(np.sum)
+        return binned_volume
+
+    def calculate_mid_move(row):
+        try:
+            t_start = row.name.left
+            t_end = row.name.right
+            mid_t_start = mid_resampled.loc[mid_resampled.index == t_start].item()
+            mid_t_end = mid_resampled.loc[mid_resampled.index == t_end].item()
+            if row.ti < 0:
+                row.mi = -1 * ((mid_t_end - mid_t_start) / mid_t_start) * 10000 # bps
+            else:
+                row.mi = (mid_t_end - mid_t_start) / mid_t_start * 10000 # bps
+            return row.mi
+        except:
+            pass
+
+    ob_df = ob_df.reset_index().drop_duplicates(subset='index', keep='last').set_index('index')
+
+    mid = ob_df.MID_PRICE
+    mid_resampled = mid.resample(f'{tao}s').ffill()
+
+    binned_buy_volume = create_bins(tao=int(tao), start_time=start_time, end_time=end_time, orders_df=orders_df,
+                                    is_buy=True).fillna(0)
+    binned_sell_volume = create_bins(tao=int(tao), start_time=start_time, end_time=end_time, orders_df=orders_df,
+                                     is_buy=False).fillna(0)
+
+    midf = pd.DataFrame()
+    midf['buy_vol'] = binned_buy_volume
+    midf['sell_vol'] = binned_sell_volume
+    midf['ti'] = midf['buy_vol'] - midf['sell_vol']  # Trade Imbalance
+    midf['pov'] = abs(midf['ti']) / (midf['buy_vol'] + midf['sell_vol'])  # Participation of Volume in tao
+    midf['mi'] = None
+    midf.index = pd.interval_range(start=start_time, end=end_time, freq=pd.DateOffset(seconds=int(tao)))
+
+    midf.mi = midf.apply(calculate_mid_move, axis=1)
+
+    pov_bins = np.linspace(start=0, stop=1, num=1000, endpoint=False)
+    pov_binned = pd.cut(x=midf['pov'], bins=pov_bins)
+
+    midf['pov_bins'] = pov_binned
+
+    midf_gpd = midf.sort_values(by='pov_bins')
+    midf_gpd.index = midf_gpd.pov_bins
+    del midf_gpd['pov_bins']
+
+    df = pd.DataFrame(index=midf_gpd.index)
+    df['mi'] = midf_gpd['mi']
+    df['pov'] = midf_gpd['pov']
+    df = df.groupby(df.index).mean()
+
+    return df
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Market Impact Curve as described in AlmgrenChriss 05 paper')
+
+    parser.add_argument('--stock', default=None, required=True, help='stock (ABM)')
+    parser.add_argument('--date', default=None, required=True, help='date (20200101)')
+    parser.add_argument('--log', type=str, default=None, required=True, help='log folder')
+    parser.add_argument('--tao', type=int, required=True, help='Number of seconds in each bin')
+
+    args, remaining_args = parser.parse_known_args()
+
+    stock = args.stock
+    date = args.date
+    start_time = pd.Timestamp(date) + pd.to_timedelta('09:30:00')
+    end_time = pd.Timestamp(date) + pd.to_timedelta('16:00:00')
+    abides_log_folder = args.log
+
+    print('Processing market impact data for {}'.format(abides_log_folder))
+
+    processed_orderbook, transacted_orders, cleaned_orderbook = create_orderbooks(
+                                            exchange_path=abides_log_folder + '/EXCHANGE_AGENT.bz2',
+                                            ob_path=abides_log_folder + '/ORDERBOOK_{}_FULL.bz2'.format(stock))
+
+    df = calculate_market_impact(transacted_orders, cleaned_orderbook, start_time, end_time, tao=args.tao)
+    df.to_pickle(abides_log_folder + f'/market_impact_df_tao_{args.tao}.bz2')
+
+    print('Processed market impact data for {}'.format(abides_log_folder))
\ No newline at end of file
diff --git a/realism/market_impact/marketreplay_market_impact.py b/realism/market_impact/marketreplay_market_impact.py
new file mode 100644
index 000000000..4f256268e
--- /dev/null
+++ b/realism/market_impact/marketreplay_market_impact.py
@@ -0,0 +1,128 @@
+import argparse
+import pandas as pd
+import numpy as np
+
+num_levels = 50
+columns = [[f'ask_price_{level}', f'ask_size_{level}', f'bid_price_{level}', f'bid_size_{level}'] for level in range(1, num_levels+1)]
+columns = [x for b in columns for x in b]
+
+
+def process_data(abides_log_folder, stock, date):
+    csv_orderbooks_parent_folder = '/efs/data/get_real_data/lobsterized/orderbook/'
+
+    # Orderbook snapshots
+    ob_df = pd.read_csv(csv_orderbooks_parent_folder + f'orderbook_{stock}_{date}.csv')
+    ob_df.columns = columns
+    ob_df.index = pd.read_pickle(abides_log_folder + f'ORDERBOOK_{stock}_FREQ_ALL_{date.replace("-", "")}.bz2').index[1:]
+
+    start_time = pd.Timestamp(date) + pd.to_timedelta('09:30:00')
+    end_time = pd.Timestamp(date) + pd.to_timedelta('16:00:00')
+    ob_df = ob_df.loc[(ob_df.index >= start_time) & (ob_df.index <= end_time)]
+
+
+    # Transacted Orders
+    ea_df = pd.read_pickle(abides_log_folder + 'EXCHANGE_AGENT.bz2')
+    ea_df = ea_df.loc[ea_df.EventType == 'ORDER_EXECUTED']
+
+    transacted_orders_df = pd.DataFrame(columns=['TIMESTAMP', 'ORDER_ID', 'PRICE', 'SIZE', 'BUY_SELL_FLAG'])
+
+    i = 0
+    for index, row in ea_df.iterrows():
+        transacted_orders_df = transacted_orders_df.append(pd.Series(data={
+            'TIMESTAMP': index,
+            'ORDER_ID': row.Event['order_id'],
+            'PRICE': row.Event['fill_price'],
+            'SIZE': row.Event['quantity'],
+            'BUY_SELL_FLAG': row.Event['is_buy_order']
+        }), ignore_index=True)
+        i += 1
+
+    transacted_orders_df.set_index('TIMESTAMP', inplace=True)
+
+    transacted_orders_df = transacted_orders_df.sort_values(by=['TIMESTAMP', 'ORDER_ID']).iloc[1::2]
+
+    return ob_df, transacted_orders_df, start_time, end_time
+
+
+def calculate_market_impact(orders_df, ob_df, start_time, end_time, tao):
+
+    def create_bins(tao, start_time, end_time, orders_df, is_buy):
+        bins = pd.interval_range(start=start_time, end=end_time, freq=pd.DateOffset(seconds=tao))
+        binned = pd.cut(orders_df.loc[orders_df.BUY_SELL_FLAG == is_buy].index, bins=bins)
+        binned_volume = orders_df.loc[orders_df.BUY_SELL_FLAG == is_buy].groupby(binned).SIZE.agg(np.sum)
+        return binned_volume
+
+    def calculate_mid_move(row):
+        try:
+            t_start = row.name.left
+            t_end = row.name.right
+            mid_t_start = mid_resampled.loc[mid_resampled.index == t_start].item()
+            mid_t_end = mid_resampled.loc[mid_resampled.index == t_end].item()
+            if row.ti < 0:
+                row.mi = -1 * ((mid_t_end - mid_t_start) / mid_t_start) * 10000 # bps
+            else:
+                row.mi = (mid_t_end - mid_t_start) / mid_t_start * 10000 # bps
+            return row.mi
+        except:
+            pass
+
+    mid = (ob_df.ask_price_1 + ob_df.bid_price_1) / 2
+    mid_resampled = mid.resample(f'{tao}s').ffill()
+
+    binned_buy_volume = create_bins(tao=int(tao), start_time=start_time, end_time=end_time, orders_df=orders_df,
+                                    is_buy=True).fillna(0)
+    binned_sell_volume = create_bins(tao=int(tao), start_time=start_time, end_time=end_time, orders_df=orders_df,
+                                     is_buy=False).fillna(0)
+
+    midf = pd.DataFrame()
+    midf['buy_vol'] = binned_buy_volume
+    midf['sell_vol'] = binned_sell_volume
+    midf['ti'] = midf['buy_vol'] - midf['sell_vol']  # Trade Imbalance
+    midf['pov'] = abs(midf['ti']) / (midf['buy_vol'] + midf['sell_vol'])  # Participation of Volume in tao
+    midf['mi'] = None
+    midf.index = pd.interval_range(start=start_time, end=end_time, freq=pd.DateOffset(seconds=int(tao)))
+
+    midf.mi = midf.apply(calculate_mid_move, axis=1)
+
+    pov_bins = np.linspace(start=0, stop=1, num=1000, endpoint=False)
+    pov_binned = pd.cut(x=midf['pov'], bins=pov_bins)
+
+    midf['pov_bins'] = pov_binned
+
+    midf_gpd = midf.sort_values(by='pov_bins')
+    midf_gpd.index = midf_gpd.pov_bins
+    del midf_gpd['pov_bins']
+
+    df = pd.DataFrame(index=midf_gpd.index)
+    df['mi'] = midf_gpd['mi']
+    df['pov'] = midf_gpd['pov']
+    df = df.groupby(df.index).mean()
+
+    return df
+
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser(description='Market Impact Curve as described in AlmgrenChriss 05 paper')
+
+    parser.add_argument('--tao',  required=True, help='Number of seconds in each bin')
+    parser.add_argument('--ticker', required=True,  help='Name of the stock/symbol')
+    parser.add_argument('--date', required=True, help='Historical date')
+
+    args, remaining_args = parser.parse_known_args()
+
+    stock = args.ticker
+    date = args.date
+
+    abides_logs_parent_folder = '/efs/data/get_real_data/marketreplay-logs/log/'
+    abides_log_folder = abides_logs_parent_folder + f'marketreplay_{stock}_{date.replace("-", "")}/'
+
+    ob_df, orders_df, start_time, end_time = process_data(abides_log_folder=abides_log_folder,
+                                                          stock=stock,
+                                                          date=date)
+    print(f'Processed order book data for {stock} {date}, calculating market impact ...')
+
+    df = calculate_market_impact(orders_df, ob_df, start_time, end_time, tao=args.tao)
+    df.to_pickle(abides_log_folder + f'market_impact_df_tao_{args.tao}.bz2')
+
+    print(f'Processed market impact data for {stock} {date}')
\ No newline at end of file
diff --git a/realism/metrics/aggregation_normality.py b/realism/metrics/aggregation_normality.py
index d165e7dac..7f2ca3a13 100644
--- a/realism/metrics/aggregation_normality.py
+++ b/realism/metrics/aggregation_normality.py
@@ -10,5 +10,5 @@ def compute(self, df):
         df = df[["close"]].resample("10T").last()
         return self.mr.compute(df)
 
-    def visualize(self, simulated, real, plot_real=True):
-        self.hist(simulated, real, "Aggregation Normality (10 minutes)", "Log Returns", log=True, clip=.05, plot_real=plot_real)
+    def visualize(self, simulated):
+        self.hist(simulated, "Aggregation Normality (10 minutes)", "Log Returns", log=True, clip=.05)
diff --git a/realism/metrics/autocorrelation.py b/realism/metrics/autocorrelation.py
index 071295dcf..7164addac 100644
--- a/realism/metrics/autocorrelation.py
+++ b/realism/metrics/autocorrelation.py
@@ -3,6 +3,7 @@
 from metrics.minutely_returns import MinutelyReturns
 import pandas as pd
 
+
 class Autocorrelation(Metric):
 
     def __init__(self, lag=1, window=30):
@@ -16,13 +17,10 @@ def compute(self, df):
         df = df.rolling(self.window, center=True).apply(lambda x: x.autocorr(lag=self.lag), raw=False)
         return df.dropna().tolist()
 
-    def visualize(self, simulated, real, plot_real=True):
+    def visualize(self, simulated):
         min_sim = min([len(x) for x in simulated.values()])
-        min_size = min(min_sim, len(real))
-        random.shuffle(real)
-        real = real[:min_size]
         for k, v in simulated.items():
             random.shuffle(v)
-            simulated[k] = v[:min_size]
-        self.hist(simulated, real, title="Autocorrelation (lag={}, window={})".format(self.lag, self.window), xlabel="Correlation coefficient", log=False, plot_real=plot_real)
+            simulated[k] = v[:min_sim]
+        self.hist(simulated, title="Autocorrelation (lag={}, window={})".format(self.lag, self.window), xlabel="Correlation coefficient", log=False)
 
diff --git a/realism/metrics/kurtosis.py b/realism/metrics/kurtosis.py
index ef96b1eb7..863a3f0af 100644
--- a/realism/metrics/kurtosis.py
+++ b/realism/metrics/kurtosis.py
@@ -2,6 +2,7 @@
 from metrics.minutely_returns import MinutelyReturns
 from scipy.stats import kurtosis
 
+
 class Kurtosis(Metric):
 
     def __init__(self, intervals=4):
@@ -16,5 +17,5 @@ def compute(self, df):
             ks.append(kurtosis(rets))
         return [ks]
 
-    def visualize(self, simulated, real, plot_real=True):
-        self.line(simulated, real, title="Kurtosis", xlabel="Time scale (min)", ylabel="Average kurtosis", logy=True, plot_real=plot_real)
+    def visualize(self, simulated):
+        self.line(simulated, title="Kurtosis", xlabel="Time scale (min)", ylabel="Average kurtosis", logy=True)
diff --git a/realism/metrics/metric.py b/realism/metrics/metric.py
index 172b74fff..1cf284d97 100644
--- a/realism/metrics/metric.py
+++ b/realism/metrics/metric.py
@@ -1,6 +1,7 @@
 import numpy as np
 import matplotlib.pyplot as plt
 
+
 class Metric:
 
     # Returns a list of computed metrics
@@ -8,28 +9,18 @@ def compute(self, df):
         raise NotImplementedError
 
     # Visualizes metric form list of simulated metric values and list of real metric values.
-    def visualize(self, simulated, real, plot_real=True):
+    def visualize(self, simulated):
         raise NotImplementedError
 
     # Create an overlapping histogram of the provided data.
-    def hist(self, simulated, real, title="Simulated vs. Historical Histogram", xlabel="Values", log=False, bins=75, clip=None, plot_real=True):
-
-        real = np.array(real)
+    def hist(self, simulated, title="Simulation data histogram", xlabel="Values", log=False, bins=75, clip=None):
         for k, v in simulated.items():
-            simulated[k] = np.array(v)
-
-        if clip is not None:
-            for k, v in simulated.items():
-                simulated[k] = v[(v <= clip) & (v >= -1*clip)]
-            real = real[(real <= clip) & (real >= -1*clip)]
-
-        as_numpy = np.stack(list(simulated.values()))
-        left = min(as_numpy.min(), min(real))
-        right = max(as_numpy.max(), max(real))
+            simulated[k] = np.array(v).reshape((len(v), 1))
+        first_sim = simulated[list(simulated.keys())[0]]
+        as_numpy = np.vstack(list(simulated.values()))
+        left = min(as_numpy.min(), min(first_sim))
+        right = max(as_numpy.max(), max(first_sim))
         bins = np.linspace(left, right, bins)
-        
-        if plot_real:
-            plt.hist(real, bins=bins, color="red", log=log, alpha=1, label="Historical", histtype="step", linewidth=3)
 
         # Show histograms
         for k, v in simulated.items():
@@ -41,19 +32,11 @@ def hist(self, simulated, real, title="Simulated vs. Historical Histogram", xlab
         plt.legend()
 
     # Create a line plot of the simulated and real metrics. Also calculates error bars.
-    def line(self, simulated, real, title="Simulated vs. Historical", xlabel="X", ylabel="Y", logy=False, plot_real=True):
-        real = np.array(real)
+    def line(self, simulated, title="Simulation data", xlabel="X", ylabel="Y", logy=False):
         for k, v in simulated.items():
             simulated[k] = np.array(v)
-
-        x = np.arange(real.shape[1])+1
-
-        err_real = np.nanstd(real, axis=0)
-        real = np.nanmean(real, axis=0)
-
-        if plot_real:
-            plt.plot(x, real, color="red", linewidth=4, label="Historical")
-            #plt.fill_between(x, real-err_real, real+err_real, alpha=.1, color="red").set_linestyle('dashed')
+        first_sim = simulated[list(simulated.keys())[0]]
+        x = np.arange(first_sim.shape[1])+1
 
         for k, v in simulated.items():
             err_simulated = np.nanstd(v, axis=0)
diff --git a/realism/metrics/minutely_returns.py b/realism/metrics/minutely_returns.py
index 43070fd50..db44af2f8 100644
--- a/realism/metrics/minutely_returns.py
+++ b/realism/metrics/minutely_returns.py
@@ -2,6 +2,7 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
+
 class MinutelyReturns(Metric):
 
     def compute(self, df):
@@ -10,5 +11,5 @@ def compute(self, df):
         df = df.diff().dropna()
         return df.tolist()
 
-    def visualize(self, simulated, real, plot_real=True):
-        self.hist(simulated, real, title="Minutely Log Returns", xlabel="Log Returns", log=True, clip=.05, plot_real=plot_real)
\ No newline at end of file
+    def visualize(self, simulated):
+        self.hist(simulated, title="Minutely Log Returns", xlabel="Log Returns", log=True, clip=.05)
diff --git a/realism/metrics/returns_volatility_correlation.py b/realism/metrics/returns_volatility_correlation.py
index 6796b38f0..cbc8d0370 100644
--- a/realism/metrics/returns_volatility_correlation.py
+++ b/realism/metrics/returns_volatility_correlation.py
@@ -3,6 +3,7 @@
 from scipy.stats import kurtosis
 import numpy as np
 
+
 class ReturnsVolatilityCorrelation(Metric):
 
     def __init__(self, intervals=4):
@@ -13,5 +14,5 @@ def compute(self, df):
         volatility = abs(returns)
         return [np.corrcoef(returns, volatility)[0,1]]
 
-    def visualize(self, simulated, real, plot_real=True):
-        self.hist(simulated, real, title="Returns/Volatility Correlation", xlabel="Correlation coefficient", plot_real=plot_real, bins=50)
+    def visualize(self, simulated):
+        self.hist(simulated, title="Returns/Volatility Correlation", xlabel="Correlation coefficient", bins=50)
diff --git a/realism/metrics/volatility_clustering.py b/realism/metrics/volatility_clustering.py
index 0c22371a2..aae606793 100644
--- a/realism/metrics/volatility_clustering.py
+++ b/realism/metrics/volatility_clustering.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import numpy as np
 
+
 class VolatilityClustering(Metric):
 
     def __init__(self, lags=10, mode="abs"):
@@ -21,8 +22,7 @@ def compute(self, df):
             df = abs(df)
         elif self.mode == "square":
             df = df ** 2
-        return [[df.autocorr(lag) for lag in range(1,self.lags+1)]]
-
-    def visualize(self, simulated, real, plot_real=True):
-        self.line(simulated, real, "Volatility Clustering/Long Range Dependence", "Lag", "Correlation coefficient", plot_real=plot_real)
+        return [[df.autocorr(lag) for lag in range(1, self.lags+1)]]
 
+    def visualize(self, simulated):
+        self.line(simulated, "Volatility Clustering/Long Range Dependence", "Lag", "Correlation coefficient")
diff --git a/realism/metrics/volume_volatility_correlation.py b/realism/metrics/volume_volatility_correlation.py
index 43d36f722..29b254a6b 100644
--- a/realism/metrics/volume_volatility_correlation.py
+++ b/realism/metrics/volume_volatility_correlation.py
@@ -3,6 +3,7 @@
 from scipy.stats import kurtosis
 import numpy as np
 
+
 class VolumeVolatilityCorrelation(Metric):
 
     def __init__(self, intervals=4):
@@ -13,5 +14,5 @@ def compute(self, df):
         volume = df["volume"].iloc[1:].values
         return [np.corrcoef(volume, volatility)[0,1]]
 
-    def visualize(self, simulated, real, plot_real=True):
-        self.hist(simulated, real, title="Volume/Volatility Correlation", xlabel="Correlation coefficient", plot_real=plot_real)
+    def visualize(self, simulated):
+        self.hist(simulated, title="Volume/Volatility Correlation", xlabel="Correlation coefficient")
diff --git a/realism/order_flow_stylized_facts.py b/realism/order_flow_stylized_facts.py
index 4a4d2a3c3..a01af46dc 100644
--- a/realism/order_flow_stylized_facts.py
+++ b/realism/order_flow_stylized_facts.py
@@ -47,6 +47,9 @@ class Constants:
     intraday_volume_linewidth = 5
 
 
+YEAR_OFFSET = 0.1  # adds offset of ${YEAR_OFFSET} years to each trace to differentiate between seeds
+
+
 def unpickle_stream_dfs_to_stream_list(dir_containing_pickles):
     """ Extracts pickled dataframes over a number of dates to a dict containing dataframes and their dates.
 
@@ -57,7 +60,7 @@ def unpickle_stream_dfs_to_stream_list(dir_containing_pickles):
     """
 
     bundled_streams = []
-    symbol_regex = r".*\/orders_(\w*)_(\d{8}).pkl"
+    symbol_regex = r".*\/orders_(\w*)_(\d{8}).*.pkl"
 
     stream_file_list = glob.glob(f"{dir_containing_pickles}/orders*.pkl")
     for stream_pkl in stream_file_list:
@@ -86,12 +89,19 @@ def bundled_stream_interarrival_times(bundled_streams):
     """ From bundled streams return dict with interarrival times collated by symbol. """
 
     interarrivals_dict = dict()
+    year_offset = 0
 
     for idx, elem in enumerate(bundled_streams):
         print(f"Processing elem {idx + 1} of {len(bundled_streams)}")
+
+        year_offset_td = pd.Timedelta(int(365 * (year_offset * YEAR_OFFSET)), unit='day')
         orders_df = elem["orders_df"]
         symbol = elem["symbol"]
         arrival_times = orders_df.index.to_series()
+        # offset arrival times by ${YEAR_OFFSET} to separate same day, different seed
+        arrival_times = arrival_times + year_offset_td
+        arrival_times.index = arrival_times.index + year_offset_td
+
         interarrival_times = arrival_times.diff()
         interarrival_times = interarrival_times.iloc[1:].apply(pd.Timedelta.total_seconds)
         interarrival_times = interarrival_times.rename("Interarrival time /s")
@@ -101,6 +111,8 @@ def bundled_stream_interarrival_times(bundled_streams):
         else:
             interarrivals_dict[symbol] = interarrivals_dict[symbol].append(interarrival_times)
 
+        year_offset += 1
+
     return interarrivals_dict
 
 
@@ -172,7 +184,7 @@ def bundled_stream_binned_trade_counts(bundled_interarrivals_dict, binwidth):
     trades_within_bins_dict = dict()
 
     for symbol, interarrival_times in bundled_interarrivals_dict.items():
-        series_list = [group[1] for group in interarrival_times.groupby(interarrival_times.index.day)]
+        series_list = [group[1] for group in interarrival_times.groupby(interarrival_times.index.date)]
         for idx, series in enumerate(series_list):
             print(f"Processing series {idx + 1} of {len(series_list)} for symbol {symbol}")
             counted_trades = count_trades_within_bins(series, binwidth=binwidth)
diff --git a/realism/realism_utils.py b/realism/realism_utils.py
index dc4302007..cc99d96d3 100644
--- a/realism/realism_utils.py
+++ b/realism/realism_utils.py
@@ -24,7 +24,7 @@ def get_trades(sim_file):
   # Code taken from `read_simulated_trades`
   try:
     df = pd.read_pickle(sim_file, compression='bz2')
-  except OSError:
+  except (OSError, EOFError):
       return None
   
   df = df[df['EventType'] == 'LAST_TRADE']
diff --git a/util/formatting/mid_price_from_orderbook.py b/util/formatting/mid_price_from_orderbook.py
index a446fb0b0..e7381385b 100644
--- a/util/formatting/mid_price_from_orderbook.py
+++ b/util/formatting/mid_price_from_orderbook.py
@@ -1,5 +1,11 @@
-from convert_order_book import process_orderbook, is_wide_book
-from convert_order_stream import dir_path
+import sys
+from pathlib import Path
+
+p = str(Path(__file__).resolve().parents[2])  # directory two levels up from this file
+sys.path.append(p)
+
+from util.formatting.convert_order_book import process_orderbook, is_wide_book
+from util.formatting.convert_order_stream import dir_path
 import pandas as pd
 import os
 import argparse
diff --git a/util/formatting/prepare_abides_data_for_plotting.py b/util/formatting/prepare_abides_data_for_plotting.py
index 339ab6dad..3898773b8 100644
--- a/util/formatting/prepare_abides_data_for_plotting.py
+++ b/util/formatting/prepare_abides_data_for_plotting.py
@@ -1,8 +1,8 @@
-from convert_order_stream import convert_stream_to_format
+from util.formatting.convert_order_stream import convert_stream_to_format
 import os
 import argparse
 from dateutil.parser import parse
-from convert_order_stream import dir_path
+from util.formatting.convert_order_stream import dir_path
 import pandas as pd
 
 
diff --git a/util/formatting/prepare_dow_data_for_plotting.py b/util/formatting/prepare_dow_data_for_plotting.py
index 5684c757a..56216938e 100644
--- a/util/formatting/prepare_dow_data_for_plotting.py
+++ b/util/formatting/prepare_dow_data_for_plotting.py
@@ -1,6 +1,6 @@
 import pandas as pd
 from datetime import datetime
-from convert_order_stream import dir_path
+from util.formatting.convert_order_stream import dir_path
 import argparse
 from dateutil.parser import parse
 from datetime import timedelta