From a6b9de90c7066a3df629699cc23946bad60b0109 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 28 Apr 2021 10:49:26 +0100 Subject: [PATCH 01/20] move actions from core, stop from being a provider module --- anvil/core.py | 8 -------- anvil/models.py | 9 ++++++++- anvil/scripts/anvil_sample.py | 1 - anvil/scripts/anvil_train.py | 2 +- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/anvil/core.py b/anvil/core.py index 8c72a13..c2a3aaf 100644 --- a/anvil/core.py +++ b/anvil/core.py @@ -6,7 +6,6 @@ import torch import torch.nn as nn -from reportengine import collect ACTIVATION_LAYERS = { "leaky_relu": nn.LeakyReLU, @@ -76,10 +75,3 @@ def forward(self, v_in: torch.tensor): shape (n_batch, size_out) """ return self.network(v_in) - -_normalising_flow = collect("model_action", ("model_params",)) - -def model_to_load(_normalising_flow): - return _normalising_flow[0] - - diff --git a/anvil/models.py b/anvil/models.py index 55cb6ed..3596b04 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -8,8 +8,9 @@ """ from functools import partial -from anvil.core import Sequential +from reportengine import collect +from anvil.core import Sequential import anvil.layers as layers @@ -105,6 +106,12 @@ def affine_spline(real_nvp, rational_quadratic_spline): return Sequential(real_nvp, rational_quadratic_spline) +_normalising_flow = collect("model_action", ("model_params",)) + +def model_to_load(_normalising_flow): + return _normalising_flow[0] + + MODEL_OPTIONS = { "nice": nice, "real_nvp": real_nvp, diff --git a/anvil/scripts/anvil_sample.py b/anvil/scripts/anvil_sample.py index ff4b9d1..66ed8c5 100644 --- a/anvil/scripts/anvil_sample.py +++ b/anvil/scripts/anvil_sample.py @@ -13,7 +13,6 @@ log = logging.getLogger(__name__) PROVIDERS = [ - "anvil.core", "anvil.models", "anvil.sample", "anvil.models", diff --git a/anvil/scripts/anvil_train.py b/anvil/scripts/anvil_train.py index 437efe0..6767c3b 100644 --- a/anvil/scripts/anvil_train.py +++ b/anvil/scripts/anvil_train.py @@ -12,7 +12,7 @@ log = logging.getLogger(__name__) -PROVIDERS = ["anvil.train", "anvil.checkpoint", "anvil.core", "anvil.models"] +PROVIDERS = ["anvil.train", "anvil.checkpoint", "anvil.models"] TRAINING_ACTIONS = ["train"] From 5b6b28d8a5e85c13f9a6d826856db138c944ce5e Mon Sep 17 00:00:00 2001 From: wilsonm Date: Wed, 28 Apr 2021 11:25:58 +0100 Subject: [PATCH 02/20] add new type of model, which is a sequence of other models, allowing for much more flexibility --- anvil/config.py | 20 ++++++++- anvil/core.py | 7 ++- anvil/models.py | 16 ++++++- examples/runcards/train_sequential_model.yml | 46 ++++++++++++++++++++ 4 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 examples/runcards/train_sequential_model.yml diff --git a/anvil/config.py b/anvil/config.py index aaaecc3..96953d0 100644 --- a/anvil/config.py +++ b/anvil/config.py @@ -13,7 +13,7 @@ from anvil.geometry import Geometry2D from anvil.checkpoint import TrainingOutput -from anvil.models import MODEL_OPTIONS +from anvil.models import MODEL_OPTIONS, LOADED_MODEL_OPTIONS from anvil.distributions import BASE_OPTIONS, TARGET_OPTIONS from random import randint @@ -91,6 +91,24 @@ def produce_model_action(self, model: str): except KeyError: raise ConfigError(f"Invalid model {model}", model, MODEL_OPTIONS.keys()) + @explicit_node + def produce_model_to_load(self, model: str, model_params): + """Decides whether to load sequential model or a preset combination""" + if isinstance(model_params, list): + inner_models = {inner.get("model") for inner in model_params} + 
if ("sequential_model" in inner_models) or (None in inner_models): + raise ConfigError( + "Inner models cannot be undefined or `sequential_model`", + inner_models, + MODEL_OPTIONS.keys() + ) + if model != "sequential_model": + raise ConfigError( + "model_params can only be a list when the model is `sequential_model`" + ) + return LOADED_MODEL_OPTIONS["sequential_model"] + return LOADED_MODEL_OPTIONS["preset_model"] + def parse_n_batch(self, nb: int): """Batch size for training.""" return nb diff --git a/anvil/core.py b/anvil/core.py index c2a3aaf..df85ffc 100644 --- a/anvil/core.py +++ b/anvil/core.py @@ -1,7 +1,10 @@ # SPDX-License-Identifier: GPL-3.0-or-later # Copywrite © 2021 anvil Michael Wilson, Joe Marsh Rossney, Luigi Del Debbio -r""" -coupling.py +""" +core.py + +Module containing project specific extensions to pytorch base classes. + """ import torch import torch.nn as nn diff --git a/anvil/models.py b/anvil/models.py index 3596b04..d079c09 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -108,10 +108,18 @@ def affine_spline(real_nvp, rational_quadratic_spline): _normalising_flow = collect("model_action", ("model_params",)) -def model_to_load(_normalising_flow): +def preset_model(_normalising_flow): return _normalising_flow[0] +def sequential_model(_normalising_flow): + """action which wraps a list of affine models in + :py:class:`anvil.core.Sequential`. This allows the user to specify an + arbitrary combination of layers as the model + + """ + return Sequential(*_normalising_flow) + MODEL_OPTIONS = { "nice": nice, "real_nvp": real_nvp, @@ -119,3 +127,9 @@ def model_to_load(_normalising_flow): "spline_affine": spline_affine, "affine_spline": affine_spline, } + + +LOADED_MODEL_OPTIONS = { + "preset_model": preset_model, + "sequential_model": sequential_model +} diff --git a/examples/runcards/train_sequential_model.yml b/examples/runcards/train_sequential_model.yml new file mode 100644 index 0000000..81c735c --- /dev/null +++ b/examples/runcards/train_sequential_model.yml @@ -0,0 +1,46 @@ +# Example of how to specify a custom sequential model explicitly. 
+ +# Lattice +lattice_length: 6 +lattice_dimension: 2 + +# Target +target: phi_four +parameterisation: albergo2019 +couplings: + m_sq: -4 + lam: 6.975 + +# Model +base: gaussian + +model: sequential_model + +model_params: + - model: real_nvp + n_affine: 2 + z2_equivar: true + activation: tanh + hidden_shape: [72] + - model: rational_quadratic_spline + n_spline: 1 + n_segments: 8 + z2_equivar_spline: false + activation: tanh + hidden_shape: [72] + +# Training +n_batch: 1000 +epochs: 2000 +save_interval: 1000 + +# Optimizer +optimizer: Adam +optimizer_params: + lr: 0.005 + +# Scheduler +scheduler: CosineAnnealingLR +scheduler_params: + T_max: 2000 + From 7b6b3c5eab967039f370f4945a8a239ec47f25dd Mon Sep 17 00:00:00 2001 From: wilsonm Date: Thu, 29 Apr 2021 11:42:27 +0100 Subject: [PATCH 03/20] remove duplicate import --- anvil/scripts/anvil_sample.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anvil/scripts/anvil_sample.py b/anvil/scripts/anvil_sample.py index 66ed8c5..a216ab2 100644 --- a/anvil/scripts/anvil_sample.py +++ b/anvil/scripts/anvil_sample.py @@ -15,7 +15,6 @@ PROVIDERS = [ "anvil.models", "anvil.sample", - "anvil.models", "anvil.observables", "anvil.plot", "anvil.table", From 4122eb76a62da5b515c623cf2900760d471259bd Mon Sep 17 00:00:00 2001 From: wilsonm Date: Thu, 29 Apr 2021 16:17:39 +0100 Subject: [PATCH 04/20] only allow explicit specification of models --- anvil/benchmark_config/free_scalar_train.yml | 7 +- anvil/config.py | 26 +-- anvil/layers.py | 6 +- anvil/models.py | 211 +++++++++++++------ examples/runcards/train.yml | 24 ++- examples/runcards/train_sequential_model.yml | 46 ---- 6 files changed, 173 insertions(+), 147 deletions(-) delete mode 100644 examples/runcards/train_sequential_model.yml diff --git a/anvil/benchmark_config/free_scalar_train.yml b/anvil/benchmark_config/free_scalar_train.yml index 5edd89e..9b85f20 100644 --- a/anvil/benchmark_config/free_scalar_train.yml +++ b/anvil/benchmark_config/free_scalar_train.yml @@ -12,12 +12,9 @@ couplings: # Model base: gaussian -#model: rational_quadratic_spline -#model: real_nvp -model: nice model_params: - n_affine: 2 - n_additive: 2 + layer: nice + n_blocks: 2 hidden_shape: [36] activation: tanh z2_equivar: True diff --git a/anvil/config.py b/anvil/config.py index 96953d0..7eeb68e 100644 --- a/anvil/config.py +++ b/anvil/config.py @@ -13,7 +13,7 @@ from anvil.geometry import Geometry2D from anvil.checkpoint import TrainingOutput -from anvil.models import MODEL_OPTIONS, LOADED_MODEL_OPTIONS +from anvil.models import LAYER_OPTIONS from anvil.distributions import BASE_OPTIONS, TARGET_OPTIONS from random import randint @@ -84,30 +84,12 @@ def parse_parameterisation(self, param: str): return param @explicit_node - def produce_model_action(self, model: str): + def produce_layer_action(self, layer: str): """Given a string, return the flow model action indexed by that string.""" try: - return MODEL_OPTIONS[model] + return LAYER_OPTIONS[layer] except KeyError: - raise ConfigError(f"Invalid model {model}", model, MODEL_OPTIONS.keys()) - - @explicit_node - def produce_model_to_load(self, model: str, model_params): - """Decides whether to load sequential model or a preset combination""" - if isinstance(model_params, list): - inner_models = {inner.get("model") for inner in model_params} - if ("sequential_model" in inner_models) or (None in inner_models): - raise ConfigError( - "Inner models cannot be undefined or `sequential_model`", - inner_models, - MODEL_OPTIONS.keys() - ) - if model != "sequential_model": - raise 
ConfigError( - "model_params can only be a list when the model is `sequential_model`" - ) - return LOADED_MODEL_OPTIONS["sequential_model"] - return LOADED_MODEL_OPTIONS["preset_model"] + raise ConfigError(f"Invalid model {layer}", layer, LAYER_OPTIONS.keys()) def parse_n_batch(self, nb: int): """Batch size for training.""" diff --git a/anvil/layers.py b/anvil/layers.py index af8997f..70296ba 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -186,7 +186,7 @@ def forward(self, v_in, log_density, *unused) -> torch.Tensor: s_out = self.s_network(v_for_net) t_out = self.t_network(v_for_net) - # If enforcing s(-v) = -s(v), we want to use |s(v)| in affine transf. + # If enforcing s(-v) = s(v), we want to use |s(v)| in affine transf. if self.z2_equivar: s_out = torch.abs(s_out) @@ -259,9 +259,9 @@ def forward(self, v_in, log_density, negative_mag): v_in_passive - v_in_passive.mean() ) / v_in_passive.std() # reduce numerical instability - # Naively enforce C(-v) = -C(v) + # Naively enforce C(-v) = C(v) if self.z2_equivar: - v_in_passive_stand[negative_mag] = -v_in_passive_stand[negative_mag] + v_for_net[negative_mag] = -v_for_net[negative_mag] v_out_b = torch.zeros_like(v_in_active) gradient = torch.ones_like(v_in_active).unsqueeze(dim=-1) diff --git a/anvil/models.py b/anvil/models.py index d079c09..15faec6 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -3,8 +3,10 @@ """ models.py -Module containing reportengine actions which return callable objects that execute -normalising flows constructed from multiple layers via function composition. +Module containing reportengine actions which return normalising flow models. +Generally this involves piecing together components from :py:mod:`anvil.layers` +and :py:mod:`anvil.core` to produce sequences of transformations. + """ from functools import partial @@ -14,82 +16,187 @@ import anvil.layers as layers -def coupling_pair(coupling_layer, size_half, **layer_spec): - """Helper function which returns a callable object that performs a coupling - transformation on both even and odd lattice sites.""" - coupling_transformation = partial(coupling_layer, size_half, **layer_spec) +def _coupling_pair(coupling_layer, **kwargs): + """Helper function which wraps a pair of coupling layers from + :py:mod:`anvil.layers` in the module container + :py:class`anvil.core.Sequential`. The first transformation layer acts on + the even sites and the second transformation acts on the odd sites, so one + of these blocks ensures all sites are transformed as part of an + active partition. + + """ + coupling_transformation = partial(coupling_layer, **kwargs) return Sequential( coupling_transformation(even_sites=True), coupling_transformation(even_sites=False), ) -def real_nvp( +def _real_nvp( size_half, - n_affine, + n_blocks, hidden_shape, activation="tanh", - z2_equivar=False, + z2_equivar=True, ): - """Action that returns a callable object that performs a sequence of `n_affine` - affine coupling transformations on both partitions of the input vector.""" + r"""Action which returns a sequence of ``n_blocks`` pairs of + :py:class:`anvil.layers.AffineLayer` s, followed by a single + :py:class:`anvil.layers.GlobalRescaling` all wrapped in the module container + :py:class`anvil.core.Sequential`. + + The first ``n_blocks`` elements of the outer ``Sequential`` + are ``Sequential`` s containing a pair of ``AffineLayer`` s which + act on the even and odd sites respectively. 
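+
+    A sketch of the object this action returns, assuming ``n_blocks=2``::
+
+        Sequential(
+            Sequential(AffineLayer(even_sites=True), AffineLayer(even_sites=False)),
+            Sequential(AffineLayer(even_sites=True), AffineLayer(even_sites=False)),
+            GlobalRescaling(),
+        )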
+
+    Parameters
+    ----------
+    size_half: int
+        Inferred from ``lattice_size``, the size of the active/passive
+        partitions (which are equal size, `lattice_size / 2`).
+    n_blocks: int
+        The number of pairs of :py:class:`anvil.layers.AffineLayer`
+        transformations.
+    hidden_shape: list[int]
+        the shape of the neural networks used in the AffineLayer. The visible
+        layers are defined by the ``lattice_size``. Typically we have found
+        a single hidden layer neural network is effective, which can be
+        specified by passing a list of length 1, i.e. ``[72]`` would
+        be a single hidden layered network with 72 nodes in the hidden layer.
+    activation: str, default="tanh"
+        The activation function to use for each hidden layer. The output layer
+        of the network is linear (has no activation function).
+    z2_equivar: bool, default=True
+        Whether or not to impose z2 equivariance. This changes the transformation
+        such that the neural networks have no bias term and s(-v) = s(v) which
+        imposes a :math:`\mathbb{Z}_2` symmetry.
+
+    Returns
+    -------
+    real_nvp: anvil.core.Sequential
+        A sequence of affine transformations, which we refer to as a real NVP
+        (Non-volume preserving) flow.
+
+    See Also
+    --------
+    :py:mod:`anvil.core` contains the fully connected neural network class
+    as well as valid choices for activation functions.
+
+    """
     blocks = [
-        coupling_pair(
+        _coupling_pair(
             layers.AffineLayer,
-            size_half,
+            size_half=size_half,
             hidden_shape=hidden_shape,
             activation=activation,
             z2_equivar=z2_equivar,
         )
-        for i in range(n_affine)
+        for i in range(n_blocks)
     ]
     return Sequential(*blocks, layers.GlobalRescaling())
 
 
-def nice(
+def _nice(
     size_half,
-    n_additive,
+    n_blocks,
     hidden_shape,
     activation="tanh",
-    z2_equivar=False,
+    z2_equivar=True,
 ):
-    """Action that returns a callable object that performs a sequence of `n_affine`
-    affine coupling transformations on both partitions of the input vector."""
+    """Similar to :py:func:`real_nvp`, except it instead wraps pairs of
+    :py:class:`layers.AdditiveLayer` s followed by a single
+    :py:class:`layers.GlobalRescaling`. The pairs of ``AdditiveLayer`` s
+    act on the even and odd sites respectively.
+
+    Parameters
+    ----------
+    size_half: int
+        Inferred from ``lattice_size``, the size of the active/passive
+        partitions (which are equal size, `lattice_size / 2`).
+    n_blocks: int
+        The number of pairs of :py:class:`anvil.layers.AdditiveLayer`
+        transformations.
+    hidden_shape: list[int]
+        the shape of the neural networks used in each layer. The visible
+        layers are defined by the ``lattice_size``.
+    activation: str, default="tanh"
+        The activation function to use for each hidden layer. The output layer
+        of the network is linear (has no activation function).
+    z2_equivar: bool, default=True
+        Whether or not to impose z2 equivariance. This changes the transformation
+        such that the neural networks have no bias term and s(-v) = s(v) which
+        imposes a :math:`\mathbb{Z}_2` symmetry.
+
+    Returns
+    -------
+    nice: anvil.core.Sequential
+        A sequence of additive transformations, which we refer to as a
+        nice flow.
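+
+    A minimal usage sketch, assuming ``z`` and ``base_log_density`` come from
+    a base distribution such as :py:class:`anvil.distributions.Gaussian`, and
+    ``model`` is the ``Sequential`` returned by this action::
+
+        phi, model_log_density = model(z, base_log_density)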
+
+    """
     blocks = [
-        coupling_pair(
+        _coupling_pair(
             layers.AdditiveLayer,
-            size_half,
+            size_half=size_half,
             hidden_shape=hidden_shape,
             activation=activation,
             z2_equivar=z2_equivar,
         )
-        for i in range(n_additive)
+        for i in range(n_blocks)
     ]
     return Sequential(*blocks, layers.GlobalRescaling())
 
 
-def rational_quadratic_spline(
+def _rational_quadratic_spline(
     size_half,
     hidden_shape,
     interval=5,
-    n_spline=1,
+    n_blocks=1,
     n_segments=4,
     activation="tanh",
-    z2_equivar_spline=False,
+    z2_equivar=False,
 ):
-    """Action that returns a callable object that performs a pair of circular spline
-    transformations, one on each half of the input vector."""
+    """Similar to :py:func:`real_nvp`, except it instead wraps pairs of
+    :py:class:`layers.RationalQuadraticSplineLayer` s followed by a single
+    :py:class:`layers.GlobalRescaling`. The pairs of RQS's
+    act on the even and odd sites respectively.
+
+    Parameters
+    ----------
+    size_half: int
+        inferred from ``lattice_size``, the size of the active/passive
+        partitions (which are equal size, `lattice_size / 2`).
+    hidden_shape: list[int]
+        the shape of the neural networks used in each layer. The visible
+        layers are defined by the ``lattice_size``.
+    interval: int, default=5
+        the interval within which the RQS applies the transformation, at present
+        if a field variable is outside of this region it is mapped to itself
+        (i.e. the gradient of the transformation is 1 outside of the interval).
+    n_blocks: int, default=1
+        The number of pairs of :py:class:`anvil.layers.RationalQuadraticSplineLayer`
+        transformations.
+    n_segments: int, default=4
+        The number of segments to use in the RQS transformation.
+    activation: str, default="tanh"
+        The activation function to use for each hidden layer. The output layer
+        of the network is linear (has no activation function).
+    z2_equivar: bool, default=False
+        Whether or not to impose z2 equivariance. This is only done crudely
+        by splitting the sites according to the sign of the sum across lattice
+        sites.
+
+    """
     blocks = [
-        coupling_pair(
+        _coupling_pair(
             layers.RationalQuadraticSplineLayer,
-            size_half,
+            size_half=size_half,
             interval=interval,
             n_segments=n_segments,
             hidden_shape=hidden_shape,
             activation=activation,
-            z2_equivar=z2_equivar_spline,
+            z2_equivar=z2_equivar,
         )
-        for _ in range(n_spline)
+        for _ in range(n_blocks)
     ]
     return Sequential(
         #layers.BatchNormLayer(),
         *blocks,
         layers.GlobalRescaling(),
     )
 
 
-def spline_affine(real_nvp, rational_quadratic_spline):
-    return Sequential(rational_quadratic_spline, real_nvp)
-
-
-def affine_spline(real_nvp, rational_quadratic_spline):
-    return Sequential(real_nvp, rational_quadratic_spline)
-
-
-_normalising_flow = collect("model_action", ("model_params",))
+_normalising_flow = collect("layer_action", ("model_params",))
 
-def preset_model(_normalising_flow):
-    return _normalising_flow[0]
-
-
-def sequential_model(_normalising_flow):
-    """action which wraps a list of affine models in
-    :py:class:`anvil.core.Sequential`. This allows the user to specify an
-    arbitrary combination of layers as the model
-
-    """
-    return Sequential(*_normalising_flow)
+def model_to_load(_normalising_flow):
+    """action which wraps a list of layers in
+    :py:class:`anvil.core.Sequential`. This allows the user to specify an
+    arbitrary combination of layers as the model.
+
+    For more information
+    on valid choices for layers, see :py:var:`LAYER_OPTIONS` or the various
+    functions in :py:mod:`anvil.models` which produce sequences of the layers
+    found in :py:mod:`anvil.layers`.
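+
+    For example, a runcard fragment specifying two affine blocks followed by
+    a single spline block (the parameter values here are illustrative only)::
+
+        model_params:
+          - layer: real_nvp
+            n_blocks: 2
+            hidden_shape: [72]
+          - layer: rational_quadratic_spline
+            n_blocks: 1
+            n_segments: 8
+            hidden_shape: [72]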
""" return Sequential(*_normalising_flow) -MODEL_OPTIONS = { - "nice": nice, - "real_nvp": real_nvp, - "rational_quadratic_spline": rational_quadratic_spline, - "spline_affine": spline_affine, - "affine_spline": affine_spline, -} - - -LOADED_MODEL_OPTIONS = { - "preset_model": preset_model, - "sequential_model": sequential_model +LAYER_OPTIONS = { + "nice": _nice, + "real_nvp": _real_nvp, + "rational_quadratic_spline": _rational_quadratic_spline, } diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml index f9c319a..945fa86 100644 --- a/examples/runcards/train.yml +++ b/examples/runcards/train.yml @@ -1,3 +1,5 @@ +# Example of how to specify a custom sequential model explicitly. + # Lattice lattice_length: 6 lattice_dimension: 2 @@ -12,18 +14,18 @@ couplings: # Model base: gaussian -model: affine_spline model_params: - hidden_shape: [72] - activation: tanh - - n_affine: 2 - z2_equivar: true - - n_spline: 1 - n_segments: 8 - z2_equivar_spline: false - + - layer: real_nvp + n_blocks: 2 + z2_equivar: true + activation: tanh + hidden_shape: [72] + - layer: rational_quadratic_spline + n_blocks: 1 + n_segments: 8 + z2_equivar: false + activation: tanh + hidden_shape: [72] # Training n_batch: 1000 diff --git a/examples/runcards/train_sequential_model.yml b/examples/runcards/train_sequential_model.yml deleted file mode 100644 index 81c735c..0000000 --- a/examples/runcards/train_sequential_model.yml +++ /dev/null @@ -1,46 +0,0 @@ -# Example of how to specify a custom sequential model explicitly. - -# Lattice -lattice_length: 6 -lattice_dimension: 2 - -# Target -target: phi_four -parameterisation: albergo2019 -couplings: - m_sq: -4 - lam: 6.975 - -# Model -base: gaussian - -model: sequential_model - -model_params: - - model: real_nvp - n_affine: 2 - z2_equivar: true - activation: tanh - hidden_shape: [72] - - model: rational_quadratic_spline - n_spline: 1 - n_segments: 8 - z2_equivar_spline: false - activation: tanh - hidden_shape: [72] - -# Training -n_batch: 1000 -epochs: 2000 -save_interval: 1000 - -# Optimizer -optimizer: Adam -optimizer_params: - lr: 0.005 - -# Scheduler -scheduler: CosineAnnealingLR -scheduler_params: - T_max: 2000 - From 4dbe1a45a1d5487fe4bb3ab68c15dcff2c047ba1 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 10:52:21 +0100 Subject: [PATCH 05/20] move sequential to layers --- anvil/core.py | 11 --------- anvil/layers.py | 66 ++++++++++++++++++++++++++++++++++--------------- anvil/models.py | 38 ++++++++++++++-------------- 3 files changed, 64 insertions(+), 51 deletions(-) diff --git a/anvil/core.py b/anvil/core.py index df85ffc..81ea02b 100644 --- a/anvil/core.py +++ b/anvil/core.py @@ -17,17 +17,6 @@ } -class Sequential(nn.Sequential): - """Modify the nn.Sequential class so that it takes an input vector *and* a - value for the current logarithm of the model density, returning an output - vector and the updated log density.""" - - def forward(self, v, log_density, *args): - for module in self: - v, log_density = module(v, log_density, *args) - return v, log_density - - class FullyConnectedNeuralNetwork(nn.Module): """Generic class for neural networks used in coupling layers. diff --git a/anvil/layers.py b/anvil/layers.py index 70296ba..f91253f 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -3,32 +3,46 @@ r""" layers.py -Contains nn.Modules which implement transformations of input configurations whilst computing -the Jacobian determinant of the transformation. 
-
-Each transformation layers may contain several neural networks or learnable parameters.
-
-A normalising flow, f, can be constructed from multiple layers using function composition:
-
-    f(z) = g_n( ... ( g_2( g_1( z ) ) ) ... )
-
-which is implemented using the architecture provided by torch.nn
+Contains the transformations or "layers" which are the building blocks of
+normalising flows. The layers are implemented using the PyTorch library, which
+in practice means they subclass :py:class:`torch.nn.Module`. For more
+information, check out the PyTorch
+`Module docs <https://pytorch.org/docs/stable/generated/torch.nn.Module.html>`_.
+
+The basic idea of a flow is to generate a latent variable, in our framework
+this would be using a class in :py:mod:`anvil.distributions`. The latent
+variables are then transformed by sequentially applying the transformation
+layers. The key feature of the transformations is the ability to easily calculate
+the Jacobian determinant. If the base density function is known, then we can
+evaluate the model density exactly.
+
+The bottom line is that we enforce a convention for the ``forward`` method
+of each layer (a special method of :py:class:`torch.nn.Module` subclasses).
+All layers in this module should contain a ``forward`` method which takes two
+:py:class:`torch.Tensor` objects as inputs:
+
+    - a batch of input configurations, dimensions ``(batch size, lattice size)``.
+    - a batch of scalars, dimensions ``(batch size, 1)``, that are the logarithm of the
+      'current' probability density, at this stage in the normalising flow.
 
-All layers in this module contain a `forward` method which takes two torch.tensor objects
-as inputs:
+Each transformation layer may contain several neural networks or learnable
+parameters.
 
-    - a batch of input configurations, dimensions (batch size, lattice size).
+A full normalising flow, f, can be constructed from multiple layers using
+function composition:
 
-    - a batch of scalars, dimensions (batch size, 1), that are the logarithm of the
-      'current' probability density, at this stage in the normalising flow.
+.. math::
 
-and returns two torch.tensor objects:
+    f(z) = g_{N_layers}( \ldots ( g_2( g_1( z ) ) ) \ldots )
 
-    - a batch of configurations \phi which have been transformed according to the
-      transformation, with the same dimensions as the input configurations.
+As a matter of convenience we provide a subclass of
+:py:class:`torch.nn.Sequential`, which is initialised by passing multiple layers
+as arguments (in the order in which the layers are applied). The main feature
+of our version, :py:class:`Sequential`, is that it conforms to our ``forward``
+convention. From the perspective of the user :py:class:`Sequential` appears
+as a single subclass of :py:class:`torch.nn.Module` which performs the
+full normalising flow transformation :math:`f(z)`.
 
-    - the updated logarithm of the probability density, including the contribution from
-      the Jacobian determinant of this transformation.
+"""
 import torch
 import torch.nn as nn
@@ -428,3 +442,15 @@ def forward(self, v_in, log_density, *unused):
         v_out = self.scale * v_in
         log_density -= v_out.shape[-1] * torch.log(self.scale)
         return v_out, log_density
+
+
+class Sequential(nn.Sequential):
+    """Similar to :py:class:`torch.nn.Sequential` except conforms to our
+    ``forward`` convention.
+ + """ + + def forward(self, v, log_density, *args): + for module in self: + v, log_density = module(v, log_density, *args) + return v, log_density diff --git a/anvil/models.py b/anvil/models.py index 15faec6..6553e1e 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -5,34 +5,33 @@ Module containing reportengine actions which return normalising flow models. Generally this involves piecing together components from :py:mod:`anvil.layers` -and :py:mod:`anvil.core` to produce sequences of transformations. +to produce sequences of transformations. """ from functools import partial from reportengine import collect -from anvil.core import Sequential import anvil.layers as layers def _coupling_pair(coupling_layer, **kwargs): """Helper function which wraps a pair of coupling layers from :py:mod:`anvil.layers` in the module container - :py:class`anvil.core.Sequential`. The first transformation layer acts on + :py:class`layers.Sequential`. The first transformation layer acts on the even sites and the second transformation acts on the odd sites, so one of these blocks ensures all sites are transformed as part of an active partition. """ coupling_transformation = partial(coupling_layer, **kwargs) - return Sequential( + return layers.Sequential( coupling_transformation(even_sites=True), coupling_transformation(even_sites=False), ) -def _real_nvp( +def real_nvp( size_half, n_blocks, hidden_shape, @@ -42,7 +41,7 @@ def _real_nvp( r"""Action which returns a sequence of ``n_blocks`` pairs of :py:class:`anvil.layers.AffineLayer` s, followed by a single :py:class:`anvil.layers.GlobalRescaling` all wrapped in the module container - :py:class`anvil.core.Sequential`. + :py:class`layers.Sequential`. The first ``n_blocks`` elements of the outer ``Sequential`` are ``Sequential`` s containing a pair of ``AffineLayer`` s which @@ -72,7 +71,7 @@ def _real_nvp( Returns ------- - real_nvp: anvil.core.Sequential + real_nvp: layers.Sequential A sequence of affine transformations, which we refer to as a real NVP (Non-volume preserving) flow. @@ -92,17 +91,17 @@ def _real_nvp( ) for i in range(n_blocks) ] - return Sequential(*blocks, layers.GlobalRescaling()) + return layers.Sequential(*blocks, layers.GlobalRescaling()) -def _nice( +def nice( size_half, n_blocks, hidden_shape, activation="tanh", z2_equivar=True, ): - """Similar to :py:func:`real_nvp`, excepts instead wraps pairs of + r"""Similar to :py:func:`real_nvp`, excepts instead wraps pairs of :py:class:`layers.AdditiveLayer` s followed by a single :py:class:`layers.GlobalRescaling`. The pairs of ``AdditiveLayer`` s act on the even and odd sites respectively. @@ -128,7 +127,7 @@ def _nice( Returns ------- - nice: anvil.core.Sequential + nice: layers.Sequential A sequence of additive transformations, which we refer to as a nice flow. @@ -143,10 +142,10 @@ def _nice( ) for i in range(n_blocks) ] - return Sequential(*blocks, layers.GlobalRescaling()) + return layers.Sequential(*blocks, layers.GlobalRescaling()) -def _rational_quadratic_spline( +def rational_quadratic_spline( size_half, hidden_shape, interval=5, @@ -198,8 +197,7 @@ def _rational_quadratic_spline( ) for _ in range(n_blocks) ] - return Sequential( - #layers.BatchNormLayer(), + return layers.Sequential( *blocks, layers.GlobalRescaling(), ) @@ -208,7 +206,7 @@ def _rational_quadratic_spline( def model_to_load(_normalising_flow): """action which wraps a list of layers in - :py:class:`anvil.core.Sequential`. This allows the user to specify an + :py:class:`layers.Sequential`. 
This allows the user to specify an arbitrary combination of layers as the model. For more information @@ -217,10 +215,10 @@ def model_to_load(_normalising_flow): found in :py:mod:`anvil.layers`. """ - return Sequential(*_normalising_flow) + return layers.Sequential(*_normalising_flow) LAYER_OPTIONS = { - "nice": _nice, - "real_nvp": _real_nvp, - "rational_quadratic_spline": _rational_quadratic_spline, + "nice": nice, + "real_nvp": real_nvp, + "rational_quadratic_spline": rational_quadratic_spline, } From 8be7987e1bc802e0fe166c4772d31ea45c3c7a54 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 11:33:02 +0100 Subject: [PATCH 06/20] update module names, cleanup some docs warnings --- anvil/benchmarks.py | 4 +- anvil/distributions.py | 24 ++++---- anvil/free_scalar.py | 24 ++++---- anvil/layers.py | 17 +++--- anvil/models.py | 58 +++++++++++++++---- anvil/{core.py => neural_network.py} | 17 +++--- .../sphinx/source/get-started/basic-usage.rst | 2 +- 7 files changed, 93 insertions(+), 53 deletions(-) rename anvil/{core.py => neural_network.py} (82%) diff --git a/anvil/benchmarks.py b/anvil/benchmarks.py index 0dfe071..7175854 100644 --- a/anvil/benchmarks.py +++ b/anvil/benchmarks.py @@ -40,8 +40,8 @@ def free_scalar_theory(couplings, lattice_length): def fourier_transform(configs, training_geometry): """Takes the Fourier transform of a sample of field configurations. - Inputs - ------ + Parameters + ---------- configs: torch.tensor A (hopefully decorrelated) sample of field configurations in the split representation. Shape: (sample_size, lattice_size) diff --git a/anvil/distributions.py b/anvil/distributions.py index f34c466..919f944 100644 --- a/anvil/distributions.py +++ b/anvil/distributions.py @@ -14,8 +14,8 @@ class Gaussian: """ Class which handles the generation of a sample of latent Gaussian variables. - Inputs: - ------- + Parameters + ---------- lattice_size: int Number of nodes on the lattice. sigma: float @@ -59,6 +59,18 @@ class PhiFourScalar: The forward pass returns the corresponding log density (unnormalised) which is equal to -S + The parameters required differ depending on the parameterisation you're + using: + + ================ ============= + parameterisation couplings + ================ ============= + standard m_sq, g + albergo2019 m_sq, lam + nicoli2020 kappa, lam + bosetti2015 beta, lam + ================ ============= + Parameters ---------- geometry: @@ -70,14 +82,6 @@ class PhiFourScalar: dictionary with two entries that are the couplings of the theory. See below. - - parameterisation couplings - ------------------------------------- - standard m_sq, g - albergo2019 m_sq, lam - nicoli2020 kappa, lam - bosetti2015 beta, lam - Notes ----- The general form of the action is diff --git a/anvil/free_scalar.py b/anvil/free_scalar.py index af2a050..a985c89 100644 --- a/anvil/free_scalar.py +++ b/anvil/free_scalar.py @@ -127,8 +127,8 @@ def gen_complex_normal(n_sample, sigma, real=False): """Returns a stack of complex arrays where real and imaginary components are drawn from a Gaussian distribution with the same width. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size sigma: numpy.ndarray @@ -137,8 +137,8 @@ def gen_complex_normal(n_sample, sigma, real=False): (optional) flag. If True, the imaginary component is set to zero, but a complex array is still returned. 
- Returns: - -------- + Returns + ------- out: numpy.ndarray complex array of shape (n_sample, *sigma.shape) """ @@ -157,13 +157,13 @@ def gen_eigenmodes(self, n_sample): Gaussian distributions with variances given by the eigenvalues of the kinetic operator - see _variance() method above. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size - Returns: - -------- + Returns + ------- eigenmodes: numpy.ndarray complex array of eigenmodes with shape (n_sample, L, L) where L is the side length of the square lattice. @@ -206,13 +206,13 @@ def gen_eigenmodes(self, n_sample): def gen_real_space_fields(self, n_sample): """Returns the inverse fourier transform of a sample of eigenmodes. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size - Returns: - -------- + Returns + ------- fields: numpy.ndarray real array of real-space fields, with shape (n_sample, L, L), where L is the side-length of the square lattice. diff --git a/anvil/layers.py b/anvil/layers.py index f91253f..3f2c646 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -33,7 +33,7 @@ .. math:: - f(z) = g_{N_layers}( \ldots ( g_2( g_1( z ) ) ) \ldots ) + f(z) = g_{N_{\rm layers}}( \ldots ( g_2( g_1( z ) ) ) \ldots ) As a matter of convenience we provide a subclass of :py:class:`torch.nn.Sequential`, which is initialised by passing multiple layers @@ -47,7 +47,7 @@ import torch import torch.nn as nn -from anvil.core import FullyConnectedNeuralNetwork +from anvil.neural_network import DenseNeuralNetwork class CouplingLayer(nn.Module): @@ -127,7 +127,7 @@ def __init__( ): super().__init__(size_half, even_sites) - self.t_network = FullyConnectedNeuralNetwork( + self.t_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, @@ -175,14 +175,14 @@ def __init__( ): super().__init__(size_half, even_sites) - self.s_network = FullyConnectedNeuralNetwork( + self.s_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, activation=activation, bias=not z2_equivar, ) - self.t_network = FullyConnectedNeuralNetwork( + self.t_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, @@ -250,7 +250,7 @@ def __init__( self.size_half = size_half self.n_segments = n_segments - self.network = FullyConnectedNeuralNetwork( + self.network = DenseNeuralNetwork( size_in=size_half, size_out=size_half * (3 * n_segments - 1), hidden_shape=hidden_shape, @@ -447,10 +447,11 @@ def forward(self, v_in, log_density, *unused): class Sequential(nn.Sequential): """Similar to :py:class:`torch.nn.Sequential` except conforms to our ``forward`` convention. - """ - def forward(self, v, log_density, *args): + """overrides the base class ``forward`` method to conform to our + conventioned for expected inputs/outputs of ``forward`` methods. + """ for module in self: v, log_density = module(v, log_density, *args) return v, log_density diff --git a/anvil/models.py b/anvil/models.py index 6553e1e..14206b2 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -18,7 +18,7 @@ def _coupling_pair(coupling_layer, **kwargs): """Helper function which wraps a pair of coupling layers from :py:mod:`anvil.layers` in the module container - :py:class`layers.Sequential`. The first transformation layer acts on + :py:class`anvil.layers.Sequential`. The first transformation layer acts on the even sites and the second transformation acts on the odd sites, so one of these blocks ensures all sites are transformed as part of an active partition. 
@@ -41,7 +41,7 @@ def real_nvp( r"""Action which returns a sequence of ``n_blocks`` pairs of :py:class:`anvil.layers.AffineLayer` s, followed by a single :py:class:`anvil.layers.GlobalRescaling` all wrapped in the module container - :py:class`layers.Sequential`. + :py:class`anvil.layers.Sequential`. The first ``n_blocks`` elements of the outer ``Sequential`` are ``Sequential`` s containing a pair of ``AffineLayer`` s which @@ -71,13 +71,13 @@ def real_nvp( Returns ------- - real_nvp: layers.Sequential + real_nvp: anvil.layers.Sequential A sequence of affine transformations, which we refer to as a real NVP (Non-volume preserving) flow. See Also -------- - :py:mod:`anvil.core` contains the fully connected neural network class + :py:mod:`anvil.neural_network` contains the fully connected neural network class as well as valid choices for activation functions. """ @@ -102,8 +102,8 @@ def nice( z2_equivar=True, ): r"""Similar to :py:func:`real_nvp`, excepts instead wraps pairs of - :py:class:`layers.AdditiveLayer` s followed by a single - :py:class:`layers.GlobalRescaling`. The pairs of ``AdditiveLayer`` s + :py:class:`anvil.layers.AdditiveLayer` s followed by a single + :py:class:`anvil.layers.GlobalRescaling`. The pairs of ``AdditiveLayer`` s act on the even and odd sites respectively. Parameters @@ -127,7 +127,7 @@ def nice( Returns ------- - nice: layers.Sequential + nice: anvil.layers.Sequential A sequence of additive transformations, which we refer to as a nice flow. @@ -155,8 +155,8 @@ def rational_quadratic_spline( z2_equivar=False, ): """Similar to :py:func:`real_nvp`, excepts instead wraps pairs of - :py:class:`layers.RationalQuadraticSplineLayer` s followed by a single - :py:class:`layers.GlobalRescaling`. The pairs of RQS's + :py:class:`anvil.layers.RationalQuadraticSplineLayer` s followed by a single + :py:class:`anvil.layers.GlobalRescaling`. The pairs of RQS's act on the even and odd sites respectively. Parameters @@ -206,17 +206,51 @@ def rational_quadratic_spline( def model_to_load(_normalising_flow): """action which wraps a list of layers in - :py:class:`layers.Sequential`. This allows the user to specify an + :py:class:`anvil.layers.Sequential`. This allows the user to specify an arbitrary combination of layers as the model. - For more information - on valid choices for layers, see :py:var:`LAYER_OPTIONS` or the various + For more information on valid choices for layers, see + ``anvil.models.LAYER_OPTIONS`` or the various functions in :py:mod:`anvil.models` which produce sequences of the layers found in :py:mod:`anvil.layers`. + At present, available transformations are: + + - ``nice`` + - ``real_nvp`` + - ``rational_quadratic_spline`` + + You can see their dependencies using the ``anvil`` provider help, e.g. + for ``real_nvp``: + + .. code:: + + $ anvil-sample --help real_nvp + ... + < action docstring - poorly formatted> + ... + The following resources are read from the configuration: + + lattice_length(int): + [Used by lattice_size] + + lattice_dimension(int): Parse lattice dimension from runcard + [Used by lattice_size] + + The following additionl arguments can be used to control the + behaviour. They are set by default to sensible values: + + n_blocks + hidden_shape + activation = tanh + z2_equivar = True + + ``anvil-train`` will also provide the same information. + """ return layers.Sequential(*_normalising_flow) +# Update docstring above if you add to this! 
LAYER_OPTIONS = { "nice": nice, "real_nvp": real_nvp, diff --git a/anvil/core.py b/anvil/neural_network.py similarity index 82% rename from anvil/core.py rename to anvil/neural_network.py index 81ea02b..8806ed4 100644 --- a/anvil/core.py +++ b/anvil/neural_network.py @@ -1,9 +1,10 @@ # SPDX-License-Identifier: GPL-3.0-or-later # Copywrite © 2021 anvil Michael Wilson, Joe Marsh Rossney, Luigi Del Debbio """ -core.py +neural_network.py -Module containing project specific extensions to pytorch base classes. +Module containing neural networks which are used as part of transformation +layers, found in :py:mod:`anvil.layers`. """ import torch @@ -17,8 +18,8 @@ } -class FullyConnectedNeuralNetwork(nn.Module): - """Generic class for neural networks used in coupling layers. +class DenseNeuralNetwork(nn.Module): + """Dense neural networks used in coupling layers. Parameters ---------- @@ -30,11 +31,11 @@ class FullyConnectedNeuralNetwork(nn.Module): List specifying the number of nodes in the intermediate layers activation: (str, None) Key representing the activation function used for each layer - except the final one. - no_final_activation: bool - If True, leave the network output unconstrained. - bias: bool + except the final one. Valid options can be found in + ``ACTIVATION_LAYERS``. + bias: bool, default=True Whether to use biases in networks. + """ def __init__( diff --git a/docs/sphinx/source/get-started/basic-usage.rst b/docs/sphinx/source/get-started/basic-usage.rst index ad2e483..ef0c557 100644 --- a/docs/sphinx/source/get-started/basic-usage.rst +++ b/docs/sphinx/source/get-started/basic-usage.rst @@ -118,7 +118,7 @@ We supply some basic machinery to build your own normalising flow models. The relevant modules for this purpose are - - :py:mod:`anvil.core`: containing some basic extensions to ``pytorch`` modules relevant for this project. + - :py:mod:`anvil.neural_network`: Generic neural networks. - :py:mod:`anvil.layers`: a collection of transformation layer classes - :py:mod:`anvil.geometry`: classes which transform the output of the transformations into meaningful geometries. These dictate which sites in your lattice get alternated between active and passive partitions. - :py:mod:`anvil.distributions`: a collection of distributions which can be used as base distributions (for latent variables) or target distributions. From 2e3d6700a489f667f0a29d9c13c65cccd5a2f4da Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 11:34:44 +0100 Subject: [PATCH 07/20] remove spurious comment in train --- examples/runcards/train.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml index 945fa86..8dee34f 100644 --- a/examples/runcards/train.yml +++ b/examples/runcards/train.yml @@ -1,5 +1,3 @@ -# Example of how to specify a custom sequential model explicitly. - # Lattice lattice_length: 6 lattice_dimension: 2 From 76cb8ea50b0e4e474f77e8157eb06aa0238d4fd3 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 11:40:39 +0100 Subject: [PATCH 08/20] flatten out the inner layers in model_to_load --- anvil/models.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/anvil/models.py b/anvil/models.py index 14206b2..89abf45 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -248,7 +248,12 @@ def model_to_load(_normalising_flow): ``anvil-train`` will also provide the same information. 
""" - return layers.Sequential(*_normalising_flow) + # assume that _normalising_flow is a list of layers, each layer + # is a sequential of blocks, each block is a pair of transformations + # which transforms the entire input state - flatten this out, so output + # is Sequential of blocks + flow_flat = [block for layer in _normalising_flow for block in layer] + return layers.Sequential(*flow_flat) # Update docstring above if you add to this! LAYER_OPTIONS = { From 4d287db5883f7118a31af2bc4794ea97d8510691 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 11:45:14 +0100 Subject: [PATCH 09/20] might as well just call the model params input model --- anvil/benchmark_config/free_scalar_train.yml | 2 +- anvil/models.py | 2 +- examples/runcards/train.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/anvil/benchmark_config/free_scalar_train.yml b/anvil/benchmark_config/free_scalar_train.yml index 9b85f20..d4407da 100644 --- a/anvil/benchmark_config/free_scalar_train.yml +++ b/anvil/benchmark_config/free_scalar_train.yml @@ -12,7 +12,7 @@ couplings: # Model base: gaussian -model_params: +model: layer: nice n_blocks: 2 hidden_shape: [36] diff --git a/anvil/models.py b/anvil/models.py index 89abf45..155eea7 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -202,7 +202,7 @@ def rational_quadratic_spline( layers.GlobalRescaling(), ) -_normalising_flow = collect("layer_action", ("model_params",)) +_normalising_flow = collect("layer_action", ("model",)) def model_to_load(_normalising_flow): """action which wraps a list of layers in diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml index 8dee34f..fdda3ea 100644 --- a/examples/runcards/train.yml +++ b/examples/runcards/train.yml @@ -12,7 +12,7 @@ couplings: # Model base: gaussian -model_params: +model: - layer: real_nvp n_blocks: 2 z2_equivar: true From 3a1232ebcc6bec89f485810fcafac1cee058a127 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 13:01:19 +0100 Subject: [PATCH 10/20] add basic layers tests --- anvil/layers.py | 8 +-- anvil/tests/test_layers.py | 144 +++++++++++++++++++++++++++++++++++++ conda-recipe/meta.yaml | 1 + 3 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 anvil/tests/test_layers.py diff --git a/anvil/layers.py b/anvil/layers.py index 3f2c646..703e2c4 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -135,7 +135,7 @@ def __init__( bias=not z2_equivar, ) - def forward(self, v_in, log_density, *unused) -> torch.Tensor: + def forward(self, v_in, log_density, *args) -> torch.Tensor: r"""Forward pass of affine transformation.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] @@ -191,7 +191,7 @@ def __init__( ) self.z2_equivar = z2_equivar - def forward(self, v_in, log_density, *unused) -> torch.Tensor: + def forward(self, v_in, log_density, *args) -> torch.Tensor: r"""Forward pass of affine transformation.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] @@ -424,7 +424,7 @@ def __init__(self, scale=1): super().__init__() self.gamma = scale - def forward(self, v_in, log_density, *unused): + def forward(self, v_in, log_density, *args): """Forward pass of the batch normalisation transformation.""" mult = self.gamma / torch.std(v_in) v_out = mult * (v_in - v_in.mean()) @@ -438,7 +438,7 @@ def __init__(self, initial=1): self.scale = nn.Parameter(torch.Tensor([initial])) - def forward(self, v_in, log_density, *unused): + def forward(self, v_in, log_density, *args): v_out = 
self.scale * v_in log_density -= v_out.shape[-1] * torch.log(self.scale) return v_out, log_density diff --git a/anvil/tests/test_layers.py b/anvil/tests/test_layers.py new file mode 100644 index 0000000..bb5998f --- /dev/null +++ b/anvil/tests/test_layers.py @@ -0,0 +1,144 @@ +""" +Tests of the base classes in :py:mod:`anvil.layers` + +""" +from hypothesis import given +from hypothesis.strategies import integers, booleans +import numpy as np +import pytest +import torch + +import anvil.layers as layers +from anvil.distributions import Gaussian + +N_BATCH = 100 +SIZE = 36 +SIZE_HALF = SIZE // 2 +HIDDEN_SHAPE = [36] +ACTIVATION = "tanh" + +@given(integers(min_value=0, max_value=2**16), booleans()) +def test_coupling_init(size_half, even_sites): + """Hypothesis test the initialisation of the base class in layers""" + layers.CouplingLayer(size_half, even_sites) + + +@pytest.mark.skip(reason="batch norm in layers requires epsilon to avoid NaNs.") +def test_additive_layers(): + equivar_additive = layers.AdditiveLayer( + SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=True, + even_sites=True + ) + input_tensor = torch.zeros((N_BATCH, SIZE)) + with torch.no_grad(): + output_tensor, output_density = equivar_additive(input_tensor, 0) + + assert output_density == 0 + np.testing.assert_allclose(input_tensor.numpy(), output_tensor.numpy()) + + +def basic_layer_test(layer, input_states, input_log_density, *args): + """Basic check that layer transforms input states properly. + + In practice we check: + + - field variables and log densities are valid real numbers + - output states are correct shape + - outputs are correct typing + + """ + output_states, output_log_density = layer(input_states, input_log_density, *args) + # all numbers + any_nan = ( + torch.any(torch.isnan(output_states)) or + torch.any(torch.isnan(output_log_density)) + ) + assert not any_nan + # correct shape + assert input_states.shape == output_states.shape + + assert isinstance(output_states, torch.Tensor) + assert isinstance(output_log_density, torch.Tensor) + + +@pytest.fixture() +@torch.no_grad() +def gaussian_input(): + """Basic input states for testing""" + latent_distribution = Gaussian(SIZE) # use default standard normal + return latent_distribution(N_BATCH) + +@pytest.mark.parametrize("layer_class", [layers.AdditiveLayer, layers.AffineLayer]) +@pytest.mark.parametrize("z2_equivar", [True, False]) +@pytest.mark.parametrize("even_sites", [True, False]) +@torch.no_grad() +def test_affine_like_basic(gaussian_input, layer_class, z2_equivar, even_sites): + """Apply :py:func:`basic_layer_test` to layers with same initialisation + parameters as :py:class:`anvil.layers.AffineLayer`. + + """ + layer = layer_class( + SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=z2_equivar, + even_sites=even_sites, + ) + basic_layer_test(layer, *gaussian_input) + +@pytest.mark.parametrize("z2_equivar", [True, False]) +@pytest.mark.parametrize("even_sites", [True, False]) +@torch.no_grad() +def test_rqs_basic(gaussian_input, z2_equivar, even_sites): + """Apply :py:func:`basic_layer_test` to + :py:class:`anvil.layers.RationalQuadraticSplineLayer`. 
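
    Unlike the affine-type layers, the RQS ``forward`` takes an additional
    ``negative_mag`` argument: a boolean mask, built below from the sign of
    the summed field, marking configurations with negative magnetisation.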
+ """ + layer = layers.RationalQuadraticSplineLayer( + SIZE_HALF, + interval=5, + n_segments=4, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=z2_equivar, + even_sites=even_sites, + ) + negative_mag = gaussian_input[0].sum(dim=1) < 0 + basic_layer_test(layer, *gaussian_input, negative_mag) + +@pytest.mark.parametrize( + "layer_class", + [layers.GlobalRescaling, layers.BatchNormLayer, layers.GlobalAffineLayer] +) +@torch.no_grad() +def test_scaling_layer_basic(gaussian_input, layer_class): + if layer_class is layers.GlobalAffineLayer: + layer = layer_class(1, 0) + else: + layer = layer_class() + basic_layer_test(layer, *gaussian_input) + +@torch.no_grad() +def test_sequential_basic(gaussian_input): + inner_layers = [ + layers.AffineLayer( + SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=False, + even_sites=bool(i % 2), + ) for i in range(8)] + layer = layers.Sequential(*inner_layers) + basic_layer_test(layer, *gaussian_input) + + # check application of sequetion matches output of applying each layer. + output_states, output_density = inner_layers[0](*gaussian_input) + for el in inner_layers[1:]: + output_states, output_density = el(output_states, output_density) + + seq_output_states, seq_output_density = layer(*gaussian_input) + + np.testing.assert_allclose(seq_output_states.numpy(), output_states.numpy()) + np.testing.assert_allclose(seq_output_density.numpy(), output_density.numpy()) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a551074..e4184ab 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -27,6 +27,7 @@ build: test: requires: - pytest + - hypothesis commands: - pytest --pyargs anvil From 4f68d583a43fa72e9244f8c267a5e731551f7ea2 Mon Sep 17 00:00:00 2001 From: wilsonm Date: Fri, 30 Apr 2021 13:50:50 +0100 Subject: [PATCH 11/20] model tests, probably could be improved but at least have coverage. --- anvil/tests/test_models.py | 53 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 anvil/tests/test_models.py diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py new file mode 100644 index 0000000..8996f0b --- /dev/null +++ b/anvil/tests/test_models.py @@ -0,0 +1,53 @@ +""" +Test higher level model construction from :py:mod:`anvil.models`. + +""" +from hypothesis import given +from hypothesis.strategies import integers, lists +import pytest +import torch + +from anvil.api import API +from anvil.models import LAYER_OPTIONS + +LAYERS = list(LAYER_OPTIONS.keys()) + +PARAMS = { + "hidden_shape": (32,), + "n_blocks": 3, + "lattice_length": 6, + "lattice_dimension": 2, +} + +@pytest.mark.parametrize("layer_action", LAYERS) +def test_layer_actions(layer_action): + """Call the API on each of the layer actions, using mainly default arguments + """ + getattr(API, layer_action)(**PARAMS) + return + +# put limits on these so not to crash your computer. +@given( + lists(integers(min_value=0, max_value=2), min_size=1, max_size=3), + integers(min_value=1, max_value=4), + integers(min_value=1, max_value=8), + lists(integers(min_value=1, max_value=2 ** 6), min_size=1, max_size=3) +) +def test_model_construction(layer_idx, n_blocks, lattice_length_half, hidden_shape): + """Hypothesis test the model construction""" + # require even lattice sites. 
+ model = [{"layer": LAYERS[idx]} for idx in layer_idx] + lattice_length = 2 * lattice_length_half + params = { + "model": model, + "n_blocks": n_blocks, + "lattice_length": lattice_length, + "hidden_shape": hidden_shape, + "lattice_dimension": 2, + # for some reason the RQS defaults get missed? + "interval": 5, + "n_segments": 4, + } + # might help with memory. + with torch.no_grad(): + API.model_to_load(**params) From 20a41de2acf11e0c47ef4a40b271f0ff4b8503fa Mon Sep 17 00:00:00 2001 From: marshrossney <17361029+marshrossney@users.noreply.github.com> Date: Fri, 14 May 2021 10:58:03 +0100 Subject: [PATCH 12/20] renamed coupling_pair to coupling_block --- anvil/layers.py | 4 ++-- anvil/models.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/anvil/layers.py b/anvil/layers.py index 703e2c4..761cc78 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -200,7 +200,7 @@ def forward(self, v_in, log_density, *args) -> torch.Tensor: s_out = self.s_network(v_for_net) t_out = self.t_network(v_for_net) - # If enforcing s(-v) = s(v), we want to use |s(v)| in affine transf. + # If enforcing C(-v) = -C(v), we want to use |s(v)| in affine transf. if self.z2_equivar: s_out = torch.abs(s_out) @@ -273,7 +273,7 @@ def forward(self, v_in, log_density, negative_mag): v_in_passive - v_in_passive.mean() ) / v_in_passive.std() # reduce numerical instability - # Naively enforce C(-v) = C(v) + # Naively enforce C(-v) = -C(v) if self.z2_equivar: v_for_net[negative_mag] = -v_for_net[negative_mag] diff --git a/anvil/models.py b/anvil/models.py index 155eea7..26ae8bf 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -15,7 +15,7 @@ import anvil.layers as layers -def _coupling_pair(coupling_layer, **kwargs): +def _coupling_block(coupling_layer, **kwargs): """Helper function which wraps a pair of coupling layers from :py:mod:`anvil.layers` in the module container :py:class`anvil.layers.Sequential`. The first transformation layer acts on @@ -82,7 +82,7 @@ def real_nvp( """ blocks = [ - _coupling_pair( + _coupling_block( layers.AffineLayer, size_half=size_half, hidden_shape=hidden_shape, @@ -133,7 +133,7 @@ def nice( """ blocks = [ - _coupling_pair( + _coupling_block( layers.AdditiveLayer, size_half=size_half, hidden_shape=hidden_shape, @@ -186,7 +186,7 @@ def rational_quadratic_spline( """ blocks = [ - _coupling_pair( + _coupling_block( layers.RationalQuadraticSplineLayer, size_half=size_half, interval=interval, From af77d4113d20e3356e758d64d31d07c8df59ff47 Mon Sep 17 00:00:00 2001 From: marshrossney <17361029+marshrossney@users.noreply.github.com> Date: Fri, 14 May 2021 12:54:53 +0100 Subject: [PATCH 13/20] layer actions for batch norm and global rescaling --- anvil/layers.py | 58 ++++++++++++++++++++++++++++++++----- anvil/models.py | 57 ++++++++++++++++++++++++++---------- examples/runcards/train.yml | 2 ++ 3 files changed, 93 insertions(+), 24 deletions(-) diff --git a/anvil/layers.py b/anvil/layers.py index 761cc78..8eec9aa 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -410,14 +410,32 @@ def forward(self, v_in, log_density): return self.scale * v_in + self.shift, log_density -# TODO not necessary to define a nn.module for this now I've taken out learnable gamma +# NOTE: not necessary to define a nn.module for this now gamma is no longer learnable class BatchNormLayer(nn.Module): - """Performs batch normalisation on the input vector. + r"""Performs batch normalisation on the inputs, conforming to our ``forward`` + convention. 
+
+    Inputs are standardised over all tensor dimensions such that the resulting sample
+    has null mean and unit variance, after which a rescaling factor is applied.
+
+    .. math::
+
+        v_{\rm out} = \gamma
+        \frac{v_{\rm in} - \mathbb{E}[ v_{\rm in} ]}
+        {\sqrt{\mathrm{Var}( v_{\rm in} ) + \epsilon}}
 
     Parameters
     ----------
-    scale: int
-        An additional scale factor to be applied after batch normalisation.
+    scale: float
+        The multiplicative factor, :math:`\gamma`, applied to the standardised data.
+
+    Notes
+    -----
+    Applying batch normalisation before the first spline layer can be helpful for
+    ensuring that the inputs remain within the transformation interval. However,
+    this layer adds undesirable stochasticity which can impede optimisation. One
+    might consider replacing it with :py:class:`anvil.layers.GlobalRescaling` using
+    a static scale parameter.
     """
 
     def __init__(self, scale=1):
@@ -426,17 +444,40 @@ def __init__(self, scale=1):
 
     def forward(self, v_in, log_density, *args):
         """Forward pass of the batch normalisation transformation."""
-        mult = self.gamma / torch.std(v_in)
+        mult = self.gamma / torch.sqrt(v_in.var() + 1e-6)  # for stability
         v_out = mult * (v_in - v_in.mean())
-        log_density -= mult * v_out.shape[1]
+        log_density -= torch.log(mult) * v_out.shape[1]
         return (v_out, log_density)
 
 
 class GlobalRescaling(nn.Module):
-    def __init__(self, initial=1):
+    r"""Performs a global rescaling of the inputs via a (potentially learnable)
+    multiplicative factor, conforming to our ``forward`` convention.
+
+    Parameters
+    ----------
+    scale: float
+        The multiplicative factor applied to the inputs.
+    learnable: bool, default=True
+        If True, ``scale`` will be optimised during the training.
+
+    Notes
+    -----
+    Applying a rescaling layer with a learnable ``scale`` to the final layer of a
+    normalising flow can be useful since it avoids the need to tune earlier layers
+    to match the width of the target density. However, for best performance one
+    should generally use a static ``scale`` to reduce stochasticity in the
+    optimisation.
+
+    """
+
+    def __init__(self, scale=1, learnable=True):
         super().__init__()
-
-        self.scale = nn.Parameter(torch.Tensor([initial]))
+        if learnable:
+            self.scale = nn.Parameter(torch.Tensor([scale]))
+        else:
+            self.scale = scale
 
     def forward(self, v_in, log_density, *args):
         v_out = self.scale * v_in
         log_density -= v_out.shape[-1] * torch.log(self.scale)
         return v_out, log_density
 
 
 class Sequential(nn.Sequential):
     """Similar to :py:class:`torch.nn.Sequential` except conforms to our
     ``forward`` convention.
     """
+
     def forward(self, v, log_density, *args):
         """overrides the base class ``forward`` method to conform to our
         convention for expected inputs/outputs of ``forward`` methods.
         """
         for module in self:
             v, log_density = module(v, log_density, *args)
         return v, log_density
diff --git a/anvil/models.py b/anvil/models.py
index 26ae8bf..6c7e13d 100644
--- a/anvil/models.py
+++ b/anvil/models.py
@@ -39,8 +39,7 @@ def real_nvp(
     r"""Action which returns a sequence of ``n_blocks`` pairs of
-    :py:class:`anvil.layers.AffineLayer` s, followed by a single
-    :py:class:`anvil.layers.GlobalRescaling` all wrapped in the module container
+    :py:class:`anvil.layers.AffineLayer` s, wrapped in the module container
     :py:class`anvil.layers.Sequential`.
The first ``n_blocks`` elements of the outer ``Sequential``
@@ -89,9 +88,9 @@ def real_nvp(
 activation=activation,
 z2_equivar=z2_equivar,
 )
- for i in range(n_blocks)
+ for _ in range(n_blocks)
 ]
- return layers.Sequential(*blocks, layers.GlobalRescaling())
+ return layers.Sequential(*blocks)


 def nice(
@@ -102,9 +101,8 @@ def nice(
 z2_equivar=True,
 ):
 r"""Similar to :py:func:`real_nvp`, except it instead wraps pairs of
- :py:class:`anvil.layers.AdditiveLayer` s followed by a single
- :py:class:`anvil.layers.GlobalRescaling`. The pairs of ``AdditiveLayer`` s
- act on the even and odd sites respectively.
+ :py:class:`anvil.layers.AdditiveLayer`.
+ The pairs of ``AdditiveLayer`` s act on the even and odd sites respectively.

 Parameters
 ----------
@@ -140,9 +138,9 @@ def nice(
 activation=activation,
 z2_equivar=z2_equivar,
 )
- for i in range(n_blocks)
+ for _ in range(n_blocks)
 ]
- return layers.Sequential(*blocks, layers.GlobalRescaling())
+ return layers.Sequential(*blocks)


 def rational_quadratic_spline(
@@ -156,8 +154,7 @@ def rational_quadratic_spline(
 ):
 """Similar to :py:func:`real_nvp`, except it instead wraps pairs of
- :py:class:`anvil.layers.RationalQuadraticSplineLayer` s followed by a single
- :py:class:`anvil.layers.GlobalRescaling`. The pairs of RQS's
- act on the even and odd sites respectively.
+ :py:class:`anvil.layers.RationalQuadraticSplineLayer` s.
+ The pairs of RQS's act on the even and odd sites respectively.

 Parameters
 ----------
@@ -183,8 +180,8 @@ def rational_quadratic_spline(
 Whether or not to impose z2 equivariance. This is only done crudely by
 splitting the sites according to the sign of the sum across lattice sites.

- """
+ """
+
 blocks = [
 _coupling_block(
 layers.RationalQuadraticSplineLayer,
@@ -197,13 +194,36 @@ def rational_quadratic_spline(
 )
 for _ in range(n_blocks)
 ]
- return layers.Sequential(
- *blocks,
- layers.GlobalRescaling(),
- )
+ return layers.Sequential(*blocks)
+
+
+def batch_norm(scale=1):
+ r"""Action which returns an instance of :py:class:`anvil.layers.BatchNormLayer`.
+
+ Parameters
+ ----------
+ scale: float
+ The multiplicative factor applied to the standardised data.
+ """
+ return layers.Sequential(layers.BatchNormLayer(scale=scale))
+
+
+def global_rescaling(scale=1, learnable=True):
+ r"""Action which returns an instance of :py:class:`anvil.layers.GlobalRescaling`.
+
+ Parameters
+ ----------
+ scale: float
+ The multiplicative factor applied to the inputs.
+ learnable: bool, default=True
+ If True, ``scale`` will be optimised during the training.
+ """
+ return layers.Sequential(layers.GlobalRescaling(scale=scale, learnable=learnable))
+

 _normalising_flow = collect("layer_action", ("model",))

+
 def model_to_load(_normalising_flow):
 """Action which wraps a list of layers in
 :py:class:`anvil.layers.Sequential`. This allows the user to specify an
@@ -219,6 +239,8 @@ def model_to_load(_normalising_flow):
 - ``nice``
 - ``real_nvp``
 - ``rational_quadratic_spline``
+ - ``batch_norm``
+ - ``global_rescaling``

 You can see their dependencies using the ``anvil`` provider help, e.g. for
 ``real_nvp``:
@@ -255,9 +277,12 @@ def model_to_load(_normalising_flow):
 flow_flat = [block for layer in _normalising_flow for block in layer]
 return layers.Sequential(*flow_flat)

+
 # Update docstring above if you add to this!
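+# As a rough sketch of the intended usage (illustrative values only, mirroring
+# the example runcards updated later in this series), a runcard "model" section
+# such as
+#
+#   model:
+#     - layer: real_nvp
+#       n_blocks: 2
+#       hidden_shape: [36]
+#     - layer: global_rescaling
+#       scale: 1.0
+#
+# is collected entry by entry through the "layer_action" collect above, and
+# model_to_load flattens the resulting blocks into a single layers.Sequential.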
LAYER_OPTIONS = {
 "nice": nice,
 "real_nvp": real_nvp,
 "rational_quadratic_spline": rational_quadratic_spline,
+ "batch_norm": batch_norm,
+ "global_rescaling": global_rescaling,
 }
diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml
index fdda3ea..1115be7 100644
--- a/examples/runcards/train.yml
+++ b/examples/runcards/train.yml
@@ -24,6 +24,8 @@ model:
 z2_equivar: false
 activation: tanh
 hidden_shape: [72]
+ - layer: global_rescaling
+ learnable: true

 # Training
 n_batch: 1000

From 3b52d3c37e9224e79dc55f45ad2715dec678bb27 Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 14:11:39 +0100
Subject: [PATCH 14/20] update example runcards

---
 anvil/benchmark_config/free_scalar_train.yml | 5 ++++-
 examples/runcards/train.yml | 4 ++++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/anvil/benchmark_config/free_scalar_train.yml b/anvil/benchmark_config/free_scalar_train.yml
index d4407da..ed9fec2 100644
--- a/anvil/benchmark_config/free_scalar_train.yml
+++ b/anvil/benchmark_config/free_scalar_train.yml
@@ -13,11 +13,14 @@ couplings:
 base: gaussian

 model:
- layer: nice
+ - layer: nice
 n_blocks: 2
 hidden_shape: [36]
 activation: tanh
 z2_equivar: True
+ - layer: global_rescaling
+ scale: 1
+ learnable: True

 # Training length
 n_batch: 2000
diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml
index 1115be7..0e4144a 100644
--- a/examples/runcards/train.yml
+++ b/examples/runcards/train.yml
@@ -18,6 +18,9 @@ model:
 z2_equivar: true
 activation: tanh
 hidden_shape: [72]
+ - layer: global_rescaling
+ scale: 1
+ learnable: true
 - layer: rational_quadratic_spline
 n_blocks: 1
 n_segments: 8
@@ -25,6 +28,7 @@
 activation: tanh
 hidden_shape: [72]
 - layer: global_rescaling
+ scale: 1
 learnable: true

 # Training

From 5bfc87adb2190b11ed95c38fe462605ced19cc4e Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 14:14:48 +0100
Subject: [PATCH 15/20] remove default scale factor for global rescaling layer

---
 anvil/models.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/anvil/models.py b/anvil/models.py
index 6c7e13d..9e728b3 100644
--- a/anvil/models.py
+++ b/anvil/models.py
@@ -145,10 +145,10 @@ def nice(

 def rational_quadratic_spline(
 size_half,
+ n_blocks,
 hidden_shape,
+ n_segments,
 interval=5,
- n_blocks=1,
- n_segments=4,
 activation="tanh",
 z2_equivar=False,
 ):
@@ -161,18 +161,18 @@ def rational_quadratic_spline(
 size_half: int
 inferred from ``lattice_size``, the size of the active/passive partitions
 (which are equal size, `lattice_size / 2`).
+ n_blocks: int
+ The number of pairs of :py:class:`anvil.layers.RationalQuadraticSplineLayer`
+ transformations.
 hidden_shape: list[int]
 the shape of the neural networks used in each layer. The visible
 layers are defined by the ``lattice_size``.
+ n_segments: int
+ The number of segments to use in the RQS transformation.
 interval: int, default=5
 the interval within which the RQS applies the transformation, at present
 if a field variable is outside of this region it is mapped to itself
 (i.e. the gradient of the transformation is 1 outside of the interval).
- n_blocks: int, default=1
- The number of pairs of :py:class:`anvil.layers.AffineLayer`
- transformations. For RQS this is set to 1.
- n_segments: int, default=4
- The number of segments to use in the RQS transformation.
activation: str, default="tanh"
 The activation function to use for each hidden layer. The output layer
 of the network is linear (has no activation function).
 z2_equivar: bool, default=False
 Whether or not to impose z2 equivariance. This is only done crudely by
 splitting the sites according to the sign of the sum across lattice sites.
@@ -197,18 +197,18 @@ def rational_quadratic_spline(
 return layers.Sequential(*blocks)


-def batch_norm(scale=1):
+def batch_norm(scale=1.0):
 r"""Action which returns an instance of :py:class:`anvil.layers.BatchNormLayer`.

 Parameters
 ----------
- scale: float
+ scale: float, default=1.0
 The multiplicative factor applied to the standardised data.
 """
 return layers.Sequential(layers.BatchNormLayer(scale=scale))


-def global_rescaling(scale=1, learnable=True):
+def global_rescaling(scale, learnable=True):
 r"""Action which returns an instance of :py:class:`anvil.layers.GlobalRescaling`.

From 6e447731a5a8040630d721a3d7a2b5e2c1f2cb3b Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 14:54:22 +0100
Subject: [PATCH 16/20] add epsilon and remove shift from data standardisation

---
 anvil/layers.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/anvil/layers.py b/anvil/layers.py
index 8eec9aa..fefd9db 100644
--- a/anvil/layers.py
+++ b/anvil/layers.py
@@ -139,10 +139,9 @@ def forward(self, v_in, log_density, *args) -> torch.Tensor:
 r"""Forward pass of affine transformation."""
 v_in_passive = v_in[:, self._passive_ind]
 v_in_active = v_in[:, self._active_ind]
+ v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6)

- t_out = self.t_network(
- (v_in_passive - v_in_passive.mean()) / v_in_passive.std()
- )
+ t_out = self.t_network(v_for_net)

 v_out = self._join_func([v_in_passive, v_in_active - t_out], dim=1)

@@ -195,7 +194,7 @@ def forward(self, v_in, log_density, *args) -> torch.Tensor:
 r"""Forward pass of affine transformation."""
 v_in_passive = v_in[:, self._passive_ind]
 v_in_active = v_in[:, self._active_ind]
- v_for_net = (v_in_passive - v_in_passive.mean()) / v_in_passive.std()
+ v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6)

 s_out = self.s_network(v_for_net)
 t_out = self.t_network(v_for_net)
@@ -269,9 +268,7 @@ def forward(self, v_in, log_density, negative_mag):
 """Forward pass of the rational quadratic spline layer."""
 v_in_passive = v_in[:, self._passive_ind]
 v_in_active = v_in[:, self._active_ind]
- v_for_net = (
- v_in_passive - v_in_passive.mean()
- ) / v_in_passive.std() # reduce numerical instability
+ v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6)

 # Naively enforce C(-v) = -C(v)
 if self.z2_equivar:
@@ -474,10 +471,9 @@ class GlobalRescaling(nn.Module):

 def __init__(self, scale=1, learnable=True):
 super().__init__()
+ self.scale = torch.Tensor([scale])
 if learnable:
- self.scale = nn.Parameter(torch.Tensor([scale]))
- else:
- self.scale = scale
+ self.scale = nn.Parameter(self.scale)

 def forward(self, v_in, log_density, *args):
 v_out = self.scale * v_in

From ff023074ba39ca90e18def02594648e72bedae18 Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 14:54:36 +0100
Subject: [PATCH 17/20] update tests

---
 anvil/tests/test_layers.py | 11 ++++++-----
 anvil/tests/test_models.py | 2 ++
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/anvil/tests/test_layers.py b/anvil/tests/test_layers.py
index bb5998f..9492388 100644
--- a/anvil/tests/test_layers.py
+++ b/anvil/tests/test_layers.py
@@ -23,10 +23,9 @@ def test_coupling_init(size_half, even_sites):
 layers.CouplingLayer(size_half, even_sites)
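+# For reference, a sketch of the forward convention shared by every layer
+# exercised in these tests (coupling, batch norm and rescaling layers alike):
+#
+#   v_out, log_density = layer(v_in, log_density)
+#
+# which is what allows layers.Sequential to chain them in any order.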
-@pytest.mark.skip(reason="batch norm in layers requires epsilon to avoid NaNs.")
 def test_additive_layers():
 equivar_additive = layers.AdditiveLayer(
- SIZE_HALF,
+ size_half=SIZE_HALF,
 hidden_shape=HIDDEN_SHAPE,
 activation=ACTIVATION,
 z2_equivar=True,
@@ -81,7 +80,7 @@ def test_affine_like_basic(gaussian_input, layer_class, z2_equivar, even_sites):
 """

 layer = layer_class(
- SIZE_HALF,
+ size_half=SIZE_HALF,
 hidden_shape=HIDDEN_SHAPE,
 activation=ACTIVATION,
 z2_equivar=z2_equivar,
@@ -97,7 +96,7 @@ def test_rqs_basic(gaussian_input, z2_equivar, even_sites):
 :py:class:`anvil.layers.RationalQuadraticSplineLayer`.
 """
 layer = layers.RationalQuadraticSplineLayer(
- SIZE_HALF,
+ size_half=SIZE_HALF,
 interval=5,
 n_segments=4,
 hidden_shape=HIDDEN_SHAPE,
@@ -116,6 +115,8 @@ def test_rqs_basic(gaussian_input, z2_equivar, even_sites):
 def test_scaling_layer_basic(gaussian_input, layer_class):
 if layer_class is layers.GlobalAffineLayer:
 layer = layer_class(1, 0)
+ elif layer_class is layers.GlobalRescaling:
+ layer = layer_class(scale=1.0, learnable=False)
 else:
 layer = layer_class()
 basic_layer_test(layer, *gaussian_input)


 def test_sequential_basic(gaussian_input):
 inner_layers = [
 layers.AffineLayer(
- SIZE_HALF,
+ size_half=SIZE_HALF,
 hidden_shape=HIDDEN_SHAPE,
 activation=ACTIVATION,
 z2_equivar=False,
diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py
index 8996f0b..d8345ef 100644
--- a/anvil/tests/test_models.py
+++ b/anvil/tests/test_models.py
@@ -15,8 +15,10 @@
 PARAMS = {
 "hidden_shape": (32,),
 "n_blocks": 3,
+ "n_segments": 4,
 "lattice_length": 6,
 "lattice_dimension": 2,
+ "scale": 1.0,
 }

 @pytest.mark.parametrize("layer_action", LAYERS)

From f147afaa2d7a7c412da55bcd447c1e9235b833fa Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 16:46:30 +0100
Subject: [PATCH 18/20] add test for independence of rescaling layers, including breaking example

---
 anvil/tests/test_models.py | 52 +++++++++++++++++++++++++++++++++++---
 1 file changed, 48 insertions(+), 4 deletions(-)

diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py
index d8345ef..8c0bdbd 100644
--- a/anvil/tests/test_models.py
+++ b/anvil/tests/test_models.py
@@ -6,6 +6,7 @@
 from hypothesis.strategies import integers, lists
 import pytest
 import torch
+from copy import deepcopy

 from anvil.api import API
 from anvil.models import LAYER_OPTIONS
@@ -14,26 +15,27 @@

 PARAMS = {
 "hidden_shape": (32,),
- "n_blocks": 3,
+ "n_blocks": 2,
 "n_segments": 4,
 "lattice_length": 6,
 "lattice_dimension": 2,
 "scale": 1.0,
 }


+@pytest.mark.parametrize("layer_action", LAYERS)
 def test_layer_actions(layer_action):
- """Call the API on each of the layer actions, using mainly default arguments
- """
+ """Call the API on each of the layer actions, using mainly default arguments"""
 getattr(API, layer_action)(**PARAMS)
 return


 # put limits on these so as not to crash your computer.
@given(
 lists(integers(min_value=0, max_value=2), min_size=1, max_size=3),
 integers(min_value=1, max_value=4),
 integers(min_value=1, max_value=8),
- lists(integers(min_value=1, max_value=2 ** 6), min_size=1, max_size=3)
+ lists(integers(min_value=1, max_value=2 ** 6), min_size=1, max_size=3),
 )
 def test_model_construction(layer_idx, n_blocks, lattice_length_half, hidden_shape):
 """Hypothesis test the model construction"""
@@ -53,3 +55,45 @@ def test_model_construction(layer_idx, n_blocks, lattice_length_half, hidden_sha
 # no_grad avoids building the autograd graph, which might help with memory.
 with torch.no_grad():
 API.model_to_load(**params)
+
+
+def layer_independence_test(model_spec):
+ """Check that each layer's parameters are updated independently."""
+
+ # Collect over these layers
+ model = API.model_to_load(**model_spec)
+ layer1, layer2 = [layer for layer in model]
+
+ layer2_copy = deepcopy(layer2)
+
+ # Update parameters in first layer
+ valid_key, valid_tensor = next(iter(layer1.state_dict().items()))
+ update = {valid_key: torch.rand_like(valid_tensor)}
+ layer1.load_state_dict(update, strict=False)
+
+ # Check that second layer is unchanged
+ # NOTE: may be safer to iterate over shared keys
+ for original, copy in zip(layer2.parameters(), layer2_copy.parameters()):
+ assert torch.allclose(original, copy)
+
+
+# TODO: extend to other layers... @pytest.mark.parametrize("layer_action", LAYERS)
+@torch.no_grad()
+def test_layer_independence_global_rescaling():
+ # Build a model with two identical sets of layers
+ working_example = { # This is OK
+ "model": [
+ {"layer": "global_rescaling", "scale": 1.0},
+ {"layer": "global_rescaling", "scale": 1.0},
+ ]
+ }
+ layer_independence_test(working_example)
+
+ breaking_example = { # This is NOT ok!
+ "model": [
+ {"layer": "global_rescaling"},
+ {"layer": "global_rescaling"},
+ ],
+ "scale": 1.0,
+ }
+ layer_independence_test(breaking_example)

From 8abf7470dbf4ca50b5f9038a4159acace7587e66 Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 17:45:33 +0100
Subject: [PATCH 19/20] update layer independence test for generic layers

---
 anvil/tests/test_models.py | 66 ++++++++++++++++++++++++++++++--------
 1 file changed, 53 insertions(+), 13 deletions(-)

diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py
index 8c0bdbd..e5a0362 100644
--- a/anvil/tests/test_models.py
+++ b/anvil/tests/test_models.py
@@ -11,6 +11,11 @@
 from anvil.api import API
 from anvil.models import LAYER_OPTIONS


+class LayersNotIndependentError(Exception):
+ pass
+
+
 LAYERS = list(LAYER_OPTIONS.keys())

 PARAMS = {
@@ -59,25 +64,31 @@ def test_model_construction(layer_idx, n_blocks, lattice_length_half, hidden_sha
 def layer_independence_test(model_spec):
 """Check that each layer's parameters are updated independently."""
-
- # Collect over these layers
- model = API.model_to_load(**model_spec)
- layer1, layer2 = [layer for layer in model]
-
- layer2_copy = deepcopy(layer2)
+ model = iter(API.model_to_load(**model_spec))
+ model_copy = deepcopy(model)

 # Update parameters in first layer
- valid_key, valid_tensor = next(iter(layer1.state_dict().items()))
- update = {valid_key: torch.rand_like(valid_tensor)}
+ layer1 = next(model)
+ update = {}
+ for valid_key, valid_tensor in layer1.state_dict().items():
+ update[valid_key] = torch.rand_like(valid_tensor)
 layer1.load_state_dict(update, strict=False)

- # Check that second layer is unchanged
+ # Check that this is different from the copy
+ layer1_copy = next(model_copy)
+ for original, copy in zip(layer1.parameters(), layer1_copy.parameters()):
+ assert not torch.allclose(original, copy)
+
+ # Now check that the other layers are unchanged
 # NOTE: may be safer to iterate over shared keys
- for original, copy in zip(layer2.parameters(), layer2_copy.parameters()):
- assert torch.allclose(original, copy)
+ for layer, layer_copy in zip(model, model_copy):
+ for original, copy in zip(layer.parameters(), layer_copy.parameters()):
+ if not torch.allclose(original, copy):
+ raise LayersNotIndependentError(
+ "Parameters are being shared amongst layers that should be independent."
+ )


 @torch.no_grad()
 def test_layer_independence_global_rescaling():
 # Build a model with two identical sets of layers
 working_example = { # This is OK
 "model": [
 {"layer": "global_rescaling", "scale": 1.0},
 {"layer": "global_rescaling", "scale": 1.0},
 ]
 }
 layer_independence_test(working_example)

 breaking_example = { # This is NOT ok!
 "model": [
 {"layer": "global_rescaling"},
 {"layer": "global_rescaling"},
 ],
 "scale": 1.0,
 }
- layer_independence_test(breaking_example)
+ with pytest.raises(LayersNotIndependentError):
+ layer_independence_test(breaking_example)
+
+
+# TODO: could extend to all layers quite easily
+@torch.no_grad()
+def test_layer_independence_additive():
+ params = {
+ "hidden_shape": (32,),
+ "n_blocks": 1,
+ "lattice_length": 6,
+ "lattice_dimension": 2,
+ }
+ working_example = {
+ "model": [
+ {"layer": "nice", **params},
+ {"layer": "nice", **params},
+ ]
+ }
+ layer_independence_test(working_example)
+
+ breaking_example = {
+ "model": [
+ {"layer": "nice"},
+ {"layer": "nice"},
+ ],
+ **params,
+ }
+ with pytest.raises(LayersNotIndependentError):
+ layer_independence_test(breaking_example)

From 3e50d20475b10891e3b8e591d29c1a56533aeb5a Mon Sep 17 00:00:00 2001
From: marshrossney <17361029+marshrossney@users.noreply.github.com>
Date: Fri, 14 May 2021 18:00:01 +0100
Subject: [PATCH 20/20] raise AssertionError instead of custom exception

---
 anvil/tests/test_models.py | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py
index e5a0362..a3e6ddf 100644
--- a/anvil/tests/test_models.py
+++ b/anvil/tests/test_models.py
@@ -12,10 +12,6 @@
 from anvil.models import LAYER_OPTIONS


-class LayersNotIndependentError(Exception):
- pass
-
-
 LAYERS = list(LAYER_OPTIONS.keys())

 PARAMS = {
@@ -83,10 +79,9 @@ def layer_independence_test(model_spec):
 # NOTE: may be safer to iterate over shared keys
 for layer, layer_copy in zip(model, model_copy):
 for original, copy in zip(layer.parameters(), layer_copy.parameters()):
- if not torch.allclose(original, copy):
- raise LayersNotIndependentError(
- "Parameters are being shared amongst layers that should be independent."
- )
+ assert torch.allclose(
+ original, copy
+ ), "Parameters are being shared amongst layers that should be independent."


 @torch.no_grad()
@@ -107,7 +102,7 @@ def test_layer_independence_global_rescaling():
 ],
 "scale": 1.0,
 }
- with pytest.raises(LayersNotIndependentError):
+ with pytest.raises(AssertionError):
 layer_independence_test(breaking_example)


@@ -127,7 +122,7 @@ def test_layer_independence_additive():
 ]
 }
 layer_independence_test(working_example)
-
+
 breaking_example = {
 "model": [
 {"layer": "nice"},
 {"layer": "nice"},
 ],
 **params,
 }
- with pytest.raises(LayersNotIndependentError):
+ with pytest.raises(AssertionError):
 layer_independence_test(breaking_example)
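Taken together, patches 12-20 leave ``model_to_load`` as the single entry point for
assembling a flow from a list of layer actions. A minimal sketch of the resulting
interface, using illustrative values that mirror the tests above (not a tuned
configuration):

    from anvil.api import API

    # Each entry in "model" selects one layer action from LAYER_OPTIONS;
    # model_to_load collects the blocks and wraps them in layers.Sequential.
    params = {
        "model": [
            {"layer": "nice", "n_blocks": 2, "hidden_shape": [32]},
            {"layer": "global_rescaling", "scale": 1.0, "learnable": False},
        ],
        "lattice_length": 6,
        "lattice_dimension": 2,
    }
    model = API.model_to_load(**params)

As the breaking examples in test_models.py demonstrate, per-layer parameters such as
``scale`` should be given inside each ``model`` entry; hoisting them to the top level
makes the collected layers share a single set of parameters.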