diff --git a/anvil/benchmark_config/free_scalar_train.yml b/anvil/benchmark_config/free_scalar_train.yml index 5edd89e..ed9fec2 100644 --- a/anvil/benchmark_config/free_scalar_train.yml +++ b/anvil/benchmark_config/free_scalar_train.yml @@ -12,15 +12,15 @@ couplings: # Model base: gaussian -#model: rational_quadratic_spline -#model: real_nvp -model: nice -model_params: - n_affine: 2 - n_additive: 2 +model: + - layer: nice + n_blocks: 2 hidden_shape: [36] activation: tanh z2_equivar: True + - layer: global_rescaling + scale: 1 + learnable: True # Training length n_batch: 2000 diff --git a/anvil/benchmarks.py b/anvil/benchmarks.py index 0dfe071..7175854 100644 --- a/anvil/benchmarks.py +++ b/anvil/benchmarks.py @@ -40,8 +40,8 @@ def free_scalar_theory(couplings, lattice_length): def fourier_transform(configs, training_geometry): """Takes the Fourier transform of a sample of field configurations. - Inputs - ------ + Parameters + ---------- configs: torch.tensor A (hopefully decorrelated) sample of field configurations in the split representation. Shape: (sample_size, lattice_size) diff --git a/anvil/config.py b/anvil/config.py index aaaecc3..7eeb68e 100644 --- a/anvil/config.py +++ b/anvil/config.py @@ -13,7 +13,7 @@ from anvil.geometry import Geometry2D from anvil.checkpoint import TrainingOutput -from anvil.models import MODEL_OPTIONS +from anvil.models import LAYER_OPTIONS from anvil.distributions import BASE_OPTIONS, TARGET_OPTIONS from random import randint @@ -84,12 +84,12 @@ def parse_parameterisation(self, param: str): return param @explicit_node - def produce_model_action(self, model: str): + def produce_layer_action(self, layer: str): """Given a string, return the flow model action indexed by that string.""" try: - return MODEL_OPTIONS[model] + return LAYER_OPTIONS[layer] except KeyError: - raise ConfigError(f"Invalid model {model}", model, MODEL_OPTIONS.keys()) + raise ConfigError(f"Invalid model {layer}", layer, LAYER_OPTIONS.keys()) def parse_n_batch(self, nb: int): """Batch size for training.""" diff --git a/anvil/distributions.py b/anvil/distributions.py index f34c466..919f944 100644 --- a/anvil/distributions.py +++ b/anvil/distributions.py @@ -14,8 +14,8 @@ class Gaussian: """ Class which handles the generation of a sample of latent Gaussian variables. - Inputs: - ------- + Parameters + ---------- lattice_size: int Number of nodes on the lattice. sigma: float @@ -59,6 +59,18 @@ class PhiFourScalar: The forward pass returns the corresponding log density (unnormalised) which is equal to -S + The parameters required differ depending on the parameterisation you're + using: + + ================ ============= + parameterisation couplings + ================ ============= + standard m_sq, g + albergo2019 m_sq, lam + nicoli2020 kappa, lam + bosetti2015 beta, lam + ================ ============= + Parameters ---------- geometry: @@ -70,14 +82,6 @@ class PhiFourScalar: dictionary with two entries that are the couplings of the theory. See below. 
- - parameterisation couplings - ------------------------------------- - standard m_sq, g - albergo2019 m_sq, lam - nicoli2020 kappa, lam - bosetti2015 beta, lam - Notes ----- The general form of the action is diff --git a/anvil/free_scalar.py b/anvil/free_scalar.py index af2a050..a985c89 100644 --- a/anvil/free_scalar.py +++ b/anvil/free_scalar.py @@ -127,8 +127,8 @@ def gen_complex_normal(n_sample, sigma, real=False): """Returns a stack of complex arrays where real and imaginary components are drawn from a Gaussian distribution with the same width. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size sigma: numpy.ndarray @@ -137,8 +137,8 @@ def gen_complex_normal(n_sample, sigma, real=False): (optional) flag. If True, the imaginary component is set to zero, but a complex array is still returned. - Returns: - -------- + Returns + ------- out: numpy.ndarray complex array of shape (n_sample, *sigma.shape) """ @@ -157,13 +157,13 @@ def gen_eigenmodes(self, n_sample): Gaussian distributions with variances given by the eigenvalues of the kinetic operator - see _variance() method above. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size - Returns: - -------- + Returns + ------- eigenmodes: numpy.ndarray complex array of eigenmodes with shape (n_sample, L, L) where L is the side length of the square lattice. @@ -206,13 +206,13 @@ def gen_eigenmodes(self, n_sample): def gen_real_space_fields(self, n_sample): """Returns the inverse fourier transform of a sample of eigenmodes. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size - Returns: - -------- + Returns + ------- fields: numpy.ndarray real array of real-space fields, with shape (n_sample, L, L), where L is the side-length of the square lattice. diff --git a/anvil/layers.py b/anvil/layers.py index af8997f..fefd9db 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -3,37 +3,51 @@ r""" layers.py -Contains nn.Modules which implement transformations of input configurations whilst computing -the Jacobian determinant of the transformation. - -Each transformation layers may contain several neural networks or learnable parameters. - -A normalising flow, f, can be constructed from multiple layers using function composition: - - f(z) = g_n( ... ( g_2( g_1( z ) ) ) ... ) - -which is implemented using the architecture provided by torch.nn +Contains the transformations or "layers" which are the building blocks of +normalising flows. The layers are implemented using the PyTorch library, which +in practice means they subclass :py:class:`torch.nn.Module`. For more +information, check out the PyTorch +`Module docs `_. + +The basic idea is of a flow is to generate a latent variable, in our framework +this would be using a class in :py:mod:`anvil.distributions`. The latent +variables are then transformed by sequentially applying the transformation +layers. The key feature of the transformations is the ability to easily calculate +the Jacobian determinant. If the base density function is known, then we can +evaluate the model density exactly. + +The bottom line is that we enforce a convention to the ``forward`` method +of each layer (a special method of :py:class:`torch.nn.Module` subclasses). +All layers in this module should contain a ``forward`` method which takes two +:py:class:`torch.Tensor` objects as inputs: + + - a batch of input configurations, dimensions ``(batch size, lattice size)``. 
+ - a batch of scalars, dimensions ``(batch size, 1)``, that are the logarithm of the + 'current' probability density, at this stage in the normalising flow. -All layers in this module contain a `forward` method which takes two torch.tensor objects -as inputs: +Each transformation layers may contain several neural networks or learnable +parameters. - - a batch of input configurations, dimensions (batch size, lattice size). +A full normalising flow, f, can be constructed from multiple layers using +function composition: - - a batch of scalars, dimensions (batch size, 1), that are the logarithm of the - 'current' probability density, at this stage in the normalising flow. +.. math:: -and returns two torch.tensor objects: + f(z) = g_{N_{\rm layers}}( \ldots ( g_2( g_1( z ) ) ) \ldots ) - - a batch of configurations \phi which have been transformed according to the - transformation, with the same dimensions as the input configurations. +As a matter of convenience we provide a subclass of +:py:class:`torch.nn.Sequential`, which is initialised by passing multiple layers +as arguments (in the order in which the layers are applied). The main feature +of our version, :py:class:`Sequential`, is that it conforms to our ``forward`` +convention. From the perspective of the user :py:class:`Sequential` appears +as a single subclass of :py:class:`torch.nn.Module` which performs the +full normalising flow transformation :math:`f(z)`. - - the updated logarithm of the probability density, including the contribution from - the Jacobian determinant of this transformation. """ import torch import torch.nn as nn -from anvil.core import FullyConnectedNeuralNetwork +from anvil.neural_network import DenseNeuralNetwork class CouplingLayer(nn.Module): @@ -113,7 +127,7 @@ def __init__( ): super().__init__(size_half, even_sites) - self.t_network = FullyConnectedNeuralNetwork( + self.t_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, @@ -121,14 +135,13 @@ def __init__( bias=not z2_equivar, ) - def forward(self, v_in, log_density, *unused) -> torch.Tensor: + def forward(self, v_in, log_density, *args) -> torch.Tensor: r"""Forward pass of affine transformation.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] + v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6) - t_out = self.t_network( - (v_in_passive - v_in_passive.mean()) / v_in_passive.std() - ) + t_out = self.t_network(v_for_net) v_out = self._join_func([v_in_passive, v_in_active - t_out], dim=1) @@ -161,14 +174,14 @@ def __init__( ): super().__init__(size_half, even_sites) - self.s_network = FullyConnectedNeuralNetwork( + self.s_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, activation=activation, bias=not z2_equivar, ) - self.t_network = FullyConnectedNeuralNetwork( + self.t_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, @@ -177,16 +190,16 @@ def __init__( ) self.z2_equivar = z2_equivar - def forward(self, v_in, log_density, *unused) -> torch.Tensor: + def forward(self, v_in, log_density, *args) -> torch.Tensor: r"""Forward pass of affine transformation.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] - v_for_net = (v_in_passive - v_in_passive.mean()) / v_in_passive.std() + v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6) s_out = self.s_network(v_for_net) t_out = self.t_network(v_for_net) - # If enforcing s(-v) = -s(v), 
we want to use |s(v)| in affine transf. + # If enforcing C(-v) = -C(v), we want to use |s(v)| in affine transf. if self.z2_equivar: s_out = torch.abs(s_out) @@ -236,7 +249,7 @@ def __init__( self.size_half = size_half self.n_segments = n_segments - self.network = FullyConnectedNeuralNetwork( + self.network = DenseNeuralNetwork( size_in=size_half, size_out=size_half * (3 * n_segments - 1), hidden_shape=hidden_shape, @@ -255,13 +268,11 @@ def forward(self, v_in, log_density, negative_mag): """Forward pass of the rational quadratic spline layer.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] - v_for_net = ( - v_in_passive - v_in_passive.mean() - ) / v_in_passive.std() # reduce numerical instability + v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6) # Naively enforce C(-v) = -C(v) if self.z2_equivar: - v_in_passive_stand[negative_mag] = -v_in_passive_stand[negative_mag] + v_for_net[negative_mag] = -v_for_net[negative_mag] v_out_b = torch.zeros_like(v_in_active) gradient = torch.ones_like(v_in_active).unsqueeze(dim=-1) @@ -396,35 +407,89 @@ def forward(self, v_in, log_density): return self.scale * v_in + self.shift, log_density -# TODO not necessary to define a nn.module for this now I've taken out learnable gamma +# NOTE: not necessary to define a nn.module for this now gamma is no longer learnable class BatchNormLayer(nn.Module): - """Performs batch normalisation on the input vector. + r"""Performs batch normalisation on the inputs, conforming to our ``forward`` + convention. + + Inputs are standardised over all tensor dimensions such that the resulting sample + has null mean and unit variance, after which a rescaling factor is applied. + + .. math:: + + v_{\rm out} = \gamma + \frac{v_{\rm in} - \mathbb{E}[ v_{\rm in} ]} + {\sqrt{\var( v_{\rm in} ) + \epsilon}} Parameters ---------- - scale: int - An additional scale factor to be applied after batch normalisation. + scale: float + The multiplicative factor, :math:`\gamma`, applied to the standardised data. + + Notes + ----- + Applying batch normalisation before the first spline layer can be helpful for + ensuring that the inputs remain within the transformation interval. However, + this layer adds undesirable stochasticity which can impede optimisation. One + might consider replacing it with :py:class:`anvil.layers.GlobalRescaling` using + a static scale parameter. """ def __init__(self, scale=1): super().__init__() self.gamma = scale - def forward(self, v_in, log_density, *unused): + def forward(self, v_in, log_density, *args): """Forward pass of the batch normalisation transformation.""" - mult = self.gamma / torch.std(v_in) + mult = self.gamma / torch.sqrt(v_in.var() + 1e-6) # for stability v_out = mult * (v_in - v_in.mean()) - log_density -= mult * v_out.shape[1] + log_density -= torch.log(mult) * v_out.shape[1] return (v_out, log_density) class GlobalRescaling(nn.Module): - def __init__(self, initial=1): + r"""Performs a global rescaling of the inputs via a (potentially learnable) + multiplicative factor, conforming to our ``forward`` convention. + + Parameters + ---------- + scale: float + The multiplicative factor applied to the inputs. + learnable: bool, default=True + If True, ``scale`` will be optimised during the training. + + Notes + ----- + Applying a rescaling layer with a learnable ``scale`` to the final layer of a + normalizing flow can be useful since it avoids the need to tune earlier layers + to match the width of the target density. 
However, for best performance one + should generally use a static ``scale`` to reduce stochasticity in the + optimisation. + + """ + + def __init__(self, scale=1, learnable=True): super().__init__() - self.scale = nn.Parameter(torch.Tensor([initial])) + self.scale = torch.Tensor([scale]) + if learnable: + self.scale = nn.Parameter(self.scale) - def forward(self, v_in, log_density, *unused): + def forward(self, v_in, log_density, *args): v_out = self.scale * v_in log_density -= v_out.shape[-1] * torch.log(self.scale) return v_out, log_density + + +class Sequential(nn.Sequential): + """Similar to :py:class:`torch.nn.Sequential` except conforms to our + ``forward`` convention. + """ + + def forward(self, v, log_density, *args): + """overrides the base class ``forward`` method to conform to our + conventioned for expected inputs/outputs of ``forward`` methods. + """ + for module in self: + v, log_density = module(v, log_density, *args) + return v, log_density diff --git a/anvil/models.py b/anvil/models.py index 55cb6ed..9e728b3 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -3,21 +3,29 @@ """ models.py -Module containing reportengine actions which return callable objects that execute -normalising flows constructed from multiple layers via function composition. +Module containing reportengine actions which return normalising flow models. +Generally this involves piecing together components from :py:mod:`anvil.layers` +to produce sequences of transformations. + """ from functools import partial -from anvil.core import Sequential +from reportengine import collect import anvil.layers as layers -def coupling_pair(coupling_layer, size_half, **layer_spec): - """Helper function which returns a callable object that performs a coupling - transformation on both even and odd lattice sites.""" - coupling_transformation = partial(coupling_layer, size_half, **layer_spec) - return Sequential( +def _coupling_block(coupling_layer, **kwargs): + """Helper function which wraps a pair of coupling layers from + :py:mod:`anvil.layers` in the module container + :py:class`anvil.layers.Sequential`. The first transformation layer acts on + the even sites and the second transformation acts on the odd sites, so one + of these blocks ensures all sites are transformed as part of an + active partition. + + """ + coupling_transformation = partial(coupling_layer, **kwargs) + return layers.Sequential( coupling_transformation(even_sites=True), coupling_transformation(even_sites=False), ) @@ -25,90 +33,256 @@ def coupling_pair(coupling_layer, size_half, **layer_spec): def real_nvp( size_half, - n_affine, + n_blocks, hidden_shape, activation="tanh", - z2_equivar=False, + z2_equivar=True, ): - """Action that returns a callable object that performs a sequence of `n_affine` - affine coupling transformations on both partitions of the input vector.""" + r"""Action which returns a sequence of ``n_blocks`` pairs of + :py:class:`anvil.layers.AffineLayer` s, wrapped in the module container + :py:class`anvil.layers.Sequential`. + + The first ``n_blocks`` elements of the outer ``Sequential`` + are ``Sequential`` s containing a pair of ``AffineLayer`` s which + act on the even and odd sites respectively. + + Parameters + ---------- + size_half: int + Inferred from ``lattice_size``, the size of the active/passive + partitions (which are equal size, `lattice_size / 2`). + n_blocks: int + The number of pairs of :py:class:`anvil.layers.AffineLayer` + transformations. 
+ hidden_shape: list[int] + the shape of the neural networks used in the AffineLayer. The visible + layers are defined by the ``lattice_size``. Typically we have found + a single hidden layer neural network is effective, which can be + specified by passing a list of length 1, i.e. ``[72]`` would + be a single hidden layered network with 72 nodes in the hidden layer. + activation: str, default="tanh" + The activation function to use for each hidden layer. The output layer + of the network is linear (has no activation function). + z2_equivar: bool, default=True + Whether or not to impose z2 equivariance. This changes the transformation + such that the neural networks have no bias term and s(-v) = s(v) which + imposes a :math:`\mathbb{Z}_2` symmetry. + + Returns + ------- + real_nvp: anvil.layers.Sequential + A sequence of affine transformations, which we refer to as a real NVP + (Non-volume preserving) flow. + + See Also + -------- + :py:mod:`anvil.neural_network` contains the fully connected neural network class + as well as valid choices for activation functions. + + """ blocks = [ - coupling_pair( + _coupling_block( layers.AffineLayer, - size_half, + size_half=size_half, hidden_shape=hidden_shape, activation=activation, z2_equivar=z2_equivar, ) - for i in range(n_affine) + for _ in range(n_blocks) ] - return Sequential(*blocks, layers.GlobalRescaling()) + return layers.Sequential(*blocks) def nice( size_half, - n_additive, + n_blocks, hidden_shape, activation="tanh", - z2_equivar=False, + z2_equivar=True, ): - """Action that returns a callable object that performs a sequence of `n_affine` - affine coupling transformations on both partitions of the input vector.""" + r"""Similar to :py:func:`real_nvp`, excepts instead wraps pairs of + :py:class:`anvil.layers.AdditiveLayer`. + The pairs of ``AdditiveLayer`` s act on the even and odd sites respectively. + + Parameters + ---------- + size_half: int + Inferred from ``lattice_size``, the size of the active/passive + partitions (which are equal size, `lattice_size / 2`). + n_blocks: int + The number of pairs of :py:class:`anvil.layers.AffineLayer` + transformations. + hidden_shape: list[int] + the shape of the neural networks used in the each layer. The visible + layers are defined by the ``lattice_size``. + activation: str, default="tanh" + The activation function to use for each hidden layer. The output layer + of the network is linear (has no activation function). + z2_equivar: bool, default=True + Whether or not to impose z2 equivariance. This changes the transformation + such that the neural networks have no bias term and s(-v) = s(v) which + imposes a :math:`\mathbb{Z}_2` symmetry. + + Returns + ------- + nice: anvil.layers.Sequential + A sequence of additive transformations, which we refer to as a + nice flow. 
+ + """ blocks = [ - coupling_pair( + _coupling_block( layers.AdditiveLayer, - size_half, + size_half=size_half, hidden_shape=hidden_shape, activation=activation, z2_equivar=z2_equivar, ) - for i in range(n_additive) + for _ in range(n_blocks) ] - return Sequential(*blocks, layers.GlobalRescaling()) + return layers.Sequential(*blocks) def rational_quadratic_spline( size_half, + n_blocks, hidden_shape, + n_segments, interval=5, - n_spline=1, - n_segments=4, activation="tanh", - z2_equivar_spline=False, + z2_equivar=False, ): - """Action that returns a callable object that performs a pair of circular spline - transformations, one on each half of the input vector.""" + """Similar to :py:func:`real_nvp`, excepts instead wraps pairs of + :py:class:`anvil.layers.RationalQuadraticSplineLayer` s followed by a single + The pairs of RQS's act on the even and odd sites respectively. + + Parameters + ---------- + size_half: int + inferred from ``lattice_size``, the size of the active/passive + partitions (which are equal size, `lattice_size / 2`). + n_blocks: int + The number of pairs of :py:class:`anvil.layers.AffineLayer` + transformations. For RQS this is set to 1. + hidden_shape: list[int] + the shape of the neural networks used in the each layer. The visible + layers are defined by the ``lattice_size``. + n_segments: int + The number of segments to use in the RQS transformation. + interval: int, default=5 + the interval within which the RQS applies the transformation, at present + if a field variable is outside of this region it is mapped to itself + (i.e the gradient of the transformation is 1 outside of the interval). + activation: str, default="tanh" + The activation function to use for each hidden layer. The output layer + of the network is linear (has no activation function). + z2_equivar: bool, default=False + Whether or not to impose z2 equivariance. This is only done crudely + by splitting the sites according to the sign of the sum across lattice + sites. + """ + blocks = [ - coupling_pair( + _coupling_block( layers.RationalQuadraticSplineLayer, - size_half, + size_half=size_half, interval=interval, n_segments=n_segments, hidden_shape=hidden_shape, activation=activation, - z2_equivar=z2_equivar_spline, + z2_equivar=z2_equivar, ) - for _ in range(n_spline) + for _ in range(n_blocks) ] - return Sequential( - #layers.BatchNormLayer(), - *blocks, - layers.GlobalRescaling(), - ) + return layers.Sequential(*blocks) + + +def batch_norm(scale=1.0): + r"""Action which returns an instance of :py:class:`anvil.layers.BatchNormLayer`. + + Parameters + ---------- + scale: float, default=1.0 + The multiplicative factor applied to the standardised data. + """ + return layers.Sequential(layers.BatchNormLayer(scale=scale)) + + +def global_rescaling(scale, learnable=True): + r"""Action which returns and instance of :py:class:`anvil.layers.GlobalRescaling`. + + Parameters + ---------- + scale: float + The multiplicative factor applied to the inputs. + learnable: bool, default=True + If True, ``scale`` will be optimised during the training. + """ + return layers.Sequential(layers.GlobalRescaling(scale=scale, learnable=learnable)) + + +_normalising_flow = collect("layer_action", ("model",)) + + +def model_to_load(_normalising_flow): + """action which wraps a list of layers in + :py:class:`anvil.layers.Sequential`. This allows the user to specify an + arbitrary combination of layers as the model. 
+ + For more information on valid choices for layers, see + ``anvil.models.LAYER_OPTIONS`` or the various + functions in :py:mod:`anvil.models` which produce sequences of the layers + found in :py:mod:`anvil.layers`. + + At present, available transformations are: + + - ``nice`` + - ``real_nvp`` + - ``rational_quadratic_spline`` + - ``batch_norm`` + - ``global_rescaling`` + + You can see their dependencies using the ``anvil`` provider help, e.g. + for ``real_nvp``: + + .. code:: + + $ anvil-sample --help real_nvp + ... + < action docstring - poorly formatted> + ... + The following resources are read from the configuration: + + lattice_length(int): + [Used by lattice_size] + + lattice_dimension(int): Parse lattice dimension from runcard + [Used by lattice_size] + The following additionl arguments can be used to control the + behaviour. They are set by default to sensible values: -def spline_affine(real_nvp, rational_quadratic_spline): - return Sequential(rational_quadratic_spline, real_nvp) + n_blocks + hidden_shape + activation = tanh + z2_equivar = True + ``anvil-train`` will also provide the same information. -def affine_spline(real_nvp, rational_quadratic_spline): - return Sequential(real_nvp, rational_quadratic_spline) + """ + # assume that _normalising_flow is a list of layers, each layer + # is a sequential of blocks, each block is a pair of transformations + # which transforms the entire input state - flatten this out, so output + # is Sequential of blocks + flow_flat = [block for layer in _normalising_flow for block in layer] + return layers.Sequential(*flow_flat) -MODEL_OPTIONS = { +# Update docstring above if you add to this! +LAYER_OPTIONS = { "nice": nice, "real_nvp": real_nvp, "rational_quadratic_spline": rational_quadratic_spline, - "spline_affine": spline_affine, - "affine_spline": affine_spline, + "batch_norm": batch_norm, + "global_rescaling": global_rescaling, } diff --git a/anvil/core.py b/anvil/neural_network.py similarity index 66% rename from anvil/core.py rename to anvil/neural_network.py index 8c72a13..8806ed4 100644 --- a/anvil/core.py +++ b/anvil/neural_network.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: GPL-3.0-or-later # Copywrite © 2021 anvil Michael Wilson, Joe Marsh Rossney, Luigi Del Debbio -r""" -coupling.py +""" +neural_network.py + +Module containing neural networks which are used as part of transformation +layers, found in :py:mod:`anvil.layers`. + """ import torch import torch.nn as nn -from reportengine import collect ACTIVATION_LAYERS = { "leaky_relu": nn.LeakyReLU, @@ -15,19 +18,8 @@ } -class Sequential(nn.Sequential): - """Modify the nn.Sequential class so that it takes an input vector *and* a - value for the current logarithm of the model density, returning an output - vector and the updated log density.""" - - def forward(self, v, log_density, *args): - for module in self: - v, log_density = module(v, log_density, *args) - return v, log_density - - -class FullyConnectedNeuralNetwork(nn.Module): - """Generic class for neural networks used in coupling layers. +class DenseNeuralNetwork(nn.Module): + """Dense neural networks used in coupling layers. Parameters ---------- @@ -39,11 +31,11 @@ class FullyConnectedNeuralNetwork(nn.Module): List specifying the number of nodes in the intermediate layers activation: (str, None) Key representing the activation function used for each layer - except the final one. - no_final_activation: bool - If True, leave the network output unconstrained. - bias: bool + except the final one. 
Valid options can be found in + ``ACTIVATION_LAYERS``. + bias: bool, default=True Whether to use biases in networks. + """ def __init__( @@ -76,10 +68,3 @@ def forward(self, v_in: torch.tensor): shape (n_batch, size_out) """ return self.network(v_in) - -_normalising_flow = collect("model_action", ("model_params",)) - -def model_to_load(_normalising_flow): - return _normalising_flow[0] - - diff --git a/anvil/scripts/anvil_sample.py b/anvil/scripts/anvil_sample.py index ff4b9d1..a216ab2 100644 --- a/anvil/scripts/anvil_sample.py +++ b/anvil/scripts/anvil_sample.py @@ -13,10 +13,8 @@ log = logging.getLogger(__name__) PROVIDERS = [ - "anvil.core", "anvil.models", "anvil.sample", - "anvil.models", "anvil.observables", "anvil.plot", "anvil.table", diff --git a/anvil/scripts/anvil_train.py b/anvil/scripts/anvil_train.py index 437efe0..6767c3b 100644 --- a/anvil/scripts/anvil_train.py +++ b/anvil/scripts/anvil_train.py @@ -12,7 +12,7 @@ log = logging.getLogger(__name__) -PROVIDERS = ["anvil.train", "anvil.checkpoint", "anvil.core", "anvil.models"] +PROVIDERS = ["anvil.train", "anvil.checkpoint", "anvil.models"] TRAINING_ACTIONS = ["train"] diff --git a/anvil/tests/test_layers.py b/anvil/tests/test_layers.py new file mode 100644 index 0000000..9492388 --- /dev/null +++ b/anvil/tests/test_layers.py @@ -0,0 +1,145 @@ +""" +Tests of the base classes in :py:mod:`anvil.layers` + +""" +from hypothesis import given +from hypothesis.strategies import integers, booleans +import numpy as np +import pytest +import torch + +import anvil.layers as layers +from anvil.distributions import Gaussian + +N_BATCH = 100 +SIZE = 36 +SIZE_HALF = SIZE // 2 +HIDDEN_SHAPE = [36] +ACTIVATION = "tanh" + +@given(integers(min_value=0, max_value=2**16), booleans()) +def test_coupling_init(size_half, even_sites): + """Hypothesis test the initialisation of the base class in layers""" + layers.CouplingLayer(size_half, even_sites) + + +def test_additive_layers(): + equivar_additive = layers.AdditiveLayer( + size_half=SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=True, + even_sites=True + ) + input_tensor = torch.zeros((N_BATCH, SIZE)) + with torch.no_grad(): + output_tensor, output_density = equivar_additive(input_tensor, 0) + + assert output_density == 0 + np.testing.assert_allclose(input_tensor.numpy(), output_tensor.numpy()) + + +def basic_layer_test(layer, input_states, input_log_density, *args): + """Basic check that layer transforms input states properly. 
+ + In practice we check: + + - field variables and log densities are valid real numbers + - output states are correct shape + - outputs are correct typing + + """ + output_states, output_log_density = layer(input_states, input_log_density, *args) + # all numbers + any_nan = ( + torch.any(torch.isnan(output_states)) or + torch.any(torch.isnan(output_log_density)) + ) + assert not any_nan + # correct shape + assert input_states.shape == output_states.shape + + assert isinstance(output_states, torch.Tensor) + assert isinstance(output_log_density, torch.Tensor) + + +@pytest.fixture() +@torch.no_grad() +def gaussian_input(): + """Basic input states for testing""" + latent_distribution = Gaussian(SIZE) # use default standard normal + return latent_distribution(N_BATCH) + +@pytest.mark.parametrize("layer_class", [layers.AdditiveLayer, layers.AffineLayer]) +@pytest.mark.parametrize("z2_equivar", [True, False]) +@pytest.mark.parametrize("even_sites", [True, False]) +@torch.no_grad() +def test_affine_like_basic(gaussian_input, layer_class, z2_equivar, even_sites): + """Apply :py:func:`basic_layer_test` to layers with same initialisation + parameters as :py:class:`anvil.layers.AffineLayer`. + + """ + layer = layer_class( + size_half=SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=z2_equivar, + even_sites=even_sites, + ) + basic_layer_test(layer, *gaussian_input) + +@pytest.mark.parametrize("z2_equivar", [True, False]) +@pytest.mark.parametrize("even_sites", [True, False]) +@torch.no_grad() +def test_rqs_basic(gaussian_input, z2_equivar, even_sites): + """Apply :py:func:`basic_layer_test` to + :py:class:`anvil.layers.RationalQuadraticSplineLayer`. + """ + layer = layers.RationalQuadraticSplineLayer( + size_half=SIZE_HALF, + interval=5, + n_segments=4, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=z2_equivar, + even_sites=even_sites, + ) + negative_mag = gaussian_input[0].sum(dim=1) < 0 + basic_layer_test(layer, *gaussian_input, negative_mag) + +@pytest.mark.parametrize( + "layer_class", + [layers.GlobalRescaling, layers.BatchNormLayer, layers.GlobalAffineLayer] +) +@torch.no_grad() +def test_scaling_layer_basic(gaussian_input, layer_class): + if layer_class is layers.GlobalAffineLayer: + layer = layer_class(1, 0) + elif layer_class is layers.GlobalRescaling: + layer = layer_class(scale=1.0, learnable=False) + else: + layer = layer_class() + basic_layer_test(layer, *gaussian_input) + +@torch.no_grad() +def test_sequential_basic(gaussian_input): + inner_layers = [ + layers.AffineLayer( + size_half=SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=False, + even_sites=bool(i % 2), + ) for i in range(8)] + layer = layers.Sequential(*inner_layers) + basic_layer_test(layer, *gaussian_input) + + # check application of sequetion matches output of applying each layer. + output_states, output_density = inner_layers[0](*gaussian_input) + for el in inner_layers[1:]: + output_states, output_density = el(output_states, output_density) + + seq_output_states, seq_output_density = layer(*gaussian_input) + + np.testing.assert_allclose(seq_output_states.numpy(), output_states.numpy()) + np.testing.assert_allclose(seq_output_density.numpy(), output_density.numpy()) diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py new file mode 100644 index 0000000..a3e6ddf --- /dev/null +++ b/anvil/tests/test_models.py @@ -0,0 +1,134 @@ +""" +Test higher level model construction from :py:mod:`anvil.models`. 
+ +""" +from hypothesis import given +from hypothesis.strategies import integers, lists +import pytest +import torch +from copy import deepcopy + +from anvil.api import API +from anvil.models import LAYER_OPTIONS + + +LAYERS = list(LAYER_OPTIONS.keys()) + +PARAMS = { + "hidden_shape": (32,), + "n_blocks": 2, + "n_segments": 4, + "lattice_length": 6, + "lattice_dimension": 2, + "scale": 1.0, +} + + +@pytest.mark.parametrize("layer_action", LAYERS) +def test_layer_actions(layer_action): + """Call the API on each of the layer actions, using mainly default arguments""" + getattr(API, layer_action)(**PARAMS) + return + + +# put limits on these so not to crash your computer. +@given( + lists(integers(min_value=0, max_value=2), min_size=1, max_size=3), + integers(min_value=1, max_value=4), + integers(min_value=1, max_value=8), + lists(integers(min_value=1, max_value=2 ** 6), min_size=1, max_size=3), +) +def test_model_construction(layer_idx, n_blocks, lattice_length_half, hidden_shape): + """Hypothesis test the model construction""" + # require even lattice sites. + model = [{"layer": LAYERS[idx]} for idx in layer_idx] + lattice_length = 2 * lattice_length_half + params = { + "model": model, + "n_blocks": n_blocks, + "lattice_length": lattice_length, + "hidden_shape": hidden_shape, + "lattice_dimension": 2, + # for some reason the RQS defaults get missed? + "interval": 5, + "n_segments": 4, + } + # might help with memory. + with torch.no_grad(): + API.model_to_load(**params) + + +def layer_independence_test(model_spec): + """Check that each layer's parameters are updated independently.""" + model = iter(API.model_to_load(**model_spec)) + model_copy = deepcopy(model) + + # Update parameters in first layer + layer1 = next(model) + update = {} + for valid_key, valid_tensor in layer1.state_dict().items(): + update[valid_key] = torch.rand_like(valid_tensor) + layer1.load_state_dict(update, strict=False) + + # Check that this is different from the copy + layer1_copy = next(model_copy) + for original, copy in zip(layer1.parameters(), layer1_copy.parameters()): + assert not torch.allclose(original, copy) + + # Now check that the other layers are unchanged + # NOTE: may be safer to iterate over shared keys + for layer, layer_copy in zip(model, model_copy): + for original, copy in zip(layer.parameters(), layer_copy.parameters()): + assert torch.allclose( + original, copy + ), "Parameters are being shared amongst layers that should be independent." + + +@torch.no_grad() +def test_layer_independence_global_rescaling(): + # Build a model with two identical sets of layers + working_example = { # This is OK + "model": [ + {"layer": "global_rescaling", "scale": 1.0}, + {"layer": "global_rescaling", "scale": 1.0}, + ] + } + layer_independence_test(working_example) + + breaking_example = { # This is NOT ok! 
+ "model": [ + {"layer": "global_rescaling"}, + {"layer": "global_rescaling"}, + ], + "scale": 1.0, + } + with pytest.raises(AssertionError): + layer_independence_test(breaking_example) + + +# TODO: could extend to all layers quite easily +@torch.no_grad() +def test_layer_independence_additive(): + params = { + "hidden_shape": (32,), + "n_blocks": 1, + "lattice_length": 6, + "lattice_dimension": 2, + } + working_example = { + "model": [ + {"layer": "nice", **params}, + {"layer": "nice", **params}, + ] + } + layer_independence_test(working_example) + + breaking_example = { + "model": [ + {"layer": "nice"}, + {"layer": "nice"}, + ], + **params, + } + with pytest.raises(AssertionError): + layer_independence_test(breaking_example) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a551074..e4184ab 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -27,6 +27,7 @@ build: test: requires: - pytest + - hypothesis commands: - pytest --pyargs anvil diff --git a/docs/sphinx/source/get-started/basic-usage.rst b/docs/sphinx/source/get-started/basic-usage.rst index ad2e483..ef0c557 100644 --- a/docs/sphinx/source/get-started/basic-usage.rst +++ b/docs/sphinx/source/get-started/basic-usage.rst @@ -118,7 +118,7 @@ We supply some basic machinery to build your own normalising flow models. The relevant modules for this purpose are - - :py:mod:`anvil.core`: containing some basic extensions to ``pytorch`` modules relevant for this project. + - :py:mod:`anvil.neural_network`: Generic neural networks. - :py:mod:`anvil.layers`: a collection of transformation layer classes - :py:mod:`anvil.geometry`: classes which transform the output of the transformations into meaningful geometries. These dictate which sites in your lattice get alternated between active and passive partitions. - :py:mod:`anvil.distributions`: a collection of distributions which can be used as base distributions (for latent variables) or target distributions. diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml index f9c319a..0e4144a 100644 --- a/examples/runcards/train.yml +++ b/examples/runcards/train.yml @@ -12,18 +12,24 @@ couplings: # Model base: gaussian -model: affine_spline -model_params: - hidden_shape: [72] - activation: tanh - - n_affine: 2 - z2_equivar: true - - n_spline: 1 - n_segments: 8 - z2_equivar_spline: false - +model: + - layer: real_nvp + n_blocks: 2 + z2_equivar: true + activation: tanh + hidden_shape: [72] + - layer: global_rescaling + scale: 1 + learnable: true + - layer: rational_quadratic_spline + n_blocks: 1 + n_segments: 8 + z2_equivar: false + activation: tanh + hidden_shape: [72] + - layer: global_rescaling + scale: 1 + learnable: true # Training n_batch: 1000