diff --git a/anvil/benchmark_config/free_scalar_train.yml b/anvil/benchmark_config/free_scalar_train.yml index 5edd89e..ed9fec2 100644 --- a/anvil/benchmark_config/free_scalar_train.yml +++ b/anvil/benchmark_config/free_scalar_train.yml @@ -12,15 +12,15 @@ couplings: # Model base: gaussian -#model: rational_quadratic_spline -#model: real_nvp -model: nice -model_params: - n_affine: 2 - n_additive: 2 +model: + - layer: nice + n_blocks: 2 hidden_shape: [36] activation: tanh z2_equivar: True + - layer: global_rescaling + scale: 1 + learnable: True # Training length n_batch: 2000 diff --git a/anvil/benchmarks.py b/anvil/benchmarks.py index 0dfe071..7175854 100644 --- a/anvil/benchmarks.py +++ b/anvil/benchmarks.py @@ -40,8 +40,8 @@ def free_scalar_theory(couplings, lattice_length): def fourier_transform(configs, training_geometry): """Takes the Fourier transform of a sample of field configurations. - Inputs - ------ + Parameters + ---------- configs: torch.tensor A (hopefully decorrelated) sample of field configurations in the split representation. Shape: (sample_size, lattice_size) diff --git a/anvil/config.py b/anvil/config.py index aaaecc3..7eeb68e 100644 --- a/anvil/config.py +++ b/anvil/config.py @@ -13,7 +13,7 @@ from anvil.geometry import Geometry2D from anvil.checkpoint import TrainingOutput -from anvil.models import MODEL_OPTIONS +from anvil.models import LAYER_OPTIONS from anvil.distributions import BASE_OPTIONS, TARGET_OPTIONS from random import randint @@ -84,12 +84,12 @@ def parse_parameterisation(self, param: str): return param @explicit_node - def produce_model_action(self, model: str): + def produce_layer_action(self, layer: str): """Given a string, return the flow model action indexed by that string.""" try: - return MODEL_OPTIONS[model] + return LAYER_OPTIONS[layer] except KeyError: - raise ConfigError(f"Invalid model {model}", model, MODEL_OPTIONS.keys()) + raise ConfigError(f"Invalid model {layer}", layer, LAYER_OPTIONS.keys()) def parse_n_batch(self, nb: int): """Batch size for training.""" diff --git a/anvil/distributions.py b/anvil/distributions.py index f34c466..919f944 100644 --- a/anvil/distributions.py +++ b/anvil/distributions.py @@ -14,8 +14,8 @@ class Gaussian: """ Class which handles the generation of a sample of latent Gaussian variables. - Inputs: - ------- + Parameters + ---------- lattice_size: int Number of nodes on the lattice. sigma: float @@ -59,6 +59,18 @@ class PhiFourScalar: The forward pass returns the corresponding log density (unnormalised) which is equal to -S + The parameters required differ depending on the parameterisation you're + using: + + ================ ============= + parameterisation couplings + ================ ============= + standard m_sq, g + albergo2019 m_sq, lam + nicoli2020 kappa, lam + bosetti2015 beta, lam + ================ ============= + Parameters ---------- geometry: @@ -70,14 +82,6 @@ class PhiFourScalar: dictionary with two entries that are the couplings of the theory. See below. 
- - parameterisation couplings - ------------------------------------- - standard m_sq, g - albergo2019 m_sq, lam - nicoli2020 kappa, lam - bosetti2015 beta, lam - Notes ----- The general form of the action is diff --git a/anvil/free_scalar.py b/anvil/free_scalar.py index af2a050..a985c89 100644 --- a/anvil/free_scalar.py +++ b/anvil/free_scalar.py @@ -127,8 +127,8 @@ def gen_complex_normal(n_sample, sigma, real=False): """Returns a stack of complex arrays where real and imaginary components are drawn from a Gaussian distribution with the same width. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size sigma: numpy.ndarray @@ -137,8 +137,8 @@ def gen_complex_normal(n_sample, sigma, real=False): (optional) flag. If True, the imaginary component is set to zero, but a complex array is still returned. - Returns: - -------- + Returns + ------- out: numpy.ndarray complex array of shape (n_sample, *sigma.shape) """ @@ -157,13 +157,13 @@ def gen_eigenmodes(self, n_sample): Gaussian distributions with variances given by the eigenvalues of the kinetic operator - see _variance() method above. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size - Returns: - -------- + Returns + ------- eigenmodes: numpy.ndarray complex array of eigenmodes with shape (n_sample, L, L) where L is the side length of the square lattice. @@ -206,13 +206,13 @@ def gen_eigenmodes(self, n_sample): def gen_real_space_fields(self, n_sample): """Returns the inverse fourier transform of a sample of eigenmodes. - Inputs: - ------- + Parameters + ---------- n_sample: int sample size - Returns: - -------- + Returns + ------- fields: numpy.ndarray real array of real-space fields, with shape (n_sample, L, L), where L is the side-length of the square lattice. diff --git a/anvil/layers.py b/anvil/layers.py index af8997f..fefd9db 100644 --- a/anvil/layers.py +++ b/anvil/layers.py @@ -3,37 +3,51 @@ r""" layers.py -Contains nn.Modules which implement transformations of input configurations whilst computing -the Jacobian determinant of the transformation. - -Each transformation layers may contain several neural networks or learnable parameters. - -A normalising flow, f, can be constructed from multiple layers using function composition: - - f(z) = g_n( ... ( g_2( g_1( z ) ) ) ... ) - -which is implemented using the architecture provided by torch.nn +Contains the transformations or "layers" which are the building blocks of +normalising flows. The layers are implemented using the PyTorch library, which +in practice means they subclass :py:class:`torch.nn.Module`. For more +information, check out the PyTorch +`Module docs `_. + +The basic idea is of a flow is to generate a latent variable, in our framework +this would be using a class in :py:mod:`anvil.distributions`. The latent +variables are then transformed by sequentially applying the transformation +layers. The key feature of the transformations is the ability to easily calculate +the Jacobian determinant. If the base density function is known, then we can +evaluate the model density exactly. + +The bottom line is that we enforce a convention to the ``forward`` method +of each layer (a special method of :py:class:`torch.nn.Module` subclasses). +All layers in this module should contain a ``forward`` method which takes two +:py:class:`torch.Tensor` objects as inputs: + + - a batch of input configurations, dimensions ``(batch size, lattice size)``. 
+ - a batch of scalars, dimensions ``(batch size, 1)``, that are the logarithm of the + 'current' probability density, at this stage in the normalising flow. -All layers in this module contain a `forward` method which takes two torch.tensor objects -as inputs: +Each transformation layers may contain several neural networks or learnable +parameters. - - a batch of input configurations, dimensions (batch size, lattice size). +A full normalising flow, f, can be constructed from multiple layers using +function composition: - - a batch of scalars, dimensions (batch size, 1), that are the logarithm of the - 'current' probability density, at this stage in the normalising flow. +.. math:: -and returns two torch.tensor objects: + f(z) = g_{N_{\rm layers}}( \ldots ( g_2( g_1( z ) ) ) \ldots ) - - a batch of configurations \phi which have been transformed according to the - transformation, with the same dimensions as the input configurations. +As a matter of convenience we provide a subclass of +:py:class:`torch.nn.Sequential`, which is initialised by passing multiple layers +as arguments (in the order in which the layers are applied). The main feature +of our version, :py:class:`Sequential`, is that it conforms to our ``forward`` +convention. From the perspective of the user :py:class:`Sequential` appears +as a single subclass of :py:class:`torch.nn.Module` which performs the +full normalising flow transformation :math:`f(z)`. - - the updated logarithm of the probability density, including the contribution from - the Jacobian determinant of this transformation. """ import torch import torch.nn as nn -from anvil.core import FullyConnectedNeuralNetwork +from anvil.neural_network import DenseNeuralNetwork class CouplingLayer(nn.Module): @@ -113,7 +127,7 @@ def __init__( ): super().__init__(size_half, even_sites) - self.t_network = FullyConnectedNeuralNetwork( + self.t_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, @@ -121,14 +135,13 @@ def __init__( bias=not z2_equivar, ) - def forward(self, v_in, log_density, *unused) -> torch.Tensor: + def forward(self, v_in, log_density, *args) -> torch.Tensor: r"""Forward pass of affine transformation.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] + v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6) - t_out = self.t_network( - (v_in_passive - v_in_passive.mean()) / v_in_passive.std() - ) + t_out = self.t_network(v_for_net) v_out = self._join_func([v_in_passive, v_in_active - t_out], dim=1) @@ -161,14 +174,14 @@ def __init__( ): super().__init__(size_half, even_sites) - self.s_network = FullyConnectedNeuralNetwork( + self.s_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, activation=activation, bias=not z2_equivar, ) - self.t_network = FullyConnectedNeuralNetwork( + self.t_network = DenseNeuralNetwork( size_in=size_half, size_out=size_half, hidden_shape=hidden_shape, @@ -177,16 +190,16 @@ def __init__( ) self.z2_equivar = z2_equivar - def forward(self, v_in, log_density, *unused) -> torch.Tensor: + def forward(self, v_in, log_density, *args) -> torch.Tensor: r"""Forward pass of affine transformation.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] - v_for_net = (v_in_passive - v_in_passive.mean()) / v_in_passive.std() + v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6) s_out = self.s_network(v_for_net) t_out = self.t_network(v_for_net) - # If enforcing s(-v) = -s(v), 
we want to use |s(v)| in affine transf. + # If enforcing C(-v) = -C(v), we want to use |s(v)| in affine transf. if self.z2_equivar: s_out = torch.abs(s_out) @@ -236,7 +249,7 @@ def __init__( self.size_half = size_half self.n_segments = n_segments - self.network = FullyConnectedNeuralNetwork( + self.network = DenseNeuralNetwork( size_in=size_half, size_out=size_half * (3 * n_segments - 1), hidden_shape=hidden_shape, @@ -255,13 +268,11 @@ def forward(self, v_in, log_density, negative_mag): """Forward pass of the rational quadratic spline layer.""" v_in_passive = v_in[:, self._passive_ind] v_in_active = v_in[:, self._active_ind] - v_for_net = ( - v_in_passive - v_in_passive.mean() - ) / v_in_passive.std() # reduce numerical instability + v_for_net = v_in_passive / torch.sqrt(v_in_passive.var() + 1e-6) # Naively enforce C(-v) = -C(v) if self.z2_equivar: - v_in_passive_stand[negative_mag] = -v_in_passive_stand[negative_mag] + v_for_net[negative_mag] = -v_for_net[negative_mag] v_out_b = torch.zeros_like(v_in_active) gradient = torch.ones_like(v_in_active).unsqueeze(dim=-1) @@ -396,35 +407,89 @@ def forward(self, v_in, log_density): return self.scale * v_in + self.shift, log_density -# TODO not necessary to define a nn.module for this now I've taken out learnable gamma +# NOTE: not necessary to define a nn.module for this now gamma is no longer learnable class BatchNormLayer(nn.Module): - """Performs batch normalisation on the input vector. + r"""Performs batch normalisation on the inputs, conforming to our ``forward`` + convention. + + Inputs are standardised over all tensor dimensions such that the resulting sample + has null mean and unit variance, after which a rescaling factor is applied. + + .. math:: + + v_{\rm out} = \gamma + \frac{v_{\rm in} - \mathbb{E}[ v_{\rm in} ]} + {\sqrt{\var( v_{\rm in} ) + \epsilon}} Parameters ---------- - scale: int - An additional scale factor to be applied after batch normalisation. + scale: float + The multiplicative factor, :math:`\gamma`, applied to the standardised data. + + Notes + ----- + Applying batch normalisation before the first spline layer can be helpful for + ensuring that the inputs remain within the transformation interval. However, + this layer adds undesirable stochasticity which can impede optimisation. One + might consider replacing it with :py:class:`anvil.layers.GlobalRescaling` using + a static scale parameter. """ def __init__(self, scale=1): super().__init__() self.gamma = scale - def forward(self, v_in, log_density, *unused): + def forward(self, v_in, log_density, *args): """Forward pass of the batch normalisation transformation.""" - mult = self.gamma / torch.std(v_in) + mult = self.gamma / torch.sqrt(v_in.var() + 1e-6) # for stability v_out = mult * (v_in - v_in.mean()) - log_density -= mult * v_out.shape[1] + log_density -= torch.log(mult) * v_out.shape[1] return (v_out, log_density) class GlobalRescaling(nn.Module): - def __init__(self, initial=1): + r"""Performs a global rescaling of the inputs via a (potentially learnable) + multiplicative factor, conforming to our ``forward`` convention. + + Parameters + ---------- + scale: float + The multiplicative factor applied to the inputs. + learnable: bool, default=True + If True, ``scale`` will be optimised during the training. + + Notes + ----- + Applying a rescaling layer with a learnable ``scale`` to the final layer of a + normalizing flow can be useful since it avoids the need to tune earlier layers + to match the width of the target density. 
However, for best performance one + should generally use a static ``scale`` to reduce stochasticity in the + optimisation. + + """ + + def __init__(self, scale=1, learnable=True): super().__init__() - self.scale = nn.Parameter(torch.Tensor([initial])) + self.scale = torch.Tensor([scale]) + if learnable: + self.scale = nn.Parameter(self.scale) - def forward(self, v_in, log_density, *unused): + def forward(self, v_in, log_density, *args): v_out = self.scale * v_in log_density -= v_out.shape[-1] * torch.log(self.scale) return v_out, log_density + + +class Sequential(nn.Sequential): + """Similar to :py:class:`torch.nn.Sequential` except conforms to our + ``forward`` convention. + """ + + def forward(self, v, log_density, *args): + """overrides the base class ``forward`` method to conform to our + conventioned for expected inputs/outputs of ``forward`` methods. + """ + for module in self: + v, log_density = module(v, log_density, *args) + return v, log_density diff --git a/anvil/models.py b/anvil/models.py index 55cb6ed..9e728b3 100644 --- a/anvil/models.py +++ b/anvil/models.py @@ -3,21 +3,29 @@ """ models.py -Module containing reportengine actions which return callable objects that execute -normalising flows constructed from multiple layers via function composition. +Module containing reportengine actions which return normalising flow models. +Generally this involves piecing together components from :py:mod:`anvil.layers` +to produce sequences of transformations. + """ from functools import partial -from anvil.core import Sequential +from reportengine import collect import anvil.layers as layers -def coupling_pair(coupling_layer, size_half, **layer_spec): - """Helper function which returns a callable object that performs a coupling - transformation on both even and odd lattice sites.""" - coupling_transformation = partial(coupling_layer, size_half, **layer_spec) - return Sequential( +def _coupling_block(coupling_layer, **kwargs): + """Helper function which wraps a pair of coupling layers from + :py:mod:`anvil.layers` in the module container + :py:class`anvil.layers.Sequential`. The first transformation layer acts on + the even sites and the second transformation acts on the odd sites, so one + of these blocks ensures all sites are transformed as part of an + active partition. + + """ + coupling_transformation = partial(coupling_layer, **kwargs) + return layers.Sequential( coupling_transformation(even_sites=True), coupling_transformation(even_sites=False), ) @@ -25,90 +33,256 @@ def coupling_pair(coupling_layer, size_half, **layer_spec): def real_nvp( size_half, - n_affine, + n_blocks, hidden_shape, activation="tanh", - z2_equivar=False, + z2_equivar=True, ): - """Action that returns a callable object that performs a sequence of `n_affine` - affine coupling transformations on both partitions of the input vector.""" + r"""Action which returns a sequence of ``n_blocks`` pairs of + :py:class:`anvil.layers.AffineLayer` s, wrapped in the module container + :py:class`anvil.layers.Sequential`. + + The first ``n_blocks`` elements of the outer ``Sequential`` + are ``Sequential`` s containing a pair of ``AffineLayer`` s which + act on the even and odd sites respectively. + + Parameters + ---------- + size_half: int + Inferred from ``lattice_size``, the size of the active/passive + partitions (which are equal size, `lattice_size / 2`). + n_blocks: int + The number of pairs of :py:class:`anvil.layers.AffineLayer` + transformations. 
+ hidden_shape: list[int] + the shape of the neural networks used in the AffineLayer. The visible + layers are defined by the ``lattice_size``. Typically we have found + a single hidden layer neural network is effective, which can be + specified by passing a list of length 1, i.e. ``[72]`` would + be a single hidden layered network with 72 nodes in the hidden layer. + activation: str, default="tanh" + The activation function to use for each hidden layer. The output layer + of the network is linear (has no activation function). + z2_equivar: bool, default=True + Whether or not to impose z2 equivariance. This changes the transformation + such that the neural networks have no bias term and s(-v) = s(v) which + imposes a :math:`\mathbb{Z}_2` symmetry. + + Returns + ------- + real_nvp: anvil.layers.Sequential + A sequence of affine transformations, which we refer to as a real NVP + (Non-volume preserving) flow. + + See Also + -------- + :py:mod:`anvil.neural_network` contains the fully connected neural network class + as well as valid choices for activation functions. + + """ blocks = [ - coupling_pair( + _coupling_block( layers.AffineLayer, - size_half, + size_half=size_half, hidden_shape=hidden_shape, activation=activation, z2_equivar=z2_equivar, ) - for i in range(n_affine) + for _ in range(n_blocks) ] - return Sequential(*blocks, layers.GlobalRescaling()) + return layers.Sequential(*blocks) def nice( size_half, - n_additive, + n_blocks, hidden_shape, activation="tanh", - z2_equivar=False, + z2_equivar=True, ): - """Action that returns a callable object that performs a sequence of `n_affine` - affine coupling transformations on both partitions of the input vector.""" + r"""Similar to :py:func:`real_nvp`, excepts instead wraps pairs of + :py:class:`anvil.layers.AdditiveLayer`. + The pairs of ``AdditiveLayer`` s act on the even and odd sites respectively. + + Parameters + ---------- + size_half: int + Inferred from ``lattice_size``, the size of the active/passive + partitions (which are equal size, `lattice_size / 2`). + n_blocks: int + The number of pairs of :py:class:`anvil.layers.AffineLayer` + transformations. + hidden_shape: list[int] + the shape of the neural networks used in the each layer. The visible + layers are defined by the ``lattice_size``. + activation: str, default="tanh" + The activation function to use for each hidden layer. The output layer + of the network is linear (has no activation function). + z2_equivar: bool, default=True + Whether or not to impose z2 equivariance. This changes the transformation + such that the neural networks have no bias term and s(-v) = s(v) which + imposes a :math:`\mathbb{Z}_2` symmetry. + + Returns + ------- + nice: anvil.layers.Sequential + A sequence of additive transformations, which we refer to as a + nice flow. 
+ + """ blocks = [ - coupling_pair( + _coupling_block( layers.AdditiveLayer, - size_half, + size_half=size_half, hidden_shape=hidden_shape, activation=activation, z2_equivar=z2_equivar, ) - for i in range(n_additive) + for _ in range(n_blocks) ] - return Sequential(*blocks, layers.GlobalRescaling()) + return layers.Sequential(*blocks) def rational_quadratic_spline( size_half, + n_blocks, hidden_shape, + n_segments, interval=5, - n_spline=1, - n_segments=4, activation="tanh", - z2_equivar_spline=False, + z2_equivar=False, ): - """Action that returns a callable object that performs a pair of circular spline - transformations, one on each half of the input vector.""" + """Similar to :py:func:`real_nvp`, excepts instead wraps pairs of + :py:class:`anvil.layers.RationalQuadraticSplineLayer` s followed by a single + The pairs of RQS's act on the even and odd sites respectively. + + Parameters + ---------- + size_half: int + inferred from ``lattice_size``, the size of the active/passive + partitions (which are equal size, `lattice_size / 2`). + n_blocks: int + The number of pairs of :py:class:`anvil.layers.AffineLayer` + transformations. For RQS this is set to 1. + hidden_shape: list[int] + the shape of the neural networks used in the each layer. The visible + layers are defined by the ``lattice_size``. + n_segments: int + The number of segments to use in the RQS transformation. + interval: int, default=5 + the interval within which the RQS applies the transformation, at present + if a field variable is outside of this region it is mapped to itself + (i.e the gradient of the transformation is 1 outside of the interval). + activation: str, default="tanh" + The activation function to use for each hidden layer. The output layer + of the network is linear (has no activation function). + z2_equivar: bool, default=False + Whether or not to impose z2 equivariance. This is only done crudely + by splitting the sites according to the sign of the sum across lattice + sites. + """ + blocks = [ - coupling_pair( + _coupling_block( layers.RationalQuadraticSplineLayer, - size_half, + size_half=size_half, interval=interval, n_segments=n_segments, hidden_shape=hidden_shape, activation=activation, - z2_equivar=z2_equivar_spline, + z2_equivar=z2_equivar, ) - for _ in range(n_spline) + for _ in range(n_blocks) ] - return Sequential( - #layers.BatchNormLayer(), - *blocks, - layers.GlobalRescaling(), - ) + return layers.Sequential(*blocks) + + +def batch_norm(scale=1.0): + r"""Action which returns an instance of :py:class:`anvil.layers.BatchNormLayer`. + + Parameters + ---------- + scale: float, default=1.0 + The multiplicative factor applied to the standardised data. + """ + return layers.Sequential(layers.BatchNormLayer(scale=scale)) + + +def global_rescaling(scale, learnable=True): + r"""Action which returns and instance of :py:class:`anvil.layers.GlobalRescaling`. + + Parameters + ---------- + scale: float + The multiplicative factor applied to the inputs. + learnable: bool, default=True + If True, ``scale`` will be optimised during the training. + """ + return layers.Sequential(layers.GlobalRescaling(scale=scale, learnable=learnable)) + + +_normalising_flow = collect("layer_action", ("model",)) + + +def model_to_load(_normalising_flow): + """action which wraps a list of layers in + :py:class:`anvil.layers.Sequential`. This allows the user to specify an + arbitrary combination of layers as the model. 
+ + For more information on valid choices for layers, see + ``anvil.models.LAYER_OPTIONS`` or the various + functions in :py:mod:`anvil.models` which produce sequences of the layers + found in :py:mod:`anvil.layers`. + + At present, available transformations are: + + - ``nice`` + - ``real_nvp`` + - ``rational_quadratic_spline`` + - ``batch_norm`` + - ``global_rescaling`` + + You can see their dependencies using the ``anvil`` provider help, e.g. + for ``real_nvp``: + + .. code:: + + $ anvil-sample --help real_nvp + ... + < action docstring - poorly formatted> + ... + The following resources are read from the configuration: + + lattice_length(int): + [Used by lattice_size] + + lattice_dimension(int): Parse lattice dimension from runcard + [Used by lattice_size] + The following additionl arguments can be used to control the + behaviour. They are set by default to sensible values: -def spline_affine(real_nvp, rational_quadratic_spline): - return Sequential(rational_quadratic_spline, real_nvp) + n_blocks + hidden_shape + activation = tanh + z2_equivar = True + ``anvil-train`` will also provide the same information. -def affine_spline(real_nvp, rational_quadratic_spline): - return Sequential(real_nvp, rational_quadratic_spline) + """ + # assume that _normalising_flow is a list of layers, each layer + # is a sequential of blocks, each block is a pair of transformations + # which transforms the entire input state - flatten this out, so output + # is Sequential of blocks + flow_flat = [block for layer in _normalising_flow for block in layer] + return layers.Sequential(*flow_flat) -MODEL_OPTIONS = { +# Update docstring above if you add to this! +LAYER_OPTIONS = { "nice": nice, "real_nvp": real_nvp, "rational_quadratic_spline": rational_quadratic_spline, - "spline_affine": spline_affine, - "affine_spline": affine_spline, + "batch_norm": batch_norm, + "global_rescaling": global_rescaling, } diff --git a/anvil/core.py b/anvil/neural_network.py similarity index 66% rename from anvil/core.py rename to anvil/neural_network.py index 8c72a13..8806ed4 100644 --- a/anvil/core.py +++ b/anvil/neural_network.py @@ -1,12 +1,15 @@ # SPDX-License-Identifier: GPL-3.0-or-later # Copywrite © 2021 anvil Michael Wilson, Joe Marsh Rossney, Luigi Del Debbio -r""" -coupling.py +""" +neural_network.py + +Module containing neural networks which are used as part of transformation +layers, found in :py:mod:`anvil.layers`. + """ import torch import torch.nn as nn -from reportengine import collect ACTIVATION_LAYERS = { "leaky_relu": nn.LeakyReLU, @@ -15,19 +18,8 @@ } -class Sequential(nn.Sequential): - """Modify the nn.Sequential class so that it takes an input vector *and* a - value for the current logarithm of the model density, returning an output - vector and the updated log density.""" - - def forward(self, v, log_density, *args): - for module in self: - v, log_density = module(v, log_density, *args) - return v, log_density - - -class FullyConnectedNeuralNetwork(nn.Module): - """Generic class for neural networks used in coupling layers. +class DenseNeuralNetwork(nn.Module): + """Dense neural networks used in coupling layers. Parameters ---------- @@ -39,11 +31,11 @@ class FullyConnectedNeuralNetwork(nn.Module): List specifying the number of nodes in the intermediate layers activation: (str, None) Key representing the activation function used for each layer - except the final one. - no_final_activation: bool - If True, leave the network output unconstrained. - bias: bool + except the final one. 
Valid options can be found in + ``ACTIVATION_LAYERS``. + bias: bool, default=True Whether to use biases in networks. + """ def __init__( @@ -76,10 +68,3 @@ def forward(self, v_in: torch.tensor): shape (n_batch, size_out) """ return self.network(v_in) - -_normalising_flow = collect("model_action", ("model_params",)) - -def model_to_load(_normalising_flow): - return _normalising_flow[0] - - diff --git a/anvil/scripts/anvil_sample.py b/anvil/scripts/anvil_sample.py index ff4b9d1..a216ab2 100644 --- a/anvil/scripts/anvil_sample.py +++ b/anvil/scripts/anvil_sample.py @@ -13,10 +13,8 @@ log = logging.getLogger(__name__) PROVIDERS = [ - "anvil.core", "anvil.models", "anvil.sample", - "anvil.models", "anvil.observables", "anvil.plot", "anvil.table", diff --git a/anvil/scripts/anvil_train.py b/anvil/scripts/anvil_train.py index 437efe0..6767c3b 100644 --- a/anvil/scripts/anvil_train.py +++ b/anvil/scripts/anvil_train.py @@ -12,7 +12,7 @@ log = logging.getLogger(__name__) -PROVIDERS = ["anvil.train", "anvil.checkpoint", "anvil.core", "anvil.models"] +PROVIDERS = ["anvil.train", "anvil.checkpoint", "anvil.models"] TRAINING_ACTIONS = ["train"] diff --git a/anvil/tests/test_layers.py b/anvil/tests/test_layers.py new file mode 100644 index 0000000..9492388 --- /dev/null +++ b/anvil/tests/test_layers.py @@ -0,0 +1,145 @@ +""" +Tests of the base classes in :py:mod:`anvil.layers` + +""" +from hypothesis import given +from hypothesis.strategies import integers, booleans +import numpy as np +import pytest +import torch + +import anvil.layers as layers +from anvil.distributions import Gaussian + +N_BATCH = 100 +SIZE = 36 +SIZE_HALF = SIZE // 2 +HIDDEN_SHAPE = [36] +ACTIVATION = "tanh" + +@given(integers(min_value=0, max_value=2**16), booleans()) +def test_coupling_init(size_half, even_sites): + """Hypothesis test the initialisation of the base class in layers""" + layers.CouplingLayer(size_half, even_sites) + + +def test_additive_layers(): + equivar_additive = layers.AdditiveLayer( + size_half=SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=True, + even_sites=True + ) + input_tensor = torch.zeros((N_BATCH, SIZE)) + with torch.no_grad(): + output_tensor, output_density = equivar_additive(input_tensor, 0) + + assert output_density == 0 + np.testing.assert_allclose(input_tensor.numpy(), output_tensor.numpy()) + + +def basic_layer_test(layer, input_states, input_log_density, *args): + """Basic check that layer transforms input states properly. 
+ + In practice we check: + + - field variables and log densities are valid real numbers + - output states are correct shape + - outputs are correct typing + + """ + output_states, output_log_density = layer(input_states, input_log_density, *args) + # all numbers + any_nan = ( + torch.any(torch.isnan(output_states)) or + torch.any(torch.isnan(output_log_density)) + ) + assert not any_nan + # correct shape + assert input_states.shape == output_states.shape + + assert isinstance(output_states, torch.Tensor) + assert isinstance(output_log_density, torch.Tensor) + + +@pytest.fixture() +@torch.no_grad() +def gaussian_input(): + """Basic input states for testing""" + latent_distribution = Gaussian(SIZE) # use default standard normal + return latent_distribution(N_BATCH) + +@pytest.mark.parametrize("layer_class", [layers.AdditiveLayer, layers.AffineLayer]) +@pytest.mark.parametrize("z2_equivar", [True, False]) +@pytest.mark.parametrize("even_sites", [True, False]) +@torch.no_grad() +def test_affine_like_basic(gaussian_input, layer_class, z2_equivar, even_sites): + """Apply :py:func:`basic_layer_test` to layers with same initialisation + parameters as :py:class:`anvil.layers.AffineLayer`. + + """ + layer = layer_class( + size_half=SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=z2_equivar, + even_sites=even_sites, + ) + basic_layer_test(layer, *gaussian_input) + +@pytest.mark.parametrize("z2_equivar", [True, False]) +@pytest.mark.parametrize("even_sites", [True, False]) +@torch.no_grad() +def test_rqs_basic(gaussian_input, z2_equivar, even_sites): + """Apply :py:func:`basic_layer_test` to + :py:class:`anvil.layers.RationalQuadraticSplineLayer`. + """ + layer = layers.RationalQuadraticSplineLayer( + size_half=SIZE_HALF, + interval=5, + n_segments=4, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=z2_equivar, + even_sites=even_sites, + ) + negative_mag = gaussian_input[0].sum(dim=1) < 0 + basic_layer_test(layer, *gaussian_input, negative_mag) + +@pytest.mark.parametrize( + "layer_class", + [layers.GlobalRescaling, layers.BatchNormLayer, layers.GlobalAffineLayer] +) +@torch.no_grad() +def test_scaling_layer_basic(gaussian_input, layer_class): + if layer_class is layers.GlobalAffineLayer: + layer = layer_class(1, 0) + elif layer_class is layers.GlobalRescaling: + layer = layer_class(scale=1.0, learnable=False) + else: + layer = layer_class() + basic_layer_test(layer, *gaussian_input) + +@torch.no_grad() +def test_sequential_basic(gaussian_input): + inner_layers = [ + layers.AffineLayer( + size_half=SIZE_HALF, + hidden_shape=HIDDEN_SHAPE, + activation=ACTIVATION, + z2_equivar=False, + even_sites=bool(i % 2), + ) for i in range(8)] + layer = layers.Sequential(*inner_layers) + basic_layer_test(layer, *gaussian_input) + + # check application of sequetion matches output of applying each layer. + output_states, output_density = inner_layers[0](*gaussian_input) + for el in inner_layers[1:]: + output_states, output_density = el(output_states, output_density) + + seq_output_states, seq_output_density = layer(*gaussian_input) + + np.testing.assert_allclose(seq_output_states.numpy(), output_states.numpy()) + np.testing.assert_allclose(seq_output_density.numpy(), output_density.numpy()) diff --git a/anvil/tests/test_models.py b/anvil/tests/test_models.py new file mode 100644 index 0000000..a3e6ddf --- /dev/null +++ b/anvil/tests/test_models.py @@ -0,0 +1,134 @@ +""" +Test higher level model construction from :py:mod:`anvil.models`. 
+ +""" +from hypothesis import given +from hypothesis.strategies import integers, lists +import pytest +import torch +from copy import deepcopy + +from anvil.api import API +from anvil.models import LAYER_OPTIONS + + +LAYERS = list(LAYER_OPTIONS.keys()) + +PARAMS = { + "hidden_shape": (32,), + "n_blocks": 2, + "n_segments": 4, + "lattice_length": 6, + "lattice_dimension": 2, + "scale": 1.0, +} + + +@pytest.mark.parametrize("layer_action", LAYERS) +def test_layer_actions(layer_action): + """Call the API on each of the layer actions, using mainly default arguments""" + getattr(API, layer_action)(**PARAMS) + return + + +# put limits on these so not to crash your computer. +@given( + lists(integers(min_value=0, max_value=2), min_size=1, max_size=3), + integers(min_value=1, max_value=4), + integers(min_value=1, max_value=8), + lists(integers(min_value=1, max_value=2 ** 6), min_size=1, max_size=3), +) +def test_model_construction(layer_idx, n_blocks, lattice_length_half, hidden_shape): + """Hypothesis test the model construction""" + # require even lattice sites. + model = [{"layer": LAYERS[idx]} for idx in layer_idx] + lattice_length = 2 * lattice_length_half + params = { + "model": model, + "n_blocks": n_blocks, + "lattice_length": lattice_length, + "hidden_shape": hidden_shape, + "lattice_dimension": 2, + # for some reason the RQS defaults get missed? + "interval": 5, + "n_segments": 4, + } + # might help with memory. + with torch.no_grad(): + API.model_to_load(**params) + + +def layer_independence_test(model_spec): + """Check that each layer's parameters are updated independently.""" + model = iter(API.model_to_load(**model_spec)) + model_copy = deepcopy(model) + + # Update parameters in first layer + layer1 = next(model) + update = {} + for valid_key, valid_tensor in layer1.state_dict().items(): + update[valid_key] = torch.rand_like(valid_tensor) + layer1.load_state_dict(update, strict=False) + + # Check that this is different from the copy + layer1_copy = next(model_copy) + for original, copy in zip(layer1.parameters(), layer1_copy.parameters()): + assert not torch.allclose(original, copy) + + # Now check that the other layers are unchanged + # NOTE: may be safer to iterate over shared keys + for layer, layer_copy in zip(model, model_copy): + for original, copy in zip(layer.parameters(), layer_copy.parameters()): + assert torch.allclose( + original, copy + ), "Parameters are being shared amongst layers that should be independent." + + +@torch.no_grad() +def test_layer_independence_global_rescaling(): + # Build a model with two identical sets of layers + working_example = { # This is OK + "model": [ + {"layer": "global_rescaling", "scale": 1.0}, + {"layer": "global_rescaling", "scale": 1.0}, + ] + } + layer_independence_test(working_example) + + breaking_example = { # This is NOT ok! 
+ "model": [ + {"layer": "global_rescaling"}, + {"layer": "global_rescaling"}, + ], + "scale": 1.0, + } + with pytest.raises(AssertionError): + layer_independence_test(breaking_example) + + +# TODO: could extend to all layers quite easily +@torch.no_grad() +def test_layer_independence_additive(): + params = { + "hidden_shape": (32,), + "n_blocks": 1, + "lattice_length": 6, + "lattice_dimension": 2, + } + working_example = { + "model": [ + {"layer": "nice", **params}, + {"layer": "nice", **params}, + ] + } + layer_independence_test(working_example) + + breaking_example = { + "model": [ + {"layer": "nice"}, + {"layer": "nice"}, + ], + **params, + } + with pytest.raises(AssertionError): + layer_independence_test(breaking_example) diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml index a551074..e4184ab 100644 --- a/conda-recipe/meta.yaml +++ b/conda-recipe/meta.yaml @@ -27,6 +27,7 @@ build: test: requires: - pytest + - hypothesis commands: - pytest --pyargs anvil diff --git a/docs/sphinx/source/get-started/basic-usage.rst b/docs/sphinx/source/get-started/basic-usage.rst index ad2e483..ef0c557 100644 --- a/docs/sphinx/source/get-started/basic-usage.rst +++ b/docs/sphinx/source/get-started/basic-usage.rst @@ -118,7 +118,7 @@ We supply some basic machinery to build your own normalising flow models. The relevant modules for this purpose are - - :py:mod:`anvil.core`: containing some basic extensions to ``pytorch`` modules relevant for this project. + - :py:mod:`anvil.neural_network`: Generic neural networks. - :py:mod:`anvil.layers`: a collection of transformation layer classes - :py:mod:`anvil.geometry`: classes which transform the output of the transformations into meaningful geometries. These dictate which sites in your lattice get alternated between active and passive partitions. - :py:mod:`anvil.distributions`: a collection of distributions which can be used as base distributions (for latent variables) or target distributions. diff --git a/examples/runcards/train.yml b/examples/runcards/train.yml index f9c319a..0e4144a 100644 --- a/examples/runcards/train.yml +++ b/examples/runcards/train.yml @@ -12,18 +12,24 @@ couplings: # Model base: gaussian -model: affine_spline -model_params: - hidden_shape: [72] - activation: tanh - - n_affine: 2 - z2_equivar: true - - n_spline: 1 - n_segments: 8 - z2_equivar_spline: false - +model: + - layer: real_nvp + n_blocks: 2 + z2_equivar: true + activation: tanh + hidden_shape: [72] + - layer: global_rescaling + scale: 1 + learnable: true + - layer: rational_quadratic_spline + n_blocks: 1 + n_segments: 8 + z2_equivar: false + activation: tanh + hidden_shape: [72] + - layer: global_rescaling + scale: 1 + learnable: true # Training n_batch: 1000