diff --git a/docs/api_reference/_autosummary/xeofs.cross.CCA.rst b/docs/api_reference/_autosummary/xeofs.cross.CCA.rst new file mode 100644 index 0000000..1149b43 --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.cross.CCA.rst @@ -0,0 +1,46 @@ +CCA +=== + +.. currentmodule:: xeofs.cross + +.. autoclass:: CCA + :members: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~CCA.__init__ + ~CCA.components + ~CCA.compute + ~CCA.correlation_coefficients_X + ~CCA.correlation_coefficients_Y + ~CCA.cross_correlation_coefficients + ~CCA.deserialize + ~CCA.fit + ~CCA.fraction_variance_X_explained_by_X + ~CCA.fraction_variance_Y_explained_by_X + ~CCA.fraction_variance_Y_explained_by_Y + ~CCA.get_params + ~CCA.get_serialization_attrs + ~CCA.heterogeneous_patterns + ~CCA.homogeneous_patterns + ~CCA.inverse_transform + ~CCA.load + ~CCA.predict + ~CCA.save + ~CCA.scores + ~CCA.serialize + ~CCA.squared_covariance_fraction + ~CCA.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/_autosummary/xeofs.cross.ComplexCCA.rst b/docs/api_reference/_autosummary/xeofs.cross.ComplexCCA.rst new file mode 100644 index 0000000..0c09990 --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.cross.ComplexCCA.rst @@ -0,0 +1,50 @@ +ComplexCCA +========== + +.. currentmodule:: xeofs.cross + +.. autoclass:: ComplexCCA + :members: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~ComplexCCA.__init__ + ~ComplexCCA.components + ~ComplexCCA.components_amplitude + ~ComplexCCA.components_phase + ~ComplexCCA.compute + ~ComplexCCA.correlation_coefficients_X + ~ComplexCCA.correlation_coefficients_Y + ~ComplexCCA.cross_correlation_coefficients + ~ComplexCCA.deserialize + ~ComplexCCA.fit + ~ComplexCCA.fraction_variance_X_explained_by_X + ~ComplexCCA.fraction_variance_Y_explained_by_X + ~ComplexCCA.fraction_variance_Y_explained_by_Y + ~ComplexCCA.get_params + ~ComplexCCA.get_serialization_attrs + ~ComplexCCA.heterogeneous_patterns + ~ComplexCCA.homogeneous_patterns + ~ComplexCCA.inverse_transform + ~ComplexCCA.load + ~ComplexCCA.predict + ~ComplexCCA.save + ~ComplexCCA.scores + ~ComplexCCA.scores_amplitude + ~ComplexCCA.scores_phase + ~ComplexCCA.serialize + ~ComplexCCA.squared_covariance_fraction + ~ComplexCCA.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/_autosummary/xeofs.cross.HilbertCCA.rst b/docs/api_reference/_autosummary/xeofs.cross.HilbertCCA.rst new file mode 100644 index 0000000..6f696a7 --- /dev/null +++ b/docs/api_reference/_autosummary/xeofs.cross.HilbertCCA.rst @@ -0,0 +1,50 @@ +HilbertCCA +========== + +.. currentmodule:: xeofs.cross + +.. autoclass:: HilbertCCA + :members: + :inherited-members: + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~HilbertCCA.__init__ + ~HilbertCCA.components + ~HilbertCCA.components_amplitude + ~HilbertCCA.components_phase + ~HilbertCCA.compute + ~HilbertCCA.correlation_coefficients_X + ~HilbertCCA.correlation_coefficients_Y + ~HilbertCCA.cross_correlation_coefficients + ~HilbertCCA.deserialize + ~HilbertCCA.fit + ~HilbertCCA.fraction_variance_X_explained_by_X + ~HilbertCCA.fraction_variance_Y_explained_by_X + ~HilbertCCA.fraction_variance_Y_explained_by_Y + ~HilbertCCA.get_params + ~HilbertCCA.get_serialization_attrs + ~HilbertCCA.heterogeneous_patterns + ~HilbertCCA.homogeneous_patterns + ~HilbertCCA.inverse_transform + ~HilbertCCA.load + ~HilbertCCA.predict + ~HilbertCCA.save + ~HilbertCCA.scores + ~HilbertCCA.scores_amplitude + ~HilbertCCA.scores_phase + ~HilbertCCA.serialize + ~HilbertCCA.squared_covariance_fraction + ~HilbertCCA.transform + + + + + + \ No newline at end of file diff --git a/docs/api_reference/cross_set_analysis.rst b/docs/api_reference/cross_set_analysis.rst index 970b6a8..f84fb0c 100644 --- a/docs/api_reference/cross_set_analysis.rst +++ b/docs/api_reference/cross_set_analysis.rst @@ -8,10 +8,13 @@ Methods that investigate relationships or patterns between variables **across tw :template: custom-class-template.rst :recursive: + ~xeofs.cross.CCA ~xeofs.cross.MCA ~xeofs.cross.CPCCA + ~xeofs.cross.ComplexCCA ~xeofs.cross.ComplexMCA ~xeofs.cross.ComplexCPCCA + ~xeofs.cross.HilbertCCA ~xeofs.cross.HilbertMCA ~xeofs.cross.HilbertCPCCA diff --git a/docs/auto_examples/auto_examples_jupyter.zip b/docs/auto_examples/auto_examples_jupyter.zip index 6ee59c6..186ccd5 100644 Binary files a/docs/auto_examples/auto_examples_jupyter.zip and b/docs/auto_examples/auto_examples_jupyter.zip differ diff --git a/docs/auto_examples/auto_examples_python.zip b/docs/auto_examples/auto_examples_python.zip index a95db94..232d002 100644 Binary files a/docs/auto_examples/auto_examples_python.zip and b/docs/auto_examples/auto_examples_python.zip differ diff --git a/tests/models/cross/test_cca.py b/tests/models/cross/test_cca.py new file mode 100644 index 0000000..b603e9c --- /dev/null +++ b/tests/models/cross/test_cca.py @@ -0,0 +1,292 @@ +import dask.array as da +import numpy as np +import pytest +import xarray as xr + +from xeofs.cross import CCA + + +def generate_random_data(shape, lazy=False, seed=142): + rng = np.random.default_rng(seed) + if lazy: + return xr.DataArray( + da.random.random(shape, chunks=(5, 5)), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + else: + return xr.DataArray( + rng.random(shape), + dims=["sample", "feature"], + coords={"sample": np.arange(shape[0]), "feature": np.arange(shape[1])}, + ) + + +def generate_well_conditioned_data(lazy=False): + rng = np.random.default_rng(142) + t = np.linspace(0, 50, 200) + std = 0.1 + x1 = np.sin(t)[:, None] + rng.normal(0, std, size=(200, 2)) + x2 = np.sin(t)[:, None] + rng.normal(0, std, size=(200, 3)) + x1[:, 1] = x1[:, 1] ** 2 + x2[:, 1] = x2[:, 1] ** 3 + x2[:, 2] = abs(x2[:, 2]) ** (0.5) + coords_time = np.arange(len(t)) + coords_fx = [1, 2] + coords_fy = [1, 2, 3] + X = xr.DataArray( + x1, + dims=["sample", "feature"], + coords={"sample": coords_time, "feature": coords_fx}, + ) + Y = xr.DataArray( + x2, + dims=["sample", "feature"], + coords={"sample": coords_time, "feature": coords_fy}, + ) + if lazy: + X = X.chunk({"sample": 5, "feature": -1}) + Y = Y.chunk({"sample": 5, "feature": -1}) + return X, Y + else: + return X, Y + + +@pytest.fixture +def cca(): + return CCA(n_modes=1) + + +def test_initialization(): + model = CCA() + assert model is not None + + +def test_fit(cca): + X, Y = generate_well_conditioned_data() + cca.fit(X, Y, dim="sample") + assert hasattr(cca, "preprocessor1") + assert hasattr(cca, "preprocessor2") + assert hasattr(cca, "data") + + +def test_fit_empty_data(cca): + with pytest.raises(ValueError): + cca.fit(xr.DataArray(), xr.DataArray(), "time") + + +def test_fit_invalid_dims(cca): + X, Y = generate_well_conditioned_data() + with pytest.raises(ValueError): + cca.fit(X, Y, dim=("invalid_dim1", "invalid_dim2")) + + +def test_transform(cca): + X, Y = generate_well_conditioned_data() + cca.fit(X, Y, dim="sample") + result = cca.transform(X, Y) + assert isinstance(result, list) + assert isinstance(result[0], xr.DataArray) + + +def test_transform_unseen_data(cca): + X, Y = generate_well_conditioned_data() + x = X.isel(sample=slice(151, 200)) + y = Y.isel(sample=slice(151, 200)) + X = X.isel(sample=slice(None, 150)) + Y = Y.isel(sample=slice(None, 150)) + + cca.fit(X, Y, "sample") + result = cca.transform(x, y) + assert isinstance(result, list) + assert isinstance(result[0], xr.DataArray) + # Check that unseen data can be transformed + assert result[0].notnull().all() + assert result[1].notnull().all() + + +def test_inverse_transform(cca): + X, Y = generate_well_conditioned_data() + cca.fit(X, Y, "sample") + # Assuming mode as 1 for simplicity + scores1 = cca.data["scores1"].sel(mode=1) + scores2 = cca.data["scores2"].sel(mode=1) + Xrec1, Xrec2 = cca.inverse_transform(scores1, scores2) + assert isinstance(Xrec1, xr.DataArray) + assert isinstance(Xrec2, xr.DataArray) + + +@pytest.mark.parametrize("use_pca", [False, True]) +def test_squared_covariance_fraction(use_pca): + X, Y = generate_well_conditioned_data() + cca = CCA(n_modes=2, use_pca=use_pca, n_pca_modes="all") + cca.fit(X, Y, "sample") + scf = cca.squared_covariance_fraction() + assert isinstance(scf, xr.DataArray) + assert all(scf <= 1), "Squared covariance fraction is greater than 1" + + +@pytest.mark.parametrize("use_pca", [False, True]) +def test_total_squared_covariance(use_pca): + X, Y = generate_well_conditioned_data() + + # Compute total squared covariance + X_ = X.rename({"feature": "x"}) + Y_ = Y.rename({"feature": "y"}) + cov_mat = xr.cov(X_, Y_, dim="sample") + tsc = (cov_mat**2).sum() + + cca = CCA(n_modes=2, use_pca=use_pca, n_pca_modes="all") + cca.fit(X, Y, "sample") + tsc_model = cca.data["total_squared_covariance"] + xr.testing.assert_allclose(tsc, tsc_model) + + +def test_fit_different_coordinates(): + """Like a lagged CCA scenario""" + X, Y = generate_well_conditioned_data() + X = X.isel(sample=slice(0, 99)) + Y = Y.isel(sample=slice(100, 199)) + cca = CCA(n_modes=2, use_pca=False) + cca.fit(X, Y, "sample") + r = cca.cross_correlation_coefficients() + # Correlation coefficents are not zero + assert np.all(r > np.finfo(r.dtype).eps) + + +@pytest.mark.parametrize( + "dim", + [(("time",)), (("lat", "lon")), (("lon", "lat"))], +) +def test_components(mock_data_array, dim): + cca = CCA(n_modes=2, use_pca=False) + cca.fit(mock_data_array, mock_data_array, dim) + components1, components2 = cca.components() + feature_dims = tuple(set(mock_data_array.dims) - set(dim)) + assert isinstance(components1, xr.DataArray) + assert isinstance(components2, xr.DataArray) + assert set(components1.dims) == set( + ("mode",) + feature_dims + ), "Components1 does not have the right feature dimensions" + assert set(components2.dims) == set( + ("mode",) + feature_dims + ), "Components2 does not have the right feature dimensions" + + +@pytest.mark.parametrize("shapeX", [(30, 10)]) +@pytest.mark.parametrize("shapeY", [(30, 10), (30, 5), (30, 15)]) +@pytest.mark.parametrize("use_pca", [False, True]) +def test_components_coordinates(shapeX, shapeY, use_pca): + # Test that the components have the right coordinates + X = generate_random_data(shapeX) + Y = generate_random_data(shapeY) + + cca = CCA(n_modes=2, use_pca=use_pca, n_pca_modes="all") + cca.fit(X, Y, "sample") + components1, components2 = cca.components() + xr.testing.assert_equal(components1.coords["feature"], X.coords["feature"]) + xr.testing.assert_equal(components2.coords["feature"], Y.coords["feature"]) + + +@pytest.mark.parametrize("correction", [(None), ("fdr_bh")]) +def test_homogeneous_patterns(correction): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cca = CCA(n_modes=10, use_pca=False) + cca.fit(X, Y, "sample") + + _ = cca.homogeneous_patterns(correction=correction) + + +@pytest.mark.parametrize( + "correction", + [(None), ("fdr_bh")], +) +def test_heterogeneous_patterns(correction): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cca = CCA(n_modes=10, use_pca=False) + cca.fit(X, Y, "sample") + + _ = cca.heterogeneous_patterns(correction=correction) + + +def test_predict(): + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + cca = CCA(n_modes=10, use_pca=False) + cca.fit(X, Y, "sample") + + Xnew = generate_random_data((200, 10), seed=123) + + Ry_pred = cca.predict(Xnew) + _ = cca.inverse_transform(Y=Ry_pred) + + +@pytest.mark.parametrize("engine", ["netcdf4", "zarr"]) +def test_save_load(tmp_path, engine): + """Test save/load methods in MCA class, ensuring that we can + roundtrip the model and get the same results when transforming + data.""" + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + original = CCA() + original.fit(X, Y, "sample") + + # Save the CCA model + original.save(tmp_path / "cca", engine=engine) + + # Check that the CCA model has been saved + assert (tmp_path / "cca").exists() + + # Recreate the model from saved file + loaded = CCA.load(tmp_path / "cca", engine=engine) + + # Check that the params and DataContainer objects match + assert original.get_params() == loaded.get_params() + assert all([key in loaded.data for key in original.data]) + for key in original.data: + if original.data._allow_compute[key]: + assert loaded.data[key].equals(original.data[key]) + else: + # but ensure that input data is not saved by default + assert loaded.data[key].size <= 1 + assert loaded.data[key].attrs["placeholder"] is True + + # Test that the recreated model can be used to transform new data + assert np.allclose( + original.transform(X, Y), + loaded.transform(X, Y), + ) + + # The loaded model should also be able to inverse_transform new data + XYr_o = original.inverse_transform(*original.scores()) + XYr_l = loaded.inverse_transform(*loaded.scores()) + assert np.allclose(XYr_o[0], XYr_l[0]) + assert np.allclose(XYr_o[1], XYr_l[1]) + + +def test_serialize_deserialize_dataarray(mock_data_array): + """Test roundtrip serialization when the model is fit on a DataArray.""" + model = CCA() + model.fit(mock_data_array, mock_data_array, "time") + dt = model.serialize() + rebuilt_model = CCA.deserialize(dt) + assert np.allclose( + model.transform(mock_data_array), rebuilt_model.transform(mock_data_array) + ) + + +def test_serialize_deserialize_dataset(mock_dataset): + """Test roundtrip serialization when the model is fit on a Dataset.""" + model = CCA() + model.fit(mock_dataset, mock_dataset, "time") + dt = model.serialize() + rebuilt_model = CCA.deserialize(dt) + assert np.allclose( + model.transform(mock_dataset), rebuilt_model.transform(mock_dataset) + ) diff --git a/tests/models/cross/test_cpcca.py b/tests/models/cross/test_cpcca.py index 20035d2..258ff14 100644 --- a/tests/models/cross/test_cpcca.py +++ b/tests/models/cross/test_cpcca.py @@ -272,3 +272,70 @@ def test_predict(): Ry_pred = cpcca.predict(Xnew) _ = cpcca.inverse_transform(Y=Ry_pred) + + +@pytest.mark.parametrize("engine", ["netcdf4", "zarr"]) +@pytest.mark.parametrize("alpha", [0.0, 0.5, 1.0]) +def test_save_load(tmp_path, engine, alpha): + """Test save/load methods in MCA class, ensuring that we can + roundtrip the model and get the same results when transforming + data.""" + X = generate_random_data((200, 10), seed=123) + Y = generate_random_data((200, 20), seed=321) + + original = CPCCA(alpha=alpha) + original.fit(X, Y, "sample") + + # Save the CPCCA model + original.save(tmp_path / "cpcca", engine=engine) + + # Check that the CPCCA model has been saved + assert (tmp_path / "cpcca").exists() + + # Recreate the model from saved file + loaded = CPCCA.load(tmp_path / "cpcca", engine=engine) + + # Check that the params and DataContainer objects match + assert original.get_params() == loaded.get_params() + assert all([key in loaded.data for key in original.data]) + for key in original.data: + if original.data._allow_compute[key]: + assert loaded.data[key].equals(original.data[key]) + else: + # but ensure that input data is not saved by default + assert loaded.data[key].size <= 1 + assert loaded.data[key].attrs["placeholder"] is True + + # Test that the recreated model can be used to transform new data + assert np.allclose( + original.transform(X, Y), + loaded.transform(X, Y), + ) + + # The loaded model should also be able to inverse_transform new data + XYr_o = original.inverse_transform(*original.scores()) + XYr_l = loaded.inverse_transform(*loaded.scores()) + assert np.allclose(XYr_o[0], XYr_l[0]) + assert np.allclose(XYr_o[1], XYr_l[1]) + + +def test_serialize_deserialize_dataarray(mock_data_array): + """Test roundtrip serialization when the model is fit on a DataArray.""" + model = CPCCA() + model.fit(mock_data_array, mock_data_array, "time") + dt = model.serialize() + rebuilt_model = CPCCA.deserialize(dt) + assert np.allclose( + model.transform(mock_data_array), rebuilt_model.transform(mock_data_array) + ) + + +def test_serialize_deserialize_dataset(mock_dataset): + """Test roundtrip serialization when the model is fit on a Dataset.""" + model = CPCCA() + model.fit(mock_dataset, mock_dataset, "time") + dt = model.serialize() + rebuilt_model = CPCCA.deserialize(dt) + assert np.allclose( + model.transform(mock_dataset), rebuilt_model.transform(mock_dataset) + ) diff --git a/xeofs/cross/__init__.py b/xeofs/cross/__init__.py index 830df25..456927c 100644 --- a/xeofs/cross/__init__.py +++ b/xeofs/cross/__init__.py @@ -1,17 +1,21 @@ import warnings +from .cca import CCA, ComplexCCA, HilbertCCA from .cpcca import CPCCA, ComplexCPCCA, HilbertCPCCA from .cpcca_rotator import ComplexCPCCARotator, CPCCARotator, HilbertCPCCARotator from .mca import MCA, ComplexMCA, HilbertMCA from .mca_rotator import ComplexMCARotator, HilbertMCARotator, MCARotator __all__ = [ + "CCA", + "MCA", + "CPCCA", + "ComplexCCA", "ComplexMCA", "ComplexCPCCA", + "HilbertCCA", "HilbertMCA", "HilbertCPCCA", - "MCA", - "CPCCA", "MCARotator", "CPCCARotator", "ComplexMCARotator", diff --git a/xeofs/cross/cca.py b/xeofs/cross/cca.py new file mode 100644 index 0000000..524777f --- /dev/null +++ b/xeofs/cross/cca.py @@ -0,0 +1,356 @@ +from typing import Sequence + +import numpy as np + +from .cpcca import CPCCA, ComplexCPCCA, HilbertCPCCA + + +class CCA(CPCCA): + """Canonical Correlation Analysis (CCA). + + CCA seeks to find paris of coupled patterns that maximize the correlation [1]_ [2]_ . + + This method solves the following optimization problem: + + :math:`\\max_{q_x, q_y} \\left( q_x^T X^T Y q_y \\right)` + + subject to the constraints: + + :math:`q_x^T (X^TX) q_x = 1, \\quad q_y^T (Y^TY) q_y = 1` + + where :math:`X` and :math:`Y` are the input data matrices and :math:`q_x` + and :math:`q_y` are the corresponding pattern vectors. + + Parameters + ---------- + n_modes : int, default=2 + Number of modes to calculate. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is then computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. + compute : bool, default=True + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. + + + References + ---------- + .. [1] Bretherton, C., Smith, C., Wallace, J., 1992. An intercomparison of + methods for finding coupled patterns in climate data. Journal of climate + 5, 541–560. + .. [2] Wilks, D. S. Statistical Methods in the Atmospheric Sciences. + (Academic Press, 2019). + doi:https://doi.org/10.1016/B978-0-12-815823-4.00011-0. + + Examples + -------- + + Perform CCA on two datasets on a regular longitude-latitude grid: + + >>> model = CCA(n_modes=5, use_coslat=True) + >>> model.fit(X, Y, dim="time") + + """ + + def __init__( + self, + n_modes: int = 2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + compute: bool = True, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, + ): + CPCCA.__init__( + self, + n_modes=n_modes, + alpha=[0.0, 0.0], + standardize=standardize, + use_coslat=use_coslat, + check_nans=check_nans, + use_pca=use_pca, + n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, + compute=compute, + sample_name=sample_name, + feature_name=feature_name, + solver=solver, + random_state=random_state, + solver_kwargs=solver_kwargs, + ) + self.attrs.update({"model": "Canonical Correlation Analysis"}) + # Renove alpha from the inherited CPCCA serialization params because it is hard-coded for CCA + self._params.pop("alpha") + + +class ComplexCCA(ComplexCPCCA, CCA): + """Complex CCA. + + CCA applied to a complex-valued field obtained from a pair of variables such + as the zonal and meridional components, :math:`U` and :math:`V`, of the wind + field. Complex CCA analysis then maximizes the correlation between + two datasets of the form + + .. math:: + Z_x = U_x + iV_x + + and + + .. math:: + Z_y = U_y + iV_y + + into a set of complex-valued components and PC scores. + + + Parameters + ---------- + n_modes : int, default=2 + Number of modes to calculate. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is then computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. + compute : bool, default=True + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. + + Examples + -------- + + With two DataArrays `u_i` and `v_i` representing the zonal and meridional + components of the wind field for two different regions :math:`x` and + :math:`y`, construct + + >>> X = u_x + 1j * v_x + >>> Y = u_y + 1j * v_y + + and fit the Complex CCA model: + + >>> model = ComplexCCA(n_modes=5) + >>> model.fit(X, Y, "time") + + + """ + + def __init__( + self, + n_modes: int = 2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + compute: bool = True, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, + ): + ComplexCPCCA.__init__( + self, + n_modes=n_modes, + alpha=[0.0, 0.0], + standardize=standardize, + use_coslat=use_coslat, + check_nans=check_nans, + use_pca=use_pca, + n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, + compute=compute, + sample_name=sample_name, + feature_name=feature_name, + solver=solver, + random_state=random_state, + solver_kwargs=solver_kwargs, + ) + self.attrs.update({"model": "Complex CCA"}) + # Renove alpha from the inherited CPCCA serialization params because it is hard-coded for CCA + self._params.pop("alpha") + + +class HilbertCCA(HilbertCPCCA, ComplexCCA): + """Hilbert CCA. + + Hilbert CCA extends CCA by examining amplitude-phase relationships. It + augments the input data with its Hilbert transform, creating a + complex-valued field. + + This method solves the following optimization problem: + + :math:`\\max_{q_x, q_y} \\left( q_x^T X^T Y q_y \\right)` + + subject to the constraints: + + :math:`q_x^T (X^TX) q_x = 1, \\quad q_y^T (Y^TY) q_y = 1` + + where :math:`H` denotes the conjugate transpose and :math:`X` and :math:`Y` + are the augmented data matrices. + + An optional padding with exponentially decaying values can be applied prior + to the Hilbert transform in order to mitigate the impact of spectral + leakage. + + + Parameters + ---------- + n_modes : int, default=2 + Number of modes to calculate. + padding : Sequence[str] | str | None, default="exp" + Padding method for the Hilbert transform. Available options are: - None: + no padding - "exp": exponential decay + decay_factor : Sequence[float] | float, default=0.2 + Decay factor for the exponential padding. + standardize : Squence[bool] | bool, default=False + Whether to standardize the input data. Generally not recommended as + standardization can be managed by the degree of whitening. + use_coslat : Sequence[bool] | bool, default=False + For data on a longitude-latitude grid, whether to correct for varying + grid cell areas towards the poles by scaling each grid point with the + square root of the cosine of its latitude. + use_pca : Sequence[bool] | bool, default=False + Whether to preprocess each field individually by reducing dimensionality + through PCA. The cross-covariance matrix is computed in the reduced + principal component space. + n_pca_modes : Sequence[int | float | str] | int | float | str, default=0.999 + Number of modes to retain during PCA preprocessing step. If int, + specifies the exact number of modes; if float, specifies the fraction of + variance to retain; if "all", all modes are retained. + pca_init_rank_reduction : Sequence[float] | float, default=0.3 + Relevant when `use_pca=True` and `n_pca_modes` is a float. Specifies the + initial fraction of rank reduction for faster PCA computation via + randomized SVD. + check_nans : Sequence[bool] | bool, default=True + Whether to check for NaNs in the input data. Set to False for lazy model + evaluation. + compute : bool, default=True + Whether to compute the model elements eagerly. If True, the following + are computed sequentially: preprocessor scaler, optional NaN checks, SVD + decomposition, scores, and components. + random_state : numpy.random.Generator | int | None, default=None + Seed for the random number generator. + sample_name : str, default="sample" + Name for the new sample dimension. + feature_name : Sequence[str] | str, default="feature" + Name for the new feature dimension. + solver : {"auto", "full", "randomized"} + Solver to use for the SVD computation. + solver_kwargs : dict, default={} + Additional keyword arguments passed to the SVD solver function. + + + + Examples + -------- + >>> model = HilbertCCA(n_modes=5) + >>> model.fit(X, Y, "time") + + """ + + def __init__( + self, + n_modes: int = 2, + padding: Sequence[str] | str | None = "exp", + decay_factor: Sequence[float] | float = 0.2, + standardize: Sequence[bool] | bool = False, + use_coslat: Sequence[bool] | bool = False, + check_nans: Sequence[bool] | bool = True, + use_pca: Sequence[bool] | bool = True, + n_pca_modes: Sequence[float | int | str] | float | int | str = 0.999, + pca_init_rank_reduction: Sequence[float] | float = 0.3, + compute: bool = True, + sample_name: str = "sample", + feature_name: Sequence[str] | str = "feature", + solver: str = "auto", + random_state: np.random.Generator | int | None = None, + solver_kwargs: dict = {}, + ): + HilbertCPCCA.__init__( + self, + n_modes=n_modes, + alpha=[1.0, 1.0], + standardize=standardize, + use_coslat=use_coslat, + check_nans=check_nans, + use_pca=use_pca, + n_pca_modes=n_pca_modes, + pca_init_rank_reduction=pca_init_rank_reduction, + compute=compute, + sample_name=sample_name, + feature_name=feature_name, + solver=solver, + random_state=random_state, + solver_kwargs=solver_kwargs, + padding=padding, + decay_factor=decay_factor, + ) + self.attrs.update({"model": "Hilbert CCA"}) + # Renove alpha from the inherited CPCCA serialization params because it is hard-coded for CCA + self._params.pop("alpha")