DataWaveProject · surbhigoel77 · Feb 20, 2024 · Feb 22, 2024 · Feb 22, 2024 · Feb 26, 2024
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,7 @@
+repos:
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.1.0
+    hooks:
+      - id: ruff
+        args: [--fix]  # This will auto-fix issues if possible
+
diff --git a/README.md b/README.md
@@ -48,13 +48,23 @@ The model is trained using the script `train.py` using the demo data. The optimi
 The `Demodata` folder contains the demo data used to train and test the model
 
 The `newCAM_emulation` folder contains the code that is required to load data, train the model and make predictions which is structured as following:
-> `train.py` - train the model
 
-> `NN-pred.py` - predict the GWD using the trained model
-
-> `loaddata.py` - load the data and reshape it to the NN input
+> `loaddata.py` - loads the data from the source .nc files and normalises it before feeding it to the neural network.
 
-> `model.py` - define the NN model
+> `model.py` - defines the NN class and the early stopping mechanism.
+
+> `train.py` - trains the model for a given number of epochs using the training and validation loops.
+
+> `main.py` - uses the above three modules to perform the following steps in sequence:
+1. Read the features list (would vary depending on the GW source, currently is convection)
+2. Take information on the data, such as ilev, the number of variables varying across vertical levels, etc.
+3. Use `loaddata.py` to load data for the variables in the feature list defined earlier, normalise it, build `xtrain` and `ytrain` for the model using a data loader, and finally create a custom dataset for easy iteration over `xtrain` and `ytrain`.
+4. Take model hyperparameters such as learning rate, epochs and hidden layers, and pass them to `model.py`.
+5. Also take the loss function, optimiser and early stopping parameters, and pass them to `train.py` along with the defined model and the custom dataset.
+6. Train the model and save the weights in the  
+`trained_models` folder. 
+7. The saved model can be loaded and tested on any dataset here.
+
 
 ## Usage Instructions
 To use the repository, following steps are required:

diff --git a/newCAM_emulation/Model.py b/newCAM_emulation/Model.py
@@ -1,158 +1,58 @@
 """Neural Network model for the CAM-EM."""
 
-import netCDF4 as nc
 import numpy as np
-import scipy.stats as st
 import torch
-import xarray as xr
 from torch import nn
-from torch.nn.utils import prune
-from torch.utils.data import DataLoader, Dataset
 
+# ruff: noqa: PLR0913
 
-# Required for feeding the data iinto NN.
-class myDataset(Dataset):
-    """
-    Dataset class for loading features and labels.
-
-    Args:
-        X (numpy.ndarray): Input features.
-        Y (numpy.ndarray): Corresponding labels.
-    """
-
-    def __init__(self, X, Y):
-        """Create an instance of myDataset class."""
-        self.features = torch.tensor(X, dtype=torch.float64)
-        self.labels = torch.tensor(Y, dtype=torch.float64)
-
-    def __len__(self):
-        """Return the number of samples in the dataset."""
-        return len(self.features.T)
-
-    def __getitem__(self, idx):
-        """Return a sample from the dataset."""
-        feature = self.features[:, idx]
-        label = self.labels[:, idx]
-
-        return feature, label
 
-
-# The NN model.
 class FullyConnected(nn.Module):
     """
     Fully connected neural network model.
 
-    The model consists of multiple fully connected layers with SiLU activation function.
-
     Attributes
     ----------
-        linear_stack (torch.nn.Sequential): Sequential container for layers.
+    linear_stack : nn.Sequential
+        Sequential container of linear layers and activation functions.
     """
 
-    def __init__(self):
-        """Create an instance of FullyConnected NN model."""
+    def __init__(
+        self, ilev=93, in_ver=8, in_nover=4, out_ver=2, hidden_layers=8, hidden_size=500
+    ):
         super(FullyConnected, self).__init__()
-        ilev = 93
-
-        self.linear_stack = nn.Sequential(
-            nn.Linear(8 * ilev + 4, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 500, dtype=torch.float64),
-            nn.SiLU(),
-            nn.Linear(500, 2 * ilev, dtype=torch.float64),
-        )
+        self.ilev = ilev
+        self.in_ver = in_ver
+        self.in_nover = in_nover
+        self.out_ver = out_ver
+        self.hidden_layers = hidden_layers
+        self.hidden_size = hidden_size
+
+        layers = []
+
+        input_size = in_ver * ilev + in_nover
+
+        # Build `hidden_layers` identical Linear + SiLU blocks in a loop; only the
+        # first block takes the raw input width, the rest map hidden_size -> hidden_size.
+        for _ in range(hidden_layers):
+            layers.append(nn.Linear(input_size, hidden_size, dtype=torch.float64))
+            layers.append(nn.SiLU())
+            input_size = hidden_size
+        layers.append(nn.Linear(hidden_size, out_ver * ilev, dtype=torch.float64))
+        self.linear_stack = nn.Sequential(*layers)
 
     def forward(self, X):
         """
         Forward pass through the network.
 
-        Args:
-            X (torch.Tensor): Input tensor.
+        Parameters
+        ----------
+        X : torch.Tensor
+            Input tensor.
 
         Returns
         -------
-            torch.Tensor: Output tensor.
+        torch.Tensor
+            Output tensor.
         """
         return self.linear_stack(X)
-
-
-# training loop
-def train_loop(dataloader, model, loss_fn, optimizer):
-    """
-    Training loop.
-
-    Args:
-        dataloader (DataLoader): DataLoader for training data.
-        model (nn.Module): Neural network model.
-        loss_fn (torch.nn.Module): Loss function.
-        optimizer (torch.optim.Optimizer): Optimizer.
-
-    Returns
-    -------
-        float: Average training loss.
-    """
-    size = len(dataloader.dataset)
-    avg_loss = 0
-    for batch, (X, Y) in enumerate(dataloader):
-        # Compute prediction and loss
-        pred = model(X)
-        loss = loss_fn(pred, Y)
-
-        # Backpropagation
-        optimizer.zero_grad(set_to_none=True)
-        loss.backward()
-        optimizer.step()
-
-        with torch.no_grad():
-            avg_loss += loss.item()
-
-    avg_loss /= len(dataloader)
-
-    return avg_loss
-
-
-# validating loop
-def val_loop(dataloader, model, loss_fn):
-    """
-    Validation loop.
-
-    Args:
-        dataloader (DataLoader): DataLoader for validation data.
-        model (nn.Module): Neural network model.
-        loss_fn (torch.nn.Module): Loss function.
-
-    Returns
-    -------
-        float: Average validation loss.
-    """
-    avg_loss = 0
-    with torch.no_grad():
-        for batch, (X, Y) in enumerate(dataloader):
-            # Compute prediction and loss
-            pred = model(X)
-            loss = loss_fn(pred, Y)
-            avg_loss += loss.item()
-
-    avg_loss /= len(dataloader)
-
-    return avg_loss
diff --git a/newCAM_emulation/NN_pred.py b/newCAM_emulation/NN_pred.py