geoelements · kks32 · Jun 28, 2024 · Jun 25, 2024 · Jun 25, 2024 · Jun 25, 2024
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -2,40 +2,17 @@ version: 2.0
 jobs:
   gns:
     docker:
-      - image: quay.io/geoelements/gns
+      - image: ghcr.io/geoelements/gns:config  # NOTE(review): presumably pushed by .github/workflows/container.yml; confirm the tag exists
     steps:
       - checkout
-      # GCC
       - run:
           name: Train & Test
           command: |
-            TMP_DIR="./gns-sample"
-            DATASET_NAME="WaterDropSample"
-            git clone https://github.com/geoelements/gns-sample
-            mkdir -p ${TMP_DIR}/${DATASET_NAME}/models/
-            mkdir -p ${TMP_DIR}/${DATASET_NAME}/rollout/
-            DATA_PATH="${TMP_DIR}/${DATASET_NAME}/dataset/"
-            MODEL_PATH="${TMP_DIR}/${DATASET_NAME}/models/"
-            ROLLOUT_PATH="${TMP_DIR}/${DATASET_NAME}/rollout/"
-            conda install -c anaconda absl-py -y
-            conda install -c conda-forge numpy -y
-            conda install -c conda-forge dm-tree -y
-            conda install -c conda-forge matplotlib-base -y
-            conda install -c conda-forge pyevtk -y
-            conda install -c conda-forge pytest -y
-            conda install -c conda-forge tensorboard -y
+            git clone https://github.com/geoelements/gns-sample ../gns-sample
             pytest test/
-            echo "Test paths: ${DATA_PATH} ${MODEL_PATH}"
-            ls
-            python -m gns.train --data_path=${DATA_PATH} --model_path=${MODEL_PATH} --ntraining_steps=10
-            echo "Predict rollout"
-            ls ./gns-sample/WaterDropSample/models/
-
-      - run: 
-          name: Black check 
-          command: |
-            conda install -c conda-forge black -y
-            black --check .
+            python -m gns.train
+            ls ../gns-sample/WaterDropSample/models/
+
 
 workflows:
     version: 2

diff --git a/.github/workflows/container.yml b/.github/workflows/container.yml
@@ -0,0 +1,43 @@
+name: Build and Push to GHCR
+
+on: 
+  push:
+    paths:
+      - Dockerfile
+      - requirements.txt
+
+env:
+  REGISTRY: ghcr.io
+  IMAGE_NAME: ${{ github.repository }}
+
+jobs:
+  build-and-push:
+    runs-on: ubuntu-latest
+    permissions:
+      contents: read
+      packages: write  # lets GITHUB_TOKEN push the image to the registry
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to the Container registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}  # ghcr.io, from the workflow-level env
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta  # referenced below as steps.meta.outputs.*
+        uses: docker/metadata-action@v5
+        with:
+          images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@v5
+        with:
+          context: .  # build from the repository root
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/.github/workflows/train.yml b/.github/workflows/train.yml
@@ -0,0 +1,30 @@
+# Runs the unit tests and a default-config training run on every push.
+name: GNS Train and Test
+
+on:
+  push:
+
+jobs:
+  gns:
+    runs-on: ubuntu-latest
+    container:
+      image: ghcr.io/geoelements/gns:config
+
+    steps:
+    - name: Checkout repository
+      uses: actions/checkout@v4
+
+    # Clone the sample data beside the checkout before the tests run,
+    # mirroring the step order of the CircleCI job.
+    - name: Fetch sample dataset
+      run: |
+        git clone https://github.com/geoelements/gns-sample ../gns-sample
+
+    - name: PyTest
+      run: |
+        pytest test/
+
+    # No CLI overrides: gns.train is invoked with its default configuration.
+    - name: Train GNS
+      run: |
+        python -m gns.train
diff --git a/.gitignore b/.gitignore
@@ -8,6 +8,7 @@
 scratch
 log
 **/logs/*
+outputs/*
 
 # Byte-compiled / optimized / DLL files
 __pycache__/

diff --git a/Dockerfile b/Dockerfile
@@ -1,10 +1,25 @@
-FROM continuumio/anaconda3:latest
-RUN conda install pytorch==1.12.1 torchvision==0.13.1 torchaudio==0.12.1 cpuonly -c pytorch
-RUN conda install pyg -c pyg
-RUN conda install -c anaconda absl-py 
-RUN conda install -c conda-forge numpy
-RUN conda install -c conda-forge dm-tree
-RUN conda install -c conda-forge matplotlib-base
-RUN conda install -c conda-forge pyevtk
-WORKDIR /home/gns
-RUN /bin/bash
+FROM python:3.11
+
+WORKDIR /app
+
+COPY requirements.txt .
+# torch is pinned to 2.3.0 because the PyG extension wheels below come from the torch-2.3.0+cpu index.
+RUN pip3 install --upgrade pip && \
+    pip3 install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cpu && \
+    pip3 install torch_geometric && \
+    pip3 install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.3.0+cpu.html && \
+    pip3 install -r requirements.txt
+
+ENV PYTHONPATH=/app
+
+# Add Python path to PATH
+ENV PATH="/usr/local/bin:${PATH}"
+
+# Entrypoint: export PYTHONPATH/PATH, then exec the given command. printf is used because echo's "\n" handling depends on /bin/sh.
+RUN printf '%s\n' '#!/bin/bash' \
+    'export PYTHONPATH=/app' \
+    'export PATH="/usr/local/bin:$PATH"' \
+    'exec "$@"' > /entrypoint.sh && chmod +x /entrypoint.sh
+
+ENTRYPOINT ["/entrypoint.sh"]
+CMD ["/bin/bash"]
diff --git a/README.md b/README.md
@@ -18,7 +18,7 @@ MeshNet is a scalable surrogate simulator for any mesh-based models like Finite
 > Training GNS/MeshNet on simulation data
 ```shell
 # For particulate domain,
-python3 -m gns.train --data_path="<input-training-data-path>" --model_path="<path-to-load-save-model-file>" --ntraining_steps=100
+python3 -m gns.train mode="train" --config-path ./ --config-name config.yaml
 # For mesh-based domain,
 python3 -m meshnet.train --data_path="<input-training-data-path>" --model_path="<path-to-load-save-model-file>" --ntraining_steps=100
 ```
@@ -29,15 +29,15 @@ To resume training specify `model_file` and `train_state_file`:
 
 ```shell
 # For particulate domain,
-python3 -m gns.train --data_path="<input-training-data-path>" --model_path="<path-to-load-save-model-file>" --model_file="model.pt" --train_state_file="train_state.pt" --ntraining_steps=100
+python3 -m gns.train mode="train" training.resume=True
 # For mesh-based domain,
 python3 -m meshnet.train --data_path="<input-training-data-path>" --model_path="<path-to-load-save-model-file>" --model_file="model.pt" --train_state_file="train_state.pt" --ntraining_steps=100
 ```
 
 > Rollout prediction
 ```shell
 # For particulate domain,
-python3 -m gns.train --mode="rollout" --data_path="<input-data-path>" --model_path="<path-to-load-save-model-file>" --output_path="<path-to-save-output>" --model_file="model.pt" --train_state_file="train_state.pt"
+python3 -m gns.train mode="rollout"
 # For mesh-based domain,
 python3 -m meshnet.train --mode="rollout" --data_path="<input-data-path>" --model_path="<path-to-load-save-model-file>" --output_path="<path-to-save-output>" --model_file="model.pt" --train_state_file="train_state.pt"
 ```
@@ -61,91 +61,64 @@ In mesh-based domain, the renderer writes `.gif` animation.
 > Meshnet GNS prediction of cylinder flow after training for 1 million steps.
 
 
-## Command line arguments details
+## Configuration file
 <details>
-<summary>`train.py` in GNS (particulate domain) </summary>
-
-**mode (Enum)** 
-
-This flag is used to set the operation mode for the script. It can take one of three values; 'train', 'valid', or 'rollout'.
-
-**batch_size (Integer)**
-
-Batch size for training.
-
-**noise_std (Float)** 
-
-Standard deviation of the noise when training.
-
-**data_path (String)** 
-
-Specifies the directory path where the dataset is located. 
-The dataset is expected to be in a specific format (e.g., .npz files).
-It should contain `metadata.json`.
-If `--mode` is training, the directory should contain `train.npz`.
-If `--mode` is testing (rollout), the directory should contain `test.npz`.
-If `--mode` is valid, the directory should contain `valid.npz`.
-
-**model_path (String)** 
-
-The directory path where the trained model checkpoints are saved during training or loaded from during validation/rollout.
-
-**output_path (String)** 
-
-Defines the directory where the outputs (e.g., rollouts) are saved, 
-when the `--mode` is set to rollout.
-This is particularly relevant in the rollout mode where the predictions of the model are stored.
-
-**output_filename (String)** 
-
-Base filename to use when saving outputs during rollout.
-Default is "rollout", and the output will be saved as `rollout.pkl` in `output_path`. 
-It is not intended to include the file extension.
-
-**model_file (String)** 
-
-The filename of the model checkpoint to load for validation or rollout (e.g., model-10000.pt). 
-It supports a special value "latest" to automatically select the newest checkpoint file. 
-This flexibility facilitates the evaluation of models at different stages of training.
-
-**train_state_file (String)** 
-
-Similar to model_file, but for loading the training state (e.g., optimizer state).
-It supports a special value "latest" to automatically select the newest checkpoint file. 
-(e.g., training_state-10000.pt)
-
-**ntraining_steps (Integer)** 
-
-The total number of training steps to execute before stopping.
-
-**nsave_steps (Integer)** 
-
-Interval at which the model and training state are saved.
-
-**lr_init (Float)** 
-
-Initial learning rate.
-
-**lr_decay (Float)** 
-
-How much the learning rate should decay over time.
-
-**lr_decay_steps (Integer)** 
-
-Steps at which learning rate should decay.
-
-**cuda_device_number (Integer)** 
-
-Base CUDA device (zero indexed).
-Default is None so default CUDA device will be used.
-
-**n_gpus (Integer)** 
-
-Number of GPUs to use for training.
-
-**tensorboard_log_dir (String)**
-
-Path to log info on training and validation and visualize via tensorboard.
+<summary>GNS (particulate domain) </summary>
+
+```yaml
+defaults:
+  - _self_
+  - override hydra/hydra_logging: disabled  
+  - override hydra/job_logging: disabled  
+
+hydra:
+  output_subdir: null  
+  run:
+    dir: .
+
+# Top-level configuration
+mode: train
+
+# Data configuration
+data:
+  path: ../gns-sample/WaterDropSample/dataset/
+  batch_size: 2
+  noise_std: 6.7e-4
+  input_sequence_length: 6
+  num_particle_types: 9
+  kinematic_particle_id: 3
+
+# Model configuration
+model:
+  path: ../gns-sample/WaterDropSample/models/
+  file: null
+  train_state_file: null
+
+# Output configuration
+output:
+  path: ../gns-sample/WaterDropSample/rollouts/
+  filename: rollout
+
+# Training configuration
+training:
+  steps: 2000
+  validation_interval: null
+  save_steps: 500
+  resume: False
+  learning_rate:
+    initial: 1e-4
+    decay: 0.1
+    decay_steps: 50000
+
+# Hardware configuration
+hardware:
+  cuda_device_number: null
+  n_gpus: 1
+
+# Logging configuration
+logging:
+  tensorboard_dir: logs/
+```
 
 </details>
 
@@ -254,7 +227,7 @@ The dataset is shared on [DesignSafe DataDepot](https://doi.org/10.17603/ds2-fzg
 
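-GNS uses [pytorch geometric](https://www.pyg.org/) and [CUDA](https://developer.nvidia.com/cuda-downloads). These packages have specific requirements, please see [PyG installation]((https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) for details. 
+GNS uses [pytorch geometric](https://www.pyg.org/) and [CUDA](https://developer.nvidia.com/cuda-downloads). These packages have specific requirements; please see [PyG installation](https://pytorch-geometric.readthedocs.io/en/latest/notes/installation.html) for details. 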
 
-> CPU-only installation on Linux
+> CPU-only installation on Linux (Conda)
 
 ```shell
 conda install -y pytorch torchvision torchaudio cpuonly -c pytorch