
debugging on NERSC
vganapati committed Jun 14, 2023
1 parent 9abb157 commit 60020cf
Showing 3 changed files with 14 additions and 14 deletions.
README.md (4 changes: 2 additions & 2 deletions)
@@ -107,7 +107,7 @@ Run the following commands to use the conda environment and start an interactive
```
module load python
conda activate PINN
-salloc -N 1 --time=60 -C gpu -A m3562_g --qos=interactive --ntasks-per-gpu=8 --cpus-per-task=4
+salloc -N 1 --time=60 -C gpu -A m3562_g --qos=interactive --ntasks-per-gpu=1 --cpus-per-task=32
```

Navigate to the directory and run the code:
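
The salloc change above swaps many small tasks per GPU for one task per GPU with more CPUs each. As a sanity check (a sketch, not part of the commit; the figure of 4 GPUs per node is an assumption about the target Perlmutter GPU nodes):

```python
# Hypothetical check that both salloc requests imply the same CPU total.
GPUS_PER_NODE = 4  # assumed for a NERSC Perlmutter GPU node

def cpus_per_node(ntasks_per_gpu, cpus_per_task):
    """Total CPUs implied by one node's worth of tasks."""
    return GPUS_PER_NODE * ntasks_per_gpu * cpus_per_task

print(cpus_per_node(8, 4))   # old request: 4 * 8 * 4  = 128 CPUs
print(cpus_per_node(1, 32))  # new request: 4 * 1 * 32 = 128 CPUs
```

The CPU budget is unchanged; only the task layout differs, which matters when each rank is meant to drive exactly one GPU.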
@@ -129,7 +129,7 @@ Navigate to the output directory and run the code:
cd $SCRATCH/output_PINN
sbatch $SCRATCH/PINN/slurm_train.sh
```
-Submitted batch job 10209497
+Submitted batch job 10212188


## Resources
main.py (14 changes: 7 additions & 7 deletions)
Expand Up @@ -108,7 +108,7 @@ def get_device(args):
if torch.cuda.is_available()
else "cpu")

print(f"Using {device} device")
print("Using " + str(device) + " device")
return device

def get_rank(use_dist):
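
Only the tail of get_device appears in the hunk above. A minimal self-contained sketch of the standard PyTorch pattern it follows (a reconstruction under assumptions, not the repository's exact code; args is kept only to match the signature shown):

```python
import torch

def get_device(args):
    # Prefer CUDA when available, otherwise fall back to the CPU.
    device = ("cuda"
              if torch.cuda.is_available()
              else "cpu")
    print("Using " + str(device) + " device")
    return device
```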
@@ -178,15 +178,15 @@ def run(rank, world_size, args,
dtype = torch.float,
):
if args.use_dist:
print(f"Running on rank {rank}.")
print("Running on rank " + str(rank) + ".")

train_set, train_set_2, test_set, bsz = partition_dataset(args, world_size)

# Force num_basis = 1 if not using pde-cl
if not(args.use_pde_cl):
args.num_basis = 1

print(f"Using {args.num_basis} basis functions")
print("Using " + str(args.num_basis) + " basis functions")

model = NeuralNetwork(args.num_basis, args.two_d)
print(model)
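
partition_dataset itself is not shown in this diff. A common way to shard a dataset across world_size ranks, which this sketch assumes rather than reproduces from the repository, is PyTorch's DistributedSampler:

```python
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def partition_sketch(dataset, rank, world_size, global_batch_size):
    # Each rank draws a disjoint shard; dividing the batch size keeps
    # the global batch constant as world_size grows.
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    bsz = global_batch_size // world_size
    return DataLoader(dataset, batch_size=bsz, sampler=sampler), bsz
```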
@@ -232,7 +232,7 @@ def loss_fn(data, u_scatter, data_2):
test_loss_vec = []
start = time.time()
for t in range(args.epochs):
print(f"Epoch {t+1}\n-------------------------------")
print("Epoch " + str(t+1) + "\n-------------------------------")
train(train_set, train_set_2, model, loss_fn, optimizer, dtype, args.jitter, device, args.use_dist)
test_loss = test(test_set, model, loss_fn, device)
test_loss_vec.append(test_loss)
@@ -246,15 +246,15 @@ def loss_fn(data, u_scatter, data_2):
if args.use_dist:
cleanup()

-def visualize(args):
+def visualize(args, num_devices):
"""
Visualize the PINN with list of evaluation coordinates
Not yet implemented with distributed computing
"""
device = get_device(args)
eval_data, lengths = create_data(args.eval_data_x_start, args.eval_data_x_end,
args.eval_data_x_step, args.two_d)
-eval_dataloader = DataLoader(eval_data, batch_size=args.batch_size, shuffle=False)
+eval_dataloader = DataLoader(eval_data, batch_size=args.batch_size//num_devices, shuffle=False)

# Load model
model = NeuralNetwork(args.num_basis, args.two_d).to(device)
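
The new num_devices parameter makes the evaluation loader divide the global batch size the same way training presumably does. With this commit's configuration the arithmetic works out as follows (an illustration using the values from slurm_train.sh below; integer division means the device count should divide the batch size evenly):

```python
# Illustration only: per-device batch size implied by this commit's settings.
batch_size = 512   # BATCH_SIZE exported in slurm_train.sh
num_devices = 4    # world_size: 1 node * 4 GPUs * 1 task per GPU
print(batch_size // num_devices)  # 128 samples per device
```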
@@ -436,6 +436,6 @@ def loss_fn(data, u_scatter, data_2):
run(rank, world_size, args,
)

-visualize(args)
+visualize(args, world_size)
end = time.time()
print("Time to train (s): " + str(end-start))
slurm_train.sh (10 changes: 5 additions & 5 deletions)
@@ -1,25 +1,25 @@
#!/bin/bash

-#SBATCH -N 4 # Number of nodes
+#SBATCH -N 1 # Number of nodes
#SBATCH -J PINN # job name
#SBATCH -L SCRATCH # job requires SCRATCH files
#SBATCH -A m2859_g # allocation
#SBATCH -C gpu
#SBATCH -q regular
-#SBATCH -t 00:10:00
+#SBATCH -t 00:05:00
#SBATCH --gpus-per-node=4
#SBATCH --ntasks-per-gpu=1
-#SBATCH --gpus 16
+#SBATCH --gpus 4
#SBATCH -o %j.out
#SBATCH -e %j.err

export MASTER_ADDR=$(hostname)
-export BATCH_SIZE=8192
+export BATCH_SIZE=512
export SCRATCH_FOLDER=$SCRATCH/output_PINN/$SLURM_JOB_ID
mkdir -p $SCRATCH_FOLDER; cd $SCRATCH_FOLDER

echo "jobstart $(date)";pwd

-srun -n 16 -c 32 python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 5
+srun -n 4 -c 32 python $SCRATCH/PINN/main.py --upc --2d --dist --bs $BATCH_SIZE --epochs 5

echo "jobend $(date)";pwd
