From ff71fad9103ecf912f60119d60dcfaed580b929b Mon Sep 17 00:00:00 2001
From: Seungbaek Hong
Date: Tue, 19 Nov 2024 19:55:22 +0900
Subject: [PATCH] [trivial] bug fix on recorder_v2.py script

Currently, errors occur when creating golden data for most of the
`genModelTests_v2.py` unit tests, except for the mixed-precision ones.
Therefore, I've restored the previous recording function as the
'default' type for non-mixed-precision unit tests, while the newly
added mixed-precision unit tests are classified as the 'mixed' type.

Additionally, the Python formatter has been applied and some bugs
(cpu<->cuda tensor mismatch, etc.) have been fixed.

Signed-off-by: Seungbaek Hong
---
 test/input_gen/genModelTests_v2.py |  2 +
 test/input_gen/recorder_v2.py      | 96 +++++++++++++++++-------------
 2 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/test/input_gen/genModelTests_v2.py b/test/input_gen/genModelTests_v2.py
index b9b03cebe..86d2c808b 100644
--- a/test/input_gen/genModelTests_v2.py
+++ b/test/input_gen/genModelTests_v2.py
@@ -801,6 +801,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 10)],
         name="fc_mixed_training",
         optimizer=fc_mixed_training.getOptimizer(),
+        type="mixed",
     )

     inspect_file("fc_mixed_training.nnmodelgolden")
@@ -846,6 +847,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 1)],
         name="fc_mixed_training_nan_sgd",
         optimizer=fc_mixed_training_nan_sgd.getOptimizer(),
+        type="mixed",
     )

 # Function to check the created golden test file
diff --git a/test/input_gen/recorder_v2.py b/test/input_gen/recorder_v2.py
index 71bed1b5a..868b55d90 100644
--- a/test/input_gen/recorder_v2.py
+++ b/test/input_gen/recorder_v2.py
@@ -17,9 +17,9 @@
 from transLayer_v2 import params_translated


-if torch.__version__ != "1.9.1":
+if torch.__version__ != "2.4":
     print(
-        "the script was tested at version 1.9.1 it might not work if torch version is different"
+        "the script was tested at version 2.4 it might not work if torch version is different"
     )

 SEED = 1234
@@ -92,6 +92,7 @@ def record_v2(
     input_dtype=None,
     input_label_reader=None,
     optimizer=None,
+    type="default",
 ):
     ## file format is as below
     # [ ...]
@@ -132,7 +133,7 @@ def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
         model_ = model.cuda()

         print(inputs[0], " inputs inside")
-        output = model_(inputs[0], labels[0])
+        output = model_(inputs[0].cuda(), labels[0].cuda())

         print("model output type: ", output.dtype)
@@ -145,51 +146,64 @@ def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
         print("Gradient ---------------")
         for param in model_.parameters():
             print(param.grad)
-            mask = torch.isnan(param.grad) or torch.isinf(param.grad)
-            check_nan = mask.int()
-            if check_nan.sum().item():
-                is_nan = True
-            else:
-                is_nan = False
+            is_nan = torch.any(torch.isnan(param.grad) | torch.isinf(param.grad))
+            if is_nan:
+                print("nan or inf detected in gradient")
+                break

         if not is_nan:
             print("------------------------------- not nan")
         return output, is_nan

-    with open(file_name, "wb") as f:
-        # write number of iterations
-        print("iteration : ", iteration)
-        np.array([iteration], dtype="int32").tofile(f)
-
-        write_fn = _get_writer_mixed(f)
-        for i in range(iteration):
-            if input_label_reader != None:
-                inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
-            else:
-                inputs = _rand_like(
-                    input_dims, dtype=input_dtype if input_dtype is not None else float
-                )
-                labels = _rand_like(label_dims, dtype=float)
-            print("inputs ==============")
-            write_fn(inputs, "int32", "float32")
-            print("labels ==============")
-            write_fn(labels, "int32", "float32")
-            is_nan = True
-            print("=========================== ", i)
-            scaler = amp.GradScaler()
-            print("weights ==============")
-            write_fn(list(t for _, t in params_translated(model)), "int16", "float16")
-            print("\n\n")
-            while is_nan:
-                print("before is_nan_", is_nan)
-                output, is_nan_ = record_iteration_with_amp(
-                    write_fn, inputs, labels, is_nan, scaler
-                )
-                is_nan = is_nan_
-                print("after is_nan_", is_nan)
-            scaler.step(optimizer)
-            scaler.update()
+    if type == "default":
+        with open(file_name, "wb") as f:
+            # write number of iterations
+            np.array([iteration], dtype="int32").tofile(f)
+
+            write_fn = _get_writer(f)
+            for _ in range(iteration):
+                record_iteration(write_fn)
+
+    elif type == "mixed":
+        with open(file_name, "wb") as f:
+            # write number of iterations
+            print("iteration : ", iteration)
+            np.array([iteration], dtype="int32").tofile(f)
+
+            write_fn = _get_writer_mixed(f)
+            for i in range(iteration):
+                if input_label_reader != None:
+                    inputs, labels = input_label_reader(
+                        input_dims, label_dims, input_dtype
+                    )
+                else:
+                    inputs = _rand_like(
+                        input_dims,
+                        dtype=input_dtype if input_dtype is not None else float,
+                    )
+                    labels = _rand_like(label_dims, dtype=float)
+                print("inputs ==============")
+                write_fn(inputs, "int32", "float32")
+                print("labels ==============")
+                write_fn(labels, "int32", "float32")
+                is_nan = True
+                print("=========================== ", i)
+                scaler = amp.GradScaler()
+                print("weights ==============")
+                write_fn(
+                    list(t for _, t in params_translated(model)), "int16", "float16"
+                )
+                print("\n\n")
+                while is_nan:
+                    print("before is_nan_", is_nan)
+                    output, is_nan_ = record_iteration_with_amp(
+                        write_fn, inputs, labels, is_nan, scaler
+                    )
+                    is_nan = is_nan_
+                    print("after is_nan_", is_nan)
+                scaler.step(optimizer)
+                scaler.update()


 ##
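For reviewers, a minimal standalone sketch of the gradient NaN/Inf check that the last recorder_v2.py hunk rewrites. It is not part of the patch, and the tensor value below is made up purely for illustration:

    import torch

    # Stand-in for param.grad after backward(); contains a NaN on purpose.
    grad = torch.tensor([0.5, float("nan"), 1.0])

    # Old form: `torch.isnan(grad) or torch.isinf(grad)` calls bool() on a
    # multi-element tensor, which raises "Boolean value of Tensor with more
    # than one element is ambiguous", so the check could crash instead of
    # flagging bad gradients.

    # Patched form: combine the masks element-wise with `|` and reduce with any().
    is_nan = torch.any(torch.isnan(grad) | torch.isinf(grad))
    print(bool(is_nan))  # True -> the 'mixed' path re-runs the AMP iteration

Once the gradients come out finite, the while loop in the 'mixed' path exits and scaler.step(optimizer) / scaler.update() are applied, as in the hunk above.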