[trivial] bug fix on recorder_v2.py script
Currently, errors occur when creating golden data for most of the `genModelTests_v2.py` unit tests, except for the mixed-precision ones. I've therefore restored the previous function as the 'default' type for non-mixed-precision unit tests, while the newly added mixed-precision unit tests are classified as the 'mixed' type.
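
For illustration, a minimal sketch of how a test opts into each path; the models, dims, and names below are placeholders, not tests from the suite:

```python
# Hypothetical registrations mirroring the calls in genModelTests_v2.py.

# Non-mixed-precision test: omit `type`, which now falls back to
# "default" and uses the restored recording path.
record_v2(
    fc_plain,
    iteration=2,
    input_dims=[(1, 3)],
    label_dims=[(1, 10)],
    name="fc_plain_example",
)

# Mixed-precision test: pass type="mixed" to route recording through
# the AMP-based path.
record_v2(
    fc_mixed,
    iteration=2,
    input_dims=[(1, 3)],
    label_dims=[(1, 10)],
    name="fc_mixed_example",
    optimizer=fc_mixed.getOptimizer(),
    type="mixed",
)
```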

Additionally, a Python formatter has been applied and some bugs (CPU<->CUDA tensor mismatch, a broken NaN/Inf gradient check, etc.) have been fixed.
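
The two fixes are easy to reproduce in isolation. A minimal sketch with made-up tensors, assuming a CUDA device is available:

```python
import torch

# Fix 1: CPU<->CUDA mismatch. A CUDA model cannot consume CPU tensors,
# so inputs are moved to the model's device before the forward pass.
model = torch.nn.Linear(4, 2).cuda()
x = torch.randn(1, 4)   # created on the CPU
y = model(x.cuda())     # .cuda() avoids the device-mismatch error

# Fix 2: NaN/Inf detection. Python's `or` calls bool() on a tensor, which
# raises "Boolean value of Tensor with more than one element is ambiguous"
# for multi-element gradients. Elementwise `|` plus torch.any() is correct.
grad = torch.tensor([0.1, float("nan"), float("inf")])
is_nan = torch.any(torch.isnan(grad) | torch.isinf(grad))
print(is_nan.item())    # True
```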

Signed-off-by: Seungbaek Hong <[email protected]>
baek2sm authored and jijoongmoon committed Nov 26, 2024
1 parent dc0122a commit ff71fad
Showing 2 changed files with 57 additions and 41 deletions.
test/input_gen/genModelTests_v2.py (2 changes: 2 additions, 0 deletions)
@@ -801,6 +801,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 10)],
         name="fc_mixed_training",
         optimizer=fc_mixed_training.getOptimizer(),
+        type="mixed",
     )

     inspect_file("fc_mixed_training.nnmodelgolden")
@@ -846,6 +847,7 @@ def forward(self, inputs, labels):
         label_dims=[(1, 1)],
         name="fc_mixed_training_nan_sgd",
         optimizer=fc_mixed_training_nan_sgd.getOptimizer(),
+        type="mixed",
     )

     # Function to check the created golden test file
test/input_gen/recorder_v2.py (96 changes: 55 additions, 41 deletions)
@@ -17,9 +17,9 @@

 from transLayer_v2 import params_translated

-if torch.__version__ != "1.9.1":
+if torch.__version__ != "2.4":
     print(
-        "the script was tested at version 1.9.1 it might not work if torch version is different"
+        "the script was tested at version 2.4 it might not work if torch version is different"
     )

 SEED = 1234
@@ -92,6 +92,7 @@ def record_v2(
     input_dtype=None,
     input_label_reader=None,
     optimizer=None,
+    type="default",
 ):
     ## file format is as below
     # [<number of iteration(int)> <Iteration> <Iteration>...<Iteration>]
@@ -132,7 +133,7 @@ def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
         model_ = model.cuda()

         print(inputs[0], " inputs inside")
-        output = model_(inputs[0], labels[0])
+        output = model_(inputs[0].cuda(), labels[0].cuda())

         print("model output type: ", output.dtype)

@@ -145,51 +146,64 @@ def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
print("Gradient ---------------")
for param in model_.parameters():
print(param.grad)
mask = torch.isnan(param.grad) or torch.isinf(param.grad)
check_nan = mask.int()
if check_nan.sum().item():
is_nan = True
else:
is_nan = False
is_nan = torch.any(torch.isnan(param.grad) | torch.isinf(param.grad))
if is_nan:
print("nan or inf detected in gradient")
break

if not is_nan:
print("------------------------------- not nan")
write_fn(output, "int32", "float32")
return output, is_nan

with open(file_name, "wb") as f:
# write number of iterations
print("iteration : ", iteration)
np.array([iteration], dtype="int32").tofile(f)

write_fn = _get_writer_mixed(f)
for i in range(iteration):
if input_label_reader != None:
inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
else:
inputs = _rand_like(
input_dims, dtype=input_dtype if input_dtype is not None else float
)
labels = _rand_like(label_dims, dtype=float)
print("inputs ==============")
write_fn(inputs, "int32", "float32")
print("labels ==============")
write_fn(labels, "int32", "float32")
is_nan = True
print("=========================== ", i)
scaler = amp.GradScaler()
print("weights ==============")
write_fn(list(t for _, t in params_translated(model)), "int16", "float16")
print("\n\n")
while is_nan:
print("before is_nan_", is_nan)
output, is_nan_ = record_iteration_with_amp(
write_fn, inputs, labels, is_nan, scaler
if type == "default":
with open(file_name, "wb") as f:
# write number of iterations
np.array([iteration], dtype="int32").tofile(f)

write_fn = _get_writer(f)
for _ in range(iteration):
record_iteration(write_fn)

elif type == "mixed":
with open(file_name, "wb") as f:
# write number of iterations
print("iteration : ", iteration)
np.array([iteration], dtype="int32").tofile(f)

write_fn = _get_writer_mixed(f)
for i in range(iteration):
if input_label_reader != None:
inputs, labels = input_label_reader(
input_dims, label_dims, input_dtype
)
else:
inputs = _rand_like(
input_dims,
dtype=input_dtype if input_dtype is not None else float,
)
labels = _rand_like(label_dims, dtype=float)
print("inputs ==============")
write_fn(inputs, "int32", "float32")
print("labels ==============")
write_fn(labels, "int32", "float32")
is_nan = True
print("=========================== ", i)
scaler = amp.GradScaler()
print("weights ==============")
write_fn(
list(t for _, t in params_translated(model)), "int16", "float16"
)
is_nan = is_nan_
print("after is_nan_", is_nan)
scaler.step(optimizer)
scaler.update()
print("\n\n")
while is_nan:
print("before is_nan_", is_nan)
output, is_nan_ = record_iteration_with_amp(
write_fn, inputs, labels, is_nan, scaler
)
is_nan = is_nan_
print("after is_nan_", is_nan)
scaler.step(optimizer)
scaler.update()


##
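For reference, the "mixed" branch above builds on the standard torch.cuda.amp.GradScaler pattern: scaler.step() skips the optimizer update when Inf/NaN gradients are found, and scaler.update() then lowers the loss scale, which is why the recorder can retry an iteration until the gradients come out finite. A minimal sketch of that pattern with a placeholder model and loss, assuming a CUDA device:

```python
import torch
from torch.cuda import amp

model = torch.nn.Linear(4, 2).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = torch.nn.MSELoss()
scaler = amp.GradScaler()

x = torch.randn(8, 4, device="cuda")
target = torch.randn(8, 2, device="cuda")

optimizer.zero_grad()
with amp.autocast():                  # forward pass in mixed precision
    loss = loss_fn(model(x), target)

scaler.scale(loss).backward()         # backward on the scaled loss
scaler.step(optimizer)                # silently skipped if Inf/NaN gradients were found
scaler.update()                       # shrinks the scale after a skipped step, grows it periodically otherwise
```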
