diff --git a/examples/quantization_aware_training/torch/resnet18/main.py b/examples/quantization_aware_training/torch/resnet18/main.py
index 83c07e71438..026bd44acb8 100644
--- a/examples/quantization_aware_training/torch/resnet18/main.py
+++ b/examples/quantization_aware_training/torch/resnet18/main.py
@@ -16,12 +16,14 @@
 
 import re
 import subprocess
+import time
 import warnings
 from copy import deepcopy
 from pathlib import Path
 from typing import List, Tuple
 
 import openvino as ov
+import openvino.torch  # noqa
 import torch
 import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq
 import torch.nn as nn
@@ -37,6 +39,7 @@
 from torch.ao.quantization.quantize_pt2e import convert_pt2e
 from torch.ao.quantization.quantize_pt2e import prepare_pt2e
 from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer
+from torch.fx.passes.graph_drawer import FxGraphDrawer
 from torch.jit import TracerWarning
 
 import nncf
@@ -63,6 +66,18 @@
 DATASET_PATH = "~/.cache/nncf/datasets"
 
 
+def measure_time(model, example_inputs, num_iters):
+    with torch.no_grad():
+        model(*example_inputs)
+        total_time = 0
+        for i in range(0, num_iters):
+            start_time = time.time()
+            model(*example_inputs)
+            total_time += time.time() - start_time
+        average_time = (total_time / num_iters) * 1000
+    return average_time
+
+
 def download_dataset() -> Path:
     downloader = FastDownload(base=DATASET_PATH, archive="downloaded", data="extracted")
     return downloader.get(DATASET_URL)
@@ -264,7 +279,7 @@ def transform_fn(data_item):
 
     with torch.no_grad():
         example_inputs = (torch.ones((1, 3, IMAGE_SIZE, IMAGE_SIZE)),)
-        exported_model = capture_pre_autograd_graph(model, example_inputs)
+        exported_model = capture_pre_autograd_graph(model.eval(), example_inputs)
 
     NNCF_TORCH_FX = False
 
@@ -277,15 +292,50 @@ def transform_fn(data_item):
 
         from tqdm import tqdm
 
-        for data in tqdm(islice(quantization_dataset.get_inference_data(), 3)):
+        for data in tqdm(islice(quantization_dataset.get_inference_data(), 300)):
             prepared_model(data)
         quantized_model = convert_pt2e(prepared_model)
+
+        g = FxGraphDrawer(quantized_model, "acc_resnet18_int8_native")
+        g.get_dot_graph().write_svg("acc_resnet18_int8_native.svg")
     else:
         quantized_model = nncf.quantize(exported_model, quantization_dataset)
+        g = FxGraphDrawer(quantized_model, "acc_resnet18_int8_nncf")
+        g.get_dot_graph().write_svg("acc_resnet18_int8_nncf.svg")
 
-    quantized_model = torch.compile(quantized_model)
-    acc1_int8_init = validate(val_loader, quantized_model, device)
+    # quantized_model = torch.compile(quantized_model)
+    # acc1_int8_init = validate(val_loader, quantized_model, device)
+    acc1_int8_init = validate(val_loader, torch.compile(quantized_model), device)
     print(f"Accuracy@1 of initialized INT8 model: {acc1_int8_init:.3f}")
+
+    num_iters = 100
+
+    print("original model execution time: ", measure_time(model, example_inputs, num_iters))
+    native_optimized_model_fp32 = torch.compile(exported_model)
+    print(
+        "Torch Inductor FP32 model execution time: ",
+        measure_time(native_optimized_model_fp32, example_inputs, num_iters),
+    )
+    native_optimized_model_int8 = torch.compile(quantized_model)
+    print(
+        "Torch Inductor INT8 model execution time: ",
+        measure_time(native_optimized_model_int8, example_inputs, num_iters),
+    )
+
+    ov_optimized_model_fp32 = torch.compile(exported_model, backend="openvino")
+    print(
+        "Torch.compile OpenVINO FP32 model execution time: ",
+        measure_time(ov_optimized_model_fp32, example_inputs, num_iters),
+    )
+
+    ov_optimized_model_int8 = torch.compile(
+        quantized_model, backend="openvino", options={"model_caching": True, "cache_dir": "./model_cache"}
+    )
+    print(
+        "Torch.compile OpenVINO INT8 model execution time: ",
+        measure_time(ov_optimized_model_int8, example_inputs, num_iters),
+    )
+    return
 
     ###############################################################################
     # Step 3: Fine tune quantized model
diff --git a/nncf/experimental/torch_fx/quantization/default_quantization.py b/nncf/experimental/torch_fx/quantization/default_quantization.py
index 978bbee88c6..23d9d72b4f8 100644
--- a/nncf/experimental/torch_fx/quantization/default_quantization.py
+++ b/nncf/experimental/torch_fx/quantization/default_quantization.py
@@ -36,6 +36,7 @@
     operator_metatypes.PTLayerNormMetatype,
    operator_metatypes.PTModuleLayerNormMetatype,
     # operator_metatypes.PTAddMetatype,
+    operator_metatypes.PTReshapeMetatype,
     operator_metatypes.PTMulMetatype,
     operator_metatypes.PTDivMetatype,
     operator_metatypes.PTMatMulMetatype,
@@ -80,7 +81,7 @@
     operator_metatypes.PTTransposeMetatype,
     operator_metatypes.PTGatherMetatype,
     operator_metatypes.PTScatterMetatype,
-    operator_metatypes.PTReshapeMetatype,
+    # operator_metatypes.PTReshapeMetatype,
     operator_metatypes.PTSqueezeMetatype,
     operator_metatypes.PTSplitMetatype,
     operator_metatypes.PTExpandMetatype,
diff --git a/nncf/experimental/torch_fx/transformations.py b/nncf/experimental/torch_fx/transformations.py
index d22057017be..bea5d6b587b 100644
--- a/nncf/experimental/torch_fx/transformations.py
+++ b/nncf/experimental/torch_fx/transformations.py
@@ -105,11 +105,13 @@ def insert_one_qdq(
     }
     quantize_op = torch.ops.quantized_decomposed.quantize_per_tensor.default
     dequantize_op = torch.ops.quantized_decomposed.dequantize_per_tensor.default
+    # Quantized functions accept only uint8 as an input
     if target_point.target_type != TargetType.OPERATION_WITH_WEIGHTS and qparams["_dtype_"] == torch.int8:
-        qparams["_zero_point_"] = 125
+        qparams["_zero_point_"] = qparams["_zero_point_"] - qparams["_quant_min_"]
+        quants_len = qparams["_quant_max_"] - qparams["_quant_min_"]
         qparams["_quant_min_"] = 0
-        qparams["_quant_max_"] = 255
+        qparams["_quant_max_"] = quants_len
         qparams["_dtype_"] = torch.uint8
     # TODO: map FakeQuantizePramaeters to qparams for quantize/dequantize
     # 2. replace activation_post_process node with quantize and dequantize
diff --git a/torch_compile_ex_release.py b/torch_compile_ex_release.py
index 78d66f3b7fc..7bd0addf02e 100644
--- a/torch_compile_ex_release.py
+++ b/torch_compile_ex_release.py
@@ -44,7 +44,7 @@ def get_exported_model_from_nn_module(module, example_inputs):
     return capture_pre_autograd_graph(module, example_inputs)
 
 
-NNCF_IMPL = False
+NNCF_IMPL = True
 
 
 def get_qsetup(exported_model, example_inputs):
@@ -163,13 +163,13 @@ def main(model_name, num_iters):
 
     converted_model = quantize(copy.deepcopy(model), example_inputs)
 
-    # print("original model execution time: ", measure_time(model, example_inputs, num_iters))
+    print("original model execution time: ", measure_time(model, example_inputs, num_iters))
 
-    # native_optimized_model_fp32 = torch.compile(model)
-    # print(
-    #     "Torch Inductor FP32 model execution time: ",
-    #     measure_time(native_optimized_model_fp32, example_inputs, num_iters),
-    # )
+    native_optimized_model_fp32 = torch.compile(model)
+    print(
+        "Torch Inductor FP32 model execution time: ",
+        measure_time(native_optimized_model_fp32, example_inputs, num_iters),
+    )
 
     native_optimized_model_int8 = torch.compile(converted_model)
     print(
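
Note (not part of the patch): the transformations.py hunk replaces the hard-coded zero_point=125 / quant_max=255 with values derived from the original int8 parameters. The sketch below is a minimal check of why that shift preserves the quantization: the scale and zero point are illustrative numbers only, and the affine formula mirrors the semantics of the quantize_per_tensor/dequantize_per_tensor decomposed ops rather than calling them.

import torch

# Illustrative int8 activation quantization parameters (not taken from the patch).
scale, zero_point = 0.1, -3
quant_min, quant_max = -128, 127

x = torch.randn(16)
q_int8 = torch.clamp(torch.round(x / scale) + zero_point, quant_min, quant_max)

# Remap to uint8 the way the patched insert_one_qdq does:
# shift the zero point by -quant_min and keep the range length unchanged.
zero_point_u8 = zero_point - quant_min  # -3 - (-128) = 125
quant_min_u8 = 0
quant_max_u8 = quant_max - quant_min  # 255
q_uint8 = torch.clamp(torch.round(x / scale) + zero_point_u8, quant_min_u8, quant_max_u8)

# Both parameterizations dequantize to the same values.
assert torch.allclose((q_int8 - zero_point) * scale, (q_uint8 - zero_point_u8) * scale)

The previously hard-coded 125 is only correct when the int8 zero point happens to be -3, which is why computing the shift from the actual zero point and quant_min is needed for asymmetric ranges.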