Commit 127ae4e

Merge remote-tracking branch 'origin/develop' into dl/quantization/passes_for_splitted_graphs

daniil-lyakhov committed Dec 8, 2023
2 parents c4c1c91 + 7af953a
Showing 144 changed files with 2,535 additions and 1,384 deletions.
67 changes: 62 additions & 5 deletions codecov.yml
@@ -5,8 +5,7 @@ ignore:

codecov:
notify:
after_n_builds: 2
wait_for_ci: no
wait_for_ci: true
max_report_age: off

coverage:
@@ -15,6 +14,7 @@ coverage:
default:
branches:
- develop
target: 90%
informational: true
only_pulls: true
paths:
@@ -23,15 +23,72 @@ coverage:
default:
branches:
- develop
target: 90%
informational: true
only_pulls: true
paths:
- "nncf/onnx"
- "nncf/common" # extend this once we collect coverage reports for more than just onnx and common part of precommit
- "nncf"

comment:
layout: "diff, flags, files"
layout: "reach, diff, files, flags, components"
require_changes: false

require_head: false
require_base: false

flag_management:
# Flag coverage percentage seems to show the "percentage of lines under the flag path covered as reported ONLY
# by the upload with the corresponding flag", so e.g. for COMMON the flag coverage percentage will report the
# percentage of common code tested ONLY by the common tests, and e.g. not by backend-specific precommit parts
# (which also run common code and are therefore indirectly providing coverage). Ideally each flag-specific path
# would be described below with the corresponding flag and provide valuable information on whether the test code base
# is written efficiently, e.g. that the backend-specific tests predominantly validate backend-specific code and the
# common tests completely cover the common code on their own. However, if we set all flags with paths here, then the
# total repo coverage percentage will sink, because codecov currently reports the overall coverage based on the union
# of the "flag" coverages - not the "component" coverages (see below) - and currently NNCF's precommit tests are
# biased toward validating common code via backend-specific tests. In the future the tests will be gradually
# refactored to have more "locality" in what each precommit section tests.
individual_flags:
- name: COMMON
paths:
- nncf/common
- nncf/quantization

component_management:
# In contrast to the "flag" coverage above, the "component" display seems to calculate percentage based on the
# coverage information from ALL uploads for the code in the specified path. With this, the "component" coverage
# percentage is a better representation of what sub-paths in the NNCF code base are covered with at least one test,
# without distinction whether the test was run in the common or a backend-specific precommit section (a toy sketch of this distinction follows the config below).
individual_components:
- component_id: common
name: common
paths:
- nncf/common
- "!nncf/**/torch_*.py"
- "!nncf/**/tensorflow_*.py"
- "!nncf/**/onnx_*.py"
- "!nncf/**/openvino_*.py"
- component_id: torch
name: torch
paths:
- nncf/torch
- nncf/**/torch_*.py
- component_id: tensorflow
name: tensorflow
paths:
- nncf/tensorflow
- nncf/**/tensorflow_*.py
- component_id: onnx
name: onnx
paths:
- nncf/onnx
- nncf/**/onnx_*.py
- component_id: openvino
name: openvino
paths:
- nncf/openvino
- nncf/**/openvino_*.py
- component_id: quantization
name: ptq
paths:
- nncf/quantization
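
To illustrate the flag-versus-component distinction described in the comments above, here is a toy sketch. The file name, line counts and covered ranges are made up, and this is not how codecov computes its numbers internally; it only shows why a flag percentage (lines hit by that flag's own upload) can be lower than the component percentage for the same path (lines hit by any upload).

```python
# Toy numbers for illustration only - not codecov's actual algorithm.
path_lines = {"nncf/common/utils.py": 100}  # hypothetical file with 100 lines
uploads = {  # line numbers covered by each flagged upload
    "COMMON": {"nncf/common/utils.py": set(range(0, 60))},
    "ONNX": {"nncf/common/utils.py": set(range(40, 90))},
}

def flag_coverage(flag: str) -> float:
    # Only the upload carrying this flag counts.
    covered = sum(len(lines) for lines in uploads[flag].values())
    total = sum(path_lines[path] for path in uploads[flag])
    return covered / total

def component_coverage(paths: list[str]) -> float:
    # Every upload that touches the component's paths counts.
    covered = 0
    for path in paths:
        hit = set()
        for upload in uploads.values():
            hit |= upload.get(path, set())
        covered += len(hit)
    return covered / sum(path_lines[path] for path in paths)

print(f"COMMON flag coverage: {flag_coverage('COMMON'):.0%}")                            # 60%
print(f"common component coverage: {component_coverage(['nncf/common/utils.py']):.0%}")  # 90%
```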
32 changes: 20 additions & 12 deletions docs/compression_algorithms/CompressWeights.md
@@ -8,22 +8,30 @@ The Weights Compression algorithm is aimed at compressing the weights of the mod

#### Supported modes

By default, weights are compressed to 8-bit integer data type - "INT8" mode.
By default, weights are compressed asymmetrically to 8-bit integer data type - "INT8_ASYM" mode.
OpenVINO backend also supports 3 modes of mixed-precision weight quantization with a 4-bit data type as the primary precision - INT4_SYM, INT4_ASYM and NF4. In the INT4_SYM mode the primary precision is an unsigned 4-bit integer and weights are quantized to it [symmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#symmetric-quantization) with a fixed zero point equal to 8. In the INT4_ASYM mode the primary precision is also an unsigned 4-bit integer, but weights are quantized to it [asymmetrically](https://github.com/openvinotoolkit/nncf/blob/develop/docs/compression_algorithms/Quantization.md#asymmetric-quantization) with a typical non-fixed zero point. In the NF4 mode the primary precision is the [nf4](https://arxiv.org/pdf/2305.14314v1.pdf) data type without a zero point.
All 4-bit modes support grouped quantization, where a small group of weights (e.g. 128) in the channel dimension shares quantization parameters (scale).
All embeddings and last linear layers are always compressed to the 8-bit integer data type.
Percent of the rest layers compressed to 4-bit can be configured by "ratio" parameter. E.g. ratio=0.9 means 90% of layers compressed to the corresponding 4-bit data type and the rest to 8-bit integer data type.
The percentage of the remaining layers compressed to 4-bit can be configured with the "ratio" parameter. E.g. ratio=0.9 means that 90% of the layers are compressed to the corresponding 4-bit data type and the rest to the 8-bit asymmetric integer data type.
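
To make the 4-bit symmetric scheme above concrete, below is a minimal NumPy sketch of group-wise quantization to unsigned 4-bit values with the fixed zero point of 8, followed by dequantization. It is an illustration only: the helper name, the per-group scale rule (`max(|w|) / 7`) and the group handling are assumptions made for the sketch, not the actual NNCF implementation.

```python
# Illustrative sketch only - not the NNCF implementation.
import numpy as np

def int4_sym_roundtrip(weights: np.ndarray, group_size: int = 128) -> np.ndarray:
    """Quantize group-wise to unsigned 4-bit with a fixed zero point of 8, then dequantize."""
    groups = weights.reshape(-1, group_size)                   # groups along the channel dimension
    scales = np.abs(groups).max(axis=1, keepdims=True) / 7.0   # assumed per-group scale rule
    zero_point = 8                                             # fixed, as described for INT4_SYM
    q = np.clip(np.round(groups / scales) + zero_point, 0, 15) # 4-bit levels: 0..15
    return ((q - zero_point) * scales).reshape(weights.shape)  # reconstruction

w = np.random.randn(16, 256).astype(np.float32)
print("max abs reconstruction error:", np.abs(w - int4_sym_roundtrip(w)).max())
```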

#### User guide

- Compress weights to 8-bit integer data type.
- Compress weights asymmetrically to 8-bit integer data type.

```python
from nncf import compress_weights
compressed_model = compress_weights(model)
```

- Compress weights symmetrically to 4-bit integer data type with group size = 128, except embeddings and last linear layers - they are compressed to 8-bit integer data type.
- Compress weights symmetrically to 8-bit integer data type.

```python
from nncf import compress_weights
from nncf import CompressWeightsMode
compressed_model = compress_weights(model, mode=CompressWeightsMode.INT8_SYM)
```

- Compress weights symmetrically to 4-bit integer data type with group size = 128, except embeddings and last linear layers - they are compressed asymmetrically to 8-bit integer data type.

```python
from nncf import compress_weights
@@ -36,7 +44,7 @@ compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_SYM)
If the accuracy or perplexity is still not satisfactory, there are 2 more hyper-parameters to tune: `group_size` and `ratio`.
A lower group size and a smaller ratio of 4-bit layers usually improve accuracy at the cost of inference speed.
Below is an example of how to compress the weights of 90% of layers to 4-bit integer asymmetrically with group size 64, and
the rest of layers to 8-bit integer data type. The same parametrization is applicable for `INT4_SYM` mode.
the rest of the layers to the 8-bit asymmetric integer data type. The same parametrization is applicable for `INT4_SYM` mode.

```python
from nncf import compress_weights
@@ -45,7 +53,7 @@ compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_ASYM, g
```
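
The call in the snippet above is cut off by the diff view. Here is a sketch of the full invocation using the values from the surrounding text (group size 64, ratio 0.9) and the `group_size`/`ratio` parameter names mentioned above; treat the exact argument list as an assumption rather than the verbatim example from the source file.

```python
from nncf import compress_weights
from nncf import CompressWeightsMode

# `model` is an OpenVINO or PyTorch model prepared beforehand, as in the other examples.
compressed_model = compress_weights(model, mode=CompressWeightsMode.INT4_ASYM, group_size=64, ratio=0.9)
```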

- `NF4` mode can be considered for improving accuracy, but currently models quantized to nf4 are not expected to be faster than models
quantized to 8-bit integer. Here's the example how to compress weights to nf4 data type with group size = 128.
quantized to 8-bit asymmetric integer. Here is an example of how to compress weights to the nf4 data type with group size = 128.
Different `group_size` and `ratio` are also supported.

```python
@@ -79,7 +87,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">databricks/dolly-v2-3b</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">5.07</td>
<td class="tg-0pky">0.05</td>
<td class="tg-0pky">2.6</td>
@@ -107,7 +115,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">facebook/opt-6.7b</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">4.27</td>
<td class="tg-0pky">0.01</td>
<td class="tg-0pky">6.2</td>
@@ -135,7 +143,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">meta-llama/Llama-2-7b-chat-hf</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">3.29</td>
<td class="tg-0pky">0.01</td>
<td class="tg-0pky">6.3</td>
@@ -163,7 +171,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">togethercomputer/RedPajama-INCITE-7B-Instruct</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">4.17</td>
<td class="tg-0pky">0.02</td>
<td class="tg-0pky">6.4</td>
@@ -191,7 +199,7 @@ Here is the perplexity and model size before and after weight compression for di
</tr>
<tr>
<td class="tg-0pky">meta-llama/Llama-2-13b-chat-hf</td>
<td class="tg-0pky">int8</td>
<td class="tg-0pky">int8_asym</td>
<td class="tg-0pky">2.91</td>
<td class="tg-0pky">0</td>
<td class="tg-0pky">12.1</td>
@@ -218,7 +226,7 @@ Here is the perplexity and model size before and after weight compression for di
- The algorithm is supported for OpenVINO and PyTorch models.
- The compression applies in-place.
- The compressed model is not trainable.
- INT4_SYM, INT4_ASYM and NF4 modes, grouped quantization and mixed precision selection is available for OpenVINO backend only.
- INT8_SYM, INT4_SYM, INT4_ASYM and NF4 modes, grouped quantization and mixed precision selection are available for the OpenVINO backend only.
- NF4 support is experimental - models quantized to nf4 are not expected to be faster than models quantized to 8-bit integer.

#### Additional resources
10 changes: 4 additions & 6 deletions examples/tensorflow/object_detection/main.py
@@ -323,8 +323,7 @@ def run(config):

# Training parameters
epochs = config.epochs
steps_per_epoch = train_builder.steps_per_epoch
num_test_batches = test_builder.steps_per_epoch
steps_per_epoch, num_test_batches = train_builder.steps_per_epoch, test_builder.steps_per_epoch

# Create model builder
model_builder = get_model_builder(config)
@@ -336,10 +335,7 @@ def run(config):
)

resume_training = config.ckpt_path is not None

compression_state = None
if resume_training:
compression_state = load_compression_state(config.ckpt_path)
compression_state = load_compression_state(config.ckpt_path) if resume_training else None

with TFModelManager(model_builder.build_model, config.nncf_config, weights=config.get("weights", None)) as model:
with strategy.scope():
@@ -384,6 +380,8 @@ def run(config):
test_step = create_test_step_fn(strategy, compress_model, predict_post_process_fn)

if "train" in config.mode:
if config.weights is None and not resume_training:
logger.warning("Pretrained checkpoint is not provided. This may lead to poor training results!")
if is_accuracy_aware_training(config):
train_summary_writer = SummaryWriter(config.log_dir, "train")
timer = Timer()
6 changes: 4 additions & 2 deletions examples/torch/classification/README.md
@@ -64,7 +64,9 @@ python main.py \
- Use the `--resume` flag with the path to a previously saved model to resume training.
- For Torchvision-supported image classification models, set `"pretrained": true` inside the NNCF config JSON file supplied via `--config` to initialize the model to be compressed with Torchvision-supplied pretrained weights, or, alternatively:
- Use the `--weights` flag with the path to a compatible PyTorch checkpoint in order to load all matching weights from the checkpoint into the model - useful if you need to start compression-aware training from a previously trained uncompressed (FP32) checkpoint instead of performing compression-aware training from scratch.
- Use the `--no_strip_on_export` to export not stripped model.
- Use `--export-model-path` to specify the path for exporting the model to the OpenVINO or ONNX format, using the .xml or .onnx suffix, respectively.
- Use the `--no-strip-on-export` flag to export the model without stripping it.
- Use the `--export-to-ir-via-onnx` flag when exporting to OpenVINO to produce the serialized OV IR object by first exporting the torch model object to an .onnx file and then converting that .onnx file to an OV IR file.

### Validate Your Model Checkpoint

Expand All @@ -86,7 +88,7 @@ To export trained model to the ONNX format, use the following command:
python main.py -m export \
--config=configs/quantization/mobilenet_v2_imagenet_int8.json \
--resume=../../results/quantization/mobilenet_v2_int8/6/checkpoints/epoch_1.pth \
--to-onnx=../../results/mobilenet_v2_int8.onnx
--to-ir=../../results
```

### Export to OpenVINO™ Intermediate Representation (IR)
@@ -27,5 +27,6 @@
"{re}ResNet/Sequential\\[layer4\\]/BasicBlock\\[0\\]/Sequential\\[downsample\\]/.*"]
}
],
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -27,5 +27,6 @@
"{re}ResNet/Sequential\\[layer4\\]/BasicBlock\\[0\\]/Sequential\\[downsample\\]/.*"]
}
],
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -40,5 +40,6 @@
"lr_poly_drop_duration_epochs": 10
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -35,5 +35,6 @@
}
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -166,5 +166,6 @@
"disable_wd_start_epoch": 50
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -45,5 +45,6 @@
"lr_poly_drop_duration_epochs": 10
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -35,5 +35,6 @@
"disable_wd_start_epoch": 20
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -171,5 +171,6 @@
}
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -173,5 +173,6 @@
"disable_wd_start_epoch": 20
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -35,5 +35,6 @@
}
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -9,5 +9,6 @@
"target_device": "TRIAL",
"compression": {
"algorithm": "quantization"
}
},
"no_strip_on_export": true
}
@@ -99,5 +99,6 @@
}
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
@@ -111,5 +111,6 @@
"disable_wd_start_epoch": 50
}
},
"no_strip_on_export": true
"no_strip_on_export": true,
"export_to_ir_via_onnx": true
}
6 changes: 2 additions & 4 deletions examples/torch/classification/main.py
@@ -235,8 +235,7 @@ def model_eval_fn(model):
load_state(model, model_state_dict, is_resume=True)

if is_export_only:
export_model(compression_ctrl, config.to_onnx, config.no_strip_on_export)
logger.info(f"Saved to {config.to_onnx}")
export_model(compression_ctrl, config)
return

model, _ = prepare_model_for_execution(model, config)
@@ -328,8 +327,7 @@ def configure_optimizers_fn():
config.mlflow.end_run()

if "export" in config.mode:
export_model(compression_ctrl, config.to_onnx, config.no_strip_on_export)
logger.info(f"Saved to {config.to_onnx}")
export_model(compression_ctrl, config)


def train(