From 764ddffb466704f44c1e5a6203e9e785c3859e29 Mon Sep 17 00:00:00 2001 From: andersendsa <199610634+andersendsa@users.noreply.github.com> Date: Thu, 25 Jun 2026 14:05:40 +0000 Subject: [PATCH] fix: Cache statistics in OpenVINO/ONNX Accuracy Restorer Currently, `quantize_with_accuracy_control_impl` performs the initial quantization pass and may then call `quantize_with_tune_hyperparams`, which runs `HyperparameterTuner`. Both the initial quantization and step 0 of the `HyperparameterTuner` collect exactly the same statistics. This commit updates `quantize_impl` and its internal backend functions in OpenVINO and ONNX to explicitly collect and return a `StatisticPointsContainer` by exposing a new `return_statistics` flag. The returned statistics are passed down to `quantize_with_tune_hyperparams` and the `HyperparameterTuner`, which now reuse these pre-computed statistics for the first step, removing the redundant, expensive statistics collection. --- src/nncf/onnx/quantization/quantize_model.py | 17 ++++++--- .../openvino/quantization/quantize_ifmodel.py | 4 ++- .../openvino/quantization/quantize_model.py | 35 ++++++++++++++----- .../hyperparameter_tuner/algorithm.py | 26 +++++++++----- src/nncf/quantization/quantize_model.py | 3 ++ 5 files changed, 63 insertions(+), 22 deletions(-) diff --git a/src/nncf/onnx/quantization/quantize_model.py b/src/nncf/onnx/quantization/quantize_model.py index 4ec6b8c6111..c480f91b776 100644 --- a/src/nncf/onnx/quantization/quantize_model.py +++ b/src/nncf/onnx/quantization/quantize_model.py @@ -12,7 +12,7 @@ import sys from copy import deepcopy from pathlib import Path -from typing import Any, Callable, Iterable, TypeVar +from typing import Any, Callable, Iterable, TypeVar, Tuple, Union import onnx from onnx.external_data_helper import ExternalDataInfo @@ -22,6 +22,8 @@ import nncf from nncf.common.factory import build_graph +from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer +from 
nncf.quantization.algorithms.pipeline import collect_statistics from nncf.common.logging.logger import nncf_logger from nncf.common.quantization.structs import QuantizationPreset from nncf.data import Dataset @@ -132,7 +134,8 @@ def quantize_impl( model_type: ModelType | None = None, ignored_scope: IgnoredScope | None = None, advanced_parameters: AdvancedQuantizationParameters | None = None, -) -> onnx.ModelProto: + return_statistics: bool = False, +) -> Union[onnx.ModelProto, Tuple[onnx.ModelProto, StatisticPointsContainer]]: """ Implementation of the `quantize()` method for the ONNX backend. """ @@ -174,7 +177,9 @@ def quantize_impl( graph = GraphConverter.create_nncf_graph(model) warning_model_no_batchwise_support(graph, advanced_parameters, model_type, OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS) - quantized_model = quantization_algorithm.apply(model, graph, dataset=calibration_dataset) + statistic_points = quantization_algorithm.get_statistic_points(model, graph) + statistic_points = collect_statistics(statistic_points, model, graph, calibration_dataset) + quantized_model = quantization_algorithm.apply(model, graph, statistic_points, dataset=calibration_dataset) if external_data_dir: remove_metadata(model, MetadataKey.EXTERNAL_DATA_DIR) @@ -184,6 +189,8 @@ def quantize_impl( if is_weight_compression_needed(advanced_parameters): compress_quantize_weights_transformation(quantized_model) + if return_statistics: + return quantized_model, statistic_points return quantized_model @@ -217,7 +224,7 @@ def quantize_with_accuracy_control_impl( copied_parameters = deepcopy(advanced_quantization_parameters) copied_parameters.backend_params[BackendParameters.COMPRESS_WEIGHTS] = False - quantized_model = quantize_impl( + quantized_model, statistic_points = quantize_impl( model=model, calibration_dataset=calibration_dataset, preset=preset, @@ -227,6 +234,7 @@ def quantize_with_accuracy_control_impl( model_type=model_type, ignored_scope=ignored_scope, 
advanced_parameters=copied_parameters, + return_statistics=True, ) if advanced_accuracy_restorer_parameters.intermediate_model_dir: @@ -267,6 +275,7 @@ def quantize_with_accuracy_control_impl( model_type, ignored_scope, copied_parameters, + initial_statistic_points=statistic_points, ) tuned_quantized_metric_results = evaluator.collect_metric_results( tuned_quantized_model, validation_dataset, model_name="tuned" diff --git a/src/nncf/openvino/quantization/quantize_ifmodel.py b/src/nncf/openvino/quantization/quantize_ifmodel.py index b8d26381b75..b70bf4c5ed1 100644 --- a/src/nncf/openvino/quantization/quantize_ifmodel.py +++ b/src/nncf/openvino/quantization/quantize_ifmodel.py @@ -153,7 +153,7 @@ def apply_algorithm_if_bodies( """ nncf_logger.info(f"Iteration [{current_model_num}/{len(graphs)}] ...") parent_graph = graphs[graph_id] - quantized_model = algorithm.apply(parent_model, parent_graph, parent_statistic_points, parent_dataset) + quantized_model = algorithm.apply(parent_model, parent_graph, parent_statistic_points, dataset=parent_dataset) if get_number_if_op(parent_model) == 0: return quantized_model, current_model_num model_transformer_fp32 = factory.ModelTransformerFactory.create(parent_model) @@ -186,6 +186,7 @@ def apply_algorithm_if_bodies( then_dataset, subset_size, current_model_num + 1, + parent_statistic_points, ) else_quantized_model, current_model_num = apply_algorithm_if_bodies( algorithm, @@ -195,6 +196,7 @@ def apply_algorithm_if_bodies( else_dataset, subset_size, current_model_num + 1, + parent_statistic_points, ) model_transformer_int8 = factory.ModelTransformerFactory.create(quantized_model) quantized_model = _update_if_body(model_transformer_int8, if_node, True, then_quantized_model) diff --git a/src/nncf/openvino/quantization/quantize_model.py b/src/nncf/openvino/quantization/quantize_model.py index 19031fb1674..0186ccb73ff 100644 --- a/src/nncf/openvino/quantization/quantize_model.py +++ b/src/nncf/openvino/quantization/quantize_model.py @@ 
-11,13 +11,15 @@ from copy import deepcopy from pathlib import Path -from typing import Any, Callable, Iterable, TypeVar +from typing import Any, Callable, Iterable, TypeVar, Tuple, Union import openvino as ov from openvino._offline_transformations import compress_quantize_weights_transformation from nncf.common.factory import StatisticsAggregatorFactory from nncf.common.factory import build_graph +from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer +from nncf.quantization.algorithms.pipeline import collect_statistics from nncf.common.logging import nncf_logger from nncf.common.quantization.structs import QuantizationPreset from nncf.data import Dataset @@ -71,7 +73,8 @@ def native_quantize_if_op_impl( model_type: ModelType | None = None, ignored_scope: IgnoredScope | None = None, advanced_parameters: AdvancedQuantizationParameters | None = None, -) -> ov.Model: + return_statistics: bool = False, +) -> Union[ov.Model, Tuple[ov.Model, StatisticPointsContainer]]: """ Implementation of the `quantize()` method for the OpenVINO backend via the OpenVINO Runtime API. """ @@ -109,6 +112,7 @@ def _extract_all_subgraphs(model: ov.Model, current_id: str) -> None: model_type=model_type, ignored_scope=ignored_scope, advanced_parameters=advanced_parameters, + return_statistics=return_statistics, ) for graph in graphs.values(): if is_model_no_batchwise_support(graph, advanced_parameters, model_type, OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS): @@ -119,13 +123,16 @@ def _extract_all_subgraphs(model: ov.Model, current_id: str) -> None: f"The model consists of {if_ops_number} If node(-s) with then and else bodies. \ Main model and all If bodies will be quantized recursively." 
) + statistic_points = quantization_algorithm.get_statistic_points(model, graphs[main_model_graph_id]) + statistic_points = collect_statistics(statistic_points, model, graphs[main_model_graph_id], calibration_dataset) quantized_model, _ = apply_algorithm_if_bodies( - quantization_algorithm, model, graphs, main_model_graph_id, calibration_dataset, subset_size, 1 + quantization_algorithm, model, graphs, main_model_graph_id, calibration_dataset, subset_size, 1, statistic_points ) if is_weight_compression_needed(advanced_parameters): compress_quantize_weights_transformation(quantized_model) + dump_parameters( quantized_model, { @@ -138,6 +145,8 @@ def _extract_all_subgraphs(model: ov.Model, current_id: str) -> None: "advanced_parameters": convert_to_dict_recursively(advanced_parameters), }, ) + if return_statistics: + return quantized_model, statistic_points return quantized_model @@ -152,7 +161,8 @@ def native_quantize_impl( model_type: ModelType | None = None, ignored_scope: IgnoredScope | None = None, advanced_parameters: AdvancedQuantizationParameters | None = None, -) -> ov.Model: + return_statistics: bool = False, +) -> Union[ov.Model, Tuple[ov.Model, StatisticPointsContainer]]: """ Implementation of the `quantize()` method for the OpenVINO backend via the OpenVINO Runtime API. 
""" @@ -165,14 +175,18 @@ def native_quantize_impl( model_type=model_type, ignored_scope=ignored_scope, advanced_parameters=advanced_parameters, + return_statistics=return_statistics, ) graph = GraphConverter.create_nncf_graph(model) warning_model_no_batchwise_support(graph, advanced_parameters, model_type, OPERATIONS_OUTPUT_HAS_NO_BATCH_AXIS) - quantized_model = quantization_algorithm.apply(model, graph, dataset=calibration_dataset) + statistic_points = quantization_algorithm.get_statistic_points(model, graph) + statistic_points = collect_statistics(statistic_points, model, graph, calibration_dataset) + quantized_model = quantization_algorithm.apply(model, graph, statistic_points, dataset=calibration_dataset) if is_weight_compression_needed(advanced_parameters): compress_quantize_weights_transformation(quantized_model) + dump_parameters( quantized_model, { @@ -185,6 +199,8 @@ def native_quantize_impl( "advanced_parameters": convert_to_dict_recursively(advanced_parameters), }, ) + if return_statistics: + return quantized_model, statistic_points return quantized_model @@ -219,7 +235,7 @@ def quantize_with_accuracy_control_impl( copied_parameters = deepcopy(advanced_quantization_parameters) copied_parameters.backend_params[BackendParameters.COMPRESS_WEIGHTS] = False - quantized_model = quantize_impl( + quantized_model, statistic_points = quantize_impl( model=model, calibration_dataset=calibration_dataset, preset=preset, @@ -229,6 +245,7 @@ def quantize_with_accuracy_control_impl( model_type=model_type, ignored_scope=ignored_scope, advanced_parameters=copied_parameters, + return_statistics=True, ) if advanced_accuracy_restorer_parameters.intermediate_model_dir: @@ -251,7 +268,6 @@ def quantize_with_accuracy_control_impl( nncf_logger.info(f"Accuracy drop: {accuracy_drop} ({drop_type})") - # TODO(andrey-churkin): Collect statistics only once if advanced_accuracy_restorer_parameters.tune_hyperparams and not should_terminate: model = remove_friendly_name_duplicates(model) 
tuned_quantized_model = quantize_with_tune_hyperparams( @@ -269,6 +285,7 @@ def quantize_with_accuracy_control_impl( model_type, ignored_scope, copied_parameters, + initial_statistic_points=statistic_points, ) tuned_quantized_metric_results = evaluator.collect_metric_results( tuned_quantized_model, validation_dataset, model_name="tuned" @@ -338,7 +355,8 @@ def quantize_impl( model_type: ModelType | None = None, ignored_scope: IgnoredScope | None = None, advanced_parameters: AdvancedQuantizationParameters | None = None, -) -> ov.Model: + return_statistics: bool = False, +) -> Union[ov.Model, Tuple[ov.Model, StatisticPointsContainer]]: """ Implementation of the `quantize()` method for the OpenVINO backend. """ @@ -359,6 +377,7 @@ def quantize_impl( model_type=model_type, ignored_scope=ignored_scope, advanced_parameters=advanced_parameters, + return_statistics=return_statistics, ) diff --git a/src/nncf/quantization/algorithms/hyperparameter_tuner/algorithm.py b/src/nncf/quantization/algorithms/hyperparameter_tuner/algorithm.py index cba61485ca9..0814f752807 100644 --- a/src/nncf/quantization/algorithms/hyperparameter_tuner/algorithm.py +++ b/src/nncf/quantization/algorithms/hyperparameter_tuner/algorithm.py @@ -225,6 +225,7 @@ def __init__( subset_size: int, initial_metric_results: MetricResults, quantized_metric_results: MetricResults, + initial_statistic_points: StatisticPointsContainer | None = None, ): """ :param pipeline_fn: Function to create pipeline. @@ -246,6 +247,7 @@ def __init__( self._subset_size = subset_size self._initial_metric_results = initial_metric_results self._quantized_metric_results = quantized_metric_results + self._initial_statistic_points = initial_statistic_points self._is_metric_mode = isinstance(self._initial_metric_results.values_for_each_item[0], float) @@ -290,8 +292,11 @@ def apply(self, model: TModel, validation_dataset: Dataset) -> TModel: # TODO(andrey-churkin): Think about how it can be avoided. 
params = apply_combination(self._init_params, best_settings) pipeline = self._pipeline_fn(**params) - container = pipeline.get_statistic_points_for_step(step_index, step_model, step_graph) - step_statistics = collect_statistics(container, step_model, step_graph, self._calibration_dataset) + if step_index == 0 and self._initial_statistic_points is not None: + step_statistics = self._initial_statistic_points + else: + container = pipeline.get_statistic_points_for_step(step_index, step_model, step_graph) + step_statistics = collect_statistics(container, step_model, step_graph, self._calibration_dataset) step_model = pipeline.run_step(step_index, step_statistics, step_model, step_graph) continue @@ -359,13 +364,16 @@ def _prepare_pipeline_step( self._pipelines[combination_key] = self._pipeline_fn(**kwargs) # Collect statistics required to execute `step_index`-th pipeline step - containers = [ - pipeline.get_statistic_points_for_step(step_index, step_model, step_graph) - for pipeline in self._pipelines.values() - ] - self._step_index_to_statistics[step_index] = collect_statistics( - containers, step_model, step_graph, self._calibration_dataset - ) + if step_index == 0 and self._initial_statistic_points is not None: + self._step_index_to_statistics[step_index] = self._initial_statistic_points + else: + containers = [ + pipeline.get_statistic_points_for_step(step_index, step_model, step_graph) + for pipeline in self._pipelines.values() + ] + self._step_index_to_statistics[step_index] = collect_statistics( + containers, step_model, step_graph, self._calibration_dataset + ) def _calculate_combination_score( self, diff --git a/src/nncf/quantization/quantize_model.py b/src/nncf/quantization/quantize_model.py index 1277ca32f8b..74c87d4f408 100644 --- a/src/nncf/quantization/quantize_model.py +++ b/src/nncf/quantization/quantize_model.py @@ -9,6 +9,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
from typing import Any, Callable, Iterable, TypedDict, TypeVar +from nncf.common.tensor_statistics.statistic_point import StatisticPointsContainer import nncf from nncf.common.graph import NNCFGraph @@ -723,6 +724,7 @@ def quantize_with_tune_hyperparams( model_type: ModelType | None = None, ignored_scope: IgnoredScope | None = None, advanced_quantization_parameters: AdvancedQuantizationParameters | None = None, + initial_statistic_points: StatisticPointsContainer | None = None, ) -> TModel: """ Applies post-training quantization algorithm with tune hyperparameters to provided model. @@ -778,6 +780,7 @@ def quantize_with_tune_hyperparams( tuner_subset_size, initial_metric_results, quantized_metric_results, + initial_statistic_points, ) quantized_model = hyperparameter_tuner.apply(model, validation_dataset)