Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 27 additions & 4 deletions backends/nxp/backend/edge_program_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,28 @@ class EdgeProgramToIRConverter:
_default_target_spec = NeutronTargetSpec("imxrt700")
_default_delegation_options = CustomDelegationOptions()

def __init__(self):
    # Maps edge-program node debug handles to the indices of the TFLite
    # operators they were converted into. Populated by `convert_program`
    # after optimization; empty until a conversion has run.
    self.edge_to_tflite_map: dict[int, tuple[int, ...]] = {}
def convert_program(
self,
edge_program: ExportedProgram,
conversion_config: ConversionConfig = _default_conversion_config,
neutron_target_spec: NeutronTargetSpec = _default_target_spec,
custom_delegation_options: CustomDelegationOptions = _default_delegation_options,
) -> tuple[bytes, dict[str, DataFormat]]:
) -> tuple[bytes, dict[str, DataFormat], dict[int, tuple[int, ...]]]:
"""
Convert ExportedProgram in Edge dialect to IR (TFLite flatbuffers) as bytes.

:param edge_program: Converter ExportedProgram.
:param conversion_config: ConversionConfig instance.
:param neutron_target_spec: Object for querying the target platform to retrieve its properties.
:param custom_delegation_options: Custom user options which affect node delegation.
:return: TFLite flatbuffers as bytes.
:return: TFLite flatbuffers as bytes, I/O formats, and edge-to-tflite mapping.
"""
# Reset the edge to tflite map for each conversion
self.edge_to_tflite_map = {}

parameters_mapping = self.map_inputs_to_parameters(edge_program)
dim_order_map = self.map_nodes_to_dim_order(edge_program)

Expand All @@ -110,14 +116,17 @@ def convert_program(
# Apply optimizations and finalize the model.
internal_tflite_model = cc.tflite_builder.finish()

# Get the final edge to tflite mapping after optimization
self.edge_to_tflite_map = cc.tflite_builder.edge_to_tflite_map

# Extract the formats of the model's inputs and outputs.
io_formats = cc.tflite_builder.get_io_formats(edge_program.graph_signature)

# TFLite model generation
flatbuffers_builder = flatbuffers.Builder()
internal_tflite_model.gen_tflite(flatbuffers_builder)

return bytes(flatbuffers_builder.Output()), io_formats
return bytes(flatbuffers_builder.Output()), io_formats, self.edge_to_tflite_map

@staticmethod
def append_placeholders_and_tensors(nodes: list[Node], context: ConversionContext):
Expand Down Expand Up @@ -159,7 +168,6 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
]

for node in nodes:
if node.op == "call_function":
if node.target in qdq_related_functions and "cluster" in node.meta:
Expand All @@ -171,7 +179,22 @@ def _process_nodes(self, nodes: list[Node], conversion_context: ConversionContex
# The node was already processed alongside the Q/DQ ops.
pass
elif node.target in functions_converters:
# Get TFLite op count BEFORE conversion
tflite_op_count_before = len(conversion_context.tflite_builder.get_operators().vector)
# Convert the node
functions_converters[node.target](conversion_context).convert(node)
# Get TFLite op count AFTER conversion
tflite_op_count_after = len(conversion_context.tflite_builder.get_operators().vector)

# Track the mapping - store edge debug handle in operators
edge_debug_handle = node.meta.get("debug_handle", None)
if edge_debug_handle is not None and tflite_op_count_after > tflite_op_count_before:
operators = conversion_context.tflite_builder.get_operators().vector
for i in range(tflite_op_count_before, tflite_op_count_after):
# Store edge debug handle in operator's temporary attribute
operators[i].tmp_edge_debug_handle = edge_debug_handle
logger.i(f"Tagged TFLite ops {list(range(tflite_op_count_before, tflite_op_count_after))} with edge debug_handle={edge_debug_handle} for node '{node.name}'")

else:
logger.e(
logger.Code.NOT_IMPLEMENTED,
Expand Down
24 changes: 24 additions & 0 deletions backends/nxp/backend/ir/converter/builder/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class ModelBuilder:

conversion_config: ConversionConfig

edge_to_tflite_map: dict[int, tuple[int, ...]] # Mapping edge debug handles to tuple of TFLite operator indices

_default_conversion_config = ConversionConfig()

def __init__(
Expand All @@ -105,6 +107,7 @@ def __init__(
self._nchw_tensor_version = {}
self._skipped_output_map = {}
self._zeros_tensor_map = {}
self.edge_to_tflite_map = {}

def create_zeros_tensor(
self, dims: List[int], name: str, dtype: np.dtype, can_reuse: bool = False
Expand Down Expand Up @@ -503,6 +506,9 @@ def finish(self) -> tflite_model.Model:
self.conversion_config.optimization_blacklist,
)

# Create the final edge-to-tflite mapping after model optimization
self._create_edge_to_tflite_mapping()

self._keep_one_empty_buffer()

# Remove outputs, which are not produced by any node. Otherwise, there would be errors after inference.
Expand All @@ -524,6 +530,24 @@ def finish(self) -> tflite_model.Model:

return self._tfl_model

def _create_edge_to_tflite_mapping(self):
    """Build the edge-to-TFLite operator mapping and store it in `edge_to_tflite_map`.

    Scans the final operator vector and groups operator indices by the
    `tmp_edge_debug_handle` tag attached during node conversion. Must be
    called only after all model optimizations have been applied, so the
    recorded indices match the output TFLite model.
    """
    edge_to_tflite: dict[int, list[int]] = {}
    for idx, op in enumerate(self.get_operators().vector):
        # Operators inserted by optimization passes may lack the tag entirely,
        # so read it defensively with a single `getattr` instead of
        # `hasattr` followed by a second attribute access.
        debug_handle = getattr(op, "tmp_edge_debug_handle", None)
        if debug_handle is not None:
            edge_to_tflite.setdefault(debug_handle, []).append(idx)

    # Freeze the index lists into tuples for the published mapping.
    self.edge_to_tflite_map = {k: tuple(v) for k, v in edge_to_tflite.items()}
    logger.i(f"\nFinal edge_to_tflite_map after optimization: {self.edge_to_tflite_map}")

def _assign_io_tensor_indices(self, inputs, outputs, allow_inputs_stripping: bool):
for tensor in outputs.tmp_outputs:
try:
Expand Down
5 changes: 5 additions & 0 deletions backends/nxp/backend/ir/tflite_generator/tflite_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -514,6 +514,9 @@ class Operator(meta.TFLiteObject):
# If `True`, this is an extra operator added during conversion. It was not present in the original input model.
tmp_added_extra: bool

# Edge program debug handle for mapping edge nodes to TFLite operators
tmp_edge_debug_handle: Optional[int]

def __init__(
self,
inputs: OperatorInputs = None,
Expand Down Expand Up @@ -541,6 +544,8 @@ def __init__(
self.tmp_version = 1
self.tmp_added_extra = False

self.tmp_edge_debug_handle = None

def uses_per_channel_quantization(self) -> bool:
"""Determine if this operator uses per-channel quantization."""
for tensor in itertools.chain(self.tmp_inputs, self.tmp_outputs):
Expand Down
9 changes: 9 additions & 0 deletions backends/nxp/backend/neutron_converter_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ def convert(
delegation_tag: str,
fetch_constants_to_sram: bool = False,
use_new_flow_neutron_c: bool = False,
use_profiling: bool = False,
) -> bytes:
"""
Call Neutron Converter.
Expand All @@ -77,6 +78,7 @@ def convert(
:param delegation_tag: The delegation tag of model partition.
:param fetch_constants_to_sram: Add microcode that fetches weights from external memory.
:param use_new_flow_neutron_c: Enable experimental MLIR-based flow for Neutron-C with improved INT8 operator support.
:param use_profiling: Enable profiling for neutron delegated model.
This allows running models which do not fit into SRAM. Applies to Neutron-C only (microcontrollers).

:return: TFLite model with Neutron microcode as bytes.
Expand All @@ -95,6 +97,13 @@ def convert(
if hasattr(cctx.compilationOpts, "useNewFlowNeutronC"):
cctx.compilationOpts.useNewFlowNeutronC = use_new_flow_neutron_c

if use_profiling:
cctx.compilationOpts.useProfiling = use_profiling
cctx.compilationOpts.dumpAfterImport = "console"
cctx.compilationOpts.dumpAfterGenerate = "console"
cctx.compilationOpts.verbose = True
#cctx.compilationOpts.dumpGraphs = 1

# Try to use multiprocessing for isolation, but fall back to direct execution
# if the environment doesn't support it (e.g., in sandcastle/build environments)
try:
Expand Down
Loading
Loading