From 128a4bca3763bffc96c906fd084b76c4c4a19043 Mon Sep 17 00:00:00 2001 From: Baris Demir Date: Fri, 5 Jun 2026 17:00:46 +0100 Subject: [PATCH] Arm backend: Add Qwen3 VL E2E coverage Add full Qwen3 VL language and vision model tests for the Arm backend in FP32 and BF16 modes. Cover both TOSA and VGF no-quant paths, with BF16 VGF using an explicit FP profile that advertises BF16 support. Relax the FP32 language-model tolerance to match observed TOSA reference drift for the full decoder stack. Signed-off-by: Baris Demir Change-Id: I1fa6ffde632e0b252c4c19a95a854268ae01ba5a --- .../models/Qwen3_VL/test_qwen3_vl_model.py | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py diff --git a/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py new file mode 100644 index 00000000000..6bd9c799f80 --- /dev/null +++ b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_model.py @@ -0,0 +1,268 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Tuple + +import pytest +import torch +import torch.nn.functional as F +from executorch.backends.arm.test import common +from executorch.backends.arm.test.models.Qwen3_VL.qwen3_vl_test_config import ( + get_qwen3_vl_2b_instruct_checkpoint_config, +) +from executorch.backends.arm.test.tester.test_pipeline import ( + TosaPipelineFP, + VgfPipeline, +) +from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLTextModel, + Qwen3VLVisionModel, +) + +input_t = Tuple[torch.Tensor, ...] + + +def _make_qwen3_vl_2b_instruct_layer_config(): + config = get_qwen3_vl_2b_instruct_checkpoint_config() + config.text_config._attn_implementation = "sdpa" + config.vision_config._attn_implementation = "sdpa" + return config + + +def _make_text_position_ids( + batch_size: int, seq_length: int, device: torch.device +) -> torch.Tensor: + return torch.arange(seq_length, device=device).unsqueeze(0).repeat(batch_size, 1) + + +def _make_image_grid_thw(device: torch.device) -> torch.Tensor: + return torch.tensor([[1, 4, 4]], dtype=torch.long, device=device) + + +def _make_pixel_values(config, device: torch.device) -> torch.Tensor: + grid_thw = _make_image_grid_thw(device) + patch_volume = ( + config.vision_config.in_channels + * config.vision_config.temporal_patch_size + * config.vision_config.patch_size + * config.vision_config.patch_size + ) + num_patches = int(torch.prod(grid_thw[0]).item()) + return torch.randn(num_patches, patch_volume, device=device) + + +class Qwen3VLModelTestModule(torch.nn.Module): + @classmethod + def prepare_model_and_inputs(cls): + raise NotImplementedError + + +def _to_bfloat16_model_and_floating_inputs( + model: torch.nn.Module, inputs: input_t +) -> tuple[torch.nn.Module, input_t]: + """Convert model and floating inputs for BF16 backend coverage.""" + + return model.to(torch.bfloat16), tuple( + ( + x.to(torch.bfloat16) + if isinstance(x, torch.Tensor) and x.is_floating_point() + else x + ) + for x in inputs + ) + + +class TextModelWrapper(Qwen3VLModelTestModule): + def __init__(self, config) -> None: + super().__init__() + self.model = Qwen3VLTextModel(config.text_config) + + def forward( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + ) -> torch.Tensor: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + ) + return outputs.last_hidden_state + + @classmethod + def prepare_model_and_inputs(cls): + torch.manual_seed(0) + config = _make_qwen3_vl_2b_instruct_layer_config() + model = cls(config).eval() + input_ids = torch.randint(0, 128, (2, 8), dtype=torch.long) + attention_mask = torch.ones_like(input_ids) + position_ids = _make_text_position_ids(2, 8, input_ids.device) + return model, (input_ids, attention_mask, position_ids) + + +class LowerableVisionModelWrapper(Qwen3VLModelTestModule): + def __init__(self, config) -> None: + super().__init__() + self.visual = Qwen3VLVisionModel(config.vision_config) + + with torch.no_grad(): + grid_thw = _make_image_grid_thw(self.visual.pos_embed.weight.device) + pos_embeds = self.visual.fast_pos_embed_interpolate(grid_thw) + + rotary_pos_emb = self.visual.rot_pos_emb(grid_thw) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + cos = emb.cos() + sin = emb.sin() + + cu_seqlens = torch.repeat_interleave( + grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0] + ).cumsum(dim=0, dtype=torch.int32) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + self.register_buffer("pos_embeds", pos_embeds) + self.register_buffer("cos", cos) + self.register_buffer("sin", sin) + self.register_buffer("cu_seqlens", cu_seqlens) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + hidden_states = self.visual.patch_embed(pixel_values) + hidden_states = hidden_states + self.pos_embeds + + position_embeddings = (self.cos, self.sin) + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.visual.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=self.cu_seqlens, + position_embeddings=position_embeddings, + ) + if layer_num in self.visual.deepstack_visual_indexes: + deepstack_feature = self.visual.deepstack_merger_list[ + self.visual.deepstack_visual_indexes.index(layer_num) + ](hidden_states) + deepstack_feature_lists.append(deepstack_feature) + + hidden_states = self.visual.merger(hidden_states) + + # Keep deepstack feature extraction in the exported graph without + # changing the model output. + deepstack_residual = hidden_states.new_zeros(()) + for deepstack_feature in deepstack_feature_lists: + deepstack_residual = deepstack_residual + deepstack_feature.sum() * 0 + + return hidden_states + deepstack_residual + + @classmethod + def prepare_model_and_inputs(cls): + torch.manual_seed(0) + config = _make_qwen3_vl_2b_instruct_layer_config() + model = cls(config).eval() + pixel_values = _make_pixel_values(config, torch.device("cpu")) + return model, (pixel_values,) + + +@dataclass(frozen=True) +class Qwen3VLModelTestCase: + model_cls: type[Qwen3VLModelTestModule] + run_on_vulkan_runtime: bool = True + atol: float = 1e-3 + rtol: float = 1e-3 + + +TOSA_FP_TEST_CASES: dict[str, Qwen3VLModelTestCase] = { + "vision_model": Qwen3VLModelTestCase( + model_cls=LowerableVisionModelWrapper, + ), + "text_model": Qwen3VLModelTestCase( + model_cls=TextModelWrapper, + atol=3e-2, + rtol=1e-2, + ), +} + +VGF_NO_QUANT_TEST_CASES: dict[str, Qwen3VLModelTestCase] = { + "vision_model": Qwen3VLModelTestCase( + model_cls=LowerableVisionModelWrapper, + run_on_vulkan_runtime=False, + ), + "text_model": Qwen3VLModelTestCase( + model_cls=TextModelWrapper, + run_on_vulkan_runtime=False, + ), +} + + +@pytest.mark.slow +@common.parametrize("test_case", TOSA_FP_TEST_CASES) +def test_qwen3_vl_full_models_tosa_FP(test_case: Qwen3VLModelTestCase): + model, inputs = test_case.model_cls.prepare_model_and_inputs() + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + model, + inputs, + aten_op=[], + exir_op=[], + atol=test_case.atol, + rtol=test_case.rtol, + ) + pipeline.run() + + +@pytest.mark.slow +@common.parametrize("test_case", TOSA_FP_TEST_CASES) +def test_qwen3_vl_full_models_tosa_FP_bf16(test_case: Qwen3VLModelTestCase): + model, inputs = test_case.model_cls.prepare_model_and_inputs() + model, inputs = _to_bfloat16_model_and_floating_inputs(model, inputs) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + model, + inputs, + aten_op=[], + exir_op=[], + tosa_extensions=["bf16"], + atol=1e-1, + rtol=1e-1, + ) + pipeline.run() + + +@pytest.mark.slow +@common.SkipIfNoModelConverter +@common.parametrize("test_case", VGF_NO_QUANT_TEST_CASES) +def test_qwen3_vl_full_models_vgf_no_quant(test_case: Qwen3VLModelTestCase): + model, inputs = test_case.model_cls.prepare_model_and_inputs() + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + model, + inputs, + aten_op=[], + exir_op=[], + quantize=False, + run_on_vulkan_runtime=test_case.run_on_vulkan_runtime, + ) + pipeline.run() + + +@pytest.mark.slow +@common.SkipIfNoModelConverter +@common.parametrize("test_case", VGF_NO_QUANT_TEST_CASES) +def test_qwen3_vl_full_models_vgf_no_quant_bf16(test_case: Qwen3VLModelTestCase): + model, inputs = test_case.model_cls.prepare_model_and_inputs() + model, inputs = _to_bfloat16_model_and_floating_inputs(model, inputs) + with torch.no_grad(): + pipeline = VgfPipeline[input_t]( + model, + inputs, + aten_op=[], + exir_op=[], + quantize=False, + run_on_vulkan_runtime=test_case.run_on_vulkan_runtime, + tosa_spec="TOSA-1.0+FP+bf16", + ) + pipeline.run()