Skip to content

Commit f3387d0

Browse files
tom-arm and Baris Demir authored
Arm backend: Add Qwen3-VL_2B_IT FP32 layer tests (#19628)
Change-Id: I62d3848e0a6546e21d508b4ed565c2403b63f72d cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani --------- Signed-off-by: Tom Allsop <tom.allsop@arm.com> Co-authored-by: Baris Demir <baris.demir@arm.com>
1 parent 85bd01d commit f3387d0

3 files changed

Lines changed: 544 additions & 0 deletions

File tree

backends/arm/MODELS.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
- Inception v3 (IC3)
1111
- Llama
1212
- Gemma3n
13+
- Qwen3-VL
1314
- Long Short-Term Memory (LSTM)
1415
- MobileNet V1 0.25
1516
- MobileNet v2 (MV2)
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Copyright 2026 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
from transformers.models.qwen3_vl.configuration_qwen3_vl import (
7+
Qwen3VLConfig,
8+
Qwen3VLTextConfig,
9+
Qwen3VLVisionConfig,
10+
)
11+
12+
13+
def get_qwen3_vl_2b_instruct_checkpoint_config() -> Qwen3VLConfig:
    """Return a hard-coded ``Qwen3VLConfig`` for Qwen3-VL 2B Instruct.

    All hyperparameters are written inline as literals rather than loaded
    from a checkpoint file. The text and vision sub-configs are built as
    typed config objects first and then handed to the composite config as
    plain dictionaries.

    NOTE(review): the values presumably mirror the published checkpoint's
    ``config.json`` -- verify against upstream if the checkpoint changes.

    Returns:
        The composite ``Qwen3VLConfig`` combining the text and vision
        settings with the image/video/vision special-token ids.
    """
    # Language-model (decoder) settings.
    # NOTE(review): dtype is "bfloat16" here; callers that need FP32
    # presumably override or cast it -- confirm at the call site.
    text_cfg = Qwen3VLTextConfig(
        attention_bias=False,
        attention_dropout=0.0,
        bos_token_id=151643,  # type: ignore[call-arg]
        dtype="bfloat16",
        eos_token_id=151645,  # type: ignore[call-arg]
        head_dim=128,
        hidden_act="silu",
        hidden_size=2048,
        initializer_range=0.02,
        intermediate_size=6144,
        max_position_embeddings=262144,
        num_attention_heads=16,
        num_hidden_layers=28,
        num_key_value_heads=8,
        rms_norm_eps=1e-6,
        rope_parameters={
            "mrope_interleaved": True,  # type: ignore[dict-item]
            "mrope_section": [24, 20, 20],  # type: ignore[dict-item]
            "rope_type": "default",  # type: ignore[dict-item]
            "rope_theta": 5_000_000,  # type: ignore[dict-item]
        },
        tie_word_embeddings=True,  # type: ignore[call-arg]
        use_cache=True,
        vocab_size=151936,
    )

    # Vision-encoder settings; out_hidden_size (2048) equals the text
    # hidden_size configured above.
    vision_cfg = Qwen3VLVisionConfig(
        deepstack_visual_indexes=[5, 11, 17],
        depth=24,
        hidden_act="gelu_pytorch_tanh",
        hidden_size=1024,
        in_channels=3,
        initializer_range=0.02,
        intermediate_size=4096,
        num_heads=16,
        num_position_embeddings=2304,
        out_hidden_size=2048,
        patch_size=16,
        spatial_merge_size=2,
        temporal_patch_size=2,
    )

    # The composite config receives the sub-configs as dictionaries.
    text_dict = text_cfg.to_dict()
    vision_dict = vision_cfg.to_dict()

    return Qwen3VLConfig(
        architectures=["Qwen3VLForConditionalGeneration"],
        image_token_id=151655,
        text_config=text_dict,
        tie_word_embeddings=True,
        video_token_id=151656,
        vision_config=vision_dict,
        vision_end_token_id=151653,
        vision_start_token_id=151652,
    )

0 commit comments

Comments
 (0)