Vision model ONNX conversion working

qti-kromero · qti-kromero · commit 1f69af3939e7 · 2025-08-19T16:08:52.000-07:00
diff --git a/examples/gemma3/qnn/README.md b/examples/gemma3/qnn/README.md
@@ -8,15 +8,15 @@ Requirements:
 * Python 3.10
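-* uv - Used throughout the setup scripts, please follow the [publically available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)
+* uv - Used throughout the setup scripts; please follow the [publicly available installation instructions](https://docs.astral.sh/uv/getting-started/installation/#installation-methods)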
 
-This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the tutorial above:
+This repository contains an automated setup script for Linux that can be used to help automate many of the steps listed in the Phi-3.5 tutorial above:
 
 ```bash
 source env_setup.sh
 ```
 
 ## Optimization Process
 
-Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models separately before configuring them to work in concert at the onnxruntime-genai stage.
+Since Gemma-3-4B is a multi-modal model composed of both vision and text components, the strategy for optimizing it through Olive is to operate on the constituent models before configuring them to work in concert at the onnxruntime-genai stage.
 
 Thus, the following commands should be used to separately produce context binaries for the text and vision portions of the model, respectively.
 
diff --git a/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py b/examples/gemma3/qnn/custom_gemma3_4b_it_vision.py
@@ -4,17 +4,67 @@
 # --------------------------------------------------------------------------
 
 
+import logging
+
 import torch
 from transformers import AutoModel
 
+logger = logging.getLogger(__name__)
+
+# Checkpoint loaded when the Olive config does not pass a model_path.
+DEFAULT_MODEL_ID = "google/gemma-3-4b-it"
 
-def load_gemma3_model(model_path):
-    return AutoModel.from_pretrained("google/gemma-3-4b-it")
 
+class Gemma3VisualEmbeddingGenerator(torch.nn.Module):
+    """Vision-only wrapper mapping pixel values to projected image embeddings.
+
+    Holds only the vision tower and multi-modal projector of the full model
+    so that this sub-graph can be exported on its own.
+    """
+
+    def __init__(self, full_model):
+        """Keep references to the vision submodules of ``full_model``.
+
+        Args:
+            full_model: Loaded model exposing ``vision_tower`` and
+                ``multi_modal_projector`` attributes.
+        """
+        super().__init__()
+        # Extract only the vision components
+        self.vision_tower = full_model.vision_tower
+        self.multi_modal_projector = full_model.multi_modal_projector
+
+    def forward(self, pixel_values):
+        """Return the projected image features for ``pixel_values``.
+
+        Args:
+            pixel_values: Image tensor fed to the vision tower.
+
+        Returns:
+            The multi-modal projector applied to the vision tower's
+            ``last_hidden_state``.
+        """
+        # Only the final hidden state is consumed, so per-layer hidden states
+        # are not requested from the vision tower.
+        image_outputs = self.vision_tower(pixel_values)
+        selected_image_feature = image_outputs.last_hidden_state
+        # Project to final embedding space
+        return self.multi_modal_projector(selected_image_feature)
+
+
+def load_gemma3_model(model_path):
+    """Load the full model and return only its vision embedding sub-model.
+
+    Args:
+        model_path: Checkpoint path or hub ID to load. When falsy (for
+            example ``None``), ``DEFAULT_MODEL_ID`` is used instead.
+
+    Returns:
+        A ``Gemma3VisualEmbeddingGenerator`` built from the loaded model.
+    """
+    full_model = AutoModel.from_pretrained(model_path or DEFAULT_MODEL_ID)
+    logger.info("Loaded full model: %s", full_model)
 
-def get_dummy_inputs(model_handler):
-    return {
-        "input_ids": torch.full((1, 256), 262144, dtype=torch.long),  # Image token ID
-        "pixel_values": torch.randn(1, 3, 896, 896, dtype=torch.float32),
-        "attention_mask": torch.ones((1, 256), dtype=torch.long),
-    }
+    vision_model = Gemma3VisualEmbeddingGenerator(full_model)
+    logger.info("Created vision-only model: %s", vision_model)
+    return vision_model
diff --git a/examples/gemma3/qnn/env_setup.sh b/examples/gemma3/qnn/env_setup.sh
@@ -1,3 +1,8 @@
+#!/bin/bash
+# -------------------------------------------------------------------------
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+# --------------------------------------------------------------------------
 
 # Installing setuptools to build Olive from source
 uv pip install setuptools
diff --git a/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json b/examples/gemma3/qnn/gemma3-4b-vision-qnn-config.json
@@ -3,12 +3,11 @@
         "type": "PyTorchModel",
         "model_script": "custom_gemma3_4b_it_vision.py",
         "model_loader": "load_gemma3_model",
-        "dummy_inputs_func": "get_dummy_inputs",
         "io_config": {
-            "input_names": [ "input_ids", "pixel_values", "attention_mask" ],
-            "input_shapes": [ [ 1, 256 ], [ 1, 3, 896, 896 ], [ 1, 256 ] ],
-            "input_types": [ "int64", "float32", "int64" ],
-            "output_names": [ "last_hidden_state" ],
+            "input_names": [ "pixel_values" ],
+            "input_shapes": [ [ 1, 3, 896, 896 ] ],
+            "input_types": [ "float32" ],
+            "output_names": [ "image_features" ],
             "output_shapes": [ [ 1, 256, 2560 ] ]
         }
     },
@@ -27,16 +26,23 @@
         }
     ],
     "passes": {
-        "conversion": { "type": "OnnxConversion", "target_opset": 17 },
+        "conversion": { "type": "OnnxConversion", "target_opset": 20 },
+        "surgery": { "type": "GraphSurgeries", "surgeries": [ { "surgeon": "MatMulAddToGemm" } ] },
         "quantization": {
             "type": "OnnxStaticQuantization",
             "quant_preprocess": true,
             "data_config": "gemma_vision_data_config",
-            "op_types_to_quantize": [ "MatMul", "LayerNormalization", "Gemm", "Sigmoid", "Gelu" ],
             "activation_type": "uint16",
             "precision": "uint8",
             "calibrate_method": "MinMax"
         },
+        "cb": {
+            "type": "EPContextBinaryGenerator",
+            "provider_options": {
+                "htp_graph_finalization_optimization_mode": "3",
+                "offload_graph_io_quantization": "0"
+            }
+        },
         "add_metadata": { "type": "AddOliveMetadata", "graph_name": "gemma-3-4b-it-vision" }
     },
     "target": "qnn_system",
diff --git a/examples/gemma3/qnn/user_script.py b/examples/gemma3/qnn/user_script.py