alibaba · huangzhengxiang · May 28, 2026 · May 28, 2026 · May 29, 2026 · May 29, 2026
diff --git a/.gitignore b/.gitignore
@@ -407,3 +407,5 @@ apps/iOS/MNNLLMChat/MNNLLMiOS/LocalModel/Qwen2.5-Omni-3B-MNN
 # smoke models converted on the host by ./test.sh local (sources are fetched
 # into /smoke_sources/, then MNNConvert writes the .caffe.mnn here)
 /resource/model/**/*.caffe.mnn
+
+model/
diff --git a/skills/support-new-llm/SKILL.md b/skills/support-new-llm/SKILL.md
@@ -26,6 +26,12 @@ MNN 的模型导出本质上是**对照 HuggingFace transformers 库中原始模
 
 > **🚨 测试标准要有定力**：每步的通过标准是明确的（如"C++ 能正确描述图片内容"），不能因为"差不多能跑"就跳过。"能感知到一些信号但描述不准确"不等于通过，必须达到与 HF 模型相当的输出质量才算完成。
 
+> **🚨 多模态/embedding 对齐先看 C++ 端真实输入输出**：不要只比 Python 导出逻辑。先直接打印并比对 C++ 运行时的 chat template、token ids、以及必要时的中间输入，确认 runtime tokenizer/Jinja/post_processor 与 HuggingFace 完全一致，再继续看视觉或量化路径。
+
+> **🚨 改导出模板先做“临时 config 覆盖”验证**：当修改 `llm_config.json` / `config.json` 里的 `jinja.chat_template` 时，不要一上来全量重导出模型。先用现有 MNN 模型目录配一个临时 `config.json` 覆盖 `base_dir + jinja/context`，直接跑 C++ `llm_demo`/`embedding_demo` 验证 prompt token 数、构造出的语义和最终结果是否与基线一致，再决定是否全量 re-export。
+
+> **🚨 ASR 模型默认按单轮接口设计**：除非用户明确要求多轮 ASR 会话，否则用户侧一次 `response` 结束后就应立即 `reset` 清空状态，不要把 ASR prompt/template 设计成依赖 trailing assistant、多轮裁剪或 prompt-cache 续写的形态。
+
 > **严禁访问以下目录**：`schema/private/` 和 `source/internal/`，包含内部私有代码，**不得读取、修改或引用**。
 
 > **禁止猜测**：如果不确定某个字段名或路径，必须通过工具读取实际文件确认。
@@ -183,7 +189,7 @@ modeling_*.py 中是否有全新的 Attention 类型（非标准 SDPA）?
 
 ## 常见陷阱
 
-**在开始之前，建议先浏览 `common-pitfalls.md`**，了解已知的常见问题和解决方案（RoPE 变体、dtype 级联、Jinja 限制、stop token、残差模式、MoE 支持要点、FakeLinear axis 陷阱、**do_map 静默失败与 rope_theta 间接存储**、非标准模型加载等）。
+**在开始之前，建议先浏览 `common-pitfalls.md`**，了解已知的常见问题和解决方案（RoPE 变体、dtype 级联、Jinja 限制、tokenizer `post_processor` 对齐、stop token、残差模式、MoE 支持要点、FakeLinear axis 陷阱、**do_map 静默失败与 rope_theta 间接存储**、非标准模型加载等）。
 
 ---
 

diff --git a/skills/support-new-llm/common-pitfalls.md b/skills/support-new-llm/common-pitfalls.md
@@ -431,7 +431,40 @@ VL（Vision-Language）模型的 `config.json` 通常是嵌套结构（`text_con
 
 ---
 
-## 13. do_map 静默失败与 rope_theta 间接存储
+## 13. 外部包模型的注册与复合配置嵌套
+
+### 问题描述
+
+有些模型并不直接由 `transformers` 主包提供，而是依赖外部包在 import 时执行 `AutoConfig.register(...)` / `AutoModel.register(...)` / `AutoProcessor.register(...)`。同时，这类模型的 **原始 `config.json` 字段布局** 和 **运行时 `PretrainedConfig` 对象的属性布局** 可能并不一致。
+
+典型现象：
+
+- `AutoConfig.from_pretrained()` 无法识别模型
+- audio / vision wrapper 初始化时报 `AttributeError`
+
+### 解决方案
+
+1. **先确认模型是否需要外部包注册。**
+   如果模型的 README/官方用法要求通过独立包加载，不要继续猜 `transformers` 原生是否支持，直接下载包并在 `config.py` 的外部 registry 中 import 对应注册模块。
+
+### 最小检查
+
+实现映射后，至少执行一次：
+
+```python
+cfg = LlmConfig.from_pretrained(model_path)
+print(cfg.model_type)
+print(type(cfg.origin_config), cfg.origin_config)
+```
+
+确认：
+
+- 外部包注册已生效
+- 运行时 config 的字段层级和 mapper / audio / vision wrapper 使用的路径一致
+
+---
+
+## 14. do_map 静默失败与 rope_theta 间接存储
 
 `ModelMapper.do_map()` 在源属性不存在时**不会报错**，静默设为 `None`，post-processing 再用默认值覆盖。最常见的受害者是 **rope_theta**：部分模型（如 LFM2）将 `rope_theta` 存在 `rope_parameters` dict 中而非顶层，导致映射 `'rope_theta': 'rope_theta'` 静默失败，rope_theta 被错误回退为 10000。
 
@@ -441,7 +474,7 @@ VL（Vision-Language）模型的 `config.json` 通常是嵌套结构（`text_con
 
 ---
 
-## 14. 非标准模型加载方式
+## 15. 非标准模型加载方式
 
 ### 问题描述
 
@@ -471,3 +504,30 @@ elif model_type == 'lfm2_audio':
 - 非标准加载的模型**权重路径可能不同**于标准 HF 模型，需要通过 `print(original_model)` 或 `state_dict().keys()` 确认实际路径
 - 嵌套的 config 结构可能需要在 `config.py` 的 `from_pretrained` 中手动提取子配置
 - 某些包的注意力实现默认使用 `flash_attention_2`，CPU 上需要手动切换为 `sdpa` 或 `eager`
+
+---
+
+## 16. Audio encoder 导出接口与 C++ runtime 输入约定不一致
+
+### 问题描述
+
+新增音频模型时，Python 导出侧的 audio encoder 接口很容易和 C++ `Omni::audioProcess()` 的既有假设不一致。常见错位包括：
+
+- 导出模型的输入个数或形状与 C++ runtime 的预期不一致
+- 导出配置复用了已有的 `audio_type`，导致 runtime 走错分支
+
+
+### 解决方案
+
+- 为接口不兼容的模型定义独立 `audio_type`，不要复用已有类型名
+- 核对 `export/utils/audio.py` 中导出的 ONNX/MNN 模型输入个数，以及导出模型的 `input_features` 真实 shape 约定
+- 在 `Omni::audioProcess()` 中按该模型的真实导出接口单独处理输入 shape / 输入数量
+
+### 最小验证
+
+至少做两步：
+
+1. `audio-only` prompt + `max_token_number=0`
+   验证音频文件加载、fbank、audio encoder prefill 能走通
+2. 单行 ASR 模板 prompt
+   验证实际 decode 文本与 Python 基线一致
diff --git a/skills/test-ci/SKILL.md b/skills/test-ci/SKILL.md
@@ -66,6 +66,36 @@ Valid filters: `all` (default) · `cpu` · `opencl` · `opencl-image` ·
   `logs/test-<UTC-timestamp>/<stage>.log` — read the named log of a failing
   stage for the trailing output. `rc=137` ≈ OOM-kill, `rc=139` ≈ SIGSEGV.
 
+## Important pitfall for rebuild-driven smoke tests
+
+When a verification depends on a freshly rebuilt binary (for example `llm_demo`
+or `embedding_demo` after resolving a merge conflict), do not trust smoke-test
+results gathered while the target is still compiling or before the final link
+step has completed. A stale executable can report an old runtime failure and
+send debugging in the wrong direction.
+
+Preferred flow:
+
+1. Wait for the target build to finish and confirm the final executable link
+   step succeeded.
+2. Only then rerun the smoke test and treat that result as authoritative.
+
+## Important pitfall for `llm_demo` prompt-file smoke tests
+
+`transformers/llm/engine/demo/llm_demo.cpp` reads prompt files **one line per
+prompt** in the default build. That means a multiline chat template (for
+example an ASR prompt laid out across several lines with `<|im_start|>` /
+`<|im_end|>`) is silently split into multiple independent prompts and usually
+fails or produces empty output.
+
+For multimodal / ASR smoke tests:
+
+* Prefer a **single-line** prompt file for `llm_demo`.
+* If a model requires a full chat template, collapse it to one line before
+  running the test.
+* Do not treat an empty decode or `decode tokens num = 1` from a multiline
+  prompt file as a model failure until you have retried with a one-line prompt.
+
 ## Environment variables
 
 | Var | Mode | Meaning |

diff --git a/source/backend/cpu/compute/ImageProcessFunction.cpp b/source/backend/cpu/compute/ImageProcessFunction.cpp
@@ -776,6 +776,60 @@ static void _sampleBilinearCommon(const unsigned char* source, unsigned char* de
     }
 }
 
+static inline float _cubicWeight(float x) { // Cubic convolution kernel weight for a tap at signed distance x from the sample point.
+    const float a = -0.5f; // Kernel coefficient shared by both polynomial pieces below.
+    x = ::fabsf(x); // The kernel is symmetric, so only |x| is used from here on.
+    if (x <= 1.0f) {
+        return ((a + 2.0f) * x - (a + 3.0f)) * x * x + 1.0f; // (a+2)|x|^3 - (a+3)|x|^2 + 1, written in Horner form.
+    }
+    if (x < 2.0f) {
+        return ((a * x - 5.0f * a) * x + 8.0f * a) * x - 4.0f * a; // a|x|^3 - 5a|x|^2 + 8a|x| - 4a, written in Horner form.
+    }
+    return 0.0f; // Taps at distance 2 or more contribute nothing.
+}
+
+static void _sampleCubicCommon(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t count, // Writes `count` output pixels, each a cubic-weighted blend of a 4x4 source neighborhood.
+                               size_t iw, size_t ih, size_t yStride, size_t bpp) { // iw/ih bound the source coordinates; yStride is the row pitch and bpp the bytes per pixel used for indexing.
+    float dy   = points[1].fY; // points[1] is applied as the per-sample step; points[0] is the starting position.
+    float dx   = points[1].fX;
+    float xMax = iw - 1; // Largest valid column index.
+    float yMax = ih - 1; // Largest valid row index.
+
+    MNN::CV::Point curPoints;
+    curPoints.fX = points[0].fX;
+    curPoints.fY = points[0].fY;
+    for (int i = 0; i < count; ++i) {
+        float y = __clamp(curPoints.fY, 0, yMax); // Presumably bounds the coordinate to [0, yMax]; __clamp is defined outside this view.
+        float x = __clamp(curPoints.fX, 0, xMax);
+        int xBase = (int)floorf(x); // Integer pixel containing the sample position.
+        int yBase = (int)floorf(y);
+        float xF = x - (float)xBase; // Fractional offset of the sample inside that pixel.
+        float yF = y - (float)yBase;
+
+        float wx[4], wy[4];
+        for (int k = 0; k < 4; ++k) {
+            wx[k] = _cubicWeight((float)(k - 1) - xF); // Tap k sits at offset (k - 1) from the base pixel: -1, 0, +1, +2.
+            wy[k] = _cubicWeight((float)(k - 1) - yF);
+        }
+
+        for (int b = 0; b < bpp; ++b) { // Each channel is filtered independently with the same weights.
+            float v = 0.0f;
+            for (int ky = 0; ky < 4; ++ky) {
+                int sy = std::max(0, std::min((int)yMax, yBase + ky - 1)); // Clamp the tap row into the image, so border rows are reused.
+                float wyv = wy[ky];
+                for (int kx = 0; kx < 4; ++kx) {
+                    int sx = std::max(0, std::min((int)xMax, xBase + kx - 1)); // Clamp the tap column into the image likewise.
+                    v += wyv * wx[kx] * source[sy * yStride + bpp * sx + b]; // Interleaved layout: row sy, pixel sx, channel b.
+                }
+            }
+            v = std::min(std::max(v, 0.0f), 255.0f); // Saturate to the 8-bit range; the kernel has negative lobes, so v can overshoot.
+            dest[bpp * i + b] = (unsigned char)roundf(v);
+        }
+        curPoints.fY += dy; // Advance to the next sample position.
+        curPoints.fX += dx;
+    }
+}
+
 void MNNSamplerC4Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
                           size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) {
 #ifdef MNN_USE_NEON
@@ -796,6 +850,18 @@ void MNNSamplerC1Bilinear(const unsigned char* source, unsigned char* dest, MNN:
     _sampleBilinearCommon(source, dest + sta, points, count, iw, ih, yStride, 1);
 #endif
 }
+void MNNSamplerC4Cubic(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, // Cubic sampler for 4-byte-per-pixel images.
+                       size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) { // `capacity` is not used by this implementation.
+    _sampleCubicCommon(source, dest + 4 * sta, points, count, iw, ih, yStride, 4); // Output starts at pixel index `sta`, i.e. 4 * sta bytes into dest.
+}
+void MNNSamplerC3Cubic(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, // Cubic sampler for 3-byte-per-pixel images.
+                       size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) { // `capacity` is not used by this implementation.
+    _sampleCubicCommon(source, dest + 3 * sta, points, count, iw, ih, yStride, 3); // Output starts at pixel index `sta`, i.e. 3 * sta bytes into dest.
+}
+void MNNSamplerC1Cubic(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, // Cubic sampler for 1-byte-per-pixel images.
+                       size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride) { // `capacity` is not used by this implementation.
+    _sampleCubicCommon(source, dest + sta, points, count, iw, ih, yStride, 1); // Output starts at pixel index `sta`, i.e. sta bytes into dest.
+}
 void MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count,
                        size_t iw, size_t ih, size_t yStride, int bpp) {
     dest = dest + bpp * sta;

diff --git a/source/backend/cpu/compute/ImageProcessFunction.hpp b/source/backend/cpu/compute/ImageProcessFunction.hpp
@@ -98,6 +98,12 @@ void MNNSamplerC3Bilinear(const unsigned char* source, unsigned char* dest, MNN:
                           size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
 void MNNSamplerC1Bilinear(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
                           size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
+void MNNSamplerC4Cubic(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
+                       size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
+void MNNSamplerC3Cubic(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
+                       size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
+void MNNSamplerC1Cubic(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,
+                       size_t count, size_t capacity, size_t iw, size_t ih, size_t yStride);
 void MNNSamplerNearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta, size_t count,
                        size_t iw, size_t ih, size_t yStride, int bpp);
 void MNNSamplerC4Nearest(const unsigned char* source, unsigned char* dest, MNN::CV::Point* points, size_t sta,

diff --git a/source/cv/ImageProcessUtils.cpp b/source/cv/ImageProcessUtils.cpp
@@ -177,6 +177,20 @@ SAMPLER ImageProcessUtils::choose(ImageFormat format, Filter type, bool identity
                 break;
         }
     }
+    if (FilterType_BICUBIC == filterType) {
+        switch (formatType) {
+            case ImageFormatType_RGBA:
+            case ImageFormatType_BGRA:
+                return MNNSamplerC4Cubic;
+            case ImageFormatType_GRAY:
+                return MNNSamplerC1Cubic;
+            case ImageFormatType_RGB:
+            case ImageFormatType_BGR:
+                return MNNSamplerC3Cubic;
+            default:
+                break;
+        }
+    }
 
     // Nearest
     switch (formatType) {

diff --git a/transformers/llm/engine/demo/llm_demo.cpp b/transformers/llm/engine/demo/llm_demo.cpp
@@ -105,7 +105,6 @@ static int benchmark(Llm* llm, const std::vector<std::string>& prompts, int max_
         if (prompt.substr(0, 1) == "#") {
             continue;
         }
-
         if (max_token_number >= 0) {
             llm->response(prompt, &std::cout, nullptr, 0);
             while (!llm->stoped() && context->gen_seq_len < max_token_number) {

diff --git a/transformers/llm/engine/demo/tokenizer_demo.cpp b/transformers/llm/engine/demo/tokenizer_demo.cpp
@@ -12,7 +12,7 @@ using namespace MNN::Transformer;
 
 int main(int argc, const char* argv[]) {
     if (argc < 2) {
-        std::cout << "Usage: " << argv[0] << " tokenizer.txt [bench|test]" << std::endl;
+        std::cout << "Usage: " << argv[0] << " tokenizer.txt [bench|test|encode] [text]" << std::endl;
         return 0;
     }
     std::string tokenizer_path = argv[1];
@@ -96,6 +96,23 @@ int main(int argc, const char* argv[]) {
         return 0;
     }
 
+    if (mode == "encode") {
+        if (argc < 4) {
+            std::cerr << "Usage: " << argv[0] << " tokenizer.txt encode \"text\"" << std::endl;
+            return 1;
+        }
+        std::unique_ptr<Tokenizer> tokenizer(Tokenizer::createTokenizer(tokenizer_path));
+        auto ids = tokenizer->encode(argv[3]);
+        std::cout << "len=" << ids.size() << std::endl;
+        std::cout << "ids=[";
+        for (size_t i = 0; i < ids.size(); ++i) {
+            if (i > 0) std::cout << ", ";
+            std::cout << ids[i];
+        }
+        std::cout << "]" << std::endl;
+        return 0;
+    }
+
     // Default mode: encode + decode correctness test
     auto t0 = std::chrono::high_resolution_clock::now();
     std::unique_ptr<Tokenizer> tokenizer(Tokenizer::createTokenizer(tokenizer_path));

diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp
@@ -267,7 +267,7 @@ class MNN_PUBLIC Embedding : public Llm {
     static float cos_sim(Express::VARP var0, Express::VARP var1);
     virtual bool load() override;
 
-    Express::VARP ids_embedding(const std::vector<int>& ids);
+    virtual Express::VARP ids_embedding(const std::vector<int>& ids);
     Express::VARP txt_embedding(const std::string& txt);
     std::vector<Express::VARP> forwardRaw(Express::VARP hiddenState, Express::VARP mask, Express::VARP inputPos, Express::VARPS extraArgs = {}) override;
     int dim() const;

diff --git a/transformers/llm/engine/src/embedding.cpp b/transformers/llm/engine/src/embedding.cpp
@@ -6,6 +6,7 @@
 //
 
 #include "llm/llm.hpp"
+#include "omni.hpp"
 #include "llmconfig.hpp"
 #include "tokenizer/tokenizer.hpp"
 #include "diskembedding.hpp"
@@ -31,7 +32,8 @@ float Embedding::cos_sim(VARP var0, VARP var1) {
 
 Embedding* Embedding::createEmbedding(const std::string& config_path, bool load) {
     std::shared_ptr<LlmConfig> config(new LlmConfig(config_path));
-    Embedding* embedding = new Embedding(config);
+    Embedding* embedding = config->is_visual() ? static_cast<Embedding*>(new Omni(config))
+                                               : new Embedding(config);
     if (load) {
         embedding->load();
     }
@@ -53,11 +55,8 @@ bool Embedding::load() {
     }
 
     initRuntime();
-    printf("load tokenizer\n");
-    std::cout << mConfig->tokenizer_file() << std::endl;
     // 1. load vocab
     mTokenizer.reset(Tokenizer::createTokenizer(mConfig->tokenizer_file()));
-    printf("load tokenizer Done\n");
     mDiskEmbedding.reset(new DiskEmbedding(mConfig));
     setChatTemplate();
     // 2. load model
@@ -69,9 +68,12 @@ bool Embedding::load() {
     }
     module_config.rearrange    = true;
     auto model_path            = mConfig->llm_model();
+    auto weight_path           = mConfig->llm_weight();
     MNN_PRINT("load %s ... ", model_path.c_str());
+    mRuntimeManager->setExternalFile(weight_path);
     mModule.reset(Module::load({"input_ids", "attention_mask", "position_ids"}, {"sentence_embeddings"},
                                    model_path.c_str(), mRuntimeManager, &module_config));
+    mRuntimeManager->setExternalFile("");
     if (nullptr == mModule.get()) {
         return false;
     }