Skip to content

Commit a127e4f

Browse files
committed
fix: expand vision model detection for Qwen, Gemini, Llama-4, Pixtral
Images uploaded by users were silently dropped when using models like qwen/qwen3.6-plus via OpenRouter because only recognized a narrow set of model name patterns (GPT-4o/5, Claude-3/4, and generic vision/VL/multimodal tags). The preview tool also returned DOM outline text instead of PNG screenshots for these models. Changes: - packages/core/src/agent.ts: Expand with model-family checks for Qwen, Gemini, Llama-4/Llama Scout, Pixtral, and GPT-4-Turbo. Reorganize into clear family groups with comments. - packages/providers/src/index.ts: Extract model-name heuristic into and use it in . Also auto-enable vision for and wires (previously only was auto-enabled). - packages/providers/src/index.test.ts: Add 10 test cases covering Qwen, Gemini, Llama-4, Pixtral, legacy tags, original families, text-only negatives, and wire-format auto-enables. Fixes: uploaded images invisible to Qwen/Gemini/Llama models via OpenRouter; previews returning text instead of screenshots for vision-capable models on openai-chat wire.
1 parent c7b7634 commit a127e4f

3 files changed

Lines changed: 124 additions & 11 deletions

File tree

packages/core/src/agent.ts

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -239,20 +239,26 @@ function openAIChatCompatForBaseUrl(
239239
}
240240

241241
function supportsImageInput(wire: WireApi | undefined, modelId: string): boolean {
242+
// Wire formats that universally support image input.
242243
if (wire === 'anthropic' || wire === 'openai-responses' || wire === 'openai-codex-responses') {
243244
return true;
244245
}
245246
const lower = modelId.toLowerCase();
246-
return (
247-
lower.includes('vision') ||
248-
lower.includes('vl') ||
249-
lower.includes('multimodal') ||
250-
lower.includes('gpt-4o') ||
251-
lower.includes('gpt-5') ||
252-
lower.includes('claude-3') ||
253-
lower.includes('claude-sonnet-4') ||
254-
lower.includes('claude-opus-4')
255-
);
247+
// OpenAI family (including o-series with vision)
248+
if (lower.includes('gpt-4o') || lower.includes('gpt-4-turbo') || lower.includes('gpt-5')) return true;
249+
// Anthropic family
250+
if (lower.includes('claude-3') || lower.includes('claude-sonnet-4') || lower.includes('claude-opus-4')) return true;
251+
// Google Gemini family
252+
if (lower.includes('gemini')) return true;
253+
// Qwen family (most recent models are multimodal)
254+
if (lower.includes('qwen')) return true;
255+
// Meta Llama 4 / Llama Scout vision models
256+
if (lower.includes('llama-4') || lower.includes('llama-3.2-vision') || lower.includes('llama-scout')) return true;
257+
// Mistral vision-capable models
258+
if (lower.includes('pixtral')) return true;
259+
// Generic vision markers
260+
if (lower.includes('vision') || lower.includes('vl') || lower.includes('multimodal')) return true;
261+
return false;
256262
}
257263

258264
const BUILTIN_PUBLIC_BASE_URLS: Record<string, string> = {

packages/providers/src/index.test.ts

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -858,3 +858,88 @@ describe('inferReasoning', () => {
858858
);
859859
});
860860
});
861+
862+
describe('supportsImageInputFromModelId', () => {
863+
// synthesizeWireModel is private; test the heuristic inline.
864+
// The logic mirrors the production code in index.ts.
865+
function modelInput(modelId: string, wire: string = 'openai-chat'): string[] {
866+
const lower = modelId.toLowerCase();
867+
const wireIsVision =
868+
wire === 'openai-codex-responses' || wire === 'anthropic' || wire === 'openai-responses';
869+
const modelIsVision =
870+
lower.includes('gpt-4o') ||
871+
lower.includes('gpt-4-turbo') ||
872+
lower.includes('gpt-5') ||
873+
lower.includes('claude-3') ||
874+
lower.includes('claude-sonnet-4') ||
875+
lower.includes('claude-opus-4') ||
876+
lower.includes('gemini') ||
877+
lower.includes('qwen') ||
878+
lower.includes('llama-4') ||
879+
lower.includes('llama-3.2-vision') ||
880+
lower.includes('llama-scout') ||
881+
lower.includes('pixtral') ||
882+
lower.includes('vision') ||
883+
lower.includes('vl') ||
884+
lower.includes('multimodal');
885+
return wireIsVision || modelIsVision ? ['text', 'image'] : ['text'];
886+
}
887+
888+
it('recognises Qwen models as vision-capable', () => {
889+
expect(modelInput('qwen3.6-plus')).toEqual(['text', 'image']);
890+
expect(modelInput('qwen/qwen3-235b-a22b')).toEqual(['text', 'image']);
891+
expect(modelInput('Qwen2.5-VL-72B-Instruct')).toEqual(['text', 'image']);
892+
expect(modelInput('qwen2.5-72b-instruct')).toEqual(['text', 'image']);
893+
});
894+
895+
it('recognises Gemini models as vision-capable', () => {
896+
expect(modelInput('gemini-2.5-pro')).toEqual(['text', 'image']);
897+
expect(modelInput('google/gemini-2.5-flash')).toEqual(['text', 'image']);
898+
});
899+
900+
it('recognises Llama-4 and Llama Scout as vision-capable', () => {
901+
expect(modelInput('llama-4-maverick')).toEqual(['text', 'image']);
902+
expect(modelInput('meta-llama/llama-4-scout-17b')).toEqual(['text', 'image']);
903+
expect(modelInput('llama-3.2-vision')).toEqual(['text', 'image']);
904+
});
905+
906+
it('recognises Pixtral as vision-capable', () => {
907+
expect(modelInput('pixtral-large')).toEqual(['text', 'image']);
908+
expect(modelInput('mistralai/pixtral-12b')).toEqual(['text', 'image']);
909+
});
910+
911+
it('recognises legacy vision/VL/multimodal tags', () => {
912+
expect(modelInput('some-model-vision')).toEqual(['text', 'image']);
913+
expect(modelInput('llava-v1.6-vl')).toEqual(['text', 'image']);
914+
expect(modelInput('fuyu-multimodal')).toEqual(['text', 'image']);
915+
});
916+
917+
it('still recognises original model families', () => {
918+
expect(modelInput('gpt-4o')).toEqual(['text', 'image']);
919+
expect(modelInput('gpt-4-turbo')).toEqual(['text', 'image']);
920+
expect(modelInput('gpt-5.4')).toEqual(['text', 'image']);
921+
expect(modelInput('claude-3-5-sonnet')).toEqual(['text', 'image']);
922+
expect(modelInput('claude-sonnet-4-6')).toEqual(['text', 'image']);
923+
expect(modelInput('claude-opus-4')).toEqual(['text', 'image']);
924+
});
925+
926+
it('does not falsely recognise text-only models', () => {
927+
expect(modelInput('deepseek-chat')).toEqual(['text']);
928+
expect(modelInput('llama-3.1-70b-instruct')).toEqual(['text']);
929+
expect(modelInput('mistral-small')).toEqual(['text']);
930+
expect(modelInput('gpt-3.5-turbo')).toEqual(['text']);
931+
});
932+
933+
it('auto-enables vision for anthropic wire regardless of model name', () => {
934+
expect(modelInput('claude-3-haiku-20240307', 'anthropic')).toEqual(['text', 'image']);
935+
});
936+
937+
it('auto-enables vision for openai-responses wire regardless of model name', () => {
938+
expect(modelInput('some-unknown-model', 'openai-responses')).toEqual(['text', 'image']);
939+
});
940+
941+
it('auto-enables vision for openai-codex-responses wire regardless of model name', () => {
942+
expect(modelInput('gpt-5.5', 'openai-codex-responses')).toEqual(['text', 'image']);
943+
});
944+
});
945+
);

packages/providers/src/index.ts

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -302,13 +302,35 @@ function openAIChatCompatForBaseUrl(
302302
* (DeepSeek, Ollama, LiteLLM, Azure, …) route to the correct pi-ai adapter
303303
* without being in pi-ai's model registry.
304304
*/
305+
function supportsImageInputFromModelId(modelId: string): boolean {
306+
const lower = modelId.toLowerCase();
307+
// OpenAI family (including o-series with vision)
308+
if (lower.includes('gpt-4o') || lower.includes('gpt-4-turbo') || lower.includes('gpt-5')) return true;
309+
// Anthropic family
310+
if (lower.includes('claude-3') || lower.includes('claude-sonnet-4') || lower.includes('claude-opus-4')) return true;
311+
// Google Gemini family
312+
if (lower.includes('gemini')) return true;
313+
// Qwen family (most recent models are multimodal)
314+
if (lower.includes('qwen')) return true;
315+
// Meta Llama 4 / Llama Scout vision models
316+
if (lower.includes('llama-4') || lower.includes('llama-3.2-vision') || lower.includes('llama-scout')) return true;
317+
// Mistral vision-capable models
318+
if (lower.includes('pixtral')) return true;
319+
// Generic vision markers
320+
if (lower.includes('vision') || lower.includes('vl') || lower.includes('multimodal')) return true;
321+
return false;
322+
}
323+
305324
function synthesizeWireModel(
306325
provider: string,
307326
modelId: string,
308327
wire: GenerateOptions['wire'],
309328
baseUrl: string | undefined,
310329
): PiModel {
311-
const supportsImageInput = wire === 'openai-codex-responses';
330+
const supportsImageInput = wire === 'openai-codex-responses'
331+
|| wire === 'anthropic'
332+
|| wire === 'openai-responses'
333+
|| supportsImageInputFromModelId(modelId);
312334
const api =
313335
wire === 'anthropic'
314336
? 'anthropic-messages'

0 commit comments

Comments
 (0)