Skip to content

Commit 856ec00

Browse files
authored
fix: preserve tool result images in chat completions (#626)
1 parent b253a82 commit 856ec00

3 files changed

Lines changed: 150 additions & 7 deletions

File tree

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
"@moonshot-ai/kosong": patch
3+
"@moonshot-ai/kimi-code": patch
4+
---
5+
6+
Preserve image outputs from tools when using OpenAI-compatible chat completions.

packages/kosong/src/providers/openai-legacy.ts

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,8 @@ const OPENAI_CHAT_TOOL_CALL_ID_POLICY: ToolCallIdPolicy = {
5050
normalize: (id) => sanitizeToolCallId(id, 64),
5151
maxLength: 64,
5252
};
53+
// Leading text part of the synthetic `user` message that carries images collected from tool results.
const TOOL_RESULT_IMAGE_PROMPT = 'Attached image(s) from tool result:';
54+
// Stand-in tool message content, used when the converted content is an empty string but the tool result contains an `image_url` part.
const TOOL_RESULT_IMAGE_PLACEHOLDER = '(see attached image)';
5355

5456
function extractReasoningContent(
5557
source: unknown,
@@ -165,7 +167,7 @@ function convertMessage(
165167
: toolMessageConversion;
166168

167169
if (effectiveConversion !== null) {
168-
result.content = convertToolMessageContent(message, effectiveConversion);
170+
result.content = convertToolMessageContentForChat(message, effectiveConversion);
169171
} else {
170172
// Pure-text tool result with no conversion configured: serialize via the
171173
// generic content-part path so single-text messages become a plain string.
@@ -217,6 +219,70 @@ function convertMessage(
217219

218220
return result;
219221
}
222+
223+
/**
 * Converts a tool message's content via `convertToolMessageContent`, then
 * substitutes a placeholder when the result would otherwise be empty.
 *
 * The placeholder is used only when the converted content is an empty string
 * AND the original message holds at least one `image_url` part. Every other
 * result (non-empty string, content-part array, or an empty string for a
 * message without images) is returned unchanged.
 *
 * @param message - The tool message whose content is being serialized.
 * @param conversion - Conversion mode forwarded to `convertToolMessageContent`.
 * @returns The converted content, or `TOOL_RESULT_IMAGE_PLACEHOLDER`.
 */
function convertToolMessageContentForChat(
  message: Message,
  conversion: ToolMessageConversion,
): string | OpenAIContentPart[] {
  const converted = convertToolMessageContent(message, conversion);

  // Only an empty string is a candidate for the placeholder.
  if (converted !== '') return converted;

  const carriesImage = message.content.some(({ type }) => type === 'image_url');
  return carriesImage ? TOOL_RESULT_IMAGE_PLACEHOLDER : converted;
}
237+
238+
/**
 * Collects the `image_url` parts of a message as OpenAI content parts.
 *
 * Non-image parts are dropped and the original order is kept. The `id` key is
 * emitted only when the source image defines one, so the output never
 * contains an explicit `id: undefined`.
 *
 * @param message - Message whose content parts are scanned.
 * @returns One `image_url` content part per image in `message.content`.
 */
function toolResultImageParts(message: Message): OpenAIContentPart[] {
  return message.content.flatMap((part): OpenAIContentPart[] => {
    if (part.type !== 'image_url') return [];

    const { url, id } = part.imageUrl;
    return [{ type: 'image_url', image_url: id === undefined ? { url } : { url, id } }];
  });
}
252+
253+
/**
 * Flushes buffered tool-result images into `messages` as a single `user`
 * message, then empties the buffer.
 *
 * The emitted message starts with a text part holding
 * `TOOL_RESULT_IMAGE_PROMPT`, followed by the buffered image parts in order.
 * Nothing happens when the buffer is empty.
 *
 * @param messages - Output list; mutated by appending at most one message.
 * @param pendingToolResultImages - Image buffer; emptied in place when flushed.
 */
function appendToolResultImagesMessage(
  messages: OpenAIMessage[],
  pendingToolResultImages: OpenAIContentPart[],
): void {
  if (pendingToolResultImages.length > 0) {
    // splice(0) clears the caller's array in place and returns what it held.
    const drained = pendingToolResultImages.splice(0);
    messages.push({
      role: 'user',
      content: [{ type: 'text', text: TOOL_RESULT_IMAGE_PROMPT }, ...drained],
    });
  }
}
264+
265+
/**
 * Converts a message history into OpenAI messages, re-attaching images found
 * in tool results as a follow-up `user` message.
 *
 * Images from a run of consecutive `tool` messages are buffered and emitted
 * together: immediately before the next non-tool message, or at the end of the
 * history if the run is the last thing in it.
 *
 * @param history - Messages to convert, in order.
 * @param reasoningKey - Forwarded unchanged to `convertMessage`.
 * @param toolMessageConversion - Forwarded unchanged to `convertMessage`.
 * @returns The converted messages, including any synthetic image messages.
 */
function convertHistoryMessages(
  history: readonly Message[],
  reasoningKey: string | undefined,
  toolMessageConversion: ToolMessageConversion,
): OpenAIMessage[] {
  const converted: OpenAIMessage[] = [];
  const heldImages: OpenAIContentPart[] = [];

  for (const entry of history) {
    if (entry.role === 'tool') {
      // Keep the tool message itself first; its images wait for the flush.
      converted.push(convertMessage(entry, reasoningKey, toolMessageConversion));
      heldImages.push(...toolResultImageParts(entry));
    } else {
      // A non-tool message ends the current run of tool results.
      appendToolResultImagesMessage(converted, heldImages);
      converted.push(convertMessage(entry, reasoningKey, toolMessageConversion));
    }
  }

  // Flush images left over when the history ends on a tool message.
  appendToolResultImagesMessage(converted, heldImages);
  return converted;
}
220286
export class OpenAILegacyStreamedMessage implements StreamedMessage {
221287
private _id: string | null = null;
222288
private _usage: TokenUsage | null = null;
@@ -437,9 +503,9 @@ export class OpenAILegacyChatProvider implements ChatProvider {
437503
history,
438504
OPENAI_CHAT_TOOL_CALL_ID_POLICY,
439505
);
440-
for (const msg of normalizedHistory) {
441-
messages.push(convertMessage(msg, this._reasoningKey, this._toolMessageConversion));
442-
}
506+
messages.push(
507+
...convertHistoryMessages(normalizedHistory, this._reasoningKey, this._toolMessageConversion),
508+
);
443509

444510
const kwargs: Record<string, unknown> = normalizeGenerationKwargs(
445511
this._model,

packages/kosong/test/openai-legacy.test.ts

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -287,7 +287,7 @@ describe('OpenAILegacyChatProvider', () => {
287287
]);
288288
});
289289

290-
it('tool call with image result flattens to text to satisfy API constraints', async () => {
290+
it('tool call with image result keeps the tool result textual and reattaches images as user input', async () => {
291291
// OpenAI Chat Completions `tool` messages only accept text content.
292292
// Even when toolMessageConversion is unset, a tool result containing
293293
// image_url / audio_url / video_url parts must not be serialized as a
@@ -319,15 +319,86 @@ describe('OpenAILegacyChatProvider', () => {
319319
];
320320
const body = await captureRequestBody(provider, '', [], history);
321321

322-
const toolMsg = (body['messages'] as Record<string, unknown>[])[2]!;
322+
const messages = body['messages'] as Record<string, unknown>[];
323+
const toolMsg = messages[2]!;
323324
expect(toolMsg['role']).toBe('tool');
324325
expect(toolMsg['tool_call_id']).toBe('call_abc123');
325326
// Content must be a plain string, not a content-part array.
326327
expect(typeof toolMsg['content']).toBe('string');
327328
// The text segment must survive; the image must not appear as a
328-
// structured image_url part anywhere in the serialized content.
329+
// structured image_url part inside the tool message.
329330
expect(toolMsg['content']).toContain('5');
330331
expect(Array.isArray(toolMsg['content'])).toBe(false);
332+
expect(messages[3]).toEqual({
333+
role: 'user',
334+
content: [
335+
{ type: 'text', text: 'Attached image(s) from tool result:' },
336+
{ type: 'image_url', image_url: { url: 'https://example.com/image.png' } },
337+
],
338+
});
339+
});
340+
341+
it('groups consecutive tool result images after all matching tool messages', async () => {
342+
const provider = createProvider();
343+
const history: Message[] = [
344+
{ role: 'user', content: [{ type: 'text', text: 'Fetch both images' }], toolCalls: [] },
345+
{
346+
role: 'assistant',
347+
content: [{ type: 'text', text: 'ok' }],
348+
toolCalls: [
349+
{ type: 'function', id: 'call_first', name: 'first_image', arguments: '{}' },
350+
{ type: 'function', id: 'call_second', name: 'second_image', arguments: '{}' },
351+
],
352+
},
353+
{
354+
role: 'tool',
355+
content: [
356+
{ type: 'image_url', imageUrl: { url: 'https://example.com/first.png' } },
357+
],
358+
toolCallId: 'call_first',
359+
toolCalls: [],
360+
},
361+
{
362+
role: 'tool',
363+
content: [
364+
{ type: 'text', text: 'second' },
365+
{ type: 'image_url', imageUrl: { url: 'https://example.com/second.png' } },
366+
],
367+
toolCallId: 'call_second',
368+
toolCalls: [],
369+
},
370+
];
371+
const body = await captureRequestBody(provider, '', [], history);
372+
373+
expect(body['messages']).toEqual([
374+
{ role: 'user', content: 'Fetch both images' },
375+
{
376+
role: 'assistant',
377+
content: 'ok',
378+
tool_calls: [
379+
{
380+
type: 'function',
381+
id: 'call_first',
382+
function: { name: 'first_image', arguments: '{}' },
383+
},
384+
{
385+
type: 'function',
386+
id: 'call_second',
387+
function: { name: 'second_image', arguments: '{}' },
388+
},
389+
],
390+
},
391+
{ role: 'tool', content: '(see attached image)', tool_call_id: 'call_first' },
392+
{ role: 'tool', content: 'second', tool_call_id: 'call_second' },
393+
{
394+
role: 'user',
395+
content: [
396+
{ type: 'text', text: 'Attached image(s) from tool result:' },
397+
{ type: 'image_url', image_url: { url: 'https://example.com/first.png' } },
398+
{ type: 'image_url', image_url: { url: 'https://example.com/second.png' } },
399+
],
400+
},
401+
]);
331402
});
332403

333404
it('parallel tool calls', async () => {

0 commit comments

Comments
 (0)