Skip to content

Commit a30f7a6

Browse files
committed
[Bugfix] DeepSeek V4 reasoning parser: don't split DSML tool-call marker across streaming deltas
In thinking mode the V4 reasoning parser treats the DSML tool-call start marker (<|DSML|tool_calls>) as an implicit end-of-reasoning when the model omits </think>. Detection used text.find(marker), which only matches the complete marker; when the marker arrives split across streaming deltas ("<|DSML|tool" then "_calls>"), the partial prefix is emitted as reasoning and only "_calls>" reaches content, so the tool parser never sees the start token. The tool call leaks as text and the agent loop ends with nothing to dispatch. Relates to vllm-project#41132 and vllm-project#40801.

Fix: hold back the trailing characters of current_text that form a partial prefix of an implicit-end marker (via partial_tag_overlap) until a later delta resolves them; once the marker completes, hand the whole marker (taken from current_text, so a straddled marker stays intact) to content. Bookkeeping is delta-relative: the amount already emitted as reasoning is len(previous_text) - held_len, which preserves the existing within-delta path.

Signed-off-by: tobymao <toby.mao@gmail.com>
1 parent a66054f commit a30f7a6

2 files changed

Lines changed: 83 additions & 13 deletions

File tree

tests/reasoning/test_deepseekv4_reasoning_parser.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,3 +316,49 @@ def test_single_end_token_delta_returns_none(parser):
316316
delta_token_ids=[END_TOKEN_ID],
317317
)
318318
assert out is None
319+
320+
321+
def test_implicit_end_marker_split_across_deltas(parser):
322+
"""Regression (vLLM #41132 / #40801): the DSML tool-call start marker can
323+
arrive split across streaming deltas ("<|DSML|tool" then "_calls>"). The
324+
partial prefix must not leak into reasoning, and the completed marker must
325+
reach content whole so the tool parser sees its start token. Previously the
326+
prefix was emitted as reasoning and only "_calls>" reached content, so the
327+
tool call was lost and the agent loop ended with nothing to dispatch.
328+
"""
329+
# 1. plain reasoning flows through
330+
d1 = parser.extract_reasoning_streaming(
331+
previous_text="",
332+
current_text="thinking ",
333+
delta_text="thinking ",
334+
previous_token_ids=[],
335+
current_token_ids=[300],
336+
delta_token_ids=[300],
337+
)
338+
assert d1 is not None
339+
assert d1.reasoning == "thinking "
340+
assert d1.content is None
341+
342+
# 2. partial marker prefix -- held back, never emitted as reasoning
343+
d2 = parser.extract_reasoning_streaming(
344+
previous_text="thinking ",
345+
current_text="thinking <|DSML|tool",
346+
delta_text="<|DSML|tool",
347+
previous_token_ids=[300],
348+
current_token_ids=[300, 301],
349+
delta_token_ids=[301],
350+
)
351+
assert "DSML" not in ((d2.reasoning if d2 else None) or "")
352+
353+
# 3. marker completes -- whole marker reaches content, nothing leaks
354+
d3 = parser.extract_reasoning_streaming(
355+
previous_text="thinking <|DSML|tool",
356+
current_text="thinking " + DSML_MARKER,
357+
delta_text="_calls>",
358+
previous_token_ids=[300, 301],
359+
current_token_ids=[300, 301, 302],
360+
delta_token_ids=[302],
361+
)
362+
assert d3 is not None
363+
assert (d3.reasoning or "") == ""
364+
assert d3.content == DSML_MARKER

vllm/reasoning/deepseek_v4_reasoning_parser.py

Lines changed: 37 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs):
5858
# the rest of the stream is content and the orchestrator's
5959
# is_reasoning_end check must return True for every subsequent delta.
6060
self._implicit_end_seen: bool = False
61+
# Length of the trailing partial implicit-end marker held back from the
62+
# previous streaming delta (0 if none). With previous_text this lets us
63+
# recover how much has already been emitted as reasoning, so a marker
64+
# split across deltas is never half-emitted.
65+
self._held_len: int = 0
6166

6267
def _find_implicit_end_marker(self, text: str) -> tuple[str, int] | None:
6368
"""Return ``(marker, index)`` of the earliest implicit end marker in
@@ -141,21 +146,40 @@ def extract_reasoning_streaming(
141146

142147
marker_in_current = self._find_implicit_end_marker(current_text)
143148
if marker_in_current is None:
144-
# No marker anywhere; parent's classification stands.
145-
return ret
146-
147-
# First sighting of the implicit end marker.
149+
# No COMPLETE marker yet, but the tail of current_text may be a
150+
# partial implicit-end marker split across streaming deltas
151+
# (e.g. "<|DSML|tool" before "_calls>" arrives). Emitting that
152+
# partial prefix as reasoning would corrupt the marker so the tool
153+
# parser never sees its start token. Hold those trailing bytes back
154+
# until a later delta resolves them. Local import reuses the
155+
# canonical helper without a module-load reasoning->tool_parsers
156+
# import cycle.
157+
from vllm.tool_parsers.utils import partial_tag_overlap
158+
159+
# Reasoning already emitted == previous_text minus the bytes held
160+
# back at the end of the previous delta (delta-relative, so it does
161+
# not assume earlier reasoning flowed through this method).
162+
prev_emitted = len(previous_text) - self._held_len
163+
overlap = max(
164+
(partial_tag_overlap(current_text, m) for m in self.implicit_end_markers),
165+
default=0,
166+
)
167+
self._held_len = overlap
168+
sendable = len(current_text) - overlap
169+
if sendable > prev_emitted:
170+
return DeltaMessage(reasoning=current_text[prev_emitted:sendable])
171+
# Everything new is a potential partial marker -- emit nothing.
172+
return None
173+
174+
# First sighting of the COMPLETE implicit end marker. Emit reasoning
175+
# between what we have already sent and the marker, then hand the whole
176+
# marker (reconstructed from current_text, so a straddled marker stays
177+
# intact) plus anything after it to content for the tool parser.
148178
self._implicit_end_seen = True
149179
_marker_str, marker_idx_current = marker_in_current
150-
# Position within delta_text where the marker begins.
151-
marker_idx_delta = marker_idx_current - len(previous_text)
152-
if marker_idx_delta < 0:
153-
# Marker straddles into previous_text but wasn't detected there
154-
# (parent path didn't hit). Treat all of delta_text as content.
155-
return DeltaMessage(content=delta_text)
156-
157-
reasoning_part = delta_text[:marker_idx_delta] or None
158-
content_part = delta_text[marker_idx_delta:] or None
180+
prev_emitted = len(previous_text) - self._held_len
181+
reasoning_part = current_text[prev_emitted:marker_idx_current] or None
182+
content_part = current_text[marker_idx_current:] or None
159183
if reasoning_part is None and content_part is None:
160184
return ret
161185
return DeltaMessage(reasoning=reasoning_part, content=content_part)

0 commit comments

Comments
 (0)