8 changes: 8 additions & 0 deletions packages/lmi/src/lmi/llms.py
@@ -1040,6 +1040,7 @@ async def acompletion(self, messages: list[Message], **kwargs) -> list[LLMResult
cost=cost,
system_fingerprint=completions.system_fingerprint,
reasoning_content=reasoning_content,
finish_reason=choice.finish_reason,
)
)
return results
@@ -1079,6 +1080,7 @@ async def acompletion_iter(
role = None
reasoning_content = []
used_model = None
choice = None
Collaborator

This seems to be an unused local?
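A hypothetical sketch (not part of this diff) of what the streaming loop would presumably need for the later `finish_reason` extraction to see a value, i.e. the point at which `choice` would stop being an unused local:

```python
# Hypothetical sketch, not shown in this diff: keep the most recent chunk's
# choice so finish_reason (usually only set on the final streamed chunk)
# survives the loop for the extraction below.
async for completion in stream_completions:
    if completion.choices:
        choice = completion.choices[0]
    # ... existing delta/text accumulation ...
```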

async for completion in stream_completions:
if not used_model:
used_model = completion.model or self.name
@@ -1106,6 +1108,11 @@ async def acompletion_iter(
except Exception as e:
logger.warning(f"Failed to calculate cost for {used_model}: {e}")

# Extract finish_reason from the last completion chunk
finish_reason = (
getattr(choice, "finish_reason", None) if choice else None
)
Comment on lines +1112 to +1114
Collaborator

Don't we know the type here? Do we need this getattr?
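A minimal sketch of the simplification the comment appears to suggest, assuming `choice` is always either `None` or a streamed chunk's choice object that exposes `finish_reason`:

```python
# If the chunk choice type is known to carry finish_reason, plain attribute
# access is enough and the getattr default becomes unnecessary.
finish_reason = choice.finish_reason if choice is not None else None
```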


result = LLMResult(
model=used_model,
text=text,
@@ -1119,6 +1126,7 @@
cache_read_tokens=cache_read,
cache_creation_tokens=cache_creation,
cost=cost,
finish_reason=finish_reason,
)

if text:
4 changes: 4 additions & 0 deletions packages/lmi/src/lmi/types.py
@@ -123,6 +123,10 @@ class LLMResult(BaseModel):
reasoning_content: str | None = Field(
default=None, description="Reasoning content from LLMs such as DeepSeek-R1."
)
finish_reason: str | None = Field(
default=None,
description="The reason the model stopped generating tokens (e.g., 'stop', 'length', 'tool_calls', 'refusal').",
Collaborator

Talk about why it would be None

)
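One possible rewording that addresses the comment above, assuming the main reasons for a `None` value are a result that was never populated from a completion and providers or streams that do not report a finish reason (hypothetical wording, not part of this diff):

```python
finish_reason: str | None = Field(
    default=None,
    description=(
        "The reason the model stopped generating tokens, e.g. 'stop',"
        " 'length', 'tool_calls', or 'refusal'. None when the result was not"
        " populated from a completion or the provider/stream did not report"
        " a finish reason."
    ),
)
```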

def __str__(self) -> str:
return self.text or ""
127 changes: 127 additions & 0 deletions packages/lmi/tests/test_llms.py
@@ -1201,10 +1201,137 @@ def mock_router_method(_self, _override_config=None):

assert results.text == "I'm sorry, but I can't assist with that request."
assert results.model == CommonLLMNames.GPT_41.value
assert results.finish_reason == "stop"
assert "the llm request was refused" in caplog.text.lower()
assert "attempting to fallback" in caplog.text.lower()


@pytest.mark.asyncio
async def test_finish_reason_stored_in_result() -> None:
"""Test that finish_reason is properly stored in LLMResult for different scenarios."""
llm = LiteLLMModel(name=CommonLLMNames.GPT_4O.value)

messages = [Message(content="Say 'hello'")]

# Mock the router to simulate different finish reasons
mock_router_obj = Mock()

# Test with "stop" finish reason (normal completion)
mock_completion = Mock()
mock_message = Mock(content="hello", reasoning_content="")
mock_message.model_dump.return_value = {
"role": "assistant",
"content": "hello",
}
mock_completion.choices = [
Mock(
finish_reason="stop",
message=mock_message,
)
]
mock_completion.usage = Mock(prompt_tokens=5, completion_tokens=1)
mock_completion.model = CommonLLMNames.GPT_4O.value
mock_completion.system_fingerprint = None

mock_router_obj.acompletion = AsyncMock(return_value=mock_completion)

def mock_router_method(_self, _override_config=None):
return mock_router_obj

with patch.object(LiteLLMModel, "get_router", new=mock_router_method):
results = await llm.call_single(messages)

assert results.finish_reason == "stop"
assert results.text == "hello"

# Test with "length" finish reason (hit token limit)
mock_message_length = Mock(content="truncated text", reasoning_content="")
mock_message_length.model_dump.return_value = {
"role": "assistant",
"content": "truncated text",
}
mock_completion_length = Mock()
mock_completion_length.choices = [
Mock(
finish_reason="length",
message=mock_message_length,
)
]
mock_completion_length.usage = Mock(prompt_tokens=5, completion_tokens=100)
mock_completion_length.model = CommonLLMNames.GPT_4O.value
mock_completion_length.system_fingerprint = None

mock_router_obj.acompletion = AsyncMock(return_value=mock_completion_length)

with patch.object(LiteLLMModel, "get_router", new=mock_router_method):
results_length = await llm.call_single(messages)

assert results_length.finish_reason == "length"
assert results_length.text == "truncated text"


@pytest.mark.asyncio
async def test_finish_reason_in_streaming() -> None:
"""Test that finish_reason is properly captured in streaming completions."""
model = LiteLLMModel(name=CommonLLMNames.OPENAI_TEST.value)
messages = [Message(content="Say hello")]

def _build_mock_completion(
delta_content: str = "",
delta_role: str = "assistant",
finish_reason: str | None = None,
usage: Any = None,
) -> Mock:
# Create delta with spec to prevent auto-creation of attributes
mock_delta = Mock(spec=["content", "role"])
mock_delta.content = delta_content
mock_delta.role = delta_role

mock_choice = Mock()
mock_choice.finish_reason = finish_reason
mock_choice.logprobs = None
mock_choice.delta = mock_delta

mock_completion = Mock()
mock_completion.model = "test-model"
mock_completion.choices = [mock_choice]
mock_completion.usage = usage

return mock_completion

# Mock the router to simulate streaming with finish_reason
with patch.object(model, "_router") as mock_router:
# Create mock completions - finish_reason typically only in last chunk
mock_chunk1 = _build_mock_completion(delta_content="Hello")
mock_chunk2 = _build_mock_completion(delta_content=" world")
mock_chunk_final = _build_mock_completion(
delta_content="!",
finish_reason="stop",
usage=Mock(prompt_tokens=5, completion_tokens=3),
)

# The model awaits router.acompletion, so return a coroutine that
# resolves to an async iterator over the mocked stream chunks
async def mock_stream(): # noqa: RUF029
async def mock_stream_iter(): # noqa: RUF029
yield mock_chunk1
yield mock_chunk2
yield mock_chunk_final

return mock_stream_iter()

mock_router.acompletion.return_value = mock_stream()

# Test streaming
async_iterable = await model.acompletion_iter(messages)
results = [result async for result in async_iterable]

# Verify finish_reason is captured
assert len(results) == 1
result = results[0]
assert result.finish_reason == "stop"
assert result.text == "Hello world!"


@pytest.mark.asyncio
@pytest.mark.parametrize(
("model_name", "expected_tool_role_count"),