Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 19 additions & 11 deletions pydantic_ai_slim/pydantic_ai/toolsets/_tool_search.py
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Docstring says "matches any term" but code requires ALL terms to match

The docstring at line 161 states the search "matches any term against tool names and descriptions", implying OR semantics. However, the new implementation at line 174 uses if score == len(terms), which requires ALL search terms to be present in the entry's search_terms set (AND semantics). The old code used any(term in searchable for term in terms) which did match the "any term" description. This is a stale docstring that now contradicts the actual behavior, violating the coding guideline rule:198 ("Rename methods/functions when their behavior changes — names must reflect actual scope") from agent_docs/index.md.

(Refers to line 161)

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from __future__ import annotations

import re
from dataclasses import dataclass
from typing import Annotated, Any

Expand All @@ -18,6 +19,7 @@
_DISCOVERED_TOOLS_METADATA_KEY = 'discovered_tools'

_MAX_SEARCH_RESULTS = 10
_SEARCH_TOKEN_RE = re.compile(r'[a-z0-9]+')


class _SearchToolArgs(TypedDict):
Expand All @@ -42,9 +44,8 @@ class _SearchToolArgs(TypedDict):
@dataclass(kw_only=True)
class _SearchIndexEntry:
name: str
name_lower: str
description: str | None
description_lower: str | None
search_terms: set[str]


@dataclass(kw_only=True)
Expand Down Expand Up @@ -90,9 +91,8 @@ async def get_tools(self, ctx: RunContext[AgentDepsT]) -> dict[str, ToolsetTool[
search_index = [
_SearchIndexEntry(
name=name,
name_lower=name.lower(),
description=tool.tool_def.description,
description_lower=tool.tool_def.description.lower() if tool.tool_def.description else None,
search_terms=self._search_terms(name, tool.tool_def.description),
)
for name, tool in deferred.items()
if name not in discovered
Expand Down Expand Up @@ -148,6 +148,13 @@ async def call_tool(
return await self._search_tools(tool_args, tool)
return await self.wrapped.call_tool(name, tool_args, ctx, tool)

@staticmethod
def _search_terms(name: str, description: str | None) -> set[str]:
    """Tokenize *name* and an optional *description* into lowercase search terms.

    Tokens are maximal runs of `[a-z0-9]` characters, so punctuation and
    underscores act as separators. An empty or ``None`` description
    contributes nothing.
    """
    sources = [name]
    if description:
        sources.append(description)
    return {token for text in sources for token in _SEARCH_TOKEN_RE.findall(text.lower())}

async def _search_tools(self, tool_args: dict[str, Any], search_tool: _SearchTool[AgentDepsT]) -> ToolReturn:
"""Search for tools matching the keywords.

Expand All @@ -159,15 +166,16 @@ async def _search_tools(self, tool_args: dict[str, Any], search_tool: _SearchToo
if not keywords:
raise ModelRetry('Please provide search keywords.')

terms = keywords.lower().split()
terms = self._search_terms(keywords, None)

matches: list[dict[str, str | None]] = []
scored_matches: list[tuple[int, dict[str, str | None]]] = []
for entry in search_tool.search_index:
searchable = entry.name_lower + (' ' + entry.description_lower if entry.description_lower else '')
if any(term in searchable for term in terms):
matches.append({'name': entry.name, 'description': entry.description})
if len(matches) >= _MAX_SEARCH_RESULTS:
break
score = len(terms & entry.search_terms)
if score == len(terms):
Comment on lines +169 to +174
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🔴 Non-alphanumeric keywords produce empty token set, matching ALL tools

When the model sends keywords containing only non-alphanumeric characters (e.g., "---", "!!!", "...", or whitespace-only strings like " "), _search_terms returns an empty set because _SEARCH_TOKEN_RE ([a-z0-9]+) finds no matches. The guard at line 166 (if not keywords) only checks for empty/falsy strings, so these inputs pass through. Then at line 174, len(terms & entry.search_terms) is 0 and len(terms) is also 0, so the condition score == len(terms) evaluates to 0 == 0, which is True for every entry in the search index, causing all deferred tools to be returned. The old code was immune to this because "---".split() produces ["---"], which wouldn't substring-match any tool. The fix is to check for an empty terms set after tokenization and raise ModelRetry.

Suggested change
terms = self._search_terms(keywords, None)
matches: list[dict[str, str | None]] = []
scored_matches: list[tuple[int, dict[str, str | None]]] = []
for entry in search_tool.search_index:
searchable = entry.name_lower + (' ' + entry.description_lower if entry.description_lower else '')
if any(term in searchable for term in terms):
matches.append({'name': entry.name, 'description': entry.description})
if len(matches) >= _MAX_SEARCH_RESULTS:
break
score = len(terms & entry.search_terms)
if score == len(terms):
terms = self._search_terms(keywords, None)
if not terms:
raise ModelRetry('Please provide search keywords.')
scored_matches: list[tuple[int, dict[str, str | None]]] = []
for entry in search_tool.search_index:
score = len(terms & entry.search_terms)
if score == len(terms):
Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

scored_matches.append((score, {'name': entry.name, 'description': entry.description}))

scored_matches.sort(key=lambda item: item[0], reverse=True)
matches = [match for _, match in scored_matches[:_MAX_SEARCH_RESULTS]]
Comment on lines +177 to +178
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Sorting by score is a no-op — all matched entries have identical scores

At _tool_search.py:177, scored_matches is sorted by score in descending order. However, the filter condition at line 174 is if score == len(terms), which means every entry that passes the filter has exactly the same score (len(terms)). The sort therefore never reorders anything. This appears to be scaffolding for a future scoring model (e.g., partial matches or TF-IDF weighting), but as written it's dead code that adds slight overhead and could mislead readers into thinking ranking is actually happening. The test test_tool_search_toolset_prefers_specific_term_matches has a name suggesting ranking behavior, but it actually tests that AND-filtering excludes tools missing a term — not that results are ranked.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Comment on lines +169 to +178
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 Behavioral change from OR to AND matching semantics may affect recorded VCR cassettes

The old search used any(term in searchable for term in terms) (OR — match if ANY keyword appears as a substring), while the new code uses score == len(terms) (AND — ALL keywords must appear as exact tokens). This is a significant semantic change. For example, a search for 'stock weather' would previously match both stock_price (has 'stock') and get_weather (has 'weather'), but now matches neither because no single tool has both tokens. This is likely intentional (the new tests confirm it), but it means the VCR cassettes for integration tests in tests/test_tool_search.py (lines 257-311) may need re-recording if any model sends multi-keyword queries where only a subset of terms match a given tool. The existing snapshot expectations at lines 258-310 should be verified against the cassettes.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.


tool_names = [match['name'] for match in matches]

Expand Down
56 changes: 56 additions & 0 deletions tests/test_tool_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,6 +485,62 @@ async def test_tool_search_toolset_search_matches_description():
assert rv[0]['name'] == 'crypto_price'


async def test_tool_search_toolset_prefers_specific_term_matches():
    """A multi-token query must match ALL tokens: 'github profile' surfaces only github_get_me."""
    inner: FunctionToolset[None] = FunctionToolset()

    @inner.tool_plain(defer_loading=True)
    def github_get_me() -> str:  # pragma: no cover
        """Get the authenticated GitHub profile."""
        return 'me'

    @inner.tool_plain(defer_loading=True)
    def github_create_gist() -> str:  # pragma: no cover
        """Create a new GitHub gist."""
        return 'gist'

    search_toolset = ToolSearchToolset(wrapped=inner)
    run_ctx = _build_run_context(None)

    all_tools = await search_toolset.get_tools(run_ctx)
    searcher = all_tools[_SEARCH_TOOLS_NAME]

    result = await search_toolset.call_tool(_SEARCH_TOOLS_NAME, {'keywords': 'github profile'}, run_ctx, searcher)
    assert result == snapshot(
        ToolReturn(
            return_value=[{'name': 'github_get_me', 'description': 'Get the authenticated GitHub profile.'}],
            metadata={'discovered_tools': ['github_get_me']},
        )
    )


async def test_tool_search_toolset_does_not_match_substrings_inside_words():
    """Tokens match whole words only: 'me' must not hit the 'me' inside 'comment'."""
    inner: FunctionToolset[None] = FunctionToolset()

    @inner.tool_plain(defer_loading=True)
    def github_get_me() -> str:  # pragma: no cover
        """Get my GitHub profile."""
        return 'me'

    @inner.tool_plain(defer_loading=True)
    def github_add_comment_to_pending_review() -> str:  # pragma: no cover
        """Add a pending review comment on GitHub."""
        return 'comment'

    search_toolset = ToolSearchToolset(wrapped=inner)
    run_ctx = _build_run_context(None)

    all_tools = await search_toolset.get_tools(run_ctx)
    searcher = all_tools[_SEARCH_TOOLS_NAME]

    result = await search_toolset.call_tool(_SEARCH_TOOLS_NAME, {'keywords': 'get me'}, run_ctx, searcher)
    assert result == snapshot(
        ToolReturn(
            return_value=[{'name': 'github_get_me', 'description': 'Get my GitHub profile.'}],
            metadata={'discovered_tools': ['github_get_me']},
        )
    )


async def test_tool_search_toolset_search_returns_no_matches():
"""Test that search returns empty list when no matches."""
toolset = _create_function_toolset()
Expand Down
Loading