Skip to content

Commit 08a3780

Browse files
Fix TextChunker orphan chunk gluing to use token count instead of word count
In ProcessParagraphs, the orphan chunk merging logic split each paragraph on spaces to get word counts and compared those against the token limit. As a result, merged chunks could exceed the target token count whenever the token counter produced different counts than naive word splitting. Use GetTokenCount() consistently, matching how the rest of the method already measures chunk sizes. Fixes #13713
1 parent 5f282a9 commit 08a3780

File tree

1 file changed

+4
-10
lines changed

1 file changed

+4
-10
lines changed

dotnet/src/SemanticKernel.Core/Text/TextChunker.cs

Lines changed: 4 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -192,18 +192,12 @@ private static List<string> ProcessParagraphs(List<string> paragraphs, int adjus
192192

193193
if (GetTokenCount(lastParagraph, tokenCounter) < adjustedMaxTokensPerParagraph / 4)
194194
{
195-
var lastParagraphTokens = lastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
196-
var secondLastParagraphTokens = secondLastParagraph.Split(s_spaceChar, StringSplitOptions.RemoveEmptyEntries);
195+
var lastParagraphTokenCount = GetTokenCount(lastParagraph, tokenCounter);
196+
var secondLastParagraphTokenCount = GetTokenCount(secondLastParagraph, tokenCounter);
197197

198-
var lastParagraphTokensCount = lastParagraphTokens.Length;
199-
var secondLastParagraphTokensCount = secondLastParagraphTokens.Length;
200-
201-
if (lastParagraphTokensCount + secondLastParagraphTokensCount <= adjustedMaxTokensPerParagraph)
198+
if (lastParagraphTokenCount + secondLastParagraphTokenCount <= adjustedMaxTokensPerParagraph)
202199
{
203-
var newSecondLastParagraph = string.Join(" ", secondLastParagraphTokens);
204-
var newLastParagraph = string.Join(" ", lastParagraphTokens);
205-
206-
paragraphs[paragraphs.Count - 2] = $"{newSecondLastParagraph} {newLastParagraph}";
200+
paragraphs[paragraphs.Count - 2] = $"{secondLastParagraph} {lastParagraph}";
207201
paragraphs.RemoveAt(paragraphs.Count - 1);
208202
}
209203
}

0 commit comments

Comments
 (0)