MODSetter · MODSetter · Jun 20, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py b/surfsense_backend/alembic/versions/166_add_chunk_char_spans.py
@@ -0,0 +1,31 @@
+"""add chunks.start_char/end_char for citation offsets
+
+Char offsets into the document's source_markdown (half-open span) let citations
+resolve the exact passage a chunk came from. Nullable because historical rows
+have no span; they populate on the next connector sync or user edit/reindex.
+
+No backfill: a bulk UPDATE of every chunk on a large HNSW-indexed table rewrites
+every secondary index per row (see migration 165 for the same reasoning).
+
+Revision ID: 166
+Revises: 165
+"""
+
+from collections.abc import Sequence
+
+from alembic import op
+
+revision: str = "166"
+down_revision: str | None = "165"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+
+def upgrade() -> None:  # Add the two nullable INTEGER span columns; IF NOT EXISTS skips a column that is already present.
+    op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS start_char INTEGER;")
+    op.execute("ALTER TABLE chunks ADD COLUMN IF NOT EXISTS end_char INTEGER;")
+
+
+def downgrade() -> None:  # Drop both span columns (stored offsets are lost); IF EXISTS tolerates a column that is already gone.
+    op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS end_char;")
+    op.execute("ALTER TABLE chunks DROP COLUMN IF EXISTS start_char;")
diff --git a/...ckend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py b/...ckend/app/agents/chat/multi_agent_chat/main_agent/middleware/kb_persistence/middleware.py
@@ -18,7 +18,6 @@
 
 from __future__ import annotations
 
-import asyncio
 import logging
 from datetime import UTC, datetime
 from typing import Any
@@ -58,9 +57,8 @@
     FolderRevision,
     shielded_async_session,
 )
-from app.indexing_pipeline.document_chunker import chunk_text
+from app.indexing_pipeline.cache.cached_indexing import build_chunk_embeddings
 from app.utils.document_converters import (
-    embed_texts,
     generate_content_hash,
     generate_unique_identifier_hash,
 )
@@ -234,24 +232,23 @@ async def _create_document(
     session.add(doc)
     await session.flush()
 
-    summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
+    summary_embedding, chunk_embeddings = await build_chunk_embeddings(
+        content, use_code_chunker=False
+    )
     doc.embedding = summary_embedding
-    chunks = chunk_text(content)
-    if chunks:
-        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
-        session.add_all(
-            [
-                Chunk(
-                    document_id=doc.id,
-                    content=text,
-                    embedding=embedding,
-                    position=i,
-                )
-                for i, (text, embedding) in enumerate(
-                    zip(chunks, chunk_embeddings, strict=True)
-                )
-            ]
-        )
+    session.add_all(
+        [
+            Chunk(
+                document_id=doc.id,
+                content=sl.text,
+                embedding=embedding,
+                position=i,
+                start_char=sl.start_char,
+                end_char=sl.end_char,
+            )
+            for i, (sl, embedding) in enumerate(chunk_embeddings)
+        ]
+    )
     return doc
 
 
@@ -287,26 +284,25 @@ async def _update_document(
         search_space_id,
     )
 
-    summary_embedding = (await asyncio.to_thread(embed_texts, [content]))[0]
+    summary_embedding, chunk_embeddings = await build_chunk_embeddings(
+        content, use_code_chunker=False
+    )
     document.embedding = summary_embedding
 
     await session.execute(delete(Chunk).where(Chunk.document_id == document.id))
-    chunks = chunk_text(content)
-    if chunks:
-        chunk_embeddings = await asyncio.to_thread(embed_texts, chunks)
-        session.add_all(
-            [
-                Chunk(
-                    document_id=document.id,
-                    content=text,
-                    embedding=embedding,
-                    position=i,
-                )
-                for i, (text, embedding) in enumerate(
-                    zip(chunks, chunk_embeddings, strict=True)
-                )
-            ]
-        )
+    session.add_all(
+        [
+            Chunk(
+                document_id=document.id,
+                content=sl.text,
+                embedding=embedding,
+                position=i,
+                start_char=sl.start_char,
+                end_char=sl.end_char,
+            )
+            for i, (sl, embedding) in enumerate(chunk_embeddings)
+        ]
+    )
     return document
 
 

diff --git a/...p/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md b/...p/agents/chat/multi_agent_chat/main_agent/system_prompt/prompts/citations/on.md
@@ -1,42 +1,58 @@
 <citations>
-Citations reach the answer through two channels. Use whichever applies — and
-never invent ids you didn't see. Citation ids are resolved by exact-match
-lookup; a wrong id silently breaks the link, so when in doubt, omit.
+Citations reach the answer through three channels. Use whichever applies, and
+never invent ids you didn't see: ids are matched exactly, so a wrong one
+silently breaks the link — when in doubt, omit. Always write a citation as
+plain `[citation:…]` brackets — no markdown links, no footnote numbers, no
+parentheses.
 
-### Channel A — chunk blocks injected this turn
+### Channel A — web_search chunk blocks injected this turn
 When `web_search` returns `<document>` / `<chunk id='…'>` blocks in this
-turn:
+turn, the chunk `id` is the result's URL:
 
-1. For each factual statement taken from those chunks, add
-   `[citation:chunk_id]` using the **exact** id from a visible
-   `<chunk id='…'>` tag. Copy digit-for-digit (or the URL verbatim);
-   do not retype from memory.
-2. `<document_id>` is the parent doc id, **not** a citation source —
-   only ids inside `<chunk id='…'>` count.
-3. Multiple chunks → `[citation:id1], [citation:id2]` (comma-separated,
+1. For each factual statement taken from a chunk, add `[citation:<url>]`
+   using the **exact** id from a visible `<chunk id='…'>` tag. Copy the
+   URL verbatim; do not retype it from memory.
+2. Multiple chunks → `[citation:url1], [citation:url2]` (comma-separated,
    each id copied individually).
-4. Never invent, normalise, or guess at adjacent ids; if unsure, omit.
-5. Plain brackets only — no markdown links, no footnote numbering.
+3. Never invent, normalise, or guess at a URL; if unsure, omit.
 
 ### Channel B — citations relayed by a `task` specialist
-A `task(...)` tool message may contain `[citation:<chunk_id>]` markers
-the specialist already attached to its prose. The specialist saw the
-underlying `<chunk id='…'>` blocks; you didn't. So:
+A `task(...)` tool message may contain `[citation:…]` markers the
+specialist already attached to its prose — line citations
+(`[citation:d<id>#L<a>-<b>]`) or chunk ids (`[citation:N]`). The
+specialist read the underlying document and tied each marker to a
+passage; you didn't. So:
 
 1. **Preserve those markers verbatim** in your final answer — do not
    reformat, renumber, drop, or wrap them in markdown links. When you
    paraphrase a specialist sentence, copy the marker character-for-
-   character; do not regenerate the id from memory (LLMs reliably
-   corrupt nearby digits).
+   character; do not regenerate it from memory (LLMs reliably corrupt
+   nearby digits).
 2. Keep each marker attached to the sentence the specialist attached
    it to.
 3. Do **not** add new `[citation:…]` markers of your own to a
    specialist's prose; if a fact has no marker, the specialist
-   couldn't tie it to a chunk and neither can you.
+   couldn't tie it to a source and neither can you.
 4. When a specialist returns JSON, the citation markers live inside
    the prose-bearing fields (e.g. a summary or excerpt). Pull them
    along with the surrounding sentence when you quote.
 
-If neither channel surfaces citation markers this turn, do not fabricate
-them.
+### Channel C — your knowledge base (search hits and `read_file`)
+Knowledge-base facts are cited by line range using the document id:
+`[citation:d<document_id>#L<start>-<end>]` (a single line is `#L<n>-<n>`).
+
+1. `search_knowledge_base` prints a ready `[citation:d…#L…-…]` token above each
+   matched passage. When that passage supports your point, copy the token
+   verbatim — that is the entire citation.
+2. When you `read_file` a `/documents/...` path, its header gives the
+   `<document_id>` and an optional `<matched_lines>` pointer, and the body is
+   shown with line numbers; cite the lines you actually used. Use `read_file`
+   when you need more context than a search passage shows.
+3. Copy document ids and line numbers exactly as shown — never estimate,
+   shift, or invent them.
+4. Older documents without a numbered body instead show `<chunk id='N'>`
+   blocks; cite those with `[citation:N]`, copying the id exactly.
+
+If none of these channels surfaces a citable source this turn, do not
+fabricate citations.
 </citations>