Skip to content

Commit f81bb1e

Browse files
Adityav369 and claude authored
Add field projection to document listing (#408)
Add an optional `fields` argument to list_documents() (sync, async, folder, and user scopes) so callers fetch only the document fields they need. The projection is pushed into the SQL SELECT, so listing metadata never reads the full document text stored in system_metadata.content. - external_id and content_type are always included so projected responses parse into Document objects; metadata_types is included when a metadata field is requested so typed values (datetime/date/decimal) are reconstructed instead of returned as strings. - Nested fields supported (e.g. metadata.client). - DB-level column projection via labeled SELECT in list_documents_flexible. - Bump SDK to 1.2.3; add unit tests for the SDK and server projection paths. Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
1 parent 03a93e8 commit f81bb1e

12 files changed

Lines changed: 301 additions & 14 deletions

File tree

core/database/postgres_database.py

Lines changed: 122 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,42 @@
3030
"summary_updated_at",
3131
}
3232

33+
# Maps each public Document field name to the table column that backs it. A projection
# SELECTs only the columns it needs, so listing metadata never reads the heavy
# `system_metadata.content` (the full document text).
DOCUMENT_PROJECTION_COLUMN_MAP = {
    "external_id": DocumentModel.external_id,
    "content_type": DocumentModel.content_type,
    "filename": DocumentModel.filename,
    "metadata": DocumentModel.doc_metadata,
    "metadata_types": DocumentModel.metadata_types,
    "storage_info": DocumentModel.storage_info,
    "system_metadata": DocumentModel.system_metadata,
    "additional_metadata": DocumentModel.additional_metadata,
    "chunk_ids": DocumentModel.chunk_ids,
    "folder_name": DocumentModel.folder_name,
    "folder_path": DocumentModel.folder_path,
    "folder_id": DocumentModel.folder_id,
    "app_id": DocumentModel.app_id,
    "end_user_id": DocumentModel.end_user_id,
}

# Stable SELECT order for projected columns. Derived from the map (dicts preserve
# insertion order) instead of being repeated by hand, so a field added to the map can
# never be silently dropped from projections because the two lists drifted apart.
DOCUMENT_PROJECTION_ORDER = list(DOCUMENT_PROJECTION_COLUMN_MAP)
68+
3369

3470
class PostgresDatabase:
3571
"""PostgreSQL implementation for document metadata storage."""
@@ -403,10 +439,16 @@ async def list_documents_flexible(
403439
include_status_counts: bool = False,
404440
include_folder_counts: bool = False,
405441
return_documents: bool = True,
442+
fields: Optional[List[str]] = None,
406443
sort_by: Optional[str] = None,
407444
sort_direction: str = "desc",
408445
) -> Dict[str, Any]:
409-
"""List documents with optional aggregate metadata. Field projection is handled at application layer."""
446+
"""List documents with optional aggregate metadata and projected document fields.
447+
448+
When ``fields`` is provided, only the underlying columns required to serve those
449+
fields are selected from Postgres, so listing metadata never reads the full
450+
document text stored in ``system_metadata.content``.
451+
"""
410452
limit = max(limit, 0) if limit is not None else None
411453
skip = max(skip, 0)
412454

@@ -440,16 +482,24 @@ async def list_documents_flexible(
440482

441483
final_where_clause = " AND ".join(where_clauses) if where_clauses else "TRUE"
442484

443-
documents: List[Document] = []
485+
documents: List[Any] = []
444486
returned_count = 0
445487
has_more = False
446488

447489
fetch_documents = return_documents and (limit is None or limit > 0)
448490

449491
if fetch_documents:
450-
# Note: We always select all columns from the database
451-
# Field projection is handled at the application layer for simplicity
452-
base_query = select(DocumentModel).where(text(final_where_clause).bindparams(**filter_params))
492+
projection_fields = self._resolve_document_projection_fields(fields)
493+
if projection_fields:
494+
# Select only the columns the projection needs. The heavy system_metadata column
495+
# (full document text) is read only when a requested field is derived from it.
496+
selected_columns = self._document_projection_columns(projection_fields)
497+
base_query = select(*selected_columns).where(
498+
text(final_where_clause).bindparams(**filter_params)
499+
)
500+
else:
501+
base_query = select(DocumentModel).where(text(final_where_clause).bindparams(**filter_params))
502+
453503
order_clause = self._resolve_document_sort_clause(sort_by, sort_direction)
454504
if order_clause is not None:
455505
base_query = base_query.order_by(order_clause, DocumentModel.external_id.asc())
@@ -462,13 +512,20 @@ async def list_documents_flexible(
462512
base_query = base_query.limit(fetch_limit)
463513

464514
result = await session.execute(base_query)
465-
doc_models = result.scalars().all()
466-
467-
if fetch_limit is not None and len(doc_models) > limit:
468-
has_more = True
469-
doc_models = doc_models[:limit]
470515

471-
documents = [Document(**_document_model_to_dict(doc_model)) for doc_model in doc_models]
516+
if projection_fields:
517+
documents = [
518+
self._document_projection_row_to_dict(row, projection_fields) for row in result.mappings()
519+
]
520+
if fetch_limit is not None and len(documents) > limit:
521+
has_more = True
522+
documents = documents[:limit]
523+
else:
524+
doc_models = result.scalars().all()
525+
if fetch_limit is not None and len(doc_models) > limit:
526+
has_more = True
527+
doc_models = doc_models[:limit]
528+
documents = [Document(**_document_model_to_dict(doc_model)) for doc_model in doc_models]
472529
returned_count = len(documents)
473530

474531
total_count: Optional[int] = None
@@ -568,6 +625,60 @@ def _resolve_document_sort_clause(self, sort_by: Optional[str], sort_direction:
568625
f"{direction} NULLS LAST"
569626
)
570627

628+
@staticmethod
def _resolve_document_projection_fields(fields: Optional[List[str]]) -> Optional[set]:
    """Translate requested API field names into the projection keys that must be selected.

    Only the root of a dotted name matters here (``"metadata.client"`` resolves to
    ``"metadata"``). Names are stripped of surrounding whitespace; empty or blank
    entries are skipped, and roots that match nothing below are ignored.

    Returns:
        ``None`` when no usable field name was supplied (no projection). Otherwise a
        set of keys that always contains ``"external_id"``, plus:

        * the root itself when it is a key of ``DOCUMENT_PROJECTION_COLUMN_MAP``;
        * ``"system_metadata"`` for any root in ``SUMMARY_METADATA_KEYS``;
        * ``"system_metadata"`` and ``"chunk_ids"`` for ``"page_count"``.

    Note: resolving to ``"system_metadata"`` means that column is read, and it also
    holds the full document text. Plain ``metadata`` has its own column.
    """
    roots = set()
    for raw_name in fields or ():
        name = raw_name.strip() if raw_name else ""
        if name:
            roots.add(name.partition(".")[0])

    if not roots:
        return None

    # external_id is always selected so every projected row can be identified.
    columns = {"external_id"}
    for root in roots:
        if root in DOCUMENT_PROJECTION_COLUMN_MAP:
            columns.add(root)
        elif root in SUMMARY_METADATA_KEYS:
            # summary_* values live inside system_metadata.
            columns.add("system_metadata")
        elif root == "page_count":
            columns.update(("system_metadata", "chunk_ids"))

    return columns
656+
657+
@staticmethod
def _document_projection_columns(fields: set):
    """Build the labeled SQLAlchemy columns for a projection, in a stable order.

    Walks ``DOCUMENT_PROJECTION_ORDER`` and keeps the entries present in ``fields``.
    Each column is labeled with its public field name, so result rows are keyed by
    that name rather than by the underlying column name.
    """
    ordered_names = (name for name in DOCUMENT_PROJECTION_ORDER if name in fields)
    return [DOCUMENT_PROJECTION_COLUMN_MAP[name].label(name) for name in ordered_names]
663+
664+
@staticmethod
def _document_projection_row_to_dict(row: Any, fields: set) -> Dict[str, Any]:
    """Shape one projected row (a mapping of label -> value) into a public document dict.

    * NULL values in the JSON-like columns that were selected become ``{}``, and a
      NULL ``chunk_ids`` becomes ``[]``; columns absent from the row are not added.
    * When ``"system_metadata"`` is part of ``fields`` and its value is a dict, every
      key in ``SUMMARY_METADATA_KEYS`` is copied to the top level (``None`` if absent).

    NOTE(review): ``page_count`` is not derived in this helper; confirm a later layer
    computes it from ``system_metadata`` / ``chunk_ids`` when it is requested.
    """
    empty_factories = {
        "metadata": dict,
        "metadata_types": dict,
        "storage_info": dict,
        "system_metadata": dict,
        "additional_metadata": dict,
        "chunk_ids": list,
    }

    shaped = dict(row)
    for column, make_empty in empty_factories.items():
        if column in shaped and shaped[column] is None:
            shaped[column] = make_empty()

    if "system_metadata" not in fields:
        return shaped

    sys_meta = shaped.get("system_metadata") or {}
    if isinstance(sys_meta, dict):
        shaped.update({summary_key: sys_meta.get(summary_key) for summary_key in SUMMARY_METADATA_KEYS})

    return shaped
681+
571682
async def update_document(
572683
self,
573684
document_id: str,

core/routes/documents.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ async def list_docs(
102102
include_status_counts=request.include_status_counts,
103103
include_folder_counts=request.include_folder_counts,
104104
return_documents=request.return_documents,
105+
fields=request.fields,
105106
sort_by=request.sort_by,
106107
sort_direction=request.sort_direction,
107108
)
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Unit tests for document field projection (list_docs `fields`)."""
2+
3+
from sqlalchemy import select
4+
5+
from core.database.postgres_database import PostgresDatabase
6+
from core.routes.utils import project_document_fields
7+
8+
9+
class TestResolveProjectionFields:
    """Column resolution done by ``PostgresDatabase._resolve_document_projection_fields``."""

    def test_no_fields_returns_none(self):
        # None, an empty list, and whitespace-only names all mean "no projection".
        for empty_request in (None, [], [" "]):
            assert PostgresDatabase._resolve_document_projection_fields(empty_request) is None

    def test_always_includes_external_id(self):
        resolved = PostgresDatabase._resolve_document_projection_fields(["metadata"])
        assert resolved == {"external_id", "metadata"}

    def test_nested_field_resolves_to_root_column(self):
        # A dotted path such as "metadata.client" needs only its root column.
        resolved = PostgresDatabase._resolve_document_projection_fields(["metadata.client"])
        assert resolved == {"external_id", "metadata"}

    def test_summary_key_requires_system_metadata(self):
        resolved = PostgresDatabase._resolve_document_projection_fields(["summary_storage_key"])
        assert resolved == {"external_id", "system_metadata"}

    def test_page_count_requires_system_metadata_and_chunk_ids(self):
        resolved = PostgresDatabase._resolve_document_projection_fields(["page_count"])
        assert resolved == {"external_id", "system_metadata", "chunk_ids"}
42+
43+
44+
class TestProjectionColumns:
    """Verify the generated SELECT contains only the projected columns."""

    def test_metadata_projection_does_not_read_content(self):
        resolved = PostgresDatabase._resolve_document_projection_fields(["metadata"])
        labeled_columns = PostgresDatabase._document_projection_columns(resolved)
        rendered = str(select(*labeled_columns))

        for expected in ("documents.external_id", "documents.doc_metadata"):
            assert expected in rendered
        # system_metadata carries the full document text, so it must stay out of the SELECT.
        assert "system_metadata" not in rendered
54+
55+
56+
class TestProjectionRowToDict:
    """Row shaping done by ``PostgresDatabase._document_projection_row_to_dict``."""

    def test_none_jsonb_normalized(self):
        projected = {"external_id", "metadata", "chunk_ids"}
        raw_row = {"external_id": "doc-1", "metadata": None, "chunk_ids": None}

        shaped = PostgresDatabase._document_projection_row_to_dict(raw_row, projected)

        assert shaped["metadata"] == {}
        assert shaped["chunk_ids"] == []

    def test_summary_keys_derived_when_system_metadata_present(self):
        raw_row = {"external_id": "doc-1", "system_metadata": {"summary_storage_key": "s3://x"}}

        shaped = PostgresDatabase._document_projection_row_to_dict(raw_row, {"external_id", "system_metadata"})

        assert shaped["summary_storage_key"] == "s3://x"
69+
70+
71+
class TestProjectDocumentFields:
    """Application-layer shaping done by ``core.routes.utils.project_document_fields``."""

    def test_projects_requested_fields_only(self):
        source = {
            "external_id": "d1",
            "content_type": "text/plain",
            "metadata": {"a": 1},
            "system_metadata": {"big": "x"},
        }

        projected = project_document_fields(source, ["metadata"])

        assert set(projected) == {"external_id", "metadata"}
        assert projected["metadata"] == {"a": 1}

    def test_nested_projection(self):
        source = {
            "external_id": "d1",
            "metadata": {"client": "ExampleCo", "doc_type": "invoice", "secret": "z"},
        }

        projected = project_document_fields(source, ["metadata.client", "metadata.doc_type"])

        # Only the requested nested keys survive; "secret" is dropped.
        assert projected["metadata"] == {"client": "ExampleCo", "doc_type": "invoice"}

    def test_no_fields_returns_all(self):
        source = {"external_id": "d1", "metadata": {"a": 1}}

        projected = project_document_fields(source, None)

        assert projected["metadata"] == {"a": 1}
        assert projected["external_id"] == "d1"

sdks/python/CHANGELOG.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [1.2.3] - 2026-06-18
11+
12+
### Added
13+
- `list_documents(fields=[...])` on sync, async, folder, and user-scoped clients: request only
14+
the document fields you need (e.g. `["metadata"]`). The server reads and returns only those
15+
columns, so listing metadata never downloads the full document text. `external_id` and
16+
`content_type` are always included; `metadata_types` is included automatically when a metadata
17+
field is requested so typed values (datetime/date/decimal) are reconstructed rather than
18+
returned as raw strings. Nested fields are supported (e.g. `["metadata.client"]`).
19+
1020
## [1.2.2] - 2026-02-09
1121

1222
### Added

sdks/python/README.md

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,11 @@ renamed = moved.rename("specs-v2")
9393
# Scope queries to a path and include descendants with folder_depth=-1
9494
chunks = folder.retrieve_chunks(query="design notes", folder_depth=-1)
9595
docs = db.list_documents(folder_name="/projects/alpha", folder_depth=-1)
96+
97+
# List only the fields you need. The server reads and returns just those columns, so
98+
# the full document text is never downloaded — fast for large corpora.
99+
for doc in db.list_documents(fields=["metadata"]).documents:
100+
print(doc.external_id, doc.metadata)
96101
```
97102

98103
`Folder.full_path` is exposed on folder objects, and `Document.folder_path` mirrors server responses for tracing scope.

sdks/python/morphik/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,4 @@
1414
"DocumentQueryResponse",
1515
]
1616

17-
__version__ = "1.2.2"
17+
__version__ = "1.2.3"

sdks/python/morphik/_internal.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,7 @@ def _prepare_list_documents_request(
428428
completed_only: bool,
429429
sort_by: Optional[str],
430430
sort_direction: str,
431+
fields: Optional[List[str]] = None,
431432
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
432433
"""Prepare request for list_docs endpoint"""
433434
params = {}
@@ -450,6 +451,15 @@ def _prepare_list_documents_request(
450451
"sort_by": sort_by,
451452
"sort_direction": sort_direction,
452453
}
454+
if fields:
455+
# Always include the fields required to reconstruct a Document client-side, so
456+
# projected responses still parse into Document objects. When any metadata field
457+
# is requested, also pull metadata_types so typed values (datetime/date/decimal)
458+
# are reconstructed instead of returned as raw strings.
459+
projected = ["external_id", "content_type", *fields]
460+
if any(field.split(".", 1)[0] == "metadata" for field in fields):
461+
projected.append("metadata_types")
462+
data["fields"] = list(dict.fromkeys(projected))
453463
return params, data
454464

455465
def _prepare_batch_get_documents_request(

sdks/python/morphik/_scoped_ops.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ def _scoped_list_documents(
277277
completed_only: bool,
278278
sort_by: Optional[str],
279279
sort_direction: str,
280+
fields: Optional[List[str]] = None,
280281
):
281282
params, data = self._logic._prepare_list_documents_request(
282283
skip,
@@ -291,6 +292,7 @@ def _scoped_list_documents(
291292
completed_only,
292293
sort_by,
293294
sort_direction,
295+
fields,
294296
)
295297

296298
return self._execute_scoped_operation(

0 commit comments

Comments
 (0)