3030 "summary_updated_at" ,
3131}
3232
# Maps each projectable public Document field name to its DocumentModel column.
# Used to SELECT only the columns a projection needs. `system_metadata` is itself
# projectable, so the heavy `system_metadata.content` (the full document text) is
# only skipped when the projection does not include that column.
DOCUMENT_PROJECTION_COLUMN_MAP = {
    "external_id": DocumentModel.external_id,
    "content_type": DocumentModel.content_type,
    "filename": DocumentModel.filename,
    "metadata": DocumentModel.doc_metadata,
    "metadata_types": DocumentModel.metadata_types,
    "storage_info": DocumentModel.storage_info,
    "system_metadata": DocumentModel.system_metadata,
    "additional_metadata": DocumentModel.additional_metadata,
    "chunk_ids": DocumentModel.chunk_ids,
    "folder_name": DocumentModel.folder_name,
    "folder_path": DocumentModel.folder_path,
    "folder_id": DocumentModel.folder_id,
    "app_id": DocumentModel.app_id,
    "end_user_id": DocumentModel.end_user_id,
}

# Stable column order for projected SELECTs. Derived from the map (dicts preserve
# insertion order) so a field added to the map can never be missing from the order
# and silently dropped from the generated column list.
DOCUMENT_PROJECTION_ORDER = list(DOCUMENT_PROJECTION_COLUMN_MAP)
68+
3369
3470class PostgresDatabase :
3571 """PostgreSQL implementation for document metadata storage."""
@@ -403,10 +439,16 @@ async def list_documents_flexible(
403439 include_status_counts : bool = False ,
404440 include_folder_counts : bool = False ,
405441 return_documents : bool = True ,
442+ fields : Optional [List [str ]] = None ,
406443 sort_by : Optional [str ] = None ,
407444 sort_direction : str = "desc" ,
408445 ) -> Dict [str , Any ]:
409- """List documents with optional aggregate metadata. Field projection is handled at application layer."""
446+ """List documents with optional aggregate metadata and projected document fields.
447+
448+ When ``fields`` is provided, only the underlying columns required to serve those
449+ fields are selected from Postgres, so listing metadata never reads the full
450+ document text stored in ``system_metadata.content``.
451+ """
410452 limit = max (limit , 0 ) if limit is not None else None
411453 skip = max (skip , 0 )
412454
@@ -440,16 +482,24 @@ async def list_documents_flexible(
440482
441483 final_where_clause = " AND " .join (where_clauses ) if where_clauses else "TRUE"
442484
443- documents : List [Document ] = []
485+ documents : List [Any ] = []
444486 returned_count = 0
445487 has_more = False
446488
447489 fetch_documents = return_documents and (limit is None or limit > 0 )
448490
449491 if fetch_documents :
450- # Note: We always select all columns from the database
451- # Field projection is handled at the application layer for simplicity
452- base_query = select (DocumentModel ).where (text (final_where_clause ).bindparams (** filter_params ))
492+ projection_fields = self ._resolve_document_projection_fields (fields )
493+ if projection_fields :
494+ # Select only the columns the projection needs (skips the heavy
495+ # system_metadata/content read entirely).
496+ selected_columns = self ._document_projection_columns (projection_fields )
497+ base_query = select (* selected_columns ).where (
498+ text (final_where_clause ).bindparams (** filter_params )
499+ )
500+ else :
501+ base_query = select (DocumentModel ).where (text (final_where_clause ).bindparams (** filter_params ))
502+
453503 order_clause = self ._resolve_document_sort_clause (sort_by , sort_direction )
454504 if order_clause is not None :
455505 base_query = base_query .order_by (order_clause , DocumentModel .external_id .asc ())
@@ -462,13 +512,20 @@ async def list_documents_flexible(
462512 base_query = base_query .limit (fetch_limit )
463513
464514 result = await session .execute (base_query )
465- doc_models = result .scalars ().all ()
466-
467- if fetch_limit is not None and len (doc_models ) > limit :
468- has_more = True
469- doc_models = doc_models [:limit ]
470515
471- documents = [Document (** _document_model_to_dict (doc_model )) for doc_model in doc_models ]
516+ if projection_fields :
517+ documents = [
518+ self ._document_projection_row_to_dict (row , projection_fields ) for row in result .mappings ()
519+ ]
520+ if fetch_limit is not None and len (documents ) > limit :
521+ has_more = True
522+ documents = documents [:limit ]
523+ else :
524+ doc_models = result .scalars ().all ()
525+ if fetch_limit is not None and len (doc_models ) > limit :
526+ has_more = True
527+ doc_models = doc_models [:limit ]
528+ documents = [Document (** _document_model_to_dict (doc_model )) for doc_model in doc_models ]
472529 returned_count = len (documents )
473530
474531 total_count : Optional [int ] = None
@@ -568,6 +625,60 @@ def _resolve_document_sort_clause(self, sort_by: Optional[str], sort_direction:
568625 f"{ direction } NULLS LAST"
569626 )
570627
@staticmethod
def _resolve_document_projection_fields(fields: Optional[List[str]]) -> Optional[set]:
    """Translate requested API field names into the set of projection keys to select.

    Each requested field is reduced to its root (the part before the first ``.``).
    Roots present in ``DOCUMENT_PROJECTION_COLUMN_MAP`` are selected directly; a root
    listed in ``SUMMARY_METADATA_KEYS`` pulls in ``system_metadata``; ``page_count``
    pulls in both ``system_metadata`` and ``chunk_ids``. Any other root is ignored.

    Returns:
        ``None`` when ``fields`` is empty or contains only blank entries (meaning
        "no projection"); otherwise a set of projection keys that always contains
        ``external_id``.
    """
    roots = set()
    for raw_field in fields or ():
        if raw_field and raw_field.strip():
            roots.add(raw_field.strip().split(".", 1)[0])
    if not roots:
        return None

    # external_id is always selected so every row can be tied back to its document.
    columns = {"external_id"}
    columns.update(roots.intersection(DOCUMENT_PROJECTION_COLUMN_MAP))

    # Roots without a column of their own may still require backing columns.
    for derived in roots.difference(DOCUMENT_PROJECTION_COLUMN_MAP):
        if derived in SUMMARY_METADATA_KEYS:
            # summary_* values are read out of system_metadata.
            columns.add("system_metadata")
        elif derived == "page_count":
            columns.update(("system_metadata", "chunk_ids"))

    return columns
656+
@staticmethod
def _document_projection_columns(fields: set):
    """Build the labeled SQLAlchemy columns for a projection, in a stable order.

    Columns are emitted in ``DOCUMENT_PROJECTION_ORDER`` order, restricted to the
    names present in ``fields``, and each is labeled with its public field name.
    """
    labeled_columns = []
    for name in DOCUMENT_PROJECTION_ORDER:
        if name not in fields:
            continue
        labeled_columns.append(DOCUMENT_PROJECTION_COLUMN_MAP[name].label(name))
    return labeled_columns
663+
@staticmethod
def _document_projection_row_to_dict(row: Any, fields: set) -> Dict[str, Any]:
    """Turn one projected result row into a plain dict keyed by public field name.

    ``row`` is copied with ``dict(row)``. Selected JSON-style columns that came back
    as ``None`` are replaced by ``{}`` and a ``None`` ``chunk_ids`` by ``[]``. When
    ``system_metadata`` is part of ``fields`` and its value is a dict, every key in
    ``SUMMARY_METADATA_KEYS`` is also copied to the top level (``None`` if absent).
    """
    document = dict(row)

    # Empty-value factory for each column that must not surface as None.
    empty_factories = (
        ("metadata", dict),
        ("metadata_types", dict),
        ("storage_info", dict),
        ("system_metadata", dict),
        ("additional_metadata", dict),
        ("chunk_ids", list),
    )
    for column, make_empty in empty_factories:
        if column in document and document[column] is None:
            document[column] = make_empty()

    if "system_metadata" not in fields:
        return document

    system_metadata = document.get("system_metadata") or {}
    if isinstance(system_metadata, dict):
        # Hoist the summary values next to the other top-level fields.
        document.update({key: system_metadata.get(key) for key in SUMMARY_METADATA_KEYS})

    return document
681+
571682 async def update_document (
572683 self ,
573684 document_id : str ,
0 commit comments