[python] Fix Ray read_paimon dropping nested projection (reads nested leaves as NULL) (#8269)

TheR1sing3un · web-flow · commit 54545a9ae5f5 · 2026-06-19T08:56:51.000+08:00
diff --git a/paimon-python/pypaimon/read/datasource/ray_datasource.py b/paimon-python/pypaimon/read/datasource/ray_datasource.py
@@ -124,6 +124,7 @@ def get_read_tasks(self, parallelism: int, **kwargs) -> List:
         table = self._split_provider.table()
         predicate = self._split_provider.predicate()
         read_type = self._split_provider.read_type()
+        nested_name_paths = self._split_provider.nested_name_paths()
         splits = self._split_provider.splits()
         limit = self._split_provider.limit()
         if not splits:
@@ -148,11 +149,17 @@ def _get_read_task(
                 read_type=read_type,
                 schema=schema,
                 limit=limit,
+                nested_name_paths=nested_name_paths,
         ) -> Iterable[pyarrow.Table]:
             """Read function that will be executed by Ray workers."""
             from pypaimon.read.table_read import TableRead
+            # nested_name_paths must be forwarded so a nested-leaf projection
+            # widens to the parent struct and extracts the leaves; without it
+            # the worker treats the flattened leaf names as missing top-level
+            # columns and reads every projected leaf as NULL.
             worker_table_read = TableRead(
-                table, predicate, read_type, limit=limit)
+                table, predicate, read_type, limit=limit,
+                nested_name_paths=nested_name_paths)
 
             batch_reader = worker_table_read.to_arrow_batch_reader(splits)
             has_data = False
@@ -179,6 +186,7 @@ def _get_read_task(
             read_type=read_type,
             schema=schema,
             limit=limit,
+            nested_name_paths=nested_name_paths,
         )
 
         read_tasks = []
diff --git a/paimon-python/pypaimon/read/datasource/split_provider.py b/paimon-python/pypaimon/read/datasource/split_provider.py
@@ -67,6 +67,17 @@ def limit(self) -> Optional[int]:
         """
         return None
 
+    def nested_name_paths(self) -> Optional[List[List[str]]]:
+        """Parallel name paths for a nested-leaf projection, or ``None``.
+
+        Forwarded to the per-task ``TableRead`` so a projection like
+        ``['mv.latest_value.x']`` is read by widening to the parent struct and
+        extracting the requested leaves. Without it the worker treats the
+        flattened leaf names as missing top-level columns and reads every
+        projected leaf as NULL.
+        """
+        return None
+
 
 class CatalogSplitProvider(SplitProvider):
     """Plan splits from a fully-qualified table identifier and catalog options.
@@ -124,6 +135,7 @@ def __init__(
         self._table_cached = None
         self._splits_cached = None
         self._read_type_cached = None
+        self._nested_name_paths_cached = None
 
     def _ensure_table(self):
         if self._table_cached is None:
@@ -154,6 +166,7 @@ def _ensure_planned(self):
         if self._limit is not None:
             rb = rb.with_limit(self._limit)
         self._read_type_cached = rb.read_type()
+        self._nested_name_paths_cached = rb._nested_name_paths()
         self._splits_cached = rb.new_scan().plan().splits()
 
     @property
@@ -171,6 +184,10 @@ def read_type(self):
         self._ensure_planned()
         return self._read_type_cached
 
+    def nested_name_paths(self) -> Optional[List[List[str]]]:
+        self._ensure_planned()
+        return self._nested_name_paths_cached
+
     def predicate(self):
         return self._predicate
 
@@ -190,12 +207,13 @@ class PreResolvedSplitProvider(SplitProvider):
     """
 
     def __init__(self, table, splits: List[Split], read_type, predicate=None,
-                 limit: Optional[int] = None):
+                 limit: Optional[int] = None, nested_name_paths=None):
         self._table = table
         self._splits = splits
         self._read_type = read_type
         self._predicate = predicate
         self._limit = limit
+        self._nested_name_paths = nested_name_paths
 
     def table(self):
         return self._table
@@ -206,6 +224,9 @@ def splits(self) -> List[Split]:
     def read_type(self):
         return self._read_type
 
+    def nested_name_paths(self) -> Optional[List[List[str]]]:
+        return self._nested_name_paths
+
     def predicate(self):
         return self._predicate
 
diff --git a/paimon-python/pypaimon/read/table_read.py b/paimon-python/pypaimon/read/table_read.py
@@ -530,6 +530,7 @@ def to_ray(
                 read_type=self.read_type,
                 predicate=self.predicate,
                 limit=self.limit,
+                nested_name_paths=self.nested_name_paths,
             )
         )
         ds = ray.data.read_datasource(
diff --git a/paimon-python/pypaimon/tests/ray_data_test.py b/paimon-python/pypaimon/tests/ray_data_test.py
@@ -834,6 +834,41 @@ def process_blob(batch):
             "Blob data column should match"
         )
 
+    def test_to_ray_with_nested_projection(self):
+        """to_ray() respects a nested-leaf projection.
+
+        Sibling of the read_paimon() nested-projection test: this exercises
+        the PreResolvedSplitProvider entry point (TableRead.to_ray), which
+        must also forward nested_name_paths to the worker TableRead. Without
+        it the worker treats the flattened leaf name as a missing top-level
+        column and reads the projected leaf as NULL.
+        """
+        inner = pa.struct([('a', pa.int64()), ('b', pa.string())])
+        pa_schema = pa.schema([('id', pa.int64()), ('payload', inner)])
+        schema = Schema.from_pyarrow_schema(pa_schema)
+        self.catalog.create_table('default.test_ray_nested_proj', schema, False)
+        table = self.catalog.get_table('default.test_ray_nested_proj')
+
+        write_builder = table.new_batch_write_builder()
+        writer = write_builder.new_write()
+        writer.write_arrow(pa.Table.from_pylist(
+            [{'id': 1, 'payload': {'a': 10, 'b': 'x'}},
+             {'id': 2, 'payload': {'a': 20, 'b': 'y'}}],
+            schema=pa_schema))
+        commit = write_builder.new_commit()
+        commit.commit(writer.prepare_commit())
+        writer.close()
+
+        read_builder = table.new_read_builder().with_projection(['id', 'payload.a'])
+        table_read = read_builder.new_read()
+        splits = read_builder.new_scan().plan().splits()
+
+        ray_dataset = table_read.to_ray(splits, override_num_blocks=1)
+        rows = {r['id']: r for r in ray_dataset.take_all()}
+        self.assertEqual(set(rows.keys()), {1, 2})
+        self.assertEqual(rows[1]['payload_a'], 10)
+        self.assertEqual(rows[2]['payload_a'], 20)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/paimon-python/pypaimon/tests/ray_integration_test.py b/paimon-python/pypaimon/tests/ray_integration_test.py
@@ -119,6 +119,31 @@ def test_read_paimon_with_projection(self):
         self.assertEqual(set(df.columns), {'id', 'name'})
         self.assertEqual(len(df), 2)
 
+    def test_read_paimon_with_nested_projection(self):
+        """read_paimon() respects a nested-leaf projection.
+
+        Regression for the worker-side TableRead being rebuilt without
+        nested_name_paths: a projection like ['payload.a'] used to read every
+        nested leaf as NULL because the worker treated the flattened leaf name
+        as a missing top-level column.
+        """
+        from pypaimon.ray import read_paimon
+
+        inner = pa.struct([('a', pa.int64()), ('b', pa.string())])
+        pa_schema = pa.schema([('id', pa.int32()), ('payload', inner)])
+        identifier = self._create_and_populate_table(
+            'test_read_nested_proj', pa_schema,
+            {'id': [1, 2],
+             'payload': [{'a': 10, 'b': 'x'}, {'a': 20, 'b': 'y'}]},
+        )
+
+        ds = read_paimon(identifier, self.catalog_options,
+                         projection=['id', 'payload.a'])
+        rows = {r['id']: r for r in ds.take_all()}
+        self.assertEqual(set(rows.keys()), {1, 2})
+        self.assertEqual(rows[1]['payload_a'], 10)
+        self.assertEqual(rows[2]['payload_a'], 20)
+
     def test_read_paimon_with_filter(self):
         """read_paimon() pushes down a predicate filter."""
         from pypaimon.ray import read_paimon

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
|  |  | `@@ -530,6 +530,7 @@ def to_ray(` |
| `530` | `530` | `read_type=self.read_type,` |
| `531` | `531` | `predicate=self.predicate,` |
| `532` | `532` | `limit=self.limit,` |
|  | `533` | `+ nested_name_paths=self.nested_name_paths,` |
| `533` | `534` | `)` |
| `534` | `535` | `)` |
| `535` | `536` | `ds = ray.data.read_datasource(` |