Skip to content

Review processing of nested data #851

@krlmlr

Description

@krlmlr

igraph

options(conflicts.policy = list(warn.conflicts = FALSE))
library(tidyverse)
library(igraph)

# Create Parquet file

# Fixed seed so the random graphs and payloads are reproducible
set.seed(42)

# One random graph per id; the id is also passed as the vertex count `n`
data <- tibble(
  id = 10:30,
  graph = lapply(id, function(n) sample_gnp(n = n, p = 0.3)),
  payload = runif(length(id)),
)

# The graph column can't be written, so swap it for per-graph edge lists
parquet_data <- mutate(
  data,
  edgelists = lapply(graph, as_data_frame),
  graph = NULL
)

# FIXME: Can't write with duckplyr?

arrow::write_parquet(parquet_data, "igraph.parquet")

# Read and process data with duckplyr

# Lazy handle on the Parquet file
lazy_data <- duckplyr::read_parquet_duckdb("igraph.parquet")

# Keep only the columns needed for the igraph step and explicitly bring
# them into memory
igraph_computation_input <- collect(select(lazy_data, id, edgelists))

# Rebuild an igraph object from each stored edge list and derive one integer
# per graph, keyed by `id`.
#
# * `directed = FALSE`: the graphs were generated by sample_gnp() with its
#   default (undirected) setting, whereas graph_from_data_frame() defaults to
#   `directed = TRUE`; without the argument the round trip would silently
#   turn every graph into a directed one.
# * The result column is called `num_components`, so it is computed with
#   count_components(); diameter() measures the longest shortest path, which
#   does not match the column name.
#
# NOTE(review): an edge list carries no information about vertices without
# edges, so isolated vertices of the original graphs are presumably missing
# here and not counted as components -- confirm whether that matters.
igraph_result <-
  igraph_computation_input |>
  mutate(
    graph = map(edgelists, \(el) graph_from_data_frame(el, directed = FALSE)),
    num_components = map_int(graph, \(g) count_components(g))
  ) |>
  select(id, num_components)

# Continue with lazy operation

# Attach the in-memory igraph result to the lazy Parquet-backed table
lazy_data_with_result <- left_join(lazy_data, igraph_result, by = "id")

# Show the query plan for the combined table
explain(lazy_data_with_result)
#> ┌───────────────────────────┐
#> │         PROJECTION        │
#> │    ────────────────────   │
#> │             id            │
#> │          payload          │
#> │         edgelists         │
#> │       num_components      │
#> │                           │
#> │          ~21 rows         │
#> └─────────────┬─────────────┘
#> ┌─────────────┴─────────────┐
#> │         HASH_JOIN         │
#> │    ────────────────────   │
#> │      Join Type: LEFT      │
#> │                           │
#> │        Conditions:        ├──────────────┐
#> │ id_x IS NOT DISTINCT FROM │              │
#> │            id_y           │              │
#> │                           │              │
#> │          ~21 rows         │              │
#> └─────────────┬─────────────┘              │
#> ┌─────────────┴─────────────┐┌─────────────┴─────────────┐
#> │       READ_PARQUET        ││     R_DATAFRAME_SCAN      │
#> │    ────────────────────   ││    ────────────────────   │
#> │         Function:         ││      Text: data.frame     │
#> │        READ_PARQUET       ││                           │
#> │                           ││        Projections:       │
#> │        Projections:       ││             id            │
#> │             id            ││       num_components      │
#> │          payload          ││                           │
#> │         edgelists         ││                           │
#> │                           ││                           │
#> │          ~21 rows         ││          ~21 rows         │
#> └───────────────────────────┘└───────────────────────────┘

Created on 2026-02-22 with reprex v2.1.1

nested

library(tidyverse)

# Five rows, one column per nesting flavour:
#   a: plain integer
#   b: list_of character vectors of growing length
#   c: packed data frame column
#   d: list_of tibbles with a growing number of rows
#   e: list_of tibbles whose column is itself a list_of tibbles holding a
#      list_of doubles
data <- tibble(
  a = 1:5,
  b = vctrs::as_list_of(
    map(1:5, \(n) letters[seq_len(n)])
  ),
  c = tibble(c1 = 11:15, c2 = 21:25),
  d = vctrs::as_list_of(
    map(1:5, \(n) tibble(x1 = rev(seq_len(n)), x2 = LETTERS[seq_len(n)]))
  ),
  e = vctrs::as_list_of(
    map(1:5, \(i) {
      tibble(
        x = vctrs::as_list_of(
          map(1:3, \(j) tibble(z = vctrs::list_of(1, 2)))
        )
      )
    })
  ),
)

# Print the tibble to show the type of every column
print(data)
#> # A tibble: 5 × 5
#>       a           b  c$c1   $c2                  d                  e
#>   <int> <list<chr>> <int> <int> <list<tibble[,2]>> <list<tibble[,1]>>
#> 1     1         [1]    11    21            [1 × 2]            [3 × 1]
#> 2     2         [2]    12    22            [2 × 2]            [3 × 1]
#> 3     3         [3]    13    23            [3 × 2]            [3 × 1]
#> 4     4         [4]    14    24            [4 × 2]            [3 × 1]
#> 5     5         [5]    15    25            [5 × 2]            [3 × 1]

# Drill down to the innermost level of column `e`
data[["e"]][[3]][["x"]][[1]][["z"]]
#> <list_of<double>[2]>
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2

# Writing the nested tibble through duckplyr fails on the first list_of
# column, as the error below shows
duckplyr::compute_parquet(data, "nested-duckdb.parquet")
#> Error in `duckplyr::compute_parquet()`:
#> ! Can't convert columns of class <vctrs_list_of/vctrs_vctr/list> to
#>   relational. Affected column: `b`.

Created on 2026-02-22 with reprex v2.1.1

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions