sirius-db · felipeblazing · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/scripts/nvl72/.gitignore b/scripts/nvl72/.gitignore
@@ -0,0 +1,3 @@
+# Benchmark run outputs (timestamped result dirs, nsys profiles) — not source.
+results/
+results_profile/
diff --git a/scripts/nvl72/README.md b/scripts/nvl72/README.md
@@ -0,0 +1,39 @@
+# NVL72 (GB200) multi-GPU TPC-H benchmark harness
+
+Reproducible TPC-H SF1000 benchmarking for Super Sirius on a GB200 NVL72 node
+(2 Grace + 4 Blackwell in the configuration these configs target). Each query runs
+in its own DuckDB process with the referenced columns read from disk or pinned in the
+host/GPU tier (per scenario) across the configured GPU count, then is timed over cold+warm iterations.
+
+## Contents
+
+- `configs/sirius_{1,2,4}gpu.yaml` — Sirius configs differing only in `topology.num_gpus`.
+  Tuned for SF1000 on GB200: `scan_task_batch_size`/`concat_batch_bytes = 5 GiB`,
+  host tier `capacity_bytes = 430 GiB` (~90% of one Grace LPDDR node), GPU usage 0.9.
+- `run_benchmarks.sh` — driver: runs the 22 queries per scenario, each in an isolated
+  process, with per-query timeout and status classification (ok / cuda_error / timeout /
+  fallback); writes a timestamped dir under `results/` (CSV + summary).
+- `profile_query.sh` — nsys-profile a single pinned query (capture scoped to execution
+  via `profiler_start()`/`profiler_stop()`).
+- `gen_query_sql.py` — generates per-query SQL (views + `SET gpu_execution` + per-query
+  column pinning), reusing `queries.py` / `tpch_pin_columns.py` from `test/tpch_performance/`.
+
+## Usage
+
+From the repo root, with Sirius built (`pixi run make -j$(nproc)`):
+
+```bash
+# all 7 scenarios (disk/host {1,2,4} GPU + gpu 4-GPU), all 22 queries
+./scripts/nvl72/run_benchmarks.sh
+
+# subset
+SCENARIOS="host_4gpu" QUERIES="1,9,21" ./scripts/nvl72/run_benchmarks.sh
+
+# point at a different dataset
+DATA=/path/to/sf1000 ./scripts/nvl72/run_benchmarks.sh
+```
+
+Env knobs: `SCENARIOS`, `QUERIES`, `DATA`, `DUCKDB`, `QUERY_TIMEOUT`, `ITERATIONS`, `OUT`.
+
+> Reading SF1000 from a GPFS mount (e.g. `/scratch`) currently requires the temporary
+> io_uring workaround in `src/io/uring/uring_reactor.cpp` (see the TODO there).
diff --git a/scripts/nvl72/configs/sirius_1gpu.yaml b/scripts/nvl72/configs/sirius_1gpu.yaml
@@ -0,0 +1,34 @@
+# Sirius config for TPC-H SF1000 multi-GPU benchmark — 1 GPU.
+# Only `topology.num_gpus` differs across the 1/2/4-GPU configs in this dir.
+# GPU usage fraction 0.9 of a ~189 GiB GB200 ~= 170 GiB per GPU.
+# Host capacity is PER host space, and Sirius creates one host space per CPU
+# NUMA node. On this GB200 there are 2 such nodes (Grace LPDDR, ~478 GiB each,
+# nodes 0/1); the other NUMA nodes (2,10,18,26) are GPU HBM and belong to the
+# GPU tier, not the host tier. 430 GiB ~= 90% of one Grace node -> ~860 GiB
+# total host tier. (free/htop's 1.65 TiB = ~957 GiB Grace LPDDR + ~736 GiB HBM.)
+sirius:
+  topology:
+    num_gpus: 1
+  memory:
+    gpu:
+      usage_limit_fraction: 0.9
+      reservation_limit_fraction: 0.9
+    host:
+      capacity_bytes: 430 GiB
+  executor:
+    pipeline:
+      num_threads: 4
+    duckdb_scan:
+      num_threads: 2
+    task_creator:
+      num_threads: 2
+    downgrade:
+      num_threads: 1
+      monitor_period_ms: 10
+  operator_params:
+    scan_task_batch_size: 5368709120   # 5 GiB: fewer, larger scan tasks -> ~8x fewer allocations (q9 256s->32s); see scan-batch sweep
+    default_scan_task_varchar_size: 256
+    max_sort_partition_bytes: 0
+    hash_partition_bytes: 100000000
+    concat_batch_bytes: 5368709120
+    max_build_hash_table_bytes: 90000000
diff --git a/scripts/nvl72/configs/sirius_2gpu.yaml b/scripts/nvl72/configs/sirius_2gpu.yaml
@@ -0,0 +1,34 @@
+# Sirius config for TPC-H SF1000 multi-GPU benchmark — 2 GPUs.
+# Only `topology.num_gpus` differs across the 1/2/4-GPU configs in this dir.
+# GPU usage fraction 0.9 of a ~189 GiB GB200 ~= 170 GiB per GPU.
+# Host capacity is PER host space, and Sirius creates one host space per CPU
+# NUMA node. On this GB200 there are 2 such nodes (Grace LPDDR, ~478 GiB each,
+# nodes 0/1); the other NUMA nodes (2,10,18,26) are GPU HBM and belong to the
+# GPU tier, not the host tier. 430 GiB ~= 90% of one Grace node -> ~860 GiB
+# total host tier. (free/htop's 1.65 TiB = ~957 GiB Grace LPDDR + ~736 GiB HBM.)
+sirius:
+  topology:
+    num_gpus: 2
+  memory:
+    gpu:
+      usage_limit_fraction: 0.9
+      reservation_limit_fraction: 0.9
+    host:
+      capacity_bytes: 430 GiB
+  executor:
+    pipeline:
+      num_threads: 4
+    duckdb_scan:
+      num_threads: 2
+    task_creator:
+      num_threads: 2
+    downgrade:
+      num_threads: 1
+      monitor_period_ms: 10
+  operator_params:
+    scan_task_batch_size: 5368709120   # 5 GiB: fewer, larger scan tasks -> ~8x fewer allocations (q9 256s->32s); see scan-batch sweep
+    default_scan_task_varchar_size: 256
+    max_sort_partition_bytes: 0
+    hash_partition_bytes: 100000000
+    concat_batch_bytes: 5368709120
+    max_build_hash_table_bytes: 90000000
diff --git a/scripts/nvl72/configs/sirius_4gpu.yaml b/scripts/nvl72/configs/sirius_4gpu.yaml
@@ -0,0 +1,34 @@
+# Sirius config for TPC-H SF1000 multi-GPU benchmark — 4 GPUs.
+# Only `topology.num_gpus` differs across the 1/2/4-GPU configs in this dir.
+# GPU usage fraction 0.9 of a ~189 GiB GB200 ~= 170 GiB per GPU.
+# Host capacity is PER host space, and Sirius creates one host space per CPU
+# NUMA node. On this GB200 there are 2 such nodes (Grace LPDDR, ~478 GiB each,
+# nodes 0/1); the other NUMA nodes (2,10,18,26) are GPU HBM and belong to the
+# GPU tier, not the host tier. 430 GiB ~= 90% of one Grace node -> ~860 GiB
+# total host tier. (free/htop's 1.65 TiB = ~957 GiB Grace LPDDR + ~736 GiB HBM.)
+sirius:
+  topology:
+    num_gpus: 4
+  memory:
+    gpu:
+      usage_limit_fraction: 0.9
+      reservation_limit_fraction: 0.9
+    host:
+      capacity_bytes: 430 GiB
+  executor:
+    pipeline:
+      num_threads: 4
+    duckdb_scan:
+      num_threads: 2
+    task_creator:
+      num_threads: 2
+    downgrade:
+      num_threads: 1
+      monitor_period_ms: 10
+  operator_params:
+    scan_task_batch_size: 5368709120   # 5 GiB: fewer, larger scan tasks -> ~8x fewer allocations (q9 256s->32s); see scan-batch sweep
+    default_scan_task_varchar_size: 256
+    max_sort_partition_bytes: 0
+    hash_partition_bytes: 100000000
+    concat_batch_bytes: 5368709120
+    max_build_hash_table_bytes: 90000000
diff --git a/scripts/nvl72/gen_query_sql.py b/scripts/nvl72/gen_query_sql.py
@@ -0,0 +1,144 @@
+#!/usr/bin/env python3
+"""Generate a DuckDB CLI script for ONE TPC-H query in the multi-GPU 1k benchmark.
+
+The emitted script:
+  1. registers a view per TPC-H table over the parquet files (explicit file
+     list, exactly like performance_test.py so pin path-matching works),
+  2. enables transparent GPU execution (`SET gpu_execution = true`),
+  3. for pinned scenarios, pins the query's referenced columns into the chosen
+     cache tier (host/gpu) using the per-query column map in
+     ``tpch_pin_columns.py``,
+  4. runs the query `--iterations` times, each wrapped in `.timer on/off` with a
+     unique marker line so the driver can attribute each "Run Time (s): real X"
+     line to a specific (query, iteration),
+  5. unpins.
+
+Timing contract for the driver: `.timer` is ON *only* around each query
+statement, so every "Run Time (s): real" line in the CLI output belongs to a
+query iteration, immediately preceded by its `MARKER_PREFIX q<N> iter<k>` line.
+
+This reuses the project's tested query text and per-query pin column lists; it
+does not redefine them. Plain Python — no duckdb import — so it runs under any
+interpreter.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import os
+import sys
+
+# Reuse the repo's query text and pin-column map. This script lives in
+# scripts/nvl72/; queries.py and tpch_pin_columns.py live in test/tpch_performance/.
+# The directory is derived from this file's own absolute location (the repo
+# root is two levels above it), not from the current working directory.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_REPO = os.path.dirname(os.path.dirname(_HERE))
+_TPCH_PERF = os.path.join(_REPO, "test", "tpch_performance")
+# Prepended (index 0) so this directory is searched first; this must run
+# before the `queries` / `tpch_pin_columns` imports that follow.
+sys.path.insert(0, _TPCH_PERF)
+
+from queries import QUERIES  # noqa: E402
+import tpch_pin_columns as pin  # noqa: E402
+
+# Tables for which emit() registers a view; emit() raises if any of them
+# resolves to no parquet file under the data directory.
+TPCH_TABLES = [
+    "customer",
+    "lineitem",
+    "nation",
+    "orders",
+    "part",
+    "partsupp",
+    "region",
+    "supplier",
+]
+
+# Prefix of the marker row selected right before every timed query run
+# ("<prefix> q<N> iter<k>"), so a timing line can be tied to its iteration.
+MARKER_PREFIX = "###SIRIUS_BENCH"
+
+
+def resolve_parquet_files(parquet_dir: str, table: str) -> list[str]:
+    """Return every parquet file that backs ``table`` under ``parquet_dir``.
+
+    Same resolution order as performance_test.py: ``<t>.parquet``,
+    ``<t>_*.parquet``, ``<t>/*.parquet``. Matches from all three layouts are
+    concatenated in that order; within one layout they are sorted by path.
+
+    ``parquet_dir`` and ``table`` are taken literally: glob metacharacters in
+    them (``[``, ``*``, ``?``) are escaped, so a data directory such as
+    ``/data/tpch[sf1000]`` resolves instead of silently matching nothing.
+
+    Returns an empty list when nothing matches.
+    """
+    base = glob.escape(parquet_dir)
+    name = glob.escape(table)
+    patterns = (
+        os.path.join(base, f"{name}.parquet"),
+        os.path.join(base, f"{name}_*.parquet"),
+        os.path.join(base, name, "*.parquet"),
+    )
+    found: list[str] = []
+    for pattern in patterns:
+        found.extend(sorted(glob.glob(pattern)))
+    return found
+
+
+def _sql_string_literal(text: str) -> str:
+    """Return ``text`` as a single-quoted SQL string literal (``'`` doubled)."""
+    return "'" + text.replace("'", "''") + "'"
+
+
+def emit(
+    parquet_dir: str, qnum: int, tier: str, iterations: int, profile: bool = False
+) -> str:
+    """Build the DuckDB CLI script for one TPC-H query and return it as text.
+
+    Script layout, in order: ``.timer off``, one view per table in
+    TPCH_TABLES, ``SET gpu_execution = true``, the pin statements (host/gpu
+    tiers only), the marked and timed query run(s), the unpin statements.
+
+    Args:
+        parquet_dir: directory holding the TPC-H parquet files.
+        qnum: query number; the text is looked up in QUERIES as ``q<qnum>``.
+        tier: "host" or "gpu" pins the query's columns; any other value
+            (e.g. "none") emits no pin/unpin statements.
+        iterations: number of timed runs; ignored when ``profile`` is true.
+        profile: emit a single run bracketed by profiler_start()/profiler_stop().
+
+    Returns:
+        The script text, terminated by a newline.
+
+    Raises:
+        FileNotFoundError: a table in TPCH_TABLES has no parquet file.
+
+    Side effect: for the host/gpu tiers, SIRIUS_PIN_TIER is set in os.environ
+    and left set (presumably read by tpch_pin_columns -- confirm there).
+    """
+    pinned = tier in ("host", "gpu")
+
+    # Timing contract: every "Run Time" line in the CLI output must belong to
+    # a query iteration. Turn the timer off before ANY statement so the view
+    # creation and SET below stay untimed even if the CLI starts with the
+    # timer enabled.
+    lines: list[str] = [".timer off"]
+
+    # 1. views over the parquet tables
+    for table in TPCH_TABLES:
+        files = resolve_parquet_files(parquet_dir, table)
+        if not files:
+            raise FileNotFoundError(
+                f"No parquet files for table '{table}' in {parquet_dir}"
+            )
+        # Paths are quoted as SQL literals so a path containing ' cannot
+        # terminate the string early; quote-free paths are emitted unchanged.
+        file_list = ",".join(_sql_string_literal(f) for f in files)
+        lines.append(
+            f"CREATE OR REPLACE VIEW {table} AS SELECT * FROM read_parquet([{file_list}]);"
+        )
+
+    # 2. transparent GPU execution
+    lines.append("SET gpu_execution = true;")
+
+    # 3. pin (host/gpu tiers only)
+    if pinned:
+        os.environ["SIRIUS_PIN_TIER"] = tier
+        lines.append(pin.emit_pin(qnum, parquet_dir).strip())
+
+    # 4. timed run(s); the trailing ';' is normalised so exactly one is emitted
+    query = QUERIES[f"q{qnum}"].strip().rstrip(";")
+    if profile:
+        # Profiling mode: pin + views are already done (outside the capture
+        # window). Bracket ONLY the query execution with cudaProfilerStart/Stop
+        # so `nsys --capture-range=cudaProfilerApi` records query time only —
+        # no pin population, no CUDA-context init. Single iteration.
+        lines.append(f"SELECT '{MARKER_PREFIX} q{qnum} iter0' AS marker;")
+        lines.append("CALL profiler_start();")
+        lines.append(".timer on")
+        lines.append(query + ";")
+        lines.append(".timer off")
+        lines.append("CALL profiler_stop();")
+    else:
+        for it in range(iterations):
+            # The marker is selected with the timer off, so the next
+            # "Run Time" line can only come from this iteration's query.
+            lines.append(f"SELECT '{MARKER_PREFIX} q{qnum} iter{it}' AS marker;")
+            lines.append(".timer on")
+            lines.append(query + ";")
+            lines.append(".timer off")
+
+    # 5. unpin
+    if pinned:
+        lines.append(pin.emit_unpin(qnum).strip())
+
+    return "\n".join(lines) + "\n"
+
+
+def main(argv: list[str]) -> int:
+    """Parse ``argv`` (program name excluded) and write the script to stdout.
+
+    Returns 0 on success. Invalid arguments, including a ``--query`` number
+    that has no entry in QUERIES, end the process through argparse's usual
+    error path (usage message on stderr, exit status 2).
+    """
+    ap = argparse.ArgumentParser(
+        description=__doc__,
+        # The module docstring is a pre-formatted numbered list; the default
+        # formatter would re-wrap it into one paragraph in --help.
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    ap.add_argument("--data", required=True, help="TPC-H parquet directory")
+    ap.add_argument("--query", required=True, type=int, help="query number 1..22")
+    ap.add_argument(
+        "--tier",
+        default="none",
+        choices=["none", "host", "gpu"],
+        help="pin tier; 'none' reads from disk (no pinning)",
+    )
+    ap.add_argument("--iterations", default=2, type=int)
+    ap.add_argument(
+        "--profile",
+        action="store_true",
+        help="wrap the (single) query in CALL profiler_start()/profiler_stop() for nsys "
+        "--capture-range=cudaProfilerApi, so only query execution is captured (pin excluded)",
+    )
+    args = ap.parse_args(argv)
+
+    # Report an unknown query as a usage error instead of letting the
+    # QUERIES lookup inside emit() fail with a traceback.
+    if f"q{args.query}" not in QUERIES:
+        ap.error(f"unknown query number {args.query} (no 'q{args.query}' in QUERIES)")
+
+    sys.stdout.write(
+        emit(args.data, args.query, args.tier, args.iterations, args.profile)
+    )
+    return 0
+
+
+# Script entry point: main()'s return value becomes the process exit status.
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))