Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scripts/nvl72/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Benchmark run outputs (timestamped result dirs, nsys profiles) — not source.
results/
results_profile/
39 changes: 39 additions & 0 deletions scripts/nvl72/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# NVL72 (GB200) multi-GPU TPC-H benchmark harness

Reproducible TPC-H SF1000 benchmarking for Super Sirius on a GB200 NVL72 node
(2 Grace + 4 Blackwell in the configuration these configs target). Each query runs
in its own DuckDB process. For the host/gpu scenarios, the columns the query references
are first pinned in that tier across the configured GPU count (disk scenarios read the
parquet files unpinned); the query is then timed over cold+warm iterations.

## Contents

- `configs/sirius_{1,2,4}gpu.yaml` — Sirius configs differing only in `topology.num_gpus`.
Tuned for SF1000 on GB200: `scan_task_batch_size`/`concat_batch_bytes = 5 GiB`,
host tier `capacity_bytes = 430 GiB` (~90% of one Grace LPDDR node), GPU usage 0.9.
- `run_benchmarks.sh` — driver: runs the 22 queries per scenario, each in an isolated
process, with per-query timeout and status classification (ok / cuda_error / timeout /
fallback); writes a timestamped dir under `results/` (CSV + summary).
- `profile_query.sh` — nsys-profile a single pinned query (capture scoped to execution
via `profiler_start()`/`profiler_stop()`).
- `gen_query_sql.py` — generates per-query SQL (views + `SET gpu_execution` + per-query
column pinning), reusing `queries.py` / `tpch_pin_columns.py` from `test/tpch_performance/`.

## Usage

From the repo root, with Sirius built (`pixi run make -j$(nproc)`):

```bash
# all 7 scenarios (disk/host {1,2,4} GPU + gpu 4-GPU), all 22 queries
./scripts/nvl72/run_benchmarks.sh

# subset
SCENARIOS="host_4gpu" QUERIES="1,9,21" ./scripts/nvl72/run_benchmarks.sh

# point at a different dataset
DATA=/path/to/sf1000 ./scripts/nvl72/run_benchmarks.sh
```

Env knobs: `SCENARIOS`, `QUERIES`, `DATA`, `DUCKDB`, `QUERY_TIMEOUT`, `ITERATIONS`, `OUT`.

> Reading SF1000 from a GPFS mount (e.g. `/scratch`) currently requires the temporary
> io_uring workaround in `src/io/uring/uring_reactor.cpp` (see the TODO there).
34 changes: 34 additions & 0 deletions scripts/nvl72/configs/sirius_1gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Sirius config for TPC-H SF1000 multi-GPU benchmark — 1 GPU.
# Only `topology.num_gpus` differs across the 1/2/4-GPU configs in this dir.
# GPU usage fraction 0.9 of a ~189 GiB GB200 ~= 170 GiB per GPU.
# Host capacity is PER host space, and Sirius creates one host space per CPU
# NUMA node. On this GB200 there are 2 such nodes (Grace LPDDR, ~478 GiB each,
# nodes 0/1); the other NUMA nodes (2,10,18,26) are GPU HBM and belong to the
# GPU tier, not the host tier. 430 GiB ~= 90% of one Grace node -> ~860 GiB
# total host tier. (free/htop's 1.65 TiB = ~957 GiB Grace LPDDR + ~736 GiB HBM.)
sirius:
topology:
num_gpus: 1
memory:
gpu:
usage_limit_fraction: 0.9
reservation_limit_fraction: 0.9
host:
capacity_bytes: 430 GiB
executor:
pipeline:
num_threads: 4
duckdb_scan:
num_threads: 2
task_creator:
num_threads: 2
downgrade:
num_threads: 1
monitor_period_ms: 10
operator_params:
scan_task_batch_size: 5368709120 # 5 GiB: fewer, larger scan tasks -> ~8x fewer allocations (q9 256s->32s); see scan-batch sweep
default_scan_task_varchar_size: 256
max_sort_partition_bytes: 0
hash_partition_bytes: 100000000
concat_batch_bytes: 5368709120
max_build_hash_table_bytes: 90000000
34 changes: 34 additions & 0 deletions scripts/nvl72/configs/sirius_2gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Sirius config for TPC-H SF1000 multi-GPU benchmark — 2 GPUs.
# Only `topology.num_gpus` differs across the 1/2/4-GPU configs in this dir.
# GPU usage fraction 0.9 of a ~189 GiB GB200 ~= 170 GiB per GPU.
# Host capacity is PER host space, and Sirius creates one host space per CPU
# NUMA node. On this GB200 there are 2 such nodes (Grace LPDDR, ~478 GiB each,
# nodes 0/1); the other NUMA nodes (2,10,18,26) are GPU HBM and belong to the
# GPU tier, not the host tier. 430 GiB ~= 90% of one Grace node -> ~860 GiB
# total host tier. (free/htop's 1.65 TiB = ~957 GiB Grace LPDDR + ~736 GiB HBM.)
sirius:
topology:
num_gpus: 2
memory:
gpu:
usage_limit_fraction: 0.9
reservation_limit_fraction: 0.9
host:
capacity_bytes: 430 GiB
executor:
pipeline:
num_threads: 4
duckdb_scan:
num_threads: 2
task_creator:
num_threads: 2
downgrade:
num_threads: 1
monitor_period_ms: 10
operator_params:
scan_task_batch_size: 5368709120 # 5 GiB: fewer, larger scan tasks -> ~8x fewer allocations (q9 256s->32s); see scan-batch sweep
default_scan_task_varchar_size: 256
max_sort_partition_bytes: 0
hash_partition_bytes: 100000000
concat_batch_bytes: 5368709120
max_build_hash_table_bytes: 90000000
34 changes: 34 additions & 0 deletions scripts/nvl72/configs/sirius_4gpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Sirius config for TPC-H SF1000 multi-GPU benchmark — 4 GPUs.
# Only `topology.num_gpus` differs across the 1/2/4-GPU configs in this dir.
# GPU usage fraction 0.9 of a ~189 GiB GB200 ~= 170 GiB per GPU.
# Host capacity is PER host space, and Sirius creates one host space per CPU
# NUMA node. On this GB200 there are 2 such nodes (Grace LPDDR, ~478 GiB each,
# nodes 0/1); the other NUMA nodes (2,10,18,26) are GPU HBM and belong to the
# GPU tier, not the host tier. 430 GiB ~= 90% of one Grace node -> ~860 GiB
# total host tier. (free/htop's 1.65 TiB = ~957 GiB Grace LPDDR + ~736 GiB HBM.)
sirius:
topology:
num_gpus: 4
memory:
gpu:
usage_limit_fraction: 0.9
reservation_limit_fraction: 0.9
host:
capacity_bytes: 430 GiB
executor:
pipeline:
num_threads: 4
duckdb_scan:
num_threads: 2
task_creator:
num_threads: 2
downgrade:
num_threads: 1
monitor_period_ms: 10
operator_params:
scan_task_batch_size: 5368709120 # 5 GiB: fewer, larger scan tasks -> ~8x fewer allocations (q9 256s->32s); see scan-batch sweep
default_scan_task_varchar_size: 256
max_sort_partition_bytes: 0
hash_partition_bytes: 100000000
concat_batch_bytes: 5368709120
max_build_hash_table_bytes: 90000000
144 changes: 144 additions & 0 deletions scripts/nvl72/gen_query_sql.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
#!/usr/bin/env python3
"""Generate a DuckDB CLI script for ONE TPC-H query in the multi-GPU 1k benchmark.

The emitted script:
1. registers a view per TPC-H table over the parquet files (explicit file
list, exactly like performance_test.py so pin path-matching works),
2. enables transparent GPU execution (`SET gpu_execution = true`),
3. for pinned scenarios, pins the query's referenced columns into the chosen
cache tier (host/gpu) using the per-query column map in
``tpch_pin_columns.py``,
4. runs the query `--iterations` times, each wrapped in `.timer on/off` with a
unique marker line so the driver can attribute each "Run Time (s): real X"
line to a specific (query, iteration),
5. unpins.

Timing contract for the driver: `.timer` is ON *only* around each query
statement, so every "Run Time (s): real" line in the CLI output belongs to a
query iteration, immediately preceded by its `MARKER_PREFIX q<N> iter<k>` line.

This reuses the project's tested query text and per-query pin column lists; it
does not redefine them. Plain Python — no duckdb import — so it runs under any
interpreter.
"""
from __future__ import annotations

import argparse
import glob
import os
import sys

# Reuse the repo's query text and pin-column map. This script lives in
# scripts/nvl72/; queries.py and tpch_pin_columns.py live in test/tpch_performance/.
# That directory is prepended to sys.path so the plain `import`s that follow
# resolve to those modules; the insert must therefore run before them.
_HERE = os.path.dirname(os.path.abspath(__file__))  # directory containing this script
_REPO = os.path.dirname(os.path.dirname(_HERE))  # two directory levels above _HERE
_TPCH_PERF = os.path.join(_REPO, "test", "tpch_performance")
sys.path.insert(0, _TPCH_PERF)

from queries import QUERIES # noqa: E402
import tpch_pin_columns as pin # noqa: E402

# Table names for which emit() creates one view each over the resolved parquet files.
TPCH_TABLES = [
    "customer",
    "lineitem",
    "nation",
    "orders",
    "part",
    "partsupp",
    "region",
    "supplier",
]

# Prefix of the marker row selected right before each timed query, in the form
# "<MARKER_PREFIX> q<N> iter<k>"; the module docstring describes how the driver
# uses it to attribute timer output to a (query, iteration).
MARKER_PREFIX = "###SIRIUS_BENCH"


def resolve_parquet_files(parquet_dir: str, table: str) -> list[str]:
    """Return every parquet file that backs ``table`` under ``parquet_dir``.

    Three layouts are probed in a fixed order (noted in the original as the
    same order performance_test.py uses):

    1. ``<parquet_dir>/<table>.parquet``
    2. ``<parquet_dir>/<table>_*.parquet``
    3. ``<parquet_dir>/<table>/*.parquet``

    Matches from all three layouts are concatenated; each layout's matches are
    sorted on their own, so the overall list is grouped by layout rather than
    globally sorted. An empty list means nothing matched.
    """
    layouts = (
        os.path.join(parquet_dir, f"{table}.parquet"),
        os.path.join(parquet_dir, f"{table}_*.parquet"),
        os.path.join(parquet_dir, table, "*.parquet"),
    )
    return [
        match
        for layout in layouts
        for match in sorted(glob.glob(layout))
    ]


def _sql_string_literal(value: str) -> str:
    """Return ``value`` as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + value.replace("'", "''") + "'"


def emit(
    parquet_dir: str, qnum: int, tier: str, iterations: int, profile: bool = False
) -> str:
    """Build the DuckDB CLI script text for one TPC-H query.

    The script is assembled in five steps: one view per table in
    ``TPCH_TABLES``, ``SET gpu_execution = true``, optional pinning, the timed
    query run(s), and optional unpinning.

    Args:
        parquet_dir: Directory searched for each table's parquet files.
        qnum: Query number; the text is looked up as ``QUERIES[f"q{qnum}"]``.
        tier: ``"host"`` or ``"gpu"`` adds pin/unpin statements; any other
            value (e.g. ``"none"``) emits no pinning at all.
        iterations: Number of timed runs in normal mode. Ignored when
            ``profile`` is true.
        profile: When true, emit a single run bracketed by
            ``CALL profiler_start()`` / ``CALL profiler_stop()``.

    Returns:
        The script as one newline-terminated string.

    Raises:
        FileNotFoundError: If no parquet file is found for one of the tables.
        KeyError: If ``QUERIES`` has no entry for ``q<qnum>``.

    Side effects:
        For pinned tiers, sets ``os.environ["SIRIUS_PIN_TIER"]`` to ``tier``
        before calling ``pin.emit_pin`` (presumably that is how the pin helper
        learns the tier; confirm against tpch_pin_columns.py).
    """
    pinned = tier in ("host", "gpu")
    lines: list[str] = []

    # 1. views over the parquet tables
    for table in TPCH_TABLES:
        files = resolve_parquet_files(parquet_dir, table)
        if not files:
            raise FileNotFoundError(
                f"No parquet files for table '{table}' in {parquet_dir}"
            )
        # Quote each path as a proper SQL literal so a path containing a
        # single quote cannot terminate the literal early and break the script.
        file_list = ",".join(_sql_string_literal(f) for f in files)
        lines.append(
            f"CREATE OR REPLACE VIEW {table} AS SELECT * FROM read_parquet([{file_list}]);"
        )

    # 2. transparent GPU execution
    lines.append("SET gpu_execution = true;")
    # Keep the timer off except around the query itself, so every timing line
    # in the CLI output belongs to a query iteration.
    lines.append(".timer off")

    # 3. pin (host/gpu tiers only)
    if pinned:
        os.environ["SIRIUS_PIN_TIER"] = tier
        lines.append(pin.emit_pin(qnum, parquet_dir).strip())

    # 4. timed iterations
    query = QUERIES[f"q{qnum}"].strip().rstrip(";")
    if profile:
        # Profiling mode: pin + views are already done (outside the capture
        # window). Bracket ONLY the query execution with profiler start/stop
        # so `nsys --capture-range=cudaProfilerApi` records query time only.
        # Single iteration; `iterations` is ignored.
        lines.append(f"SELECT '{MARKER_PREFIX} q{qnum} iter0' AS marker;")
        lines.append("CALL profiler_start();")
        lines.append(".timer on")
        lines.append(query + ";")
        lines.append(".timer off")
        lines.append("CALL profiler_stop();")
    else:
        for it in range(iterations):
            # The marker row immediately precedes the timed statement so the
            # driver can tie the timing line to this (query, iteration).
            lines.append(f"SELECT '{MARKER_PREFIX} q{qnum} iter{it}' AS marker;")
            lines.append(".timer on")
            lines.append(query + ";")
            lines.append(".timer off")

    # 5. unpin
    if pinned:
        lines.append(pin.emit_unpin(qnum).strip())

    return "\n".join(lines) + "\n"


def main(argv: list[str]) -> int:
    """Parse command-line arguments and write the generated script to stdout.

    Args:
        argv: Command-line arguments without the program name.

    Returns:
        0 on success.

    Raises:
        SystemExit: Raised by argparse (status 2) for unparsable arguments, a
            ``--query`` outside the documented 1..22 range, or a
            ``--iterations`` below 1 in non-profile mode.
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--data", required=True, help="TPC-H parquet directory")
    ap.add_argument("--query", required=True, type=int, help="query number 1..22")
    ap.add_argument(
        "--tier",
        default="none",
        choices=["none", "host", "gpu"],
        help="pin tier; 'none' reads from disk (no pinning)",
    )
    ap.add_argument("--iterations", default=2, type=int)
    ap.add_argument(
        "--profile",
        action="store_true",
        help="wrap the (single) query in CALL profiler_start()/profiler_stop() for nsys "
        "--capture-range=cudaProfilerApi, so only query execution is captured (pin excluded)",
    )
    args = ap.parse_args(argv)

    # Reject bad values up front with a usage error instead of a KeyError
    # traceback from the query lookup.
    if not 1 <= args.query <= 22:
        ap.error(f"--query must be in 1..22, got {args.query}")
    # Profile mode always runs exactly one iteration, so --iterations only
    # matters (and is only validated) in normal mode; fewer than one would
    # emit a script that never runs the query.
    if not args.profile and args.iterations < 1:
        ap.error(f"--iterations must be >= 1, got {args.iterations}")

    sys.stdout.write(
        emit(args.data, args.query, args.tier, args.iterations, args.profile)
    )
    return 0


if __name__ == "__main__":
raise SystemExit(main(sys.argv[1:]))
Loading
Loading