Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,14 @@ jobs:

# Single pass: autotune search + codegen + compilation happen inline,
# so --measure-compile-time reports end-to-end user-visible compile time.
${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 python benchmarks/run.py \
# HELION_AUTOTUNE_LOG is per-kernel so each kernel in the loop writes its
# own per-config CSV and sidecar files instead of sharing one base path
# (the sink appends, so every input shape for this kernel accumulates
# into that kernel's files).
# --autotune-metrics-json captures the per-run kernel-level summary
# (kernel_id, source, shapes, hardware) and is overwrite-safe on its own.
${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 \
HELION_AUTOTUNE_LOG="$TEST_REPORTS_DIR/autotune-$kernel" \
python benchmarks/run.py \
--op $kernel \
--helion-backend "${{ inputs.backend }}" \
--metrics speedup,accuracy,latency \
Expand All @@ -213,6 +220,7 @@ jobs:
--input-sample-mode equally-spaced-k \
--output "$TEST_REPORTS_DIR/helionbench.json" \
--append-to-output \
--autotune-metrics-json "$TEST_REPORTS_DIR/autotune-metrics-$kernel.json" \
--keep-going \
${{ inputs.custom-args }}

Expand Down
5 changes: 3 additions & 2 deletions docs/api/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,9 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:

.. autoattribute:: Settings.autotune_log

When set, Helion writes per-config autotuning telemetry (kernel id, sample id, config index, generation, status, perf, compile time, timestamp, config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
The kernel identity (id, name, source, input shapes, dtypes, hardware) is written once per run to ``<value>.meta.json``. ``kernel_id`` is a stable content hash (of the kernel source and code-generation settings) that appears on every CSV row, acting as the foreign key to join rows back to the sidecar and group them by kernel across runs; ``sample_id`` additionally identifies each ``(kernel, config)`` pair so repeated benchmarks of the same config can be deduplicated.
When set, Helion writes per-config autotuning telemetry (run id, kernel id, sample id, config index, generation, status, perf, compile time, timestamp, decorator, config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
The kernel identity (run id, kernel id, name, source, input shapes, dtypes, hardware) is appended, one JSON record per run, to the ``<value>.meta.jsonl`` sidecar, and a lossless node-link dump of the kernel's device IR is appended, one record per run, to the ``<value>.ir.jsonl`` sidecar (loadable with ``networkx.node_link_graph(record, edges="links")`` and joined to the other files on ``run_id``). All of these files are opened in append mode, so multiple autotune runs that share one base path (e.g. many kernels and input shapes benchmarked in a single process) accumulate instead of overwriting each other.
Three content-derived ids tie the data together. ``kernel_id`` (a hash of the kernel source and code-generation settings) is shape/dtype independent, grouping every row for one kernel across shapes and runs. ``run_id`` (a hash of ``kernel_id`` plus input shapes, dtypes, and hardware) identifies a single autotune invocation, so each CSV row joins to exactly one ``.meta.jsonl`` record and a config's measured perf can be attributed to the specific shape/dtype/hardware it was measured on. ``sample_id`` (a hash of the kernel source and the config) identifies each ``(kernel, config)`` pair. Together ``(run_id, sample_id)`` is the natural primary key for the per-config rows: ``run_id`` pins the kernel+shape+dtype+hardware instance and ``sample_id`` pins the config.
Controlled by ``HELION_AUTOTUNE_LOG``.

.. autoattribute:: Settings.autotune_compile_timeout
Expand Down
39 changes: 36 additions & 3 deletions helion/autotuner/base_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@
from .benchmark_provider import _unset_fn
from .benchmarking import clear_jit_fast_path_caches
from .benchmarking import interleaved_bench
from .ir_features import IrGraphRecord
from .ir_features import extract_ir_graph
from .logger import AutotuningLogger
from .metrics import AutotuneMetrics
from .metrics import KernelMetadata
Expand Down Expand Up @@ -275,8 +277,9 @@ def _prepare(self) -> None:
random_seed=self.settings.autotune_random_seed,
search_algorithm=type(self).__name__,
)
# Written once per run to the <autotune_log>.meta.json sidecar so the
# per-config CSV rows can be grouped by kernel across runs.
# Appended once per run to the <autotune_log>.meta.jsonl sidecar so the
# per-config CSV rows can be joined back to it (run_id) and grouped by
# kernel (kernel_id) across runs. run_id is derived inside KernelMetadata.
self._kernel_metadata: KernelMetadata = KernelMetadata(
kernel_id=kernel_id,
kernel_name=kernel_name,
Expand All @@ -294,6 +297,34 @@ def _prepare(self) -> None:
autotune_metrics=self._autotune_metrics,
)
self.benchmark_provider.set_budget_exceeded_fn(self._autotune_budget_exceeded)
# Device IR is config-independent, so dump it once per run (joined to the
# per-config CSV rows on run_id). Only when telemetry is on.
self._ir_graph: IrGraphRecord | None = self._extract_ir_graph()

def _extract_ir_graph(self) -> IrGraphRecord | None:
    """Build the device-IR record attached to the autotune-log output.

    This is a best-effort step: any failure results in ``None`` rather than
    an error, so autotuning itself is never interrupted by it.

    Returns:
        The value produced by ``extract_ir_graph`` for this kernel's device
        IR, or ``None`` when ``settings.autotune_log`` is unset, when the
        kernel exposes no ``host_function.device_ir``, or when extraction
        raises (the failure is logged at debug level).
    """
    # Skip touching the kernel entirely when telemetry is disabled.
    ir = (
        getattr(getattr(self.kernel, "host_function", None), "device_ir", None)
        if self.settings.autotune_log
        else None
    )
    if ir is None:
        return None

    try:
        # Metadata lookups stay inside the ``try`` so that a failure there is
        # treated the same way as a failure in the extraction itself.
        meta = self._kernel_metadata
        record = extract_ir_graph(
            ir,
            run_id=meta.run_id,
            kernel_id=meta.kernel_id,
            kernel_name=meta.kernel_name,
            input_shapes=meta.input_shapes,
        )
    except Exception:
        self.log.debug("Failed to extract device IR features", exc_info=True)
        record = None
    return record

def _autotune_budget_exceeded(self) -> bool:
budget = self.settings.autotune_budget_seconds
Expand Down Expand Up @@ -476,7 +507,9 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
with exit_stack:
if self.settings.autotune_log:
exit_stack.enter_context(
self.log.autotune_logging(metadata=self._kernel_metadata)
self.log.autotune_logging(
metadata=self._kernel_metadata, ir_graph=self._ir_graph
)
)
self.log.reset()
# Autotuner triggers bugs in remote triton compile service.
Expand Down
29 changes: 17 additions & 12 deletions helion/autotuner/benchmark_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -333,18 +333,21 @@ def __init__(
)
self._jobs = self._decide_num_jobs()

def _sample_id(self, config: Config) -> str:
    """Return a stable per-(kernel, config) id for telemetry rows.

    Thin wrapper around ``_sample_identity`` retained so callers of the
    older name keep working; it returns only the ``sample_id`` element.
    """
    return self._sample_identity(config)[0]

def _sample_identity(self, config: Config) -> tuple[str, str]:
    """Return ``(sample_id, decorator)`` for a config's telemetry rows.

    ``decorator`` is ``self.kernel.format_kernel_decorator(config,
    self.settings)`` and is returned so it can be recorded alongside the
    row. ``sample_id`` is the hex SHA-256 digest of
    ``kernel_source + decorator`` (UTF-8 encoded), so the same kernel source
    combined with the same decorator string always yields the same id.

    Args:
        config: The config whose telemetry identity is requested.

    Returns:
        A ``(sample_id, decorator)`` tuple of strings.
    """
    # Format the decorator once and reuse it for both the hash input and the
    # returned value, rather than formatting it separately for each.
    decorator = self.kernel.format_kernel_decorator(config, self.settings)
    payload = self._autotune_metrics.kernel_source + decorator
    sample_id = hashlib.sha256(payload.encode("utf-8")).hexdigest()
    return sample_id, decorator

def _compute_baseline(
self,
Expand Down Expand Up @@ -817,7 +820,7 @@ def benchmark(
process_group_name=self.kernel.env.process_group_name,
)
):
sample_id = self._sample_id(config)
sample_id, decorator = self._sample_identity(config)
self.log.record_autotune_entry(
AutotuneLogEntry(
generation=self._autotune_metrics.num_generations,
Expand All @@ -826,6 +829,7 @@ def benchmark(
compile_time=compile_time,
config=config,
sample_id=sample_id,
decorator=decorator,
)
)
perf = self._benchmark_function(config, fn)
Expand All @@ -838,6 +842,7 @@ def benchmark(
compile_time=compile_time,
config=config,
sample_id=sample_id,
decorator=decorator,
)
)
results[valid_indices[index]] = BenchmarkResult(
Expand Down
Loading
Loading