Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion .github/workflows/benchmark.yml
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,14 @@ jobs:

# Single pass: autotune search + codegen + compilation happen inline,
# so --measure-compile-time reports end-to-end user-visible compile time.
${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 python benchmarks/run.py \
# HELION_AUTOTUNE_LOG is per-kernel so the per-config CSV / .meta.jsonl
# sidecars don't clobber across the kernel loop (the sink appends, so
# every input shape for this kernel accumulates into one file pair).
# --autotune-metrics-json captures the per-run kernel-level summary
# (kernel_id, source, shapes, hardware) and is overwrite-safe on its own.
${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 \
HELION_AUTOTUNE_LOG="$TEST_REPORTS_DIR/autotune-$kernel" \
python benchmarks/run.py \
--op $kernel \
--helion-backend "${{ inputs.backend }}" \
--metrics speedup,accuracy,latency \
Expand All @@ -213,6 +220,7 @@ jobs:
--input-sample-mode equally-spaced-k \
--output "$TEST_REPORTS_DIR/helionbench.json" \
--append-to-output \
--autotune-metrics-json "$TEST_REPORTS_DIR/autotune-metrics-$kernel.json" \
--keep-going \
${{ inputs.custom-args }}

Expand Down
6 changes: 4 additions & 2 deletions docs/api/settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,8 +143,10 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:

.. autoattribute:: Settings.autotune_log

When set, Helion writes per-config autotuning telemetry (kernel id, sample id, config index, generation, status, perf, compile time, timestamp, config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
The kernel identity (id, name, source, input shapes, dtypes, hardware) is written once per run to ``<value>.meta.json``. ``kernel_id`` is a stable content hash (of the kernel source and code-generation settings) that appears on every CSV row, acting as the foreign key to join rows back to the sidecar and group them by kernel across runs; ``sample_id`` additionally identifies each ``(kernel, config)`` pair so repeated benchmarks of the same config can be deduplicated.
When set, Helion writes per-config autotuning telemetry (run id, timestamp, config index, generation, status, perf, compile time, minimized config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
The kernel identity (run id, name, source, input shapes, dtypes, hardware, full ``helion.settings``, and ``config_defaults``) is appended, one JSON record per run, to the ``<value>.meta.jsonl`` sidecar. All three files are opened in append mode, so multiple autotune runs that share one base path (e.g. many kernels and input shapes benchmarked in a single process) accumulate instead of overwriting each other; because ``run_id`` is content-stable, a consumer should de-duplicate on ``run_id`` when the same base path is reused across re-runs (the ``.log`` file just interleaves and carries no join key).
``run_id`` is the single join key. It is a content hash derived directly from the kernel source, the code-generation-affecting settings, and the input shapes, dtypes, and hardware, so the same invocation yields the same ``run_id`` across processes and runs and each CSV row joins to exactly one ``.meta.jsonl`` record. Runs whose search space is restricted to user-pinned ``configs`` (without ``force_autotune``) are excluded from collection.
The per-config ``config`` column is *minimized* (values equal to the kernel's defaults are dropped). To reconstruct the config as benchmarked, merge it over the run's ``config_defaults``: ``{**config_defaults, **json.loads(row_config)}``. The ``settings`` mapping is serialized JSON-safe (``json.dumps(default=str)``), so non-serializable values such as ``index_dtype`` (a ``torch.dtype``) and callables are stored as strings for analysis rather than as faithful round-trippable objects.
Controlled by ``HELION_AUTOTUNE_LOG``.

.. autoattribute:: Settings.autotune_compile_timeout
Expand Down
30 changes: 23 additions & 7 deletions helion/autotuner/base_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,12 +253,10 @@ def _prepare(self) -> None:
if budget is not None:
self.log(f"Autotune budget: {budget}s")
kernel_obj = getattr(self.kernel, "kernel", None)
kernel_id = ""
kernel_source = ""
if kernel_obj is not None:
try:
kernel_source = kernel_obj.kernel_source()
kernel_id = kernel_obj.kernel_id()
except OSError:
self.log.debug("Failed to read Helion kernel source", exc_info=True)
kernel_name = getattr(kernel_obj, "name", "")
Expand All @@ -267,23 +265,27 @@ def _prepare(self) -> None:
dtypes = str([str(t.dtype) for t in tensors])
hardware = get_device_name(extract_device(self.args)) or ""
self._autotune_metrics: AutotuneMetrics = AutotuneMetrics(
kernel_id=kernel_id,
kernel_name=kernel_name,
kernel_source=kernel_source,
input_shapes=input_shapes,
hardware=hardware,
random_seed=self.settings.autotune_random_seed,
search_algorithm=type(self).__name__,
)
# Written once per run to the <autotune_log>.meta.json sidecar so the
# per-config CSV rows can be grouped by kernel across runs.
# Appended once per run to the <autotune_log>.meta.jsonl sidecar so the
# per-config CSV rows can be joined back to it on run_id. run_id is
# derived inside KernelMetadata directly from the kernel identity; the
# full settings carry the codegen/reproduction context. config_defaults
# carries the run's default config so a consumer can rebuild each row's
# minimized config via {**config_defaults, **row_config}.
self._kernel_metadata: KernelMetadata = KernelMetadata(
kernel_id=kernel_id,
kernel_name=kernel_name,
kernel_source=kernel_source,
input_shapes=input_shapes,
dtypes=dtypes,
hardware=hardware,
settings=self.settings.to_dict(),
config_defaults=self.config_spec.default_config().config,
)
self.benchmark_provider = self._benchmark_provider_cls(
kernel=self.kernel,
Expand All @@ -295,6 +297,20 @@ def _prepare(self) -> None:
)
self.benchmark_provider.set_budget_exceeded_fn(self._autotune_budget_exceeded)

def _is_restricted_search(self) -> bool:
"""Whether the search space is the user's pinned configs.

A kernel decorated with ``configs=[...]`` (and not ``force_autotune``)
tunes only between those user-chosen configs, so its telemetry is a
biased, non-representative slice; such runs are excluded from data
collection. ``force_autotune`` searches the full space and is collected.
Best-effort: a missing kernel object / ``configs`` attribute reads as
"not restricted" so a genuine search is never dropped by accident.
"""
kernel_obj = getattr(self.kernel, "kernel", None)
configs = getattr(kernel_obj, "configs", None)
return bool(configs) and not self.settings.force_autotune

def _autotune_budget_exceeded(self) -> bool:
budget = self.settings.autotune_budget_seconds
if budget is None or self._autotune_budget_start is None:
Expand Down Expand Up @@ -474,7 +490,7 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
start = time.perf_counter()
exit_stack = contextlib.ExitStack()
with exit_stack:
if self.settings.autotune_log:
if self.settings.autotune_log and not self._is_restricted_search():
exit_stack.enter_context(
self.log.autotune_logging(metadata=self._kernel_metadata)
)
Expand Down
59 changes: 26 additions & 33 deletions helion/autotuner/benchmark_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import abc
import datetime
import functools
import hashlib
from itertools import count
from itertools import starmap
import math
Expand Down Expand Up @@ -333,19 +332,6 @@ def __init__(
)
self._jobs = self._decide_num_jobs()

def _sample_id(self, config: Config) -> str:
"""Return a stable per-(kernel, config) id for telemetry rows.

Computed as ``sha256(kernel_source + decorator(config))`` so the same
kernel benchmarked with the same config produces the same id across
runs, enabling label aggregation/dedup for the cost-model dataset.
"""
payload = (
self._autotune_metrics.kernel_source
+ self.kernel.format_kernel_decorator(config, self.settings)
)
return hashlib.sha256(payload.encode("utf-8")).hexdigest()

def _compute_baseline(
self,
) -> tuple[object, Sequence[int], Sequence[object] | None]:
Expand Down Expand Up @@ -817,29 +803,36 @@ def benchmark(
process_group_name=self.kernel.env.process_group_name,
)
):
sample_id = self._sample_id(config)
self.log.record_autotune_entry(
AutotuneLogEntry(
generation=self._autotune_metrics.num_generations,
status="started",
perf_ms=None,
compile_time=compile_time,
config=config,
sample_id=sample_id,
)
# Only minimize/record when telemetry is active (an active log
# sink). Record the minimized config (defaults dropped) so the
# dataset stays lean; benchmark the original config so behaviour
# is unchanged. ``minimized is not None`` <=> recording, and also
# narrows the type for AutotuneLogEntry(config=...).
minimized = (
config.minimize(self.config_spec) if self.log.recording else None

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should just remove this minimized path; we don't have to log it to the AutotuneLogEntry, since we're already including it in the metadata csv.

)
if minimized is not None:
self.log.record_autotune_entry(
AutotuneLogEntry(
generation=self._autotune_metrics.num_generations,
status="started",
perf_ms=None,
compile_time=compile_time,
config=minimized,
)
)
perf = self._benchmark_function(config, fn)
status = "ok" if math.isfinite(perf) else "error"
self.log.record_autotune_entry(
AutotuneLogEntry(
generation=self._autotune_metrics.num_generations,
status=status,
perf_ms=perf if math.isfinite(perf) else None,
compile_time=compile_time,
config=config,
sample_id=sample_id,
if minimized is not None:
self.log.record_autotune_entry(
AutotuneLogEntry(
generation=self._autotune_metrics.num_generations,
status=status,
perf_ms=perf if math.isfinite(perf) else None,
compile_time=compile_time,
config=minimized,
)
)
)
results[valid_indices[index]] = BenchmarkResult(
config=config,
fn=fn,
Expand Down
81 changes: 50 additions & 31 deletions helion/autotuner/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,9 +120,10 @@ def autotune_logging(
) -> Iterator[AutotuneLogSink | None]:
"""Attach an :class:`AutotuneLogSink` for the duration of a tuning run.

When ``metadata`` is provided, the kernel identity (source, id, shapes)
is written once to the ``<base>.meta.json`` sidecar so the per-config
CSV rows can be grouped by kernel across runs.
When ``metadata`` is provided, the kernel identity (source, shapes,
dtypes, hardware, settings) is appended as one record to the
``<base>.meta.jsonl`` sidecar so the per-config CSV rows can be joined
back to it via ``run_id``.
"""

path = base_path or self._settings.autotune_log
Expand All @@ -138,6 +139,16 @@ def autotune_logging(
sink.end_run()
self._detach_sink()

@property
def recording(self) -> bool:
    """Whether an autotune log sink is currently attached.

    Callers can check this before doing work that only matters for
    telemetry (for example minimizing a config to build a log entry).
    It is purely an optimization hint: recording an entry is documented
    to take effect only when a sink is active.

    Returns:
        ``True`` if ``self._log_sink`` is set, ``False`` when it is ``None``.
    """
    attached_sink = self._log_sink
    return attached_sink is not None

def record_autotune_entry(self, entry: AutotuneLogEntry) -> None:
"""Write a structured autotune log entry when a sink is active."""

Expand Down Expand Up @@ -269,8 +280,6 @@ class AutotuneLogEntry(NamedTuple):
perf_ms: float | None
compile_time: float | None
config: Config
# Stable per-(kernel, config) id: sha256(kernel_source + decorator(config)).
sample_id: str = ""


class AutotuneLogSink:
Expand All @@ -282,7 +291,7 @@ def __init__(self, base_path: str, metadata: KernelMetadata | None = None) -> No
self._base_path = Path(base_path)
self.csv_path = self._base_path.with_suffix(".csv")
self.log_path = self._base_path.with_suffix(".log")
self.meta_path = self._base_path.with_suffix(".meta.json")
self.meta_path = self._base_path.with_suffix(".meta.jsonl")
self._metadata = metadata
self._csv_file: io.TextIOWrapper | None = None
self._csv_writer: CsvWriter | None = None
Expand All @@ -308,26 +317,35 @@ def open(self) -> None:
self.csv_path.parent.mkdir(parents=True, exist_ok=True)
self.log_path.parent.mkdir(parents=True, exist_ok=True)
if self._metadata is not None:
self.meta_path.write_text(
json.dumps(self._metadata.to_dict(), indent=2), encoding="utf-8"
)
self._csv_file = self.csv_path.open("w", encoding="utf-8", newline="")
# Append one identity record (JSON Lines) per run so a single log
# path can accumulate every (kernel, input shape) autotuned in the
# process without clobbering earlier runs. CSV rows join back to
# these records via run_id. default=str keeps the dump JSON-safe when
# settings carry non-serializable values (torch.dtype, enums, callables).
with self.meta_path.open("a", encoding="utf-8") as meta_file:
meta_file.write(
json.dumps(self._metadata.to_dict(), default=str) + "\n"
)
# Append rather than truncate so multiple autotune runs sharing one base
# path accumulate; write the header only for a new or empty file.
write_header = not self.csv_path.exists() or self.csv_path.stat().st_size == 0
self._csv_file = self.csv_path.open("a", encoding="utf-8", newline="")
self._csv_writer = csv.writer(self._csv_file)
self._csv_writer.writerow(
[
"kernel_id",
"sample_id",
"timestamp_s",
"config_index",
"generation",
"status",
"perf_ms",
"compile_time_s",
"config",
]
)
self._csv_file.flush()
handler = logging.FileHandler(self.log_path, mode="w", encoding="utf-8")
if write_header:
self._csv_writer.writerow(
[
"run_id",
"timestamp_s",
"config_index",
"generation",
"status",
"perf_ms",
"compile_time_s",
"config",
]
)
self._csv_file.flush()
handler = logging.FileHandler(self.log_path, mode="a", encoding="utf-8")
handler.setLevel(logging.DEBUG)
self._log_handler = handler

Expand Down Expand Up @@ -366,20 +384,21 @@ def record(self, entry: AutotuneLogEntry) -> None:
compile_field = ""
if entry.compile_time is not None:
compile_field = f"{entry.compile_time:.2f}"
# kernel_id is the foreign key joining each row back to the kernel
# identity stored once in the .meta.json sidecar.
kernel_id = self._metadata.kernel_id if self._metadata is not None else ""
# run_id joins each row to exactly one .meta.jsonl record (kernel +
# input shape/dtype/hardware).
run_id = self._metadata.run_id if self._metadata is not None else ""
self._csv_writer.writerow(
[
kernel_id,
entry.sample_id,
run_id,
timestamp_field,
self._config_counter,
entry.generation,
entry.status,
perf_field,
compile_field,
str(entry.config),
# Compact JSON (not Config.to_json()'s indent=2) keeps each CSV
# cell on one line; round-trips via Config.from_json.
json.dumps(entry.config.config, separators=(",", ":")),
]
)
if self._csv_file is not None:
Expand Down
Loading
Loading