pytorch · IshanAryendu · Jun 10, 2026 · Jun 10, 2026 · Jun 11, 2026 · Jun 11, 2026
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -198,7 +198,14 @@ jobs:
 
             # Single pass: autotune search + codegen + compilation happen inline,
             # so --measure-compile-time reports end-to-end user-visible compile time.
-            ${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 python benchmarks/run.py \
+            # HELION_AUTOTUNE_LOG is per-kernel so each kernel's CSV / .log / .meta.jsonl
+            # files stay separate across the kernel loop (the sink appends, so every
+            # input shape for this kernel accumulates into that kernel's files).
+            # --autotune-metrics-json captures the per-run kernel-level summary
+            # (kernel name, source, shapes, hardware) and is overwrite-safe on its own.
+            ${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 \
+            HELION_AUTOTUNE_LOG="$TEST_REPORTS_DIR/autotune-$kernel" \
+            python benchmarks/run.py \
                 --op $kernel \
                 --helion-backend "${{ inputs.backend }}" \
                 --metrics speedup,accuracy,latency \
@@ -213,6 +220,7 @@ jobs:
                 --input-sample-mode equally-spaced-k \
                 --output "$TEST_REPORTS_DIR/helionbench.json" \
                 --append-to-output \
+                --autotune-metrics-json "$TEST_REPORTS_DIR/autotune-metrics-$kernel.json" \
                 --keep-going \
                 ${{ inputs.custom-args }}
 

diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -143,8 +143,10 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
 
 .. autoattribute:: Settings.autotune_log
 
-   When set, Helion writes per-config autotuning telemetry (kernel id, sample id, config index, generation, status, perf, compile time, timestamp, config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
-   The kernel identity (id, name, source, input shapes, dtypes, hardware) is written once per run to ``<value>.meta.json``. ``kernel_id`` is a stable content hash (of the kernel source and code-generation settings) that appears on every CSV row, acting as the foreign key to join rows back to the sidecar and group them by kernel across runs; ``sample_id`` additionally identifies each ``(kernel, config)`` pair so repeated benchmarks of the same config can be deduplicated.
+   When set, Helion writes per-config autotuning telemetry (run id, timestamp, config index, generation, status, perf, compile time, minimized config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
+   The kernel identity (run id, name, source, input shapes, dtypes, hardware, full ``helion.settings``, and ``config_defaults``) is appended, one JSON record per run, to the ``<value>.meta.jsonl`` sidecar. All three files are opened in append mode, so multiple autotune runs that share one base path (e.g. many kernels and input shapes benchmarked in a single process) accumulate instead of overwriting each other. Because ``run_id`` is content-stable, reusing the same base path across re-runs appends another ``.meta.jsonl`` record with an identical ``run_id``, so a consumer should de-duplicate the ``.meta.jsonl`` records on ``run_id`` before joining (the ``.log`` file just interleaves and carries no join key).
+   ``run_id`` is the single join key. It is a content hash derived directly from the kernel source, the code-generation-affecting settings, and the input shapes, dtypes, and hardware, so the same invocation yields the same ``run_id`` across processes and runs and each CSV row joins to exactly one ``.meta.jsonl`` record. Runs whose search space is restricted to user-pinned ``configs`` (without ``force_autotune``) are excluded from collection.
+   The per-config ``config`` column is *minimized* (values equal to the kernel's defaults are dropped). To reconstruct the config as benchmarked, merge it over the run's ``config_defaults``: ``{**config_defaults, **json.loads(row_config)}``. The ``settings`` mapping is written with ``json.dumps(..., default=str)``, so values that are not JSON-serializable, such as ``index_dtype`` (a ``torch.dtype``) and callables, are stored as their string form: useful for analysis, but not round-trippable back into the original objects.
    Controlled by ``HELION_AUTOTUNE_LOG``.
 
 .. autoattribute:: Settings.autotune_compile_timeout

diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -253,12 +253,10 @@ def _prepare(self) -> None:
         if budget is not None:
             self.log(f"Autotune budget: {budget}s")
         kernel_obj = getattr(self.kernel, "kernel", None)
-        kernel_id = ""
         kernel_source = ""
         if kernel_obj is not None:
             try:
                 kernel_source = kernel_obj.kernel_source()
-                kernel_id = kernel_obj.kernel_id()
             except OSError:
                 self.log.debug("Failed to read Helion kernel source", exc_info=True)
         kernel_name = getattr(kernel_obj, "name", "")
@@ -267,23 +265,27 @@ def _prepare(self) -> None:
         dtypes = str([str(t.dtype) for t in tensors])
         hardware = get_device_name(extract_device(self.args)) or ""
         self._autotune_metrics: AutotuneMetrics = AutotuneMetrics(
-            kernel_id=kernel_id,
             kernel_name=kernel_name,
             kernel_source=kernel_source,
             input_shapes=input_shapes,
             hardware=hardware,
             random_seed=self.settings.autotune_random_seed,
             search_algorithm=type(self).__name__,
         )
-        # Written once per run to the <autotune_log>.meta.json sidecar so the
-        # per-config CSV rows can be grouped by kernel across runs.
+        # Appended once per run to the <autotune_log>.meta.jsonl sidecar so the
+        # per-config CSV rows can be joined back to it on run_id. run_id is
+        # derived inside KernelMetadata directly from the kernel identity; the
+        # full settings carry the codegen/reproduction context. config_defaults
+        # carries the run's default config so a consumer can rebuild each row's
+        # minimized config via {**config_defaults, **row_config}.
         self._kernel_metadata: KernelMetadata = KernelMetadata(
-            kernel_id=kernel_id,
             kernel_name=kernel_name,
             kernel_source=kernel_source,
             input_shapes=input_shapes,
             dtypes=dtypes,
             hardware=hardware,
+            settings=self.settings.to_dict(),
+            config_defaults=self.config_spec.default_config().config,
         )
         self.benchmark_provider = self._benchmark_provider_cls(
             kernel=self.kernel,
@@ -295,6 +297,20 @@ def _prepare(self) -> None:
         )
         self.benchmark_provider.set_budget_exceeded_fn(self._autotune_budget_exceeded)
 
+    def _is_restricted_search(self) -> bool:
+        """Return True when the search is limited to user-pinned configs.
+
+        A kernel decorated with ``configs=[...]`` (and not ``force_autotune``)
+        tunes only between those user-chosen configs, so its telemetry is a
+        biased, non-representative slice; ``autotune()`` skips attaching the
+        log sink for such runs. With ``force_autotune`` set this returns False.
+        Best-effort: a missing kernel object, a missing ``configs`` attribute,
+        or an empty ``configs`` all read as "not restricted".
+        """
+        kernel_obj = getattr(self.kernel, "kernel", None)
+        configs = getattr(kernel_obj, "configs", None)
+        return bool(configs) and not self.settings.force_autotune
+
     def _autotune_budget_exceeded(self) -> bool:
         budget = self.settings.autotune_budget_seconds
         if budget is None or self._autotune_budget_start is None:
@@ -474,7 +490,7 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
         start = time.perf_counter()
         exit_stack = contextlib.ExitStack()
         with exit_stack:
-            if self.settings.autotune_log:
+            if self.settings.autotune_log and not self._is_restricted_search():
                 exit_stack.enter_context(
                     self.log.autotune_logging(metadata=self._kernel_metadata)
                 )

diff --git a/helion/autotuner/benchmark_provider.py b/helion/autotuner/benchmark_provider.py
@@ -3,7 +3,6 @@
 import abc
 import datetime
 import functools
-import hashlib
 from itertools import count
 from itertools import starmap
 import math
@@ -333,19 +332,6 @@ def __init__(
         )
         self._jobs = self._decide_num_jobs()
 
-    def _sample_id(self, config: Config) -> str:
-        """Return a stable per-(kernel, config) id for telemetry rows.
-
-        Computed as ``sha256(kernel_source + decorator(config))`` so the same
-        kernel benchmarked with the same config produces the same id across
-        runs, enabling label aggregation/dedup for the cost-model dataset.
-        """
-        payload = (
-            self._autotune_metrics.kernel_source
-            + self.kernel.format_kernel_decorator(config, self.settings)
-        )
-        return hashlib.sha256(payload.encode("utf-8")).hexdigest()
-
     def _compute_baseline(
         self,
     ) -> tuple[object, Sequence[int], Sequence[object] | None]:
@@ -817,29 +803,36 @@ def benchmark(
                     process_group_name=self.kernel.env.process_group_name,
                 )
             ):
-                sample_id = self._sample_id(config)
-                self.log.record_autotune_entry(
-                    AutotuneLogEntry(
-                        generation=self._autotune_metrics.num_generations,
-                        status="started",
-                        perf_ms=None,
-                        compile_time=compile_time,
-                        config=config,
-                        sample_id=sample_id,
-                    )
+                # Only minimize/record when telemetry is active (an active log
+                # sink). Record the minimized config (defaults dropped) so the
+                # dataset stays lean; benchmark the original config so behaviour
+                # is unchanged. ``minimized is not None`` <=> recording, and also
+                # narrows the type for AutotuneLogEntry(config=...).
+                minimized = (
+                    config.minimize(self.config_spec) if self.log.recording else None
                 )
+                if minimized is not None:
+                    self.log.record_autotune_entry(
+                        AutotuneLogEntry(
+                            generation=self._autotune_metrics.num_generations,
+                            status="started",
+                            perf_ms=None,
+                            compile_time=compile_time,
+                            config=minimized,
+                        )
+                    )
                 perf = self._benchmark_function(config, fn)
                 status = "ok" if math.isfinite(perf) else "error"
-                self.log.record_autotune_entry(
-                    AutotuneLogEntry(
-                        generation=self._autotune_metrics.num_generations,
-                        status=status,
-                        perf_ms=perf if math.isfinite(perf) else None,
-                        compile_time=compile_time,
-                        config=config,
-                        sample_id=sample_id,
+                if minimized is not None:
+                    self.log.record_autotune_entry(
+                        AutotuneLogEntry(
+                            generation=self._autotune_metrics.num_generations,
+                            status=status,
+                            perf_ms=perf if math.isfinite(perf) else None,
+                            compile_time=compile_time,
+                            config=minimized,
+                        )
                     )
-                )
                 results[valid_indices[index]] = BenchmarkResult(
                     config=config,
                     fn=fn,

diff --git a/helion/autotuner/logger.py b/helion/autotuner/logger.py
@@ -120,9 +120,10 @@ def autotune_logging(
     ) -> Iterator[AutotuneLogSink | None]:
         """Attach an :class:`AutotuneLogSink` for the duration of a tuning run.
 
-        When ``metadata`` is provided, the kernel identity (source, id, shapes)
-        is written once to the ``<base>.meta.json`` sidecar so the per-config
-        CSV rows can be grouped by kernel across runs.
+        When ``metadata`` is provided, the kernel identity (source, shapes,
+        dtypes, hardware, settings) is appended as one record to the
+        ``<base>.meta.jsonl`` sidecar so the per-config CSV rows can be joined
+        back to it via ``run_id``.
         """
 
         path = base_path or self._settings.autotune_log
@@ -138,6 +139,16 @@ def autotune_logging(
                 sink.end_run()
                 self._detach_sink()
 
+    @property
+    def recording(self) -> bool:
+        """Whether an autotune log sink is currently attached.
+
+        Callers can check this to skip building log entries (e.g. minimizing
+        the config) when no sink is attached. Purely an optimization:
+        ``record_autotune_entry`` only writes when a sink is active.
+        """
+        return self._log_sink is not None
+
     def record_autotune_entry(self, entry: AutotuneLogEntry) -> None:
         """Write a structured autotune log entry when a sink is active."""
 
@@ -269,8 +280,6 @@ class AutotuneLogEntry(NamedTuple):
     perf_ms: float | None
     compile_time: float | None
     config: Config
-    # Stable per-(kernel, config) id: sha256(kernel_source + decorator(config)).
-    sample_id: str = ""
 
 
 class AutotuneLogSink:
@@ -282,7 +291,7 @@ def __init__(self, base_path: str, metadata: KernelMetadata | None = None) -> No
         self._base_path = Path(base_path)
         self.csv_path = self._base_path.with_suffix(".csv")
         self.log_path = self._base_path.with_suffix(".log")
-        self.meta_path = self._base_path.with_suffix(".meta.json")
+        self.meta_path = self._base_path.with_suffix(".meta.jsonl")
         self._metadata = metadata
         self._csv_file: io.TextIOWrapper | None = None
         self._csv_writer: CsvWriter | None = None
@@ -308,26 +317,35 @@ def open(self) -> None:
         self.csv_path.parent.mkdir(parents=True, exist_ok=True)
         self.log_path.parent.mkdir(parents=True, exist_ok=True)
         if self._metadata is not None:
-            self.meta_path.write_text(
-                json.dumps(self._metadata.to_dict(), indent=2), encoding="utf-8"
-            )
-        self._csv_file = self.csv_path.open("w", encoding="utf-8", newline="")
+            # Append one identity record (JSON Lines) per run so a single log
+            # path can accumulate every (kernel, input shape) autotuned in the
+            # process without clobbering earlier runs. CSV rows join back to
+            # these records via run_id. default=str keeps the dump JSON-safe when
+            # settings carry non-serializable values (torch.dtype, enums, callables).
+            with self.meta_path.open("a", encoding="utf-8") as meta_file:
+                meta_file.write(
+                    json.dumps(self._metadata.to_dict(), default=str) + "\n"
+                )
+        # Append rather than truncate so multiple autotune runs sharing one base
+        # path accumulate; write the header only for a new or empty file.
+        write_header = not self.csv_path.exists() or self.csv_path.stat().st_size == 0
+        self._csv_file = self.csv_path.open("a", encoding="utf-8", newline="")
         self._csv_writer = csv.writer(self._csv_file)
-        self._csv_writer.writerow(
-            [
-                "kernel_id",
-                "sample_id",
-                "timestamp_s",
-                "config_index",
-                "generation",
-                "status",
-                "perf_ms",
-                "compile_time_s",
-                "config",
-            ]
-        )
-        self._csv_file.flush()
-        handler = logging.FileHandler(self.log_path, mode="w", encoding="utf-8")
+        if write_header:
+            self._csv_writer.writerow(
+                [
+                    "run_id",
+                    "timestamp_s",
+                    "config_index",
+                    "generation",
+                    "status",
+                    "perf_ms",
+                    "compile_time_s",
+                    "config",
+                ]
+            )
+            self._csv_file.flush()
+        handler = logging.FileHandler(self.log_path, mode="a", encoding="utf-8")
         handler.setLevel(logging.DEBUG)
         self._log_handler = handler
 
@@ -366,20 +384,21 @@ def record(self, entry: AutotuneLogEntry) -> None:
         compile_field = ""
         if entry.compile_time is not None:
             compile_field = f"{entry.compile_time:.2f}"
-        # kernel_id is the foreign key joining each row back to the kernel
-        # identity stored once in the .meta.json sidecar.
-        kernel_id = self._metadata.kernel_id if self._metadata is not None else ""
+        # run_id joins each row to exactly one .meta.jsonl record (kernel +
+        # input shape/dtype/hardware).
+        run_id = self._metadata.run_id if self._metadata is not None else ""
         self._csv_writer.writerow(
             [
-                kernel_id,
-                entry.sample_id,
+                run_id,
                 timestamp_field,
                 self._config_counter,
                 entry.generation,
                 entry.status,
                 perf_field,
                 compile_field,
-                str(entry.config),
+                # Compact JSON (not Config.to_json()'s indent=2) keeps each CSV
+                # cell on one line; round-trips via Config.from_json.
+                json.dumps(entry.config.config, separators=(",", ":")),
             ]
         )
         if self._csv_file is not None: