pytorch · IshanAryendu · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026 · Jun 10, 2026
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -198,7 +198,14 @@ jobs:
 
             # Single pass: autotune search + codegen + compilation happen inline,
             # so --measure-compile-time reports end-to-end user-visible compile time.
-            ${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 python benchmarks/run.py \
+            # HELION_AUTOTUNE_LOG is per-kernel so each kernel in the loop gets its
+            # own telemetry files (.csv / .log / .meta.jsonl / .ir.jsonl); the sink
+            # appends, so every input shape for this kernel accumulates in them.
+            # --autotune-metrics-json captures the per-run kernel-level summary
+            # (kernel_id, source, shapes, hardware) and is overwrite-safe on its own.
+            ${{ inputs.env-vars }} HELION_PRINT_OUTPUT_CODE=1 \
+            HELION_AUTOTUNE_LOG="$TEST_REPORTS_DIR/autotune-$kernel" \
+            python benchmarks/run.py \
                 --op $kernel \
                 --helion-backend "${{ inputs.backend }}" \
                 --metrics speedup,accuracy,latency \
@@ -213,6 +220,7 @@ jobs:
                 --input-sample-mode equally-spaced-k \
                 --output "$TEST_REPORTS_DIR/helionbench.json" \
                 --append-to-output \
+                --autotune-metrics-json "$TEST_REPORTS_DIR/autotune-metrics-$kernel.json" \
                 --keep-going \
                 ${{ inputs.custom-args }}
 

diff --git a/docs/api/settings.md b/docs/api/settings.md
@@ -143,8 +143,9 @@ def my_kernel(x: torch.Tensor) -> torch.Tensor:
 
 .. autoattribute:: Settings.autotune_log
 
-   When set, Helion writes per-config autotuning telemetry (kernel id, sample id, config index, generation, status, perf, compile time, timestamp, config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
-   The kernel identity (id, name, source, input shapes, dtypes, hardware) is written once per run to ``<value>.meta.json``. ``kernel_id`` is a stable content hash (of the kernel source and code-generation settings) that appears on every CSV row, acting as the foreign key to join rows back to the sidecar and group them by kernel across runs; ``sample_id`` additionally identifies each ``(kernel, config)`` pair so repeated benchmarks of the same config can be deduplicated.
+   When set, Helion writes per-config autotuning telemetry (run id, kernel id, sample id, config index, generation, status, perf, compile time, timestamp, decorator, config JSON) to ``<value>.csv`` and mirrors the autotune log output to ``<value>.log`` for population-based autotuners (currently ``PatternSearch`` and ``DifferentialEvolution``).
+   The kernel identity (run id, kernel id, name, source, input shapes, dtypes, hardware) is appended, one JSON record per run, to the ``<value>.meta.jsonl`` sidecar, and a lossless node-link dump of the kernel's device IR is appended, one record per run, to the ``<value>.ir.jsonl`` sidecar (loadable with ``networkx.node_link_graph(record, edges="links")`` and joined to the other files on ``run_id``). All of these files are opened in append mode, so multiple autotune runs that share one base path (e.g. many kernels and input shapes benchmarked in a single process) accumulate instead of overwriting each other.
+   Three content-derived ids tie the data together. ``kernel_id`` (a hash of the kernel source and code-generation settings) is shape/dtype independent, grouping every row for one kernel across shapes and runs. ``run_id`` (a hash of ``kernel_id`` plus input shapes, dtypes, and hardware) identifies a single autotune invocation, so each CSV row joins to exactly one ``.meta.jsonl`` record and a config's measured perf can be attributed to the specific shape/dtype/hardware it was measured on. ``sample_id`` (a hash of the kernel source and the config) identifies each ``(kernel, config)`` pair. Together ``(run_id, sample_id)`` is the natural primary key for the per-config rows: ``run_id`` pins the kernel+shape+dtype+hardware instance and ``sample_id`` pins the config.
    Controlled by ``HELION_AUTOTUNE_LOG``.
 
 .. autoattribute:: Settings.autotune_compile_timeout

diff --git a/helion/autotuner/base_search.py b/helion/autotuner/base_search.py
@@ -36,6 +36,8 @@
 from .benchmark_provider import _unset_fn
 from .benchmarking import clear_jit_fast_path_caches
 from .benchmarking import interleaved_bench
+from .ir_features import IrGraphRecord
+from .ir_features import extract_ir_graph
 from .logger import AutotuningLogger
 from .metrics import AutotuneMetrics
 from .metrics import KernelMetadata
@@ -275,8 +277,9 @@ def _prepare(self) -> None:
             random_seed=self.settings.autotune_random_seed,
             search_algorithm=type(self).__name__,
         )
-        # Written once per run to the <autotune_log>.meta.json sidecar so the
-        # per-config CSV rows can be grouped by kernel across runs.
+        # Appended once per run to the <autotune_log>.meta.jsonl sidecar so the
+        # per-config CSV rows can be joined back to it (run_id) and grouped by
+        # kernel (kernel_id) across runs. run_id is derived inside KernelMetadata.
         self._kernel_metadata: KernelMetadata = KernelMetadata(
             kernel_id=kernel_id,
             kernel_name=kernel_name,
@@ -294,6 +297,34 @@ def _prepare(self) -> None:
             autotune_metrics=self._autotune_metrics,
         )
         self.benchmark_provider.set_budget_exceeded_fn(self._autotune_budget_exceeded)
+        # Device IR is config-independent, so extract it once per run (its record
+        # joins to the per-config CSV rows on run_id). Only when telemetry is on.
+        self._ir_graph: IrGraphRecord | None = self._extract_ir_graph()
+
+    def _extract_ir_graph(self) -> IrGraphRecord | None:
+        """Best-effort device-IR node-link dump for the autotune-log sidecar.
+
+        Returns ``None`` (no IR artifact) when telemetry is off, the device IR is
+        unavailable (e.g. a backend without a standard device IR), or extraction
+        raises (logged at debug level); extraction never breaks autotuning.
+        """
+        if not self.settings.autotune_log:
+            return None
+        host_function = getattr(self.kernel, "host_function", None)
+        device_ir = getattr(host_function, "device_ir", None)
+        if device_ir is None:
+            return None
+        try:
+            return extract_ir_graph(
+                device_ir,
+                run_id=self._kernel_metadata.run_id,
+                kernel_id=self._kernel_metadata.kernel_id,
+                kernel_name=self._kernel_metadata.kernel_name,
+                input_shapes=self._kernel_metadata.input_shapes,
+            )
+        except Exception:
+            self.log.debug("Failed to extract device IR features", exc_info=True)
+            return None
 
     def _autotune_budget_exceeded(self) -> bool:
         budget = self.settings.autotune_budget_seconds
@@ -476,7 +507,9 @@ def autotune(self, *, skip_cache: bool = False) -> Config:
         with exit_stack:
             if self.settings.autotune_log:
                 exit_stack.enter_context(
-                    self.log.autotune_logging(metadata=self._kernel_metadata)
+                    self.log.autotune_logging(
+                        metadata=self._kernel_metadata, ir_graph=self._ir_graph
+                    )
                 )
             self.log.reset()
             # Autotuner triggers bugs in remote triton compile service.

diff --git a/helion/autotuner/benchmark_provider.py b/helion/autotuner/benchmark_provider.py
@@ -333,18 +333,21 @@ def __init__(
         )
         self._jobs = self._decide_num_jobs()
 
-    def _sample_id(self, config: Config) -> str:
-        """Return a stable per-(kernel, config) id for telemetry rows.
-
-        Computed as ``sha256(kernel_source + decorator(config))`` so the same
-        kernel benchmarked with the same config produces the same id across
-        runs, enabling label aggregation/dedup for the cost-model dataset.
+    def _sample_identity(self, config: Config) -> tuple[str, str]:
+        """Return ``(sample_id, decorator)`` for a config's telemetry rows.
+
+        ``decorator`` is ``format_kernel_decorator(config, settings)`` -- the canonical
+        ``@helion.kernel(...)`` string that reproduces this config -- and is
+        collected as a structured artifact. ``sample_id`` is
+        ``sha256(kernel_source + decorator)``, a stable per-(kernel, config) id
+        so the same kernel benchmarked with the same config produces the same id
+        across runs, enabling label aggregation/dedup for the cost-model dataset.
+        The decorator is computed once here and reused for both the id and the row.
         """
-        payload = (
-            self._autotune_metrics.kernel_source
-            + self.kernel.format_kernel_decorator(config, self.settings)
-        )
-        return hashlib.sha256(payload.encode("utf-8")).hexdigest()
+        decorator = self.kernel.format_kernel_decorator(config, self.settings)
+        payload = self._autotune_metrics.kernel_source + decorator
+        sample_id = hashlib.sha256(payload.encode("utf-8")).hexdigest()
+        return sample_id, decorator
 
     def _compute_baseline(
         self,
@@ -817,7 +820,7 @@ def benchmark(
                     process_group_name=self.kernel.env.process_group_name,
                 )
             ):
-                sample_id = self._sample_id(config)
+                sample_id, decorator = self._sample_identity(config)
                 self.log.record_autotune_entry(
                     AutotuneLogEntry(
                         generation=self._autotune_metrics.num_generations,
@@ -826,6 +829,7 @@ def benchmark(
                         compile_time=compile_time,
                         config=config,
                         sample_id=sample_id,
+                        decorator=decorator,
                     )
                 )
                 perf = self._benchmark_function(config, fn)
@@ -838,6 +842,7 @@ def benchmark(
                         compile_time=compile_time,
                         config=config,
                         sample_id=sample_id,
+                        decorator=decorator,
                     )
                 )
                 results[valid_indices[index]] = BenchmarkResult(