36 changes: 36 additions & 0 deletions quadrants/codegen/llvm/codegen_llvm.cpp
@@ -1727,6 +1727,10 @@ std::string TaskCodeGenLLVM::init_offloaded_task_function(OffloadedStmt *stmt, s
current_loop_reentry = nullptr;
current_while_after_loop = nullptr;

// Reset the adstack function-scope accumulator for this task. The budget is per-task (per LLVM
// function), so the count must not carry over from the previous offloaded stmt.
ad_stack_fn_scope_bytes_ = 0;

task_function_type =
llvm::FunctionType::get(llvm::Type::getVoidTy(*llvm_context), {llvm::PointerType::get(context_ty, 0)}, false);

@@ -2106,6 +2110,38 @@ void TaskCodeGenLLVM::visit(InternalFuncStmt *stmt) {
void TaskCodeGenLLVM::visit(AdStackAllocaStmt *stmt) {
QD_ASSERT_INFO(stmt->max_size > 0, "Adaptive autodiff stack's size should have been determined.");
auto type = llvm::ArrayType::get(llvm::Type::getInt8Ty(*llvm_context), stmt->size_in_bytes());

// Guard against LLVM worker-thread stack overflow before silent memory corruption ensues.
// Gated on CPU arches because only there do LLVM allocas become worker-thread stack frame slots bounded by
// the OS thread-stack limit. On CUDA / AMDGPU the same LLVM allocas are lowered to per-thread GPU local
// memory (a separate address space sized by the driver, not shared with the CPU call stack), so the 256 KB
// CPU-stack budget is not meaningful there and the check would falsely reject valid GPU kernels with
// f64 loop-carried variables (4 adstacks at `ad_stack_size=4096` already cross 256 KB).
//
// Adstacks are allocated at function entry (`create_entry_block_alloca`) so they are live for the entire
// task invocation and their sizes sum directly into the LLVM stack frame. A kernel that exceeds the thread
// stack does not fault at the push - it simply trashes adjacent stack memory, and downstream reverse-mode
// accumulators read zero, producing silently-wrong gradients that look indistinguishable from a broken
// backward chain. Fail loudly with a message that tells the user how to unblock: either lower
// `ad_stack_size`, shrink the per-kernel adstack count by shifting some dynamic loops back to
// `qd.static(range(...))` unrolls, or use a backend that heap-backs adstacks.
//
// Budget: 256 KB leaves headroom inside the ~512 KB default macOS secondary-thread stack for other locals
// and nested call frames. Linux defaults are larger (~8 MB), so the same limit is strictly conservative
// there.
if (arch_is_cpu(current_arch())) {
constexpr std::size_t kFnScopeAdStackBudgetBytes = 256 * 1024;
ad_stack_fn_scope_bytes_ += stmt->size_in_bytes();
QD_ERROR_IF(ad_stack_fn_scope_bytes_ > kFnScopeAdStackBudgetBytes,
"LLVM autodiff-stack budget exceeded: cumulative `AdStackAllocaStmt` size {} bytes in task "
"'{}' crosses the {} byte function-scope budget. Every adstack is allocated on the worker "
"thread stack, so scaling past this point silently corrupts the stack frame and zeros the "
"reverse-mode gradient without raising. Options: lower `ad_stack_size=N` in `qd.init()`, "
"reduce the number of loop-carried values in dynamic reverse-mode loops, or keep the "
"existing `qd.static(range(...))` unrolls on the reverse-mode path.",
ad_stack_fn_scope_bytes_, kernel_name, kFnScopeAdStackBudgetBytes);
Comment on lines +2135 to +2142
🟡 The QD_ERROR_IF in visit(AdStackAllocaStmt*) passes kernel_name (e.g., my_kernel) as the second format argument but the message says "in task '{}'", so for a kernel compiled with multiple offloaded backward tasks the diagnostic does not identify which specific task crossed the 256 KB budget. Replace kernel_name with current_task->name (e.g., my_kernel_0_range_for_body), which is guaranteed non-null at this call site and already contains the full task-function identifier.

Extended reasoning...

What the bug is and how it manifests

In codegen_llvm.cpp lines 2135-2142, the QD_ERROR_IF call has this format string:

"... cumulative AdStackAllocaStmt size {} bytes in task '{}' ..."

with positional arguments ad_stack_fn_scope_bytes_, kernel_name, kFnScopeAdStackBudgetBytes. The second positional argument binds to kernel_name, which is the top-level kernel identifier (e.g. my_kernel). The label says 'task', implying a task-scoped identifier, but the value supplied is kernel-scoped.

The specific code path that triggers it

init_offloaded_task_function (codegen_llvm.cpp:1727+) resets ad_stack_fn_scope_bytes_ to zero at the start of each offloaded task and initialises current_task with a fully-qualified name built from kernel name + codegen id + loop name + task type (e.g. my_kernel_0_range_for_body). visit(AdStackAllocaStmt*) is only reached inside that task body, so current_task is guaranteed non-null at this call site.

Why existing code does not prevent it

There is no validation that the format argument matches the 'task' label in the message. Both kernel_name and current_task->name compile cleanly; the compiler has no way to flag the semantic mismatch.

Impact

For a backward-pass kernel compiled with multiple offloaded tasks (e.g., struct-for body, init-args task, range-for backward body), every offloaded task shares the same kernel_name. If the budget guard fires, the error message names the kernel but not which of its several tasks exceeded the limit, forcing the user to guess rather than act on the message directly.

How to fix it

Replace kernel_name with current_task->name in the QD_ERROR_IF argument list. Optionally update the format label from 'task' to 'task function'. Because current_task->name already contains kernel_name as a prefix (e.g. my_kernel_0_range_for_body), the message remains informative at the kernel level too.
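The effect of the suggested argument swap can be illustrated with a standalone mock (`QD_ERROR_IF` and `TaskCodeGenLLVM` are not modeled here; the only point shown is which identifier the diagnostic carries, with the task name taken from this comment's own example):

```python
# Standalone mock of the budget guard; the 256 KB constant and the task-scoped
# name "my_kernel_0_range_for_body" come from the review comment above.
FN_SCOPE_BUDGET_BYTES = 256 * 1024

def check_budget(cumulative_bytes, task_name):
    # Mirrors the QD_ERROR_IF condition: raise once the running total
    # crosses the function-scope budget, naming the offending task.
    if cumulative_bytes > FN_SCOPE_BUDGET_BYTES:
        raise RuntimeError(
            f"autodiff-stack budget exceeded: {cumulative_bytes} bytes in task "
            f"'{task_name}' crosses the {FN_SCOPE_BUDGET_BYTES} byte budget"
        )

try:
    # Five 65,544-byte adstacks, as in the proof below; with the task-scoped
    # name the error pins down the offending offloaded task directly.
    check_budget(5 * 65544, "my_kernel_0_range_for_body")
except RuntimeError as e:
    msg = str(e)
    print(msg)
```

With `kernel_name` in that slot the same message would only say `my_kernel`, which is the ambiguity this comment describes.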

Step-by-step proof

  1. A user compiles a kernel my_kernel with two offloaded tasks: my_kernel_0_range_for_body (five f64 adstacks) and my_kernel_1_init_args (one adstack).
  2. The backward body sums to 5 x 65,544 bytes = 327,720 bytes, exceeding the 262,144-byte (256 KB) budget; QD_ERROR_IF fires during codegen of that task.
  3. The message printed is: '... size 327720 bytes in task my_kernel crosses the 262144 byte budget.'
  4. The user has no information about which of the kernel's tasks is the culprit.
  5. With current_task->name, the message would read 'in task my_kernel_0_range_for_body', directly naming the offending task function.
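The byte figures in steps 2-3 can be reproduced with a small model. The 8-byte header and 16-byte slot are assumptions chosen to match the 65,544-byte per-stack size quoted in this review, not layout facts taken from the diff:

```python
# Hypothetical adstack frame-size model reproducing the figures quoted above.
HEADER_BYTES = 8       # assumed per-stack bookkeeping (e.g., a top counter)
SLOT_BYTES = 16        # assumed: one f64 primal + one f64 adjoint per entry
AD_STACK_SIZE = 4096   # entries, as set via ad_stack_size=4096

per_stack = HEADER_BYTES + AD_STACK_SIZE * SLOT_BYTES  # 65,544 bytes
budget = 256 * 1024                                    # 262,144 bytes

print(per_stack)                # 65544
print(4 * per_stack > budget)   # True: four adstacks already cross 256 KB
print(5 * per_stack)            # 327720, the figure in the proof above
```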

}

auto alloca = create_entry_block_alloca(type, sizeof(int64));
llvm_val[stmt] = builder->CreateBitCast(alloca, llvm::PointerType::getUnqual(*llvm_context));
call("stack_init", llvm_val[stmt]);
8 changes: 8 additions & 0 deletions quadrants/codegen/llvm/codegen_llvm.h
@@ -63,6 +63,14 @@ class TaskCodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
// The task_codegen_id represents the id of the offloaded task
int task_codegen_id{0};

// Running total of bytes reserved by `AdStackAllocaStmt`s emitted via `create_entry_block_alloca` in
// the current task. Every adstack lives at function scope on the worker-thread stack, so the sum of
// their sizes adds directly to the LLVM stack frame. If the sum exceeds the worker thread's stack
// (~512 KB on macOS, 8 MB on Linux by default) the frame silently clobbers adjacent stack pages,
// which has shown up in Genesis-style kernels as zero gradients with no SIGBUS. We raise before
// codegen emits anything that cannot run correctly.
std::size_t ad_stack_fn_scope_bytes_{0};
Comment on lines 63 to +72
🟡 The field comment for ad_stack_fn_scope_bytes_ in codegen_llvm.h (lines 66–71) makes two universally-scoped claims — "Every adstack lives at function scope on the worker-thread stack" and "We raise before codegen emits anything that cannot run correctly" — that are only accurate for CPU backends; on CUDA/AMDGPU, LLVM allocas lower to per-thread GPU local memory, not the CPU call stack, and the raise is gated on arch_is_cpu() in the .cpp implementation so it never fires for GPU. Prefix the comment with "On CPU arches only" and note that the raise is conditional on arch_is_cpu() to match the guard that is already in place in the .cpp.

Extended reasoning...

What the bug is and how it manifests

The field comment for ad_stack_fn_scope_bytes_ (codegen_llvm.h:66–71) reads:

Every adstack lives at function scope on the worker-thread stack, so the sum of their sizes adds directly to the LLVM stack frame. … We raise before codegen emits anything that cannot run correctly.

Both statements are worded universally but are only true for CPU arches. On CUDA and AMDGPU, LLVM allocas produced by create_entry_block_alloca are lowered by the NVPTX/AMDGPU backends to per-thread device-local memory — a separate address space sized by the GPU driver that has no relationship to any CPU worker-thread stack. "The worker-thread stack" claim is therefore factually wrong for those backends. And "We raise before codegen emits anything that cannot run correctly" is false because the raise is inside an if (arch_is_cpu(current_arch())) guard in the .cpp — GPU backends skip the check entirely.

The specific code path that matters

TaskCodeGenLLVM is the base class for TaskCodeGenCUDA and TaskCodeGenAMDGPU. Both GPU subclasses inherit the field without overriding it. A GPU backend developer reading only the header declaration (as is common when quickly auditing a field's purpose or deciding whether a guard is necessary) will see the field comment and conclude both that adstacks always use the CPU call stack and that the budget guard fires unconditionally. The .cpp implementation does carry a detailed and accurate CPU-only rationale in the block comment above the arch_is_cpu() guard, but that comment is not visible from the header.

Why existing code does not prevent it

This is a pure documentation inaccuracy. The .cpp implementation is fully correct: the budget check is properly gated on arch_is_cpu(current_arch()) so GPU kernels are never rejected by it. The problem exists only in the field-level summary in the header.

What the impact would be

A GPU backend developer reading the header comment could conclude: (a) the arch_is_cpu() guard in the .cpp is unnecessary overhead since the comment says the raise fires universally, leading them to remove it; or (b) adstacks on CUDA/AMDGPU really do sit on the CPU worker-thread stack, leading to confusion when investigating GPU adstack performance or correctness. Either misunderstanding can propagate into incorrect changes downstream.

How to fix it

Prefix the comment with "On CPU arches only:" and add a parenthetical clarifying that the raise fires inside the arch_is_cpu() guard. For example:
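The elided example presumably reads along these lines (a sketch reassembled from this comment's own prescription, not the committed text):

```cpp
// On CPU arches only: running total of bytes reserved by `AdStackAllocaStmt`s
// emitted via `create_entry_block_alloca` in the current task. On CPU backends
// every adstack lives at function scope on the worker-thread stack, so the sum
// adds directly to the LLVM stack frame. The over-budget raise in the .cpp is
// gated on `arch_is_cpu()`; it never fires on CUDA/AMDGPU, where allocas lower
// to per-thread GPU local memory instead of the CPU call stack.
std::size_t ad_stack_fn_scope_bytes_{0};
```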

Step-by-step proof

  1. Read codegen_llvm.h:66–71: comment says "Every adstack lives at function scope on the worker-thread stack" and "We raise before codegen emits anything that cannot run correctly" — both universal in scope.
  2. Read codegen_llvm.cpp (this diff): the budget accumulation and QD_ERROR_IF are wrapped in if (arch_is_cpu(current_arch())) { … } — the raise does NOT fire for CUDA/AMDGPU.
  3. Note that TaskCodeGenCUDA and TaskCodeGenAMDGPU inherit TaskCodeGenLLVM without overriding visit(AdStackAllocaStmt*) — the field is present and visible in both GPU subclasses.
  4. On CUDA/AMDGPU, LLVM allocas are lowered to per-thread GPU local memory by the NVPTX/AMDGPU backends — not to the CPU worker-thread call stack.
  5. Conclusion: the header comment's two universal claims are false for GPU backends; the .cpp already has the correct scoped comment; only the .h field summary needs updating.


std::unordered_map<const Stmt *, std::vector<llvm::Value *>> loop_vars_llvm;

std::unordered_map<Function *, llvm::Function *> func_map;
65 changes: 65 additions & 0 deletions tests/python/test_adstack.py
@@ -546,3 +546,68 @@
@test_utils.test(require=[qd.extension.adstack, qd.extension.data64], default_fp=qd.f64)
def test_adstack_sum_linear_f64(use_static_loop, use_varying_coeff, n_iter):
_run_sum_linear(qd.f64, use_static_loop, use_varying_coeff, n_iter, rel_tol=1e-14)


def test_adstack_codegen_budget_guard_runs_in_child_process(tmp_path):
# Per-task codegen guard: the sum of `AdStackAllocaStmt::size_in_bytes()` in a single LLVM task must not cross
# the ~256 KB CPU worker-thread stack budget. Beyond that the frame silently clobbers adjacent stack memory and
# the reverse pass returns zero / garbage gradients. The guard runs inside the LLVM compilation worker thread
# pool; the underlying `QD_ERROR_IF` throws across a thread boundary that does not propagate the exception
# back to Python, so it surfaces as a loud `std::terminate` / SIGABRT rather than a catchable Python
# exception. The test runs the overflowing kernel in a child process and asserts the child aborts with a
# non-zero exit code and the guard message reaches stderr; that is enough to prove the guard fires and does
# not let silent stack-frame clobbering through.
if not is_extension_supported(qd.cpu, qd.extension.adstack):
pytest.skip("adstack extension not available on cpu")
if not is_extension_supported(qd.cpu, qd.extension.data64):
pytest.skip("f64 extension not available on cpu")

child_script = textwrap.dedent(
"""
import quadrants as qd

qd.init(arch=qd.cpu, ad_stack_experimental_enabled=True, ad_stack_size=4096, default_fp=qd.f64)

n = 4
x = qd.field(qd.f64, shape=n, needs_grad=True)
y = qd.field(qd.f64, shape=(), needs_grad=True)
n_iter = qd.field(qd.i32, shape=())

@qd.kernel
def compute():
for i in x:
v1 = x[i]
v2 = x[i]
v3 = x[i]
v4 = x[i]
v5 = x[i]
for _ in range(n_iter[None]):
v1 = qd.sin(v1)
v2 = qd.sin(v2)
v3 = qd.sin(v3)
v4 = qd.sin(v4)
v5 = qd.sin(v5)
y[None] += v1 + v2 + v3 + v4 + v5

for i in range(n):
x[i] = 0.1 + 0.1 * i
n_iter[None] = 3
y[None] = 0.0
compute()
y.grad[None] = 1.0
for i in range(n):
x.grad[i] = 0.0
compute.grad()
"""
)
script_path = tmp_path / "budget_guard_child.py"
script_path.write_text(child_script)
result = subprocess.run([sys.executable, str(script_path)], capture_output=True, check=False)

Check warning on line 605 in tests/python/test_adstack.py


Claude / Claude Code Review

subprocess.run in budget-guard test lacks timeout, can hang CI indefinitely

Both subprocess.run calls in test_adstack.py lack a timeout argument; if the child process does not exit for any reason (e.g., the budget guard fails to fire due to misconfiguration or a build variant), the test will block the entire CI suite indefinitely. Add timeout=60 to the subprocess.run at the budget-guard test (line 605) and defensively to the teardown test as well.

Extended reasoning...

What the bug is

Both subprocess.run calls in test_adstack.py — one in test_adstack_codegen_budget_guard_runs_in_child_process (~line 605) and one in test_adstack_overflow_during_teardown_does_not_abort (~line 482) — are missing a timeout= argument. The critical case is the budget-guard test: it expects the child process to terminate with a non-zero exit code (via QD_ERROR_IF firing during LLVM compilation). If the guard fails to fire for any reason, subprocess.run will block forever.

Why the original spinning mechanism is not the real concern

The original bug description argued that __builtin_unreachable() could cause the child to spin indefinitely. The refutations are correct that this specific path is impossible: Logger::error() has raise_exception=true by default, so it throws a std::string before QD_UNREACHABLE is ever reached. That thrown exception inside the LLVM worker thread triggers std::terminate() -> std::abort() -> SIGABRT, which is reliable non-zero-exit termination. The __builtin_unreachable() is dead code in this path.

The valid concern: guard fails to fire

The real risk is broader: if the budget guard fails to fire for any reason — a build variant where the guard is compiled out, a future refactor that accidentally breaks the arch_is_cpu gate, an early return added upstream of visit(AdStackAllocaStmt*), or a misconfigured test environment — the child process will not abort, the parent subprocess.run has no timeout, and CI blocks indefinitely.

Why existing code does not prevent it

subprocess.run(..., check=False) with no timeout= will wait forever for the child. There is no watchdog or other mechanism that would unblock the parent. The skip guards at the top only protect the extension-not-compiled case; they do not protect against a guard that is compiled in but silently broken.

Impact

If this test hangs, the entire test suite hangs behind it. On most CI systems this means a full job timeout (30-60 min) before the runner kills it, with no actionable error message.

How to fix it

Add timeout=60 (or timeout=120 for slow machines) to the subprocess.run call on the budget-guard test. The teardown test (expecting returncode 0) is less critical but should also get a timeout for the same defensive reason.

Step-by-step proof

  1. The budget-guard child script calls compute.grad(), which triggers LLVM compilation of the backward kernel. If arch_is_cpu() returns False for any reason, the QD_ERROR_IF block is skipped, no abort occurs, compute.grad() and qd.sync() return normally, and the child exits with returncode 0.
  2. The parent test asserts result.returncode != 0 only AFTER subprocess.run returns. Without a timeout, subprocess.run never returns if the child hangs (e.g., in a JIT stall or LLVM thread pool deadlock).
  3. A timeout=60 turns a potential indefinite hang into a subprocess.TimeoutExpired exception within 60 seconds, making CI failures actionable.
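A sketch of the hardened call, assuming the reviewer's suggested 60-second value; the inline child command here is a trivial stand-in for the real `budget_guard_child.py` script:

```python
import subprocess
import sys

# Stand-in child process; the real test launches the generated script.
# timeout=60 converts a hung child into subprocess.TimeoutExpired instead
# of blocking the whole CI suite indefinitely.
result = subprocess.run(
    [sys.executable, "-c", "print('child ok')"],
    capture_output=True,
    timeout=60,
    check=False,
)
print(result.returncode)
```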

assert (
result.returncode != 0
), "child exited with returncode 0 but the budget guard was expected to terminate the process"
combined = (result.stdout + result.stderr).decode()
assert "autodiff-stack budget exceeded" in combined, (
f"expected guard message in child output; got:\nstdout:\n{result.stdout.decode()}\n"
f"stderr:\n{result.stderr.decode()}"
)