Luce-Org · davide221 · Jun 10, 2026 · Jun 9, 2026 · Jun 9, 2026 · Jun 10, 2026
diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt
@@ -672,6 +672,11 @@ if(DFLASH27B_TESTS)
         target_include_directories(bench_laguna_generate PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
         target_link_libraries(bench_laguna_generate PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})
     endif()
+    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/bench_laguna_spark.cpp")  # optional target: defined only when the source file is present
+        add_executable(bench_laguna_spark test/bench_laguna_spark.cpp)
+        target_include_directories(bench_laguna_spark PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})
+        target_link_libraries(bench_laguna_spark PRIVATE dflash_common ggml ${DFLASH27B_GGML_BACKEND_TARGET})  # same link set as bench_laguna_generate
+    endif()
     if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_laguna_daemon.cpp")
         add_executable(test_laguna_daemon test/test_laguna_daemon.cpp)
         target_include_directories(test_laguna_daemon PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS})

diff --git a/server/src/gemma4/gemma4_graph.cpp b/server/src/gemma4/gemma4_graph.cpp
@@ -155,7 +155,9 @@ static ggml_tensor * build_gemma4_attn_block(
     ggml_tensor * attn_mask_full,
     ggml_tensor * attn_mask_swa,
     int kv_start,
-    int n_tokens)
+    int n_tokens,
+    ggml_tensor * kv_idx_full = nullptr,   // [n_tokens] I32 absolute rows (graph input)
+    ggml_tensor * kv_idx_swa  = nullptr)   // [n_tokens] I32 ring rows pos%swa_size (graph input)
 {
     const int head_dim   = gemma4_head_dim(w, il);
     const int n_head     = w.n_head;
@@ -207,20 +209,35 @@ static ggml_tensor * build_gemma4_attn_block(
                               0.0f, 1.0f, 32.0f, 1.0f);
 
         // Write K/V to cache (ring-buffer position for SWA layers)
-        const int write_pos = is_swa ? (kv_start % cache_len) : kv_start;
         ggml_tensor * Kcur_T = ggml_permute(ctx, Kcur, 0, 2, 1, 3);
         ggml_tensor * Vcur_T = ggml_permute(ctx, Vcur, 0, 2, 1, 3);
 
-        ggml_tensor * k_slot = ggml_view_3d(ctx, cache_k,
-            head_dim, n_tokens, n_head_kv,
-            cache_k->nb[1], cache_k->nb[2],
-            cache_k->nb[1] * (size_t)write_pos);
-        ggml_tensor * v_slot = ggml_view_3d(ctx, cache_v,
-            head_dim, n_tokens, n_head_kv,
-            cache_v->nb[1], cache_v->nb[2],
-            cache_v->nb[1] * (size_t)write_pos);
-        ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur_T, k_slot));
-        ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur_T, v_slot));
+        ggml_tensor * kvi = is_swa ? kv_idx_swa : kv_idx_full;
+        if (kvi) {
+            // CUDA-graph-stable append: dst is the whole cache tensor (stable
+            // pointer), the row index is a graph INPUT (data changes per step,
+            // pointer doesn't). A write_pos-offset view changes node properties
+            // every step, which resets the ggml-cuda CUDA-graph warmup and
+            // forfeits replay. For SWA layers the caller fills the index with
+            // (pos % swa_size), so a chunk that crosses the ring end wraps per
+            // row (the offset-view path below writes one contiguous block).
+            ggml_tensor * Krows = ggml_cont(ctx, Kcur_T);
+            ggml_tensor * Vrows = ggml_cont(ctx, Vcur_T);
+            ggml_build_forward_expand(gf, ggml_set_rows(ctx, cache_k, Krows, kvi));
+            ggml_build_forward_expand(gf, ggml_set_rows(ctx, cache_v, Vrows, kvi));
+        } else {
+            const int write_pos = is_swa ? (kv_start % cache_len) : kv_start;
+            ggml_tensor * k_slot = ggml_view_3d(ctx, cache_k,
+                head_dim, n_tokens, n_head_kv,
+                cache_k->nb[1], cache_k->nb[2],
+                cache_k->nb[1] * (size_t)write_pos);
+            ggml_tensor * v_slot = ggml_view_3d(ctx, cache_v,
+                head_dim, n_tokens, n_head_kv,
+                cache_v->nb[1], cache_v->nb[2],
+                cache_v->nb[1] * (size_t)write_pos);
+            ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur_T, k_slot));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur_T, v_slot));
+        }
     }
     // else: KV-sharing layer — cache already written by source layer
 
@@ -282,7 +299,9 @@ static ggml_tensor * build_gemma4_layer(
     ggml_tensor * per_layer_input,  // [n_embd_per_layer, n_tokens] or nullptr
     int kv_start,
     int n_tokens,
-    int capture_idx = -1)  // >=0: write to target_feat at this capture slot
+    int capture_idx = -1,  // >=0: write to target_feat at this capture slot
+    ggml_tensor * kv_idx_full = nullptr,
+    ggml_tensor * kv_idx_swa  = nullptr)
 {
     const Gemma4Layer & L = w.layers[il];
     ggml_tensor * inp_f32 = graph_tensor_f32(ctx, inp);
@@ -293,7 +312,7 @@ static ggml_tensor * build_gemma4_layer(
     // Attention
     cur = build_gemma4_attn_block(ctx, gf, w, L, cache, il, cur,
                                     positions, attn_mask_full, attn_mask_swa,
-                                    kv_start, n_tokens);
+                                    kv_start, n_tokens, kv_idx_full, kv_idx_swa);
 
     // Post-attn norm
     if (L.attn_post_norm) {
@@ -603,9 +622,16 @@ bool gemma4_step(
     int                     kv_start,
     std::vector<float> &    out_logits)
 {
-    // Allocate graph context
+    // Allocate graph context. Persistent thread_local arena: rebuilt graphs
+    // land at identical addresses every step, so the ggml-cuda CUDA-graph
+    // cache (keyed on nodes[0], memcmps node properties) can replay the
+    // captured graph instead of re-launching every kernel per token.
+    const size_t arena_size = ggml_tensor_overhead() * 16384 + ggml_graph_overhead() + 16 * 1024 * 1024;
+    static thread_local std::vector<uint8_t> g_arena;
+    if (g_arena.size() < arena_size) g_arena.resize(arena_size);
     ggml_init_params ip{};
-    ip.mem_size = ggml_tensor_overhead() * 16384 + ggml_graph_overhead() + 16 * 1024 * 1024;
+    ip.mem_size = arena_size;
+    ip.mem_buffer = g_arena.data();
     ip.no_alloc = true;
     ggml_context * ctx = ggml_init(ip);
     ggml_cgraph * gf = ggml_new_graph_custom(ctx, 16384, false);
@@ -616,6 +642,18 @@ bool gemma4_step(
     ggml_tensor * pp = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
     ggml_set_input(pp);
 
+    // K/V append row indices (set_rows path; data-only per step -> stable
+    // node properties -> CUDA-graph replay). Setting DFLASH_GEMMA4_NO_KVPAD
+    // (any value, including "0") restores the legacy offset-view cpy append.
+    static const bool g_no_kvpad = (std::getenv("DFLASH_GEMMA4_NO_KVPAD") != nullptr);
+    ggml_tensor * kvi_full = nullptr, * kvi_swa = nullptr;
+    if (!g_no_kvpad) {
+        kvi_full = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        ggml_set_input(kvi_full);
+        kvi_swa = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
+        ggml_set_input(kvi_swa);
+    }
+
     // Token IDs input (for per-layer embedding lookup)
     ggml_tensor * tok_ids = nullptr;
     if (token_ids && w.per_layer_tok_embd && w.per_layer_model_proj && w.n_embd_per_layer > 0) {
@@ -689,7 +727,8 @@ bool gemma4_step(
         }
         cur = build_gemma4_layer(ctx, gf, w, cache, il, cur, pp,
                                    mk_full_f16, mk_swa_f16, pl_input,
-                                   kv_start, n_tokens, cap_idx);
+                                   kv_start, n_tokens, cap_idx,
+                                   kvi_full, kvi_swa);
     }
 
     // Final norm
@@ -729,6 +768,17 @@ bool gemma4_step(
     std::vector<int32_t> pos((size_t)n_tokens);
     for (int i = 0; i < n_tokens; ++i) pos[i] = kv_start + i;
     ggml_backend_tensor_set(pp, pos.data(), 0, ggml_nbytes(pp));
+    if (kvi_full) {
+        // Full layers append at the absolute position; SWA layers at the ring
+        // slot. Per-token modular indices also land chunks that cross the
+        // ring wrap boundary correctly (the offset-view path wrote one
+        // contiguous block).
+        ggml_backend_tensor_set(kvi_full, pos.data(), 0, ggml_nbytes(kvi_full));
+        GGML_ASSERT(swa_size > 0);
+        std::vector<int32_t> ring((size_t)n_tokens);
+        for (int i = 0; i < n_tokens; ++i) ring[i] = (kv_start + i) % swa_size;
+        ggml_backend_tensor_set(kvi_swa, ring.data(), 0, ggml_nbytes(kvi_swa));
+    }
 
     // Set token IDs for per-layer embedding
     if (tok_ids && token_ids) {

diff --git a/server/src/gemma4/gemma4_loader.cpp b/server/src/gemma4/gemma4_loader.cpp
@@ -548,6 +548,11 @@ bool create_gemma4_cache_partial(ggml_backend_t backend,
         return false;
     }
 
+    // Zero-init KV: the FA views span the 256-padded tail beyond the written
+    // positions (mask gates it to -inf), so reads there must dequantise to
+    // finite values — F16 garbage can be NaN/Inf and NaN + (-inf) = NaN.
+    ggml_backend_buffer_clear(out.buf, 0);
+
     out.cur_pos = 0;
     out.max_ctx = max_ctx;
     out.n_layer = w.n_layer;

diff --git a/server/src/internal.h b/server/src/internal.h
@@ -621,7 +621,8 @@ QwenLayerPrefnOutputs build_qwen35_layer_prefn(
     ggml_tensor *         attn_mask,
     int                   kv_start,
     int                   n_tokens,
-    int                   fa_window = 0);
+    int                   fa_window = 0,
+    ggml_tensor *         kv_write_rows = nullptr);
 
 } // namespace dflash::common
 

diff --git a/server/src/laguna/laguna_internal.h b/server/src/laguna/laguna_internal.h
@@ -239,6 +239,13 @@ struct LagunaGraphInputs {
     ggml_tensor * attn_mask_swa;  // optional [kv_len, n_tokens] F16 (causal + sliding window) for SWA layers
     int           n_tokens;
     int           kv_start;
+    // CUDA-graph-stable decode: when kv_pad > 0 the FA K/V views + masks span
+    // kv_pad slots (stride-rounded; mask gates validity) and the K/V append
+    // goes through ggml_set_rows with kv_idx as a graph INPUT, so node
+    // properties stay identical across decode steps and the ggml-cuda
+    // CUDA-graph cache replays instead of re-launching every kernel.
+    int           kv_pad = 0;             // 0 = legacy exact-length views + cpy append
+    ggml_tensor * kv_idx = nullptr;       // [n_tokens] I32 cache row indices (graph input)
     bool          output_logits = true;
     bool          output_hidden_states = false;
     // If true, lm_head only runs on the LAST token (saves ~6 GB of logit memory