[NV] llm-d: prefix-cache experiment (shared prefix + caching ON)

ezrasilvera · ezrasilvera · commit 3b28e61997f0 · 2026-06-20T20:48:44.000+03:00
Tests whether our ~4 req/s ceiling is relieved when effective prefill is
cheap - the leading explanation for the upstream gap now that the load
generator (nyann Go client also capped ~3 req/s) and serving stack are
ruled out. Prefill-only probe confirmed prefill compute is the wall at 8k
ISL; this checks if a cached shared prefix shrinks that work.

- benchmark_lib.sh: run_benchmark_serving gains an optional --random-prefix-len
  (only appended when > 0; default 0 leaves all existing paths byte-identical).
  benchmark_serving.py already supports it - one fixed random prefix prepended
  to every request.
- server.sh: pass --random-prefix-len when BENCH_RANDOM_PREFIX_LEN is set
  (env-gated; normal sweep unchanged).
- job.slurm: forward BENCH_RANDOM_PREFIX_LEN into the container env.
- dsv4-fp4-gb200-mid-curve-megamoe-prefixcache.yaml: new recipe = the
  pre-reference-match baseline config WITH prefix caching ON (no
  --no-enable-prefix-caching), so the only delta vs our known ~4 req/s
  baseline is the shared-prefix workload - clean attribution.
- nvidia-master.yaml: dsv4-fp4-gb200-llm-d-vllm-prefixcache config-key with
  isl=512 unique suffix + BENCH_RANDOM_PREFIX_LEN=7680 shared prefix (8192
  total, matches baseline) + osl=1024.

If req/s jumps well above ~4 with the 7680-token prefix cached, prefill
compute is the wall and prefix caching is the lever - supporting the
upstream-workload explanation. Default benchmark_serving client + ignore_eos
forces a clean OSL=1024 (unlike nyann).

Signed-off-by: Ezra Silvera <ezra@il.ibm.com>
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -8716,6 +8716,53 @@ dsv4-fp4-gb200-llm-d-vllm-nyann:
           - "DECODE_NODES=2"
           - "GPUS_PER_NODE=4"
 
+# Prefix-cache experiment variant of dsv4-fp4-gb200-llm-d-vllm-mid-curve-megamoe.
+# Uses the prefix-cache recipe (server-side prefix caching ON) and a
+# shared-prefix workload: BENCH_RANDOM_PREFIX_LEN=7680 prepends a fixed 7680-
+# token prefix to every request, with isl=512 the unique suffix (7680+512=8192
+# total, same as baseline; osl=1024). With caching ON the shared span is a hit
+# after warmup, so effective prefill drops ~16x to the suffix. If req/s jumps
+# vs the ~4 req/s baseline, prefill-compute is the wall and prefix caching is
+# the lever (default benchmark_serving client, ignore_eos forces OSL). Dispatch
+# with --no-evals.
+dsv4-fp4-gb200-llm-d-vllm-prefixcache:
+  image: ghcr.io/ezrasilvera/llm-d-nokube-vllm:vllm0.23
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: gb200
+  precision: fp4
+  framework: llm-d-vllm
+  multinode: true
+  disagg: true
+  scenarios:
+    fixed-seq-len:
+    - isl: 512
+      osl: 1024
+      search-space:
+      - spec-decoding: "none"
+        conc-list: [256, 512, 1024]
+        prefill:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+          - "GPUS_PER_NODE=4"
+          - "RANDOM_RANGE_RATIO=0.8"
+          - "CONFIG_FILE=dsv4-fp4-gb200-mid-curve-megamoe-prefixcache.yaml"
+          - "BENCH_NUM_PROMPTS_MULTIPLIER=10"
+          - "MAX_FAILURE_RATE=0.5"
+          - "BENCH_RANDOM_PREFIX_LEN=7680"
+        decode:
+          num-worker: 1
+          tp: 1
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=2"
+          - "GPUS_PER_NODE=4"
+
 # MTP2 variant of dsv4-fp4-gb200-dynamo-vllm. Uses the vLLM 0.20.1 image
 # and hand-picked 8k/1k Pareto points mirrored from NVIDIA/srt-slurm.
 dsv4-fp4-gb200-dynamo-vllm-mtp2:
diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -219,6 +219,7 @@ run_benchmark_serving() {
     local server_pid=""
     local tokenizer=""
     local tokenizer_mode=""
+    local random_prefix_len=0
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -250,6 +251,10 @@ run_benchmark_serving() {
                 random_range_ratio="$2"
                 shift 2
                 ;;
+            --random-prefix-len)
+                random_prefix_len="$2"
+                shift 2
+                ;;
             --num-prompts)
                 num_prompts="$2"
                 shift 2
@@ -382,6 +387,14 @@ run_benchmark_serving() {
         --result-filename "$result_filename.json"
     )
 
+    # Optional shared prefix: prepend a fixed random prefix of N tokens to
+    # every request. With server-side prefix caching enabled this makes the
+    # shared portion a cache hit after warmup, so effective prefill shrinks to
+    # the unique suffix. Only added when > 0; default 0 leaves behavior intact.
+    if [[ "${random_prefix_len:-0}" -gt 0 ]]; then
+        benchmark_cmd+=(--random-prefix-len "$random_prefix_len")
+    fi
+
     if [[ -n "$endpoint" ]]; then
         benchmark_cmd+=(--endpoint "$endpoint")
     fi
diff --git a/benchmarks/multi_node/llm-d-recipes/dsv4-fp4-gb200-mid-curve-megamoe-prefixcache.yaml b/benchmarks/multi_node/llm-d-recipes/dsv4-fp4-gb200-mid-curve-megamoe-prefixcache.yaml
@@ -0,0 +1,152 @@
+# DeepSeek-V4-Pro (FP4) on GB200, MegaMOE mid-curve 1P+1D - PREFIX-CACHE
+# experiment variant of dsv4-fp4-gb200-mid-curve-megamoe.yaml.
+#
+# Hypothesis under test: our throughput is pinned at ~4 req/s because every
+# request does a full 8k-token prefill, and prefill compute is the wall
+# (confirmed: prefill-only direct probe also caps ~3.84 req/s). The upstream
+# wide-ep-lws guide may report far higher throughput because its effective
+# prefill is cheap - i.e. a large shared prefix served from the prefix cache.
+#
+# This recipe enables server-side prefix caching (prefix caching is ON here -
+# the --no-enable-prefix-caching flag from the reference-match variant is
+# removed). Paired with BENCH_RANDOM_PREFIX_LEN in the config-key, the bench
+# prepends a fixed shared prefix to every request; after warmup that span is a
+# cache hit, so effective prefill shrinks to the unique suffix. If req/s jumps
+# vs the ~4 req/s baseline, prefill-compute is the wall AND prefix caching is
+# the lever (supporting the upstream-workload explanation of the gap).
+#
+# Engine config is the pre-reference-match baseline (prefix caching ON,
+# default max-num-batched-tokens, gpu-mem 0.9) so the ONLY difference vs our
+# known ~4 req/s baseline run is the shared-prefix workload - clean attribution.
+# Topology unchanged: 1 prefill + 1 decode, TP=1 DP=8 EP=8 DEP8, 16 GPUs.
+#
+# Selected via additional-settings:
+#   CONFIG_FILE=dsv4-fp4-gb200-mid-curve-megamoe-prefixcache.yaml
+#   PREFILL_NODES=2 DECODE_NODES=2 GPUS_PER_NODE=4 BENCH_RANDOM_PREFIX_LEN=7680
+
+# ---- EPP scheduling config ----
+apiVersion: llm-d.ai/v1alpha1
+kind: EndpointPickerConfig
+
+plugins:
+  - name: file-disc
+    type: file-discovery
+    parameters:
+      path: /tmp/endpoints.yaml
+      watchFile: false
+
+  - type: disagg-headers-handler
+  - type: always-disagg-pd-decider
+  - type: disagg-profile-handler
+    parameters:
+      deciderPluginName: always-disagg-pd-decider
+  - type: prefill-filter
+  - type: decode-filter
+  - type: prefix-cache-scorer
+  - type: queue-scorer
+  - type: kv-cache-utilization-scorer
+  - type: active-request-scorer
+  - type: max-score-picker
+
+schedulingProfiles:
+  - name: prefill
+    plugins:
+      - pluginRef: prefill-filter
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: queue-scorer
+        weight: 2
+      - pluginRef: kv-cache-utilization-scorer
+        weight: 2
+      - pluginRef: max-score-picker
+  - name: decode
+    plugins:
+      - pluginRef: decode-filter
+      - pluginRef: active-request-scorer
+        weight: 2
+      - pluginRef: prefix-cache-scorer
+        weight: 3
+      - pluginRef: max-score-picker
+
+dataLayer:
+  discovery:
+    pluginRef: file-disc
+
+# ---- Per-role vLLM flags ----
+# Baseline engine config WITH prefix caching ON (no --no-enable-prefix-caching).
+# --enable-cumem-allocator kept (KV in cumem for MNNVL transfer on v0.23+);
+# UCX_TLS includes rc so cross-node KV should use MNNVL/IB rather than TCP (tcp is still listed).
+prefill:
+  tp: 1
+  enable-expert-parallel: true
+  extra-args: >-
+    --kv-cache-dtype fp8
+    --max-model-len 9280
+    --enforce-eager
+    --gpu-memory-utilization 0.9
+    --enable-cumem-allocator
+    --no-async-scheduling
+    --block-size 256
+    --tokenizer-mode deepseek_v4
+    --moe-backend deep_gemm_mega_moe
+    --enable-ep-weight-filter
+    --no-disable-hybrid-kv-cache-manager
+    --numa-bind
+  env:
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    NCCL_P2P_LEVEL: "NVL"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,rc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    VLLM_SPARSE_INDEXER_MAX_LOGITS_MB: "1024"
+    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: "2048"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    VLLM_SKIP_P2P_CHECK: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_USE_DEEP_GEMM: "1"
+    NVIDIA_GDRCOPY: "enabled"
+    VLLM_HTTP_TIMEOUT_KEEP_ALIVE: "120"
+
+decode:
+  tp: 1
+  enable-expert-parallel: true
+  extra-args: >-
+    --kv-cache-dtype fp8
+    --max-model-len 9280
+    --max-num-seqs 512
+    --max-num-batched-tokens 512
+    --max-cudagraph-capture-size 512
+    --gpu-memory-utilization 0.85
+    --enable-cumem-allocator
+    --block-size 256
+    --compilation-config {"cudagraph_mode":"FULL_DECODE_ONLY","mode":0}
+    --stream-interval 50
+    --tokenizer-mode deepseek_v4
+    --moe-backend deep_gemm_mega_moe
+    --enable-ep-weight-filter
+    --no-disable-hybrid-kv-cache-manager
+  env:
+    NCCL_CUMEM_ENABLE: "1"
+    NCCL_MNNVL_ENABLE: "1"
+    NCCL_NVLS_ENABLE: "1"
+    NCCL_P2P_LEVEL: "NVL"
+    UCX_MEMTYPE_CACHE: "n"
+    UCX_MEMTYPE_REG_WHOLE: "n"
+    UCX_TLS: "cuda_copy,cuda_ipc,rc,tcp"
+    UCX_CUDA_IPC_ENABLE_MNNVL: "y"
+    VLLM_USE_NCCL_SYMM_MEM: "1"
+    TILELANG_CLEANUP_TEMP_FILES: "1"
+    TORCH_SYMMMEM: "NVSHMEM"
+    VLLM_SKIP_P2P_CHECK: "1"
+    VLLM_RANDOMIZE_DP_DUMMY_INPUTS: "1"
+    VLLM_USE_DEEP_GEMM: "1"
+    NVIDIA_GDRCOPY: "enabled"
+
+# ---- SLURM resource directives ----
+slurm:
+  time_limit: "08:00:00"
diff --git a/benchmarks/multi_node/llm-d/job.slurm b/benchmarks/multi_node/llm-d/job.slurm
@@ -184,10 +184,10 @@ elif [[ "$LLMD_CONTAINER_ENGINE" == "pyxis" ]]; then
     # Optional load-generator selector + prefill-probe flag (from additional-settings).
     # Explicitly forwarded so the toggle can't silently fall through to the
     # default benchmark_serving sweep if srun's --export=ALL is ever restricted.
-    export BENCH_TOOL NYANN_DURATION NYANN_WARMUP PREFILL_ONLY_PROBE
+    export BENCH_TOOL NYANN_DURATION NYANN_WARMUP PREFILL_ONLY_PROBE BENCH_RANDOM_PREFIX_LEN
 
     PYXIS_ENV_LIST="NUM_NODES,PREFILL_NODES,DECODE_NODES,ALL_IPS,PREFILL_LEADER_IP,DECODE_LEADER_IP"
-    PYXIS_ENV_LIST+=",BENCH_TOOL,NYANN_DURATION,NYANN_WARMUP,PREFILL_ONLY_PROBE"
+    PYXIS_ENV_LIST+=",BENCH_TOOL,NYANN_DURATION,NYANN_WARMUP,PREFILL_ONLY_PROBE,BENCH_RANDOM_PREFIX_LEN"
     PYXIS_ENV_LIST+=",PREFILL_DP_ADDR,DECODE_DP_ADDR,MODEL_NAME,GPUS_PER_NODE"
     PYXIS_ENV_LIST+=",PREFILL_DP_SIZE,DECODE_DP_SIZE"
     PYXIS_ENV_LIST+=",BENCH_INPUT_LEN,BENCH_OUTPUT_LEN,BENCH_MAX_CONCURRENCY"
diff --git a/benchmarks/multi_node/llm-d/server.sh b/benchmarks/multi_node/llm-d/server.sh
@@ -582,6 +582,15 @@ PY
             )
         fi
 
+        # Optional shared-prefix workload (prefix-cache experiment): prepend a
+        # fixed random prefix of BENCH_RANDOM_PREFIX_LEN tokens to every
+        # request. With server-side prefix caching ON, that shared span is a
+        # cache hit after warmup, so effective prefill drops to the unique
+        # suffix (--input-len). Default unset -> normal full-prefill sweep.
+        if [[ "${BENCH_RANDOM_PREFIX_LEN:-0}" -gt 0 ]]; then
+            bench_extra_args+=(--random-prefix-len "$BENCH_RANDOM_PREFIX_LEN")
+        fi
+
         run_benchmark_serving \
             --bench-serving-dir /workspace \
             --tokenizer /models \