Skip to content

Commit 87cf03e

Browse files
committed
agent: add Kimi Mooncake LMCacheMP disagg recipe
1 parent 86e7761 commit 87cf03e

8 files changed

Lines changed: 624 additions & 7 deletions

File tree

.github/configs/amd-master.yaml

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1366,6 +1366,52 @@ kimik2.5-fp4-mi355x-vllm-disagg:
13661366
additional-settings:
13671367
- "DECODE_NODES=2"
13681368

1369+
# Agentic multinode 1P1D bring-up: Mooncake(tcp) carries the current-request
1370+
# prefill->decode KV transfer; LMCacheMP is enabled only on the prefill engine
1371+
# for local host-DRAM L2 prefix reuse. Decode intentionally uses Mooncake only
1372+
# to avoid decode-side LMCache lookup/retrieve racing the remote-prefill load.
1373+
kimik2.5-fp4-mi355x-vllm-disagg-agentic:
1374+
image: yukiozzz/kimi-lmc-mc-rocm:dmabuf
1375+
model: amd/Kimi-K2.5-MXFP4
1376+
model-prefix: kimik2.5
1377+
runner: mi355x-disagg
1378+
precision: fp4
1379+
framework: vllm-disagg
1380+
multinode: true
1381+
disagg: true
1382+
scenarios:
1383+
agentic-coding:
1384+
- duration: 1800
1385+
search-space:
1386+
- spec-decoding: "none"
1387+
conc-list: [ 8, 16, 32 ]
1388+
prefill:
1389+
num-worker: 1
1390+
tp: 8
1391+
ep: 1
1392+
dp-attn: false
1393+
additional-settings:
1394+
- "PREFILL_NODES=1"
1395+
- "ROUTER_TYPE=mc-proxy"
1396+
- "PREFILL_KV_CONNECTOR=mooncake-lmcachemp"
1397+
- "DECODE_KV_CONNECTOR=mooncake"
1398+
- "MC_PROTOCOL=tcp"
1399+
- "ENABLE_PREFIX_CACHING=1"
1400+
- "MAX_MODEL_LEN=262144"
1401+
- "WEKA_LOADER_OVERRIDE=semianalysis_cc_traces_weka_with_subagents_256k"
1402+
- "LMCACHE_L1_SIZE_GB=2500"
1403+
- "LMCACHE_L1_INIT_SIZE_GB=20"
1404+
- "LMCACHE_L1_READ_TTL_SECONDS=3600"
1405+
- "LMCACHE_CHUNK_SIZE=256"
1406+
- "LMCACHE_MAX_WORKERS=8"
1407+
decode:
1408+
num-worker: 1
1409+
tp: 8
1410+
ep: 8
1411+
dp-attn: false
1412+
additional-settings:
1413+
- "DECODE_NODES=1"
1414+
13691415
dsr1-fp4-mi355x-sglang-disagg:
13701416
image: lmsysorg/sglang-rocm:v0.5.12-rocm720-mi35x-20260519
13711417
model: amd/DeepSeek-R1-0528-MXFP4-v2

benchmarks/multi_node/amd_utils/bench.sh

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,38 @@ source "$(dirname "$0")/../../benchmark_lib.sh"
5555

5656
REPO_ROOT="$(cd "$(dirname "$0")/../../.." && pwd)"
5757

58+
if [[ "${IS_AGENTIC:-0}" == "1" ]]; then
59+
# PORT mirrors the router port. Fall back to 30000 (the same default that
# job.slurm exports for ROUTER_PORT) so a local one-off run without
# ROUTER_PORT in the environment does not export an empty PORT.
export PORT="${ROUTER_PORT:-30000}"
60+
export MODEL="${MODEL:-${BENCH_MODEL}}"
61+
export DURATION="${DURATION:-1800}"
62+
export INFMAX_CONTAINER_WORKSPACE="${INFMAX_CONTAINER_WORKSPACE:-/workspace}"
63+
export AGENTIC_OUTPUT_DIR="${AGENTIC_OUTPUT_DIR:-/workspace}"
64+
export RESULT_FILENAME="${RESULT_FILENAME:-agentic_bench}"
65+
66+
RESULT_DIR="${RESULT_DIR:-/workspace/LOGS/agentic}"
67+
mkdir -p "$RESULT_DIR"
68+
69+
resolve_trace_source
70+
install_agentic_deps
71+
72+
# Multinode agentic matrix entries carry a single concurrency, but keep
73+
# the loop so local one-off runs can pass a small x-separated list.
74+
replay_failed=0
75+
for max_concurrency in "${chosen_concurrencies[@]}"; do
76+
export CONC="$max_concurrency"
77+
export USERS="$max_concurrency"
78+
build_replay_cmd "$RESULT_DIR"
79+
run_agentic_replay_and_write_outputs "$RESULT_DIR" || replay_failed=1
80+
81+
if [[ "$ENGINE" == "vllm-disagg" ]]; then
82+
echo "[BENCH] Cooldown: waiting 10s for idle KV block reaper..."
83+
sleep 10
84+
fi
85+
done
86+
87+
exit "$replay_failed"
88+
fi
89+
5890
for max_concurrency in "${chosen_concurrencies[@]}"; do
5991

6092
export_file="${profile_folder}/concurrency_${max_concurrency}_req_rate_${chosen_req_rate}_gpus_$((prefill_gpus+decode_gpus))_ctx_${prefill_gpus}_gen_${decode_gpus}"

benchmarks/multi_node/amd_utils/job.slurm

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,28 @@ export RESULT_FILENAME="${RESULT_FILENAME:-}"
312312
export SPEC_DECODING="${SPEC_DECODING:-}"
313313
export IS_MULTINODE="${IS_MULTINODE:-false}"
314314

315+
# Agentic / custom vLLM-disagg connector knobs (threaded from submit.sh)
316+
export IS_AGENTIC="${IS_AGENTIC:-0}"
317+
export DURATION="${DURATION:-1800}"
318+
export MODEL="${MODEL:-}"
319+
export ROUTER_TYPE="${ROUTER_TYPE:-vllm-router}"
320+
export ROUTER_PORT="${ROUTER_PORT:-30000}"
321+
export ENABLE_PREFIX_CACHING="${ENABLE_PREFIX_CACHING:-}"
322+
export MAX_MODEL_LEN="${MAX_MODEL_LEN:-}"
323+
export WEKA_LOADER_OVERRIDE="${WEKA_LOADER_OVERRIDE:-}"
324+
export VLLM_BIND_IP="${VLLM_BIND_IP:-}"
325+
export PREFILL_KV_CONNECTOR="${PREFILL_KV_CONNECTOR:-moriio}"
326+
export DECODE_KV_CONNECTOR="${DECODE_KV_CONNECTOR:-moriio}"
327+
export MC_PROTOCOL="${MC_PROTOCOL:-tcp}"
328+
export LMCACHE_HOST="${LMCACHE_HOST:-127.0.0.1}"
329+
export LMCACHE_PORT="${LMCACHE_PORT:-5555}"
330+
export LMCACHE_HTTP_PORT="${LMCACHE_HTTP_PORT:-8080}"
331+
export LMCACHE_L1_SIZE_GB="${LMCACHE_L1_SIZE_GB:-2500}"
332+
export LMCACHE_L1_INIT_SIZE_GB="${LMCACHE_L1_INIT_SIZE_GB:-20}"
333+
export LMCACHE_L1_READ_TTL_SECONDS="${LMCACHE_L1_READ_TTL_SECONDS:-3600}"
334+
export LMCACHE_CHUNK_SIZE="${LMCACHE_CHUNK_SIZE:-256}"
335+
export LMCACHE_MAX_WORKERS="${LMCACHE_MAX_WORKERS:-8}"
336+
315337
SANITIZED_USER=$(echo "$USER_NAME" | tr -c 'a-zA-Z0-9_.-' '_')
316338
export DOCKER_CONT_NAME="container_${ENGINE}_${SANITIZED_USER}_${MODEL_NAME}_${SLURM_JOB_ID}"
317339

@@ -385,6 +407,26 @@ DOCKER_ENV_COMMON=(
385407
-e DECODE_ENABLE_DP=\$DECODE_ENABLE_DP
386408
-e DECODE_MTP_SIZE=\$DECODE_MTP_SIZE
387409
-e IS_MULTINODE=\$IS_MULTINODE
410+
-e IS_AGENTIC=\$IS_AGENTIC
411+
-e DURATION=\$DURATION
412+
-e MODEL=\$MODEL
413+
-e ROUTER_TYPE=\$ROUTER_TYPE
414+
-e ROUTER_PORT=\$ROUTER_PORT
415+
-e ENABLE_PREFIX_CACHING=\$ENABLE_PREFIX_CACHING
416+
-e MAX_MODEL_LEN=\$MAX_MODEL_LEN
417+
-e WEKA_LOADER_OVERRIDE=\$WEKA_LOADER_OVERRIDE
418+
-e VLLM_BIND_IP=\$VLLM_BIND_IP
419+
-e PREFILL_KV_CONNECTOR=\$PREFILL_KV_CONNECTOR
420+
-e DECODE_KV_CONNECTOR=\$DECODE_KV_CONNECTOR
421+
-e MC_PROTOCOL=\$MC_PROTOCOL
422+
-e LMCACHE_HOST=\$LMCACHE_HOST
423+
-e LMCACHE_PORT=\$LMCACHE_PORT
424+
-e LMCACHE_HTTP_PORT=\$LMCACHE_HTTP_PORT
425+
-e LMCACHE_L1_SIZE_GB=\$LMCACHE_L1_SIZE_GB
426+
-e LMCACHE_L1_INIT_SIZE_GB=\$LMCACHE_L1_INIT_SIZE_GB
427+
-e LMCACHE_L1_READ_TTL_SECONDS=\$LMCACHE_L1_READ_TTL_SECONDS
428+
-e LMCACHE_CHUNK_SIZE=\$LMCACHE_CHUNK_SIZE
429+
-e LMCACHE_MAX_WORKERS=\$LMCACHE_MAX_WORKERS
388430
)
389431

390432
# Engine-specific env vars

0 commit comments

Comments
 (0)