Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions integrations/vllm_plugin/vllm_tt/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
# SPDX-FileCopyrightText: Portions (c) 2025 Tenstorrent AI ULC

import contextlib
import os
import sys
from dataclasses import dataclass
from typing import TYPE_CHECKING, Optional, Union, cast

Expand Down Expand Up @@ -62,6 +64,21 @@ class TTConfig:
# above it, prefills batch as usual. 0 = off; needs min_num_seqs < max.
prefill_batch_threshold: int = 0

# KV-cache high-watermark for *fresh* prefill admission, as a fraction of
Comment thread
kmabeeTT marked this conversation as resolved.
# the block pool (tt-xla: large-context concurrency thrash). AscendScheduler
# stops admitting NEW prefills once doing so would leave less than this
# fraction of the pool free, reserving headroom for in-flight requests to
# finish decoding instead of evicting (preempting) them and re-prefilling.
# Continuation chunks of already-started prefills and decode are NOT gated
# (decode may use the pool to 100%). A forward-progress guard always admits
# at least one prefill when nothing is running, so a single large prompt is
# never starved. 0.0 = off (legacy 1% watermark for all prefills).
# Default 0.25 (reserve 25% free => stop admitting above ~75% usage).
# Override at runtime with env var TT_XLA_PREFILL_KV_WATERMARK_PERCENT (a
# percent, e.g. 25), which takes precedence over additional_config.
# Resolved/validated in TTPlatform.check_and_update_config.
prefill_kv_watermark: float = 0.25

batch_size: int = 1
enable_precompile_all: bool = True

Expand Down Expand Up @@ -314,6 +331,23 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
raise ValueError(
"additional_config['prefill_batch_threshold'] must be >= 0."
)

# Resolve prefill_kv_watermark to a concrete fraction the scheduler can
# read directly: default, then env override (percent), then validate.
# See TTConfig.prefill_kv_watermark.
if additional_config.get("prefill_kv_watermark") is None:
additional_config["prefill_kv_watermark"] = TTConfig.prefill_kv_watermark
env_wm = os.environ.get("TT_XLA_PREFILL_KV_WATERMARK_PERCENT")

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We mostly name our env variables with a TTXLA_*** prefix (no underscore between TT and XLA).

Suggested change
env_wm = os.environ.get("TT_XLA_PREFILL_KV_WATERMARK_PERCENT")
env_wm = os.environ.get("TTXLA_PREFILL_KV_WATERMARK_PERCENT")

if env_wm is not None:
additional_config["prefill_kv_watermark"] = float(env_wm) / 100.0
wm = float(additional_config["prefill_kv_watermark"])
if not (0.0 <= wm < 1.0):
raise ValueError(
"prefill_kv_watermark (TT_XLA_PREFILL_KV_WATERMARK_PERCENT / 100) "
f"must be in [0, 1); got {wm}."
)
additional_config["prefill_kv_watermark"] = wm

vllm_config.additional_config = additional_config

# Stash cpu_sampling so validate_request() can read it without
Expand Down
24 changes: 23 additions & 1 deletion integrations/vllm_plugin/vllm_tt/scheduler/ascend_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ def __init__(
add_cfg = vllm_config.additional_config or {}
self.prefill_batch_threshold = int(add_cfg.get("prefill_batch_threshold") or 0)
self.b1_min_num_seqs = int(add_cfg.get("min_num_seqs") or 0)
# Fresh-prefill KV-cache admission watermark (0.0 = off). See
# TTConfig.prefill_kv_watermark.
self.prefill_kv_watermark = float(add_cfg.get("prefill_kv_watermark") or 0.0)

def schedule(self) -> SchedulerOutput:
# Super's schedule handles chunked prefill, which schedules both prefill and decode in one request.
Expand Down Expand Up @@ -247,7 +250,26 @@ def skip_cur_request(req=request):
skip_cur_request()
continue

watermark = getattr(self.scheduler_config, "watermark", 0.01)
base_watermark = getattr(self.scheduler_config, "watermark", 0.01)
# Apply the high-watermark only to FRESH prefills
# (num_computed_tokens == 0); continuation chunks keep the base
# watermark so an in-flight prefill is never stranded. The
# forward-progress guard falls back to the base watermark when
# nothing is running or scheduled yet, so a single large prompt that
# exceeds the reserve still gets admitted instead of deadlocking.
# See TTConfig.prefill_kv_watermark.
nothing_scheduled_yet = not (
scheduled_new_reqs or scheduled_resumed_reqs or scheduled_running_reqs
)
force_progress = not self.running and nothing_scheduled_yet
if (
self.prefill_kv_watermark > 0.0
and request.num_computed_tokens == 0
and not force_progress
):
watermark = self.prefill_kv_watermark
else:
watermark = base_watermark
if not self._check_watermark_for_prefill(
request, num_new_tokens, blocks, watermark
):
Expand Down