tenstorrent · AleksKnezevic · Jun 30, 2026 · Jun 30, 2026 · Jun 30, 2026 · mmanzoorTT
@@ -3,6 +3,8 @@
 # SPDX-FileCopyrightText: Portions (c) 2025 Tenstorrent AI ULC
 
 import contextlib
+import os
+import sys
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Optional, Union, cast
 
@@ -62,6 +64,21 @@ class TTConfig:
     # above it, prefills batch as usual. 0 = off; needs min_num_seqs < max.
     prefill_batch_threshold: int = 0
 
+    # KV-cache high-watermark for *fresh* prefill admission, as a fraction of
+    # the block pool (tt-xla: large-context concurrency thrash). AscendScheduler
+    # stops admitting NEW prefills once doing so would leave less than this
+    # fraction of the pool free, reserving headroom for in-flight requests to
+    # finish decoding instead of evicting (preempting) them and re-prefilling.
+    # Continuation chunks of already-started prefills and decode are NOT gated
+    # (decode may use the pool to 100%). A forward-progress guard always admits
+    # at least one prefill when nothing is running, so a single large prompt is
+    # never starved. 0.0 = off (legacy 1% watermark for all prefills).
+    # Default 0.25 (reserve 25% free => stop admitting above ~75% usage).
+    # Override at runtime with env var TT_XLA_PREFILL_KV_WATERMARK_PERCENT (a
+    # percent, e.g. 25), which takes precedence over additional_config.
+    # Resolved/validated in TTPlatform.check_and_update_config.
+    prefill_kv_watermark: float = 0.25
+
     batch_size: int = 1
     enable_precompile_all: bool = True
 
@@ -314,6 +331,23 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             raise ValueError(
                 "additional_config['prefill_batch_threshold'] must be >= 0."
             )
+
+        # Resolve prefill_kv_watermark to a concrete fraction the scheduler can
+        # read directly: default, then env override (percent), then validate.
+        # See TTConfig.prefill_kv_watermark.
+        if additional_config.get("prefill_kv_watermark") is None:
+            additional_config["prefill_kv_watermark"] = TTConfig.prefill_kv_watermark
+        env_wm = os.environ.get("TT_XLA_PREFILL_KV_WATERMARK_PERCENT")
-        env_wm = os.environ.get("TT_XLA_PREFILL_KV_WATERMARK_PERCENT")
+        env_wm = os.environ.get("TTXLA_PREFILL_KV_WATERMARK_PERCENT")
-        env_wm = os.environ.get("TT_XLA_PREFILL_KV_WATERMARK_PERCENT")
+        env_wm = os.environ.get("TTXLA_PREFILL_KV_WATERMARK_PERCENT")
+        if env_wm is not None:
+            additional_config["prefill_kv_watermark"] = float(env_wm) / 100.0
+        wm = float(additional_config["prefill_kv_watermark"])
+        if not (0.0 <= wm < 1.0):
+            raise ValueError(
+                "prefill_kv_watermark (TT_XLA_PREFILL_KV_WATERMARK_PERCENT / 100) "
+                f"must be in [0, 1); got {wm}."
+            )
+        additional_config["prefill_kv_watermark"] = wm
+
         vllm_config.additional_config = additional_config
 
         # Stash cpu_sampling so validate_request() can read it without

@@ -57,6 +57,9 @@ def __init__(
         add_cfg = vllm_config.additional_config or {}
         self.prefill_batch_threshold = int(add_cfg.get("prefill_batch_threshold") or 0)
         self.b1_min_num_seqs = int(add_cfg.get("min_num_seqs") or 0)
+        # Fresh-prefill KV-cache admission watermark (0.0 = off). See
+        # TTConfig.prefill_kv_watermark.
+        self.prefill_kv_watermark = float(add_cfg.get("prefill_kv_watermark") or 0.0)
 
     def schedule(self) -> SchedulerOutput:
         # Super's schedule handles chunked prefill which is schedule both prefill and decode in one request.
@@ -247,7 +250,26 @@ def skip_cur_request(req=request):
                         skip_cur_request()
                         continue
 
-            watermark = getattr(self.scheduler_config, "watermark", 0.01)
+            base_watermark = getattr(self.scheduler_config, "watermark", 0.01)
+            # Apply the high-watermark only to FRESH prefills
+            # (num_computed_tokens == 0); continuation chunks keep the base
+            # watermark so an in-flight prefill is never stranded. The
+            # forward-progress guard falls back to the base watermark when
+            # nothing is running or scheduled yet, so a single large prompt that
+            # exceeds the reserve still gets admitted instead of deadlocking.
+            # See TTConfig.prefill_kv_watermark.
+            nothing_scheduled_yet = not (
+                scheduled_new_reqs or scheduled_resumed_reqs or scheduled_running_reqs
+            )
+            force_progress = not self.running and nothing_scheduled_yet
+            if (
+                self.prefill_kv_watermark > 0.0
+                and request.num_computed_tokens == 0
+                and not force_progress
+            ):
+                watermark = self.prefill_kv_watermark
+            else:
+                watermark = base_watermark
             if not self._check_watermark_for_prefill(
                 request, num_new_tokens, blocks, watermark
             ):