llm_runner: plumb prefill temperature

mergennachin · mergennachin · commit 7fde821ef1f0 · 2026-06-12T13:10:32.000-07:00
Session-based serving drives generation as prefill plus token steps instead of one monolithic generate call. For that path to be correct, the first sampled token produced during prefill must honor the same sampling inputs as the rest of the decode loop; otherwise requests using temperature can silently start greedily and then switch behavior on later tokens.

This threads an optional temperature through TextPrefiller::prefill() and prefill_chunk(), and exposes TextTokenGenerator's existing logit-processor loop as apply_logit_processors() so token-step callers can reuse the same sampling preparation as generate(). It also adds TextTokenGenerator::is_eos() for single-step decode. The goal is to remove a divergence point before session-backed serving starts depending on these primitives.

Default behavior remains greedy, so existing callers that do not pass temperature keep the same semantics. The added tests focus on the new non-default path and on sharing the logit-processor logic rather than duplicating it.
diff --git a/extension/llm/runner/test/test_text_prefiller.cpp b/extension/llm/runner/test/test_text_prefiller.cpp
@@ -79,8 +79,8 @@ class TextPrefillerTest : public Test {
     MOCK_METHOD(
         ::executorch::runtime::Result<uint64_t>,
         prefill_chunk,
-        (std::vector<uint64_t>&, int64_t&),
-        ());
+        (std::vector<uint64_t>&, int64_t&, float),
+        (override));
   };
 
   // Create a mock TextPrefiller
@@ -112,9 +112,9 @@ TEST_F(TextPrefillerTest, PrefillCallsPrefillChunkOnceWhenPromptFits) {
   int64_t start_pos = 0;
 
   // Expect prefill_chunk to be called exactly once with the entire prompt
-  EXPECT_CALL(*prefiller, prefill_chunk(_, _))
+  EXPECT_CALL(*prefiller, prefill_chunk(_, _, _))
       .Times(1)
-      .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
+      .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos, float) {
         // Verify the tokens passed to prefill_chunk
         EXPECT_EQ(tokens.size(), prompt_tokens.size());
         for (size_t i = 0; i < tokens.size(); i++) {
@@ -217,14 +217,14 @@ TEST_F(TextPrefillerTest, PrefillHandlesPrefillChunkErrorsCorrectly) {
     InSequence seq;
 
     // First chunk: tokens [1, 2, 3] - succeeds
-    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
-        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, _))
+        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos, float) {
           return Result<uint64_t>(10);
         });
 
     // Second chunk: tokens [4, 5] - fails
-    EXPECT_CALL(*prefiller, prefill_chunk(_, _))
-        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos) {
+    EXPECT_CALL(*prefiller, prefill_chunk(_, _, _))
+        .WillOnce([&](std::vector<uint64_t>& tokens, int64_t& pos, float) {
           return Result<uint64_t>(Error::InvalidArgument);
         });
   }
diff --git a/extension/llm/runner/text_prefiller.cpp b/extension/llm/runner/text_prefiller.cpp
@@ -28,7 +28,8 @@ TextPrefiller::TextPrefiller(
 
 ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     std::vector<uint64_t>& prompt_tokens,
-    int64_t& start_pos) {
+    int64_t& start_pos,
+    float temperature) {
   ET_CHECK_MSG(!prompt_tokens.empty(), "Prompt cannot be null");
   if (!text_decoder_runner_->is_method_loaded()) {
     ET_CHECK_OK_OR_RETURN_ERROR(text_decoder_runner_->load());
@@ -54,8 +55,15 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
               num_tokens_to_prefill_with,
           prompt_tokens_to_process.begin());
 
-      // Process this chunk
-      auto chunk_result = prefill_chunk(prompt_tokens_to_process, start_pos);
+      // Process this chunk. Only the LAST chunk produces the first generated
+      // token, so apply `temperature` there; intermediate chunks just prefill.
+      const bool is_last_chunk =
+          num_tokens_to_process + num_tokens_to_prefill_with >=
+          num_prompt_tokens;
+      auto chunk_result = prefill_chunk(
+          prompt_tokens_to_process,
+          start_pos,
+          is_last_chunk ? temperature : 0.0f);
       ET_CHECK_OK_OR_RETURN_ERROR(chunk_result.error());
       cur_token = chunk_result.get();
 
@@ -65,13 +73,14 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill(
     return cur_token;
   } else {
     // If prompt tokens don't exceed max_seq_len_, process them directly
-    return prefill_chunk(prompt_tokens, start_pos);
+    return prefill_chunk(prompt_tokens, start_pos, temperature);
   }
 }
 
 ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
     std::vector<uint64_t>& prompt_tokens,
-    int64_t& start_pos) {
+    int64_t& start_pos,
+    float temperature) {
   // enable_parallel_prefill_ maybe set even when not using kv cache
   // When kv cache is not used, start pos is ignored
   int32_t num_prompt_tokens = prompt_tokens.size();
@@ -92,7 +101,8 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
         Info, "Prefill token result numel(): %zu", outputs_res.get().numel());
 
     start_pos += num_prompt_tokens;
-    cur_token = text_decoder_runner_->logits_to_token(outputs_res.get());
+    cur_token =
+        text_decoder_runner_->logits_to_token(outputs_res.get(), temperature);
   } else { // sequential prefill
     int64_t pos = 0; // position in the sequence
     // NOLINTNEXTLINE(facebook-hte-ParameterUncheckedArrayBounds)
@@ -128,7 +138,8 @@ ::executorch::runtime::Result<uint64_t> TextPrefiller::prefill_chunk(
       start_pos++;
     }
 
-    cur_token = text_decoder_runner_->logits_to_token(logits_tensor);
+    cur_token =
+        text_decoder_runner_->logits_to_token(logits_tensor, temperature);
   }
   return cur_token;
 }
diff --git a/extension/llm/runner/text_prefiller.h b/extension/llm/runner/text_prefiller.h
@@ -32,22 +32,28 @@ class ET_EXPERIMENTAL TextPrefiller {
    * tokenizer.
    * @param start_pos The starting position in KV cache of the input in the LLM
    * Module.
+   * @param temperature Sampling temperature for the first generated token
+   * (which is sampled here during prefill). Defaults to greedy (0.0).
    * @return The next token of the LLM Module after prefill.
    */
   virtual ::executorch::runtime::Result<uint64_t> prefill(
       std::vector<uint64_t>& prompt_tokens,
-      int64_t& start_pos);
+      int64_t& start_pos,
+      float temperature = 0.0f);
 
   /**
    * Helper method to prefill a chunk of tokens.
    * @param prompt_tokens The chunk of text prompt tokens to process.
    * @param start_pos The starting position in KV cache of the input in the LLM
    * Module.
+   * @param temperature Sampling temperature for the token produced by this
+   * chunk. Defaults to greedy (0.0).
    * @return The next token of the LLM Module after prefilling this chunk.
    */
   virtual ::executorch::runtime::Result<uint64_t> prefill_chunk(
       std::vector<uint64_t>& prompt_tokens,
-      int64_t& start_pos);
+      int64_t& start_pos,
+      float temperature = 0.0f);
 
   /**
    * Load the necessary resources for the TextPrefiller.
diff --git a/extension/llm/runner/text_token_generator.h b/extension/llm/runner/text_token_generator.h
@@ -55,6 +55,18 @@ class ET_EXPERIMENTAL TextTokenGenerator {
     return logit_processors_.size();
   }
 
+  /// Apply the registered logit processors (grammar/tool masks, penalties,
+  /// top-k/top-p, ...) to `logits` in order, before sampling; returns the
+  /// first processor error, if any. generate() calls this, and single-step
+  /// (session) decode callers should too, so the decode paths stay consistent.
+  inline ::executorch::runtime::Error apply_logit_processors(
+      executorch::aten::Tensor& logits) {
+    for (auto& processor : logit_processors_) {
+      ET_CHECK_OK_OR_RETURN_ERROR(processor->process(logits));
+    }
+    return ::executorch::runtime::Error::Ok;
+  }
+
   virtual ~TextTokenGenerator() = default;
 
   /**
@@ -126,9 +138,7 @@ class ET_EXPERIMENTAL TextTokenGenerator {
 
       prev_token = cur_token;
 
-      for (auto& processor : logit_processors_) {
-        ET_CHECK_OK_OR_RETURN_ERROR(processor->process(logits_tensor));
-      }
+      ET_CHECK_OK_OR_RETURN_ERROR(apply_logit_processors(logits_tensor));
 
       stats_->on_sampling_begin();
       cur_token =
@@ -180,6 +190,11 @@ class ET_EXPERIMENTAL TextTokenGenerator {
     should_stop_.store(true, std::memory_order_relaxed);
   }
 
+  /// Whether `token` is an end-of-sequence token (used by single-step decode).
+  inline bool is_eos(uint64_t token) const {
+    return eos_ids_->find(token) != eos_ids_->end();
+  }
+
   /**
    * Load the necessary resources for TextTokenGenerator.
    * This method should be called before using the generate() method.