Add a persistent LFM2.5 formatter helper for macOS integrations

seyeong-han · seyeong-han · commit ac38c8445ab8 · 2026-05-12T18:48:56.000-07:00
Long-lived companion process for the LFM2.5-350M MLX formatter, mirroring the parakeet helper introduced in #18861. Wraps an executorch::extension::llm::TextLLMRunner with the same JSON-line stdin/stdout protocol the macOS ExecuWhisper app already uses for the parakeet ASR helper, so the formatter model is loaded once and stays resident across requests (the runner is reset at the start of every request, so requests do not share KV-cache state).

Wire contract (kProtocolVersion=1):

Requests:
  {"type":"format", "version":1, "request_id":..., "prompt":..., "max_new_tokens":..., "temperature":...}
  {"type":"shutdown", "version":1}

Responses:
  {"type":"ready", "version":1}
  {"type":"status", "version":1, "request_id":..., "phase":..., "message":...}
  {"type":"result", "version":1, "request_id":..., "text":..., "stdout":..., "stderr":..., "tokens_per_second":<opt double>}
  {"type":"error", "version":1, "request_id":<opt>, "message":..., "details":<opt>}

The Swift counterpart lives at ExecuWhisper/Services/FormatterHelperProtocol.swift in meta-llama/internal-llama-cookbook (end-to-end-use-cases/ExecuWhisper).

Build via the new make target:

  cd ~/executorch
  make lfm_2_5_formatter-mlx

which produces:

  cmake-out/examples/models/llama/lfm25_formatter_helper
  cmake-out/examples/models/llama/mlx.metallib

The new lfm_2_5_formatter-mlx Make target depends on the existing lfm_2_5-mlx target; the llama-mlx CMake build preset's targets list now includes lfm25_formatter_helper alongside llama_main.
diff --git a/Makefile b/Makefile
@@ -91,7 +91,7 @@
 #
 # ==============================================================================
 
-.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu clean help
+.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx lfm_2_5_formatter-mlx llava-cpu gemma3-cuda gemma3-cpu clean help
 
 help:
 	@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -117,7 +117,8 @@ help:
 	@echo "  llama-cuda          - Build Llama runner with CUDA backend"
 	@echo "  llama-cuda-debug    - Build Llama runner with CUDA backend (debug mode)"
 	@echo "  llama-cpu           - Build Llama runner with CPU backend"
-	@echo "  lfm_2_5-mlx         - Build LFM2.5 runner with MLX backend"
+	@echo "  lfm_2_5-mlx         - Build LFM2.5 runner (llama_main) with MLX backend"
+	@echo "  lfm_2_5_formatter-mlx - Build LFM2.5 persistent formatter helper (lfm25_formatter_helper) with MLX backend"
 	@echo "  llava-cpu           - Build Llava runner with CPU backend"
 	@echo "  gemma3-cuda         - Build Gemma3 runner with CUDA backend"
 	@echo "  gemma3-cpu          - Build Gemma3 runner with CPU backend"
@@ -329,11 +330,20 @@ llama-cuda-debug:
 lfm_2_5-mlx:
 	@echo "==> Building and installing ExecuTorch with MLX..."
 	cmake --workflow --preset mlx-release
-	@echo "==> Building LFM2.5 runner with MLX..."
+	@echo "==> Building LFM2.5 runner + persistent formatter helper with MLX..."
 	cd examples/models/llama && cmake --workflow --preset llama-mlx
 	@echo ""
 	@echo "✓ Build complete!"
-	@echo "  Binary: cmake-out/examples/models/llama/llama_main"
+	@echo "  Binaries:"
+	@echo "    cmake-out/examples/models/llama/llama_main"
+	@echo "    cmake-out/examples/models/llama/lfm25_formatter_helper"
+
+# Convenience alias for the macOS ExecuWhisper integration, which only needs
+# the persistent formatter helper. All of the build work happens in the
+# `lfm_2_5-mlx` prerequisite: the `llama-mlx` build preset it runs lists
+# `lfm25_formatter_helper` alongside `llama_main`, so both binaries come out
+# of the same workflow. The recipe below only prints the helper's path.
+lfm_2_5_formatter-mlx: lfm_2_5-mlx
+	@echo "  Helper: cmake-out/examples/models/llama/lfm25_formatter_helper"
 
 llava-cpu:
 	@echo "==> Building and installing ExecuTorch..."
diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt
@@ -245,3 +245,55 @@ elseif(UNIX)
   set_target_properties(llama_main PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'")
 endif()
 # Windows doesn't need rpath - DLLs are found via standard Windows search order
+
+# -------------------------------------------------------------------------- #
+# LFM2.5 formatter helper (persistent companion process)
+#
+# Long-lived sibling of llama_main that wraps the same TextLLMRunner with a
+# JSON-line stdin/stdout protocol. The macOS ExecuWhisper app keeps this
+# binary warm across requests so the formatter model is loaded once per
+# session. Build with `make lfm_2_5_formatter-mlx` from the repo root, or
+# `cmake --workflow --preset llama-mlx` from this directory.
+# -------------------------------------------------------------------------- #
+
+# Helper entry point plus its protocol translation unit.
+set(_formatter_helper_srcs
+    lfm25_formatter_helper.cpp lfm25_formatter_helper_protocol.cpp
+)
+# Common include directories plus third-party/json/include. Presumably the
+# protocol sources use that JSON library -- confirm against
+# lfm25_formatter_helper_protocol.cpp.
+set(_formatter_helper_include_directories
+    ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include
+)
+
+add_executable(lfm25_formatter_helper ${_formatter_helper_srcs})
+
+# Release and MinSizeRel only: apply target_link_options_gc_sections (defined
+# elsewhere in the project) and, on non-Apple platforms, pass -s to the linker
+# to strip symbols from the binary.
+if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL
+                                          "MinSizeRel"
+)
+  target_link_options_gc_sections(lfm25_formatter_helper)
+  if(NOT APPLE)
+    target_link_options(lfm25_formatter_helper PRIVATE "LINKER:-s")
+  endif()
+endif()
+
+target_include_directories(
+  lfm25_formatter_helper PUBLIC ${_formatter_helper_include_directories}
+)
+# ${link_libraries} and ${_common_compile_options} are set earlier in this
+# file, outside this section.
+target_link_libraries(
+  lfm25_formatter_helper PUBLIC llama_runner ${link_libraries}
+)
+target_compile_options(
+  lfm25_formatter_helper PUBLIC ${_common_compile_options}
+)
+
+# Only when the MLX delegate target is part of this build.
+# executorch_target_copy_mlx_metallib is defined elsewhere; presumably it
+# places mlx.metallib next to the helper binary -- TODO confirm.
+if(TARGET mlxdelegate)
+  executorch_target_copy_mlx_metallib(lfm25_formatter_helper)
+endif()
+
+# Add the executable's own directory to the runtime library search path:
+# @loader_path on Apple, $ORIGIN on other UNIX platforms. No rpath is set for
+# any other platform.
+if(APPLE)
+  target_link_options(
+    lfm25_formatter_helper PRIVATE -Wl,-rpath,@loader_path
+  )
+elseif(UNIX)
+  set_target_properties(
+    lfm25_formatter_helper PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
+  )
+endif()
diff --git a/examples/models/llama/CMakePresets.json b/examples/models/llama/CMakePresets.json
@@ -94,7 +94,7 @@
             "name": "llama-mlx",
             "displayName": "Build Llama runner with MLX backend",
             "configurePreset": "llama-mlx",
-            "targets": ["llama_main"]
+            "targets": ["llama_main", "lfm25_formatter_helper"]
         }
     ],
     "workflowPresets": [
diff --git a/examples/models/llama/lfm25_formatter_helper.cpp b/examples/models/llama/lfm25_formatter_helper.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Persistent companion process for the LFM2.5 formatter model.
+//
+// Loads an `executorch::extension::llm::TextLLMRunner` once and stays alive,
+// reading newline-delimited JSON `format` requests from stdin and writing
+// `result`/`status`/`error` messages to stdout. The wire contract is in
+// lfm25_formatter_helper_protocol.h.
+//
+// Built and run by the macOS ExecuWhisper app via `FormatterBridge.swift`,
+// which expects the binary at
+//   ${EXECUTORCH_PATH}/cmake-out/examples/models/llama/lfm25_formatter_helper
+// and the companion shader bundle at
+//   $(dirname binary)/mlx.metallib
+
+#include <gflags/gflags.h>
+
+#include <chrono>
+#include <iostream>
+#include <optional>
+#include <stdexcept>
+#include <string>
+
+#include <executorch/extension/llm/runner/irunner.h>
+#include <executorch/extension/llm/runner/llm_runner_helper.h>
+#include <executorch/extension/llm/runner/stats.h>
+#include <executorch/extension/llm/runner/text_llm_runner.h>
+#include <executorch/runtime/platform/log.h>
+
+#include "lfm25_formatter_helper_protocol.h"
+
+// Command-line flags, parsed by gflags at the top of main().
+// Note: within this file tokenizer_config_path is only logged; it is not
+// passed to load_tokenizer() or to the runner factory.
+DEFINE_string(model_path, "model.pte", "Path to LFM2.5 formatter model (.pte).");
+DEFINE_string(
+    tokenizer_path,
+    "tokenizer.json",
+    "Path to the HuggingFace-format tokenizer.json file.");
+DEFINE_string(
+    tokenizer_config_path,
+    "tokenizer_config.json",
+    "Path to the HuggingFace-format tokenizer_config.json file (read by the "
+    "tokenizers crate when present in the same directory as tokenizer.json; "
+    "accepted here for symmetry with FormatterBridge.swift).");
+DEFINE_int32(
+    default_max_new_tokens,
+    256,
+    "Fallback max_new_tokens when a request omits it. The Swift bridge always "
+    "sets max_new_tokens, so this is mostly a safety net.");
+
+namespace {
+
+namespace fp = lfm25_formatter::helper_protocol;
+
+// Runs one format request through the already-loaded runner.
+//
+// The runner is reset first so the request does not depend on earlier ones.
+// Generated pieces are collected through the token callback and the latest
+// Stats snapshot is kept from the stats callback.
+//
+// Inputs:
+//   runner         - runner to reset and generate with.
+//   prompt         - passed to generate() unchanged.
+//   max_new_tokens - stored in GenerationConfig::max_new_tokens.
+//   temperature    - narrowed to float for GenerationConfig::temperature.
+//
+// Outputs (each is cleared/reset on entry, so they stay empty on failure):
+//   text_out              - concatenation of every piece from the callback.
+//   stdout_out            - "PyTorchObserver " followed by the stats JSON if
+//                           the stats callback fired, otherwise empty.
+//   stderr_out            - never written after the initial clear.
+//   tokens_per_second_out - num_generated_tokens * 1000 / (inference_end_ms -
+//                           inference_start_ms); left unset when no stats
+//                           arrived or either quantity is not positive.
+//
+// Throws std::runtime_error when generate() returns anything but Error::Ok.
+// The message carries the numeric Error value (decimal) so that whoever
+// surfaces what() can tell which failure occurred.
+void format_text(
+    executorch::extension::llm::TextLLMRunner& runner,
+    const std::string& prompt,
+    int max_new_tokens,
+    double temperature,
+    std::string& text_out,
+    std::string& stdout_out,
+    std::string& stderr_out,
+    std::optional<double>& tokens_per_second_out) {
+  text_out.clear();
+  stdout_out.clear();
+  stderr_out.clear();
+  tokens_per_second_out.reset();
+
+  // Reset KV cache + stats so each request is independent.
+  runner.reset();
+
+  executorch::extension::llm::GenerationConfig config;
+  config.echo = false;
+  config.ignore_eos = false;
+  config.max_new_tokens = max_new_tokens;
+  config.temperature = static_cast<float>(temperature);
+
+  std::string accumulated;
+  std::optional<executorch::extension::llm::Stats> last_stats;
+
+  // NOTE(review): this process uses stdout for its JSON-line protocol; confirm
+  // that generate() itself does not print tokens or stats to stdout.
+  const auto err = runner.generate(
+      prompt,
+      config,
+      [&](const std::string& token_text) { accumulated.append(token_text); },
+      [&](const executorch::extension::llm::Stats& stats) {
+        last_stats = stats;
+      });
+
+  if (err != ::executorch::runtime::Error::Ok) {
+    // Keep the original message as a prefix and append the error code so the
+    // failure is diagnosable from the message alone.
+    throw std::runtime_error(
+        "TextLLMRunner::generate returned non-Ok error code " +
+        std::to_string(static_cast<int>(err)));
+  }
+
+  text_out = std::move(accumulated);
+
+  if (last_stats.has_value()) {
+    stdout_out =
+        "PyTorchObserver " +
+        executorch::extension::llm::stats_to_json_string(*last_stats);
+
+    // Rate is measured over the whole inference window, not just decoding.
+    const long inference_ms =
+        last_stats->inference_end_ms - last_stats->inference_start_ms;
+    if (inference_ms > 0 && last_stats->num_generated_tokens > 0) {
+      tokens_per_second_out = static_cast<double>(
+                                  last_stats->num_generated_tokens) *
+          1000.0 / static_cast<double>(inference_ms);
+    }
+  }
+}
+
+} // namespace
+
+// Entry point: loads the tokenizer and runner once, announces readiness, then
+// serves requests read from stdin until shutdown, EOF, or a read error.
+//
+// Exit status:
+//   0 - a Shutdown request was received, or read_request failed without
+//       setting an error string (treated as clean EOF).
+//   1 - startup threw, the ready message could not be written, read_request
+//       failed with an error string, or a std::exception escaped the loop.
+//
+// Protocol messages are written to std::cout. The only text this function
+// writes to std::cerr directly is the ready-write failure notice.
+int main(int argc, char** argv) {
+  // Third argument (remove_flags) is true: gflags strips the flags it
+  // recognized from argv.
+  gflags::ParseCommandLineFlags(&argc, &argv, true);
+
+  // tokenizer_config_path is documented above; reference it so the symbol is
+  // not stripped, and so an unsupported value at least surfaces in the log.
+  if (!FLAGS_tokenizer_config_path.empty()) {
+    ET_LOG(
+        Info,
+        "Tokenizer config path: %s",
+        FLAGS_tokenizer_config_path.c_str());
+  }
+
+  try {
+    // One-time startup: tokenizer, then runner, then load(). Each failure is
+    // turned into an exception handled by the outer catch at the bottom.
+    auto tokenizer = ::executorch::extension::llm::load_tokenizer(
+        FLAGS_tokenizer_path);
+    if (!tokenizer || !tokenizer->is_loaded()) {
+      throw std::runtime_error(
+          "Failed to load tokenizer: " + FLAGS_tokenizer_path);
+    }
+
+    auto runner = ::executorch::extension::llm::create_text_llm_runner(
+        FLAGS_model_path, std::move(tokenizer));
+    if (!runner) {
+      throw std::runtime_error(
+          "Failed to construct TextLLMRunner from " + FLAGS_model_path);
+    }
+    if (runner->load() != ::executorch::runtime::Error::Ok) {
+      throw std::runtime_error(
+          "TextLLMRunner::load failed for " + FLAGS_model_path);
+    }
+
+    // The ready message is only sent after load() succeeded, so a client
+    // that waits for it knows the model is loaded.
+    if (!fp::write_message(std::cout, fp::encode_ready_message())) {
+      std::cerr << "Failed to write helper ready message." << std::endl;
+      return 1;
+    }
+
+    // Request loop. Unlike the ready write above, the return values of
+    // write_message are not checked anywhere inside this loop.
+    while (true) {
+      fp::Request request;
+      std::string request_error;
+      if (!fp::read_request(std::cin, &request, &request_error)) {
+        if (request_error.empty()) {
+          // Clean EOF on stdin — graceful shutdown.
+          return 0;
+        }
+        // A read failure with an error string is fatal: report it without a
+        // request id and exit non-zero.
+        fp::write_message(
+            std::cout,
+            fp::encode_error_message(
+                std::nullopt,
+                "Failed to read helper request",
+                request_error));
+        return 1;
+      }
+
+      if (request.type == fp::Request::Type::Shutdown) {
+        return 0;
+      }
+
+      // Every request that is not Shutdown is handled as a format request.
+      // request.format is dereferenced without a check, so read_request is
+      // assumed to populate it for such requests -- TODO confirm in the
+      // protocol implementation.
+      const auto& format_request = *request.format;
+      try {
+        if (format_request.prompt.empty()) {
+          throw std::runtime_error("Empty prompt.");
+        }
+
+        // Non-positive max_new_tokens falls back to the command-line default.
+        const int max_new_tokens = format_request.max_new_tokens > 0
+            ? format_request.max_new_tokens
+            : FLAGS_default_max_new_tokens;
+
+        // Progress notification sent before generation starts.
+        fp::write_message(
+            std::cout,
+            fp::encode_status_message(
+                format_request.request_id,
+                "formatting",
+                "Generating formatted text..."));
+
+        std::string text;
+        std::string stdout_payload;
+        std::string stderr_payload;
+        std::optional<double> tokens_per_second;
+        format_text(
+            *runner,
+            format_request.prompt,
+            max_new_tokens,
+            format_request.temperature,
+            text,
+            stdout_payload,
+            stderr_payload,
+            tokens_per_second);
+
+        fp::write_message(
+            std::cout,
+            fp::encode_result_message(
+                format_request.request_id,
+                text,
+                stdout_payload,
+                stderr_payload,
+                tokens_per_second));
+      } catch (const std::exception& e) {
+        // Per-request failure: report it against this request_id and keep
+        // serving further requests.
+        fp::write_message(
+            std::cout,
+            fp::encode_error_message(
+                format_request.request_id,
+                "Helper formatting failed",
+                e.what()));
+      }
+    }
+  } catch (const std::exception& e) {
+    // Reached for startup failures, and also for any std::exception that
+    // escapes the request loop (the loop sits inside this try block); both
+    // cases are reported with the startup message below.
+    fp::write_message(
+        std::cout,
+        fp::encode_error_message(
+            std::nullopt,
+            "Failed to start LFM2.5 formatter helper",
+            e.what()));
+    return 1;
+  }
+}
diff --git a/examples/models/llama/lfm25_formatter_helper_protocol.cpp b/examples/models/llama/lfm25_formatter_helper_protocol.cpp
diff --git a/examples/models/llama/lfm25_formatter_helper_protocol.h b/examples/models/llama/lfm25_formatter_helper_protocol.h

Original file line number	Diff line number	Diff line change
`@@ -94,7 +94,7 @@`
`94`	`94`	`"name": "llama-mlx",`
`95`	`95`	`"displayName": "Build Llama runner with MLX backend",`
`96`	`96`	`"configurePreset": "llama-mlx",`
`97`		`- "targets": ["llama_main"]`
	`97`	`+ "targets": ["llama_main", "lfm25_formatter_helper"]`
`98`	`98`	`}`
`99`	`99`	`],`
`100`	`100`	`"workflowPresets": [`