Skip to content

Commit ac38c84

Browse files
committed
Add a persistent LFM2.5 formatter helper for macOS integrations
Long-lived companion process for the LFM2.5-350M MLX formatter, mirroring the parakeet helper introduced in #18861. It wraps an executorch::extension::llm::TextLLMRunner with the same JSON-line stdin/stdout protocol that the macOS ExecuWhisper app already uses for the parakeet ASR helper, so the formatter model can stay loaded and KV-warm across requests.

Wire contract (kProtocolVersion=1):

Requests:
  {"type":"format", "version":1, "request_id":..., "prompt":..., "max_new_tokens":..., "temperature":...}
  {"type":"shutdown", "version":1}

Responses:
  {"type":"ready", "version":1}
  {"type":"status", "version":1, "request_id":..., "phase":..., "message":...}
  {"type":"result", "version":1, "request_id":..., "text":..., "stdout":..., "stderr":..., "tokens_per_second":<opt double>}
  {"type":"error", "version":1, "request_id":<opt>, "message":..., "details":<opt>}

The Swift counterpart lives at ExecuWhisper/Services/FormatterHelperProtocol.swift in meta-llama/internal-llama-cookbook (end-to-end-use-cases/ExecuWhisper).

Build via the make target added in this commit:

  cd ~/executorch
  make lfm_2_5_formatter-mlx

which produces:

  cmake-out/examples/models/llama/lfm25_formatter_helper
  cmake-out/examples/models/llama/mlx.metallib

The new lfm_2_5_formatter-mlx Make target depends on the existing lfm_2_5-mlx target; the llama-mlx CMake build preset's targets list now includes lfm25_formatter_helper alongside llama_main.
1 parent 46dbe46 commit ac38c84

6 files changed

Lines changed: 546 additions & 5 deletions

File tree

Makefile

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@
9191
#
9292
# ==============================================================================
9393

94-
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx llava-cpu gemma3-cuda gemma3-cpu clean help
94+
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx lfm_2_5_formatter-mlx llava-cpu gemma3-cuda gemma3-cpu clean help
9595

9696
help:
9797
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -117,7 +117,8 @@ help:
117117
@echo " llama-cuda - Build Llama runner with CUDA backend"
118118
@echo " llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)"
119119
@echo " llama-cpu - Build Llama runner with CPU backend"
120-
@echo " lfm_2_5-mlx - Build LFM2.5 runner with MLX backend"
120+
@echo " lfm_2_5-mlx - Build LFM2.5 runner (llama_main) with MLX backend"
121+
@echo " lfm_2_5_formatter-mlx - Build LFM2.5 persistent formatter helper (lfm25_formatter_helper) with MLX backend"
121122
@echo " llava-cpu - Build Llava runner with CPU backend"
122123
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
123124
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@@ -329,11 +330,20 @@ llama-cuda-debug:
329330
lfm_2_5-mlx:
330331
@echo "==> Building and installing ExecuTorch with MLX..."
331332
cmake --workflow --preset mlx-release
332-
@echo "==> Building LFM2.5 runner with MLX..."
333+
@echo "==> Building LFM2.5 runner + persistent formatter helper with MLX..."
333334
cd examples/models/llama && cmake --workflow --preset llama-mlx
334335
@echo ""
335336
@echo "✓ Build complete!"
336-
@echo " Binary: cmake-out/examples/models/llama/llama_main"
337+
@echo " Binaries:"
338+
@echo " cmake-out/examples/models/llama/llama_main"
339+
@echo " cmake-out/examples/models/llama/lfm25_formatter_helper"
340+
341+
# Alias for the macOS ExecuWhisper integration, which only consumes the
# persistent formatter helper. All of the build work happens in the
# lfm_2_5-mlx prerequisite: its `llama-mlx` build preset lists
# `lfm25_formatter_helper` next to `llama_main`, so this target only adds a
# line pointing at the helper binary.
lfm_2_5_formatter-mlx: lfm_2_5-mlx
	@echo " Helper: cmake-out/examples/models/llama/lfm25_formatter_helper"
337347

338348
llava-cpu:
339349
@echo "==> Building and installing ExecuTorch..."

examples/models/llama/CMakeLists.txt

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,3 +245,55 @@ elseif(UNIX)
245245
set_target_properties(llama_main PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'")
246246
endif()
247247
# Windows doesn't need rpath - DLLs are found via standard Windows search order
248+
249+
# ---------------------------------------------------------------------------- #
# lfm25_formatter_helper: persistent LFM2.5 formatter process
#
# A second executable built next to llama_main. It links the same llama_runner
# library and speaks a JSON-line protocol over stdin/stdout, so the macOS
# ExecuWhisper app can load the formatter model once per session and keep the
# process warm between requests.
#
# Build it with `make lfm_2_5_formatter-mlx` from the repo root, or with
# `cmake --workflow --preset llama-mlx` from this directory.
# ---------------------------------------------------------------------------- #

# Sources, plus the include path that provides the bundled JSON headers.
set(_formatter_helper_srcs lfm25_formatter_helper.cpp
                           lfm25_formatter_helper_protocol.cpp
)
set(_formatter_helper_include_directories
    ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include
)

add_executable(lfm25_formatter_helper ${_formatter_helper_srcs})

# Size-oriented link options for optimized builds only. The strip flag is
# skipped on Apple platforms.
if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
  target_link_options_gc_sections(lfm25_formatter_helper)
  if(NOT APPLE)
    target_link_options(lfm25_formatter_helper PRIVATE "LINKER:-s")
  endif()
endif()

# Same include paths, libraries and compile options as the other runner
# targets in this file, plus the JSON include directory set above.
target_include_directories(
  lfm25_formatter_helper PUBLIC ${_formatter_helper_include_directories}
)
target_link_libraries(
  lfm25_formatter_helper PUBLIC llama_runner ${link_libraries}
)
target_compile_options(
  lfm25_formatter_helper PUBLIC ${_common_compile_options}
)

# Only when the MLX delegate target exists. The helper function is defined
# elsewhere; presumably it places mlx.metallib beside the binary.
if(TARGET mlxdelegate)
  executorch_target_copy_mlx_metallib(lfm25_formatter_helper)
endif()

# Let the executable resolve shared libraries from its own directory.
# Windows needs no rpath.
if(APPLE)
  target_link_options(lfm25_formatter_helper PRIVATE -Wl,-rpath,@loader_path)
elseif(UNIX)
  set_target_properties(
    lfm25_formatter_helper PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
  )
endif()

examples/models/llama/CMakePresets.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@
9494
"name": "llama-mlx",
9595
"displayName": "Build Llama runner with MLX backend",
9696
"configurePreset": "llama-mlx",
97-
"targets": ["llama_main"]
97+
"targets": ["llama_main", "lfm25_formatter_helper"]
9898
}
9999
],
100100
"workflowPresets": [
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
// Persistent companion process for the LFM2.5 formatter model.
10+
//
11+
// Loads an `executorch::extension::llm::TextLLMRunner` once and stays alive,
12+
// reading newline-delimited JSON `format` requests from stdin and writing
13+
// `result`/`status`/`error` messages to stdout. The wire contract is in
14+
// lfm25_formatter_helper_protocol.h.
15+
//
16+
// Built and run by the macOS ExecuWhisper app via `FormatterBridge.swift`,
17+
// which expects the binary at
18+
// ${EXECUTORCH_PATH}/cmake-out/examples/models/llama/lfm25_formatter_helper
19+
// and the companion shader bundle at
20+
// $(dirname binary)/mlx.metallib
21+
22+
#include <gflags/gflags.h>
23+
24+
#include <chrono>
25+
#include <iostream>
26+
#include <optional>
27+
#include <stdexcept>
28+
#include <string>
29+
30+
#include <executorch/extension/llm/runner/irunner.h>
31+
#include <executorch/extension/llm/runner/llm_runner_helper.h>
32+
#include <executorch/extension/llm/runner/stats.h>
33+
#include <executorch/extension/llm/runner/text_llm_runner.h>
34+
#include <executorch/runtime/platform/log.h>
35+
36+
#include "lfm25_formatter_helper_protocol.h"
37+
38+
// ---- Command-line flags ----------------------------------------------------

// Model program that main() passes to create_text_llm_runner().
DEFINE_string(
    model_path,
    "model.pte",
    "Path to LFM2.5 formatter model (.pte).");

// Tokenizer file that main() passes to load_tokenizer().
DEFINE_string(
    tokenizer_path,
    "tokenizer.json",
    "Path to the HuggingFace-format tokenizer.json file.");

// Accepted so the command line matches what the Swift bridge passes; main()
// only logs this value.
DEFINE_string(
    tokenizer_config_path,
    "tokenizer_config.json",
    "Path to the HuggingFace-format tokenizer_config.json file (read by the "
    "tokenizers crate when present in the same directory as tokenizer.json; "
    "accepted here for symmetry with FormatterBridge.swift).");

// Used by main() whenever a request's max_new_tokens is not positive.
DEFINE_int32(
    default_max_new_tokens,
    256,
    "Fallback max_new_tokens when a request omits it. The Swift bridge always "
    "sets max_new_tokens, so this is mostly a safety net.");
54+
55+
namespace {
56+
57+
namespace fp = lfm25_formatter::helper_protocol;
58+
59+
// Run a single format request through the warm runner. Captures generated
60+
// text via the token callback, captures stats via the stats callback, and
61+
// computes a tokens_per_second figure for the response.
62+
void format_text(
63+
executorch::extension::llm::TextLLMRunner& runner,
64+
const std::string& prompt,
65+
int max_new_tokens,
66+
double temperature,
67+
std::string& text_out,
68+
std::string& stdout_out,
69+
std::string& stderr_out,
70+
std::optional<double>& tokens_per_second_out) {
71+
text_out.clear();
72+
stdout_out.clear();
73+
stderr_out.clear();
74+
tokens_per_second_out.reset();
75+
76+
// Reset KV cache + stats so each request is independent.
77+
runner.reset();
78+
79+
executorch::extension::llm::GenerationConfig config;
80+
config.echo = false;
81+
config.ignore_eos = false;
82+
config.max_new_tokens = max_new_tokens;
83+
config.temperature = static_cast<float>(temperature);
84+
85+
std::string accumulated;
86+
std::optional<executorch::extension::llm::Stats> last_stats;
87+
88+
const auto err = runner.generate(
89+
prompt,
90+
config,
91+
[&](const std::string& token_text) { accumulated.append(token_text); },
92+
[&](const executorch::extension::llm::Stats& stats) {
93+
last_stats = stats;
94+
});
95+
96+
if (err != ::executorch::runtime::Error::Ok) {
97+
throw std::runtime_error(
98+
"TextLLMRunner::generate returned non-Ok error code");
99+
}
100+
101+
text_out = std::move(accumulated);
102+
103+
if (last_stats.has_value()) {
104+
stdout_out =
105+
"PyTorchObserver " +
106+
executorch::extension::llm::stats_to_json_string(*last_stats);
107+
108+
const long inference_ms =
109+
last_stats->inference_end_ms - last_stats->inference_start_ms;
110+
if (inference_ms > 0 && last_stats->num_generated_tokens > 0) {
111+
tokens_per_second_out = static_cast<double>(
112+
last_stats->num_generated_tokens) *
113+
1000.0 / static_cast<double>(inference_ms);
114+
}
115+
}
116+
}
117+
118+
} // namespace
119+
120+
int main(int argc, char** argv) {
121+
gflags::ParseCommandLineFlags(&argc, &argv, true);
122+
123+
// tokenizer_config_path is documented above; reference it so the symbol is
124+
// not stripped, and so an unsupported value at least surfaces in the log.
125+
if (!FLAGS_tokenizer_config_path.empty()) {
126+
ET_LOG(
127+
Info,
128+
"Tokenizer config path: %s",
129+
FLAGS_tokenizer_config_path.c_str());
130+
}
131+
132+
try {
133+
auto tokenizer = ::executorch::extension::llm::load_tokenizer(
134+
FLAGS_tokenizer_path);
135+
if (!tokenizer || !tokenizer->is_loaded()) {
136+
throw std::runtime_error(
137+
"Failed to load tokenizer: " + FLAGS_tokenizer_path);
138+
}
139+
140+
auto runner = ::executorch::extension::llm::create_text_llm_runner(
141+
FLAGS_model_path, std::move(tokenizer));
142+
if (!runner) {
143+
throw std::runtime_error(
144+
"Failed to construct TextLLMRunner from " + FLAGS_model_path);
145+
}
146+
if (runner->load() != ::executorch::runtime::Error::Ok) {
147+
throw std::runtime_error(
148+
"TextLLMRunner::load failed for " + FLAGS_model_path);
149+
}
150+
151+
if (!fp::write_message(std::cout, fp::encode_ready_message())) {
152+
std::cerr << "Failed to write helper ready message." << std::endl;
153+
return 1;
154+
}
155+
156+
while (true) {
157+
fp::Request request;
158+
std::string request_error;
159+
if (!fp::read_request(std::cin, &request, &request_error)) {
160+
if (request_error.empty()) {
161+
// Clean EOF on stdin — graceful shutdown.
162+
return 0;
163+
}
164+
fp::write_message(
165+
std::cout,
166+
fp::encode_error_message(
167+
std::nullopt,
168+
"Failed to read helper request",
169+
request_error));
170+
return 1;
171+
}
172+
173+
if (request.type == fp::Request::Type::Shutdown) {
174+
return 0;
175+
}
176+
177+
const auto& format_request = *request.format;
178+
try {
179+
if (format_request.prompt.empty()) {
180+
throw std::runtime_error("Empty prompt.");
181+
}
182+
183+
const int max_new_tokens = format_request.max_new_tokens > 0
184+
? format_request.max_new_tokens
185+
: FLAGS_default_max_new_tokens;
186+
187+
fp::write_message(
188+
std::cout,
189+
fp::encode_status_message(
190+
format_request.request_id,
191+
"formatting",
192+
"Generating formatted text..."));
193+
194+
std::string text;
195+
std::string stdout_payload;
196+
std::string stderr_payload;
197+
std::optional<double> tokens_per_second;
198+
format_text(
199+
*runner,
200+
format_request.prompt,
201+
max_new_tokens,
202+
format_request.temperature,
203+
text,
204+
stdout_payload,
205+
stderr_payload,
206+
tokens_per_second);
207+
208+
fp::write_message(
209+
std::cout,
210+
fp::encode_result_message(
211+
format_request.request_id,
212+
text,
213+
stdout_payload,
214+
stderr_payload,
215+
tokens_per_second));
216+
} catch (const std::exception& e) {
217+
fp::write_message(
218+
std::cout,
219+
fp::encode_error_message(
220+
format_request.request_id,
221+
"Helper formatting failed",
222+
e.what()));
223+
}
224+
}
225+
} catch (const std::exception& e) {
226+
fp::write_message(
227+
std::cout,
228+
fp::encode_error_message(
229+
std::nullopt,
230+
"Failed to start LFM2.5 formatter helper",
231+
e.what()));
232+
return 1;
233+
}
234+
}

0 commit comments

Comments
 (0)