diff --git a/CMakePresets.json b/CMakePresets.json index 99a0ebee12c..5ae9da1fefe 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -336,7 +336,10 @@ "CMAKE_BUILD_TYPE": "Release", "CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out", "ET_MLX_ENABLE_OP_LOGGING": "OFF", - "ET_MIN_LOG_LEVEL": "Error" + "ET_MIN_LOG_LEVEL": "Error", + "EXECUTORCH_BUILD_KERNELS_LLM": "ON", + "EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON", + "EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON" } }, { diff --git a/Makefile b/Makefile index ba61dddce44..5fe8793fe0a 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ # # ============================================================================== -.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help +.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx lfm_2_5_formatter-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help help: @echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make \`. Available targets:" @@ -123,6 +123,8 @@ help: @echo " llama-cuda - Build Llama runner with CUDA backend" @echo " llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)" @echo " llama-cpu - Build Llama runner with CPU backend" + @echo " lfm_2_5-mlx - Build LFM2.5 runner (llama_main) with MLX backend" + @echo " lfm_2_5_formatter-mlx - Build LFM2.5 persistent formatter helper (lfm25_formatter_helper) with MLX backend" @echo " llava-cpu - Build Llava runner with CPU backend" @echo " gemma3-cuda - Build Gemma3 runner with CUDA backend" @echo " gemma3-cpu - Build Gemma3 runner with CPU backend" @@ -372,6 +374,24 @@ llama-cuda-debug: @echo "✓ Build complete!" @echo " Binary: cmake-out/examples/models/llama/llama_main" +lfm_2_5-mlx: + @echo "==> Building and installing ExecuTorch with MLX..." + cmake --workflow --preset mlx-release + @echo "==> Building LFM2.5 runner + persistent formatter helper with MLX..." + cd examples/models/llama && cmake --workflow --preset llama-mlx + @echo "" + @echo "✓ Build complete!" + @echo " Binaries:" + @echo " cmake-out/examples/models/llama/llama_main" + @echo " cmake-out/examples/models/llama/lfm25_formatter_helper" + +# Same workflow as lfm_2_5-mlx; named target for the macOS ExecuWhisper +# integration which only needs the persistent formatter helper. Both targets +# rely on the `llama-mlx` build preset, which already lists +# `lfm25_formatter_helper` alongside `llama_main`. +lfm_2_5_formatter-mlx: lfm_2_5-mlx + @echo " Helper: cmake-out/examples/models/llama/lfm25_formatter_helper" + llava-cpu: @echo "==> Building and installing ExecuTorch..." 
cmake --workflow --preset llm-release diff --git a/examples/models/lfm2/README.md b/examples/models/lfm2/README.md index 4f52c576442..c740456f8dc 100644 --- a/examples/models/lfm2/README.md +++ b/examples/models/lfm2/README.md @@ -3,6 +3,10 @@ [LFM2.5](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) is an updated version with improved training (28T tokens vs 10T) and extended context length support (32K tokens). +Pre-exported ExecuTorch MLX artifacts for LFM2.5 350M and 1.2B are available +on the Hugging Face Hub at +[younghan-meta/LFM2.5-ExecuTorch-MLX](https://huggingface.co/younghan-meta/LFM2.5-ExecuTorch-MLX). + ## Instructions LFM2 uses the same example code as optimized Llama model, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details. @@ -47,6 +51,24 @@ python -m extension.llm.export.export_llm \ +export.output_name="lfm2_5_1_2b_8da4w.pte" ``` +Export LFM2.5 350M to MLX on Apple Silicon, quantized with 4-bit weights: +``` +python -m extension.llm.export.export_llm \ + --config examples/models/lfm2/config/lfm2_mlx_4w.yaml \ + +base.model_class="lfm2_5_350m" \ + +base.params="examples/models/lfm2/config/lfm2_5_350m_config.json" \ + +export.output_name="lfm2_5_350m_mlx_4w.pte" +``` + +Export LFM2.5 1.2B to MLX on Apple Silicon, quantized with 4-bit weights: +``` +python -m extension.llm.export.export_llm \ + --config examples/models/lfm2/config/lfm2_mlx_4w.yaml \ + +base.model_class="lfm2_5_1_2b" \ + +base.params="examples/models/lfm2/config/lfm2_5_1_2b_config.json" \ + +export.output_name="lfm2_5_1_2b_mlx_4w.pte" +``` + To export with extended context (e.g., 2048 tokens): ``` python -m extension.llm.export.export_llm \ @@ -58,6 +80,17 @@ python -m extension.llm.export.export_llm \ +export.output_name="lfm2_5_1_2b_8da4w.pte" ``` ### Example run +For MLX on Apple Silicon, build or install ExecuTorch with MLX enabled. The +easiest local path is: +``` +conda activate et-mlx +python install_executorch.py +xcrun -sdk macosx --find metal +``` + +The `metal` command must resolve to an Xcode path, not fail under standalone +Command Line Tools. 
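As a quick preflight before running the MLX exports above, the toolchain requirement can be checked from a script. The snippet below is an illustrative sketch, not part of this change: it only wraps the same `xcrun -sdk macosx --find metal` command shown above, and the "path inside an Xcode install" substring check is an assumption, not an ExecuTorch API.

```python
# Illustrative MLX/Metal toolchain preflight (not part of this diff).
# Wraps the `xcrun -sdk macosx --find metal` command from the step above and
# fails loudly when only the standalone Command Line Tools are installed.
import subprocess
import sys


def check_metal_toolchain() -> str:
    try:
        result = subprocess.run(
            ["xcrun", "-sdk", "macosx", "--find", "metal"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as exc:
        sys.exit(f"`metal` not found; a full Xcode install is required: {exc}")
    path = result.stdout.strip()
    # Assumption: a usable toolchain resolves inside an Xcode.app bundle,
    # e.g. /Applications/Xcode.app/Contents/Developer/...; the standalone
    # Command Line Tools either fail above or resolve elsewhere.
    if "Xcode" not in path:
        sys.exit(f"`metal` resolved to {path}, which does not look like Xcode")
    return path


if __name__ == "__main__":
    print("metal compiler:", check_metal_toolchain())
```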
+ With ExecuTorch pybindings: ``` python -m examples.models.llama.runner.native \ @@ -72,7 +105,31 @@ python -m examples.models.llama.runner.native \ --temperature 0.3 ``` -With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner): +With ExecuTorch pybindings and an LFM2.5 MLX export: +``` +python -m examples.models.llama.runner.native \ + --model lfm2_5_350m \ + --pte lfm2_5_350m_mlx_4w.pte \ + --tokenizer ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots//tokenizer.json \ + --tokenizer_config ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots//tokenizer_config.json \ + --prompt "<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \ + --params examples/models/lfm2/config/lfm2_5_350m_config.json \ + --max_len 128 \ + -kv \ + --temperature 0.3 +``` + +Find the Hugging Face cache snapshot directory with: +``` +python - <<'PY' +from pathlib import Path +root = Path.home() / ".cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots" +for path in root.glob("*/tokenizer.json"): + print(path.parent) +PY +``` + +With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) for general runner details): ``` cmake-out/examples/models/llama/llama_main \ --model_path lfm2_700m_8da4w.pte \ @@ -81,4 +138,18 @@ cmake-out/examples/models/llama/llama_main \ --temperature 0.3 ``` +Build the C++ runner with MLX support for LFM2.5: +``` +make lfm_2_5-mlx +``` + +Then run an LFM2.5 MLX export with the C++ runner: +``` +cmake-out/examples/models/llama/llama_main \ + --model_path lfm2_5_350m_mlx_4w.pte \ + --tokenizer_path ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots//tokenizer.json \ + --prompt="<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \ + --temperature 0.3 +``` + To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section. 
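The pybindings and C++ runner invocations above repeat the same chat template and a tokenizer path buried in the Hugging Face cache. As a small illustration (not part of this diff), the sketch below assembles both in one place; the cache directory name and the choice of the 350M model are assumptions carried over from the examples above.

```python
# Illustrative helper (not part of this diff): builds the LFM2.5 chat prompt
# used in the runner examples above and locates tokenizer.json in the local
# Hugging Face cache, so both can be pasted into the command lines shown
# earlier.
from pathlib import Path

CHAT_TEMPLATE = (
    "<|startoftext|><|im_start|>user\n"
    "{message}<|im_end|>\n"
    "<|im_start|>assistant\n"
)


def find_tokenizer(repo_dir: str = "models--LiquidAI--LFM2.5-350M") -> Path:
    snapshots = Path.home() / ".cache/huggingface/hub" / repo_dir / "snapshots"
    matches = sorted(snapshots.glob("*/tokenizer.json"))
    if not matches:
        raise FileNotFoundError(f"no tokenizer.json under {snapshots}")
    return matches[-1]  # if several snapshots are cached, pick one


if __name__ == "__main__":
    print("--tokenizer_path", find_tokenizer())
    print("--prompt", repr(CHAT_TEMPLATE.format(message="Who are you?")))
```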
diff --git a/examples/models/lfm2/config/lfm2_5_350m_config.json b/examples/models/lfm2/config/lfm2_5_350m_config.json new file mode 100644 index 00000000000..975ccbccca7 --- /dev/null +++ b/examples/models/lfm2/config/lfm2_5_350m_config.json @@ -0,0 +1,33 @@ +{ + "dim": 1024, + "ffn_dim_multiplier": 1, + "hidden_dim": 4608, + "n_heads": 16, + "n_kv_heads": 8, + "n_layers": 16, + "norm_eps": 1e-5, + "rope_theta": 1000000.0, + "use_scaled_rope": false, + "vocab_size": 65536, + "use_hf_rope": true, + "use_qk_norm": true, + "qk_norm_before_rope": true, + "layer_types": [ + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv" + ] +} diff --git a/examples/models/lfm2/config/lfm2_mlx_4w.yaml b/examples/models/lfm2/config/lfm2_mlx_4w.yaml new file mode 100644 index 00000000000..fa7b16fd418 --- /dev/null +++ b/examples/models/lfm2/config/lfm2_mlx_4w.yaml @@ -0,0 +1,15 @@ +base: + metadata: '{"get_bos_id": 1, "get_eos_ids":[7]}' + +model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: bf16 + +quantization: + qmode: 4w + group_size: 64 + +backend: + mlx: + enabled: True diff --git a/examples/models/lfm2/test_lfm2_5_mlx.py b/examples/models/lfm2/test_lfm2_5_mlx.py new file mode 100644 index 00000000000..598de11315c --- /dev/null +++ b/examples/models/lfm2/test_lfm2_5_mlx.py @@ -0,0 +1,102 @@ +import ast +import json +from pathlib import Path + +from omegaconf import OmegaConf + + +REPO_ROOT = Path(__file__).resolve().parents[3] +CONFIG_DIR = REPO_ROOT / "examples" / "models" / "lfm2" / "config" +EXPORT_LLAMA_LIB = REPO_ROOT / "examples" / "models" / "llama" / "export_llama_lib.py" +LLM_CONFIG = REPO_ROOT / "extension" / "llm" / "export" / "config" / "llm_config.py" + + +def _load_json_config(name: str) -> dict: + with open(CONFIG_DIR / name, "r") as f: + return json.load(f) + + +def _module_ast(path: Path) -> ast.Module: + return ast.parse(path.read_text()) + + +def _literal_assignment(module: ast.Module, name: str): + for node in module.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == name: + return ast.literal_eval(node.value) + raise AssertionError(f"{name} not found") + + +def _class_string_assignments(module: ast.Module, class_name: str) -> dict[str, str]: + for node in module.body: + if isinstance(node, ast.ClassDef) and node.name == class_name: + values = {} + for stmt in node.body: + if ( + isinstance(stmt, ast.Assign) + and len(stmt.targets) == 1 + and isinstance(stmt.targets[0], ast.Name) + ): + values[stmt.targets[0].id] = ast.literal_eval(stmt.value) + return values + raise AssertionError(f"{class_name} not found") + + +def test_lfm2_5_models_are_registered() -> None: + export_module = _module_ast(EXPORT_LLAMA_LIB) + model_types = _class_string_assignments(_module_ast(LLM_CONFIG), "ModelType") + executor_defined_models = _literal_assignment( + export_module, "EXECUTORCH_DEFINED_MODELS" + ) + hf_repo_ids = _literal_assignment(export_module, "HUGGING_FACE_REPO_IDS") + + assert "lfm2_5_350m" in executor_defined_models + assert "lfm2_5_1_2b" in executor_defined_models + assert model_types["lfm2_5_350m"] == "lfm2_5_350m" + assert model_types["lfm2_5_1_2b"] == "lfm2_5_1_2b" + assert hf_repo_ids["lfm2_5_350m"] == "LiquidAI/LFM2.5-350M" + assert hf_repo_ids["lfm2_5_1_2b"] == "LiquidAI/LFM2.5-1.2B-Instruct" + + +def 
test_lfm2_5_architecture_configs_match_expected_shapes() -> None: + expected = { + "lfm2_5_350m_config.json": { + "dim": 1024, + "hidden_dim": 4608, + "n_heads": 16, + "n_kv_heads": 8, + }, + "lfm2_5_1_2b_config.json": { + "dim": 2048, + "hidden_dim": 8192, + "n_heads": 32, + "n_kv_heads": 8, + }, + } + + for filename, expected_fields in expected.items(): + cfg = _load_json_config(filename) + for key, value in expected_fields.items(): + assert cfg[key] == value + assert cfg["n_layers"] == 16 + assert len(cfg["layer_types"]) == cfg["n_layers"] + assert cfg["layer_types"].count("full_attention") == 6 + assert cfg["layer_types"].count("conv") == 10 + assert cfg["vocab_size"] == 65536 + assert cfg["rope_theta"] == 1000000.0 + assert cfg["use_hf_rope"] is True + assert cfg["use_qk_norm"] is True + assert cfg["qk_norm_before_rope"] is True + + +def test_lfm2_mlx_config_enables_mlx_backend() -> None: + cfg = OmegaConf.load(CONFIG_DIR / "lfm2_mlx_4w.yaml") + assert cfg.base.metadata == '{"get_bos_id": 1, "get_eos_ids":[7]}' + assert cfg.model.use_kv_cache is True + assert cfg.model.use_sdpa_with_kv_cache is True + assert cfg.model.dtype_override == "bf16" + assert cfg.quantization.qmode == "4w" + assert cfg.quantization.group_size == 64 + assert cfg.backend.mlx.enabled is True diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 6d5b5cc2566..a8c940ee228 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -261,3 +261,55 @@ elseif(UNIX) set_target_properties(llama_main PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'") endif() # Windows doesn't need rpath - DLLs are found via standard Windows search order + +# -------------------------------------------------------------------------- # +# LFM2.5 formatter helper (persistent companion process) +# +# Long-lived sibling of llama_main that wraps the same TextLLMRunner with a +# JSON-line stdin/stdout protocol. The macOS ExecuWhisper app keeps this +# binary warm across requests so the formatter model is loaded once per +# session. Build with `make lfm_2_5_formatter-mlx` from the repo root, or +# `cmake --workflow --preset llama-mlx` from this directory. 
+# -------------------------------------------------------------------------- # + +set(_formatter_helper_srcs + lfm25_formatter_helper.cpp lfm25_formatter_helper_protocol.cpp +) +set(_formatter_helper_include_directories + ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include +) + +add_executable(lfm25_formatter_helper ${_formatter_helper_srcs}) + +if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL + "MinSizeRel" +) + target_link_options_gc_sections(lfm25_formatter_helper) + if(NOT APPLE) + target_link_options(lfm25_formatter_helper PRIVATE "LINKER:-s") + endif() +endif() + +target_include_directories( + lfm25_formatter_helper PUBLIC ${_formatter_helper_include_directories} +) +target_link_libraries( + lfm25_formatter_helper PUBLIC llama_runner ${link_libraries} +) +target_compile_options( + lfm25_formatter_helper PUBLIC ${_common_compile_options} +) + +if(TARGET mlxdelegate) + executorch_target_copy_mlx_metallib(lfm25_formatter_helper) +endif() + +if(APPLE) + target_link_options( + lfm25_formatter_helper PRIVATE -Wl,-rpath,@loader_path + ) +elseif(UNIX) + set_target_properties( + lfm25_formatter_helper PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" + ) +endif() diff --git a/examples/models/llama/CMakePresets.json b/examples/models/llama/CMakePresets.json index f52b289523f..735f4e3907e 100644 --- a/examples/models/llama/CMakePresets.json +++ b/examples/models/llama/CMakePresets.json @@ -48,6 +48,21 @@ "string": "${hostSystemName}", "list": ["Linux", "Windows"] } + }, + { + "name": "llama-mlx", + "displayName": "Llama runner with MLX backend", + "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/llama", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out", + "EXECUTORCH_BUILD_MLX": "ON" + }, + "condition": { + "lhs": "${hostSystemName}", + "type": "equals", + "rhs": "Darwin" + } } ], "buildPresets": [ @@ -74,6 +89,12 @@ "displayName": "Build Llama runner with CUDA backend", "configurePreset": "llama-cuda", "targets": ["llama_main"] + }, + { + "name": "llama-mlx", + "displayName": "Build Llama runner with MLX backend", + "configurePreset": "llama-mlx", + "targets": ["llama_main", "lfm25_formatter_helper"] } ], "workflowPresets": [ @@ -132,6 +153,20 @@ "name": "llama-cuda" } ] + }, + { + "name": "llama-mlx", + "displayName": "Configure and build Llama runner with MLX backend", + "steps": [ + { + "type": "configure", + "name": "llama-mlx" + }, + { + "type": "build", + "name": "llama-mlx" + } + ] } ] } diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 46c9113a211..d33953f8c41 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -115,6 +115,7 @@ "lfm2_350m", # hybrid "lfm2_700m", # hybrid "lfm2_1_2b", # hybrid + "lfm2_5_350m", # hybrid "lfm2_5_1_2b", # hybrid ] TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] @@ -133,6 +134,7 @@ "lfm2_350m": "LiquidAI/LFM2-350M", "lfm2_700m": "LiquidAI/LFM2-700M", "lfm2_1_2b": "LiquidAI/LFM2-1.2B", + "lfm2_5_350m": "LiquidAI/LFM2.5-350M", "lfm2_5_1_2b": "LiquidAI/LFM2.5-1.2B-Instruct", } diff --git a/examples/models/llama/lfm25_formatter_helper.cpp b/examples/models/llama/lfm25_formatter_helper.cpp new file mode 100644 index 00000000000..d1683a5a168 --- /dev/null +++ b/examples/models/llama/lfm25_formatter_helper.cpp @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Persistent companion process for the LFM2.5 formatter model. +// +// Loads an `executorch::extension::llm::TextLLMRunner` once and stays alive, +// reading newline-delimited JSON `format` requests from stdin and writing +// `result`/`status`/`error` messages to stdout. The wire contract is in +// lfm25_formatter_helper_protocol.h. +// +// Built and run by the macOS ExecuWhisper app via `FormatterBridge.swift`, +// which expects the binary at +// ${EXECUTORCH_PATH}/cmake-out/examples/models/llama/lfm25_formatter_helper +// and the companion shader bundle at +// $(dirname binary)/mlx.metallib + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "lfm25_formatter_helper_protocol.h" + +DEFINE_string(model_path, "model.pte", "Path to LFM2.5 formatter model (.pte)."); +DEFINE_string( + tokenizer_path, + "tokenizer.json", + "Path to the HuggingFace-format tokenizer.json file."); +DEFINE_string( + tokenizer_config_path, + "tokenizer_config.json", + "Path to the HuggingFace-format tokenizer_config.json file (read by the " + "tokenizers crate when present in the same directory as tokenizer.json; " + "accepted here for symmetry with FormatterBridge.swift)."); +DEFINE_int32( + default_max_new_tokens, + 256, + "Fallback max_new_tokens when a request omits it. The Swift bridge always " + "sets max_new_tokens, so this is mostly a safety net."); + +namespace { + +namespace fp = lfm25_formatter::helper_protocol; + +// Run a single format request through the warm runner. Captures generated +// text via the token callback, captures stats via the stats callback, and +// computes a tokens_per_second figure for the response. +void format_text( + executorch::extension::llm::TextLLMRunner& runner, + const std::string& prompt, + int max_new_tokens, + double temperature, + std::string& text_out, + std::string& stdout_out, + std::string& stderr_out, + std::optional& tokens_per_second_out) { + text_out.clear(); + stdout_out.clear(); + stderr_out.clear(); + tokens_per_second_out.reset(); + + // Reset KV cache + stats so each request is independent. + runner.reset(); + + executorch::extension::llm::GenerationConfig config; + config.echo = false; + config.ignore_eos = false; + config.max_new_tokens = max_new_tokens; + config.temperature = static_cast(temperature); + + std::string accumulated; + std::optional last_stats; + + // The TextLLMRunner's text generator invokes the token callback for every + // produced token, including the EOS token (id 7 = "<|im_end|>") that + // signals end-of-generation. Without filtering, the literal "<|im_end|>" + // string ends up in the user-visible output. Filter known stop strings + // here so the rest of the pipeline doesn't have to. + static const std::vector kStopStrings = { + "<|im_end|>", "<|endoftext|>"}; + + // The runner unconditionally prints every generated token and a final + // PyTorchObserver stats line to stdout (see + // extension/llm/runner/text_llm_runner.cpp). That conflicts with our + // JSON-line wire protocol, which also writes to stdout, because the parent + // process treats every stdout line as a protocol message. Silence stdout + // for the duration of generate() by redirecting fd 1 to /dev/null, then + // restore the parent-facing pipe before we emit the protocol response. 
+ std::cout.flush(); + std::fflush(stdout); + int saved_stdout_fd = ::dup(STDOUT_FILENO); + int devnull_fd = ::open("/dev/null", O_WRONLY); + if (saved_stdout_fd >= 0 && devnull_fd >= 0) { + ::dup2(devnull_fd, STDOUT_FILENO); + ::close(devnull_fd); + } + + const auto err = runner.generate( + prompt, + config, + [&](const std::string& token_text) { + for (const auto& stop : kStopStrings) { + if (token_text == stop) { + return; + } + } + accumulated.append(token_text); + }, + [&](const executorch::extension::llm::Stats& stats) { + last_stats.emplace(stats); + }); + + // Restore the parent-facing stdout pipe so subsequent protocol writes + // (status, result, error) reach the parent process. + std::fflush(stdout); + if (saved_stdout_fd >= 0) { + ::dup2(saved_stdout_fd, STDOUT_FILENO); + ::close(saved_stdout_fd); + } + + if (err != ::executorch::runtime::Error::Ok) { + throw std::runtime_error( + "TextLLMRunner::generate returned non-Ok error code"); + } + + text_out = std::move(accumulated); + + if (last_stats.has_value()) { + stdout_out = + "PyTorchObserver " + + executorch::extension::llm::stats_to_json_string(*last_stats); + + const long inference_ms = + last_stats->inference_end_ms - last_stats->inference_start_ms; + if (inference_ms > 0 && last_stats->num_generated_tokens > 0) { + tokens_per_second_out = static_cast( + last_stats->num_generated_tokens) * + 1000.0 / static_cast(inference_ms); + } + } +} + +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // tokenizer_config_path is documented above; reference it so the symbol is + // not stripped, and so an unsupported value at least surfaces in the log. + if (!FLAGS_tokenizer_config_path.empty()) { + ET_LOG( + Info, + "Tokenizer config path: %s", + FLAGS_tokenizer_config_path.c_str()); + } + + try { + auto tokenizer = ::executorch::extension::llm::load_tokenizer( + FLAGS_tokenizer_path); + if (!tokenizer || !tokenizer->is_loaded()) { + throw std::runtime_error( + "Failed to load tokenizer: " + FLAGS_tokenizer_path); + } + + auto runner = ::executorch::extension::llm::create_text_llm_runner( + FLAGS_model_path, std::move(tokenizer)); + if (!runner) { + throw std::runtime_error( + "Failed to construct TextLLMRunner from " + FLAGS_model_path); + } + if (runner->load() != ::executorch::runtime::Error::Ok) { + throw std::runtime_error( + "TextLLMRunner::load failed for " + FLAGS_model_path); + } + + if (!fp::write_message(std::cout, fp::encode_ready_message())) { + std::cerr << "Failed to write helper ready message." << std::endl; + return 1; + } + + while (true) { + fp::Request request; + std::string request_error; + if (!fp::read_request(std::cin, &request, &request_error)) { + if (request_error.empty()) { + // Clean EOF on stdin — graceful shutdown. + return 0; + } + fp::write_message( + std::cout, + fp::encode_error_message( + std::nullopt, + "Failed to read helper request", + request_error)); + return 1; + } + + if (request.type == fp::Request::Type::Shutdown) { + return 0; + } + + const auto& format_request = *request.format; + try { + if (format_request.prompt.empty()) { + throw std::runtime_error("Empty prompt."); + } + + const int max_new_tokens = format_request.max_new_tokens > 0 + ? 
format_request.max_new_tokens + : FLAGS_default_max_new_tokens; + + fp::write_message( + std::cout, + fp::encode_status_message( + format_request.request_id, + "formatting", + "Generating formatted text...")); + + std::string text; + std::string stdout_payload; + std::string stderr_payload; + std::optional tokens_per_second; + format_text( + *runner, + format_request.prompt, + max_new_tokens, + format_request.temperature, + text, + stdout_payload, + stderr_payload, + tokens_per_second); + + fp::write_message( + std::cout, + fp::encode_result_message( + format_request.request_id, + text, + stdout_payload, + stderr_payload, + tokens_per_second)); + } catch (const std::exception& e) { + fp::write_message( + std::cout, + fp::encode_error_message( + format_request.request_id, + "Helper formatting failed", + e.what())); + } + } + } catch (const std::exception& e) { + fp::write_message( + std::cout, + fp::encode_error_message( + std::nullopt, + "Failed to start LFM2.5 formatter helper", + e.what())); + return 1; + } +} diff --git a/examples/models/llama/lfm25_formatter_helper_protocol.cpp b/examples/models/llama/lfm25_formatter_helper_protocol.cpp new file mode 100644 index 00000000000..85da95c27dd --- /dev/null +++ b/examples/models/llama/lfm25_formatter_helper_protocol.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "lfm25_formatter_helper_protocol.h" + +#include + +#include +#include +#include +#include + +namespace lfm25_formatter::helper_protocol { +namespace { + +using json = nlohmann::json; + +} // namespace + +bool read_request( + std::istream& input, + Request* request, + std::string* error_message) { + std::string header_line; + if (!std::getline(input, header_line)) { + return false; + } + if (header_line.empty()) { + if (error_message) { + *error_message = "Received empty helper request header."; + } + return false; + } + + json payload; + try { + payload = json::parse(header_line); + } catch (const std::exception& e) { + if (error_message) { + *error_message = + std::string("Failed to parse helper request: ") + e.what(); + } + return false; + } + + const std::string type = payload.value("type", ""); + if (payload.value("version", -1) != kProtocolVersion) { + if (error_message) { + *error_message = "Unsupported helper protocol version."; + } + return false; + } + + if (type == "shutdown") { + request->type = Request::Type::Shutdown; + request->format.reset(); + return true; + } + + if (type != "format") { + if (error_message) { + *error_message = "Unsupported helper request type: " + type; + } + return false; + } + + if (!payload.contains("prompt") || !payload["prompt"].is_string()) { + if (error_message) { + *error_message = "Missing helper prompt field."; + } + return false; + } + + FormatRequest format_request; + format_request.request_id = payload.value("request_id", ""); + format_request.prompt = payload.value("prompt", ""); + format_request.max_new_tokens = payload.value("max_new_tokens", 0); + format_request.temperature = payload.value("temperature", 0.0); + + request->type = Request::Type::Format; + request->format = format_request; + return true; +} + +std::string encode_ready_message() { + return json{{"type", "ready"}, {"version", kProtocolVersion}}.dump(); +} + +std::string encode_status_message( + const std::optional& request_id, + const std::string& phase, + 
const std::string& message) { + json payload = { + {"type", "status"}, + {"version", kProtocolVersion}, + {"phase", phase}, + {"message", message}, + }; + if (request_id.has_value()) { + payload["request_id"] = *request_id; + } + return payload.dump(); +} + +std::string encode_result_message( + const std::string& request_id, + const std::string& text, + const std::string& stdout_payload, + const std::string& stderr_payload, + const std::optional& tokens_per_second) { + json payload = { + {"type", "result"}, + {"version", kProtocolVersion}, + {"request_id", request_id}, + {"text", text}, + {"stdout", stdout_payload}, + {"stderr", stderr_payload}, + }; + if (tokens_per_second.has_value()) { + payload["tokens_per_second"] = *tokens_per_second; + } + return payload.dump(); +} + +std::string encode_error_message( + const std::optional& request_id, + const std::string& message, + const std::optional& details) { + json payload = { + {"type", "error"}, + {"version", kProtocolVersion}, + {"message", message}, + }; + if (request_id.has_value()) { + payload["request_id"] = *request_id; + } + if (details.has_value()) { + payload["details"] = *details; + } + return payload.dump(); +} + +bool write_message(std::ostream& output, const std::string& line) { + output << line << '\n'; + output.flush(); + return output.good(); +} + +} // namespace lfm25_formatter::helper_protocol diff --git a/examples/models/llama/lfm25_formatter_helper_protocol.h b/examples/models/llama/lfm25_formatter_helper_protocol.h new file mode 100644 index 00000000000..a1cb4001142 --- /dev/null +++ b/examples/models/llama/lfm25_formatter_helper_protocol.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// LFM2.5 formatter helper: a long-lived companion process that wraps an +// `executorch::extension::llm::TextLLMRunner` with a JSON-line stdin/stdout +// protocol. The macOS ExecuWhisper app launches this binary once per app +// session, sends a `format` request, and reads the rewritten dictation off +// stdout — preserving the model's KV cache and warm state across requests. +// +// Wire contract (kProtocolVersion=1): +// +// Requests (one JSON object per line, then optional payload): +// {"type": "format", "version": 1, +// "request_id": "", +// "prompt": "", +// "max_new_tokens": , +// "temperature": } +// {"type": "shutdown", "version": 1} +// +// Responses (one JSON object per line): +// {"type": "ready", "version": 1} // emitted once at startup +// {"type": "status", "version": 1, "request_id": ..., +// "phase": "", "message": ""} // optional progress updates +// {"type": "result", "version": 1, "request_id": ..., +// "text": "", "stdout": "", "stderr": "", +// "tokens_per_second": } // success +// {"type": "error", "version": 1, "request_id": , +// "message": "", "details": } // failure +// +// The Swift wire contract this matches lives at +// ExecuWhisper/Services/FormatterHelperProtocol.swift +// in the internal-llama-cookbook ExecuWhisper app. 
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace lfm25_formatter::helper_protocol { + +constexpr int kProtocolVersion = 1; + +struct FormatRequest { + std::string request_id; + std::string prompt; + int max_new_tokens = 0; + double temperature = 0.0; +}; + +struct Request { + enum class Type { + Format, + Shutdown, + }; + + Type type = Type::Shutdown; + std::optional format; +}; + +bool read_request( + std::istream& input, + Request* request, + std::string* error_message); + +std::string encode_ready_message(); +std::string encode_status_message( + const std::optional& request_id, + const std::string& phase, + const std::string& message); +std::string encode_result_message( + const std::string& request_id, + const std::string& text, + const std::string& stdout_payload, + const std::string& stderr_payload, + const std::optional& tokens_per_second); +std::string encode_error_message( + const std::optional& request_id, + const std::string& message, + const std::optional& details); + +bool write_message(std::ostream& output, const std::string& line); + +} // namespace lfm25_formatter::helper_protocol diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index fa22ddad7ac..01afb5bee18 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -52,6 +52,7 @@ class ModelType(str, Enum): lfm2_350m = "lfm2_350m" lfm2_700m = "lfm2_700m" lfm2_1_2b = "lfm2_1_2b" + lfm2_5_350m = "lfm2_5_350m" lfm2_5_1_2b = "lfm2_5_1_2b"
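For completeness, the JSON-line wire contract documented in `lfm25_formatter_helper_protocol.h` can be exercised without the Swift app. The client below is a minimal sketch for illustration only: it assumes the helper was built via `make lfm_2_5-mlx` at the path shown in the Makefile target, and the model and tokenizer flag values are placeholders rather than files shipped with this change.

```python
# Minimal illustrative client for the lfm25_formatter_helper protocol
# (version 1). Not part of this diff; the real client is
# FormatterBridge.swift in the ExecuWhisper app. Flag values are placeholders.
import json
import subprocess

HELPER = "cmake-out/examples/models/llama/lfm25_formatter_helper"

proc = subprocess.Popen(
    [
        HELPER,
        "--model_path=lfm2_5_350m_mlx_4w.pte",  # placeholder export
        "--tokenizer_path=tokenizer.json",      # placeholder path
    ],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)


def send(message: dict) -> None:
    proc.stdin.write(json.dumps(message) + "\n")
    proc.stdin.flush()


# The helper emits {"type": "ready", "version": 1} once the model is loaded.
assert json.loads(proc.stdout.readline())["type"] == "ready"

send({
    "type": "format",
    "version": 1,
    "request_id": "req-1",
    "prompt": "<|startoftext|><|im_start|>user\nfix this text<|im_end|>\n"
              "<|im_start|>assistant\n",
    "max_new_tokens": 128,
    "temperature": 0.3,
})

# Read protocol messages until the result or an error arrives; "status" lines
# are optional progress updates and can be skipped.
while True:
    reply = json.loads(proc.stdout.readline())
    if reply["type"] == "result":
        print(reply["text"], reply.get("tokens_per_second"))
        break
    if reply["type"] == "error":
        raise RuntimeError(reply["message"])

send({"type": "shutdown", "version": 1})
proc.wait()
```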