5 changes: 4 additions & 1 deletion CMakePresets.json
@@ -336,7 +336,10 @@
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out",
"ET_MLX_ENABLE_OP_LOGGING": "OFF",
"ET_MIN_LOG_LEVEL": "Error"
"ET_MIN_LOG_LEVEL": "Error",
"EXECUTORCH_BUILD_KERNELS_LLM": "ON",
"EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
"EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON"
}
},
{
22 changes: 21 additions & 1 deletion Makefile
@@ -91,7 +91,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx lfm_2_5_formatter-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -123,6 +123,8 @@ help:
@echo " llama-cuda - Build Llama runner with CUDA backend"
@echo " llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)"
@echo " llama-cpu - Build Llama runner with CPU backend"
@echo " lfm_2_5-mlx - Build LFM2.5 runner (llama_main) with MLX backend"
@echo " lfm_2_5_formatter-mlx - Build LFM2.5 persistent formatter helper (lfm25_formatter_helper) with MLX backend"
@echo " llava-cpu - Build Llava runner with CPU backend"
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@@ -372,6 +374,24 @@ llama-cuda-debug:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/llama/llama_main"

lfm_2_5-mlx:
@echo "==> Building and installing ExecuTorch with MLX..."
cmake --workflow --preset mlx-release
@echo "==> Building LFM2.5 runner + persistent formatter helper with MLX..."
cd examples/models/llama && cmake --workflow --preset llama-mlx
@echo ""
@echo "✓ Build complete!"
@echo " Binaries:"
@echo " cmake-out/examples/models/llama/llama_main"
@echo " cmake-out/examples/models/llama/lfm25_formatter_helper"

# Same workflow as lfm_2_5-mlx; named target for the macOS ExecuWhisper
# integration, which only needs the persistent formatter helper. Both targets
# rely on the `llama-mlx` build preset, which already lists
# `lfm25_formatter_helper` alongside `llama_main`.
lfm_2_5_formatter-mlx: lfm_2_5-mlx
@echo " Helper: cmake-out/examples/models/llama/lfm25_formatter_helper"

llava-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
73 changes: 72 additions & 1 deletion examples/models/lfm2/README.md
@@ -3,6 +3,10 @@

[LFM2.5](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) is an updated version with improved training (28T tokens vs 10T) and extended context length support (32K tokens).

Pre-exported ExecuTorch MLX artifacts for LFM2.5 350M and 1.2B are available
on the Hugging Face Hub at
[younghan-meta/LFM2.5-ExecuTorch-MLX](https://huggingface.co/younghan-meta/LFM2.5-ExecuTorch-MLX).
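
To pull those artifacts into the local Hugging Face cache, a minimal sketch using `huggingface_hub` (the repo ID comes from the link above; the artifact filenames inside the repo are not listed here, so inspect the printed directory):
```
python - <<'PY'
from huggingface_hub import snapshot_download

# Downloads the full repo into the local HF cache and prints its path.
print(snapshot_download(repo_id="younghan-meta/LFM2.5-ExecuTorch-MLX"))
PY
```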

## Instructions

LFM2 uses the same example code as the optimized Llama model, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details.
@@ -47,6 +51,24 @@ python -m extension.llm.export.export_llm \
+export.output_name="lfm2_5_1_2b_8da4w.pte"
```

Export LFM2.5 350M to MLX on Apple Silicon, quantized with 4-bit weights:
```
python -m extension.llm.export.export_llm \
--config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
+base.model_class="lfm2_5_350m" \
+base.params="examples/models/lfm2/config/lfm2_5_350m_config.json" \
+export.output_name="lfm2_5_350m_mlx_4w.pte"
```

Export LFM2.5 1.2B to MLX on Apple Silicon, quantized with 4-bit weights:
```
python -m extension.llm.export.export_llm \
--config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
+base.model_class="lfm2_5_1_2b" \
+base.params="examples/models/lfm2/config/lfm2_5_1_2b_config.json" \
+export.output_name="lfm2_5_1_2b_mlx_4w.pte"
```

To export with extended context (e.g., 2048 tokens):
```
python -m extension.llm.export.export_llm \
@@ -58,6 +80,17 @@ python -m extension.llm.export.export_llm \
+export.output_name="lfm2_5_1_2b_8da4w.pte"
```
### Example run
For MLX on Apple Silicon, build or install ExecuTorch with MLX enabled. The
easiest local path is:
```
conda activate et-mlx
python install_executorch.py
xcrun -sdk macosx --find metal
```

The `metal` tool must resolve to a path inside a full Xcode installation; under
the standalone Command Line Tools the lookup fails.
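
A quick programmatic check (a sketch; it only verifies that the lookup succeeds and the resolved path is not under the standalone Command Line Tools):
```
python - <<'PY'
import subprocess

# `xcrun --find` prints the resolved tool path. With a full Xcode install it
# lives inside Xcode.app; under standalone CLT the lookup fails outright.
path = subprocess.run(
    ["xcrun", "-sdk", "macosx", "--find", "metal"],
    capture_output=True, text=True, check=True,
).stdout.strip()
assert "CommandLineTools" not in path, f"metal resolved under CLT: {path}"
print(f"metal: {path}")
PY
```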

With ExecuTorch pybindings:
```
python -m examples.models.llama.runner.native \
@@ -72,7 +105,31 @@ python -m examples.models.llama.runner.native \
--temperature 0.3
```

With ExecuTorch's sample C++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
With ExecuTorch pybindings and an LFM2.5 MLX export:
```
python -m examples.models.llama.runner.native \
--model lfm2_5_350m \
--pte lfm2_5_350m_mlx_4w.pte \
--tokenizer ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
--tokenizer_config ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer_config.json \
--prompt "<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
--params examples/models/lfm2/config/lfm2_5_350m_config.json \
--max_len 128 \
-kv \
--temperature 0.3
```
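
The prompt follows LFM2.5's ChatML-style chat template. A small helper to build it (a sketch; the special tokens are copied verbatim from the command above):
```
python - <<'PY'
def build_prompt(user_message: str) -> str:
    # ChatML-style turn markers, as used in the example prompt above.
    return (
        "<|startoftext|>"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

print(build_prompt("Who are you?"))
PY
```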

Find the Hugging Face cache snapshot directory with:
```
python - <<'PY'
from pathlib import Path
root = Path.home() / ".cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots"
for path in root.glob("*/tokenizer.json"):
print(path.parent)
PY
```

With ExecuTorch's sample C++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) for general runner details):
```
cmake-out/examples/models/llama/llama_main \
--model_path lfm2_700m_8da4w.pte \
@@ -81,4 +138,18 @@ cmake-out/examples/models/llama/llama_main \
--temperature 0.3
```

Build the C++ runner with MLX support for LFM2.5:
```
make lfm_2_5-mlx
```

Then run an LFM2.5 MLX export with the C++ runner:
```
cmake-out/examples/models/llama/llama_main \
--model_path lfm2_5_350m_mlx_4w.pte \
--tokenizer_path ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
--prompt="<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
--temperature 0.3
```

To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
33 changes: 33 additions & 0 deletions examples/models/lfm2/config/lfm2_5_350m_config.json
@@ -0,0 +1,33 @@
{
"dim": 1024,
"ffn_dim_multiplier": 1,
"hidden_dim": 4608,
"n_heads": 16,
"n_kv_heads": 8,
"n_layers": 16,
"norm_eps": 1e-5,
"rope_theta": 1000000.0,
"use_scaled_rope": false,
"vocab_size": 65536,
"use_hf_rope": true,
"use_qk_norm": true,
"qk_norm_before_rope": true,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
]
}
15 changes: 15 additions & 0 deletions examples/models/lfm2/config/lfm2_mlx_4w.yaml
@@ -0,0 +1,15 @@
base:
metadata: '{"get_bos_id": 1, "get_eos_ids":[7]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: bf16

quantization:
qmode: 4w
group_size: 64

backend:
mlx:
enabled: True
102 changes: 102 additions & 0 deletions examples/models/lfm2/test_lfm2_5_mlx.py
@@ -0,0 +1,102 @@
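"""Static checks for the LFM2.5 MLX example: model registration, architecture
config shapes, and the MLX export YAML backend settings."""
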
import ast
import json
from pathlib import Path

from omegaconf import OmegaConf


REPO_ROOT = Path(__file__).resolve().parents[3]
CONFIG_DIR = REPO_ROOT / "examples" / "models" / "lfm2" / "config"
EXPORT_LLAMA_LIB = REPO_ROOT / "examples" / "models" / "llama" / "export_llama_lib.py"
LLM_CONFIG = REPO_ROOT / "extension" / "llm" / "export" / "config" / "llm_config.py"


def _load_json_config(name: str) -> dict:
with open(CONFIG_DIR / name, "r") as f:
return json.load(f)


def _module_ast(path: Path) -> ast.Module:
return ast.parse(path.read_text())


def _literal_assignment(module: ast.Module, name: str):
for node in module.body:
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and target.id == name:
return ast.literal_eval(node.value)
raise AssertionError(f"{name} not found")


def _class_string_assignments(module: ast.Module, class_name: str) -> dict[str, str]:
for node in module.body:
if isinstance(node, ast.ClassDef) and node.name == class_name:
values = {}
for stmt in node.body:
if (
isinstance(stmt, ast.Assign)
and len(stmt.targets) == 1
and isinstance(stmt.targets[0], ast.Name)
):
values[stmt.targets[0].id] = ast.literal_eval(stmt.value)
return values
raise AssertionError(f"{class_name} not found")


def test_lfm2_5_models_are_registered() -> None:
export_module = _module_ast(EXPORT_LLAMA_LIB)
model_types = _class_string_assignments(_module_ast(LLM_CONFIG), "ModelType")
executor_defined_models = _literal_assignment(
export_module, "EXECUTORCH_DEFINED_MODELS"
)
hf_repo_ids = _literal_assignment(export_module, "HUGGING_FACE_REPO_IDS")

assert "lfm2_5_350m" in executor_defined_models
assert "lfm2_5_1_2b" in executor_defined_models
assert model_types["lfm2_5_350m"] == "lfm2_5_350m"
assert model_types["lfm2_5_1_2b"] == "lfm2_5_1_2b"
assert hf_repo_ids["lfm2_5_350m"] == "LiquidAI/LFM2.5-350M"
assert hf_repo_ids["lfm2_5_1_2b"] == "LiquidAI/LFM2.5-1.2B-Instruct"


def test_lfm2_5_architecture_configs_match_expected_shapes() -> None:
expected = {
"lfm2_5_350m_config.json": {
"dim": 1024,
"hidden_dim": 4608,
"n_heads": 16,
"n_kv_heads": 8,
},
"lfm2_5_1_2b_config.json": {
"dim": 2048,
"hidden_dim": 8192,
"n_heads": 32,
"n_kv_heads": 8,
},
}

for filename, expected_fields in expected.items():
cfg = _load_json_config(filename)
for key, value in expected_fields.items():
assert cfg[key] == value
assert cfg["n_layers"] == 16
assert len(cfg["layer_types"]) == cfg["n_layers"]
assert cfg["layer_types"].count("full_attention") == 6
assert cfg["layer_types"].count("conv") == 10
assert cfg["vocab_size"] == 65536
assert cfg["rope_theta"] == 1000000.0
assert cfg["use_hf_rope"] is True
assert cfg["use_qk_norm"] is True
assert cfg["qk_norm_before_rope"] is True


def test_lfm2_mlx_config_enables_mlx_backend() -> None:
cfg = OmegaConf.load(CONFIG_DIR / "lfm2_mlx_4w.yaml")
assert cfg.base.metadata == '{"get_bos_id": 1, "get_eos_ids":[7]}'
assert cfg.model.use_kv_cache is True
assert cfg.model.use_sdpa_with_kv_cache is True
assert cfg.model.dtype_override == "bf16"
assert cfg.quantization.qmode == "4w"
assert cfg.quantization.group_size == 64
assert cfg.backend.mlx.enabled is True
52 changes: 52 additions & 0 deletions examples/models/llama/CMakeLists.txt
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -261,3 +261,55 @@
set_target_properties(llama_main PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'")
endif()
# Windows doesn't need rpath - DLLs are found via standard Windows search order

# -------------------------------------------------------------------------- #
# LFM2.5 formatter helper (persistent companion process)
#
# Long-lived sibling of llama_main that wraps the same TextLLMRunner with a
# JSON-line stdin/stdout protocol. The macOS ExecuWhisper app keeps this
# binary warm across requests so the formatter model is loaded once per
# session. Build with `make lfm_2_5_formatter-mlx` from the repo root, or
# `cmake --workflow --preset llama-mlx` from this directory.
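#
# One request/response exchange might look like the following (a sketch; the
# field names are illustrative assumptions, not taken from
# lfm25_formatter_helper_protocol.cpp, which defines the actual schema):
#   stdin:  {"id": 1, "prompt": "<text to format>"}
#   stdout: {"id": 1, "text": "<formatted result>"}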
# -------------------------------------------------------------------------- #

set(_formatter_helper_srcs
lfm25_formatter_helper.cpp lfm25_formatter_helper_protocol.cpp
)
set(_formatter_helper_include_directories
${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include
)

add_executable(lfm25_formatter_helper ${_formatter_helper_srcs})

if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL
"MinSizeRel"
)
target_link_options_gc_sections(lfm25_formatter_helper)
if(NOT APPLE)
target_link_options(lfm25_formatter_helper PRIVATE "LINKER:-s")
endif()
endif()

target_include_directories(
lfm25_formatter_helper PUBLIC ${_formatter_helper_include_directories}
)
target_link_libraries(
lfm25_formatter_helper PUBLIC llama_runner ${link_libraries}
)
target_compile_options(
lfm25_formatter_helper PUBLIC ${_common_compile_options}
)

if(TARGET mlxdelegate)
executorch_target_copy_mlx_metallib(lfm25_formatter_helper)
endif()

if(APPLE)
target_link_options(
lfm25_formatter_helper PRIVATE -Wl,-rpath,@loader_path
)
elseif(UNIX)
set_target_properties(
lfm25_formatter_helper PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
)
endif()