5 changes: 4 additions & 1 deletion CMakePresets.json
@@ -336,7 +336,10 @@
"CMAKE_BUILD_TYPE": "Release",
"CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out",
"ET_MLX_ENABLE_OP_LOGGING": "OFF",
"ET_MIN_LOG_LEVEL": "Error"
"ET_MIN_LOG_LEVEL": "Error",
"EXECUTORCH_BUILD_KERNELS_LLM": "ON",
"EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON",
"EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON"
}
},
{
22 changes: 21 additions & 1 deletion Makefile
@@ -91,7 +91,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx lfm_2_5_formatter-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
@@ -123,6 +123,8 @@ help:
@echo " llama-cuda - Build Llama runner with CUDA backend"
@echo " llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)"
@echo " llama-cpu - Build Llama runner with CPU backend"
@echo " lfm_2_5-mlx - Build LFM2.5 runner (llama_main) with MLX backend"
@echo " lfm_2_5_formatter-mlx - Build LFM2.5 persistent formatter helper (lfm25_formatter_helper) with MLX backend"
@echo " llava-cpu - Build Llava runner with CPU backend"
@echo " gemma3-cuda - Build Gemma3 runner with CUDA backend"
@echo " gemma3-cpu - Build Gemma3 runner with CPU backend"
@@ -372,6 +374,24 @@ llama-cuda-debug:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/llama/llama_main"

lfm_2_5-mlx:
@echo "==> Building and installing ExecuTorch with MLX..."
cmake --workflow --preset mlx-release
@echo "==> Building LFM2.5 runner + persistent formatter helper with MLX..."
cd examples/models/llama && cmake --workflow --preset llama-mlx
@echo ""
@echo "✓ Build complete!"
@echo " Binaries:"
@echo " cmake-out/examples/models/llama/llama_main"
@echo " cmake-out/examples/models/llama/lfm25_formatter_helper"

# Same workflow as lfm_2_5-mlx; named target for the macOS ExecuWhisper
# integration, which only needs the persistent formatter helper. Both targets
# rely on the `llama-mlx` build preset, which already lists
# `lfm25_formatter_helper` alongside `llama_main`.
lfm_2_5_formatter-mlx: lfm_2_5-mlx
@echo " Helper: cmake-out/examples/models/llama/lfm25_formatter_helper"

llava-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
73 changes: 72 additions & 1 deletion examples/models/lfm2/README.md
@@ -3,6 +3,10 @@

[LFM2.5](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) is an updated version with improved training (28T tokens vs 10T) and extended context length support (32K tokens).

Pre-exported ExecuTorch MLX artifacts for LFM2.5 350M and 1.2B are available
on the Hugging Face Hub at
[younghan-meta/LFM2.5-ExecuTorch-MLX](https://huggingface.co/younghan-meta/LFM2.5-ExecuTorch-MLX).
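
To pull those artifacts into the local Hugging Face cache, a minimal sketch using `huggingface_hub` (the repo ID comes from the link above; the artifact filenames inside the repo are not listed here, so inspect the printed directory):
```
python - <<'PY'
from huggingface_hub import snapshot_download

# Downloads the full repo into the local HF cache and prints its path.
print(snapshot_download(repo_id="younghan-meta/LFM2.5-ExecuTorch-MLX"))
PY
```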

## Instructions

LFM2 uses the same example code as the optimized Llama model, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details.
@@ -47,6 +51,24 @@ python -m extension.llm.export.export_llm \
+export.output_name="lfm2_5_1_2b_8da4w.pte"
```

Export LFM2.5 350M to MLX on Apple Silicon, quantized with 4-bit weights:
```
python -m extension.llm.export.export_llm \
--config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
+base.model_class="lfm2_5_350m" \
+base.params="examples/models/lfm2/config/lfm2_5_350m_config.json" \
+export.output_name="lfm2_5_350m_mlx_4w.pte"
```

Export LFM2.5 1.2B to MLX on Apple Silicon, quantized with 4-bit weights:
```
python -m extension.llm.export.export_llm \
--config examples/models/lfm2/config/lfm2_mlx_4w.yaml \
+base.model_class="lfm2_5_1_2b" \
+base.params="examples/models/lfm2/config/lfm2_5_1_2b_config.json" \
+export.output_name="lfm2_5_1_2b_mlx_4w.pte"
```

To export with extended context (e.g., 2048 tokens):
```
python -m extension.llm.export.export_llm \
@@ -58,6 +80,17 @@ python -m extension.llm.export.export_llm \
+export.output_name="lfm2_5_1_2b_8da4w.pte"
```
### Example run
For MLX on Apple Silicon, build or install ExecuTorch with MLX enabled. The
easiest local path is:
```
conda activate et-mlx
python install_executorch.py
xcrun -sdk macosx --find metal
```

The `metal` tool must resolve to a path inside a full Xcode installation; under
the standalone Command Line Tools the lookup fails.
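
A quick programmatic check (a sketch; it only verifies that the lookup succeeds and the resolved path is not under the standalone Command Line Tools):
```
python - <<'PY'
import subprocess

# `xcrun --find` prints the resolved tool path. With a full Xcode install it
# lives inside Xcode.app; under standalone CLT the lookup fails outright.
path = subprocess.run(
    ["xcrun", "-sdk", "macosx", "--find", "metal"],
    capture_output=True, text=True, check=True,
).stdout.strip()
assert "CommandLineTools" not in path, f"metal resolved under CLT: {path}"
print(f"metal: {path}")
PY
```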

With ExecuTorch pybindings:
```
python -m examples.models.llama.runner.native \
@@ -72,7 +105,31 @@ python -m examples.models.llama.runner.native \
--temperature 0.3
```

With ExecuTorch's sample C++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner):
With ExecuTorch pybindings and an LFM2.5 MLX export:
```
python -m examples.models.llama.runner.native \
--model lfm2_5_350m \
--pte lfm2_5_350m_mlx_4w.pte \
--tokenizer ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
--tokenizer_config ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer_config.json \
--prompt "<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
--params examples/models/lfm2/config/lfm2_5_350m_config.json \
--max_len 128 \
-kv \
--temperature 0.3
```
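
The prompt follows LFM2.5's ChatML-style chat template. A small helper to build it (a sketch; the special tokens are copied verbatim from the command above):
```
python - <<'PY'
def build_prompt(user_message: str) -> str:
    # ChatML-style turn markers, as used in the example prompt above.
    return (
        "<|startoftext|>"
        f"<|im_start|>user\n{user_message}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

print(build_prompt("Who are you?"))
PY
```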

Find the Hugging Face cache snapshot directory with:
```
python - <<'PY'
from pathlib import Path
root = Path.home() / ".cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots"
for path in root.glob("*/tokenizer.json"):
print(path.parent)
PY
```

With ExecuTorch's sample C++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) for general runner details):
```
cmake-out/examples/models/llama/llama_main \
--model_path lfm2_700m_8da4w.pte \
@@ -81,4 +138,18 @@ cmake-out/examples/models/llama/llama_main \
--temperature 0.3
```

Build the C++ runner with MLX support for LFM2.5:
```
make lfm_2_5-mlx
```

Then run an LFM2.5 MLX export with the C++ runner:
```
cmake-out/examples/models/llama/llama_main \
--model_path lfm2_5_350m_mlx_4w.pte \
--tokenizer_path ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots/<snapshot>/tokenizer.json \
--prompt="<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \
--temperature 0.3
```

To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
33 changes: 33 additions & 0 deletions examples/models/lfm2/config/lfm2_5_350m_config.json
@@ -0,0 +1,33 @@
{
"dim": 1024,
"ffn_dim_multiplier": 1,
"hidden_dim": 4608,
"n_heads": 16,
"n_kv_heads": 8,
"n_layers": 16,
"norm_eps": 1e-5,
"rope_theta": 1000000.0,
"use_scaled_rope": false,
"vocab_size": 65536,
"use_hf_rope": true,
"use_qk_norm": true,
"qk_norm_before_rope": true,
"layer_types": [
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv",
"full_attention",
"conv"
]
}
15 changes: 15 additions & 0 deletions examples/models/lfm2/config/lfm2_mlx_4w.yaml
@@ -0,0 +1,15 @@
base:
metadata: '{"get_bos_id": 1, "get_eos_ids":[7]}'

model:
use_kv_cache: True
use_sdpa_with_kv_cache: True
dtype_override: bf16

quantization:
qmode: 4w
group_size: 64

backend:
mlx:
enabled: True
102 changes: 102 additions & 0 deletions examples/models/lfm2/test_lfm2_5_mlx.py
@@ -0,0 +1,102 @@
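"""Static checks for the LFM2.5 MLX example: model registration, architecture
config shapes, and the MLX export YAML backend settings."""
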
import ast
import json
from pathlib import Path

from omegaconf import OmegaConf


REPO_ROOT = Path(__file__).resolve().parents[3]
CONFIG_DIR = REPO_ROOT / "examples" / "models" / "lfm2" / "config"
EXPORT_LLAMA_LIB = REPO_ROOT / "examples" / "models" / "llama" / "export_llama_lib.py"
LLM_CONFIG = REPO_ROOT / "extension" / "llm" / "export" / "config" / "llm_config.py"


def _load_json_config(name: str) -> dict:
with open(CONFIG_DIR / name, "r") as f:
return json.load(f)


def _module_ast(path: Path) -> ast.Module:
return ast.parse(path.read_text())


def _literal_assignment(module: ast.Module, name: str):
for node in module.body:
if isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name) and target.id == name:
return ast.literal_eval(node.value)
raise AssertionError(f"{name} not found")


def _class_string_assignments(module: ast.Module, class_name: str) -> dict[str, str]:
for node in module.body:
if isinstance(node, ast.ClassDef) and node.name == class_name:
values = {}
for stmt in node.body:
if (
isinstance(stmt, ast.Assign)
and len(stmt.targets) == 1
and isinstance(stmt.targets[0], ast.Name)
):
values[stmt.targets[0].id] = ast.literal_eval(stmt.value)
return values
raise AssertionError(f"{class_name} not found")


def test_lfm2_5_models_are_registered() -> None:
export_module = _module_ast(EXPORT_LLAMA_LIB)
model_types = _class_string_assignments(_module_ast(LLM_CONFIG), "ModelType")
executor_defined_models = _literal_assignment(
export_module, "EXECUTORCH_DEFINED_MODELS"
)
hf_repo_ids = _literal_assignment(export_module, "HUGGING_FACE_REPO_IDS")

assert "lfm2_5_350m" in executor_defined_models
assert "lfm2_5_1_2b" in executor_defined_models
assert model_types["lfm2_5_350m"] == "lfm2_5_350m"
assert model_types["lfm2_5_1_2b"] == "lfm2_5_1_2b"
assert hf_repo_ids["lfm2_5_350m"] == "LiquidAI/LFM2.5-350M"
assert hf_repo_ids["lfm2_5_1_2b"] == "LiquidAI/LFM2.5-1.2B-Instruct"


def test_lfm2_5_architecture_configs_match_expected_shapes() -> None:
expected = {
"lfm2_5_350m_config.json": {
"dim": 1024,
"hidden_dim": 4608,
"n_heads": 16,
"n_kv_heads": 8,
},
"lfm2_5_1_2b_config.json": {
"dim": 2048,
"hidden_dim": 8192,
"n_heads": 32,
"n_kv_heads": 8,
},
}

for filename, expected_fields in expected.items():
cfg = _load_json_config(filename)
for key, value in expected_fields.items():
assert cfg[key] == value
assert cfg["n_layers"] == 16
assert len(cfg["layer_types"]) == cfg["n_layers"]
assert cfg["layer_types"].count("full_attention") == 6
assert cfg["layer_types"].count("conv") == 10
assert cfg["vocab_size"] == 65536
assert cfg["rope_theta"] == 1000000.0
assert cfg["use_hf_rope"] is True
assert cfg["use_qk_norm"] is True
assert cfg["qk_norm_before_rope"] is True


def test_lfm2_mlx_config_enables_mlx_backend() -> None:
cfg = OmegaConf.load(CONFIG_DIR / "lfm2_mlx_4w.yaml")
assert cfg.base.metadata == '{"get_bos_id": 1, "get_eos_ids":[7]}'
assert cfg.model.use_kv_cache is True
assert cfg.model.use_sdpa_with_kv_cache is True
assert cfg.model.dtype_override == "bf16"
assert cfg.quantization.qmode == "4w"
assert cfg.quantization.group_size == 64
assert cfg.backend.mlx.enabled is True
52 changes: 52 additions & 0 deletions examples/models/llama/CMakeLists.txt
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -261,3 +261,55 @@
set_target_properties(llama_main PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'")
endif()
# Windows doesn't need rpath - DLLs are found via standard Windows search order

# -------------------------------------------------------------------------- #
# LFM2.5 formatter helper (persistent companion process)
#
# Long-lived sibling of llama_main that wraps the same TextLLMRunner with a
# JSON-line stdin/stdout protocol. The macOS ExecuWhisper app keeps this
# binary warm across requests so the formatter model is loaded once per
# session. Build with `make lfm_2_5_formatter-mlx` from the repo root, or
# `cmake --workflow --preset llama-mlx` from this directory.
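#
# One request/response exchange might look like the following (a sketch; the
# field names are illustrative assumptions, not taken from
# lfm25_formatter_helper_protocol.cpp, which defines the actual schema):
#   stdin:  {"id": 1, "prompt": "<text to format>"}
#   stdout: {"id": 1, "text": "<formatted result>"}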
# -------------------------------------------------------------------------- #

set(_formatter_helper_srcs
lfm25_formatter_helper.cpp lfm25_formatter_helper_protocol.cpp
)
set(_formatter_helper_include_directories
${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include
)

add_executable(lfm25_formatter_helper ${_formatter_helper_srcs})

if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL
"MinSizeRel"
)
target_link_options_gc_sections(lfm25_formatter_helper)
if(NOT APPLE)
target_link_options(lfm25_formatter_helper PRIVATE "LINKER:-s")
endif()
endif()

target_include_directories(
lfm25_formatter_helper PUBLIC ${_formatter_helper_include_directories}
)
target_link_libraries(
lfm25_formatter_helper PUBLIC llama_runner ${link_libraries}
)
target_compile_options(
lfm25_formatter_helper PUBLIC ${_common_compile_options}
)

if(TARGET mlxdelegate)
executorch_target_copy_mlx_metallib(lfm25_formatter_helper)
endif()

if(APPLE)
target_link_options(
lfm25_formatter_helper PRIVATE -Wl,-rpath,@loader_path
)
elseif(UNIX)
set_target_properties(
lfm25_formatter_helper PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
)
endif()