diff --git a/CMakePresets.json b/CMakePresets.json index 99a0ebee12c..5ae9da1fefe 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -336,7 +336,10 @@ "CMAKE_BUILD_TYPE": "Release", "CMAKE_INSTALL_PREFIX": "${sourceDir}/cmake-out", "ET_MLX_ENABLE_OP_LOGGING": "OFF", - "ET_MIN_LOG_LEVEL": "Error" + "ET_MIN_LOG_LEVEL": "Error", + "EXECUTORCH_BUILD_KERNELS_LLM": "ON", + "EXECUTORCH_BUILD_KERNELS_QUANTIZED": "ON", + "EXECUTORCH_BUILD_KERNELS_OPTIMIZED": "ON" } }, { diff --git a/Makefile b/Makefile index ba61dddce44..5fe8793fe0a 100644 --- a/Makefile +++ b/Makefile @@ -91,7 +91,7 @@ # # ============================================================================== -.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help +.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral-mlx voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal voxtral_realtime-mlx voxtral_tts-cpu voxtral_tts-cuda whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal parakeet-mlx parakeet-vulkan dinov2-cuda dinov2-cuda-debug sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu lfm_2_5-mlx lfm_2_5_formatter-mlx llava-cpu gemma3-cuda gemma3-cpu gemma4_31b-cuda qwen3_5_moe-cuda qwen3_5_moe-metal clean help help: @echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make \`. Available targets:" @@ -123,6 +123,8 @@ help: @echo " llama-cuda - Build Llama runner with CUDA backend" @echo " llama-cuda-debug - Build Llama runner with CUDA backend (debug mode)" @echo " llama-cpu - Build Llama runner with CPU backend" + @echo " lfm_2_5-mlx - Build LFM2.5 runner (llama_main) with MLX backend" + @echo " lfm_2_5_formatter-mlx - Build LFM2.5 persistent formatter helper (lfm25_formatter_helper) with MLX backend" @echo " llava-cpu - Build Llava runner with CPU backend" @echo " gemma3-cuda - Build Gemma3 runner with CUDA backend" @echo " gemma3-cpu - Build Gemma3 runner with CPU backend" @@ -372,6 +374,24 @@ llama-cuda-debug: @echo "✓ Build complete!" @echo " Binary: cmake-out/examples/models/llama/llama_main" +lfm_2_5-mlx: + @echo "==> Building and installing ExecuTorch with MLX..." + cmake --workflow --preset mlx-release + @echo "==> Building LFM2.5 runner + persistent formatter helper with MLX..." + cd examples/models/llama && cmake --workflow --preset llama-mlx + @echo "" + @echo "✓ Build complete!" + @echo " Binaries:" + @echo " cmake-out/examples/models/llama/llama_main" + @echo " cmake-out/examples/models/llama/lfm25_formatter_helper" + +# Same workflow as lfm_2_5-mlx; named target for the macOS ExecuWhisper +# integration which only needs the persistent formatter helper. Both targets +# rely on the `llama-mlx` build preset, which already lists +# `lfm25_formatter_helper` alongside `llama_main`. +lfm_2_5_formatter-mlx: lfm_2_5-mlx + @echo " Helper: cmake-out/examples/models/llama/lfm25_formatter_helper" + llava-cpu: @echo "==> Building and installing ExecuTorch..." 
cmake --workflow --preset llm-release diff --git a/examples/models/lfm2/README.md b/examples/models/lfm2/README.md index 4f52c576442..c740456f8dc 100644 --- a/examples/models/lfm2/README.md +++ b/examples/models/lfm2/README.md @@ -3,6 +3,10 @@ [LFM2.5](https://huggingface.co/LiquidAI/LFM2.5-1.2B-Instruct) is an updated version with improved training (28T tokens vs 10T) and extended context length support (32K tokens). +Pre-exported ExecuTorch MLX artifacts for LFM2.5 350M and 1.2B are available +on the Hugging Face Hub at +[younghan-meta/LFM2.5-ExecuTorch-MLX](https://huggingface.co/younghan-meta/LFM2.5-ExecuTorch-MLX). + ## Instructions LFM2 uses the same example code as optimized Llama model, while the checkpoint, model params, and tokenizer are different. Please see the [Llama README page](../llama/README.md) for details. @@ -47,6 +51,24 @@ python -m extension.llm.export.export_llm \ +export.output_name="lfm2_5_1_2b_8da4w.pte" ``` +Export LFM2.5 350M to MLX on Apple Silicon, quantized with 4-bit weights: +``` +python -m extension.llm.export.export_llm \ + --config examples/models/lfm2/config/lfm2_mlx_4w.yaml \ + +base.model_class="lfm2_5_350m" \ + +base.params="examples/models/lfm2/config/lfm2_5_350m_config.json" \ + +export.output_name="lfm2_5_350m_mlx_4w.pte" +``` + +Export LFM2.5 1.2B to MLX on Apple Silicon, quantized with 4-bit weights: +``` +python -m extension.llm.export.export_llm \ + --config examples/models/lfm2/config/lfm2_mlx_4w.yaml \ + +base.model_class="lfm2_5_1_2b" \ + +base.params="examples/models/lfm2/config/lfm2_5_1_2b_config.json" \ + +export.output_name="lfm2_5_1_2b_mlx_4w.pte" +``` + To export with extended context (e.g., 2048 tokens): ``` python -m extension.llm.export.export_llm \ @@ -58,6 +80,17 @@ python -m extension.llm.export.export_llm \ +export.output_name="lfm2_5_1_2b_8da4w.pte" ``` ### Example run +For MLX on Apple Silicon, build or install ExecuTorch with MLX enabled. The +easiest local path is: +``` +conda activate et-mlx +python install_executorch.py +xcrun -sdk macosx --find metal +``` + +The `metal` command must resolve to an Xcode path, not fail under standalone +Command Line Tools. 
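As a quick preflight before running the MLX exports above, the toolchain requirement can be checked from a script. The snippet below is an illustrative sketch, not part of this change: it only wraps the same `xcrun -sdk macosx --find metal` command shown above, and the "path inside an Xcode install" substring check is an assumption, not an ExecuTorch API.

```python
# Illustrative MLX/Metal toolchain preflight (not part of this diff).
# Wraps the `xcrun -sdk macosx --find metal` command from the step above and
# fails loudly when only the standalone Command Line Tools are installed.
import subprocess
import sys


def check_metal_toolchain() -> str:
    try:
        result = subprocess.run(
            ["xcrun", "-sdk", "macosx", "--find", "metal"],
            capture_output=True,
            text=True,
            check=True,
        )
    except (OSError, subprocess.CalledProcessError) as exc:
        sys.exit(f"`metal` not found; a full Xcode install is required: {exc}")
    path = result.stdout.strip()
    # Assumption: a usable toolchain resolves inside an Xcode.app bundle,
    # e.g. /Applications/Xcode.app/Contents/Developer/...; the standalone
    # Command Line Tools either fail above or resolve elsewhere.
    if "Xcode" not in path:
        sys.exit(f"`metal` resolved to {path}, which does not look like Xcode")
    return path


if __name__ == "__main__":
    print("metal compiler:", check_metal_toolchain())
```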
+ With ExecuTorch pybindings: ``` python -m examples.models.llama.runner.native \ @@ -72,7 +105,31 @@ python -m examples.models.llama.runner.native \ --temperature 0.3 ``` -With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) to build the runner): +With ExecuTorch pybindings and an LFM2.5 MLX export: +``` +python -m examples.models.llama.runner.native \ + --model lfm2_5_350m \ + --pte lfm2_5_350m_mlx_4w.pte \ + --tokenizer ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots//tokenizer.json \ + --tokenizer_config ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots//tokenizer_config.json \ + --prompt "<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \ + --params examples/models/lfm2/config/lfm2_5_350m_config.json \ + --max_len 128 \ + -kv \ + --temperature 0.3 +``` + +Find the Hugging Face cache snapshot directory with: +``` +python - <<'PY' +from pathlib import Path +root = Path.home() / ".cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots" +for path in root.glob("*/tokenizer.json"): + print(path.parent) +PY +``` + +With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your computer to validate](../llama/README.md#step-3-run-on-your-computer-to-validate) for general runner details): ``` cmake-out/examples/models/llama/llama_main \ --model_path lfm2_700m_8da4w.pte \ @@ -81,4 +138,18 @@ cmake-out/examples/models/llama/llama_main \ --temperature 0.3 ``` +Build the C++ runner with MLX support for LFM2.5: +``` +make lfm_2_5-mlx +``` + +Then run an LFM2.5 MLX export with the C++ runner: +``` +cmake-out/examples/models/llama/llama_main \ + --model_path lfm2_5_350m_mlx_4w.pte \ + --tokenizer_path ~/.cache/huggingface/hub/models--LiquidAI--LFM2.5-350M/snapshots//tokenizer.json \ + --prompt="<|startoftext|><|im_start|>user\nWho are you?<|im_end|>\n<|im_start|>assistant\n" \ + --temperature 0.3 +``` + To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section. 
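The pybindings and C++ runner invocations above repeat the same chat template and a tokenizer path buried in the Hugging Face cache. As a small illustration (not part of this diff), the sketch below assembles both in one place; the cache directory name and the choice of the 350M model are assumptions carried over from the examples above.

```python
# Illustrative helper (not part of this diff): builds the LFM2.5 chat prompt
# used in the runner examples above and locates tokenizer.json in the local
# Hugging Face cache, so both can be pasted into the command lines shown
# earlier.
from pathlib import Path

CHAT_TEMPLATE = (
    "<|startoftext|><|im_start|>user\n"
    "{message}<|im_end|>\n"
    "<|im_start|>assistant\n"
)


def find_tokenizer(repo_dir: str = "models--LiquidAI--LFM2.5-350M") -> Path:
    snapshots = Path.home() / ".cache/huggingface/hub" / repo_dir / "snapshots"
    matches = sorted(snapshots.glob("*/tokenizer.json"))
    if not matches:
        raise FileNotFoundError(f"no tokenizer.json under {snapshots}")
    return matches[-1]  # if several snapshots are cached, pick one


if __name__ == "__main__":
    print("--tokenizer_path", find_tokenizer())
    print("--prompt", repr(CHAT_TEMPLATE.format(message="Who are you?")))
```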
diff --git a/examples/models/lfm2/config/lfm2_5_350m_config.json b/examples/models/lfm2/config/lfm2_5_350m_config.json new file mode 100644 index 00000000000..975ccbccca7 --- /dev/null +++ b/examples/models/lfm2/config/lfm2_5_350m_config.json @@ -0,0 +1,33 @@ +{ + "dim": 1024, + "ffn_dim_multiplier": 1, + "hidden_dim": 4608, + "n_heads": 16, + "n_kv_heads": 8, + "n_layers": 16, + "norm_eps": 1e-5, + "rope_theta": 1000000.0, + "use_scaled_rope": false, + "vocab_size": 65536, + "use_hf_rope": true, + "use_qk_norm": true, + "qk_norm_before_rope": true, + "layer_types": [ + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv", + "full_attention", + "conv" + ] +} diff --git a/examples/models/lfm2/config/lfm2_mlx_4w.yaml b/examples/models/lfm2/config/lfm2_mlx_4w.yaml new file mode 100644 index 00000000000..fa7b16fd418 --- /dev/null +++ b/examples/models/lfm2/config/lfm2_mlx_4w.yaml @@ -0,0 +1,15 @@ +base: + metadata: '{"get_bos_id": 1, "get_eos_ids":[7]}' + +model: + use_kv_cache: True + use_sdpa_with_kv_cache: True + dtype_override: bf16 + +quantization: + qmode: 4w + group_size: 64 + +backend: + mlx: + enabled: True diff --git a/examples/models/lfm2/test_lfm2_5_mlx.py b/examples/models/lfm2/test_lfm2_5_mlx.py new file mode 100644 index 00000000000..598de11315c --- /dev/null +++ b/examples/models/lfm2/test_lfm2_5_mlx.py @@ -0,0 +1,102 @@ +import ast +import json +from pathlib import Path + +from omegaconf import OmegaConf + + +REPO_ROOT = Path(__file__).resolve().parents[3] +CONFIG_DIR = REPO_ROOT / "examples" / "models" / "lfm2" / "config" +EXPORT_LLAMA_LIB = REPO_ROOT / "examples" / "models" / "llama" / "export_llama_lib.py" +LLM_CONFIG = REPO_ROOT / "extension" / "llm" / "export" / "config" / "llm_config.py" + + +def _load_json_config(name: str) -> dict: + with open(CONFIG_DIR / name, "r") as f: + return json.load(f) + + +def _module_ast(path: Path) -> ast.Module: + return ast.parse(path.read_text()) + + +def _literal_assignment(module: ast.Module, name: str): + for node in module.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == name: + return ast.literal_eval(node.value) + raise AssertionError(f"{name} not found") + + +def _class_string_assignments(module: ast.Module, class_name: str) -> dict[str, str]: + for node in module.body: + if isinstance(node, ast.ClassDef) and node.name == class_name: + values = {} + for stmt in node.body: + if ( + isinstance(stmt, ast.Assign) + and len(stmt.targets) == 1 + and isinstance(stmt.targets[0], ast.Name) + ): + values[stmt.targets[0].id] = ast.literal_eval(stmt.value) + return values + raise AssertionError(f"{class_name} not found") + + +def test_lfm2_5_models_are_registered() -> None: + export_module = _module_ast(EXPORT_LLAMA_LIB) + model_types = _class_string_assignments(_module_ast(LLM_CONFIG), "ModelType") + executor_defined_models = _literal_assignment( + export_module, "EXECUTORCH_DEFINED_MODELS" + ) + hf_repo_ids = _literal_assignment(export_module, "HUGGING_FACE_REPO_IDS") + + assert "lfm2_5_350m" in executor_defined_models + assert "lfm2_5_1_2b" in executor_defined_models + assert model_types["lfm2_5_350m"] == "lfm2_5_350m" + assert model_types["lfm2_5_1_2b"] == "lfm2_5_1_2b" + assert hf_repo_ids["lfm2_5_350m"] == "LiquidAI/LFM2.5-350M" + assert hf_repo_ids["lfm2_5_1_2b"] == "LiquidAI/LFM2.5-1.2B-Instruct" + + +def 
test_lfm2_5_architecture_configs_match_expected_shapes() -> None: + expected = { + "lfm2_5_350m_config.json": { + "dim": 1024, + "hidden_dim": 4608, + "n_heads": 16, + "n_kv_heads": 8, + }, + "lfm2_5_1_2b_config.json": { + "dim": 2048, + "hidden_dim": 8192, + "n_heads": 32, + "n_kv_heads": 8, + }, + } + + for filename, expected_fields in expected.items(): + cfg = _load_json_config(filename) + for key, value in expected_fields.items(): + assert cfg[key] == value + assert cfg["n_layers"] == 16 + assert len(cfg["layer_types"]) == cfg["n_layers"] + assert cfg["layer_types"].count("full_attention") == 6 + assert cfg["layer_types"].count("conv") == 10 + assert cfg["vocab_size"] == 65536 + assert cfg["rope_theta"] == 1000000.0 + assert cfg["use_hf_rope"] is True + assert cfg["use_qk_norm"] is True + assert cfg["qk_norm_before_rope"] is True + + +def test_lfm2_mlx_config_enables_mlx_backend() -> None: + cfg = OmegaConf.load(CONFIG_DIR / "lfm2_mlx_4w.yaml") + assert cfg.base.metadata == '{"get_bos_id": 1, "get_eos_ids":[7]}' + assert cfg.model.use_kv_cache is True + assert cfg.model.use_sdpa_with_kv_cache is True + assert cfg.model.dtype_override == "bf16" + assert cfg.quantization.qmode == "4w" + assert cfg.quantization.group_size == 64 + assert cfg.backend.mlx.enabled is True diff --git a/examples/models/llama/CMakeLists.txt b/examples/models/llama/CMakeLists.txt index 6d5b5cc2566..a8c940ee228 100644 --- a/examples/models/llama/CMakeLists.txt +++ b/examples/models/llama/CMakeLists.txt @@ -261,3 +261,55 @@ elseif(UNIX) set_target_properties(llama_main PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'") endif() # Windows doesn't need rpath - DLLs are found via standard Windows search order + +# -------------------------------------------------------------------------- # +# LFM2.5 formatter helper (persistent companion process) +# +# Long-lived sibling of llama_main that wraps the same TextLLMRunner with a +# JSON-line stdin/stdout protocol. The macOS ExecuWhisper app keeps this +# binary warm across requests so the formatter model is loaded once per +# session. Build with `make lfm_2_5_formatter-mlx` from the repo root, or +# `cmake --workflow --preset llama-mlx` from this directory. 
+# -------------------------------------------------------------------------- # + +set(_formatter_helper_srcs + lfm25_formatter_helper.cpp lfm25_formatter_helper_protocol.cpp +) +set(_formatter_helper_include_directories + ${_common_include_directories} ${EXECUTORCH_ROOT}/third-party/json/include +) + +add_executable(lfm25_formatter_helper ${_formatter_helper_srcs}) + +if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL + "MinSizeRel" +) + target_link_options_gc_sections(lfm25_formatter_helper) + if(NOT APPLE) + target_link_options(lfm25_formatter_helper PRIVATE "LINKER:-s") + endif() +endif() + +target_include_directories( + lfm25_formatter_helper PUBLIC ${_formatter_helper_include_directories} +) +target_link_libraries( + lfm25_formatter_helper PUBLIC llama_runner ${link_libraries} +) +target_compile_options( + lfm25_formatter_helper PUBLIC ${_common_compile_options} +) + +if(TARGET mlxdelegate) + executorch_target_copy_mlx_metallib(lfm25_formatter_helper) +endif() + +if(APPLE) + target_link_options( + lfm25_formatter_helper PRIVATE -Wl,-rpath,@loader_path + ) +elseif(UNIX) + set_target_properties( + lfm25_formatter_helper PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'" + ) +endif() diff --git a/examples/models/llama/CMakePresets.json b/examples/models/llama/CMakePresets.json index f52b289523f..735f4e3907e 100644 --- a/examples/models/llama/CMakePresets.json +++ b/examples/models/llama/CMakePresets.json @@ -48,6 +48,21 @@ "string": "${hostSystemName}", "list": ["Linux", "Windows"] } + }, + { + "name": "llama-mlx", + "displayName": "Llama runner with MLX backend", + "binaryDir": "${sourceDir}/../../../cmake-out/examples/models/llama", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release", + "CMAKE_FIND_ROOT_PATH": "${sourceDir}/../../../cmake-out", + "EXECUTORCH_BUILD_MLX": "ON" + }, + "condition": { + "lhs": "${hostSystemName}", + "type": "equals", + "rhs": "Darwin" + } } ], "buildPresets": [ @@ -74,6 +89,12 @@ "displayName": "Build Llama runner with CUDA backend", "configurePreset": "llama-cuda", "targets": ["llama_main"] + }, + { + "name": "llama-mlx", + "displayName": "Build Llama runner with MLX backend", + "configurePreset": "llama-mlx", + "targets": ["llama_main", "lfm25_formatter_helper"] } ], "workflowPresets": [ @@ -132,6 +153,20 @@ "name": "llama-cuda" } ] + }, + { + "name": "llama-mlx", + "displayName": "Configure and build Llama runner with MLX backend", + "steps": [ + { + "type": "configure", + "name": "llama-mlx" + }, + { + "type": "build", + "name": "llama-mlx" + } + ] } ] } diff --git a/examples/models/llama/export_llama_lib.py b/examples/models/llama/export_llama_lib.py index 46c9113a211..d33953f8c41 100644 --- a/examples/models/llama/export_llama_lib.py +++ b/examples/models/llama/export_llama_lib.py @@ -115,6 +115,7 @@ "lfm2_350m", # hybrid "lfm2_700m", # hybrid "lfm2_1_2b", # hybrid + "lfm2_5_350m", # hybrid "lfm2_5_1_2b", # hybrid ] TORCHTUNE_DEFINED_MODELS = ["llama3_2_vision"] @@ -133,6 +134,7 @@ "lfm2_350m": "LiquidAI/LFM2-350M", "lfm2_700m": "LiquidAI/LFM2-700M", "lfm2_1_2b": "LiquidAI/LFM2-1.2B", + "lfm2_5_350m": "LiquidAI/LFM2.5-350M", "lfm2_5_1_2b": "LiquidAI/LFM2.5-1.2B-Instruct", } diff --git a/examples/models/llama/lfm25_formatter_helper.cpp b/examples/models/llama/lfm25_formatter_helper.cpp new file mode 100644 index 00000000000..d1683a5a168 --- /dev/null +++ b/examples/models/llama/lfm25_formatter_helper.cpp @@ -0,0 +1,277 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Persistent companion process for the LFM2.5 formatter model. +// +// Loads an `executorch::extension::llm::TextLLMRunner` once and stays alive, +// reading newline-delimited JSON `format` requests from stdin and writing +// `result`/`status`/`error` messages to stdout. The wire contract is in +// lfm25_formatter_helper_protocol.h. +// +// Built and run by the macOS ExecuWhisper app via `FormatterBridge.swift`, +// which expects the binary at +// ${EXECUTORCH_PATH}/cmake-out/examples/models/llama/lfm25_formatter_helper +// and the companion shader bundle at +// $(dirname binary)/mlx.metallib + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "lfm25_formatter_helper_protocol.h" + +DEFINE_string(model_path, "model.pte", "Path to LFM2.5 formatter model (.pte)."); +DEFINE_string( + tokenizer_path, + "tokenizer.json", + "Path to the HuggingFace-format tokenizer.json file."); +DEFINE_string( + tokenizer_config_path, + "tokenizer_config.json", + "Path to the HuggingFace-format tokenizer_config.json file (read by the " + "tokenizers crate when present in the same directory as tokenizer.json; " + "accepted here for symmetry with FormatterBridge.swift)."); +DEFINE_int32( + default_max_new_tokens, + 256, + "Fallback max_new_tokens when a request omits it. The Swift bridge always " + "sets max_new_tokens, so this is mostly a safety net."); + +namespace { + +namespace fp = lfm25_formatter::helper_protocol; + +// Run a single format request through the warm runner. Captures generated +// text via the token callback, captures stats via the stats callback, and +// computes a tokens_per_second figure for the response. +void format_text( + executorch::extension::llm::TextLLMRunner& runner, + const std::string& prompt, + int max_new_tokens, + double temperature, + std::string& text_out, + std::string& stdout_out, + std::string& stderr_out, + std::optional& tokens_per_second_out) { + text_out.clear(); + stdout_out.clear(); + stderr_out.clear(); + tokens_per_second_out.reset(); + + // Reset KV cache + stats so each request is independent. + runner.reset(); + + executorch::extension::llm::GenerationConfig config; + config.echo = false; + config.ignore_eos = false; + config.max_new_tokens = max_new_tokens; + config.temperature = static_cast(temperature); + + std::string accumulated; + std::optional last_stats; + + // The TextLLMRunner's text generator invokes the token callback for every + // produced token, including the EOS token (id 7 = "<|im_end|>") that + // signals end-of-generation. Without filtering, the literal "<|im_end|>" + // string ends up in the user-visible output. Filter known stop strings + // here so the rest of the pipeline doesn't have to. + static const std::vector kStopStrings = { + "<|im_end|>", "<|endoftext|>"}; + + // The runner unconditionally prints every generated token and a final + // PyTorchObserver stats line to stdout (see + // extension/llm/runner/text_llm_runner.cpp). That conflicts with our + // JSON-line wire protocol, which also writes to stdout, because the parent + // process treats every stdout line as a protocol message. Silence stdout + // for the duration of generate() by redirecting fd 1 to /dev/null, then + // restore the parent-facing pipe before we emit the protocol response. 
+ std::cout.flush(); + std::fflush(stdout); + int saved_stdout_fd = ::dup(STDOUT_FILENO); + int devnull_fd = ::open("/dev/null", O_WRONLY); + if (saved_stdout_fd >= 0 && devnull_fd >= 0) { + ::dup2(devnull_fd, STDOUT_FILENO); + ::close(devnull_fd); + } + + const auto err = runner.generate( + prompt, + config, + [&](const std::string& token_text) { + for (const auto& stop : kStopStrings) { + if (token_text == stop) { + return; + } + } + accumulated.append(token_text); + }, + [&](const executorch::extension::llm::Stats& stats) { + last_stats.emplace(stats); + }); + + // Restore the parent-facing stdout pipe so subsequent protocol writes + // (status, result, error) reach the parent process. + std::fflush(stdout); + if (saved_stdout_fd >= 0) { + ::dup2(saved_stdout_fd, STDOUT_FILENO); + ::close(saved_stdout_fd); + } + + if (err != ::executorch::runtime::Error::Ok) { + throw std::runtime_error( + "TextLLMRunner::generate returned non-Ok error code"); + } + + text_out = std::move(accumulated); + + if (last_stats.has_value()) { + stdout_out = + "PyTorchObserver " + + executorch::extension::llm::stats_to_json_string(*last_stats); + + const long inference_ms = + last_stats->inference_end_ms - last_stats->inference_start_ms; + if (inference_ms > 0 && last_stats->num_generated_tokens > 0) { + tokens_per_second_out = static_cast( + last_stats->num_generated_tokens) * + 1000.0 / static_cast(inference_ms); + } + } +} + +} // namespace + +int main(int argc, char** argv) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + + // tokenizer_config_path is documented above; reference it so the symbol is + // not stripped, and so an unsupported value at least surfaces in the log. + if (!FLAGS_tokenizer_config_path.empty()) { + ET_LOG( + Info, + "Tokenizer config path: %s", + FLAGS_tokenizer_config_path.c_str()); + } + + try { + auto tokenizer = ::executorch::extension::llm::load_tokenizer( + FLAGS_tokenizer_path); + if (!tokenizer || !tokenizer->is_loaded()) { + throw std::runtime_error( + "Failed to load tokenizer: " + FLAGS_tokenizer_path); + } + + auto runner = ::executorch::extension::llm::create_text_llm_runner( + FLAGS_model_path, std::move(tokenizer)); + if (!runner) { + throw std::runtime_error( + "Failed to construct TextLLMRunner from " + FLAGS_model_path); + } + if (runner->load() != ::executorch::runtime::Error::Ok) { + throw std::runtime_error( + "TextLLMRunner::load failed for " + FLAGS_model_path); + } + + if (!fp::write_message(std::cout, fp::encode_ready_message())) { + std::cerr << "Failed to write helper ready message." << std::endl; + return 1; + } + + while (true) { + fp::Request request; + std::string request_error; + if (!fp::read_request(std::cin, &request, &request_error)) { + if (request_error.empty()) { + // Clean EOF on stdin — graceful shutdown. + return 0; + } + fp::write_message( + std::cout, + fp::encode_error_message( + std::nullopt, + "Failed to read helper request", + request_error)); + return 1; + } + + if (request.type == fp::Request::Type::Shutdown) { + return 0; + } + + const auto& format_request = *request.format; + try { + if (format_request.prompt.empty()) { + throw std::runtime_error("Empty prompt."); + } + + const int max_new_tokens = format_request.max_new_tokens > 0 + ? 
format_request.max_new_tokens + : FLAGS_default_max_new_tokens; + + fp::write_message( + std::cout, + fp::encode_status_message( + format_request.request_id, + "formatting", + "Generating formatted text...")); + + std::string text; + std::string stdout_payload; + std::string stderr_payload; + std::optional tokens_per_second; + format_text( + *runner, + format_request.prompt, + max_new_tokens, + format_request.temperature, + text, + stdout_payload, + stderr_payload, + tokens_per_second); + + fp::write_message( + std::cout, + fp::encode_result_message( + format_request.request_id, + text, + stdout_payload, + stderr_payload, + tokens_per_second)); + } catch (const std::exception& e) { + fp::write_message( + std::cout, + fp::encode_error_message( + format_request.request_id, + "Helper formatting failed", + e.what())); + } + } + } catch (const std::exception& e) { + fp::write_message( + std::cout, + fp::encode_error_message( + std::nullopt, + "Failed to start LFM2.5 formatter helper", + e.what())); + return 1; + } +} diff --git a/examples/models/llama/lfm25_formatter_helper_protocol.cpp b/examples/models/llama/lfm25_formatter_helper_protocol.cpp new file mode 100644 index 00000000000..85da95c27dd --- /dev/null +++ b/examples/models/llama/lfm25_formatter_helper_protocol.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include "lfm25_formatter_helper_protocol.h" + +#include + +#include +#include +#include +#include + +namespace lfm25_formatter::helper_protocol { +namespace { + +using json = nlohmann::json; + +} // namespace + +bool read_request( + std::istream& input, + Request* request, + std::string* error_message) { + std::string header_line; + if (!std::getline(input, header_line)) { + return false; + } + if (header_line.empty()) { + if (error_message) { + *error_message = "Received empty helper request header."; + } + return false; + } + + json payload; + try { + payload = json::parse(header_line); + } catch (const std::exception& e) { + if (error_message) { + *error_message = + std::string("Failed to parse helper request: ") + e.what(); + } + return false; + } + + const std::string type = payload.value("type", ""); + if (payload.value("version", -1) != kProtocolVersion) { + if (error_message) { + *error_message = "Unsupported helper protocol version."; + } + return false; + } + + if (type == "shutdown") { + request->type = Request::Type::Shutdown; + request->format.reset(); + return true; + } + + if (type != "format") { + if (error_message) { + *error_message = "Unsupported helper request type: " + type; + } + return false; + } + + if (!payload.contains("prompt") || !payload["prompt"].is_string()) { + if (error_message) { + *error_message = "Missing helper prompt field."; + } + return false; + } + + FormatRequest format_request; + format_request.request_id = payload.value("request_id", ""); + format_request.prompt = payload.value("prompt", ""); + format_request.max_new_tokens = payload.value("max_new_tokens", 0); + format_request.temperature = payload.value("temperature", 0.0); + + request->type = Request::Type::Format; + request->format = format_request; + return true; +} + +std::string encode_ready_message() { + return json{{"type", "ready"}, {"version", kProtocolVersion}}.dump(); +} + +std::string encode_status_message( + const std::optional& request_id, + const std::string& phase, + 
const std::string& message) { + json payload = { + {"type", "status"}, + {"version", kProtocolVersion}, + {"phase", phase}, + {"message", message}, + }; + if (request_id.has_value()) { + payload["request_id"] = *request_id; + } + return payload.dump(); +} + +std::string encode_result_message( + const std::string& request_id, + const std::string& text, + const std::string& stdout_payload, + const std::string& stderr_payload, + const std::optional& tokens_per_second) { + json payload = { + {"type", "result"}, + {"version", kProtocolVersion}, + {"request_id", request_id}, + {"text", text}, + {"stdout", stdout_payload}, + {"stderr", stderr_payload}, + }; + if (tokens_per_second.has_value()) { + payload["tokens_per_second"] = *tokens_per_second; + } + return payload.dump(); +} + +std::string encode_error_message( + const std::optional& request_id, + const std::string& message, + const std::optional& details) { + json payload = { + {"type", "error"}, + {"version", kProtocolVersion}, + {"message", message}, + }; + if (request_id.has_value()) { + payload["request_id"] = *request_id; + } + if (details.has_value()) { + payload["details"] = *details; + } + return payload.dump(); +} + +bool write_message(std::ostream& output, const std::string& line) { + output << line << '\n'; + output.flush(); + return output.good(); +} + +} // namespace lfm25_formatter::helper_protocol diff --git a/examples/models/llama/lfm25_formatter_helper_protocol.h b/examples/models/llama/lfm25_formatter_helper_protocol.h new file mode 100644 index 00000000000..a1cb4001142 --- /dev/null +++ b/examples/models/llama/lfm25_formatter_helper_protocol.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// LFM2.5 formatter helper: a long-lived companion process that wraps an +// `executorch::extension::llm::TextLLMRunner` with a JSON-line stdin/stdout +// protocol. The macOS ExecuWhisper app launches this binary once per app +// session, sends a `format` request, and reads the rewritten dictation off +// stdout — preserving the model's KV cache and warm state across requests. +// +// Wire contract (kProtocolVersion=1): +// +// Requests (one JSON object per line, then optional payload): +// {"type": "format", "version": 1, +// "request_id": "", +// "prompt": "", +// "max_new_tokens": , +// "temperature": } +// {"type": "shutdown", "version": 1} +// +// Responses (one JSON object per line): +// {"type": "ready", "version": 1} // emitted once at startup +// {"type": "status", "version": 1, "request_id": ..., +// "phase": "", "message": ""} // optional progress updates +// {"type": "result", "version": 1, "request_id": ..., +// "text": "", "stdout": "", "stderr": "", +// "tokens_per_second": } // success +// {"type": "error", "version": 1, "request_id": , +// "message": "", "details": } // failure +// +// The Swift wire contract this matches lives at +// ExecuWhisper/Services/FormatterHelperProtocol.swift +// in the internal-llama-cookbook ExecuWhisper app. 
+ +#pragma once + +#include +#include +#include +#include +#include + +namespace lfm25_formatter::helper_protocol { + +constexpr int kProtocolVersion = 1; + +struct FormatRequest { + std::string request_id; + std::string prompt; + int max_new_tokens = 0; + double temperature = 0.0; +}; + +struct Request { + enum class Type { + Format, + Shutdown, + }; + + Type type = Type::Shutdown; + std::optional format; +}; + +bool read_request( + std::istream& input, + Request* request, + std::string* error_message); + +std::string encode_ready_message(); +std::string encode_status_message( + const std::optional& request_id, + const std::string& phase, + const std::string& message); +std::string encode_result_message( + const std::string& request_id, + const std::string& text, + const std::string& stdout_payload, + const std::string& stderr_payload, + const std::optional& tokens_per_second); +std::string encode_error_message( + const std::optional& request_id, + const std::string& message, + const std::optional& details); + +bool write_message(std::ostream& output, const std::string& line); + +} // namespace lfm25_formatter::helper_protocol diff --git a/extension/llm/export/config/llm_config.py b/extension/llm/export/config/llm_config.py index fa22ddad7ac..01afb5bee18 100644 --- a/extension/llm/export/config/llm_config.py +++ b/extension/llm/export/config/llm_config.py @@ -52,6 +52,7 @@ class ModelType(str, Enum): lfm2_350m = "lfm2_350m" lfm2_700m = "lfm2_700m" lfm2_1_2b = "lfm2_1_2b" + lfm2_5_350m = "lfm2_5_350m" lfm2_5_1_2b = "lfm2_5_1_2b"
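For completeness, the JSON-line wire contract documented in `lfm25_formatter_helper_protocol.h` can be exercised without the Swift app. The client below is a minimal sketch for illustration only: it assumes the helper was built via `make lfm_2_5-mlx` at the path shown in the Makefile target, and the model and tokenizer flag values are placeholders rather than files shipped with this change.

```python
# Minimal illustrative client for the lfm25_formatter_helper protocol
# (version 1). Not part of this diff; the real client is
# FormatterBridge.swift in the ExecuWhisper app. Flag values are placeholders.
import json
import subprocess

HELPER = "cmake-out/examples/models/llama/lfm25_formatter_helper"

proc = subprocess.Popen(
    [
        HELPER,
        "--model_path=lfm2_5_350m_mlx_4w.pte",  # placeholder export
        "--tokenizer_path=tokenizer.json",      # placeholder path
    ],
    stdin=subprocess.PIPE,
    stdout=subprocess.PIPE,
    text=True,
)


def send(message: dict) -> None:
    proc.stdin.write(json.dumps(message) + "\n")
    proc.stdin.flush()


# The helper emits {"type": "ready", "version": 1} once the model is loaded.
assert json.loads(proc.stdout.readline())["type"] == "ready"

send({
    "type": "format",
    "version": 1,
    "request_id": "req-1",
    "prompt": "<|startoftext|><|im_start|>user\nfix this text<|im_end|>\n"
              "<|im_start|>assistant\n",
    "max_new_tokens": 128,
    "temperature": 0.3,
})

# Read protocol messages until the result or an error arrives; "status" lines
# are optional progress updates and can be skipped.
while True:
    reply = json.loads(proc.stdout.readline())
    if reply["type"] == "result":
        print(reply["text"], reply.get("tokens_per_second"))
        break
    if reply["type"] == "error":
        raise RuntimeError(reply["message"])

send({"type": "shutdown", "version": 1})
proc.wait()
```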