11 changes: 10 additions & 1 deletion CMakeLists.txt
@@ -103,6 +103,7 @@ include(${PROJECT_SOURCE_DIR}/tools/cmake/Codegen.cmake)
include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
include(CMakeDependentOption)
include(ExternalProject)
include(FetchContent)
include(GNUInstallDirs)

if(NOT CMAKE_CXX_STANDARD)
@@ -406,6 +407,14 @@ set(_common_include_directories
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/executorch/runtime/core/portable_type/c10>
)

if(TARGET jinja2cpp)
  install(
    TARGETS jinja2cpp
    EXPORT ExecuTorchTargets
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  )
endif()

#
# The `_<target>_srcs` lists are defined by executorch_load_build_variables.
#
@@ -803,7 +812,7 @@ endif()

if(EXECUTORCH_BUILD_EXTENSION_LLM)
if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
set(SUPPORT_REGEX_LOOKAHEAD ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/chat_template)
# llama/runner/CMakeLists.txt builds a shared library libllama_runner.so
# that transitively depends on tokenizers. Need to build tokenizers with
# -fPIC.
51 changes: 24 additions & 27 deletions examples/models/llama/CMakeLists.txt
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -48,6 +48,28 @@

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Jinja2Cpp is required by the chat formatter that the llama runner links
# against. When this example CMake file is run standalone (i.e. without the
# parent ExecuTorch build that already pulls Jinja2Cpp from
# extension/llm/chat_template), declare it here as well so the target exists.
# This guard prevents redeclaring it when the parent build has already added
# the dependency.
if(NOT TARGET jinja2cpp)
  include(FetchContent)
  cmake_policy(SET CMP0077 NEW)
  FetchContent_Declare(
    jinja2cpp
    GIT_REPOSITORY https://github.com/jinja2cpp/Jinja2Cpp.git
    GIT_TAG 1.3.2
    GIT_SUBMODULES_RECURSE TRUE
  )
  set(JINJA2CPP_BUILD_TESTS OFF CACHE BOOL "" FORCE)
  set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "" FORCE)
  set(JINJA2CPP_INSTALL OFF CACHE BOOL "" FORCE)
  set(SUPPORT_REGEX_LOOKAHEAD ON CACHE BOOL "" FORCE)
  FetchContent_MakeAvailable(jinja2cpp)
endif()

if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()
@@ -107,13 +129,8 @@
endif()

# quantized_ops_lib: Register quantized op kernels into the runtime
if(TARGET quantized_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_ops_lib)
get_target_property(_quantized_imported quantized_ops_lib IMPORTED)
if(NOT _quantized_imported)
executorch_target_link_options_shared_lib(quantized_ops_lib)
endif()
endif()
executorch_target_link_options_shared_lib(quantized_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_ops_lib)

if(TARGET custom_ops)
executorch_target_link_options_shared_lib(custom_ops)
@@ -168,15 +185,6 @@
executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# CUDA backend
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda_backend)
if(NOT MSVC)
executorch_target_link_options_shared_lib(aoti_cuda_backend)
endif()
endif()

# Vulkan backend
if(TARGET vulkan_backend)
list(APPEND link_libraries vulkan_backend)
@@ -203,12 +211,6 @@
executorch_target_link_options_shared_lib(mpsdelegate)
endif()

# MLX backend
if(TARGET mlxdelegate)
list(APPEND link_libraries mlxdelegate mlx)
executorch_target_link_options_shared_lib(mlxdelegate)
endif()

# Openvino backend
if(TARGET openvino_backend)
find_package(OpenVINO REQUIRED)
@@ -237,11 +239,6 @@

add_executable(llama_main ${_srcs})

# Copy MLX metallib for runtime if MLX delegate is enabled
if(TARGET mlxdelegate)
executorch_target_copy_mlx_metallib(llama_main)
endif()

# Only strip symbols for Release and MinSizeRel builds.
if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL
"MinSizeRel"
90 changes: 86 additions & 4 deletions examples/models/llama/README.md
@@ -238,6 +238,10 @@ If you're interested in deploying on non-CPU backends, [please refer the non-cpu
```
cmake --workflow llm-release
```
If you build with `make llama-cpu` and hit a RapidJSON CMake error, run it as:
```
CMAKE_POLICY_VERSION_MINIMUM=3.5 make llama-cpu
```
Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the section of Common Issues and Mitigations below for solutions.

2. Build llama runner.
@@ -252,6 +256,87 @@ popd
cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
```

### Chat Format for Instruct Models

For **Instruct models** (e.g., Llama-3.2-1B-Instruct), use either
`--chat_format` (built-in) or `--chat_template_file` (any HuggingFace /
vLLM-style Jinja template) to wrap your prompt in the appropriate chat
template. Without this, Instruct models may not emit an end-of-turn token
and will keep generating until the maximum number of tokens is reached.

#### Universal Jinja templates (recommended)

The runner supports **any HuggingFace / vLLM-style Jinja2 template** via
`--chat_template_file`. Templates from
[vLLM's examples directory](https://github.com/vllm-project/vllm/tree/main/examples)
or HuggingFace `tokenizer_config.json` files work out of the box:

```bash
# Use any Jinja template from vLLM, HuggingFace, or your own:
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_template_file=path/to/template.jinja \
--prompt="What is the capital of France?"
```

#### Built-in formats (convenience)

```bash
# Basic usage with chat format
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_format=llama3 \
--prompt="What is the capital of France?"
```

**Template/model compatibility:**
- Use Llama templates (`llama3` or the Llama vLLM template) with Llama models.
- Using a Gemma template with a Llama model will cause the model to echo Gemma tokens.

**Supported chat formats:**
| Format | Models | Template Style |
|--------|--------|----------------|
| `llama3` | Llama 3.x Instruct | `<\|begin_of_text\|><\|start_header_id\|>user...` |
| `gemma3` | Gemma 3 Instruct | `<bos><start_of_turn>user...` |
| `jinja` | Custom template | Jinja2 chat template from file (requires `--chat_template_file`) |
| `none` | Base models (default) | No formatting |

**Additional options:**
| Flag | Description | Default |
|------|-------------|---------|
| `--chat_format` | Chat template format (llama3, gemma3, jinja, none) | `none` |
| `--chat_template_file` | Path to custom Jinja2 template (overrides `--chat_format`) | (empty) |
| `--system_prompt` | System prompt to set assistant behavior | (empty) |
| `--echo` | Echo input prompt in output (set to false for clean output) | `true` |

**Example with system prompt and clean output:**
```bash
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_format=llama3 \
--system_prompt="You are a helpful assistant. Be concise." \
--echo=false \
--prompt="What is the capital of France?"

# Output: The capital of France is Paris.
```

**Example with a custom template file:**
```bash
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_template_file=./my_template.jinja \
--prompt="Hello!"
```
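
If you do not already have a template file, the sketch below illustrates the general shape of an HF/vLLM-style template. It is an illustrative assumption, not the official Llama 3 template: the `<|...|>` special tokens must match the tokens your model was trained with, and the variables (`messages`, `bos_token`, `add_generation_prompt`) are the standard HuggingFace chat-template inputs that such templates expect. Templates published in [vLLM's examples directory](https://github.com/vllm-project/vllm/tree/main/examples) or embedded in a model's `tokenizer_config.json` are usually a better starting point than writing one by hand.

```jinja
{#- Illustrative sketch only, not the official Llama 3 template. The special
    tokens below are assumptions and must match your model's tokenizer;
    `messages`, `bos_token`, and `add_generation_prompt` follow the standard
    HuggingFace chat-template convention. -#}
{{ bos_token }}
{%- for message in messages %}
<|start_header_id|>{{ message['role'] }}<|end_header_id|>

{{ message['content'] | trim }}<|eot_id|>
{%- endfor %}
{%- if add_generation_prompt %}
<|start_header_id|>assistant<|end_header_id|>

{%- endif %}
```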

**Build note:** If you see a CMake error about RapidJSON requiring
`CMAKE_POLICY_VERSION_MINIMUM=3.5`, set that variable in your build
environment when running `make llama-cpu`.

To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`

If you see an error about "RE2 failed to compile pattern with lookahead:...SUPPORT_REGEX_LOOKAHEAD=ON", add "-DSUPPORT_REGEX_LOOKAHEAD=ON" when building the runner.
@@ -277,9 +362,6 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
-DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DPYTHON_EXECUTABLE=python \
-DEXECUTORCH_BUILD_XNNPACK=ON \
Expand All @@ -303,7 +385,7 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-DSUPPORT_REGEX_LOOKAHEAD=ON \
-DSUPPORT_REGEX_LOOKAHEAD=ON
-Bcmake-out-android/examples/models/llama \
examples/models/llama

64 changes: 64 additions & 0 deletions examples/models/llama/main.cpp
@@ -1,3 +1,3 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
@@ -7,6 +7,7 @@
* @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
*/

#include <executorch/examples/models/llama/runner/chat_formatter.h>
#include <executorch/examples/models/llama/runner/runner.h>
#include <gflags/gflags.h>
#include <fstream>
@@ -87,6 +88,27 @@
"forward",
"Method name to execute in the model (e.g., 'forward', 'lora_forward').");

DEFINE_string(
chat_format,
"none",
"Chat template format for Instruct models. Supported formats: llama3, gemma3, jinja, none (default: none). "
"When set, the prompt will be wrapped in the appropriate chat template.");

DEFINE_string(
chat_template_file,
"",
"Path to a custom Jinja2 chat template file. Overrides --chat_format.");

DEFINE_string(
system_prompt,
"",
"System prompt for chat format (optional). Sets the behavior/personality of the assistant.");

DEFINE_bool(
echo,
true,
"Echo the input prompt in the output. Set to false to only show generated text.");

// Helper function to parse comma-separated string lists
std::vector<std::string> parseStringList(const std::string& input) {
std::vector<std::string> result;
@@ -143,6 +165,47 @@
prompt = prompt_storage.c_str();
}

  // Parse chat format and create formatter
  auto chat_format = example::parse_chat_format(FLAGS_chat_format);
  std::unique_ptr<example::ChatFormatter> chat_formatter;
  try {
    chat_formatter =
        example::create_chat_formatter(chat_format, FLAGS_chat_template_file);
  } catch (const std::invalid_argument& ex) {
    ET_LOG(Error, "Invalid chat format configuration: %s", ex.what());
    return 1;
  } catch (const std::exception& ex) {
    ET_LOG(Error, "Failed to load chat template: %s", ex.what());
    return 1;
  }
  const bool using_chat_template =
      chat_format != example::ChatFormat::None ||
      !FLAGS_chat_template_file.empty();

  // Apply chat formatting to the prompt (no-op when chat_format=none and no
  // template file provided).
  std::string formatted_prompt =
      chat_formatter->format(prompt, FLAGS_system_prompt);
  prompt = formatted_prompt.c_str();

  if (using_chat_template) {
    if (!FLAGS_chat_template_file.empty()) {
      ET_LOG(
          Info,
          "Using chat template file: %s",
          FLAGS_chat_template_file.c_str());
    } else {
      ET_LOG(Info, "Using chat format: %s", FLAGS_chat_format.c_str());
    }
    if (FLAGS_num_bos > 0 && chat_formatter->includes_bos()) {
      ET_LOG(
          Info,
          "Note: Chat format '%s' already includes BOS token. "
          "Consider setting --num_bos=0 to avoid duplicate BOS tokens.",
          FLAGS_chat_format.c_str());
    }
  }

float temperature = FLAGS_temperature;

int32_t seq_len = FLAGS_seq_len;
@@ -200,6 +263,7 @@
}
// generate
executorch::extension::llm::GenerationConfig config{
.echo = FLAGS_echo,
.temperature = temperature};

config.ignore_eos = FLAGS_ignore_eos;
1 change: 1 addition & 0 deletions examples/models/llama/runner/CMakeLists.txt
@@ -53,6 +53,7 @@ set(llama_runner_deps

target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
target_link_libraries(llama_runner PUBLIC tokenizers::tokenizers)
target_link_libraries(llama_runner PRIVATE jinja2cpp)

target_include_directories(
llama_runner PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include