11 changes: 10 additions & 1 deletion CMakeLists.txt
@@ -103,6 +103,7 @@ include(${PROJECT_SOURCE_DIR}/tools/cmake/Codegen.cmake)
include(${PROJECT_SOURCE_DIR}/tools/cmake/Utils.cmake)
include(CMakeDependentOption)
include(ExternalProject)
include(FetchContent)
include(GNUInstallDirs)

if(NOT CMAKE_CXX_STANDARD)
@@ -406,6 +407,14 @@ set(_common_include_directories
$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/executorch/runtime/core/portable_type/c10>
)

if(TARGET jinja2cpp)
  install(
    TARGETS jinja2cpp
    EXPORT ExecuTorchTargets
    DESTINATION ${CMAKE_INSTALL_LIBDIR}
  )
endif()

#
# The `_<target>_srcs` lists are defined by executorch_load_build_variables.
#
@@ -803,7 +812,7 @@ endif()

if(EXECUTORCH_BUILD_EXTENSION_LLM)
if(EXECUTORCH_BUILD_EXTENSION_LLM_RUNNER)
set(SUPPORT_REGEX_LOOKAHEAD ON)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/chat_template)
# llama/runner/CMakeLists.txt builds a shared library libllama_runner.so
# that transitively depends on tokenizers. Need to build tokenizers with
# -fPIC.
51 changes: 24 additions & 27 deletions examples/models/llama/CMakeLists.txt
@@ -1,3 +1,3 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
@@ -48,6 +48,28 @@

include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake)

# Jinja2Cpp is required by the chat formatter that the llama runner links
# against. When this example CMake file is run standalone (i.e. without the
# parent ExecuTorch build that already pulls Jinja2Cpp from
# extension/llm/chat_template), declare it here as well so the target exists.
# This guard prevents redeclaring it when the parent build has already added
# the dependency.
if(NOT TARGET jinja2cpp)
  include(FetchContent)
  cmake_policy(SET CMP0077 NEW)
  FetchContent_Declare(
    jinja2cpp
    GIT_REPOSITORY https://github.com/jinja2cpp/Jinja2Cpp.git
    GIT_TAG 1.3.2
    GIT_SUBMODULES_RECURSE TRUE
  )
  set(JINJA2CPP_BUILD_TESTS OFF CACHE BOOL "" FORCE)
  set(JINJA2CPP_BUILD_SHARED OFF CACHE BOOL "" FORCE)
  set(JINJA2CPP_INSTALL OFF CACHE BOOL "" FORCE)
  set(SUPPORT_REGEX_LOOKAHEAD ON CACHE BOOL "" FORCE)
  FetchContent_MakeAvailable(jinja2cpp)
endif()

if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
endif()
@@ -107,13 +129,8 @@
endif()

# quantized_ops_lib: Register quantized op kernels into the runtime
if(TARGET quantized_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_ops_lib)
get_target_property(_quantized_imported quantized_ops_lib IMPORTED)
if(NOT _quantized_imported)
executorch_target_link_options_shared_lib(quantized_ops_lib)
endif()
endif()
executorch_target_link_options_shared_lib(quantized_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_ops_lib)

if(TARGET custom_ops)
executorch_target_link_options_shared_lib(custom_ops)
@@ -168,15 +185,6 @@
executorch_target_link_options_shared_lib(xnnpack_backend)
endif()

# CUDA backend
if(EXECUTORCH_BUILD_CUDA)
find_package(CUDAToolkit REQUIRED)
list(APPEND link_libraries aoti_cuda_backend)
if(NOT MSVC)
executorch_target_link_options_shared_lib(aoti_cuda_backend)
endif()
endif()

# Vulkan backend
if(TARGET vulkan_backend)
list(APPEND link_libraries vulkan_backend)
@@ -203,12 +211,6 @@
executorch_target_link_options_shared_lib(mpsdelegate)
endif()

# MLX backend
if(TARGET mlxdelegate)
list(APPEND link_libraries mlxdelegate mlx)
executorch_target_link_options_shared_lib(mlxdelegate)
endif()

# Openvino backend
if(TARGET openvino_backend)
find_package(OpenVINO REQUIRED)
@@ -237,11 +239,6 @@

add_executable(llama_main ${_srcs})

# Copy MLX metallib for runtime if MLX delegate is enabled
if(TARGET mlxdelegate)
executorch_target_copy_mlx_metallib(llama_main)
endif()

# Only strip symbols for Release and MinSizeRel builds.
if(CMAKE_BUILD_TYPE STREQUAL "Release" OR CMAKE_BUILD_TYPE STREQUAL
"MinSizeRel"
90 changes: 86 additions & 4 deletions examples/models/llama/README.md
@@ -238,6 +238,10 @@ If you're interested in deploying on non-CPU backends, [please refer the non-cpu
```
cmake --workflow llm-release
```
If you build with `make llama-cpu` and hit a RapidJSON CMake error, run it as:
```
CMAKE_POLICY_VERSION_MINIMUM=3.5 make llama-cpu
```
Note for Mac users: There's a known linking issue with Xcode 15.1. Refer to the section of Common Issues and Mitigations below for solutions.

2. Build llama runner.
@@ -252,6 +256,87 @@ popd
cmake-out/examples/models/llama/llama_main --model_path=<model pte file> --tokenizer_path=<tokenizer.model> --prompt=<prompt>
```

### Chat Format for Instruct Models

For **Instruct models** (e.g., Llama-3.2-1B-Instruct), use either
`--chat_format` (built-in) or `--chat_template_file` (any HuggingFace /
vLLM-style Jinja template) to wrap your prompt in the appropriate chat
template. Without this, Instruct models may not emit an end-of-turn token
and will keep generating until the maximum number of tokens is reached.

#### Universal Jinja templates (recommended)

The runner supports **any HuggingFace / vLLM-style Jinja2 template** via
`--chat_template_file`. Templates from
[vLLM's examples directory](https://github.com/vllm-project/vllm/tree/main/examples)
or HuggingFace `tokenizer_config.json` files work out of the box:

```bash
# Use any Jinja template from vLLM, HuggingFace, or your own:
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_template_file=path/to/template.jinja \
--prompt="What is the capital of France?"
```

#### Built-in formats (convenience)

```bash
# Basic usage with chat format
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_format=llama3 \
--prompt="What is the capital of France?"
```

**Template/model compatibility:**
- Use Llama templates (`llama3` or the Llama vLLM template) with Llama models.
- Using a Gemma template with a Llama model will cause the model to echo Gemma tokens.

**Supported chat formats:**
| Format | Models | Template Style |
|--------|--------|----------------|
| `llama3` | Llama 3.x Instruct | `<\|begin_of_text\|><\|start_header_id\|>user...` |
| `gemma3` | Gemma 3 Instruct | `<bos><start_of_turn>user...` |
| `jinja` | Custom template | Jinja2 chat template from file (requires `--chat_template_file`) |
| `none` | Base models (default) | No formatting |

**Additional options:**
| Flag | Description | Default |
|------|-------------|---------|
| `--chat_format` | Chat template format (llama3, gemma3, jinja, none) | `none` |
| `--chat_template_file` | Path to custom Jinja2 template (overrides `--chat_format`) | (empty) |
| `--system_prompt` | System prompt to set assistant behavior | (empty) |
| `--echo` | Echo input prompt in output (set to false for clean output) | `true` |

**Example with system prompt and clean output:**
```bash
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_format=llama3 \
--system_prompt="You are a helpful assistant. Be concise." \
--echo=false \
--prompt="What is the capital of France?"

# Output: The capital of France is Paris.
```

**Example with a custom template file:**
```bash
cmake-out/examples/models/llama/llama_main \
--model_path=<model.pte> \
--tokenizer_path=<tokenizer.model> \
--chat_template_file=./my_template.jinja \
--prompt="Hello!"
```
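
If you do not already have a template file, the sketch below illustrates the general shape of an HF/vLLM-style template. It is an illustrative assumption, not the official Llama 3 template: the `<|...|>` special tokens must match the tokens your model was trained with, and the variables (`messages`, `bos_token`, `add_generation_prompt`) are the standard HuggingFace chat-template inputs that such templates expect. Templates published in [vLLM's examples directory](https://github.com/vllm-project/vllm/tree/main/examples) or embedded in a model's `tokenizer_config.json` are usually a better starting point than writing one by hand.

```jinja
{#- Illustrative sketch only, not the official Llama 3 template. The special
    tokens below are assumptions and must match your model's tokenizer;
    `messages`, `bos_token`, and `add_generation_prompt` follow the standard
    HuggingFace chat-template convention. -#}
{{ bos_token }}
{%- for message in messages %}
<|start_header_id|>{{ message['role'] }}<|end_header_id|>

{{ message['content'] | trim }}<|eot_id|>
{%- endfor %}
{%- if add_generation_prompt %}
<|start_header_id|>assistant<|end_header_id|>

{%- endif %}
```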

**Build note:** If you see a CMake error about RapidJSON requiring
`CMAKE_POLICY_VERSION_MINIMUM=3.5`, set that variable in your build
environment when running `make llama-cpu`.

To build for CoreML backend and validate on Mac, replace `-DEXECUTORCH_BUILD_XNNPACK=ON` with `-DEXECUTORCH_BUILD_COREML=ON`

If you see an error about "RE2 failed to compile pattern with lookahead:...SUPPORT_REGEX_LOOKAHEAD=ON", add "-DSUPPORT_REGEX_LOOKAHEAD=ON" when building the runner.
@@ -277,9 +362,6 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-DEXECUTORCH_BUILD_EXTENSION_NAMED_DATA_MAP=ON \
-DEXECUTORCH_BUILD_EXTENSION_LLM=ON \
-DEXECUTORCH_BUILD_EXTENSION_LLM_RUNNER=ON \
-DEXECUTORCH_ENABLE_LOGGING=1 \
-DPYTHON_EXECUTABLE=python \
-DEXECUTORCH_BUILD_XNNPACK=ON \
Expand All @@ -303,7 +385,7 @@ cmake -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
-DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-DEXECUTORCH_BUILD_KERNELS_LLM=ON \
-DSUPPORT_REGEX_LOOKAHEAD=ON \
-DSUPPORT_REGEX_LOOKAHEAD=ON
-Bcmake-out-android/examples/models/llama \
examples/models/llama

64 changes: 64 additions & 0 deletions examples/models/llama/main.cpp
@@ -1,3 +1,3 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
@@ -7,6 +7,7 @@
* @lint-ignore-every CLANGTIDY facebook-hte-Deprecated
*/

#include <executorch/examples/models/llama/runner/chat_formatter.h>
#include <executorch/examples/models/llama/runner/runner.h>
#include <gflags/gflags.h>
#include <fstream>
@@ -87,6 +88,27 @@
"forward",
"Method name to execute in the model (e.g., 'forward', 'lora_forward').");

DEFINE_string(
chat_format,
"none",
"Chat template format for Instruct models. Supported formats: llama3, gemma3, jinja, none (default: none). "
"When set, the prompt will be wrapped in the appropriate chat template.");

DEFINE_string(
chat_template_file,
"",
"Path to a custom Jinja2 chat template file. Overrides --chat_format.");

DEFINE_string(
system_prompt,
"",
"System prompt for chat format (optional). Sets the behavior/personality of the assistant.");

DEFINE_bool(
echo,
true,
"Echo the input prompt in the output. Set to false to only show generated text.");

// Helper function to parse comma-separated string lists
std::vector<std::string> parseStringList(const std::string& input) {
std::vector<std::string> result;
@@ -143,6 +165,47 @@
prompt = prompt_storage.c_str();
}

  // Parse chat format and create formatter
  auto chat_format = example::parse_chat_format(FLAGS_chat_format);
  std::unique_ptr<example::ChatFormatter> chat_formatter;
  try {
    chat_formatter =
        example::create_chat_formatter(chat_format, FLAGS_chat_template_file);
  } catch (const std::invalid_argument& ex) {
    ET_LOG(Error, "Invalid chat format configuration: %s", ex.what());
    return 1;
  } catch (const std::exception& ex) {
    ET_LOG(Error, "Failed to load chat template: %s", ex.what());
    return 1;
  }
  const bool using_chat_template =
      chat_format != example::ChatFormat::None ||
      !FLAGS_chat_template_file.empty();

  // Apply chat formatting to the prompt (no-op when chat_format=none and no
  // template file provided).
  std::string formatted_prompt =
      chat_formatter->format(prompt, FLAGS_system_prompt);
  prompt = formatted_prompt.c_str();

  if (using_chat_template) {
    if (!FLAGS_chat_template_file.empty()) {
      ET_LOG(
          Info,
          "Using chat template file: %s",
          FLAGS_chat_template_file.c_str());
    } else {
      ET_LOG(Info, "Using chat format: %s", FLAGS_chat_format.c_str());
    }
    if (FLAGS_num_bos > 0 && chat_formatter->includes_bos()) {
      ET_LOG(
          Info,
          "Note: Chat format '%s' already includes BOS token. "
          "Consider setting --num_bos=0 to avoid duplicate BOS tokens.",
          FLAGS_chat_format.c_str());
    }
  }

float temperature = FLAGS_temperature;

int32_t seq_len = FLAGS_seq_len;
@@ -200,6 +263,7 @@
}
// generate
executorch::extension::llm::GenerationConfig config{
.echo = FLAGS_echo,
.temperature = temperature};

config.ignore_eos = FLAGS_ignore_eos;
1 change: 1 addition & 0 deletions examples/models/llama/runner/CMakeLists.txt
@@ -53,6 +53,7 @@ set(llama_runner_deps

target_link_libraries(llama_runner PUBLIC ${llama_runner_deps})
target_link_libraries(llama_runner PUBLIC tokenizers::tokenizers)
target_link_libraries(llama_runner PRIVATE jinja2cpp)

target_include_directories(
llama_runner PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include