From a83e7c479568df009375a0154b00123abcf585c7 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Fri, 22 May 2026 12:20:46 -0700 Subject: [PATCH 001/103] Fix 2 broken tests caused by D105910457 Differential Revision: D105973185 Pull Request resolved: https://github.com/pytorch/executorch/pull/19736 --- backends/vulkan/test/op_tests/utils/gen_computegraph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/vulkan/test/op_tests/utils/gen_computegraph.py b/backends/vulkan/test/op_tests/utils/gen_computegraph.py index a09b4d36b18..507719b8555 100644 --- a/backends/vulkan/test/op_tests/utils/gen_computegraph.py +++ b/backends/vulkan/test/op_tests/utils/gen_computegraph.py @@ -286,7 +286,7 @@ def create_aten_fn_call(self) -> str: def create_aten_method_call(self) -> str: # For functions with only Method variant, we fallback to the function # declared in MethodOperators.h - cpp_sig = gen_static_dispatch_backend_call_signature(self.f_sig, self.f) + cpp_sig = gen_static_dispatch_backend_call_signature(self.f) exprs = translate_args(self.f_sig, cpp_sig) func_call = f"at::_ops::{self.f_sig.name()}::call({exprs});" return func_call From ec764702419ddc62570c06a282cb34f6d0ed0172 Mon Sep 17 00:00:00 2001 From: Adrian Lundell <36153706+AdrianLundell@users.noreply.github.com> Date: Fri, 22 May 2026 22:51:45 +0200 Subject: [PATCH 002/103] Cortex_M backend: Add more model tests (#19720) Add model tests of currently not supported models - yolo11 - wav2letter - silero_vad cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Adrian Lundell --- .../cortex_m/test/models/test_silero_vad.py | 94 +++++++++++++++++++ .../cortex_m/test/models/test_wav2letter.py | 34 +++++++ backends/cortex_m/test/models/test_yolo11.py | 45 +++++++++ 3 files changed, 173 insertions(+) create mode 100644 backends/cortex_m/test/models/test_silero_vad.py create mode 100644 
backends/cortex_m/test/models/test_wav2letter.py create mode 100644 backends/cortex_m/test/models/test_yolo11.py diff --git a/backends/cortex_m/test/models/test_silero_vad.py b/backends/cortex_m/test/models/test_silero_vad.py new file mode 100644 index 00000000000..27b958627bb --- /dev/null +++ b/backends/cortex_m/test/models/test_silero_vad.py @@ -0,0 +1,94 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase +from executorch.examples.models.silero_vad.export_silero_vad import ( + CONTEXT_SIZE, + HIDDEN_DIM, + SileroVAD16k, + WINDOW_SIZE, +) + + +ops_before_transforms: dict[str, int] = { + "executorch_exir_dialects_edge__ops_aten_abs_default": 2, + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 3, + "executorch_exir_dialects_edge__ops_aten_arange_start_step": 1, + "executorch_exir_dialects_edge__ops_aten_cat_default": 1, + "executorch_exir_dialects_edge__ops_aten_convolution_default": 6, + "executorch_exir_dialects_edge__ops_aten_index_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_linear_default": 2, + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 3, + "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2, + "executorch_exir_dialects_edge__ops_aten_relu_default": 5, + "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2, + "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4, + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1, + "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 2, + 
"executorch_exir_dialects_edge__ops_aten_sub_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, + "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 12, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 11, +} +ops_after_transforms: dict[str, int] = { + "executorch_exir_dialects_edge__ops_aten_abs_default": 2, + "executorch_exir_dialects_edge__ops_aten_add_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_arange_start_step": 1, + "executorch_exir_dialects_edge__ops_aten_cat_default": 1, + "executorch_exir_dialects_edge__ops_aten_convolution_default": 6, + "executorch_exir_dialects_edge__ops_aten_index_Tensor": 1, + "executorch_exir_dialects_edge__ops_aten_linear_default": 2, + "executorch_exir_dialects_edge__ops_aten_mean_dim": 1, + "executorch_exir_dialects_edge__ops_aten_mul_Tensor": 3, + "executorch_exir_dialects_edge__ops_aten_pow_Tensor_Scalar": 2, + "executorch_exir_dialects_edge__ops_aten_relu_default": 5, + "executorch_exir_dialects_edge__ops_aten_select_copy_int": 2, + "executorch_exir_dialects_edge__ops_aten_sigmoid_default": 4, + "executorch_exir_dialects_edge__ops_aten_slice_copy_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_split_with_sizes_copy_default": 1, + "executorch_exir_dialects_edge__ops_aten_sqrt_default": 1, + "executorch_exir_dialects_edge__ops_aten_squeeze_copy_dims": 2, + "executorch_exir_dialects_edge__ops_aten_sub_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_tanh_default": 2, + "executorch_exir_dialects_edge__ops_aten_unsqueeze_copy_default": 2, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 1, + "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 6, + "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 6, + 
"executorch_exir_dialects_edge__ops_cortex_m_quantized_add_default": 1, +} + + +pt_model = SileroVAD16k().eval() + +x = torch.randn( + 1, CONTEXT_SIZE + WINDOW_SIZE +) # (1, 576) — 64 context + 512 audio samples +state = torch.zeros(2, 1, HIDDEN_DIM) # (2, 1, 128) — [h, c] LSTM state + +test_cases = { + "silero_vad_16k": McuTestCase( + model=pt_model, + example_inputs=lambda: (x, state), + ), +} + + +@parametrize("test_case", test_cases) +def test_dialect_silero_vad_16k(test_case): + """This model currently does largely not lower to accelerated kernels due to missing LSTM and conv1d support, this test is to track development progress.""" + inputs = test_case.get_example_inputs() + tester = CortexMTester(test_case.model, inputs) + tester.test_dialect( + ops_before_transforms, + ops_after_transforms, + qtol=10, + ) diff --git a/backends/cortex_m/test/models/test_wav2letter.py b/backends/cortex_m/test/models/test_wav2letter.py new file mode 100644 index 00000000000..ddc5354293c --- /dev/null +++ b/backends/cortex_m/test/models/test_wav2letter.py @@ -0,0 +1,34 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +from executorch.backends.arm.test.common import parametrize +from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase +from executorch.examples.models.wav2letter.model import Wav2LetterModel + + +ops_before_transforms: dict[str, int] = {} +ops_after_transforms: dict[str, int] = {} + +model = Wav2LetterModel() +pt_model = model.get_eager_model() + +test_cases = { + "wav2letter": McuTestCase( + model=pt_model, + example_inputs=lambda: model.get_example_inputs(), + ), +} + + +@parametrize("test_case", test_cases) +def test_dialect_wav2letter(test_case): + """This model currently does largely not lower to accelerated kernels due to missing conv1d support, this test is to track development progress.""" + inputs = test_case.get_example_inputs() + tester = CortexMTester(test_case.model, inputs) + tester.test_dialect( + ops_before_transforms, + ops_after_transforms, + qtol=10, + ) diff --git a/backends/cortex_m/test/models/test_yolo11.py b/backends/cortex_m/test/models/test_yolo11.py new file mode 100644 index 00000000000..f17c5ced331 --- /dev/null +++ b/backends/cortex_m/test/models/test_yolo11.py @@ -0,0 +1,45 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest +import torch +from executorch.backends.arm.test.common import parametrize + +from executorch.backends.cortex_m.test.tester import CortexMTester, McuTestCase + +YOLO = pytest.importorskip( + "ultralytics", + reason="ultralytics is optional; install it locally to run YOLO tests.", +).YOLO + + +ops_before_transforms: dict[str, int] = {} +ops_after_transforms: dict[str, int] = {} + + +WEIGHTS = "yolo11n.pt" +yolo = YOLO(WEIGHTS) +pt_model = yolo.model.eval() + +test_cases = { + "yolo11n": McuTestCase( + model=pt_model, + example_inputs=lambda: ( + torch.randn(1, 3, 640, 640).to(memory_format=torch.channels_last), + ), + ), +} + + +@parametrize("test_case", test_cases) +def test_dialect_yolo11(test_case): + """This model currently does not lower in the cortex-m backend, this test is to track development progress.""" + inputs = test_case.get_example_inputs() + tester = CortexMTester(test_case.model, inputs) + tester.test_dialect( + ops_before_transforms, + ops_after_transforms, + qtol=10, + ) From 158c5d8f109479ecfb9ca6ef5e638a4961f5b379 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Fri, 22 May 2026 17:39:32 -0700 Subject: [PATCH 003/103] Convert Android LLM extension from Java to Kotlin (#19211) Differential Revision: D102880053 Pull Request resolved: https://github.com/pytorch/executorch/pull/19211 --- extension/android/BUCK | 11 +- .../android/executorch_android/build.gradle | 1 + .../llm/{LlmCallback.java => LlmCallback.kt} | 27 +- .../extension/llm/LlmGenerationConfig.java | 198 ---- .../extension/llm/LlmGenerationConfig.kt | 78 ++ .../executorch/extension/llm/LlmModule.java | 823 ---------------- .../executorch/extension/llm/LlmModule.kt | 898 ++++++++++++++++++ .../extension/llm/LlmModuleConfig.java | 252 ----- .../extension/llm/LlmModuleConfig.kt | 134 +++ .../extension/llm/package-info.java | 51 - 10 files changed, 1129 insertions(+), 1344 deletions(-) rename 
extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/{LlmCallback.java => LlmCallback.kt} (53%) delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java diff --git a/extension/android/BUCK b/extension/android/BUCK index c7e275805e2..110b428575d 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -47,13 +47,14 @@ non_fbcode_target(_kind = fb_android_library, name = "executorch_llama", warnings_as_errors = False, srcs = [ - "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java", - "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java", - "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java", - "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java", + "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt", + "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt", + "executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt", + 
"executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt", ], autoglob = False, - language = "JAVA", + language = "KOTLIN", + extra_kotlinc_arguments = ["-Xjvm-default=all"], deps = [ ":executorch", "//fbandroid/java/com/facebook/jni:jni", diff --git a/extension/android/executorch_android/build.gradle b/extension/android/executorch_android/build.gradle index 3ee5b5877b3..2dbe0e1fb5f 100644 --- a/extension/android/executorch_android/build.gradle +++ b/extension/android/executorch_android/build.gradle @@ -51,6 +51,7 @@ android { } kotlinOptions { jvmTarget = "11" + freeCompilerArgs += ["-Xjvm-default=all"] } } diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt similarity index 53% rename from extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt index 4e834d06721..3b56986bf14 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmCallback.kt @@ -6,45 +6,42 @@ * LICENSE file in the root directory of this source tree. */ -package org.pytorch.executorch.extension.llm; +package org.pytorch.executorch.extension.llm -import com.facebook.jni.annotations.DoNotStrip; -import org.pytorch.executorch.annotations.Experimental; +import com.facebook.jni.annotations.DoNotStrip +import org.pytorch.executorch.annotations.Experimental /** - * Callback interface for Llama model. Users can implement this interface to receive the generated + * Callback interface for Llm model. Users can implement this interface to receive the generated * tokens and statistics. * - *

Warning: These APIs are experimental and subject to change without notice + * Warning: These APIs are experimental and subject to change without notice */ @Experimental -public interface LlmCallback { +interface LlmCallback { /** * Called when a new result is available from JNI. Users will keep getting onResult() invocations * until generate() finishes. * * @param result Last generated token */ - @DoNotStrip - public void onResult(String result); + @DoNotStrip fun onResult(result: String) /** * Called when the statistics for the generate() is available. * - *

The result will be a JSON string. See extension/llm/stats.h for the field definitions. + * The result will be a JSON string. See extension/llm/stats.h for the field definitions. * * @param stats JSON string containing the statistics for the generate() */ - @DoNotStrip - default void onStats(String stats) {} + @DoNotStrip fun onStats(stats: String) {} /** * Called when an error occurs during generate(). * - * @param errorCode Error code from the ExecuTorch runtime (see {@link - * org.pytorch.executorch.ExecutorchRuntimeException}) + * @param errorCode Error code from the ExecuTorch runtime (see + * [org.pytorch.executorch.ExecutorchRuntimeException]) * @param message Human-readable error description */ - @DoNotStrip - default void onError(int errorCode, String message) {} + @DoNotStrip fun onError(errorCode: Int, message: String) {} } diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java deleted file mode 100644 index db7941aadad..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.java +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch.extension.llm; - -/** - * Configuration class for controlling text generation parameters in LLM operations. - * - *

This class provides settings for text generation behavior including output formatting, - * generation limits, and sampling parameters. Instances should be created using the {@link - * #create()} method and the fluent builder pattern. - */ -public class LlmGenerationConfig { - private final boolean echo; - private final int maxNewTokens; - private final boolean warming; - private final int seqLen; - private final float temperature; - private final int numBos; - private final int numEos; - - private LlmGenerationConfig(Builder builder) { - this.echo = builder.echo; - this.maxNewTokens = builder.maxNewTokens; - this.warming = builder.warming; - this.seqLen = builder.seqLen; - this.temperature = builder.temperature; - this.numBos = builder.numBos; - this.numEos = builder.numEos; - } - - /** - * Creates a new Builder instance for constructing generation configurations. - * - * @return a new Builder with default configuration values - */ - public static Builder create() { - return new Builder(); - } - - /** - * @return true if input prompt should be included in the output - */ - public boolean isEcho() { - return echo; - } - - /** - * @return maximum number of tokens to generate (-1 for unlimited) - */ - public int getMaxNewTokens() { - return maxNewTokens; - } - - /** - * @return true if model warming is enabled - */ - public boolean isWarming() { - return warming; - } - - /** - * @return maximum sequence length for generation (-1 for default) - */ - public int getSeqLen() { - return seqLen; - } - - /** - * @return temperature value for sampling (higher = more random) - */ - public float getTemperature() { - return temperature; - } - - /** - * @return number of BOS tokens to prepend - */ - public int getNumBos() { - return numBos; - } - - /** - * @return number of EOS tokens to append - */ - public int getNumEos() { - return numEos; - } - - /** - * Builder class for constructing LlmGenerationConfig instances. - * - *

Provides a fluent interface for configuring generation parameters with sensible defaults. - * All methods return the builder instance to enable method chaining. - */ - public static class Builder { - private boolean echo = true; - private int maxNewTokens = -1; - private boolean warming = false; - private int seqLen = -1; - private float temperature = 0.8f; - private int numBos = 0; - private int numEos = 0; - - Builder() {} - - /** - * Sets whether to include the input prompt in the generated output. - * - * @param echo true to include input prompt, false to return only new tokens - * @return this builder instance - */ - public Builder echo(boolean echo) { - this.echo = echo; - return this; - } - - /** - * Sets the maximum number of new tokens to generate. - * - * @param maxNewTokens the token limit (-1 for unlimited generation) - * @return this builder instance - */ - public Builder maxNewTokens(int maxNewTokens) { - this.maxNewTokens = maxNewTokens; - return this; - } - - /** - * Enables or disables model warming. - * - * @param warming true to generate initial tokens for model warmup - * @return this builder instance - */ - public Builder warming(boolean warming) { - this.warming = warming; - return this; - } - - /** - * Sets the maximum sequence length for generation. - * - * @param seqLen maximum sequence length (-1 for default behavior) - * @return this builder instance - */ - public Builder seqLen(int seqLen) { - this.seqLen = seqLen; - return this; - } - - /** - * Sets the temperature for random sampling. - * - * @param temperature sampling temperature (typical range 0.0-1.0) - * @return this builder instance - */ - public Builder temperature(float temperature) { - this.temperature = temperature; - return this; - } - - /** - * Sets the number of BOS tokens to prepend. 
- * - * @param numBos number of BOS tokens - * @return this builder instance - */ - public Builder numBos(int numBos) { - this.numBos = numBos; - return this; - } - - /** - * Sets the number of EOS tokens to append. - * - * @param numEos number of EOS tokens - * @return this builder instance - */ - public Builder numEos(int numEos) { - this.numEos = numEos; - return this; - } - - /** - * Constructs the LlmGenerationConfig instance with the configured parameters. - * - * @return new LlmGenerationConfig instance with current builder settings - */ - public LlmGenerationConfig build() { - return new LlmGenerationConfig(this); - } - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt new file mode 100644 index 00000000000..c0f8956fb7f --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmGenerationConfig.kt @@ -0,0 +1,78 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch.extension.llm + +/** + * Configuration class for controlling text generation parameters in LLM operations. + * + * This class provides settings for text generation behavior including output formatting, generation + * limits, and sampling parameters. Instances should be created using the [create] method and the + * fluent builder pattern. 
+ */ +class LlmGenerationConfig +private constructor( + @get:JvmName("isEcho") val echo: Boolean, + val maxNewTokens: Int, + @get:JvmName("isWarming") val warming: Boolean, + val seqLen: Int, + val temperature: Float, + val numBos: Int, + val numEos: Int, +) { + + companion object { + /** + * Creates a new Builder instance for constructing generation configurations. + * + * @return a new Builder with default configuration values + */ + @JvmStatic fun create(): Builder = Builder() + } + + /** + * Builder class for constructing LlmGenerationConfig instances. + * + * Provides a fluent interface for configuring generation parameters with sensible defaults. All + * methods return the builder instance to enable method chaining. + */ + class Builder internal constructor() { + private var echo: Boolean = true + private var maxNewTokens: Int = -1 + private var warming: Boolean = false + private var seqLen: Int = -1 + private var temperature: Float = 0.8f + private var numBos: Int = 0 + private var numEos: Int = 0 + + /** Sets whether to include the input prompt in the generated output. */ + fun echo(echo: Boolean): Builder = apply { this.echo = echo } + + /** Sets the maximum number of new tokens to generate. */ + fun maxNewTokens(maxNewTokens: Int): Builder = apply { this.maxNewTokens = maxNewTokens } + + /** Enables or disables model warming. */ + fun warming(warming: Boolean): Builder = apply { this.warming = warming } + + /** Sets the maximum sequence length for generation. */ + fun seqLen(seqLen: Int): Builder = apply { this.seqLen = seqLen } + + /** Sets the temperature for random sampling. */ + fun temperature(temperature: Float): Builder = apply { this.temperature = temperature } + + /** Sets the number of BOS tokens to prepend. */ + fun numBos(numBos: Int): Builder = apply { this.numBos = numBos } + + /** Sets the number of EOS tokens to append. 
*/ + fun numEos(numEos: Int): Builder = apply { this.numEos = numEos } + + /** Constructs the LlmGenerationConfig instance with the configured parameters. */ + fun build(): LlmGenerationConfig = + LlmGenerationConfig(echo, maxNewTokens, warming, seqLen, temperature, numBos, numEos) + } +} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java deleted file mode 100644 index 0c467b13f44..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.java +++ /dev/null @@ -1,823 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch.extension.llm; - -import com.facebook.jni.HybridData; -import com.facebook.jni.annotations.DoNotStrip; -import java.io.Closeable; -import java.nio.ByteBuffer; -import java.util.List; -import java.util.concurrent.locks.ReentrantLock; -import org.pytorch.executorch.ExecuTorchRuntime; -import org.pytorch.executorch.ExecutorchRuntimeException; -import org.pytorch.executorch.annotations.Experimental; - -/** - * LlmModule is a wrapper around the Executorch LLM. It provides a simple interface to generate text - * from the model. - * - *

Warning: These APIs are experimental and subject to change without notice - */ -@Experimental -public class LlmModule implements Closeable { - - public static final int MODEL_TYPE_TEXT = 1; - public static final int MODEL_TYPE_TEXT_VISION = 2; - public static final int MODEL_TYPE_MULTIMODAL = 2; - - private final HybridData mHybridData; - private final ReentrantLock mLock = new ReentrantLock(); - private volatile boolean mDestroyed = false; - private static final int DEFAULT_SEQ_LEN = 128; - private static final boolean DEFAULT_ECHO = true; - private static final float DEFAULT_TEMPERATURE = -1.0f; - private static final int DEFAULT_BOS = 0; - private static final int DEFAULT_EOS = 0; - private static final int DEFAULT_LOAD_MODE = LlmModuleConfig.LOAD_MODE_MMAP; - - @DoNotStrip - private static native HybridData initHybrid( - int modelType, - String modulePath, - String tokenizerPath, - float temperature, - List dataFiles, - int numBos, - int numEos, - int loadMode); - - private LlmModule( - int modelType, - String modulePath, - String tokenizerPath, - float temperature, - List dataFiles, - int numBos, - int numEos, - int loadMode) { - ExecuTorchRuntime.getRuntime(); - ExecuTorchRuntime.validateFilePath(modulePath, "model path"); - ExecuTorchRuntime.validateFilePath(tokenizerPath, "tokenizer path"); - - mHybridData = - initHybrid( - modelType, modulePath, tokenizerPath, temperature, dataFiles, numBos, numEos, loadMode); - } - - /** - * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and - * dataFiles. - */ - public LlmModule( - int modelType, - String modulePath, - String tokenizerPath, - float temperature, - List dataFiles, - int numBos, - int numEos) { - this( - modelType, - modulePath, - tokenizerPath, - temperature, - dataFiles, - numBos, - numEos, - DEFAULT_LOAD_MODE); - } - - /** - * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and - * dataFiles. 
- */ - public LlmModule( - int modelType, - String modulePath, - String tokenizerPath, - float temperature, - List dataFiles) { - this( - modelType, - modulePath, - tokenizerPath, - temperature, - dataFiles, - DEFAULT_BOS, - DEFAULT_EOS, - DEFAULT_LOAD_MODE); - } - - /** - * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and - * data path. - */ - public LlmModule( - int modelType, - String modulePath, - String tokenizerPath, - float temperature, - String dataPath, - int numBos, - int numEos) { - this( - modelType, - modulePath, - tokenizerPath, - temperature, - dataPath != null ? List.of(dataPath) : List.of(), - numBos, - numEos); - } - - /** - * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and - * data path. - */ - public LlmModule( - int modelType, String modulePath, String tokenizerPath, float temperature, String dataPath) { - this(modelType, modulePath, tokenizerPath, temperature, dataPath, DEFAULT_BOS, DEFAULT_EOS); - } - - /** Constructs a LLM Module for a model with given model path, tokenizer, temperature. */ - public LlmModule(String modulePath, String tokenizerPath, float temperature) { - this( - MODEL_TYPE_TEXT, - modulePath, - tokenizerPath, - temperature, - List.of(), - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** - * Constructs a LLM Module for a model with given model path, tokenizer, temperature and data - * path. - */ - public LlmModule(String modulePath, String tokenizerPath, float temperature, String dataPath) { - this( - MODEL_TYPE_TEXT, - modulePath, - tokenizerPath, - temperature, - List.of(dataPath), - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** Constructs a LLM Module for a model with given path, tokenizer, and temperature. 
*/ - public LlmModule(int modelType, String modulePath, String tokenizerPath, float temperature) { - this(modelType, modulePath, tokenizerPath, temperature, List.of(), DEFAULT_BOS, DEFAULT_EOS); - } - - /** Constructs a LLM Module for a model with the given LlmModuleConfig */ - public LlmModule(LlmModuleConfig config) { - this( - config.getModelType(), - config.getModulePath(), - config.getTokenizerPath(), - config.getTemperature(), - config.getDataPath() != null ? List.of(config.getDataPath()) : List.of(), - config.getNumBos(), - config.getNumEos(), - config.getLoadMode()); - } - - private void checkNotDestroyed() { - if (mDestroyed) throw new IllegalStateException("LlmModule has been destroyed"); - } - - private void checkNotReentrant() { - if (mLock.getHoldCount() > 1) { - throw new IllegalStateException("Cannot call LlmModule methods from within a callback"); - } - } - - /** - * Releases native resources. Callers must ensure no other methods are in-flight. Call {@link - * #stop()} and wait for {@link #generate(String, LlmCallback)} to return before calling this - * method. - */ - @Override - public void close() { - if (mLock.tryLock()) { - try { - if (mLock.getHoldCount() > 1) { - throw new IllegalStateException( - "Cannot close module from within a callback during execution"); - } - if (!mDestroyed) { - mDestroyed = true; - mHybridData.resetNative(); - } - } finally { - mLock.unlock(); - } - } else { - throw new IllegalStateException("Cannot close module while method is executing"); - } - } - - /** - * @deprecated Use {@link #close()} instead. - */ - @Deprecated - public void resetNative() { - close(); - } - - /** - * Start generating tokens from the module. - * - * @param prompt Input prompt - * @param llmCallback callback object to receive results. 
- */ - public void generate(String prompt, LlmCallback llmCallback) { - generate( - prompt, - DEFAULT_SEQ_LEN, - llmCallback, - DEFAULT_ECHO, - DEFAULT_TEMPERATURE, - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** - * Start generating tokens from the module. - * - * @param prompt Input prompt - * @param seqLen sequence length - * @param llmCallback callback object to receive results. - */ - public void generate(String prompt, int seqLen, LlmCallback llmCallback) { - generate( - null, - 0, - 0, - 0, - prompt, - seqLen, - llmCallback, - DEFAULT_ECHO, - DEFAULT_TEMPERATURE, - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** - * Start generating tokens from the module. - * - * @param prompt Input prompt - * @param llmCallback callback object to receive results - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - */ - public void generate(String prompt, LlmCallback llmCallback, boolean echo) { - generate( - null, - 0, - 0, - 0, - prompt, - DEFAULT_SEQ_LEN, - llmCallback, - echo, - DEFAULT_TEMPERATURE, - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** - * Start generating tokens from the module. - * - * @param prompt Input prompt - * @param seqLen sequence length - * @param llmCallback callback object to receive results - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - */ - public void generate(String prompt, int seqLen, LlmCallback llmCallback, boolean echo) { - generate(prompt, seqLen, llmCallback, echo, DEFAULT_TEMPERATURE, DEFAULT_BOS, DEFAULT_EOS); - } - - /** - * Start generating tokens from the module. 
- * - * @param prompt Input prompt - * @param seqLen sequence length - * @param llmCallback callback object to receive results - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param temperature temperature for sampling (use negative value to use module default) - * @param numBos number of BOS tokens to prepend - * @param numEos number of EOS tokens to append - */ - public void generate( - String prompt, - int seqLen, - LlmCallback llmCallback, - boolean echo, - float temperature, - int numBos, - int numEos) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int err = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos); - if (err != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to generate"); - } - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native int generateNative( - String prompt, - int seqLen, - LlmCallback llmCallback, - boolean echo, - float temperature, - int numBos, - int numEos); - - /** - * Start generating tokens from the module. - * - * @param prompt Input prompt - * @param config the config for generation - * @param llmCallback callback object to receive results - */ - public void generate(String prompt, LlmGenerationConfig config, LlmCallback llmCallback) { - int seqLen = config.getSeqLen(); - boolean echo = config.isEcho(); - float temperature = config.getTemperature(); - int numBos = config.getNumBos(); - int numEos = config.getNumEos(); - generate(null, 0, 0, 0, prompt, seqLen, llmCallback, echo, temperature, numBos, numEos); - } - - /** - * Start generating tokens from the module. - * - * @param image Input image as a byte array - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @param prompt Input prompt - * @param seqLen sequence length - * @param llmCallback callback object to receive results. 
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - */ - public void generate( - int[] image, - int width, - int height, - int channels, - String prompt, - int seqLen, - LlmCallback llmCallback, - boolean echo) { - generate( - image, - width, - height, - channels, - prompt, - seqLen, - llmCallback, - echo, - DEFAULT_TEMPERATURE, - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** - * Start generating tokens from the module. - * - * @param image Input image as a byte array - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @param prompt Input prompt - * @param seqLen sequence length - * @param llmCallback callback object to receive results. - * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param temperature temperature for sampling (use negative value to use module default) - */ - public void generate( - int[] image, - int width, - int height, - int channels, - String prompt, - int seqLen, - LlmCallback llmCallback, - boolean echo, - float temperature) { - generate( - image, - width, - height, - channels, - prompt, - seqLen, - llmCallback, - echo, - temperature, - DEFAULT_BOS, - DEFAULT_EOS); - } - - /** - * Start generating tokens from the module. - * - * @param image Input image as a byte array - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @param prompt Input prompt - * @param seqLen sequence length - * @param llmCallback callback object to receive results. 
- * @param echo indicate whether to echo the input prompt or not (text completion vs chat) - * @param temperature temperature for sampling (use negative value to use module default) - * @param numBos number of BOS tokens to prepend - * @param numEos number of EOS tokens to append - */ - public void generate( - int[] image, - int width, - int height, - int channels, - String prompt, - int seqLen, - LlmCallback llmCallback, - boolean echo, - float temperature, - int numBos, - int numEos) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - if (image != null) { - int nativeResult = prefillImagesInput(image, width, height, channels); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } - int err = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos); - if (err != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to generate"); - } - } finally { - mLock.unlock(); - } - } - - /** - * Prefill the KV cache with the given image input. - * - * @param image Input image as a byte array - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillImages(int[] image, int width, int height, int channels) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int nativeResult = prefillImagesInput(image, width, height, channels); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - /** - * Prefill a multimodal Module with the given image input via a direct ByteBuffer. The buffer data - * is accessed directly without JNI array copies, unlike {@link #prefillImages(int[], int, int, - * int)}. 
The ByteBuffer must contain raw uint8 pixel data in CHW format with at least channels * - * height * width bytes remaining. Only the first channels * height * width bytes from the - * buffer's current position are read; the position of the original ByteBuffer is not modified. - * - * @param image Input image as a direct ByteBuffer containing uint8 pixel data - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @throws IllegalArgumentException if the ByteBuffer is not direct or has insufficient remaining - * bytes - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillImages(ByteBuffer image, int width, int height, int channels) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - if (!image.isDirect()) { - throw new IllegalArgumentException("Input ByteBuffer must be direct."); - } - long expectedBytes; - try { - long pixels = Math.multiplyExact((long) width, (long) height); - expectedBytes = Math.multiplyExact(pixels, (long) channels); - } catch (ArithmeticException ex) { - throw new IllegalArgumentException( - "width*height*channels is too large and overflows the allowed range.", ex); - } - if (width <= 0 - || height <= 0 - || channels <= 0 - || expectedBytes > Integer.MAX_VALUE - || image.remaining() < expectedBytes) { - throw new IllegalArgumentException( - "ByteBuffer remaining (" - + image.remaining() - + ") must be at least width*height*channels (" - + expectedBytes - + ")."); - } - // slice() so that getDirectBufferAddress on the native side returns a pointer - // starting at the current position, not the base address. 
- int nativeResult = prefillImagesInputBuffer(image.slice(), width, height, channels); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - /** - * Prefill a multimodal Module with the given normalized image input via a direct ByteBuffer. The - * buffer data is accessed directly without JNI array copies, unlike {@link - * #prefillImages(float[], int, int, int)}. The ByteBuffer must contain normalized float pixel - * data in CHW format with at least channels * height * width * 4 bytes remaining. Only the first - * channels * height * width floats from the buffer's current position are consumed. The buffer - * must use the platform's native byte order (set via {@code - * buffer.order(ByteOrder.nativeOrder())}). - * - * @param image Input normalized image as a direct ByteBuffer containing float pixel data in - * native byte order - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @throws IllegalArgumentException if the ByteBuffer is not direct, has insufficient remaining - * bytes, is not float-aligned, or does not use native byte order - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillNormalizedImage(ByteBuffer image, int width, int height, int channels) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - if (!image.isDirect()) { - throw new IllegalArgumentException("Input ByteBuffer must be direct."); - } - if (image.order() != java.nio.ByteOrder.nativeOrder()) { - throw new IllegalArgumentException( - "Input ByteBuffer must use native byte order (ByteOrder.nativeOrder())."); - } - if (image.position() % Float.BYTES != 0) { - throw new IllegalArgumentException( - "Input ByteBuffer position (" + image.position() + ") must be 4-byte aligned."); - } - final long expectedBytes; - try { - int wh = 
Math.multiplyExact(width, height); - long whc = Math.multiplyExact((long) wh, (long) channels); - long totalBytes = Math.multiplyExact(whc, (long) Float.BYTES); - if (totalBytes > Integer.MAX_VALUE) { - throw new IllegalArgumentException( - "ByteBuffer size (width*height*channels*4) exceeds Integer.MAX_VALUE bytes: " - + totalBytes); - } - expectedBytes = totalBytes; - } catch (ArithmeticException e) { - throw new IllegalArgumentException( - "Overflow while computing width*height*channels*4 for ByteBuffer size.", e); - } - if (width <= 0 || height <= 0 || channels <= 0 || image.remaining() < expectedBytes) { - throw new IllegalArgumentException( - "ByteBuffer remaining (" - + image.remaining() - + ") must be at least width*height*channels*4 (" - + expectedBytes - + ")."); - } - if (image.remaining() % Float.BYTES != 0) { - throw new IllegalArgumentException( - "ByteBuffer remaining (" - + image.remaining() - + ") must be a multiple of 4 (float size)."); - } - // slice() so that getDirectBufferAddress on the native side returns a pointer - // starting at the current position, not the base address. - int nativeResult = prefillNormalizedImagesInputBuffer(image.slice(), width, height, channels); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - private native int prefillImagesInput(int[] image, int width, int height, int channels); - - private native int prefillImagesInputBuffer( - ByteBuffer image, int width, int height, int channels); - - private native int prefillNormalizedImagesInputBuffer( - ByteBuffer image, int width, int height, int channels); - - /** - * Prefill the KV cache with the given normalized image input. 
- * - * @param image Input normalized image as a float array - * @param width Input image width - * @param height Input image height - * @param channels Input image number of channels - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillImages(float[] image, int width, int height, int channels) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int nativeResult = prefillNormalizedImagesInput(image, width, height, channels); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - private native int prefillNormalizedImagesInput( - float[] image, int width, int height, int channels); - - /** - * Prefill the KV cache with the given preprocessed audio input. - * - * @param audio Input preprocessed audio as a byte array - * @param batch_size Input batch size - * @param n_bins Input number of bins - * @param n_frames Input number of frames - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillAudio(byte[] audio, int batch_size, int n_bins, int n_frames) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int nativeResult = prefillAudioInput(audio, batch_size, n_bins, n_frames); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - private native int prefillAudioInput(byte[] audio, int batch_size, int n_bins, int n_frames); - - /** - * Prefill the KV cache with the given preprocessed audio input. 
- * - * @param audio Input preprocessed audio as a float array - * @param batch_size Input batch size - * @param n_bins Input number of bins - * @param n_frames Input number of frames - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillAudio(float[] audio, int batch_size, int n_bins, int n_frames) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int nativeResult = prefillAudioInputFloat(audio, batch_size, n_bins, n_frames); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - private native int prefillAudioInputFloat( - float[] audio, int batch_size, int n_bins, int n_frames); - - /** - * Prefill the KV cache with the given raw audio input. - * - * @param audio Input raw audio as a byte array - * @param batch_size Input batch size - * @param n_channels Input number of channels - * @param n_samples Input number of samples - * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillRawAudio(byte[] audio, int batch_size, int n_channels, int n_samples) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int nativeResult = prefillRawAudioInput(audio, batch_size, n_channels, n_samples); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - private native int prefillRawAudioInput( - byte[] audio, int batch_size, int n_channels, int n_samples); - - /** - * Prefill the KV cache with the given text prompt. - * - * @param prompt The text prompt to prefill. 
- * @throws ExecutorchRuntimeException if the prefill failed - */ - @Experimental - public void prefillPrompt(String prompt) { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int nativeResult = prefillTextInput(prompt); - if (nativeResult != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed"); - } - } finally { - mLock.unlock(); - } - } - - // returns status - private native int prefillTextInput(String prompt); - - /** - * Reset the context of the LLM. This will clear the KV cache and reset the state of the LLM. - * - *

The startPos will be reset to 0. - */ - public void resetContext() { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - resetContextNative(); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native void resetContextNative(); - - /** Stop current generate() before it finishes. */ - public void stop() { - if (mDestroyed) return; - stopNative(); - } - - @DoNotStrip - private native void stopNative(); - - /** Force loading the module. Otherwise the model is loaded during first generate(). */ - public void load() { - mLock.lock(); - try { - checkNotReentrant(); - checkNotDestroyed(); - int err = loadNative(); - if (err != 0) { - throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to load model"); - } - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native int loadNative(); -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt new file mode 100644 index 00000000000..f95e796b83b --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModule.kt @@ -0,0 +1,898 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch.extension.llm + +import com.facebook.jni.HybridData +import com.facebook.jni.annotations.DoNotStrip +import java.io.Closeable +import java.nio.ByteBuffer +import java.nio.ByteOrder +import java.util.concurrent.locks.ReentrantLock +import org.pytorch.executorch.ExecuTorchRuntime +import org.pytorch.executorch.ExecutorchRuntimeException +import org.pytorch.executorch.annotations.Experimental + +/** + * LlmModule is a wrapper around the Executorch LLM. 
It provides a simple interface to generate text
+ * from the model.
+ *
+ * Warning: These APIs are experimental and subject to change without notice
+ *
+ * The primary constructor is private; the secondary constructors delegate to it. During
+ * construction both file paths are validated and the native peer is created via `initHybrid`.
+ *
+ * @param modelType model type value forwarded to `initHybrid`
+ * @param modulePath path of the model file; checked with `ExecuTorchRuntime.validateFilePath`
+ * @param tokenizerPath path of the tokenizer file; checked with
+ *   `ExecuTorchRuntime.validateFilePath`
+ * @param temperature temperature value forwarded to `initHybrid`
+ * @param dataFiles data file paths forwarded to `initHybrid`
+ * @param numBos BOS count forwarded to `initHybrid`
+ * @param numEos EOS count forwarded to `initHybrid`
+ * @param loadMode load mode value forwarded to `initHybrid`
+ */
+@Experimental
+class LlmModule
+private constructor(
+    modelType: Int,
+    modulePath: String,
+    tokenizerPath: String,
+    temperature: Float,
+    dataFiles: List<String>,
+    numBos: Int,
+    numEos: Int,
+    loadMode: Int,
+) : Closeable {
+
+  // Native peer; assigned exactly once in the init block below.
+  private val mHybridData: HybridData
+  // Lock taken by the public methods of this class around their native calls.
+  private val mLock = ReentrantLock()
+  // Set by close(); read by checkNotDestroyed().
+  @Volatile private var mDestroyed = false
+
+  init {
+    // Return value is unused. NOTE(review): presumably this makes sure the runtime is
+    // initialized before any native call - confirm against ExecuTorchRuntime.
+    ExecuTorchRuntime.getRuntime()
+    ExecuTorchRuntime.validateFilePath(modulePath, "model path")
+    ExecuTorchRuntime.validateFilePath(tokenizerPath, "tokenizer path")
+    mHybridData =
+        initHybrid(
+            modelType,
+            modulePath,
+            tokenizerPath,
+            temperature,
+            dataFiles,
+            numBos,
+            numEos,
+            loadMode,
+        )
+  }
+
+  /**
+   * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+   * dataFiles. The load mode is DEFAULT_LOAD_MODE.
+   */
+  constructor(
+      modelType: Int,
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+      dataFiles: List<String>,
+      numBos: Int,
+      numEos: Int,
+  ) : this(
+      modelType,
+      modulePath,
+      tokenizerPath,
+      temperature,
+      dataFiles,
+      numBos,
+      numEos,
+      DEFAULT_LOAD_MODE,
+  )
+
+  /**
+   * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+   * dataFiles. BOS/EOS counts and the load mode take their DEFAULT_* values.
+   */
+  constructor(
+      modelType: Int,
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+      dataFiles: List<String>,
+  ) : this(
+      modelType,
+      modulePath,
+      tokenizerPath,
+      temperature,
+      dataFiles,
+      DEFAULT_BOS,
+      DEFAULT_EOS,
+      DEFAULT_LOAD_MODE,
+  )
+
+  /**
+   * Constructs a LLM Module for a model with given type, model path, tokenizer, temperature, and
+   * data path.
+   */
+  constructor(
+      modelType: Int,
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+      dataPath: String?,
+      numBos: Int,
+      numEos: Int,
+  ) : this(
+      modelType = modelType,
+      modulePath = modulePath,
+      tokenizerPath = tokenizerPath,
+      temperature = temperature,
+      // A null data path becomes an empty list, a non-null one a single-element list.
+      dataFiles = listOfNotNull(dataPath),
+      numBos = numBos,
+      numEos = numEos,
+  )
+
+  /**
+   * Builds a LLM Module from a model type, model path, tokenizer path, temperature and an
+   * optional data path, using DEFAULT_BOS and DEFAULT_EOS for the BOS/EOS counts.
+   */
+  constructor(
+      modelType: Int,
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+      dataPath: String?,
+  ) : this(
+      modelType = modelType,
+      modulePath = modulePath,
+      tokenizerPath = tokenizerPath,
+      temperature = temperature,
+      dataPath = dataPath,
+      numBos = DEFAULT_BOS,
+      numEos = DEFAULT_EOS,
+  )
+
+  /**
+   * Builds a LLM Module of type MODEL_TYPE_TEXT from a model path, tokenizer path and
+   * temperature, with no data files.
+   */
+  constructor(
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+  ) : this(
+      modelType = MODEL_TYPE_TEXT,
+      modulePath = modulePath,
+      tokenizerPath = tokenizerPath,
+      temperature = temperature,
+      dataFiles = emptyList(),
+      numBos = DEFAULT_BOS,
+      numEos = DEFAULT_EOS,
+  )
+
+  /**
+   * Builds a LLM Module of type MODEL_TYPE_TEXT from a model path, tokenizer path, temperature
+   * and a single data path.
+   */
+  constructor(
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+      dataPath: String,
+  ) : this(
+      modelType = MODEL_TYPE_TEXT,
+      modulePath = modulePath,
+      tokenizerPath = tokenizerPath,
+      temperature = temperature,
+      dataFiles = listOf(dataPath),
+      numBos = DEFAULT_BOS,
+      numEos = DEFAULT_EOS,
+  )
+
+  /** Constructs a LLM Module for a model with given path, tokenizer, and temperature.
*/
+  constructor(
+      modelType: Int,
+      modulePath: String,
+      tokenizerPath: String,
+      temperature: Float,
+  ) : this(
+      modelType = modelType,
+      modulePath = modulePath,
+      tokenizerPath = tokenizerPath,
+      temperature = temperature,
+      dataFiles = emptyList(),
+      numBos = DEFAULT_BOS,
+      numEos = DEFAULT_EOS,
+  )
+
+  /**
+   * Builds a LLM Module from an LlmModuleConfig. Every constructor argument is read from the
+   * config; a null data path results in an empty data file list.
+   */
+  constructor(
+      config: LlmModuleConfig
+  ) : this(
+      modelType = config.modelType,
+      modulePath = config.modulePath,
+      tokenizerPath = config.tokenizerPath,
+      temperature = config.temperature,
+      dataFiles = listOfNotNull(config.dataPath),
+      numBos = config.numBos,
+      numEos = config.numEos,
+      loadMode = config.loadMode,
+  )
+
+  /** Throws IllegalStateException once close() has marked this module as destroyed. */
+  private fun checkNotDestroyed() {
+    check(!mDestroyed) { "LlmModule has been destroyed" }
+  }
+
+  /**
+   * Throws IllegalStateException when the current thread holds the lock more than once, i.e. a
+   * locked method was entered again on the same thread.
+   */
+  private fun checkNotReentrant() {
+    check(mLock.holdCount <= 1) { "Cannot call LlmModule methods from within a callback" }
+  }
+
+  /**
+   * Releases native resources. Callers must ensure no other methods are in-flight. Call [stop] and
+   * wait for [generate] to return before calling this method. Calling it again after a successful
+   * close does nothing.
+   *
+   * @throws IllegalStateException if the lock cannot be acquired immediately, or if the calling
+   *   thread already held the lock when close() was entered
+   */
+  override fun close() {
+    // tryLock() never blocks: a lock held by another thread is reported as an error.
+    if (!mLock.tryLock()) {
+      throw IllegalStateException("Cannot close module while method is executing")
+    }
+    try {
+      check(mLock.holdCount <= 1) { "Cannot close module from within a callback during execution" }
+      if (mDestroyed) return
+      mDestroyed = true
+      mHybridData.resetNative()
+    } finally {
+      mLock.unlock()
+    }
+  }
+
+  /** @deprecated Use [close] instead. */
+  @Deprecated("Use close() instead", replaceWith = ReplaceWith("close()"))
+  fun resetNative() {
+    close()
+  }
+
+  // --- generate overloads ---
+
+  /**
+   * Start generating tokens from the module, using DEFAULT_SEQ_LEN, DEFAULT_ECHO,
+   * DEFAULT_TEMPERATURE, DEFAULT_BOS and DEFAULT_EOS.
+   *
+   * @param prompt Input prompt
+   * @param llmCallback callback object to receive results.
+   */
+  fun generate(prompt: String, llmCallback: LlmCallback) {
+    generate(
+        prompt = prompt,
+        seqLen = DEFAULT_SEQ_LEN,
+        llmCallback = llmCallback,
+        echo = DEFAULT_ECHO,
+        temperature = DEFAULT_TEMPERATURE,
+        numBos = DEFAULT_BOS,
+        numEos = DEFAULT_EOS,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module.
+   *
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llmCallback callback object to receive results.
+   */
+  fun generate(prompt: String, seqLen: Int, llmCallback: LlmCallback) {
+    // Text only: the image-aware overload is called with a null image and zeroed dimensions.
+    generate(
+        image = null,
+        width = 0,
+        height = 0,
+        channels = 0,
+        prompt = prompt,
+        seqLen = seqLen,
+        llmCallback = llmCallback,
+        echo = DEFAULT_ECHO,
+        temperature = DEFAULT_TEMPERATURE,
+        numBos = DEFAULT_BOS,
+        numEos = DEFAULT_EOS,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module with DEFAULT_SEQ_LEN.
+   *
+   * @param prompt Input prompt
+   * @param llmCallback callback object to receive results
+   * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+   */
+  fun generate(prompt: String, llmCallback: LlmCallback, echo: Boolean) {
+    // Text only: the image-aware overload is called with a null image and zeroed dimensions.
+    generate(
+        image = null,
+        width = 0,
+        height = 0,
+        channels = 0,
+        prompt = prompt,
+        seqLen = DEFAULT_SEQ_LEN,
+        llmCallback = llmCallback,
+        echo = echo,
+        temperature = DEFAULT_TEMPERATURE,
+        numBos = DEFAULT_BOS,
+        numEos = DEFAULT_EOS,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module with DEFAULT_TEMPERATURE, DEFAULT_BOS and
+   * DEFAULT_EOS.
+   *
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llmCallback callback object to receive results
+   * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+   */
+  fun generate(prompt: String, seqLen: Int, llmCallback: LlmCallback, echo: Boolean) {
+    generate(
+        prompt = prompt,
+        seqLen = seqLen,
+        llmCallback = llmCallback,
+        echo = echo,
+        temperature = DEFAULT_TEMPERATURE,
+        numBos = DEFAULT_BOS,
+        numEos = DEFAULT_EOS,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module.
+   *
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llmCallback callback object to receive results
+   * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+   * @param temperature temperature for sampling (use negative value to use module default)
+   * @param numBos number of BOS tokens to prepend
+   * @param numEos number of EOS tokens to append
+   * @throws IllegalStateException if the module has been destroyed or the calling thread already
+   *   holds the module lock
+   */
+  fun generate(
+      prompt: String,
+      seqLen: Int,
+      llmCallback: LlmCallback,
+      echo: Boolean,
+      temperature: Float,
+      numBos: Int,
+      numEos: Int,
+  ) {
+    mLock.lock()
+    try {
+      checkNotReentrant()
+      checkNotDestroyed()
+      // A non-zero native status is turned into an exception.
+      val status = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos)
+      if (status == 0) return
+      throw ExecutorchRuntimeException.makeExecutorchException(status, "Failed to generate")
+    } finally {
+      mLock.unlock()
+    }
+  }
+
+  // Native entry point; returns a status code where 0 is treated as success by the callers.
+  @DoNotStrip
+  private external fun generateNative(
+      prompt: String,
+      seqLen: Int,
+      llmCallback: LlmCallback,
+      echo: Boolean,
+      temperature: Float,
+      numBos: Int,
+      numEos: Int,
+  ): Int
+
+  /**
+   * Start generating tokens from the module, reading sequence length, echo, temperature and
+   * BOS/EOS counts from the given config.
+   *
+   * @param prompt Input prompt
+   * @param config the config for generation
+   * @param llmCallback callback object to receive results
+   */
+  fun generate(prompt: String, config: LlmGenerationConfig, llmCallback: LlmCallback) {
+    // Text only: the image-aware overload is called with a null image and zeroed dimensions.
+    generate(
+        image = null,
+        width = 0,
+        height = 0,
+        channels = 0,
+        prompt = prompt,
+        seqLen = config.seqLen,
+        llmCallback = llmCallback,
+        echo = config.echo,
+        temperature = config.temperature,
+        numBos = config.numBos,
+        numEos = config.numEos,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module.
+   *
+   * @param image Input image as an int array, or null for no image input
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llmCallback callback object to receive results.
+   * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+   */
+  fun generate(
+      image: IntArray?,
+      width: Int,
+      height: Int,
+      channels: Int,
+      prompt: String,
+      seqLen: Int,
+      llmCallback: LlmCallback,
+      echo: Boolean,
+  ) {
+    // Fill in DEFAULT_TEMPERATURE, DEFAULT_BOS and DEFAULT_EOS and use the full overload.
+    generate(
+        image = image,
+        width = width,
+        height = height,
+        channels = channels,
+        prompt = prompt,
+        seqLen = seqLen,
+        llmCallback = llmCallback,
+        echo = echo,
+        temperature = DEFAULT_TEMPERATURE,
+        numBos = DEFAULT_BOS,
+        numEos = DEFAULT_EOS,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module with DEFAULT_BOS and DEFAULT_EOS.
+   *
+   * @param image Input image as an int array, or null for no image input
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llmCallback callback object to receive results.
+   * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+   * @param temperature temperature for sampling (use negative value to use module default)
+   */
+  fun generate(
+      image: IntArray?,
+      width: Int,
+      height: Int,
+      channels: Int,
+      prompt: String,
+      seqLen: Int,
+      llmCallback: LlmCallback,
+      echo: Boolean,
+      temperature: Float,
+  ) {
+    generate(
+        image = image,
+        width = width,
+        height = height,
+        channels = channels,
+        prompt = prompt,
+        seqLen = seqLen,
+        llmCallback = llmCallback,
+        echo = echo,
+        temperature = temperature,
+        numBos = DEFAULT_BOS,
+        numEos = DEFAULT_EOS,
+    )
+  }
+
+  /**
+   * Start generating tokens from the module.
+   *
+   * @param image Input image as an int array, or null for no image input
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @param prompt Input prompt
+   * @param seqLen sequence length
+   * @param llmCallback callback object to receive results.
+   * @param echo indicate whether to echo the input prompt or not (text completion vs chat)
+   * @param temperature temperature for sampling (use negative value to use module default)
+   * @param numBos number of BOS tokens to prepend
+   * @param numEos number of EOS tokens to append
+   * @throws IllegalStateException if the module has been destroyed or the calling thread already
+   *   holds the module lock
+   */
+  fun generate(
+      image: IntArray?,
+      width: Int,
+      height: Int,
+      channels: Int,
+      prompt: String,
+      seqLen: Int,
+      llmCallback: LlmCallback,
+      echo: Boolean,
+      temperature: Float,
+      numBos: Int,
+      numEos: Int,
+  ) {
+    mLock.lock()
+    try {
+      checkNotReentrant()
+      checkNotDestroyed()
+      // The image prefill step is skipped entirely when no image was supplied.
+      image?.let { pixels ->
+        val prefillStatus = prefillImagesInput(pixels, width, height, channels)
+        if (prefillStatus != 0) {
+          throw ExecutorchRuntimeException.makeExecutorchException(prefillStatus, "Prefill failed")
+        }
+      }
+      val status = generateNative(prompt, seqLen, llmCallback, echo, temperature, numBos, numEos)
+      if (status == 0) return
+      throw ExecutorchRuntimeException.makeExecutorchException(status, "Failed to generate")
+    } finally {
+      mLock.unlock()
+    }
+  }
+
+  // --- prefill methods ---
+
+  /**
+   * Prefill the KV cache with the given image input.
+   *
+   * @param image Input image as an int array
+   * @param width Input image width
+   * @param height Input image height
+   * @param channels Input image number of channels
+   * @throws ExecutorchRuntimeException if the prefill failed
+   */
+  @Experimental
+  fun prefillImages(image: IntArray, width: Int, height: Int, channels: Int) {
+    mLock.lock()
+    try {
+      checkNotReentrant()
+      checkNotDestroyed()
+      val status = prefillImagesInput(image, width, height, channels)
+      if (status == 0) return
+      throw ExecutorchRuntimeException.makeExecutorchException(status, "Prefill failed")
+    } finally {
+      mLock.unlock()
+    }
+  }
+
+  /**
+   * Prefill a multimodal Module with the given image input via a direct ByteBuffer. The buffer data
+   * is accessed directly without JNI array copies, unlike [prefillImages].
The ByteBuffer must
+   * contain raw uint8 pixel data in CHW format with at least channels * height * width bytes
+   * remaining. Only the first channels * height * width bytes from the buffer's current position
+   * are read; the position of the original ByteBuffer is not modified.
+   *
+   * @param image Input image as a direct ByteBuffer containing uint8 pixel data
+   * @param width Input image width; must be positive
+   * @param height Input image height; must be positive
+   * @param channels Input image number of channels; must be positive
+   * @throws IllegalArgumentException if the ByteBuffer is not direct, any dimension is not
+   *   positive, width*height*channels overflows or exceeds Int.MAX_VALUE, or the buffer has
+   *   insufficient remaining bytes
+   * @throws IllegalStateException if the module has been destroyed or the calling thread already
+   *   holds the module lock
+   * @throws ExecutorchRuntimeException if the prefill failed
+   */
+  @Experimental
+  fun prefillImages(image: ByteBuffer, width: Int, height: Int, channels: Int) {
+    mLock.lock()
+    try {
+      checkNotReentrant()
+      checkNotDestroyed()
+      require(image.isDirect) { "Input ByteBuffer must be direct." }
+      // Validate the dimensions on their own so that a bad width/height/channels value is not
+      // reported as a "remaining bytes" problem.
+      require(width > 0 && height > 0 && channels > 0) {
+        "width ($width), height ($height) and channels ($channels) must all be positive."
+      }
+      // Compute the byte count in Long with overflow detection.
+      val expectedBytes: Long
+      try {
+        val pixels = Math.multiplyExact(width.toLong(), height.toLong())
+        expectedBytes = Math.multiplyExact(pixels, channels.toLong())
+      } catch (ex: ArithmeticException) {
+        throw IllegalArgumentException(
+            "width*height*channels is too large and overflows the allowed range.",
+            ex,
+        )
+      }
+      require(
+          expectedBytes <= Int.MAX_VALUE.toLong() && image.remaining().toLong() >= expectedBytes
+      ) {
+        "ByteBuffer remaining (${image.remaining()}) must be at least width*height*channels ($expectedBytes)."
+      }
+      // slice() so that getDirectBufferAddress on the native side returns a pointer
+      // starting at the current position, not the base address.
+      val nativeResult = prefillImagesInputBuffer(image.slice(), width, height, channels)
+      if (nativeResult != 0) {
+        throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+      }
+    } finally {
+      mLock.unlock()
+    }
+  }
+
+  /**
+   * Prefill a multimodal Module with the given normalized image input via a direct ByteBuffer.
The
+   * buffer data is accessed directly without JNI array copies, unlike [prefillImages]. The
+   * ByteBuffer must contain normalized float pixel data in CHW format with at least channels *
+   * height * width * 4 bytes remaining. Only the first channels * height * width floats from the
+   * buffer's current position are consumed. The buffer must use the platform's native byte order
+   * (set via `buffer.order(ByteOrder.nativeOrder())`).
+   *
+   * @param image Input normalized image as a direct ByteBuffer containing float pixel data in
+   *   native byte order
+   * @param width Input image width; must be positive
+   * @param height Input image height; must be positive
+   * @param channels Input image number of channels; must be positive
+   * @throws IllegalArgumentException if the ByteBuffer is not direct, any dimension is not
+   *   positive, the required size overflows or exceeds Int.MAX_VALUE bytes, the buffer has
+   *   insufficient remaining bytes, is not float-aligned, or does not use native byte order
+   * @throws IllegalStateException if the module has been destroyed or the calling thread already
+   *   holds the module lock
+   * @throws ExecutorchRuntimeException if the prefill failed
+   */
+  @Experimental
+  fun prefillNormalizedImage(image: ByteBuffer, width: Int, height: Int, channels: Int) {
+    mLock.lock()
+    try {
+      checkNotReentrant()
+      checkNotDestroyed()
+      require(image.isDirect) { "Input ByteBuffer must be direct." }
+      require(image.order() == ByteOrder.nativeOrder()) {
+        "Input ByteBuffer must use native byte order (ByteOrder.nativeOrder())."
+      }
+      require(image.position() % Float.SIZE_BYTES == 0) {
+        "Input ByteBuffer position (${image.position()}) must be 4-byte aligned."
+      }
+      // Validate the dimensions before multiplying them, so that a bad width/height/channels
+      // value is not reported as an overflow or a "remaining bytes" problem.
+      require(width > 0 && height > 0 && channels > 0) {
+        "width ($width), height ($height) and channels ($channels) must all be positive."
+      }
+      val expectedBytes: Long
+      try {
+        val wh = Math.multiplyExact(width, height)
+        val whc = Math.multiplyExact(wh.toLong(), channels.toLong())
+        val totalBytes = Math.multiplyExact(whc, Float.SIZE_BYTES.toLong())
+        if (totalBytes > Int.MAX_VALUE.toLong()) {
+          throw IllegalArgumentException(
+              "ByteBuffer size (width*height*channels*4) exceeds Integer.MAX_VALUE bytes: $totalBytes",
+          )
+        }
+        expectedBytes = totalBytes
+      } catch (e: ArithmeticException) {
+        throw IllegalArgumentException(
+            "Overflow while computing width*height*channels*4 for ByteBuffer size.",
+            e,
+        )
+      }
+      require(image.remaining().toLong() >= expectedBytes) {
+        "ByteBuffer remaining (${image.remaining()}) must be at least width*height*channels*4 ($expectedBytes)."
+      }
+      require(image.remaining() % Float.SIZE_BYTES == 0) {
+        "ByteBuffer remaining (${image.remaining()}) must be a multiple of 4 (float size)."
+      }
+      // slice() so that getDirectBufferAddress on the native side returns a pointer
+      // starting at the current position, not the base address.
+      val nativeResult = prefillNormalizedImagesInputBuffer(image.slice(), width, height, channels)
+      if (nativeResult != 0) {
+        throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed")
+      }
+    } finally {
+      mLock.unlock()
+    }
+  }
+
+  // Native entry points below return a status code; callers treat 0 as success.
+  private external fun prefillImagesInput(
+      image: IntArray,
+      width: Int,
+      height: Int,
+      channels: Int,
+  ): Int
+
+  private external fun prefillImagesInputBuffer(
+      image: ByteBuffer,
+      width: Int,
+      height: Int,
+      channels: Int,
+  ): Int
+
+  private external fun prefillNormalizedImagesInputBuffer(
+      image: ByteBuffer,
+      width: Int,
+      height: Int,
+      channels: Int,
+  ): Int
+
+  /**
+   * Prefill the KV cache with the given normalized image input.
+ * + * @param image Input normalized image as a float array + * @param width Input image width + * @param height Input image height + * @param channels Input image number of channels + * @throws ExecutorchRuntimeException if the prefill failed + */ + @Experimental + fun prefillImages(image: FloatArray, width: Int, height: Int, channels: Int) { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + val nativeResult = prefillNormalizedImagesInput(image, width, height, channels) + if (nativeResult != 0) { + throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed") + } + } finally { + mLock.unlock() + } + } + + private external fun prefillNormalizedImagesInput( + image: FloatArray, + width: Int, + height: Int, + channels: Int, + ): Int + + /** + * Prefill the KV cache with the given preprocessed audio input. + * + * @param audio Input preprocessed audio as a byte array + * @param batchSize Input batch size + * @param nBins Input number of bins + * @param nFrames Input number of frames + * @throws ExecutorchRuntimeException if the prefill failed + */ + @Experimental + fun prefillAudio(audio: ByteArray, batchSize: Int, nBins: Int, nFrames: Int) { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + val nativeResult = prefillAudioInput(audio, batchSize, nBins, nFrames) + if (nativeResult != 0) { + throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed") + } + } finally { + mLock.unlock() + } + } + + private external fun prefillAudioInput( + audio: ByteArray, + batchSize: Int, + nBins: Int, + nFrames: Int, + ): Int + + /** + * Prefill the KV cache with the given preprocessed audio input. 
+ * + * @param audio Input preprocessed audio as a float array + * @param batchSize Input batch size + * @param nBins Input number of bins + * @param nFrames Input number of frames + * @throws ExecutorchRuntimeException if the prefill failed + */ + @Experimental + fun prefillAudio(audio: FloatArray, batchSize: Int, nBins: Int, nFrames: Int) { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + val nativeResult = prefillAudioInputFloat(audio, batchSize, nBins, nFrames) + if (nativeResult != 0) { + throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed") + } + } finally { + mLock.unlock() + } + } + + private external fun prefillAudioInputFloat( + audio: FloatArray, + batchSize: Int, + nBins: Int, + nFrames: Int, + ): Int + + /** + * Prefill the KV cache with the given raw audio input. + * + * @param audio Input raw audio as a byte array + * @param batchSize Input batch size + * @param nChannels Input number of channels + * @param nSamples Input number of samples + * @throws ExecutorchRuntimeException if the prefill failed + */ + @Experimental + fun prefillRawAudio(audio: ByteArray, batchSize: Int, nChannels: Int, nSamples: Int) { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + val nativeResult = prefillRawAudioInput(audio, batchSize, nChannels, nSamples) + if (nativeResult != 0) { + throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed") + } + } finally { + mLock.unlock() + } + } + + private external fun prefillRawAudioInput( + audio: ByteArray, + batchSize: Int, + nChannels: Int, + nSamples: Int, + ): Int + + /** + * Prefill the KV cache with the given text prompt. + * + * @param prompt The text prompt to prefill. 
+ * @throws ExecutorchRuntimeException if the prefill failed + */ + @Experimental + fun prefillPrompt(prompt: String) { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + val nativeResult = prefillTextInput(prompt) + if (nativeResult != 0) { + throw ExecutorchRuntimeException.makeExecutorchException(nativeResult, "Prefill failed") + } + } finally { + mLock.unlock() + } + } + + // returns status + private external fun prefillTextInput(prompt: String): Int + + /** + * Reset the context of the LLM. This will clear the KV cache and reset the state of the LLM. + * + * The startPos will be reset to 0. + */ + fun resetContext() { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + resetContextNative() + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun resetContextNative() + + /** Stop current generate() before it finishes. */ + fun stop() { + if (mDestroyed) return + stopNative() + } + + @DoNotStrip private external fun stopNative() + + /** Force loading the module. Otherwise the model is loaded during first generate(). 
*/ + fun load() { + mLock.lock() + try { + checkNotReentrant() + checkNotDestroyed() + val err = loadNative() + if (err != 0) { + throw ExecutorchRuntimeException.makeExecutorchException(err, "Failed to load model") + } + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun loadNative(): Int + + companion object { + const val MODEL_TYPE_TEXT = 1 + const val MODEL_TYPE_TEXT_VISION = 2 + const val MODEL_TYPE_MULTIMODAL = 2 + + private const val DEFAULT_SEQ_LEN = 128 + private const val DEFAULT_ECHO = true + private const val DEFAULT_TEMPERATURE = -1.0f + private const val DEFAULT_BOS = 0 + private const val DEFAULT_EOS = 0 + private const val DEFAULT_LOAD_MODE = LlmModuleConfig.LOAD_MODE_MMAP + + @DoNotStrip + @JvmStatic + private external fun initHybrid( + modelType: Int, + modulePath: String, + tokenizerPath: String, + temperature: Float, + dataFiles: List, + numBos: Int, + numEos: Int, + loadMode: Int, + ): HybridData + } +} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java deleted file mode 100644 index feb52a2b34b..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.java +++ /dev/null @@ -1,252 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch.extension.llm; - -/** - * Configuration class for initializing a LlmModule. - * - *

{@link #create()} method and the fluent builder pattern. - */ -public class LlmModuleConfig { - private final String modulePath; - private final String tokenizerPath; - private final float temperature; - private final String dataPath; - private final int modelType; - private final int numBos; - private final int numEos; - private final int loadMode; - - /** Load entire model file into a buffer (no mmap). */ - public static final int LOAD_MODE_FILE = 0; - - /** Load model via mmap without mlock (default). Pages faulted in on demand. */ - public static final int LOAD_MODE_MMAP = 1; - - /** Load model via mmap and pin all pages with mlock. */ - public static final int LOAD_MODE_MMAP_USE_MLOCK = 2; - - /** Load model via mmap and attempt mlock, ignoring mlock failures. */ - public static final int LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3; - - private LlmModuleConfig(Builder builder) { - this.modulePath = builder.modulePath; - this.tokenizerPath = builder.tokenizerPath; - this.temperature = builder.temperature; - this.dataPath = builder.dataPath; - this.modelType = builder.modelType; - this.numBos = builder.numBos; - this.numEos = builder.numEos; - this.loadMode = builder.loadMode; - } - - /** Model type constant for text-only models. */ - public static final int MODEL_TYPE_TEXT = 1; - - /** Model type constant for text-and-vision multimodal models. */ - public static final int MODEL_TYPE_TEXT_VISION = 2; - - /** Model type constant for generic multimodal models. */ - public static final int MODEL_TYPE_MULTIMODAL = 2; - - /** - * Creates a new Builder instance for constructing LlmModuleConfig objects. 
- * - * @return a new Builder instance with default configuration values - */ - public static Builder create() { - return new Builder(); - } - - // Getters with documentation - /** - * @return Path to the compiled model module (.pte file) - */ - public String getModulePath() { - return modulePath; - } - - /** - * @return Path to the tokenizer file or directory - */ - public String getTokenizerPath() { - return tokenizerPath; - } - - /** - * @return Temperature value for sampling (higher = more random) - */ - public float getTemperature() { - return temperature; - } - - /** - * @return Optional path to additional data files - */ - public String getDataPath() { - return dataPath; - } - - /** - * @return Type of model (text-only or text-vision) - */ - public int getModelType() { - return modelType; - } - - /** - * @return Number of BOS tokens to prepend - */ - public int getNumBos() { - return numBos; - } - - /** - * @return Number of EOS tokens to append - */ - public int getNumEos() { - return numEos; - } - - /** - * @return Load mode for the model file (one of LOAD_MODE_* constants) - */ - public int getLoadMode() { - return loadMode; - } - - /** - * Builder class for constructing LlmModuleConfig instances with optional parameters. - * - *

The builder provides a fluent interface for configuring model parameters and validates - * required fields before construction. - */ - public static class Builder { - private String modulePath; - private String tokenizerPath; - private float temperature = 0.8f; - private String dataPath = ""; - private int modelType = MODEL_TYPE_TEXT; - private int numBos = 0; - private int numEos = 0; - private int loadMode = LOAD_MODE_MMAP; - - Builder() {} - - /** - * Sets the path to the module. - * - * @param modulePath Path to module - * @return This builder instance for method chaining - */ - public Builder modulePath(String modulePath) { - this.modulePath = modulePath; - return this; - } - - /** - * Sets the path to the tokenizer. - * - * @param tokenizerPath Path to tokenizer - * @return This builder instance for method chaining - */ - public Builder tokenizerPath(String tokenizerPath) { - this.tokenizerPath = tokenizerPath; - return this; - } - - /** - * Sets the temperature for sampling generation. - * - * @param temperature Temperature value (typical range 0.0-1.0) - * @return This builder instance for method chaining - */ - public Builder temperature(float temperature) { - this.temperature = temperature; - return this; - } - - /** - * Sets the path to optional additional data files. - * - * @param dataPath Path to supplementary data resources - * @return This builder instance for method chaining - */ - public Builder dataPath(String dataPath) { - this.dataPath = dataPath; - return this; - } - - /** - * Sets the model type (text-only or multimodal). - * - * @param modelType One of MODEL_TYPE_TEXT, MODEL_TYPE_TEXT_VISION, MODEL_TYPE_MULTIMODAL - * @return This builder instance for method chaining - */ - public Builder modelType(int modelType) { - this.modelType = modelType; - return this; - } - - /** - * Sets the number of BOS tokens to prepend. 
- * - * @param numBos number of BOS tokens - * @return This builder instance for method chaining - */ - public Builder numBos(int numBos) { - this.numBos = numBos; - return this; - } - - /** - * Sets the number of EOS tokens to append. - * - * @param numEos number of EOS tokens - * @return This builder instance for method chaining - */ - public Builder numEos(int numEos) { - this.numEos = numEos; - return this; - } - - /** - * Sets the load mode for the model file. Defaults to {@link #LOAD_MODE_MMAP} (mmap without - * mlock), which avoids pinning model pages in RAM. - * - * @param loadMode One of LOAD_MODE_FILE, LOAD_MODE_MMAP, LOAD_MODE_MMAP_USE_MLOCK, - * LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS - * @return This builder instance for method chaining - * @throws IllegalArgumentException if {@code loadMode} is not one of the supported constants - */ - public Builder loadMode(int loadMode) { - if (loadMode != LOAD_MODE_FILE - && loadMode != LOAD_MODE_MMAP - && loadMode != LOAD_MODE_MMAP_USE_MLOCK - && loadMode != LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS) { - throw new IllegalArgumentException("Unknown load mode: " + loadMode); - } - this.loadMode = loadMode; - return this; - } - - /** - * Constructs the LlmModuleConfig instance with validated parameters. 
- * - * @return New LlmModuleConfig instance with configured values - * @throws IllegalArgumentException if required fields are missing - */ - public LlmModuleConfig build() { - if (modulePath == null || tokenizerPath == null) { - throw new IllegalArgumentException("Module path and tokenizer path are required"); - } - return new LlmModuleConfig(this); - } - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt new file mode 100644 index 00000000000..2d65633bb9f --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/LlmModuleConfig.kt @@ -0,0 +1,134 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch.extension.llm + +/** + * Configuration class for initializing a LlmModule. + * + * Use [create] method and the fluent builder pattern. + */ +class LlmModuleConfig +private constructor( + val modulePath: String, + val tokenizerPath: String, + val temperature: Float, + val dataPath: String?, + val modelType: Int, + val numBos: Int, + val numEos: Int, + val loadMode: Int, +) { + + companion object { + /** Load entire model file into a buffer (no mmap). */ + const val LOAD_MODE_FILE = 0 + + /** Load model via mmap without mlock (default). Pages faulted in on demand. */ + const val LOAD_MODE_MMAP = 1 + + /** Load model via mmap and pin all pages with mlock. */ + const val LOAD_MODE_MMAP_USE_MLOCK = 2 + + /** Load model via mmap and attempt mlock, ignoring mlock failures. */ + const val LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3 + + /** Model type constant for text-only models. 
*/ + const val MODEL_TYPE_TEXT = 1 + + /** Model type constant for text-and-vision multimodal models. */ + const val MODEL_TYPE_TEXT_VISION = 2 + + /** Model type constant for generic multimodal models. */ + const val MODEL_TYPE_MULTIMODAL = 2 + + /** + * Creates a new Builder instance for constructing LlmModuleConfig objects. + * + * @return a new Builder instance with default configuration values + */ + @JvmStatic fun create(): Builder = Builder() + } + + /** + * Builder class for constructing LlmModuleConfig instances with optional parameters. + * + * The builder provides a fluent interface for configuring model parameters and validates required + * fields before construction. + */ + class Builder internal constructor() { + private var modulePath: String? = null + private var tokenizerPath: String? = null + private var temperature: Float = 0.8f + private var dataPath: String? = "" + private var modelType: Int = MODEL_TYPE_TEXT + private var numBos: Int = 0 + private var numEos: Int = 0 + private var loadMode: Int = LOAD_MODE_MMAP + + /** Sets the path to the module. */ + fun modulePath(modulePath: String): Builder = apply { this.modulePath = modulePath } + + /** Sets the path to the tokenizer. */ + fun tokenizerPath(tokenizerPath: String): Builder = apply { this.tokenizerPath = tokenizerPath } + + /** Sets the temperature for sampling generation. */ + fun temperature(temperature: Float): Builder = apply { this.temperature = temperature } + + /** Sets the path to optional additional data files. */ + fun dataPath(dataPath: String?): Builder = apply { this.dataPath = dataPath } + + /** Sets the model type (text-only or multimodal). */ + fun modelType(modelType: Int): Builder = apply { this.modelType = modelType } + + /** Sets the number of BOS tokens to prepend. */ + fun numBos(numBos: Int): Builder = apply { this.numBos = numBos } + + /** Sets the number of EOS tokens to append. 
*/ + fun numEos(numEos: Int): Builder = apply { this.numEos = numEos } + + /** + * Sets the load mode for the model file. Defaults to [LOAD_MODE_MMAP] (mmap without mlock), + * which avoids pinning model pages in RAM. + * + * @throws IllegalArgumentException if loadMode is not one of the supported constants + */ + fun loadMode(loadMode: Int): Builder { + require( + loadMode == LOAD_MODE_FILE || + loadMode == LOAD_MODE_MMAP || + loadMode == LOAD_MODE_MMAP_USE_MLOCK || + loadMode == LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS + ) { + "Unknown load mode: $loadMode" + } + return apply { this.loadMode = loadMode } + } + + /** + * Constructs the LlmModuleConfig instance with validated parameters. + * + * @throws IllegalArgumentException if required fields are missing + */ + fun build(): LlmModuleConfig { + require(modulePath != null && tokenizerPath != null) { + "Module path and tokenizer path are required" + } + return LlmModuleConfig( + modulePath!!, + tokenizerPath!!, + temperature, + dataPath, + modelType, + numBos, + numEos, + loadMode, + ) + } + } +} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java deleted file mode 100644 index 86e19d09133..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm/package-info.java +++ /dev/null @@ -1,51 +0,0 @@ -/** - * ExecuTorch LLM extension for Android. - * - *

This package provides Java bindings for running large language models (LLMs) on Android using - * ExecuTorch. It supports text generation, tokenization, and streaming token callbacks. - * - *

Quick Start

- * - *
{@code
- * import org.pytorch.executorch.extension.llm.LlmModule;
- *
- * // Load a Llama model
- * LlmModule llm = new LlmModule(
- *     "/data/local/tmp/llama.pte",
- *     "/data/local/tmp/tokenizer.bin",
- *     0.8f
- * );
- * llm.load();
- *
- * // Generate text token by token
- * llm.generate("Hello, my name is", 200, new LlmCallback() {
- *     public void onResult(String token) {
- *         System.out.print(token);
- *     }
- *     public void onStats(String stats) {
- *         System.out.println("\nStats: " + stats);
- *     }
- * });
- * }
- * - *

Key Classes

- * - * - * - *

More Resources

- * - * - */ -package org.pytorch.executorch.extension.llm; From 6bda6c490ed8c2e2ac02049725b9a454dc92ec07 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Fri, 22 May 2026 18:25:34 -0700 Subject: [PATCH 004/103] Globally serialize XNNPACK execution, add logging (#19742) Differential Revision: D106123930 Pull Request resolved: https://github.com/pytorch/executorch/pull/19742 --- backends/xnnpack/runtime/XNNPACKBackend.cpp | 53 ++++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index c20fa985f46..2fe1e4d162e 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -41,6 +42,13 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::Result; using executorch::runtime::Span; +// Global mutex for all XNNPACK operations. This is temporary, tracked by +// T272407942. +static std::mutex& global_xnnpack_mutex() { + static std::mutex m; + return m; +} + class XnnpackBackend final : public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface { public: @@ -66,6 +74,8 @@ class XnnpackBackend final BackendInitContext& context, FreeableBuffer* processed, ArrayRef compile_specs) const override { + const std::lock_guard global_lock(global_xnnpack_mutex()); + auto executor = context.get_runtime_allocator() ->allocateInstance(); if (executor == nullptr) { @@ -129,6 +139,17 @@ class XnnpackBackend final Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err); return err; } + + ET_LOG( + Info, + "XnnpackBackend::init delegate=%p workspace_id=%" PRIu64 + " workspace_ptr=%p program_id=0x%" PRIxPTR " weight_cache=%s", + (void*)executor, + workspace->id(), + (void*)workspace_ptr, + program_id, + use_weight_cache ? 
"true" : "false"); + return executor; } @@ -136,15 +157,27 @@ class XnnpackBackend final BackendExecutionContext& context, DelegateHandle* handle, Span args) const override { + const std::lock_guard global_lock(global_xnnpack_mutex()); + auto executor = static_cast(handle); + auto workspace = executor->get_workspace(); + ET_LOG( + Info, + "XnnpackBackend::execute begin delegate=%p workspace_id=%" PRIu64 + " num_args=%zu weight_cache=%s", + (void*)executor, + workspace->id(), + (size_t)args.size(), + executor->uses_weight_cache() ? "true" : "false"); + std::unique_lock lock_weights_cache( weights_cache_mutex_, std::defer_lock); if (executor->uses_weight_cache()) { lock_weights_cache.lock(); } - auto [raii_lock, _] = executor->get_workspace()->acquire(); + auto [raii_lock, _] = workspace->acquire(); // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); @@ -161,12 +194,29 @@ class XnnpackBackend final // Convert output data types if necessary (e.g., int32 -> int64 for Long) err = executor->convert_outputs(args); + ET_LOG( + Info, + "XnnpackBackend::execute end delegate=%p workspace_id=%" PRIu64 + " err=0x%x", + (void*)executor, + workspace->id(), + (unsigned int)err); + return err; } void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { + const std::lock_guard global_lock(global_xnnpack_mutex()); + auto executor = static_cast(handle); + auto workspace = executor->get_workspace(); + + ET_LOG( + Info, + "XnnpackBackend::destroy delegate=%p workspace_id=%" PRIu64, + (void*)executor, + workspace->id()); #ifdef ENABLE_XNNPACK_PROFILING executor->print_avg_op_timings(); @@ -183,7 +233,6 @@ class XnnpackBackend final // the same backend instance. Make sure to hold onto the workspace // shared_ptr, as the pointer in the executor is freed, which includes // the mutex referenced by raii_lock. 
- auto workspace = executor->get_workspace(); auto [raii_lock, _] = workspace->acquire(); // XNNExecutor is not trivially destructible. Since this was constructed From 12f62f2eb869eddbe4c612efe3f957bfc965aff0 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Fri, 22 May 2026 20:48:11 -0700 Subject: [PATCH 005/103] [ET Device Support] Module: allocate device memory for planned buffers (#19746) https://github.com/pytorch/executorch/pull/18476 clone version due to bot crash --- extension/module/module.cpp | 78 ++++++- extension/module/module.h | 9 + extension/module/targets.bzl | 1 + .../module/test/module_device_memory_test.cpp | 218 ++++++++++++++++++ extension/module/test/targets.bzl | 22 +- .../executorch/build/build_variables.bzl | 2 + test/models/targets.bzl | 1 + 7 files changed, 328 insertions(+), 3 deletions(-) create mode 100644 extension/module/test/module_device_memory_test.cpp diff --git a/extension/module/module.cpp b/extension/module/module.cpp index 5422fb15b71..11fea031603 100644 --- a/extension/module/module.cpp +++ b/extension/module/module.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include namespace executorch { @@ -367,6 +368,51 @@ Module::make_planned_memory_with_shared_arenas( return planned; } +std::unique_ptr Module::make_planned_memory_with_devices( + const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) { + auto planned = std::make_unique(); + const size_t num_buffers = method_meta.num_memory_planned_buffers(); + planned->planned_buffers.reserve(num_buffers); + planned->planned_spans.reserve(num_buffers); + planned->device_buffers.reserve(num_buffers); + planned->planned_devices.reserve(num_buffers); + + for (size_t i = 0; i < num_buffers; ++i) { + auto size = method_meta.memory_planned_buffer_size(i); + ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i); + auto device = method_meta.memory_planned_buffer_device(i); + ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i); + 
planned->planned_devices.push_back(device.get()); + + if (device->is_cpu()) { + planned->planned_buffers.emplace_back(size.get()); + planned->planned_spans.emplace_back( + planned->planned_buffers.back().data(), size.get()); + } else { + // Allocate device memory via DeviceAllocator and store the RAII buffer. + planned->planned_buffers.emplace_back(); // empty CPU placeholder + auto dmb = runtime::DeviceMemoryBuffer::create( + size.get(), device->type(), device->index()); + ET_CHECK_MSG( + dmb.ok(), + "Failed to allocate device memory for buffer %zu (device_type=%d)", + i, + static_cast(device->type())); + planned->planned_spans.emplace_back(dmb->as_span()); + planned->device_buffers.push_back(std::move(dmb.get())); + } + } + + // HierarchicalAllocator owns the per-buffer Device metadata so the + // MemoryManager can later expose it via planned_buffer_devices(). + planned->planned_memory = std::make_unique( + runtime::Span>( + planned->planned_spans.data(), planned->planned_spans.size()), + runtime::Span( + planned->planned_devices.data(), planned->planned_devices.size())); + return planned; +} + runtime::Result> Module::get_mem_planned_buffer_sizes( const std::string& method_name) { auto meta_res = program_->method_meta(method_name.c_str()); @@ -422,10 +468,38 @@ runtime::Error Module::load_method( MethodHolder method_holder; if (!planned_memory) { - if (!share_memory_arenas_) { + // Check if any buffers need device memory allocation. + auto meta_res = program_->method_meta(method_name.c_str()); + ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error()); + auto& meta = meta_res.get(); + + bool has_device_buffers = false; + for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) { + auto dev = meta.memory_planned_buffer_device(i); + if (dev.ok() && !dev->is_cpu()) { + has_device_buffers = true; + break; + } + } + + if (has_device_buffers) { + // Device memory with shared arenas is not yet supported. 
+ ET_CHECK_OR_RETURN_ERROR( + !share_memory_arenas_, + NotSupported, + "Device memory buffers are not yet compatible with " + "share_memory_arenas. Please disable share_memory_arenas " + "when using models with device-planned memory."); + + // Device-aware path: allocate CPU and device buffers. The device + // span is owned by the HierarchicalAllocator inside PlannedMemory. + method_holder.planned_memory = make_planned_memory_with_devices(meta); + planned_memory = method_holder.planned_memory->planned_memory.get(); + } else if (!share_memory_arenas_) { auto sizes_res = get_mem_planned_buffer_sizes(method_name); ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error()); method_holder.planned_memory = make_planned_memory(sizes_res.get()); + planned_memory = method_holder.planned_memory->planned_memory.get(); } else { auto sizes_res = get_mem_planned_buffer_sizes(method_name); ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error()); @@ -442,8 +516,8 @@ runtime::Error Module::load_method( } method_holder.planned_memory = make_planned_memory_with_shared_arenas(sizes, shared_arenas_); + planned_memory = method_holder.planned_memory->planned_memory.get(); } - planned_memory = method_holder.planned_memory->planned_memory.get(); } method_holder.memory_manager = std::make_unique( diff --git a/extension/module/module.h b/extension/module/module.h index 47ead23032e..91c7feaad9b 100644 --- a/extension/module/module.h +++ b/extension/module/module.h @@ -18,6 +18,8 @@ #include #include +#include + #ifdef USE_ATEN_LIB #define ET_MODULE_NAMESPACE module::aten #else // !USE_ATEN_LIB @@ -716,6 +718,11 @@ class Module { struct PlannedMemory { std::vector> planned_buffers; std::vector> planned_spans; + std::vector device_buffers; + /// Per-buffer Device (type + index) metadata used by + /// HierarchicalAllocator. Owns the storage backing the device span the + /// allocator references, so it must outlive `planned_memory`. 
+ std::vector planned_devices; std::unique_ptr planned_memory; }; std::unique_ptr make_planned_memory( @@ -723,6 +730,8 @@ class Module { std::unique_ptr make_planned_memory_with_shared_arenas( const std::vector& buffer_sizes, std::vector>& shared_arenas); + std::unique_ptr make_planned_memory_with_devices( + const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta); runtime::Result> get_mem_planned_buffer_sizes( const std::string& method_name); runtime::Result> get_max_mem_planned_buffer_sizes(); diff --git a/extension/module/targets.bzl b/extension/module/targets.bzl index fa80203831a..e622b138ff6 100644 --- a/extension/module/targets.bzl +++ b/extension/module/targets.bzl @@ -30,6 +30,7 @@ def define_common_targets(): "//executorch/runtime/backend:backend_options", "//executorch/runtime/backend:backend_options_map", "//executorch/runtime/executor:program_no_prim_ops" + aten_suffix, + "//executorch/runtime/core:device_memory_buffer", ], ) diff --git a/extension/module/test/module_device_memory_test.cpp b/extension/module/test/module_device_memory_test.cpp new file mode 100644 index 00000000000..5031273ac2b --- /dev/null +++ b/extension/module/test/module_device_memory_test.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests that Module's device-aware memory allocation path works correctly. + * + * Uses ModuleAddWithDevice.pte which has: + * non_const_buffer_sizes: [0, 48] (1 buffer, index 0 reserved) + * non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}] + * + * Since we don't have a real CUDA backend, we test that: + * 1. CPU-only models load through Module without invoking device allocator + * 2. 
Device-annotated models trigger DeviceMemoryBuffer::create via a mock + */ + +#include + +#include + +#include +#include +#include + +using executorch::extension::Module; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::DeviceMemoryBuffer; +using executorch::runtime::Error; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +class MockCudaAllocator : public DeviceAllocator { + public: + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignment = kDefaultAlignment) override { + (void)alignment; + allocate_count_++; + last_allocate_size_ = nbytes; + last_allocate_index_ = index; + buffer_ = std::make_unique(nbytes); + return static_cast(buffer_.get()); + } + + void deallocate(void* ptr, DeviceIndex index) override { + deallocate_count_++; + buffer_.reset(); + } + + Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override { + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int allocate_count_ = 0; + int deallocate_count_ = 0; + size_t last_allocate_size_ = 0; + DeviceIndex last_allocate_index_ = -1; + + private: + std::unique_ptr buffer_; +}; + +} // namespace + +static MockCudaAllocator g_mock_cuda; + +class ModuleDeviceMemoryTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + register_device_allocator(&g_mock_cuda); + } + + void SetUp() override { + g_mock_cuda.allocate_count_ = 0; + g_mock_cuda.deallocate_count_ = 0; + g_mock_cuda.last_allocate_size_ = 0; + g_mock_cuda.last_allocate_index_ = -1; + } +}; + +TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) { + const char* path = 
std::getenv("ET_MODULE_ADD_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_PATH not set"; + + Module module(path); + auto err = module.load_method("forward"); + ASSERT_EQ(err, Error::Ok); + + EXPECT_EQ(g_mock_cuda.allocate_count_, 0) + << "CPU-only model should not allocate device memory"; +} + +TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) { + // Directly test DeviceMemoryBuffer::create with the registered mock. + // This verifies the RAII allocation/deallocation path that Module uses. + { + auto result = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0); + ASSERT_TRUE(result.ok()); + auto buf = std::move(result.get()); + + EXPECT_EQ(g_mock_cuda.allocate_count_, 1); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48); + EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0); + EXPECT_NE(buf.data(), nullptr); + EXPECT_EQ(buf.size(), 48); + + // as_span() wraps the device pointer for HierarchicalAllocator. + auto span = buf.as_span(); + EXPECT_EQ(span.data(), static_cast(buf.data())); + EXPECT_EQ(span.size(), 48); + + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0); + } + // RAII deallocation on scope exit. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1); +} + +TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) { + // Verify MethodMeta reports the correct device for buffers in the + // device-annotated model, without needing to load the full method. + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + Module module(path); + auto err = module.load(); + ASSERT_EQ(err, Error::Ok); + + auto meta = module.method_meta("forward"); + ASSERT_TRUE(meta.ok()); + + // ModuleAddWithDevice has 1 planned buffer (48 bytes) on CUDA. 
+ ASSERT_EQ(meta->num_memory_planned_buffers(), 1); + + auto size = meta->memory_planned_buffer_size(0); + ASSERT_TRUE(size.ok()); + EXPECT_EQ(size.get(), 48); + + auto device = meta->memory_planned_buffer_device(0); + ASSERT_TRUE(device.ok()); + EXPECT_EQ(device->type(), DeviceType::CUDA); + EXPECT_EQ(device->index(), 0); +} + +TEST_F(ModuleDeviceMemoryTest, DeviceModelWithSharedArenasReturnsNotSupported) { + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + // share_memory_arenas = true with a device-annotated model should fail. + Module module( + path, + Module::LoadMode::File, + /*event_tracer=*/nullptr, + /*memory_allocator=*/nullptr, + /*temp_allocator=*/nullptr, + /*share_memory_arenas=*/true); + + auto err = module.load_method("forward"); + EXPECT_EQ(err, Error::NotSupported); +} + +TEST_F( + ModuleDeviceMemoryTest, + LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) { + const char* path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH"); + ASSERT_NE(path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set"; + + { + Module module(path); + auto err = module.load_method("forward"); + + // Regardless of whether load_method succeeds or fails (e.g. due to + // backend init issues), the device-aware memory allocation path + // (make_planned_memory_with_devices) runs BEFORE backend init. + EXPECT_EQ(g_mock_cuda.allocate_count_, 1) + << "Expected 1 device allocation for the CUDA buffer" + << " (actual: " << g_mock_cuda.allocate_count_ << ")" + << ", deallocate_count=" << g_mock_cuda.deallocate_count_ + << ", load_method returned error=" << static_cast(err); + EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48) + << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)"; + EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0) + << "Expected device_index=0 (cuda:0)"; + + if (err == Error::Ok) { + // Success path: MethodHolder moved into methods_ map. 
+ // DeviceMemoryBuffer is alive as long as Module is alive. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 0) + << "No deallocation while method is loaded"; + } else { + // Error path: local MethodHolder destroyed on return from load_method. + // RAII deallocation already happened. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1) + << "RAII deallocation on error path"; + } + } + + // After Module destroyed, all device memory must be freed. + EXPECT_EQ(g_mock_cuda.deallocate_count_, 1) + << "Expected deallocation after Module destroyed"; +} diff --git a/extension/module/test/targets.bzl b/extension/module/test/targets.bzl index f0d7e449efd..4dc3fb537f3 100644 --- a/extension/module/test/targets.bzl +++ b/extension/module/test/targets.bzl @@ -28,7 +28,7 @@ def define_common_targets(is_fbcode=False): aten_suffix = ("_aten" if aten_mode else "") runtime.cxx_test( - name = "test" + aten_suffix, + name = "module_test" + aten_suffix, srcs = [ "module_test.cpp", ], @@ -68,6 +68,26 @@ def define_common_targets(is_fbcode=False): ], ) + runtime.cxx_test( + name = "module_device_memory_test" + aten_suffix, + srcs = [ + "module_device_memory_test.cpp", + ], + deps = [ + "//executorch/kernels/portable:generated_lib" + aten_suffix, + "//executorch/extension/module:module" + aten_suffix, + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core:device_memory_buffer", + ], + env = { + "ET_MODULE_ADD_WITH_DEVICE_PATH": "$(location fbcode//executorch/test/models:exported_program_with_device_info[ModuleAddWithDevice.pte])", + "ET_MODULE_ADD_PATH": "$(location fbcode//executorch/test/models:exported_programs[ModuleAdd.pte])", + }, + compiler_flags = [ + "-Wno-error=deprecated-declarations", + ], + ) + runtime.filegroup( name = "resources", srcs = native.glob([ diff --git a/shim_et/xplat/executorch/build/build_variables.bzl b/shim_et/xplat/executorch/build/build_variables.bzl index b0545b8ce18..659a128994f 100644 --- a/shim_et/xplat/executorch/build/build_variables.bzl +++ 
b/shim_et/xplat/executorch/build/build_variables.bzl @@ -50,6 +50,8 @@ PLATFORM_SRCS = [ EXECUTORCH_CORE_SRCS = sorted([ "runtime/backend/interface.cpp", + "runtime/core/device_allocator.cpp", + "runtime/core/device_memory_buffer.cpp", "runtime/core/evalue.cpp", "runtime/core/exec_aten/util/tensor_shape_to_c_string.cpp", "runtime/core/exec_aten/util/tensor_util_portable.cpp", diff --git a/test/models/targets.bzl b/test/models/targets.bzl index c9fb67b7d31..a80244b1383 100644 --- a/test/models/targets.bzl +++ b/test/models/targets.bzl @@ -226,6 +226,7 @@ def define_common_targets(): default_outs = ["."], visibility = [ "//executorch/runtime/executor/test/...", + "//executorch/extension/module/test/...", ], ) From c27cc5d5bb872603ec90378c486049bc2c77a382 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Fri, 22 May 2026 20:54:37 -0700 Subject: [PATCH 006/103] [ET Device Support] CudaAllocator: device memory allocator for CUDA backend (#19747) clone https://github.com/pytorch/executorch/pull/18477 due to bot crash --- backends/aoti/slim/core/storage.h | 44 ++-- backends/aoti/slim/core/targets.bzl | 1 + backends/cuda/runtime/TARGETS | 29 +++ backends/cuda/runtime/cuda_allocator.cpp | 258 +++++++++++++++++++++++ backends/cuda/runtime/cuda_allocator.h | 84 ++++++++ backends/cuda/runtime/cuda_backend.cpp | 9 + 6 files changed, 395 insertions(+), 30 deletions(-) create mode 100644 backends/cuda/runtime/cuda_allocator.cpp create mode 100644 backends/cuda/runtime/cuda_allocator.h diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h index 73c4d32d955..a3d17a89903 100644 --- a/backends/aoti/slim/core/storage.h +++ b/backends/aoti/slim/core/storage.h @@ -13,6 +13,7 @@ #ifdef CUDA_AVAILABLE #include #include +#include #endif #include @@ -107,9 +108,6 @@ struct DeviceTraits { /// @param device The target CUDA device (used to get the stream). /// @return Pointer to allocated device memory. 
static void* allocate(size_t nbytes, const c10::Device& device) { - // Get the current stream for this device (set by CUDAStreamGuard if any) - // This follows PyTorch's pattern where the allocator assumes the caller - // has already set the correct device via CUDAStreamGuard. auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(device.index()); ET_CHECK_MSG( @@ -118,31 +116,23 @@ struct DeviceTraits { static_cast(device.index())); cudaStream_t stream = stream_result.get(); - void* data = nullptr; - ET_CUDA_CHECK(cudaMallocAsync(&data, nbytes, stream)); - return data; + auto result = executorch::backends::cuda::CudaAllocator::allocate_async( + nbytes, device.index(), stream); + ET_CHECK_MSG( + result.ok(), + "CudaAllocator::allocate_async failed for %zu bytes on device %d", + nbytes, + static_cast(device.index())); + return result.get(); } - /// Frees CUDA device memory on the current stream. - /// @param ptr Pointer to device memory to free. static void free(void* ptr) { - // Get the current stream for the current device - // Currently all cuda slimtensors should be on the same device same stream, - // so we can just use the stream on current device. - // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to - // support multiple devices. auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1); ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream"); - ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get())); + executorch::backends::cuda::CudaAllocator::deallocate_async( + ptr, -1, stream_result.get()); } - /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously. - /// @param dst Destination pointer. - /// @param src Source pointer. - /// @param nbytes Number of bytes to copy. - /// @param dst_device Destination device. - /// @param src_device Source device. - /// @param stream CUDA stream for async copy. 
static void memcpy_async( void* dst, const void* src, @@ -151,7 +141,6 @@ struct DeviceTraits { const c10::Device& src_device, cudaStream_t stream) { cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - if (src_device.is_cpu()) { direction = cudaMemcpyHostToDevice; } else if (dst_device.is_cpu()) { @@ -164,15 +153,11 @@ struct DeviceTraits { static_cast(dst_device.index())); } - ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream)); + auto err = executorch::backends::cuda::CudaAllocator::memcpy_async( + dst, src, nbytes, direction, stream); + ET_CHECK_MSG(err == executorch::runtime::Error::Ok, "memcpy_async failed"); } - /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously. - /// @param dst Destination pointer. - /// @param src Source pointer. - /// @param nbytes Number of bytes to copy. - /// @param dst_device Destination device. - /// @param src_device Source device. static void memcpy( void* dst, const void* src, @@ -180,7 +165,6 @@ struct DeviceTraits { const c10::Device& dst_device, const c10::Device& src_device) { cudaMemcpyKind direction = cudaMemcpyDeviceToDevice; - if (src_device.is_cpu()) { direction = cudaMemcpyHostToDevice; } else if (dst_device.is_cpu()) { diff --git a/backends/aoti/slim/core/targets.bzl b/backends/aoti/slim/core/targets.bzl index b9148305c91..42a7b79da6e 100644 --- a/backends/aoti/slim/core/targets.bzl +++ b/backends/aoti/slim/core/targets.bzl @@ -19,6 +19,7 @@ def define_common_targets(): "//executorch/runtime/platform:platform", "//executorch/backends/aoti/slim/c10/cuda:exception", "//executorch/backends/aoti/slim/cuda:guard", + "//executorch/backends/cuda/runtime:cuda_allocator", ], ) diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS index f13f41ab8b7..c8449a95718 100644 --- a/backends/cuda/runtime/TARGETS +++ b/backends/cuda/runtime/TARGETS @@ -74,6 +74,33 @@ runtime.cxx_library( ], ) +runtime.cxx_library( + name = "cuda_allocator", + srcs = [ + "cuda_allocator.cpp", + ], + 
headers = [ + "cuda_allocator.h", + ], + # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole) + link_whole = True, + supports_python_dlopen = True, + visibility = ["PUBLIC"], + exported_deps = [ + "//executorch/runtime/core:device_allocator", + ], + deps = [ + "//executorch/runtime/platform:platform", + ], + nvcc_flags = get_nvcc_arch_args() + [ + "-_NVCC_HOST_COMPILER_FLAG_", + "gcc", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], +) + runtime.cxx_library( name = "cuda_backend", srcs = [ @@ -92,6 +119,8 @@ runtime.cxx_library( deps = [ ":cuda_platform", ":runtime_shims", + ":cuda_allocator", + ":cuda_platform", "//executorch/backends/aoti:aoti_common_slim", "//executorch/backends/aoti/slim/core:slimtensor", "//executorch/backends/aoti/slim/factory:empty", diff --git a/backends/cuda/runtime/cuda_allocator.cpp b/backends/cuda/runtime/cuda_allocator.cpp new file mode 100644 index 00000000000..94294b08fa0 --- /dev/null +++ b/backends/cuda/runtime/cuda_allocator.cpp @@ -0,0 +1,258 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include + +namespace executorch::backends::cuda { + +using executorch::runtime::Error; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +Result +CudaAllocator::allocate(size_t nbytes, DeviceIndex index, size_t alignment) { + // index == -1 means "use the current CUDA device"; any value < -1 is invalid. + ET_CHECK_OR_RETURN_ERROR( + index >= -1, + InvalidArgument, + "CudaAllocator::allocate: invalid device index %d (must be >= -1)", + static_cast(index)); + + // Alignment must be a non-zero power of 2. 
+ ET_CHECK_OR_RETURN_ERROR( + alignment != 0 && (alignment & (alignment - 1)) == 0, + InvalidArgument, + "CudaAllocator::allocate: alignment must be a power of 2, got %zu", + alignment); + + // cudaMalloc is documented to return memory aligned to at least 256 bytes, + // which trivially satisfies kDefaultAlignment (alignof(void*)). For any + // requested alignment <= 256 bytes, the returned pointer is already aligned. + // Stricter alignment would require over-allocation plus bookkeeping that + // deallocate() does not currently support, so reject that case. + constexpr size_t kCudaMallocAlignment = 256; + ET_CHECK_OR_RETURN_ERROR( + alignment <= kCudaMallocAlignment, + NotSupported, + "CudaAllocator::allocate: requested alignment %zu exceeds cudaMalloc's " + "guaranteed alignment of %zu bytes; stricter alignment is not supported", + alignment, + kCudaMallocAlignment); + + void* ptr = nullptr; + int prev_device = 0; + cudaError_t prev_device_err = cudaGetDevice(&prev_device); + + // If index == -1, fall back to the current device returned by cudaGetDevice + // and skip the set/restore round-trip. + const bool switch_device = index >= 0 && prev_device_err == cudaSuccess && + static_cast(index) != prev_device; + if (switch_device) { + cudaSetDevice(index); + } + + cudaError_t err = cudaMalloc(&ptr, nbytes); + + if (switch_device) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMalloc failed: %s (requested %zu bytes on device %d)", + cudaGetErrorString(err), + nbytes, + static_cast(index)); + return Error::MemoryAllocationFailed; + } + + // Sanity check: the pointer returned by cudaMalloc should already meet the + // requested alignment. If a future CUDA runtime weakens this guarantee, we + // want to fail loudly rather than silently return a misaligned pointer. 
+ if ((reinterpret_cast(ptr) & (alignment - 1)) != 0) { + ET_LOG( + Error, + "cudaMalloc returned pointer %p not aligned to %zu bytes", + ptr, + alignment); + cudaFree(ptr); + return Error::MemoryAllocationFailed; + } + + return ptr; +} + +void CudaAllocator::deallocate(void* ptr, DeviceIndex index) { + if (ptr == nullptr) { + return; + } + + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaFree(ptr); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaFree failed: %s (ptr=%p, device %d)", + cudaGetErrorString(err), + ptr, + static_cast(index)); + } +} + +// TODO(gasoonjia): Add support for async copy +Error CudaAllocator::copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) { + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = cudaMemcpy(dst, src, nbytes, cudaMemcpyHostToDevice); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy H2D failed: %s (%zu bytes, device %d)", + cudaGetErrorString(err), + nbytes, + static_cast(index)); + return Error::Internal; + } + return Error::Ok; +} + +// TODO(gasoonjia): Add support for async copy +Error CudaAllocator::copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) { + int prev_device = 0; + cudaError_t prev_device_err = cudaSuccess; + + if (index >= 0) { + prev_device_err = cudaGetDevice(&prev_device); + if (prev_device_err == cudaSuccess) { + cudaSetDevice(index); + } + } + + cudaError_t err = 
cudaMemcpy(dst, src, nbytes, cudaMemcpyDeviceToHost); + + if (index >= 0 && prev_device_err == cudaSuccess) { + cudaSetDevice(prev_device); + } + + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpy D2H failed: %s (%zu bytes, device %d)", + cudaGetErrorString(err), + nbytes, + static_cast(index)); + return Error::Internal; + } + return Error::Ok; +} + +DeviceType CudaAllocator::device_type() const { + return DeviceType::CUDA; +} + +CudaAllocator& CudaAllocator::instance() { + static CudaAllocator allocator; + return allocator; +} + +Result CudaAllocator::allocate_async( + size_t nbytes, + DeviceIndex index, + cudaStream_t stream) { + void* ptr = nullptr; + cudaError_t err = cudaMallocAsync(&ptr, nbytes, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMallocAsync failed: %s (requested %zu bytes on device %d)", + cudaGetErrorString(err), + nbytes, + static_cast(index)); + return Error::MemoryAllocationFailed; + } + return ptr; +} + +void CudaAllocator::deallocate_async( + void* ptr, + DeviceIndex index, + cudaStream_t stream) { + if (ptr == nullptr) { + return; + } + cudaError_t err = cudaFreeAsync(ptr, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaFreeAsync failed: %s (ptr=%p, device %d)", + cudaGetErrorString(err), + ptr, + static_cast(index)); + } +} + +Error CudaAllocator::memcpy_async( + void* dst, + const void* src, + size_t nbytes, + cudaMemcpyKind direction, + cudaStream_t stream) { + cudaError_t err = cudaMemcpyAsync(dst, src, nbytes, direction, stream); + if (err != cudaSuccess) { + ET_LOG( + Error, + "cudaMemcpyAsync failed: %s (%zu bytes)", + cudaGetErrorString(err), + nbytes); + return Error::Internal; + } + return Error::Ok; +} + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_allocator.h b/backends/cuda/runtime/cuda_allocator.h new file mode 100644 index 00000000000..fcd8224305a --- /dev/null +++ b/backends/cuda/runtime/cuda_allocator.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 
Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +#include + +namespace executorch::backends::cuda { + +/** + * CUDA implementation of DeviceAllocator. + * + * Uses cudaMalloc/cudaFree for allocation and cudaMemcpy for host-device + * transfers. This allocator is automatically registered as a singleton + * with the DeviceAllocatorRegistry when the CUDA backend library is linked. + * + * All CUDA memory operations in the CUDA backend should go through this + * allocator for consistent memory management. + */ +class CudaAllocator final : public executorch::runtime::DeviceAllocator { + public: + executorch::runtime::Result allocate( + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index, + size_t alignment = kDefaultAlignment) override; + + void deallocate(void* ptr, executorch::runtime::etensor::DeviceIndex index) + override; + + executorch::runtime::Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index) override; + + executorch::runtime::etensor::DeviceType device_type() const override; + + /// Returns the global CudaAllocator singleton. + static CudaAllocator& instance(); + + // --- Async (stream-based) operations for SlimTensor/Storage layer --- + + /** + * Allocate device memory asynchronously on the given CUDA stream. + */ + static executorch::runtime::Result allocate_async( + size_t nbytes, + executorch::runtime::etensor::DeviceIndex index, + cudaStream_t stream); + + /** + * Deallocate device memory asynchronously on the given CUDA stream. 
+ */ + static void deallocate_async( + void* ptr, + executorch::runtime::etensor::DeviceIndex index, + cudaStream_t stream); + + /** + * Copy memory asynchronously on the given CUDA stream. + * Supports H2D, D2H, and D2D based on src/dst device types. + */ + static executorch::runtime::Error memcpy_async( + void* dst, + const void* src, + size_t nbytes, + cudaMemcpyKind direction, + cudaStream_t stream); +}; + +} // namespace executorch::backends::cuda diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp index 1497ba1e376..d2738f7a976 100644 --- a/backends/cuda/runtime/cuda_backend.cpp +++ b/backends/cuda/runtime/cuda_backend.cpp @@ -40,6 +40,7 @@ // Include our shim layer headers #include #include +#include #include #include #include @@ -1273,5 +1274,13 @@ auto cls = cuda::CudaBackend(); executorch::runtime::Backend backend{"CudaBackend", &cls}; static executorch::runtime::Error success_with_compiler = register_backend(backend); + +// Auto-register the CudaAllocator so that DeviceMemoryBuffer::create(CUDA) +// works whenever the CUDA backend library is linked. 
+static bool cuda_allocator_registered = [] { + executorch::runtime::register_device_allocator( + &cuda::CudaAllocator::instance()); + return true; +}(); } // namespace } // namespace executorch::backends From 7d8063f9e6221ad8724f122ad3ec4cbb1aae2fc6 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Fri, 22 May 2026 20:56:14 -0700 Subject: [PATCH 007/103] [ET Device Support] Define AOT device copy ops registry (#19748) clone https://github.com/pytorch/executorch/pull/18728 due to bot crash --- exir/passes/BUCK | 8 +++ exir/passes/_device_copy_ops_registry.py | 58 +++++++++++++++++++ exir/tests/TARGETS | 11 ++++ exir/tests/test_device_copy_ops.py | 73 ++++++++++++++++++++++++ 4 files changed, 150 insertions(+) create mode 100644 exir/passes/_device_copy_ops_registry.py create mode 100644 exir/tests/test_device_copy_ops.py diff --git a/exir/passes/BUCK b/exir/passes/BUCK index 954f1cfdb4f..4647388b388 100644 --- a/exir/passes/BUCK +++ b/exir/passes/BUCK @@ -381,6 +381,14 @@ fbcode_target(_kind = runtime.python_library, ], ) +fbcode_target(_kind = runtime.python_library, + name = "device_copy_ops_registry", + srcs = ["_device_copy_ops_registry.py"], + deps = [ + "//caffe2:torch", + ], +) + fbcode_target(_kind = runtime.python_library, name = "memory_format_ops_pass", srcs = [ diff --git a/exir/passes/_device_copy_ops_registry.py b/exir/passes/_device_copy_ops_registry.py new file mode 100644 index 00000000000..a62b88d4234 --- /dev/null +++ b/exir/passes/_device_copy_ops_registry.py @@ -0,0 +1,58 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Registry for device copy ops used to insert explicit H2D (host-to-device) +and D2H (device-to-host) data transfer operations at delegate boundaries. 
+ +These ops are inserted by PropagateDevicePass when enable_non_cpu_memory_planning +is True, making the graph functional by explicitly transferring data between +CPU and device memory. + +Follows the same registration pattern as dim_order_ops_registry.py. +""" + +import torch +from torch.library import impl, Library + +lib = Library("et_copy", "DEF") + +# _h2d_copy: copies a CPU tensor to device memory. +# At tracing time, this is a clone (both on CPU). At runtime, the out tensor +# is memory-planned on device, and the kernel calls +# DeviceAllocator::copy_host_to_device. +lib.define("_h2d_copy(Tensor self) -> Tensor") +lib.define("_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") + +# _d2h_copy: copies a device tensor to CPU memory. +# At tracing time, this is a clone (both on CPU). At runtime, the self tensor +# has device memory, and the kernel calls DeviceAllocator::copy_device_to_host. +lib.define("_d2h_copy(Tensor self) -> Tensor") +lib.define("_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!)") + + +@impl(lib, "_h2d_copy", "CompositeImplicitAutograd") +def _h2d_copy_impl(self: torch.Tensor) -> torch.Tensor: + # During tracing, both tensors are on CPU. Just clone to represent the transfer. + return self.clone() + + +@impl(lib, "_h2d_copy.out", "CompositeImplicitAutograd") +def _h2d_copy_out_impl(self: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor: + out.copy_(self) + return out + + +@impl(lib, "_d2h_copy", "CompositeImplicitAutograd") +def _d2h_copy_impl(self: torch.Tensor) -> torch.Tensor: + # During tracing, both tensors are on CPU. Just clone to represent the transfer. 
+ return self.clone() + + +@impl(lib, "_d2h_copy.out", "CompositeImplicitAutograd") +def _d2h_copy_out_impl(self: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor: + out.copy_(self) + return out diff --git a/exir/tests/TARGETS b/exir/tests/TARGETS index 322f72c870a..21493a69644 100644 --- a/exir/tests/TARGETS +++ b/exir/tests/TARGETS @@ -504,3 +504,14 @@ python_unittest( "//executorch/exir/passes:propagate_device_pass", ], ) + +python_unittest( + name = "device_copy_ops", + srcs = [ + "test_device_copy_ops.py", + ], + deps = [ + "//caffe2:torch", + "//executorch/exir/passes:device_copy_ops_registry", + ], +) diff --git a/exir/tests/test_device_copy_ops.py b/exir/tests/test_device_copy_ops.py new file mode 100644 index 00000000000..805159d9d81 --- /dev/null +++ b/exir/tests/test_device_copy_ops.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +# Import the registry to register the ops +import executorch.exir.passes._device_copy_ops_registry # noqa: F401 + +import torch + + +class DeviceCopyOpsRegistryTest(unittest.TestCase): + """Tests that et_copy._h2d_copy and et_copy._d2h_copy ops are correctly + registered and produce expected outputs during tracing (CPU-only).""" + + def test_h2d_copy_functional(self): + """_h2d_copy should return a clone of the input tensor.""" + x = torch.randn(2, 3) + result = torch.ops.et_copy._h2d_copy(x) + self.assertEqual(result.shape, x.shape) + self.assertEqual(result.dtype, x.dtype) + self.assertTrue(torch.equal(result, x)) + # Should be a new tensor, not the same object + self.assertFalse(result.data_ptr() == x.data_ptr()) + + def test_d2h_copy_functional(self): + """_d2h_copy should return a clone of the input tensor.""" + x = torch.randn(4, 5) + result = torch.ops.et_copy._d2h_copy(x) + self.assertEqual(result.shape, x.shape) + self.assertEqual(result.dtype, x.dtype) + self.assertTrue(torch.equal(result, x)) + self.assertFalse(result.data_ptr() == x.data_ptr()) + + def test_h2d_copy_out_variant(self): + """_h2d_copy.out should copy data into the provided out tensor.""" + x = torch.randn(3, 3) + out = torch.empty(3, 3) + result = torch.ops.et_copy._h2d_copy.out(x, out=out) + self.assertTrue(result is out) + self.assertTrue(torch.equal(out, x)) + + def test_d2h_copy_out_variant(self): + """_d2h_copy.out should copy data into the provided out tensor.""" + x = torch.randn(2, 4) + out = torch.empty(2, 4) + result = torch.ops.et_copy._d2h_copy.out(x, out=out) + self.assertTrue(result is out) + self.assertTrue(torch.equal(out, x)) + + def test_h2d_copy_preserves_dtype(self): + """_h2d_copy should work with various dtypes.""" + for dtype in [torch.float32, torch.float16, torch.int32, torch.int64]: + x = torch.ones(2, 2, dtype=dtype) + result = torch.ops.et_copy._h2d_copy(x) + self.assertEqual(result.dtype, dtype) + self.assertTrue(torch.equal(result, x)) + 
+ def test_h2d_copy_scalar_tensor(self): + """_h2d_copy should handle 0-dim tensors.""" + x = torch.tensor(3.14) + result = torch.ops.et_copy._h2d_copy(x) + self.assertEqual(result.shape, torch.Size([])) + self.assertTrue(torch.equal(result, x)) + + def test_d2h_copy_empty_tensor(self): + """_d2h_copy should handle empty tensors.""" + x = torch.empty(0, 3) + result = torch.ops.et_copy._d2h_copy(x) + self.assertEqual(result.shape, torch.Size([0, 3])) From d757776f51bc41aedac47fe51dd020474726774c Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Sat, 23 May 2026 11:50:33 -0700 Subject: [PATCH 008/103] Add extension_llm_runner to CMake deps (#19749) Differential Revision: D106162684 Pull Request resolved: https://github.com/pytorch/executorch/pull/19749 --- examples/models/parakeet/main.cpp | 9 +++++---- extension/asr/runner/CMakeLists.txt | 2 +- extension/asr/runner/transducer_runner.cpp | 16 ++++++++++++---- extension/asr/runner/transducer_runner.h | 13 +++++++++++-- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp index 249e8fd14d4..b8a052004e4 100644 --- a/examples/models/parakeet/main.cpp +++ b/examples/models/parakeet/main.cpp @@ -152,13 +152,14 @@ int main(int argc, char** argv) { ET_LOG(Error, "Preprocessing failed."); return 1; } - auto mel_features = preprocess_result.get(); + auto preprocess_out = preprocess_result.get(); // --- Transcribe --- ET_LOG(Info, "Running TDT greedy decode..."); - auto result = runner.transcribe(mel_features, [](const std::string& piece) { - std::cout << piece << std::flush; - }); + auto result = runner.transcribe( + preprocess_out.features, + [](const std::string& piece) { std::cout << piece << std::flush; }, + preprocess_out.length); if (!result.ok()) { ET_LOG(Error, "Transcription failed."); diff --git a/extension/asr/runner/CMakeLists.txt b/extension/asr/runner/CMakeLists.txt index 
66974aa2a24..b47cddaf48c 100644 --- a/extension/asr/runner/CMakeLists.txt +++ b/extension/asr/runner/CMakeLists.txt @@ -22,7 +22,7 @@ endif() include(${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake) set(runner_deps executorch_core extension_module extension_tensor - tokenizers::tokenizers + extension_llm_runner tokenizers::tokenizers ) # Define runner library diff --git a/extension/asr/runner/transducer_runner.cpp b/extension/asr/runner/transducer_runner.cpp index 3461cb09cc1..7b9298845a9 100644 --- a/extension/asr/runner/transducer_runner.cpp +++ b/extension/asr/runner/transducer_runner.cpp @@ -200,7 +200,7 @@ Error TransducerRunner::load() { return Error::Ok; } -Result<::executorch::extension::TensorPtr> TransducerRunner::preprocess( +Result TransducerRunner::preprocess( ::executorch::extension::TensorPtr raw_audio) { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -229,12 +229,18 @@ Result<::executorch::extension::TensorPtr> TransducerRunner::preprocess( "Preprocessor returned unexpected output."); auto mel = outputs[0].toTensor(); - return std::make_shared<::executorch::aten::Tensor>(std::move(mel)); + int64_t mel_len = mel.sizes()[1]; // default to tensor dim + if (outputs.size() >= 2 && outputs[1].isTensor()) { + mel_len = outputs[1].toTensor().const_data_ptr()[0]; + } + return PreprocessResult{ + std::make_shared<::executorch::aten::Tensor>(std::move(mel)), mel_len}; } Result> TransducerRunner::transcribe( ::executorch::extension::TensorPtr preprocessed_features, - std::function token_callback) { + std::function token_callback, + int64_t features_length) { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } @@ -242,7 +248,9 @@ Result> TransducerRunner::transcribe( stats_.inference_start_ms = ::executorch::extension::llm::time_in_ms(); // --- Encode --- - int64_t mel_len_value = preprocessed_features->size(1); + // Use provided length, or fall back to tensor dimension + int64_t mel_len_value = + features_length > 0 ? 
features_length : preprocessed_features->size(1); std::vector mel_len_data = {mel_len_value}; auto mel_len = ::executorch::extension::from_blob( mel_len_data.data(), {1}, ::executorch::aten::ScalarType::Long); diff --git a/extension/asr/runner/transducer_runner.h b/extension/asr/runner/transducer_runner.h index ee819590141..aed0ad84cd6 100644 --- a/extension/asr/runner/transducer_runner.h +++ b/extension/asr/runner/transducer_runner.h @@ -29,6 +29,14 @@ using ::executorch::extension::llm::Stats; using ::executorch::runtime::Error; using ::executorch::runtime::Result; +/** + * Preprocessed audio features with actual (unpadded) length. + */ +struct PreprocessResult { + ::executorch::extension::TensorPtr features; + int64_t length; // Actual number of valid frames (excluding padding) +}; + /** * A decoded token with frame-level timing information. */ @@ -97,7 +105,7 @@ class ET_EXPERIMENTAL TransducerRunner { * @returns Preprocessed features tensor (e.g., mel spectrogram), * ready to pass to transcribe(). */ - Result<::executorch::extension::TensorPtr> preprocess( + Result preprocess( ::executorch::extension::TensorPtr raw_audio); /** @@ -112,7 +120,8 @@ class ET_EXPERIMENTAL TransducerRunner { */ Result> transcribe( ::executorch::extension::TensorPtr preprocessed_features, - std::function token_callback = {}); + std::function token_callback = {}, + int64_t features_length = -1); /** * Returns a reference to the loaded tokenizer, or nullptr if not loaded. From b69cbcd6ffefe6e13fa25c4ea9285786b04692ca Mon Sep 17 00:00:00 2001 From: roman-janik-nxp Date: Sun, 24 May 2026 11:43:13 +0200 Subject: [PATCH 009/103] NXP backend: Enable Add Tensor with new Neutron flow (#19550) ### Summary Add tests verifying correct support for add.tensor by the Neutron backend using the new Neutron MLIR flow. ### Test plan Unit tests provided. 
cc @robert-kalmar --- .../ops_converters/add_tensor_converter.py | 42 ++- .../test_add_tensor_converter.py | 263 +++++++++++++++++- backends/nxp/tests/models.py | 4 +- backends/nxp/tests/ops_aliases.py | 1 + 4 files changed, 293 insertions(+), 17 deletions(-) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py index fd28b077b8a..673af19310f 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/add_tensor_converter.py @@ -3,6 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import torch + +from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, @@ -23,11 +26,33 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - if NodeConverter.uses_shape_broadcasting(node): - # Shape broadcasting may require the addition of `Transpose` ops during conversion. 
- return False + if custom_delegation_options.use_new_flow_neutron_c: + if not NodeConverter.at_least_one_input_shape_matches_the_output_shape( + node + ): + return False - return True + # If one input is in channel first and ranks of input tensors are not equal, we need to add Transposes + # Transpose is currently not supported for new flow + if any( + input_node.meta[NXP_NODE_FORMAT].is_channels_first() + for input_node in node.all_input_nodes + ) and NodeConverter._node_inputs_ranks_not_equal(node): + return False + + supported_types = [torch.int8, torch.uint8] + if not NodeConverter.uses_quantization_type_for_io( + node, supported_types, [0, 1], [0] + ): + return False + + return True + else: + if NodeConverter.uses_shape_broadcasting(node): + # Shape broadcasting may require the addition of `Transpose` ops during conversion. + return False + + return True @staticmethod def _is_supported_in_IR( @@ -43,12 +68,13 @@ def _is_supported_in_IR( return True - # add.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1) def convert(self, node: Node): - """Convert 'add_tensor' operator to TFLite 'add'.""" + """Convert 'add_tensor' operator to NeutronIR 'Add'. 
+ The ExecuTorch schema is: + add.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) + """ self.assert_convertible(node) - t_op = self._create_tflite_op_with_io_tensors(node) - t_op.builtin_options = add_options.Add() + self.builder.append_operators([t_op]) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py index 1aa58ab5d95..4a656eb9517 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_add_tensor_converter.py @@ -1,7 +1,8 @@ -# Copyright 2025 NXP +# Copyright 2025-2026 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + import numpy as np import pytest import torch @@ -9,17 +10,29 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator +from executorch.backends.nxp.tests.executorch_pipeline import ( + ModelInputSpec, + to_quantized_edge_program, +) from executorch.backends.nxp.tests.executors import ( convert_run_compare, + graph_contains_any_of_ops, ToChannelFirstPreprocess, ToChannelLastPreprocess, ) +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier from executorch.backends.nxp.tests.models import ( AddTensorConvModule, AddTensorModule, AddTensorOneInputModule, ) +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + AddTensor, + Convolution, + ExecutorchDelegateCall, +) from torch.export import ExportedProgram from executorch.backends.nxp.tests.use_qat import * # noqa F403 @@ -92,20 +105,26 @@ def test_add_tensor_one_input_quant_conversion(mocker,
input_shape, use_qat): @pytest.mark.parametrize( - "input_shape", + "x_input_shape", [ pytest.param((1, 4, 8, 8), id="4D."), pytest.param((1, 4, 5, 5), id="4D, product of dims is not a multiple of 8."), ], ) -def test_add_tensor_w_conv_quant_conversion(mocker, input_shape, use_qat): +def test_add_tensor_w_conv_quant_conversion(mocker, x_input_shape, use_qat): model = AddTensorConvModule() converter_spy = mocker.spy(EdgeProgramToIRConverter, "convert_program") + n, c, h, w = x_input_shape + y_input_shape = (n, 8, h, w) + # Run conversion _ = to_quantized_edge_program( - model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False + model, + [x_input_shape, y_input_shape], + use_qat=use_qat, + use_neutron_for_format_conversion=False, ) # Capture generated model @@ -114,7 +133,13 @@ def test_add_tensor_w_conv_quant_conversion(mocker, input_shape, use_qat): # Capture converted program exported_program: ExportedProgram = converter_spy.call_args.args[1] - input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8) + input_data_1 = (np.random.random(x_input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data_2 = (np.random.random(y_input_shape).astype(np.float32) * 50).astype( + np.int8 + ) + input_data = {0: input_data_1, 1: input_data_2} convert_run_compare( exported_program, @@ -149,7 +174,7 @@ def test_add_tensor_broadcasting_unsupported_quant_conversion( nodes = list(edge_program.graph.nodes) # Broadcast is not supported, node is not converted - assert nodes[6].target.__name__ == "aten.add.Tensor" # Add Tensor is not delegated. + assert nodes[6].target == AddTensor # Add Tensor is not delegated. 
# Capture converted program # exported_program: ExportedProgram = converter_spy.call_args.args[1] @@ -159,3 +184,227 @@ def test_add_tensor_broadcasting_unsupported_quant_conversion( # input_data = {0: x_input_data, 1: y_input_data} # # convert_run_compare(exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data) + + +class TestAddTensorNewNeutronFlow: + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1,), id="1D."), + pytest.param((6, 5), id="2D."), + pytest.param((1, 4, 7), id="3D."), + pytest.param((2, 4, 3, 15), id="4D."), + pytest.param( + (6, 82), + id="2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 68, 7), + id="3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 4, 9, 11, 4), + id="5D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__basic_nsys_inference(self, x_input_shape, mocker): + x_input_spec = ModelInputSpec(x_input_shape) + model = AddTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={} + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + [x_input_spec, x_input_spec], + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1,), id="1D."), + pytest.param((6, 5), id="2D."), + pytest.param((1, 4, 7), id="3D."), + pytest.param((2, 4, 3, 15), id="4D."), + pytest.param( + (1, 4, 9, 11, 4), + id="5D.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__basic_nsys_inference_qat(self, x_input_shape, mocker): + x_input_spec = ModelInputSpec(x_input_shape) + model = AddTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={} + 
) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + [x_input_spec, x_input_spec], + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + use_qat=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D." + ), + pytest.param( + [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))], + id="2 inputs 3D.", + ), + pytest.param( + [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D." + ), + pytest.param( + [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))], + id="2 inputs 2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__broadcast(self, input_spec, mocker): + model = AddTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={AddTensor: 1}, expected_non_delegated_ops={} + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + input_spec, + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((4, 1)), ModelInputSpec((1, 6))], id="2 inputs 2D." + ), + pytest.param( + [ModelInputSpec((1, 3, 4)), ModelInputSpec((5, 3, 1))], + id="2 inputs 3D.", + ), + pytest.param( + [ModelInputSpec((6, 4)), ModelInputSpec((6, 6, 1))], + id="2 inputs 2D + 3D.", + ), + ], + ) + def test__broadcast_unsupported(self, input_spec): + # Broadcasting where neither input shape matches the output shape is not supported + model = AddTensorModule() + + delegated_ep = to_quantized_edge_program( + model, input_spec, use_new_flow_neutron_c=True + ).exported_program() + + # Make sure the `add.Tensor` was NOT delegated.
+ assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [AddTensor]) + + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param( + (1, 4, 5, 5), id="4D, product of dims is not a multiple of 8." + ), + ], + ) + def test__w_conv(self, x_input_shape, mocker): + model = AddTensorConvModule() + + n, c, h, w = x_input_shape + y_input_spec = ModelInputSpec((n, 8, h, w)) + x_input_spec = ModelInputSpec(x_input_shape) + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops={AddTensor: 1, Convolution: 1}, + expected_non_delegated_ops={}, + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + [x_input_spec, y_input_spec], + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))], + id="2 inputs 4D + 4D.", + ), + pytest.param( + [ModelInputSpec((1, 4, 5, 67)), ModelInputSpec((1, 8, 5, 1))], + id="2 inputs 4D + 4D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__w_conv_broadcast(self, input_spec, mocker): + model = AddTensorConvModule() + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops={AddTensor: 1, Convolution: 1}, + expected_non_delegated_ops={}, + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + input_spec, + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 5))], + id="2 inputs 4D + 2D.", + ), + pytest.param( + [ModelInputSpec((1, 4, 4, 10)), ModelInputSpec((1, 4, 1))], + id="2 inputs 4D + 3D.", + ), + ], + ) + def test__w_conv_unsupported(self, input_spec): + model = AddTensorConvModule() + + 
delegated_ep = to_quantized_edge_program( + model, input_spec, use_new_flow_neutron_c=True + ).exported_program() + + # Make sure the `add.Tensor` was NOT delegated. + assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) + assert graph_contains_any_of_ops(delegated_ep.graph, [AddTensor]) diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py index 045dcfaba40..1292c4cf17d 100644 --- a/backends/nxp/tests/models.py +++ b/backends/nxp/tests/models.py @@ -656,9 +656,9 @@ def __init__(self): super().__init__() self.conv = Conv2dModule(padding=1, stride=1) - def forward(self, x): + def forward(self, x, y): x = self.conv(x) - return x + x + return x + y class AddTensorOneInputModule(torch.nn.Module): diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py index ec58072658d..9e6bedc5dba 100644 --- a/backends/nxp/tests/ops_aliases.py +++ b/backends/nxp/tests/ops_aliases.py @@ -13,6 +13,7 @@ Abs = exir_ops.edge.aten.abs.default AdaptiveAvgPool2D = exir_ops.edge.aten._adaptive_avg_pool2d.default +AddTensor = exir_ops.edge.aten.add.Tensor AvgPool2D = exir_ops.edge.aten.avg_pool2d.default Bmm = exir_ops.edge.aten.bmm.default ConstantPadND = exir_ops.edge.aten.constant_pad_nd.default From ba6074c3868abb8f602a22565445b52f8b5bdfb1 Mon Sep 17 00:00:00 2001 From: Julian Chan <128482247+julianchan-meta@users.noreply.github.com> Date: Sun, 24 May 2026 23:53:19 -0700 Subject: [PATCH 010/103] Back out "Globally serialize XNNPACK execution, add logging" (#19752) Differential Revision: D106254596 Pull Request resolved: https://github.com/pytorch/executorch/pull/19752 --- backends/xnnpack/runtime/XNNPACKBackend.cpp | 53 +-------------------- 1 file changed, 2 insertions(+), 51 deletions(-) diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index 2fe1e4d162e..c20fa985f46 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ 
b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -42,13 +41,6 @@ using executorch::runtime::FreeableBuffer; using executorch::runtime::Result; using executorch::runtime::Span; -// Global mutex for all XNNPACK operations. This is temporary, tracked by -// T272407942. -static std::mutex& global_xnnpack_mutex() { - static std::mutex m; - return m; -} - class XnnpackBackend final : public ::executorch::ET_RUNTIME_NAMESPACE::BackendInterface { public: @@ -74,8 +66,6 @@ class XnnpackBackend final BackendInitContext& context, FreeableBuffer* processed, ArrayRef compile_specs) const override { - const std::lock_guard global_lock(global_xnnpack_mutex()); - auto executor = context.get_runtime_allocator() ->allocateInstance(); if (executor == nullptr) { @@ -139,17 +129,6 @@ class XnnpackBackend final Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned int)err); return err; } - - ET_LOG( - Info, - "XnnpackBackend::init delegate=%p workspace_id=%" PRIu64 - " workspace_ptr=%p program_id=0x%" PRIxPTR " weight_cache=%s", - (void*)executor, - workspace->id(), - (void*)workspace_ptr, - program_id, - use_weight_cache ? "true" : "false"); - return executor; } @@ -157,27 +136,15 @@ class XnnpackBackend final BackendExecutionContext& context, DelegateHandle* handle, Span args) const override { - const std::lock_guard global_lock(global_xnnpack_mutex()); - auto executor = static_cast(handle); - auto workspace = executor->get_workspace(); - ET_LOG( - Info, - "XnnpackBackend::execute begin delegate=%p workspace_id=%" PRIu64 - " num_args=%zu weight_cache=%s", - (void*)executor, - workspace->id(), - (size_t)args.size(), - executor->uses_weight_cache() ? 
"true" : "false"); - std::unique_lock lock_weights_cache( weights_cache_mutex_, std::defer_lock); if (executor->uses_weight_cache()) { lock_weights_cache.lock(); } - auto [raii_lock, _] = workspace->acquire(); + auto [raii_lock, _] = executor->get_workspace()->acquire(); // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); @@ -194,29 +161,12 @@ class XnnpackBackend final // Convert output data types if necessary (e.g., int32 -> int64 for Long) err = executor->convert_outputs(args); - ET_LOG( - Info, - "XnnpackBackend::execute end delegate=%p workspace_id=%" PRIu64 - " err=0x%x", - (void*)executor, - workspace->id(), - (unsigned int)err); - return err; } void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { - const std::lock_guard global_lock(global_xnnpack_mutex()); - auto executor = static_cast(handle); - auto workspace = executor->get_workspace(); - - ET_LOG( - Info, - "XnnpackBackend::destroy delegate=%p workspace_id=%" PRIu64, - (void*)executor, - workspace->id()); #ifdef ENABLE_XNNPACK_PROFILING executor->print_avg_op_timings(); @@ -233,6 +183,7 @@ class XnnpackBackend final // the same backend instance. Make sure to hold onto the workspace // shared_ptr, as the pointer in the executor is freed, which includes // the mutex referenced by raii_lock. + auto workspace = executor->get_workspace(); auto [raii_lock, _] = workspace->acquire(); // XNNExecutor is not trivially destructible. Since this was constructed From ee4c90ad03f33398cbfa93cfed09caf04fca6099 Mon Sep 17 00:00:00 2001 From: Per Held Date: Mon, 25 May 2026 08:59:44 +0200 Subject: [PATCH 011/103] Arm backend: Exclude build metadata from license checks Treat BUCK and TARGETS files as build metadata in the Arm pre-push license check so they do not need copyright headers. 
Signed-off-by: Per Held Change-Id: I4b3bbd1e03ba4b9c38fd06225156344985f0cc70 --- backends/arm/scripts/pre-push | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/arm/scripts/pre-push b/backends/arm/scripts/pre-push index 8e26463cd94..6aa32d07286 100755 --- a/backends/arm/scripts/pre-push +++ b/backends/arm/scripts/pre-push @@ -177,7 +177,7 @@ for COMMIT in ${COMMITS}; do for committed_file in "${license_files[@]}"; do # Skip files with certain extensions case "$committed_file" in - *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl) + *.md|*.md.in|*.json|*.yml|*.yaml|*.cmake|*.patch|.gitignore|*.bzl|BUCK|*/BUCK|TARGETS|*/TARGETS) echo -e "${INFO} Skipping license check for ${committed_file} (excluded extension)" continue ;; From b73df0b4696885c6e03f3789daeece8376078364 Mon Sep 17 00:00:00 2001 From: roman-janik-nxp Date: Mon, 25 May 2026 13:49:04 +0200 Subject: [PATCH 012/103] NXP backend: Enable Sub Tensor with new Neutron flow (#19588) ### Summary Add tests verifying correct support for sub.tensor by the Neutron backend using the new Neutron MLIR flow. ### Test plan Unit tests provided. 
cc @robert-kalmar @JakeStevens @digantdesai @rascani --- .../ops_converters/sub_tensor_converter.py | 40 ++- .../test_avg_pool2d_converter.py | 9 +- .../test_max_pool_2d_converter.py | 7 +- .../test_mul_tensor_converter.py | 5 - .../test_sub_tensor_converter.py | 260 +++++++++++++++++- backends/nxp/tests/ops_aliases.py | 1 + 6 files changed, 289 insertions(+), 33 deletions(-) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py index e97f4bf63c2..79dbcbcc012 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/sub_tensor_converter.py @@ -3,6 +3,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import torch + +from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, NodeConverter, @@ -23,11 +26,33 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - if NodeConverter.uses_shape_broadcasting(node): - # Shape broadcasting may require the addition of `Transpose` ops during conversion. 
- return False + if custom_delegation_options.use_new_flow_neutron_c: + if not NodeConverter.at_least_one_input_shape_matches_the_output_shape( + node + ): + return False - return True + # If one input is in channel first and ranks of input tensors are not equal, we need to add Transposes + # Transpose is currently not supported for new flow + if any( + input_node.meta[NXP_NODE_FORMAT].is_channels_first() + for input_node in node.all_input_nodes + ) and NodeConverter._node_inputs_ranks_not_equal(node): + return False + + supported_types = [torch.int8, torch.uint8] + if not NodeConverter.uses_quantization_type_for_io( + node, supported_types, [0, 1], [0] + ): + return False + + return True + else: + if NodeConverter.uses_shape_broadcasting(node): + # Shape broadcasting may require the addition of `Transpose` ops during conversion. + return False + + return True @staticmethod def _is_supported_in_IR( @@ -45,9 +70,12 @@ def _is_supported_in_IR( return True - # sub.Tensor Node format: (Tensor self, Tensor other, *, Scalar alpha=1) def convert(self, node: Node): - """Convert 'sub_tensor' operator to NeutronIR 'Sub'.""" + """Convert 'sub_tensor' operator to NeutronIR 'Sub'. 
+ The ExecuTorch schema is: + sub.Tensor(Tensor self, Tensor other, *, Scalar alpha=1) + """ + self.assert_convertible(node) t_op = self._create_tflite_op_with_io_tensors(node) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py index 2c73ccd8092..193b7ecf9ab 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_avg_pool2d_converter.py @@ -6,6 +6,7 @@ import numpy as np import pytest import torch + from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) @@ -29,13 +30,8 @@ ToNHWCPreprocess, ) from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier -from executorch.backends.nxp.tests.model_output_comparator import ( - NumericalStatsOutputComparator, -) from executorch.backends.nxp.tests.models import AvgPool2dConvModule, AvgPool2dModule - from executorch.backends.nxp.tests.nsys_testing import lower_run_compare - from executorch.backends.nxp.tests.ops_aliases import ( AvgPool2D, ExecutorchDelegateCall, @@ -45,6 +41,7 @@ Unsqueeze, ViewCopy, ) + from torch.export import ExportedProgram from executorch.backends.nxp.tests.use_qat import * # noqa F403 @@ -320,7 +317,6 @@ def test__basic_nsys_inference(self, mocker): def test__basic_nsys_inference_qat(self, mocker): input_shape = (2, 9, 6, 15) model = AvgPool2dModule(False, 0) - comparator = NumericalStatsOutputComparator() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={} ) @@ -329,7 +325,6 @@ def test__basic_nsys_inference_qat(self, mocker): model, input_shape, graph_verifier, - output_comparator=comparator, use_new_flow_neutron_c=True, use_qat=True, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py 
b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py index 583dc2bfd04..9062d5efbfc 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_max_pool_2d_converter.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import numpy as np +import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( @@ -17,9 +18,6 @@ ToChannelLastPreprocess, ) from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier -from executorch.backends.nxp.tests.model_output_comparator import ( - NumericalStatsOutputComparator, -) from executorch.backends.nxp.tests.nsys_testing import lower_run_compare from executorch.backends.nxp.tests.ops_aliases import ( ExecutorchDelegateCall, @@ -32,7 +30,6 @@ ViewCopy, ) from executorch.backends.nxp.tests.use_qat import * # noqa F403 -import pytest class MaxPool1DModule(torch.nn.Module): @@ -286,7 +283,6 @@ def test__basic_nsys_inference(self, mocker): def test__basic_nsys_inference_qat(self, mocker): input_shape = (2, 11, 7, 16) # The old flow limited the batch size to 1. 
model = MaxPool2dModule() - comparator = NumericalStatsOutputComparator() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1}, @@ -297,7 +293,6 @@ def test__basic_nsys_inference_qat(self, mocker): model, input_shape, graph_verifier, - output_comparator=comparator, use_new_flow_neutron_c=True, use_qat=True, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py index 927af47bbf5..90113f484ad 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_mul_tensor_converter.py @@ -21,9 +21,6 @@ ToChannelLastPreprocess, ) from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier -from executorch.backends.nxp.tests.model_output_comparator import ( - NumericalStatsOutputComparator, -) from executorch.backends.nxp.tests.models import ( MulTensorConvModule, MulTensorModule, @@ -256,7 +253,6 @@ def test__basic_nsys_inference(self, x_input_shape, mocker): def test__basic_nsys_inference_qat(self, x_input_shape, mocker): x_input_spec = ModelInputSpec(x_input_shape) model = MulTensorModule() - comparator = NumericalStatsOutputComparator() graph_verifier = DetailedGraphVerifier( mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={} ) @@ -265,7 +261,6 @@ def test__basic_nsys_inference_qat(self, x_input_shape, mocker): model, [x_input_spec, x_input_spec], graph_verifier, - output_comparator=comparator, use_new_flow_neutron_c=True, use_qat=True, ) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py index 9ce3e93f39b..2734e89bc5d 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py +++ 
b/backends/nxp/tests/ir/converter/node_converter/test_sub_tensor_converter.py @@ -1,7 +1,8 @@ -# Copyright 2025 NXP +# Copyright 2025-2026 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. + import numpy as np import pytest import torch @@ -9,18 +10,29 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) -from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator +from executorch.backends.nxp.tests.executorch_pipeline import ( + ModelInputSpec, + to_quantized_edge_program, +) from executorch.backends.nxp.tests.executors import ( convert_run_compare, + graph_contains_any_of_ops, ToChannelFirstPreprocess, ToChannelLastPreprocess, ) +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier from executorch.backends.nxp.tests.models import ( SubTensorConvModule, SubTensorModule, SubTensorOneInputModule, ) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + Convolution, + ExecutorchDelegateCall, + SubTensor, +) from torch.export import ExportedProgram from executorch.backends.nxp.tests.use_qat import * # noqa F403 @@ -63,7 +75,7 @@ def test_sub_tensor_quant_conversion(mocker, input_shape, use_qat): input_data = {0: input_data_1, 1: input_data_2} nodes = list(exported_program.graph.nodes) - assert nodes[4].target == exir_ops.edge.aten.sub.Tensor + assert nodes[4].target == SubTensor convert_run_compare( exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data @@ -96,7 +108,7 @@ def test_sub_tensor_one_input_quant_conversion(mocker, input_shape, use_qat): input_data = (np.random.random(input_shape).astype(np.float32) * 50).astype(np.int8) nodes = 
list(exported_program.graph.nodes) - assert nodes[2].target == exir_ops.edge.aten.sub.Tensor + assert nodes[2].target == SubTensor convert_run_compare( exported_program, tfl_model=tflite_flatbuffers_model, input_data=input_data @@ -141,7 +153,7 @@ def test_sub_tensor_w_conv_quant_conversion(mocker, x_input_shape, use_qat): input_data = {0: input_data_1, 1: input_data_2} nodes = list(exported_program.graph.nodes) - assert nodes[15].target == exir_ops.edge.aten.sub.Tensor + assert nodes[15].target == SubTensor convert_run_compare( exported_program, @@ -176,6 +188,236 @@ def test_sub_tensor_broadcasting_unsupported_quant_conversion( nodes = list(edge_program.graph.nodes) # Broadcast is not supported, node is not converted - assert ( - nodes[6].target == exir_ops.edge.aten.sub.Tensor - ) # Sub Tensor is not delegated. + assert nodes[6].target == SubTensor # Sub Tensor is not delegated. + + +class TestSubTensorNewNeutronFlow: + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1,), id="1D."), + pytest.param((6, 5), id="2D."), + pytest.param((1, 4, 7), id="3D."), + pytest.param( + (6, 82), + id="2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 68, 7), + id="3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (2, 4, 3, 15), + id="4D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 4, 9, 11, 4), + id="5D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__basic_nsys_inference(self, x_input_shape, mocker): + x_input_spec = ModelInputSpec(x_input_shape) + model = SubTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={} + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + [x_input_spec, x_input_spec], + 
graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param((1,), id="1D."), + pytest.param((6, 5), id="2D."), + pytest.param((2, 4, 3, 15), id="4D."), + pytest.param( + (1, 4, 7), + id="3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + (1, 4, 9, 11, 4), + id="5D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__basic_nsys_inference_qat(self, x_input_shape, mocker): + x_input_spec = ModelInputSpec(x_input_shape) + model = SubTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={} + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + [x_input_spec, x_input_spec], + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + use_qat=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((4, 6)), ModelInputSpec((1, 6))], id="2 inputs 2D." + ), + pytest.param( + [ModelInputSpec((4,)), ModelInputSpec((4, 4))], id="2 inputs 1D + 2D." 
+ ), + pytest.param( + [ModelInputSpec((5, 3, 4)), ModelInputSpec((1, 3, 1))], + id="2 inputs 3D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + pytest.param( + [ModelInputSpec((69, 73)), ModelInputSpec((1, 73))], + id="2 inputs 2D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__broadcast(self, input_spec, mocker): + model = SubTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={SubTensor: 1}, expected_non_delegated_ops={} + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + input_spec, + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((4, 1)), ModelInputSpec((1, 6))], id="2 inputs 2D." + ), + pytest.param( + [ModelInputSpec((1, 3, 4)), ModelInputSpec((5, 3, 1))], + id="2 inputs 3D.", + ), + pytest.param( + [ModelInputSpec((6, 4)), ModelInputSpec((6, 6, 1))], + id="2 inputs 2D+3D.", + ), + ], + ) + def test__broadcast_unsupported(self, input_spec): + # Broadcast where at least one of the inputs is not equal to output is not supported + model = SubTensorModule() + + delegated_ep = to_quantized_edge_program( + model, input_spec, use_new_flow_neutron_c=True + ).exported_program() + + # Make sure the `sub.Tensor` was NOT delegated. + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [SubTensor]) + + @pytest.mark.parametrize( + "x_input_shape", + [ + pytest.param( + (1, 4, 5, 5), id="4D, product of dims is not a multiple of 8." 
+ ), + ], + ) + def test__w_conv(self, x_input_shape, mocker): + model = SubTensorConvModule() + + n, c, h, w = x_input_shape + y_input_spec = ModelInputSpec((n, 8, h, w)) + x_input_spec = ModelInputSpec(x_input_shape) + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops={SubTensor: 1, Convolution: 1}, + expected_non_delegated_ops={}, + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + [x_input_spec, y_input_spec], + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((1, 4, 7, 1)), ModelInputSpec((1, 8, 1, 1))], + id="2 inputs 4D + 4D.", + ), + pytest.param( + [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 8, 5, 1))], + id="2 inputs 4D + 4D incorrect.", + marks=pytest.mark.xfail(reason="AIR-14602: incorrect results"), + ), + ], + ) + def test__w_conv_broadcast(self, input_spec, mocker): + model = SubTensorConvModule() + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops={SubTensor: 1, Convolution: 1}, + expected_non_delegated_ops={}, + ) + dataset_creator = RandomDatasetCreator(low=-1.0, high=1.0) + + lower_run_compare( + model, + input_spec, + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, + ) + + @pytest.mark.parametrize( + "input_spec", + [ + pytest.param( + [ModelInputSpec((1, 4, 5, 5)), ModelInputSpec((1, 5))], + id="2 inputs 4D + 2D.", + ), + pytest.param( + [ModelInputSpec((1, 4, 4, 10)), ModelInputSpec((1, 4, 1))], + id="2 inputs 4D + 3D.", + ), + ], + ) + def test__w_conv_unsupported(self, input_spec): + model = SubTensorConvModule() + + delegated_ep = to_quantized_edge_program( + model, input_spec, use_new_flow_neutron_c=True + ).exported_program() + + # Make sure the `sub.Tensor` was NOT delegated. 
+ assert graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) + assert graph_contains_any_of_ops(delegated_ep.graph, [SubTensor]) diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py index 9e6bedc5dba..7f855dd63af 100644 --- a/backends/nxp/tests/ops_aliases.py +++ b/backends/nxp/tests/ops_aliases.py @@ -37,6 +37,7 @@ Squeeze = exir_ops.edge.aten.squeeze.default SqueezeDim = exir_ops.edge.aten.squeeze.dim SqueezeDims = exir_ops.edge.aten.squeeze.dims +SubTensor = exir_ops.edge.aten.sub.Tensor Unsqueeze = exir_ops.edge.aten.unsqueeze.default UpsampleBilinear2D = exir_ops.edge.aten.upsample_bilinear2d.vec UpsampleNearest2D = exir_ops.edge.aten.upsample_nearest2d.vec From 03e14ef8b3964deb589f3f172b4bbee7d206795a Mon Sep 17 00:00:00 2001 From: Youngsik Yang Date: Tue, 26 May 2026 01:55:50 +0900 Subject: [PATCH 013/103] Arm backend: Add bf16 support for aten.index_select and aten.unfold_copy (#19751) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #17097, which added BF16 support to the TOSA GATHER op. `aten.index_select` and `aten.unfold_copy` both lower via TOSA GATHER but their support checks were not updated at the time. In both decompositions(`DecomposeIndexSelectToGatherPass()` and `DecomposeUnfoldToGatherPass()`), the bf16 values tensor flows through dtype-agnostic reshape ops and `tosa.GATHER`, which accepts `BF16`. The support check was the only blocker. | Op | bf16 before | bf16 after | |---------------------|:-----------:|:----------:| | `aten.gather` | ✅ | ✅ | | `aten.index.Tensor` | ✅ | ✅ | | `aten.slice_copy` | ✅ | ✅ | | `aten.index_select` | ❌ | ✅ | | `aten.unfold_copy` | ❌ | ✅ | Changes: - `index_select_support.py`, `unfold_copy_support.py`: extend float branch to include `bfloat16`; add bf16 extension guard; update rejection message. 
- `test_index_select.py`, `test_unfold_copy.py`: add isolated `_tosa_FP_bf16` test functions using `TosaPipelineFP(..., tosa_extensions=["bf16"])`. ### Test plan `test_index_select_tosa_FP_bf16` and `test_unfold_copy_tosa_FP_bf16` exercise the bf16 path end-to-end through `TosaPipelineFP` with the bf16 extension enabled, following the same pattern of the existing `test_slice_tensor_tosa_FP_bf16` from #17492 --- .../operator_support/index_select_support.py | 14 ++++++-- .../operator_support/unfold_copy_support.py | 14 ++++++-- backends/arm/test/ops/test_index_select.py | 32 +++++++++++++++++++ backends/arm/test/ops/test_unfold_copy.py | 24 ++++++++++++++ 4 files changed, 78 insertions(+), 6 deletions(-) diff --git a/backends/arm/operator_support/index_select_support.py b/backends/arm/operator_support/index_select_support.py index a3188e739c7..285b2cfe79f 100644 --- a/backends/arm/operator_support/index_select_support.py +++ b/backends/arm/operator_support/index_select_support.py @@ -77,8 +77,16 @@ def is_node_tosa_supported( f"{node.target}: dtype {values_dtype} requires INT profile.", ) return False - # fp16/fp32: either FP profile, or INT profile (via quantization) - elif values_dtype in (torch.float16, torch.float32): + # fp16/fp32/bf16: either FP profile, or INT profile (via quantization) + elif values_dtype in (torch.float16, torch.float32, torch.bfloat16): + if values_dtype == torch.bfloat16 and not tosa_spec.support_extension( + "bf16" + ): + self.reporter.report_reject( + node, + f"{node.target}: dtype {values_dtype} requires bf16 extension.", + ) + return False if not (tosa_spec.support_float() or tosa_spec.support_integer()): self.reporter.report_reject( node, @@ -90,7 +98,7 @@ def is_node_tosa_supported( self.reporter.report_reject( node, f"{node.target}: unsupported values dtype {values_dtype}; " - "expected bool/int8/int16/int32/float16/float32.", + "expected bool/int8/int16/int32/float16/bfloat16/float32.", ) return False diff --git 
a/backends/arm/operator_support/unfold_copy_support.py b/backends/arm/operator_support/unfold_copy_support.py index bf6c1cad22e..ac9fc7d0ee3 100644 --- a/backends/arm/operator_support/unfold_copy_support.py +++ b/backends/arm/operator_support/unfold_copy_support.py @@ -84,8 +84,16 @@ def is_node_tosa_supported( f"{node.target}: dtype {values_dtype} requires INT profile.", ) return False - # fp16/fp32: either FP profile, or INT profile (via quantization) - elif values_dtype in (torch.float16, torch.float32): + # fp16/fp32/bf16: either FP profile, or INT profile (via quantization) + elif values_dtype in (torch.float16, torch.float32, torch.bfloat16): + if values_dtype == torch.bfloat16 and not tosa_spec.support_extension( + "bf16" + ): + self.reporter.report_reject( + node, + f"{node.target}: dtype {values_dtype} requires bf16 extension.", + ) + return False if not (tosa_spec.support_float() or tosa_spec.support_integer()): self.reporter.report_reject( node, @@ -97,7 +105,7 @@ def is_node_tosa_supported( self.reporter.report_reject( node, f"{node.target}: unsupported values dtype {values_dtype}; " - "expected bool/int8/int16/int32/float16/float32.", + "expected bool/int8/int16/int32/float16/bfloat16/float32.", ) return False diff --git a/backends/arm/test/ops/test_index_select.py b/backends/arm/test/ops/test_index_select.py index bb5f0a92c51..4de19d30daf 100644 --- a/backends/arm/test/ops/test_index_select.py +++ b/backends/arm/test/ops/test_index_select.py @@ -61,6 +61,26 @@ def forward(self, input_: torch.Tensor, dim: int, index_: torch.Tensor): torch.tensor([3, 1], dtype=torch.int32), # [W=2] ), } +test_data_fp_bf16: dict[str, input_params] = { + # Rank-2: [K, C] -> index_select dim=0 => [W, C] + "test_bf16_rank2_dim0": ( + torch.tensor( + [[0.5, 1.25, 2.5], [3.5, 4.25, 5.75], [6.5, 7.25, 8.75]], + dtype=torch.bfloat16, + ), # [K=3, C=3] + 0, + torch.tensor([2, 0], dtype=torch.int32), # [W=2] + ), + # Rank-3: [N, K, C] -> index_select dim=-1 => [N, K, W] + 
"test_bf16_rank3_dim_neg1": ( + torch.tensor( + [[[0.5, 1.5], [2.5, 3.5]], [[4.5, 5.5], [6.5, 7.5]]], + dtype=torch.bfloat16, + ), # [N=2, K=2, C=2] + -1, + torch.tensor([1, 0], dtype=torch.int32), # [W=2] + ), +} # ---- INT profile: integer inputs + bool ---- test_data_int: dict[str, input_params] = { @@ -104,6 +124,18 @@ def test_index_select_tosa_FP(test_data: input_params): pipeline.run() +@common.parametrize("test_data", test_data_fp_bf16) +def test_index_select_tosa_FP_bf16(test_data: input_params): + pipeline = TosaPipelineFP[input_params]( + IndexSelect(), + test_data, + aten_op=IndexSelect.aten_op, + exir_op=IndexSelect.exir_op, + tosa_extensions=["bf16"], + ) + pipeline.run() + + @common.parametrize("test_data", test_data_int | test_data_fp) def test_index_select_tosa_INT(test_data: input_params): # INT profile runs quantized, so we test both int inputs and float inputs here. diff --git a/backends/arm/test/ops/test_unfold_copy.py b/backends/arm/test/ops/test_unfold_copy.py index 2b502a9be10..baa4b7f64bc 100644 --- a/backends/arm/test/ops/test_unfold_copy.py +++ b/backends/arm/test/ops/test_unfold_copy.py @@ -120,6 +120,18 @@ def forward(self, input_: torch.Tensor, dim_: int, size_: int, step_: int): ), } +test_data_bf16: dict[str, input_params] = { + "test_bf16_2d_dim1": ( + torch.tensor( + [[0.1, 0.2, 0.3, 0.4, 0.5], [1.1, 1.2, 1.3, 1.4, 1.5]], + dtype=torch.bfloat16, + ), # [B=2, T=5] + 1, + 3, + 2, # U=(5-3)//2+1=2 -> [B=2, U=2, C=3] + ), +} + @common.parametrize("test_data", test_data_fp) def test_unfold_copy_tosa_FP(test_data: input_params): @@ -132,6 +144,18 @@ def test_unfold_copy_tosa_FP(test_data: input_params): pipeline.run() +@common.parametrize("test_data", test_data_bf16) +def test_unfold_copy_tosa_FP_bf16(test_data: input_params): + pipeline = TosaPipelineFP[input_params]( + UnfoldCopy(), + test_data, + aten_op=UnfoldCopy.aten_op, + exir_op=UnfoldCopy.exir_op, + tosa_extensions=["bf16"], + ) + pipeline.run() + + 
@common.parametrize("test_data", test_data_int | test_data_fp) def test_unfold_copy_tosa_INT(test_data: input_params): pipeline = TosaPipelineINT[input_params]( From b581615fa86dd2357d866064427a0b93b2ad947f Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Tue, 26 May 2026 09:50:10 +0200 Subject: [PATCH 014/103] Cortex-M backend: Add AoT scratch-buffer planning. (#19636) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is done for conv, depthwise conv, transpose conv, and bmm. Add scratch tensors to the operator signatures, which are then assigned exir.memory.alloc. These allocs are automatically memory planned by ExecuTorch. Introduce `required_cmsis_nn_buffer_sizes`, which computes the buffer sizes from node properties + the Cortex-M configuration. The function uses functions registered by target in backends/cortex_m/passes/scratch_buffer_sizes.py. This is used to set the size of the allocs in ConvertToCortexMPass. Finally, modify the kernels to use the new scratch tensor instead of allocating temporary memory. Add a new macro CORTEX_M_ENABLE_RUNTIME_CHECKS to do a safety check that the AoT-computed buffer size is equal to the buffer size computed at runtime. Use this when testing.
cc @psiddh @AdrianLundell @digantdesai @rascani @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --------- Signed-off-by: Erik Lundell Co-authored-by: Måns Nilsson --- backends/arm/scripts/build_executorch.sh | 8 + backends/cortex_m/CMakeLists.txt | 9 + .../ops/op_quantized_batch_matmul.cpp | 35 +-- backends/cortex_m/ops/op_quantized_conv2d.cpp | 34 +-- .../ops/op_quantized_depthwise_conv2d.cpp | 31 +- .../ops/op_quantized_transpose_conv2d.cpp | 44 +-- backends/cortex_m/ops/operators.py | 28 +- backends/cortex_m/ops/operators.yaml | 9 +- backends/cortex_m/passes/__init__.py | 1 + .../passes/convert_to_cortex_m_pass.py | 64 ++++- .../cortex_m/passes/scratch_buffer_sizes.py | 266 ++++++++++++++++++ backends/cortex_m/test/build_test_runner.sh | 4 +- 12 files changed, 451 insertions(+), 82 deletions(-) create mode 100644 backends/cortex_m/passes/scratch_buffer_sizes.py diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index 54d2091d1f4..5ac2674f964 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -7,6 +7,7 @@ # Optional parameter: # --build_type= "Release" | "Debug" | "RelWithDebInfo" | "UndefinedSanitizer" | "AddressSanitizer" # --etdump build with devtools-etdump support +# --cmake-args= Additional arguments passed to cmake configure set -eu @@ -24,6 +25,7 @@ build_type="Release" build_devtools=OFF build_with_etdump=OFF is_linux_musl=0 +extra_cmake_args=() target_cpu="" help() { @@ -33,6 +35,7 @@ help() { echo " --build_type= Build with Release, Debug, RelWithDebInfo, UndefinedSanitizer or AddressSanitizer, default is ${build_type}" echo " --devtools Build Devtools libs" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" + echo " --cmake-args= Additional arguments passed to cmake configure" echo " --toolchain= Toolchain can be specified (arm-none-eabi-gcc, arm-zephyr-eabi-gcc, 
aarch64-linux-musl-gcc). Default: ${toolchain}" echo " --target_cpu= Override the toolchain's default TARGET_CPU (e.g. cortex-m4). Switching target_cpu reuses the same cmake-out dir, so clear ${et_build_root}/cmake-out first to avoid stale per-CPU artifacts. Default: unset (toolchain default)." exit 0 @@ -45,6 +48,10 @@ for arg in "$@"; do --build_type=*) build_type="${arg#*=}";; --devtools) build_devtools=ON ;; --etdump) build_with_etdump=ON ;; + --cmake-args=*) + # shellcheck disable=SC2206 + extra_cmake_args=(${arg#*=}) + ;; --toolchain=*) toolchain="${arg#*=}";; --target_cpu=*) target_cpu="${arg#*=}";; *) @@ -89,6 +96,7 @@ cmake_args=( -DEXECUTORCH_BUILD_DEVTOOLS=${build_devtools} -DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump} -DEXECUTORCH_BAREMETAL_SKIP_INSTALL=OFF + "${extra_cmake_args[@]}" ) if [[ -n "${target_cpu}" ]]; then diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt index 876c65982e6..627406c1935 100644 --- a/backends/cortex_m/CMakeLists.txt +++ b/backends/cortex_m/CMakeLists.txt @@ -30,6 +30,10 @@ set(CMSIS_NN_LOCAL_PATH "" CACHE PATH "Path to existing local CMSIS-NN installation" ) +option(CORTEX_M_ENABLE_RUNTIME_CHECKS + "Enable additional Cortex-M runtime assertions and validation checks" + OFF +) # Try to find existing / local CMSIS-NN installation. This is useful for # debugging and testing with local changes. 
This is not common, as the CMSIS-NN @@ -107,6 +111,11 @@ target_link_libraries( PRIVATE executorch PRIVATE kernels_util_all_deps ) +target_compile_definitions( + cortex_m_kernels + PRIVATE + $<$:CORTEX_M_ENABLE_RUNTIME_CHECKS> +) # Include directories for cortex_m_kernels target_include_directories( diff --git a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp index e6bc5a949ce..345753ca8fc 100644 --- a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp +++ b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp @@ -1,6 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. @@ -71,6 +72,7 @@ Tensor& quantized_batch_matmul_out( int64_t output_offset, int64_t output_multiplier, int64_t output_shift, + const Tensor& scratch, Tensor& out) { if (!validate_batch_matmul_arguments(context, lhs, rhs_transposed, out)) { return out; @@ -100,25 +102,26 @@ Tensor& quantized_batch_matmul_out( quant_params.multiplier = static_cast(output_multiplier); quant_params.shift = static_cast(output_shift); - const int32_t buf_size = arm_fully_connected_s8_get_buffer_size(&out_dims); - cmsis_nn_context ctx; ctx.buf = nullptr; - ctx.size = 0; - - if (buf_size > 0) { - auto buffer_or_error = context.allocate_temp(buf_size); - if (!buffer_or_error.ok()) { - ET_LOG( - Error, - "quantized_batch_matmul: failed to allocate scratch buffer (%d bytes)", - buf_size); - context.fail(buffer_or_error.error()); - return out; - } - ctx.buf = buffer_or_error.get(); - ctx.size = buf_size; + ctx.size = scratch.nbytes(); + if (ctx.size > 0) { + ctx.buf = scratch.mutable_data_ptr(); + } + +#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS + const int32_t runtime_buffer_bytes = + arm_fully_connected_s8_get_buffer_size(&out_dims); + if (ctx.size != 
static_cast(runtime_buffer_bytes)) { + ET_LOG( + Error, + "quantized_batch_matmul: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(ctx.size), + runtime_buffer_bytes); + context.fail(Error::Internal); + return out; } +#endif const arm_cmsis_nn_status status = arm_batch_matmul_s8( &ctx, diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index 7d4433690f6..8af374c03f8 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -112,6 +112,7 @@ Tensor& quantized_conv2d_out( const Tensor& requantize_shifts, const int64_t activation_min, const int64_t activation_max, + const Tensor& scratch, Tensor& out) { if (!validate_conv2d_arguments( context, @@ -182,31 +183,30 @@ Tensor& quantized_conv2d_out( cmsis_nn_context cmsis_context; cmsis_context.buf = nullptr; - cmsis_context.size = 0; + cmsis_context.size = scratch.nbytes(); + if (cmsis_context.size > 0) { + cmsis_context.buf = scratch.mutable_data_ptr(); + } - const int32_t buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size( +#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS + const int32_t runtime_buffer_bytes = arm_convolve_wrapper_s8_get_buffer_size( &conv_params, &input_dims, &filter_dims, &output_dims); - if (buffer_bytes < 0) { + if (runtime_buffer_bytes < 0) { ET_LOG( Error, "quantized_conv2d_out: CMSIS-NN buffer size calculation failed"); context.fail(Error::Internal); return out; } - if (buffer_bytes > 0) { - auto buffer_or_error = - context.allocate_temp(buffer_bytes, kCortexMMveAlignment); - if (!buffer_or_error.ok()) { - ET_LOG( - Error, - "quantized_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - static_cast(buffer_bytes), - static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); - return out; - } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; + if (scratch.nbytes() != static_cast(runtime_buffer_bytes)) 
{ + ET_LOG( + Error, + "quantized_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(runtime_buffer_bytes)); + context.fail(Error::Internal); + return out; } +#endif const arm_cmsis_nn_status status = arm_convolve_wrapper_s8( &cmsis_context, diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index 8dec61e0af1..21d4f257501 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -150,6 +150,7 @@ Tensor& quantized_depthwise_conv2d_out( const Tensor& requantize_shifts, const int64_t activation_min, const int64_t activation_max, + const Tensor& scratch, Tensor& out) { if (!validate_depthwise_conv2d_arguments( context, @@ -220,32 +221,32 @@ Tensor& quantized_depthwise_conv2d_out( cmsis_nn_context cmsis_context; cmsis_context.buf = nullptr; - cmsis_context.size = 0; + cmsis_context.size = scratch.nbytes(); + if (cmsis_context.size > 0) { + cmsis_context.buf = scratch.mutable_data_ptr(); + } - const int32_t buffer_bytes = arm_depthwise_conv_wrapper_s8_get_buffer_size( - &dw_conv_params, &input_dims, &filter_dims, &output_dims); - if (buffer_bytes < 0) { +#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS + const int32_t runtime_buffer_bytes = + arm_depthwise_conv_wrapper_s8_get_buffer_size( + &dw_conv_params, &input_dims, &filter_dims, &output_dims); + if (runtime_buffer_bytes < 0) { ET_LOG( Error, "quantized_depthwise_conv2d_out: CMSIS-NN buffer size calculation failed"); context.fail(Error::Internal); return out; } - - auto buffer_or_error = context.allocate_temp( - static_cast(buffer_bytes), kCortexMMveAlignment); - if (!buffer_or_error.ok()) { + if (scratch.nbytes() != static_cast(runtime_buffer_bytes)) { ET_LOG( Error, - "quantized_depthwise_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - static_cast(buffer_bytes), - 
static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); + "quantized_depthwise_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + static_cast(runtime_buffer_bytes)); + context.fail(Error::Internal); return out; } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; - +#endif const arm_cmsis_nn_status status = arm_depthwise_conv_wrapper_s8( &cmsis_context, &dw_conv_params, diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp index e3f6135c7b9..d2b66b18802 100644 --- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp @@ -1,6 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. + * Copyright 2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
@@ -97,6 +98,8 @@ Tensor& quantized_transpose_conv2d_out( const Tensor& requantize_shifts, const int64_t activation_min, const int64_t activation_max, + const Tensor& scratch, + const Tensor& output_scratch, Tensor& out) { if (!validate_transpose_conv2d_arguments( context, @@ -179,44 +182,43 @@ Tensor& quantized_transpose_conv2d_out( cmsis_nn_context cmsis_context; cmsis_context.buf = nullptr; - cmsis_context.size = 0; + cmsis_context.size = scratch.nbytes(); + if (cmsis_context.size > 0) { + cmsis_context.buf = scratch.mutable_data_ptr(); + } cmsis_nn_context output_context; output_context.buf = nullptr; - output_context.size = 0; - + output_context.size = output_scratch.nbytes(); + if (output_context.size > 0) { + output_context.buf = output_scratch.mutable_data_ptr(); + } +#ifdef CORTEX_M_ENABLE_RUNTIME_CHECKS const int32_t buffer_bytes = arm_transpose_conv_s8_get_buffer_size( &transpose_conv_params, &input_dims, &filter_dims, &output_dims); - auto buffer_or_error = context.allocate_temp( - static_cast(buffer_bytes), kCortexMMveAlignment); - if (!buffer_or_error.ok()) { + if (scratch.nbytes() != static_cast(buffer_bytes)) { ET_LOG( Error, - "quantized_transpose_conv2d_out: failed to allocate scratch buffer (%d bytes, error %d)", - buffer_bytes, - static_cast(buffer_or_error.error())); - context.fail(buffer_or_error.error()); + "quantized_transpose_conv2d_out: scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(scratch.nbytes()), + buffer_bytes); + context.fail(Error::Internal); return out; } - cmsis_context.buf = buffer_or_error.get(); - cmsis_context.size = buffer_bytes; const int32_t output_buffer_bytes = arm_transpose_conv_s8_get_reverse_conv_buffer_size( &transpose_conv_params, &input_dims, &filter_dims); - auto output_buffer_or_error = context.allocate_temp( - static_cast(output_buffer_bytes), kCortexMMveAlignment); - if (!output_buffer_or_error.ok()) { + if (output_scratch.nbytes() != static_cast(output_buffer_bytes)) { ET_LOG( Error, 
- "quantized_transpose_conv2d_out: failed to allocate output scratch buffer (%d bytes, error %d)", - output_buffer_bytes, - static_cast(output_buffer_or_error.error())); - context.fail(output_buffer_or_error.error()); + "quantized_transpose_conv2d_out: output scratch buffer size incorrect - actual: (%d) needed: (%d)", + static_cast(output_scratch.nbytes()), + output_buffer_bytes); + context.fail(Error::Internal); return out; } - output_context.buf = output_buffer_or_error.get(); - output_context.size = output_buffer_bytes; +#endif const arm_cmsis_nn_status status = arm_transpose_conv_wrapper_s8( &cmsis_context, diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py index 2c35ed8730b..d4393bc7ada 100644 --- a/backends/cortex_m/ops/operators.py +++ b/backends/cortex_m/ops/operators.py @@ -271,13 +271,15 @@ def quantized_mul_impl( "quantized_batch_matmul(" "Tensor lhs, int lhs_zero_point, " "Tensor rhs_transposed, int rhs_zero_point, " - "int output_zero_point, int output_multiplier, int output_shift) -> Tensor" + "int output_zero_point, int output_multiplier, int output_shift, " + "Tensor scratch) -> Tensor" ) lib.define( "quantized_batch_matmul.out(" "Tensor lhs, int lhs_zero_point, " "Tensor rhs_transposed, int rhs_zero_point, " "int output_zero_point, int output_multiplier, int output_shift, " + "Tensor scratch, " "*, Tensor(a!) 
out) -> Tensor(a!)" ) @@ -291,6 +293,7 @@ def quantized_batch_matmul_meta( output_zero_point: int, output_multiplier: int, output_shift: int, + scratch: torch.Tensor, ) -> torch.Tensor: batch, lhs_rows, inner = lhs.shape batch_rhs, rhs_cols, inner_rhs = rhs_transposed.shape @@ -307,6 +310,7 @@ def quantized_batch_matmul_impl( output_zero_point: int, output_multiplier: int, output_shift: int, + scratch: torch.Tensor, ) -> torch.Tensor: # Offsets are negated zero points (CMSIS-NN convention) lhs_fp = lhs.to(torch.float32) + float(lhs_zero_point) @@ -638,7 +642,8 @@ def pad_impl( "Tensor requantize_multipliers, " "Tensor requantize_shifts, " "int activation_min, " - "int activation_max" + "int activation_max, " + "Tensor scratch" ") -> Tensor" ) @@ -657,6 +662,7 @@ def pad_impl( "Tensor requantize_shifts, " "int activation_min, " "int activation_max, " + "Tensor scratch, " "*, Tensor(a!) out" ") -> Tensor(a!)" ) @@ -733,6 +739,7 @@ def quantized_conv2d_meta( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: stride_vals = list(stride) padding_vals = list(padding) @@ -762,6 +769,7 @@ def quantized_conv2d_impl( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: if input.dim() != 4 or weight.dim() != 4: raise RuntimeError("quantized_conv2d expects 4D input and weight tensors") @@ -830,7 +838,8 @@ def quantized_conv2d_impl( "Tensor requantize_multipliers, " "Tensor requantize_shifts, " "int activation_min, " - "int activation_max" + "int activation_max, " + "Tensor scratch" ") -> Tensor" ) @@ -850,6 +859,7 @@ def quantized_conv2d_impl( "Tensor requantize_shifts, " "int activation_min, " "int activation_max, " + "Tensor scratch, " "*, Tensor(a!) 
out" ") -> Tensor(a!)" ) @@ -870,6 +880,7 @@ def quantized_depthwise_conv2d_meta( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: stride_vals = list(stride) padding_vals = list(padding) @@ -900,6 +911,7 @@ def quantized_depthwise_conv2d_impl( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, ) -> torch.Tensor: if input.dim() != 4 or weight.dim() != 4: raise RuntimeError( @@ -973,7 +985,9 @@ def quantized_depthwise_conv2d_impl( "Tensor requantize_multipliers, " "Tensor requantize_shifts, " "int activation_min, " - "int activation_max" + "int activation_max, " + "Tensor scratch, " + "Tensor output_scratch" ") -> Tensor" ) @@ -992,6 +1006,8 @@ def quantized_depthwise_conv2d_impl( "Tensor requantize_shifts, " "int activation_min, " "int activation_max, " + "Tensor scratch, " + "Tensor output_scratch, " "*, Tensor(a!) out) -> Tensor(a!)" ) @@ -1057,6 +1073,8 @@ def quantized_transpose_conv2d_meta( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, + output_scratch: torch.Tensor, ) -> torch.Tensor: stride_vals = list(stride) padding_vals = list(padding) @@ -1095,6 +1113,8 @@ def quantized_transpose_conv2d_impl( requantize_shifts: torch.Tensor, activation_min: int, activation_max: int, + scratch: torch.Tensor, + output_scratch: torch.Tensor, ) -> torch.Tensor: """ Reference implementation of quantized transposed convolution. diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml index e0ebbfab868..8db109dea43 100644 --- a/backends/cortex_m/ops/operators.yaml +++ b/backends/cortex_m/ops/operators.yaml @@ -65,19 +65,20 @@ - arg_meta: null kernel_name: cortex_m::pad_out -- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: cortex_m::quantized_conv2d_out -- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) + +- func: cortex_m::quantized_depthwise_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int depth_multiplier, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null kernel_name: cortex_m::quantized_depthwise_conv2d_out -- func: cortex_m::quantized_transpose_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_transpose_conv2d.out(Tensor input, Tensor weight, Tensor? 
bias, int[] stride, int[] padding, int[] output_padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, Tensor scratch, Tensor output_scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null @@ -94,7 +95,7 @@ - arg_meta: null kernel_name: cortex_m::quantized_max_pool2d_out -- func: cortex_m::quantized_batch_matmul.out(Tensor lhs, int lhs_zero_point, Tensor rhs_transposed, int rhs_zero_point, int output_zero_point, int output_multiplier, int output_shift, *, Tensor(a!) out) -> Tensor(a!) +- func: cortex_m::quantized_batch_matmul.out(Tensor lhs, int lhs_zero_point, Tensor rhs_transposed, int rhs_zero_point, int output_zero_point, int output_multiplier, int output_shift, Tensor scratch, *, Tensor(a!) out) -> Tensor(a!) variants: function kernels: - arg_meta: null diff --git a/backends/cortex_m/passes/__init__.py b/backends/cortex_m/passes/__init__.py index 92179ec6654..c379461949f 100644 --- a/backends/cortex_m/passes/__init__.py +++ b/backends/cortex_m/passes/__init__.py @@ -33,6 +33,7 @@ def _ensure_cortex_m_dependencies() -> None: _ensure_cortex_m_dependencies() +from .cortex_m_pass import CortexMPass # noqa # usort: skip from .activation_fusion_pass import ActivationFusionPass # noqa from .clamp_hardswish_pass import ClampHardswishPass # noqa from .convert_to_cortex_m_pass import ConvertToCortexMPass # noqa diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index 418f6cd63ff..e61ddaf63bc 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -6,25 +6,32 @@ # LICENSE file in the root directory of this source tree. 
import executorch.backends.cortex_m.ops.operators # noqa +import executorch.exir as exir import torch import torch.fx from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor + +from executorch.backends.cortex_m.passes import CortexMPass from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot +from executorch.backends.cortex_m.passes.scratch_buffer_sizes import ( + required_cmsis_nn_buffer_sizes, +) from executorch.backends.transforms.utils import ( create_constant_placeholder, get_param_tensor, is_param_node, ) - -from executorch.backends.xnnpack._passes.xnnpack_pass import XNNPACKPass from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.passes import make_alloc_node +from torch._subclasses.fake_tensor import FakeTensorMode + from torch.export.graph_signature import InputKind from torch.fx.passes.infra.pass_manager import PassResult -class ConvertToCortexMPass(XNNPACKPass): +class ConvertToCortexMPass(CortexMPass): """ Cortex-M backend pass for replacing supported quantized kernels with Cortex-M accelerated kernels. @@ -33,6 +40,15 @@ class ConvertToCortexMPass(XNNPACKPass): by call_operator. 
""" + def _create_uninitialized_alloc_node(self): + """Create an unitialized alloc node to be initialize at a later point.""" + with FakeTensorMode() as mode: + return make_alloc_node( + self.exported_program.graph_module, + mode.from_tensor(torch.empty(0)), + None, + ) + def _compute_kernel_sum(self, weights, bias, input_offset, weight_offset): """ Computes the precomputed kernel sum term (bias optional) @@ -238,6 +254,9 @@ def _get_convolution_replacement(self, node): torch.tensor(quantized_shifts, dtype=torch.int32), ) + with node.graph.inserting_before(node): + scratch = self._create_uninitialized_alloc_node() + if use_depthwise_conv: # Compute depth_multiplier for depthwise convolution # For depthwise: output_channels = input_channels * depth_multiplier @@ -263,6 +282,7 @@ def _get_convolution_replacement(self, node): quantized_shift_tensor, output_qmin, output_qmax, + scratch, ) return exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default, new_args else: @@ -280,9 +300,36 @@ def _get_convolution_replacement(self, node): quantized_shift_tensor, output_qmin, output_qmax, + scratch, ) return exir_ops.edge.cortex_m.quantized_conv2d.default, new_args + def _initialize_alloc_node_size(self, node: torch.fx.Node) -> None: + """For nodes with a registered buffer size function for node.target, set the buffer sizes + of the last n args, which should be exir.memory.alloc nodes. For nodes without a + registered function, do nothing. + """ + + scratch_buffer_sizes = required_cmsis_nn_buffer_sizes( + node, self.target_config.backend + ) + if scratch_buffer_sizes is None: + return + + # Assume that scratch_buffer_sizes are given from left to right in the call signature of node.target. 
+ for i, scratch_buffer_size in enumerate(reversed(scratch_buffer_sizes)): + scratch_arg = node.args[-(i + 1)] + if ( + not isinstance(scratch_arg, torch.fx.Node) + or scratch_arg.target != exir.memory.alloc + ): + raise RuntimeError( + f"Expected scratch alloc node as final argument(s) for {node.target}, got {scratch_arg}." + ) + + # buffer size is given in bytes, always use uint8 as dtype. + scratch_arg.args = (((scratch_buffer_size,), torch.uint8),) + def _get_transpose_conv2d_replacement(self, node): """ Transform aten.convolution with transposed=True to cortex_m.quantized_transpose_conv2d @@ -363,6 +410,10 @@ def _get_transpose_conv2d_replacement(self, node): torch.tensor(quantized_shifts, dtype=torch.int32), ) + with node.graph.inserting_before(node): + scratch = self._create_uninitialized_alloc_node() + output_scratch = self._create_uninitialized_alloc_node() + new_args = ( x, weight_nhwc, @@ -377,6 +428,8 @@ def _get_transpose_conv2d_replacement(self, node): quantized_shift_tensor, output_qmin, output_qmax, + scratch, + output_scratch, ) return exir_ops.edge.cortex_m.quantized_transpose_conv2d.default, new_args @@ -415,6 +468,9 @@ def _get_bmm_replacement(self, node): args=(rhs_node, [0, 2, 1]), ) + with node.graph.inserting_before(node): + scratch = self._create_uninitialized_alloc_node() + args = ( lhs_node, -lhs_zp, @@ -423,6 +479,7 @@ def _get_bmm_replacement(self, node): output_zp, output_mult, output_shift, + scratch, ) return exir_ops.edge.cortex_m.quantized_batch_matmul.default, args @@ -459,6 +516,7 @@ def call(self, graph_module: torch.fx.GraphModule) -> PassResult: args=args, kwargs={}, ) + self._initialize_alloc_node_size(cortex_m_op) node.replace_all_uses_with(cortex_m_op) graph_module.graph.erase_node(node) diff --git a/backends/cortex_m/passes/scratch_buffer_sizes.py b/backends/cortex_m/passes/scratch_buffer_sizes.py new file mode 100644 index 00000000000..36f3f8bbc17 --- /dev/null +++ b/backends/cortex_m/passes/scratch_buffer_sizes.py @@ 
-0,0 +1,266 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from collections.abc import Callable +from typing import Any, cast + +import cmsis_nn # type: ignore[import-not-found, import-untyped] +import executorch.backends.cortex_m.ops.operators # noqa + +import torch +import torch.fx + +from executorch.exir.dialects._ops import ops as exir_ops + +BufferSizeFunction = Callable[[cmsis_nn.Backend, torch.fx.Node], list[int]] + + +def _tensor_from_node(node: torch.fx.Node) -> torch.Tensor: + if "val" in node.meta: + return node.meta["val"] + elif node.op == "call_function": + args = ( + _tensor_from_node(arg) if isinstance(arg, torch.fx.Node) else arg + for arg in node.args + ) + return node.target(*args, **node.kwargs) # type: ignore[operator] + else: + raise RuntimeError("Encountered non-call_function without 'val' meta.") + + +def _shape_from_node(node: torch.fx.Node) -> torch.Size: + return _tensor_from_node(node).shape + + +def _get_common_conv_buffer_size_inputs( + conv_node: torch.fx.Node, + *, + stride_arg_idx: int = 3, + padding_arg_idx: int = 4, + dilation_arg_idx: int = 5, +) -> tuple[ + list[int], + list[int], + list[int], + list[int], + list[int], + list[int], +]: + x = cast(torch.fx.Node, conv_node.args[0]) + weight = cast(torch.fx.Node, conv_node.args[1]) + stride = cast(list[int], conv_node.args[stride_arg_idx]) + padding = cast(list[int], conv_node.args[padding_arg_idx]) + dilation = cast(list[int], conv_node.args[dilation_arg_idx]) + + # Input is NCHW (PyTorch); CMSIS-NN wants NHWC dims. + n, c_in, height, width = _shape_from_node(x) + + weight_shape = _shape_from_node(weight) + + # Output is NCHW; convert to NHWC dims. 
+ out_n, out_c, out_h, out_w = _shape_from_node(conv_node) + + input_nhwc = [n, height, width, c_in] + output_nhwc = [out_n, out_h, out_w, out_c] + stride_hw = [int(stride[0]), int(stride[1])] + padding_hw = [int(padding[0]), int(padding[1])] + dilation_hw = [int(dilation[0]), int(dilation[1])] + + return ( + input_nhwc, + list(weight_shape), + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) + + +def cmsis_nn_conv_buffer_size( + backend: cmsis_nn.Backend, + conv_node: torch.fx.Node, +) -> list[int]: + ( + input_nhwc, + weight_shape, + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) = _get_common_conv_buffer_size_inputs(conv_node=conv_node) + input_offset = cast(int, conv_node.args[6]) + output_offset = cast(int, conv_node.args[7]) + output_qmin = cast(int, conv_node.args[10]) + output_qmax = cast(int, conv_node.args[11]) + + # Weight is in OHWI layout after conversion. + c_out, kernel_h, kernel_w, c_in = weight_shape + filter_nhwc = [c_out, kernel_h, kernel_w, c_in] + + return [ + int( + cmsis_nn.convolve_wrapper_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ) + ] + + +def cmsis_nn_depthwise_conv_buffer_size( + backend: cmsis_nn.Backend, + conv_node: torch.fx.Node, +) -> list[int]: + ( + input_nhwc, + weight_shape, + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) = _get_common_conv_buffer_size_inputs(conv_node=conv_node) + depth_multiplier = cast(int, conv_node.args[6]) + input_offset = cast(int, conv_node.args[7]) + output_offset = cast(int, conv_node.args[8]) + output_qmin = cast(int, conv_node.args[11]) + output_qmax = cast(int, conv_node.args[12]) + + # Weight is in IHWO layout after conversion. 
+ _, kernel_h, kernel_w, c_out = weight_shape + filter_nhwc = [c_out, kernel_h, kernel_w, 1] + + return [ + int( + cmsis_nn.depthwise_conv_wrapper_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + ch_mult=depth_multiplier, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ) + ] + + +def cmsis_nn_batch_matmul_buffer_size( + backend: cmsis_nn.Backend, + matmul_node: torch.fx.Node, +) -> list[int]: + rhs_transposed = cast(torch.fx.Node, matmul_node.args[2]) + rhs_shape = _shape_from_node(rhs_transposed) + + _, rhs_cols, inner = rhs_shape + + return [ + int( + cmsis_nn.fully_connected_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + filter_nhwc=[inner, -1, -1, rhs_cols], # H and W values are unused. + ) + ) + ] + + +def cmsis_nn_transpose_conv_buffer_size( + backend: cmsis_nn.Backend, + conv_node: torch.fx.Node, +) -> list[int]: + ( + input_nhwc, + weight_shape, + output_nhwc, + stride_hw, + padding_hw, + dilation_hw, + ) = _get_common_conv_buffer_size_inputs( + conv_node=conv_node, + stride_arg_idx=3, + padding_arg_idx=4, + dilation_arg_idx=6, + ) + output_padding = cast(list[int], conv_node.args[5]) + input_offset = cast(int, conv_node.args[7]) + output_offset = cast(int, conv_node.args[8]) + output_qmin = cast(int, conv_node.args[11]) + output_qmax = cast(int, conv_node.args[12]) + c_out, kernel_h, kernel_w, kernel_c_in = weight_shape + filter_nhwc = [c_out, kernel_h, kernel_w, kernel_c_in] + padding_offsets_hw = [int(output_padding[0]), int(output_padding[1])] + + return [ + int( + cmsis_nn.transpose_conv_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + output_nhwc=output_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + 
padding_offsets_hw=padding_offsets_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ), + int( + cmsis_nn.transpose_conv_reverse_conv_buffer_size( + backend, + cmsis_nn.DataType.A8W8, + input_nhwc=input_nhwc, + filter_nhwc=filter_nhwc, + padding_hw=padding_hw, + stride_hw=stride_hw, + dilation_hw=dilation_hw, + padding_offsets_hw=padding_offsets_hw, + input_offset=input_offset, + output_offset=output_offset, + activation_min=output_qmin, + activation_max=output_qmax, + ) + ), + ] + + +_target_to_buffer_sizes_registry: dict[Any, BufferSizeFunction] = { + exir_ops.edge.cortex_m.quantized_conv2d.default: cmsis_nn_conv_buffer_size, + exir_ops.edge.cortex_m.quantized_depthwise_conv2d.default: cmsis_nn_depthwise_conv_buffer_size, + exir_ops.edge.cortex_m.quantized_batch_matmul.default: cmsis_nn_batch_matmul_buffer_size, + exir_ops.edge.cortex_m.quantized_transpose_conv2d.default: cmsis_nn_transpose_conv_buffer_size, +} + + +def required_cmsis_nn_buffer_sizes( + node: torch.fx.Node, backend: cmsis_nn.Backend +) -> list[int] | None: + """Returns a sequence of scratch buffer sizes required by node, in bytes. + If no function is registered to compute this for the target of the node, return None. 
+ """ + if node.target not in _target_to_buffer_sizes_registry: + return None + + buffer_size_function = _target_to_buffer_sizes_registry[node.target] + return buffer_size_function(backend, node) diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh index bdca1a21e7c..a67c5a907a4 100755 --- a/backends/cortex_m/test/build_test_runner.sh +++ b/backends/cortex_m/test/build_test_runner.sh @@ -28,7 +28,7 @@ fi script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")") et_root_dir=$(realpath "${script_dir}/../../..") build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh" -${build_executorch} --devtools --target_cpu="${target_cpu}" +${build_executorch} --devtools --target_cpu="${target_cpu}" --cmake-args="-DCORTEX_M_ENABLE_RUNTIME_CHECKS=ON" # Build executor runner with selected aten ops and semi hosting build_dir="${et_root_dir}/arm_test" @@ -48,4 +48,4 @@ aten::unsqueeze_copy.out,\ aten::select_copy.int_out,\ aten::amax.out" -${build_executor_runner} --pte=semihosting --bundleio --target="${target}" --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0" +${build_executor_runner} --pte=semihosting --bundleio --target="${target}" --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0 -DET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=0" From 5fc929fa88e3b76c7ef26a482c896b344054ef48 Mon Sep 17 00:00:00 2001 From: qti-chenweng <168707118+chenweng-quic@users.noreply.github.com> Date: Tue, 26 May 2026 16:55:09 +0800 Subject: [PATCH 015/103] Qualcomm AI Engine Direct - Refactor llama runner for dynamic IO dtypes (#19146) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary To enable GPU backend support in the Llama runner, refactoring is required because the dtypes of kv_cache, attention_mask, and logits are currently hardcoded, 
preventing floating‑point models from running. This PR focuses on removing the hardcode dtype for them. #### Key changes - Remove template parameter from KVManager, LhdTokenGenerator, MultimodalPromptProcessor, and related runner classes - Detect kv_cache and attention_mask dtypes dynamically from MethodMeta at construction time instead of compile-time bitwidth detection - Switch to std::byte* pointer arithmetic with getDtypeSize() for all buffer offsets; add fill_mask() helper for multi-dtype attention mask filling - Update spec_prop pass for custom llama op for sharding case greater than 1 ### Test plan ``` python backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder /local/mnt/workspace/chenweng/executorch/executorch/build-android --device acfa9311 --executorch_root . --artifact_dir ./stories_110m_pte_size --llama_artifacts . --use_fp16 ``` image cc @cccclai @cbilgin @abhinaykukkadapu --- backends/qualcomm/_passes/build_quant_io.py | 48 +-- backends/qualcomm/tests/test_qnn_delegate.py | 18 +- backends/qualcomm/tests/utils.py | 1 + .../stories260k_hybrid_llama_qnn.pte | Bin 1355520 -> 1350272 bytes .../llama/decoder_runtime_evaluator.py | 2 +- .../oss_scripts/llama/decoder_utils.py | 6 +- examples/qualcomm/oss_scripts/llama/llama.py | 70 +++- .../oss_scripts/llama/qnn_llama_runner.cpp | 25 +- .../llama/qnn_multimodal_runner.cpp | 38 +- .../oss_scripts/llama/runner/decoder_runner.h | 28 +- .../oss_scripts/llama/runner/kv_manager.cpp | 366 +++++++++++------- .../oss_scripts/llama/runner/kv_manager.h | 43 +- .../llama/runner/lhd_token_generator.cpp | 29 +- .../llama/runner/lhd_token_generator.h | 18 +- .../multimodal_lhd_token_generator.cpp | 26 +- .../multimodal_lhd_token_generator.h | 18 +- .../multimodal_prompt_processor.cpp | 53 ++- .../multimodal_prompt_processor.h | 51 ++- .../multimodal_runner/multimodal_runner.cpp | 73 ++-- .../multimodal_runner/multimodal_runner.h | 12 +- 
.../multimodal_token_generator.cpp | 50 +-- .../multimodal_token_generator.h | 43 +- .../llama/runner/prompt_processor.cpp | 84 ++-- .../llama/runner/prompt_processor.h | 30 +- .../oss_scripts/llama/runner/runner.cpp | 71 ++-- .../oss_scripts/llama/runner/runner.h | 13 +- .../llama/runner/token_generator.cpp | 80 ++-- .../llama/runner/token_generator.h | 30 +- .../qualcomm/oss_scripts/llama/runner/utils.h | 41 ++ .../llama/wrappers/attention_sink_wrappers.py | 2 + .../llama/wrappers/llm_wrappers.py | 46 ++- exir/passes/spec_prop_pass.py | 15 +- extension/android/jni/jni_layer_llama.cpp | 43 +- extension/llm/custom_ops/model_sharding.py | 24 +- extension/llm/custom_ops/op_fallback.py | 29 ++ 35 files changed, 820 insertions(+), 706 deletions(-) create mode 100644 extension/llm/custom_ops/op_fallback.py diff --git a/backends/qualcomm/_passes/build_quant_io.py b/backends/qualcomm/_passes/build_quant_io.py index d43842e84a5..057dcc0f864 100644 --- a/backends/qualcomm/_passes/build_quant_io.py +++ b/backends/qualcomm/_passes/build_quant_io.py @@ -5,11 +5,10 @@ # LICENSE file in the root directory of this source tree. 
import torch from executorch.backends.qualcomm.utils.constants import QCOM_QUANTIZED_IO -from executorch.exir.delegate import executorch_call_delegate -from executorch.exir.pass_base import ExportPass, ProxyValue +from executorch.exir.delegate import executorch_call_delegate +from executorch.exir.pass_base import ExportPass, PassResult from executorch.exir.tensor import TensorSpec -from torch.utils import _pytree as pytree class BuildQuantIo(ExportPass): @@ -28,22 +27,27 @@ def _make_spec(self, x): else: return None - def placeholder(self, name: str, arg, meta): - if quantized_dtype := meta.data.get(QCOM_QUANTIZED_IO, None): - arg = arg.to(dtype=quantized_dtype) - meta["spec"] = self._make_spec(arg) - return super().placeholder(name, arg, meta) - - def call_getitem(self, value, key: int, meta): - meta["spec"] = value.node.meta["spec"][key] - return super().call_getitem(value, key, meta) - - def call_delegate(self, lowered_module, args, kwargs, meta): - args_data, _ = pytree.tree_map_only( - ProxyValue, lambda x: x.data, (args, kwargs) - ) - meta["spec"] = pytree.tree_map( - self._make_spec, - executorch_call_delegate(lowered_module, *args_data), - ) - return super().call_delegate(lowered_module, args, kwargs, meta) + def _build(self, graph_module: torch.fx.GraphModule) -> torch.fx.GraphModule: + # Forcedly update delegate node's meta['spec'] to get correct output + # tensor size in runtime + call_delegates = [ + node + for node in graph_module.graph.nodes + if node.op == "call_function" and node.target == executorch_call_delegate + ] + for n in graph_module.graph.nodes: + if QCOM_QUANTIZED_IO in n.meta: + n.meta["val"] = n.meta["val"].to(dtype=n.meta[QCOM_QUANTIZED_IO]) + n.meta["spec"] = self._make_spec(n.meta["val"]) + + for call_delegate in call_delegates: + spec = [] + for user in list(call_delegate.users): + spec.append(self._make_spec(user.meta["val"])) + call_delegate.meta["spec"] = tuple(spec) + + def call(self, graph_module: torch.fx.GraphModule): + 
self._build(graph_module) + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + return PassResult(graph_module, True) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index 6d5b44d7a35..ee6678fa499 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -7730,8 +7730,11 @@ def test_llama_stories_110m(self): "--max_context_len", "128", ] + if self.use_fp16: + cmds.append("--use_fp16") self.add_default_cmds(cmds) - + print(" ".join(cmds)) + exit(0) golden_start_with = "Once upon a time," p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: @@ -7750,7 +7753,10 @@ def test_llama_stories_110m(self): # x86 does not allow weight sharing, so we don't check pte size if not self.enable_x86_64: pte_size = msg["pte_size"] - self.assertLessEqual(pte_size, 135_000_000) # 135MB + if self.use_fp16: + self.assertLessEqual(pte_size, 275_000_000) # 275MB + else: + self.assertLessEqual(pte_size, 135_000_000) # 135MB if not self.compile_only and not self.enable_x86_64: self.assertGreaterEqual(msg["inference_speed"], 220) # Lanai @@ -10087,6 +10093,13 @@ def setup_environment(): choices=["wikitext_ppl", "hellaswag_acc_norm", "sqnr"], type=str, ) + parser.add_argument( + "-F", + "--use_fp16", + help="If specified, will run in fp16 precision and discard ptq setting", + action="store_true", + default=False, + ) args, ns_args = parser.parse_known_args(namespace=unittest) TestQNN.host = args.host @@ -10114,6 +10127,7 @@ def setup_environment(): TestQNN.backend = args.backend TestQNN.static_llm_eval_method = args.static_llm_eval_method TestQNN.direct_build_folder = args.direct_build_folder + TestQNN.use_fp16 = args.use_fp16 return sys.argv[:1] + ns_args diff --git a/backends/qualcomm/tests/utils.py b/backends/qualcomm/tests/utils.py index d8802f74e68..c22ee8371e0 100644 --- a/backends/qualcomm/tests/utils.py +++ 
b/backends/qualcomm/tests/utils.py @@ -221,6 +221,7 @@ class TestQNN(unittest.TestCase): static_llm_eval_method = "" direct_build_folder: str = "" dsp_heap_profile_filename = "htp_heap_usage.txt" + use_fp16 = False @classmethod def setUpClass(cls): diff --git a/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte b/examples/qualcomm/oss_scripts/llama/artifacts/stories260k_hybrid_llama_qnn.pte index ad6bee06146c78f8fe1df1c77c610d72dcda8c13..5903c5b5c32277c0eaa795ae65c54370451900e8 100644 GIT binary patch delta 306914 zcmcG%dt6j?{>Og~!;Gk-q5`4}7X|OAsAyEESX5Scjmipr5ShLJJcFfxZ3M%GXw z5N1ZQVdM@mjDo?2QJ7>HixUl_D8Vq66QI~<7%SrVbC6-|9YEG-s_bVNjeUsI%P=Z> z7{<|VhLPhoj4fftSPC?ZeXj+BQeK+lngcG>S1W(!g&Bsa2A}O25NH}7(d*7vgF!E1 z>k4dlo!V>L%fVm-5`H|F2Uf=nxTK*=61MJxU}r;H9P+%w!C;$%zkr?J3PP{dbw&)| zha7QIT+H959oF`F=5sfN-(rqj`o{lMepN*&0u$$F& zG-yzF8q~Y~e`=77$D42dTZ?)f-P6^g#{+}o23!)wnCTG;+xJOV{izOl<+s6Lri1^4 zonMOQC$~!;-n-GNKP&3`H>Zuza?1ZpFUsG#?|&@6yZ5bstG;yo>z%cFPCxpe$D_8( zCT!`!uJ+VBoCX!Vf9pNdBHrp6j|byukt`yjS3_Jdx8&uI{ z6E=&5S}KyC8i(A6?@kB59y`BfWf%=z^6+)`Lf`b;qBo~)dAqAYCneD!Sxx?j2FVb8 z{o%j2D3U=74bf}A{^u5@ciDs;Vd)LED90h+iEo~R$Na0SMa5n6@Uh5gXx-WO-ZpK< zyIn2%IE5C;`tm=v=+2M-y+zv^x>|I3`+sgxLzhk1Iu`Cwi&`A=Jbc?6`~~bnV-&^A z(7hJE4>>LJ_xkPiX@?uST6EJ$S|lrsv?$ea<$w$H)yAKN@{kOSpB>k0*r&g5%(&7r z^ozOV!72~ha4zo=+sg2Gti6Iq%m47?VX#AYQ14@1jl|qouO|X9ES#tkx7uN!PY%Cf zu!rDhXO|-UdK8GV%!O;FNf1fEnlWnZE#}Y|)|3C#SSenv`v)n;O^eTSS4eP?|C zz_ye7rZAsF^)CH^{5N2)e`>TYJ8;ZAJFq^@H#Fw+|DLP}&z*L%rvH#EiFu=wg|DM? 
zu&#Ulf$V8pv^+a7ef*_Ed$XYYw-$9Jd5yMv31VFVX;GaSIk=t)9ct3;-*;l8O3Ur$ zlud{o>S_MJX4*}|f1jzq(>IC!5~1OeInm9KIl+&4KTljKK$>$-pnPKF(2oC)L|udQ z0*(KDBAPUGCv!B^r1O5Jq_`IrTspkG_Qu48OnJ2FZSvYN?E3LcE;tC zqlSL@)ql@cgsVLXH&NH`v-#uuCNVQZ$@1w-Q)gPC<#wXN6C#Ixz_R<_n$?wP9VPuf zkxax2re`RVbYv*gVlB5bk{hvL$;A4=sA%ZVIB^h;nu>DC0zexGjF8=opFkH=q8>3 z|F@nzkJ|o_ac~|JG}OMBj$kmfexzx6cHo^U7Y$wa-TyVq>H2YKW$IjcbC{f=EYgdi z37@3p*@0K4UNm$S8+~a@sORKQXYZB3>#g29iF$tD7XP$O-&Y!_Iun zUAzBYnRe00RXk}VUj|nJCUUiLji3|ctX#n%dq!^t!#*=_JE40L6eis0fu>tWrj zET1KB$>Fb(O1ZIKE|Q0DG0nY{ZMHH>+1e+!>Hsc zhzKss8-=VdABeg`Ts&!~>-)x8WutRJ6u5f65%(hTf zwr`C?JIA55eVZKGvfpXTPm73-(h{j^#~-va9NLyYXiFSgnTw&S1_UJXHitI%587sj zc8xxb=#JF4lwwd#tNSM(oy!3QNzQT5b`5S#*k6= zi$~t{S!>yP{*+5hpk~hDQXgwZBbqH-|9`9*IXq;gT7KNwV8<}Yni1$f zFFM-qut{m_ng)r{wGPdEhh{)v{k(|i!wzk^Lz@`w(@e6~IW)H49EY~mp-qe~b7?Y73DP2~w|r@J~g zF?y#%Q}8>@QHQ4DcbXK!0)692h+J2`gcWBb|9>yDZ<=n*RI)|pvp-Ju=8tnTW z(!}Tyno0J0r)F>-EDEt`y!P|gIu6T3v0F^OoJxm(1=ul-{%|A8To2`tsC#~I9~X$6 z7nSa}!h3noYd`qAN{BRgUEW!OGf}{Ier#`ftuJ-SW!>gmJ%OVOB31I)o)vxGdV0lq z=biQOJ4ed4#~O2@{ZyLDW1Z0EU41Rp{g*86yZiF5GDUSjtCk1)UiwhqN4A|~Ih65$6^FM2 zyDoh?thTr1R#~_rtiKIsT(-F1(+TBPgf%>1#P+z?vTD6-ao@ex9@}fFFu;lWm z!!mmY@^6?B$XQg_x9Gq$D~~MwV(?Jll|{PoI9XRL8+sxg8kFLDKMM;56 zue!}_3;gS1EcOfe`J%MsrpZ`@r^aDXNcf> zHgbNh!_|SO?#?eQ7-1Ss!%ZWP>%cJ6$mQBU)HHIq_NJJ8He(uf$)+)fi;r3i-x$-_ zk%}GH(b1++&NYH-&nVL<;A$Ib8uxRha~&Xd30Ge_q;-gCG;q~&Rd5w^+Qk#z^*5UyNf}-z|~I;>yeR0x@zcv!1S7--6k3P!F^GshVN|;_6cl$YJ>o9 z;4K?O1di4CB6$9kAnk@aIgs_VTTDDS4G%20i6PN23xA_M_-mrt+H59xMu2}w4D34; z*PV~5a$v_314o`tc7uLMR0&_0!#q{seAz_3IbJE(U)@j4vey_<(o~Q_H_J!#(@q@9LQ2jW$@lrR_0^rl5tI@!-pGbLoT!e7$8vC` zt(a(3LLNvAOxfM78N(*eC6453aWq-A*uaAa@5o+v)R8Sh#p016_<@ zU8jH+RU%m1DVV@U9zl7hfQGdoxT{mZxJSL+9=x+tKnpVw{HaqgnT-H~H6rLGX=!LV zf;(vTpOf97+Y?p(H|7veBlxz&z$&Es+k<6^YO6?7!M8FVw)B=n)gjUnq^lDH3C~%! 
zeg?AzX+6@L8OrB|ON1K)+2fS*PJ8g$M76Jl9+?APW~*eJN}yK}sdl&UWQ&@gL0ltc4uO5Yhfp%*0v@}IZtF0>VNnc;h{ zJ$ON4U^fcTd|N?J870v3spf^z;h=K^bN>?GGs7CI2KYIhoM{W6+sP*wDGlwx*}}bI zKhem6&*XM`VPZHaC(w7ts2*}_8#|z9SyX1Q(E^ zC!b)nz{hm*iAL0c_TcDFo^E8qM|Sc_Mj?E7C!cJT!-op@O1=|}TKEtq$BE`JcN2Ip z(>vJ{@lks)AyLiBM4AqcN1D`~Zq9=YVwz`|gL}xNGu9v&Xld!gZIBqPl`b_J5ky;B zjKe=}5BB3`_>rJ#;J$(JyOTZhLA~j|Q<0W|dm&A<=3ou12hq+W7t1O~W(3_ut*U-v z^)o(UY1Vpq<6xvBaErdXl$Hw%C%4a=X7#L5g22@&pfS}5!a4;MT#vx)6cD5pfgyrk z5`&rGYh)vt5J;#S;>iYfB&Z`lbH5gW+Yn+zk9RL9G+9 z_fsAh34z|P4DqA`k0+>uBF+QE!9{fH*?GyVP@G;wn78Jvd3XAJyPcWkr$#-26`e6@}BV7_Yq6NvfT@vC% z9cmA@B&glLk|7iPO+sMm--mb#fnN(u?2cPG>|Zv+r51KLAux82Rc#~eE1Qurys16- zWrEuJ8{tyHUx;RiCl}aE_r7Yml)(OJGZ`(|=Qfj}f_-K)87Ej%g4)zZINxXO!9${9 zh=2!eBi5uM*rzt*KCOT?G6=6(sdmCXu^FkFVIMPC?S%7w&hu9^3>NT!ZNyZ_gMDZ- zmV`3c2R0*B4eWiJk*WdKkf2s|5UvgU9z9FAlz;N{7HIWd4(uJ9$*{rp+l*9|u(xeS zsy(oM32JGOa4q1s5&}nFAL5B>=JB5pIJnnpP$ujjHlsm>us3Zc!vuT7W-?5$y$Sc_ zhwPQu*3a4xteL3Ic&13%7m4+GbNs}fYc;AZfH32L3-sIS_C z6$xs$;B4?ZZau-p;57*_?pwio!1?sB;8yT_>KB~yulC?P>KB{~&Y^z6W#H4P-wmz>XHmc4X7EYWFF5M! 
z_Fy{o3(f|QrGCN1;L+6Y37g=q0gnoNwr`Xw^@O?IP0$h0aqg&Z+JnPH5t#P2+dTu8 zBFtWbOQ1sn+ut6gW_QDD8#D~pz_#Jfv zQ-5Hkjt`9eaFizx_$@U9%YiNNfpx&0z;CGc!%^-Q;Makx4-M;ig0#`s%8DCrJxzOB zgE9lbXSC;|5#7>^6_8KFIPmZRw`U9NLv1U)U^F0jH(srZ3>)PR|B-dZjw`*I4&7_n z4)YX%|89qzV62Di5<9i9Z_)kLKeY$9+R4Z5pN@8yfKIU$tj9(TG&ArFibl|6TOl3b{k1)q7U*?wv^x_t3dL|a zQyN9El)w_?m7oMmE{(5;`2uep9PU028W=d%G`L%ak@TDFy9OUh^kjpg;?oI5p!iKmr!3O3- zBPJV*!9V!|Q=3vECK(lwqmYC21{Ljr9`UKAv0=kJN5S9t)Pu1+6Jp!h0QdrfK1=Xq z0>8xLvtgdaz%MN4!1~X^J@-Qo*@4rIJ&=!xG>~Po75t%3Z5l|_qz<00KK16nu(6(8 z@H;-$A#y+XEni^k=k{5t2Kt83VV(oPS8a`qS9p-8mQQURgi{)Lm#vcF&x5|?3*`TE zm}d>}d0QissTTTYpURKJsR{h7FVMSLPPJa6BUmG3m?s1Hq%EJwH$^*wkNZ?_9}erm zkJ^e!#!g6;FR&Yj!@!M}oKtO-*%7P^Y(M7no?!1-GZ8%0DUjzlf(JVV%&>9<4|EFT zX^!CjP65-l5rOIy(8Tbt4(^^%7Dg-;!F?j=C247DE`kdBG+rL?;D9f%>PsH*z!T0zA`w+JR*E2FA_PIKOcl!bfU$OTBuJfrQA}#{2^#zt9t^ls_si}!{N*%b& z7dY~jJzgzPztCZx*zk^EiLH_G%7(5WmN+d37uzZsuS)20UtrVWVV=FfB3mQl)e60e zx{{bZDejKol@_*-x4G~uXrbuJz>Bp`lGVZ&G0ub0HG?nLI>{F0=?GrtQ@cf%4KB2G z*14}3eo0{Pckw;btVbmfx*Ak5N90s3bEA>-4qJtxKdyD-%%z@2;u?T@?TXI>PYGAVh z3%+Fy0?oAKvd($CcW}(!g*+2|n(V#hYZL z!qYqXWFr?oLAdpD=met#KF+5S&NqjP>sY3Gl59lwLq_}5s`EK{wt+`k3VJui+Yua2 ze_nuM4tN-fWX}pvGEzB?%1UiiBVgY}Fs4mCB#|k4QRuLG96>_B?Oh_G-^Qm>9+ zoYq^1)f@zaL~osSi(vyS^)Po8I0j9!XD_I~c9Ib=S`qXUC+mdk>)pX2TJGdY_IiDUYJ8gqeYvKP5#l6nh|s5P3t<0tvU`&V3`_per}GT338k*Wsvl`vjC z0Kbe=tI`P92L8e}V#1_EcLbYl#%#%f{nKWoDu#V-Gg4K;K8p)<5N;2!DNZdNPq-HF zArV`B7sZ!m4%&(jEcxr(k$Eo}YglhzU6c;$yW;G~$ zKu7R^&1g_6>_eN;ce$_+Y$n45dtVq&I$%Sbnl+Jd_2BnxBO25Sd)H<($QRoY+#eTM z)z0G%xGzo}5tkxxeOzE4;tJrKacXKh;+^1E>GuxK62O<4S3uvuj^Ing0OkOniwkT8 zmH?k253mOK1brCfECJliyaIX$u{DSbECpr)?~4m;0u}-9CUbCB5O?1?U>@-5xchdSquk}dtM2>O9PQo-x=h8Jz!u;_ zHB+F^$CF8|6qo}%M?E931UOTDDzFB4hUygtYy?hMQv`bBJAx;xB?2>n8ERvgIniAV zJVBW*bDVn%aFV)MU?Xs%dPQJlLPv1C8XOMH298t5!c7jx;ilWY0zSs#)&gqOz(;HS zNpg6Gj}*@7Ig#~Tz2e4owj0+O@SzsBT}$A}T7R;fpy7jsbA$#a+NwkkuCYlS!FbzM z_5(%^JWlH~SYj(KQuGcZQra9N>@sKFh~4sRZ0?E!kqM#`u}aL6(r z<*o;Rq{GT2XoJ5`xeGWCj~Kx^qQg!w=EL5!!%pjCx;<;)uj#O?T0E5yyn=Z))i#6w 
zYKKjib7IuUj^Oiy)NavbgP+wp*>e=bpB$tTPD57>epKsN$&3d0#zAV8=)y;FWY9X+ zJR==`4?QlrLhxN$CwD*v{5G0#I=VgJQmvErx596v=S7z?x+8e4)=97C!k5tdqALS0 z(mH8iE&O8IcLus<@I_i@-8-r5dugBOvca>pPIhF)@H1)O40P4tQ$;u2`s|hiuwm{x@G!MH%1m}21`bi1V@&JgV((a4 zE#8VUhr2U?NtU}xisr^%3?HP9MVl$zC!`q_uqdLP6_()H0q#%Iv&^^%hOH6WPi>4Q zc=$N$_KY@%c018XgSaHXa8DkveV|H+VZAN~|29y~Vvc%tfR7JUH^uNaxCz`kP~CYp z8N$<;wgc7HvzfMO;O{NpVH8&uR!udmqhj!NFMK!$o>M!V_gz%kZL5=T@r4mdL)l_No2!;-?8(iqPFI%no9p+9&dK@Poue z?#=_>C;iOhC!WDs1HZ@O*2Ak7y4F_v=P@iz@ITpVxv#w^upZdzqvD+bUu~;pxhRCL zwAE?n;JqGxqwwMGoxl}pWdhI37T^s6?dPR0qa%2|`cyPI!0T*{WGjIdt0{?SYJk_; z8cEa$y+&;mjrT-$H?~G@>P+Y|)t+c3dkTR|2CClY((H2ZRcdAu;#%Mp1696=8^Mdz z#w2ztQ77>-K$(Miw$A_-s*44d054Im2;2rNP=kk<<2=p4d^-9(9xhQQcLWz$*natt z4bNlHL{|*HNb96Q)$j|s7tcr60KP!$q)FkK9l`nB(V|NSpKt4|mk)*TbCsFG0#Xh< zM_>}Sf>8^bYcpnLBW$+K$QC|Dc5PVty!J=)Mkm{R$B+}A_U`XHMLj3QY|%0Jb1MFY&ba& z14n2tc?23!S+a*jXP?(Ip($D~V`D5vkZkEyS{_U6dU%pHu=WBw5hPj$*4e!onjlql zemw1+E?dh^34NcA#MdREF?mShx+K(JhGbBegj6+1*n);4Q(Xg+*e(g}X+tugOG4kL zoFZFP+83wz8WyK-c(lzU^JQ^|N7;Ov=qusOAs(mzbmJu$8^|F|(IJGb@cjH328^JwA!*eoQwxvLy^-TucQjv7; zlE~ym6451*$%~|$C9ysZm&uF7!yQ*Z__)e4Ou*Go6hJ~q5 z$Jo^t!Ay%uwG}XfmG@#zOkltEcKqw(_B*-X=t0KGR)B5G?g)i z{u-;sUP8V07Izm%2rb7RvDH(I2hhtTT#4G`S6{|LlCuTMH%UHE;g5A1$=ogyOx>jhj zrZRkI@lrxl89(UfvFgZW#4G}TW~q2;f}6A%t=kDbq$#ayhJFgY953(Lyn0|nCd#uD z{1Kxp@;vZ|v1*mb%fRnqvxw!o7F-{z@&z}8Un7p-sGN>q9UUk*8~hil5?l z@GR;V+yI_I{er{i@YJCGtN7qM6P&I3o@5SF&t{n2-OJ&p*u2k)@M?IL&6f#30H12} zw}pG>^5hUcN*yZV7@h?`N%%N-F)+hcEjx)5{1*6Ri`(zY8{z5dGjV-(39gazczZ0I zR}R25TQ%)udF22fYjNAP2A*o`OPAu>1RtfgpNwnNIXpjXRi8|G)=WR0B`8xV6b@gboqP?j=`_7h-8WH!`6~LX%a~6c*AVc%scCr^>nvPq2BG z@OpT>&6f*rgZpe=B|P<9d7@bH_6yI053=}%Y0z@$KwF(Ad?!5C<^{rA-~(*FUbyc( zd5l=$>V)UOqiudvcnLg8wNH}`8?e6`lLc%9_EQT4de4^!$6A4zz(}=AU=gs7`c_~i zu(yhz4y*_EQZog%0ljLaz|{HjjCe+19f$NpV6XMbuWnU8YUfcM&AC&*r>5&pLUD*rTI?0NGzB3Zf~6J!cz z!e3CkPNR`Uz-?N^x@T0vpHlIsqpAm1YnAMC+TfeiN>QaQ;FCtJk{wPS{9g5`sLFxm zTE%*6?1bO0rksJQ1?bl*=}=#ONAM=KQB*m=Vy$8gH%j13RlBHafQz+CTHOf0L@k(s 
z%Da%=w5aUQ$uptnsa>Ke0?x5D(ixS|vsC<4`k( zWcBHp98cB&Cn__?9NaD4*ajJ=#-C*-dK!RZ2dGsgY+8+r<((wnqupu1;c7c&x$}U7 zm3cN>tE82jdHnDs;fbDVVB!EZYo%;Hz=OnNXtxQ*aY&?1`0nT>GL5Au>)R^1&*a4*6QfPy0iR+7 z%MjOqpHyS!A#MO}iBYRW+y;I^twiijxk6qsFs{k!Xc+@B2fj)8aCZstL0gq{4&A>6 zzQN+w!>9pzudS}S4Nv1rKJ!2c+p@B><{U) z5iGL=V?8UtOKhdyJs`MB^_tJZWi4sYD}}P8E#>`#t&t@%6S`O}5lsw-S1} zdP_9*z{_lnEL?5SLN(?BG^xvEt)%JJ*(w)SU^ALr0?QYc?5+mp*$O(k9(JKHpY`no z3T#C(ze|F%Z@DbBoi>!3jU=~ALYyKb^SUGyU4djymxO`aiDXungz}q_orgHN&}Py3Q5zR`#v z-7>H;hhHn7U`gSD@18-O22T@pvU@&ojHT*1NygbILy&5_O6nTuXsIMlEx(&Jq5(e2 z=KJ$mBii62ZC)q(lw#R)TJ}i`(dWSV*h~B)*P}0n53~7l(O1Ix7)UjghFqoxK3zZ2Y7#MUc;W>}=naPh8?4eFFj?P}9u zy=_LyJXkMbEX+VJ%Tk3b%;27COrh=40PA5hT-spWZAOJDH_C?9vLsawtedc8PchKL zGwMFVRf65N5qHoYSh&r&ds<*F6@QtXD(WVAePJnPH6>#!^e0Wp=qq7oO&p1t4Q`EAM-(xOz(0T&+3x`=pxnLyu}I zgSL{*jYT=dgO5b3gZIlR9{jB~qjhD_7ENhg4fN}1wQez$G=L9Bt5px+)dv2O2qI5e z#rub7wNK=k^xQUd-cTICC_0l!Bau)7ibmfCfNEdHe(!PnKd0yBZHsQ4>^ zMZmwRnF1?;&#RRJ>w(XzX9TtZpH!a;O!do$2)(WX<^eaVDFVxZN-Yt%6L^o>D6j>1 zmwHQ}Z?(L+Zx@&YELCHQfF-~i)dGPvz-!f7fsMc=YL`IoE%HeER$wOZVimsxSOmOC z%@kM(JXft0SPz`1o>{`n^zd8dz4Yp(9J~80<$#<4pKWpbGx;L;Oj|$gVb0c7@Uw-F zb2k9bRCAZfVP9M$%H)N#aN9K#ewwZCQ-$kt_^HCjxvPQG)!^lF*k6Y00r)hF+pgZ* zrl%2p2M>s7wN-TK;8Y9SeuePS+#Zjjs{oJEI`P{B z9~rHdimnwr!os#+${HRD%uLbcf`@6H_?5wjGE=M3)q+#BPTJQDPiD4?F6vJ_2(*qH z!pMdXrf$&{gOgYa9s^f{6KR{^25T_d;<97mfT2UmaxMXRZT z_kahIPjD+ZHd<{GoN^cIJ#7)33yxuZ5L^b1rX5d!Yr#?6ae|w{{b_^XsB#weXtf*c z&Ib2Y+i%bxHx)yBsn2dO2dk-D*eq1RBh=~}n85piVPZMjW2|LGj8ePr;jEAf{v}HF zUcs%L2mV2A_M1b!6Vha>QjXxeD3!mNy^FPV*@56|%X+xHTRseJwj9)v6{g1%yNK(lv)gaKNMa1z8b;XQEJvxC>y|Ugp@2?M)=*Fd!p1*G59)B+8^InLzmIYXHYeOudr3p zJ>mCu1TUqHqDluZuvOAGh0yb9;Z{@?;JLO+dS(ywY}zNPR`6-IO8O-wz;Z$BwsO3l z1I|+8SFu}L0nAi${T#2QgDRmXtCg!b>g)xcWcjQ8t2pW$ho`H}rF`m7s+0V7rtfvNiyuVr}@&n*j3)|n*4p%%y`>U;g=KP!nZqYiK8hP-;{ngZ4@LL0J);j6f zT6j}`mH!-mP2fgbXPwWy_sh3q2A6RLjl7jU%z%GjacdV{2z}31r`59SUJu`It0n7B z=v%hBPP`Ap-?Y_|IO+j;q*}q7#5)VVNBD4e5%BNo*xl@L)1GG&S^@u?&DWQ)3*8BS 
z#pZs|H^b{}-XwbOgM10W=KDpT4&QC_wA;|_y<@SQgIi@pZ_SGD~%GoJ4g z?T7qDNTRzH_>wK3V8lMu5!_)DzD*5z(I)9e0ptZW{dPRcfY00V$;K8)txZla>LKhB zZfA$r3Vgc1+VmnDMqj180caO-Ht^B@>WGMozz?gLcOb3+KEz<|K)e&I)J73E0|Shm zh`k%-4Zyc|Fv&83_o(-Mz&zkx zYKp*e;GfjRYuU={zu{ktJWYNR4rt$N+bj-wFhS6AOj z^a|iAfg`zn_CQK(GQntt++>l6iAG8lF9iCleShV7kqf?|zv>WN2ELw~W+%87d>xZo za5K1=ju9NSnaxsvwOepD_!_!ba54Dm{wiS?3r{t8nUtICt_Lm=Xnknk3cJc?w95C0 zEO?@mWgr{2h+w;9tiYG4XV%GB0WW2&cFS0SFIMd$ZU!!-BoTWbmDO?P-H6kH7xh;M zMVtq|Ky5_qE(4yY`rc~}QKg&N<7|VUE3-8@?7*X{_r2!Oz|n*J#q02DSxk{9T6bqE zbe8QF$=@IH81oU#wDc;ejvLMoKg%{)A1ekm2+p(v$o<;@J>Aj=3iwMvhVhuJ$kXqk zq!BFgNLiPDj4Tl;4o&Vh#SE}Rs4O3!=Gex#|kduRB*CdBH~=& z5azguOTdHGE)iD)llrS!uOhAoC#qf>5VryoSW86gdrBT!3q+g^^f5n0Tm+6&&xp7J zIEeZA8seScfvR1^&A?dZr-;2z%j1qOq%c2$G0aa9=YgZuMiG|*qqy;2M_dE$uf7#= z1F&CzwMxWo;J#|g{fJX)IH7aUlQZUJ*N ze*kgRGn~u0k42mb?ygpfxDXgYf4_ma9NbO4CE{A3hyE6EBiOCtA4D9!mD4!=E#g$L zODz#`E-;M#7I6vKRJ%l64K(QQH#t1kgL%^TeuzV5E4ZVd+9Yz{vmC(tseF-VgWLM4 zBO)&X|JF}!6?p~t*M4g1KalSP|I$x&h`bs6b3e6C0mB`b|s6@U338s0p*`)t&*ObTUZ-{n#v7Rk=5+}s7)1!?SOT@|ab9~?!e`n1 zVd3@g9Gkx_ybXSq&4a>IUzE3~l0l7o0-gsy-Qw0?VoNv55oC7>_`8b;PU#X%Hd+v5 zbqP)|d^==CKK2;HyYMj{B02CW7PnXE5_qO?nh!kLQoVTXh5{2=oh!nbA4|5=p26-zqc^l`rUcXCckoRm^yH-%UfE{OkK}E(z53a4yXK5IQgw!p`ZN#CtrvDe)+|5 z{M^9RQp`W4au27x3k~B%3>R4%`+s+0=j8FU{xCKJQNO_D-EA02Ek>zPco9{h%+!I! 
z-GiZ%KTXxMxlTcMkgJa04!h+H(^xT&8G<|!`_Ruyu#=y}4&r56e-#EMIg-vajjRVv zBbH!0!b|x@G7jfvu#>Ol7fp5$paSQ?IJd4OX(P!;7{(Za-bmr0pER(OKgzF8^KdD^{(E@J6ZS^IfUtA+>S7 zYk>K*dVap^4D%7yE^N1&-rYOE{OH9o7r2s5&XTHkPM??;{rRpvrdqzhm8p_^UY}}P ztx?)j12O-^yMt9!D3sMyTbB=t82PqX&w7=nqt3Q)1DU7)^^K$jnQdhqD zy6V2nHDWs2n?qWr*{sX$$tIKTCgg;(yvi5aONz|omB_ibvLxE*+3K2Qu5oFHJWESL z=9t`tISH+~TN*NlKR(-*f34nG=2|#-z-%Hu#=Jy9;4vm~Z^e@%i<*vBW zSj>hw%<4WdjY}|-CW+Z>T{LUHahb!tR~~mVGFEfHY1G&A>kqTC;4?ffqYyjNT4_w; zvoKg?#PDYU@fUT)Ip65r70AfJ`ErMI7|x>{&VHP)Kp!e@jxLVDF&68fz3Ilve09pz zu3`Nmdbo_Mkh5q+ihRTA>ZYq*i5C#&N{4y09svF|tj@Oh37OW%Wh}vtb;EL3ryKm+ zSB4?!ct{#a#3|Bo=F(U5E>hoL?Hb!}q^HhUh8(*{k#Cx>CS2p1T>4Nxn<)H-=m(;I z$H7b7^D82CF0qN~s@WJpo#&vFIz?B3 zzr2DA)!Bsayd$nstKY<6uI@KUaIOw|6#0X?z+_$E^}esH0uNJQt}amQ){}s}omC*V zh7?#plJgz0`a23tqQLWxl-dQsDx0(uERW9RV`&2^E-x zZ=Uwuj{ZJf$WUF#qO63eGL-jI$U zhy=KW09p8j3aP?3OZ)CZzd;u=Oc&zb-q$K*3x%Aj3lY1r)7e*?LLjLGW4lWTc8Vid zA4eeuJ#v~ZL;~DKA=!>XlJL#ezHgvcx{%?zkSQ_0O_3hiK_O>Yg_IhRVt5CE_|GZ? zW4lKVn8xXjV3CeOA}HicU5NOv!Fh(GkScs(Z9p$i$^Kf>yfy@gbAwpED{ zDTa3v=qz0b##SL&1Ut(StgoYxND7&$3laaj@Xv7+Qh;xc8go56g4t>=e?-s1nuRk` z?va(ucj*tGGFPzGc;b3j?^3#OyiO+T#9CeXC|&v8W_PRdSPD9VlZ_KB{;TTGVVt5@1E=4X=PacKV!b*~1 zj$l0;1sy3x2^OK30N2DW` z*D8?yPr%?X74}Cd6-s~-0xUru>cT3V`=OTtML%B`2sfg2(8rKh=mPm*z>muKVMUVF zgKH=-3OgxK>{bzf8T#&p{10P$5|U(qBUmp-7apa+7<5vg1SloI)%b-9EWmdldMQx! z7w7`nkQsbc{w7!dUhH=KLE5_`7(8L3ntPLLfIAKkQ{9u~O*99owKutvr^h3~C(=WV zzn;lt$Dh--5vMO%U9aPkk&*2aTOhw|ONH0Po2y!@u$Fy7?%%}zuW8SXL? zb?6?1-brXV$^XBFmJiHicV#4)MuK|d2G)5KcUippBE3Pc^Eb*mpXhFShmh+18?ka} zs{vTusIB<@<5=}d!YT|kwvp4+XG>{6&~U3aR^k}09Y-Q6?Fzrjl^XkZ)99wPGTT>o zgqbCFXFGj|L zTwbZ9GqA_d@wkW--N5;V@h9YP>qKPWd(`3EC(rUNl{1YohsOTkaCYH*mv)wOOw|jf z@x8-2(ss6dbMgJr;Tw+cT8D2PzOC9fhQhAoX6l7L(`u7?#P1r^Zw;OBGd9v`Vv~1< z+UKX!nw~d}pB%n7FH{|3zh!{S_yxHPj@U0aLycNZXJ(P>xVBG1ewEHO-R?i*!7m=2 zGKJiS$@iPiCw2?z^Z;@>=U{7XVzbEot0Pv4UIeD0KS2I=onQP5@V`&)O_?28Tva%? 
zY3DH*6fIN-S2H_i^m7?Oomk|ToS|ZFaV3-*&zVMt!+xbsCfoXmSeNmcP9{E=lI#KG zl1pZD0lrPz_eAt9j#-hxtoSE7W&NQu~_jQocv07i?}bKX%4|(XU#Eors<; z<6jucf^G7ke2w*XGkS*Jz}8x?lL&UWGniov)x9V8@iz9c-=LGJBL1)A|0wd%eVm2w z*K)%mi~OE0dOBb2zSR{|dYt3*w>q)N|BmBh7)fHFmAGk}X|ya=)*0i*6Ub&*)#xA7op5HT6F!{YW^F9)r;v|yA!4|lK-+a87+alC z^(_BLgdhRfw*r@wyrd~1<7Dig#6icc9z)qG7v3r12f7hjAYjslr>E3k2igA>h zL8)({;|8<*AH@Gvxg$e&j34KE?L3DBn{*+wbRqkGnr{^{f+I)vAu{7&x(|AtD z&LjWBy3X0U&bNm4wCc>D&Oc)(9cI~)re4>Htv#=vF^%UPv0@!{&Y;d(bkbqsUxoiZ zXNMWo`J#4SEC#xeIl7Sgn{TrUnL!~h=t9Kq5dz6)Pz1LAr9wG?RZ+-Gj#vX7h0LLl z9lDUk^3l$t1b7#xP>0pw{8#NN?sC}ci`>`d2{NPx!)@PVTcKh7_smywf2{7-u1ZkCZtDrMky>(GAb2VLeoUFP=p z?9y~p@Jz*NpsAtMK$kb@73$VJo^Tv!{ z>q;D0rA*^lttPo~Pv_{1x}Sf8X>8T%%TNb0)WLGsuz^X)@6qzBkl&}>d`jF(tL`(6 z^{CNA?)7lsrjJ6$nd9uo0_bFJC%|{er4Va3 zi|+>QD<8am=h)3=vYUMXT{n`8E^fYI9K#RJZnl)Q;V%BUD2H=@$Bwjs9(c&%+zsa+ zw6pAHv+#Y;;Tz@fEyQ=D!#4uoA0560_*Rx`=kAoVgceKpou|96Zp7Qx!D9>i|A(=Y z?i0JkXBfuM*wKArYaKlNd~&|Y5o?w1y=mxUISZ^tC&MWISK$8(a_PO8EdJMovtK() zKP_FTcHhlzHi8!tx9Y?qzY53Su$RQ=b4?ZBKbgiY4&PEo;u$2qO(z!rBK+I&k;GYY zk+)3yCZS)ZJAjVzqb@Z*Joe<2Q$dR<JHPm-4V>M3z7M= zf+;YqW1F@xQm`PpN%K&SC=Hg%WwBD1Bsf;S98nL;l-M<4%Wj zv||n|W)7@%_;$xv-dl*T%%3ED%N@Qk4&S0uobPrx_rSTE!#NA*b=tX=h&%Q2#@gX0 zwsGdcH|^7q*JWJ&oxR|T-A|{pv=K*^H(6(7ovpjgG?rrnj1-$m^xnmxbs#O$Wn7C= zCXgt9CO}UDgw}y3oUhT&U!ec1F62U8$Sv{BRw2hZmR+X{5xXDM_k9Lioe# zgpBjwu?pEjA(!bw#IE^tHLpTC;a1Zqboh^RX>?ouF-z&B6 zU(xT-g%s#Q@(vqTA$uugi7rI!zCK-Da-XZ;nFjtv4*&6v+>zv7rgMwWH{>4f$eo1m zQuK%-PqE&T_6w&;a}mkj<5uv-4E5!Gyj4nKtmf!6ClKnHe3h_)ML(9Z&Ot8QLy^Bh ziqUre7#HjAsk?=b%N@SM93}2xj+~2577Ouz6aO)e5}WYN)xM{q->SRk61#g!ja5}! ztVuG0LeAHPh+#c}#_2*Zw#La%u=5#iQ{^K2mRN*@xy>!e<;ysih z=O8sfxrUTh(T^ADxU=znRu_4x9>evs-?D1VpvDWalg<*mx5<&NYsA)`OXOMLh&94d z<6dgaLnoaj{`>KtZ0dJ{@^mY}d7*Z`2!n0;{GC-sX$Fb&bz+gfi(`gPT&NQp_)d5D zj&vm6PvR_{Sp46^|0GA}B;k9i_FaU2yY2vXmVUyQ8o6uiPhRI!$SJxIF?^pu@+CZZ zxX3nC*3~*bw>r%cY?Pyr0~C_23laYh@So}^qzT_MwC}a(|7;af${tV;h3Ap6*6p;I zLQdC(h~WVOW$8k&wWkZg&U6GD?I@&)LT2bf#Q!7wPjM7dh40zgx73ZnbG-9=fT37H z;Y-R18j6G? 
zY4$OOLT2eg#PA@2X6QmNwtB!I*my^yy- zvvRGz8};VSAfoP9fO@i4^X}YuG((Mg$Td6`@rhde5aP$Q7r&(nuhOQgOd|s|n#jL0 zF~`$`p}mdK&1Ia7PVPX_4WZHJI2zq_scD>~eZN3|2w&NxU!kYp%P(DKopI)K#+ibh zY|_Q<>N7Y85r><&)cUwo&Nu~^@KL%W*nHjlvPYfI8E2XnuFJ5~E0 zL;s`WjFZh4buc;xEmCyx^V!+q3%AcWbr+jPlEZm`W1qR4ZDz8=xhKvSX=mA^7T`O? z;T!AlEyj1K!`F+ie2qa8i?1Kwlv3^7MrF%sv2@>+y8GU|_Brc}vzHzIaO|Y}#IBgu zFC>txf!JC{jj94ZMR&wnt$S}8`ZetEN20^k^1lv$a$DQ8(ku*wjnK{>Hf>kyLax$< zyfbUFRmg}amoZuwB6c?rh$&zdf~{4EK_R0Yv2Jk`vV%g#=t9JQ#TktMWx5h6gkNl7 zq-tmR{@XRWkRn~k)MsC@&OB=~5lvMY<4dtus#@eLdC@>sDQe%%{T?($^}4 zZxu;^n+YIa!2nu^=A?W+QrFIj2#-4^)C?w6e{|jGPSGVWuG~bOXP&A&K2dl04s^_k z70iifhi|V1I4{x8GAXifj&e8;ayXaZJiy`H8{cIP-vWGNwC`Ui?2UZ2?_n0<1DtUN zB9{lT$m413)%YvDFDp7xEVn9KohJ3h`4&H(iMM zN8``arE4@O#G`!=p#O*63Eyv~6ONO+yUs1TWPDfa-15|tncydPgu_4Ek$Vfdd+OZc zpMpQT!Oq-O`1a7gkDz~DcfvKHP6#tnt(mZcLVD>!#BczCI3!wA5@Y*uOfat_SeB!Z zH5AfE7b5<#`16jVvydcwdu!jX(AV3Y5cVjY5YdYd_H}O24a1igHCFCxb?z#j5Jw&U zr#W(OBlizFxA+go|1L-FI()y^zFX1n)tyjmcfwL*#^wfVChVmXe$<7C;UEI>*3c>h zW2+Og2-fNdHr-LkdJ6ej7b5;~_}}d)qyXQalv&NL;up+hYqFWJPh@m{rtyR;@x%hU z>bTA@iYU8ux!2j{8k29eZ{Ap{{S7-=Ma1qGbx$=fD4OQ;slLN~vZKHZ3T#Iw3x~M; z%Km{@s-2@`P+*((os53BPJO*j{g)+oShHmWse?MT*tHRd7rxR_r3S`UPgPM!ha=bt zjzY313`g!6nCzH_hrnbHh$f2DJaT^Mm5wmTu; zz}V`9B!YeE2zH{QkU13cFI|WPa1mg$qmV3o52KfP@&xgo)boT-P?#sBes*l%=+GD9 z{)(>iMqTG)-d1Z?&!^6>v6Fcsc5c#C+jSb)T4gq|dHdE8>m)~+^C`0holF|>_u&7y z$v}hc#|?FUr=2fH_%~h1O}daLzDu`e&0-2Uq6-nb2m)=8VY&G5$j|}Aq5n& zUl($Dk5Z-^0iJSn*#E=Wd4NY%eO>>gP^E+#Iun}o-US&tNbemdKoIOGCR+cK(-CsO3Q^o;@Vsb+5bFpDz;)Qe)y^Yi5<(8?ps13Fpa=xK0GL~m z5a6R0Je&3%R>&+XWY_uc93hJka>NQzTrqffzksvh5bMq$xIXl7wYNgrS0UhLlpM1{ zL{J=pS3N@ffIp^PQ)mtGFW4zGBAoVolCQ)--V$J2U!!$R>u`nVlP)jPKf!#88=jP| z&hs+m3()$BjoiTRFI%1&;X9w_M##@CF2uV@goz(3uc$^y`f{ku~^c=+w57`J?c^z0M zaRcN~Vi)D*Q<9nRtft0Fru@&sD(HUJN} zqUX#R;>@|n11=7Dy9MjKYQVnh!B+BM=Yf6CgG~Xu(}VSa-K$doG$}uz?AELK(i;w} zCpq5l!;j1Hhv@Cw0CmGN8@m$o^Hk9L9_&0@tq!!E<#_)JZ5lT5&H?`pcxJ5eW0_#r zTI`>+PX()C3%bJ=G&SeF>fG@3D3fGfvqBU&7ot5@2*K{1SBnJms)uYoWST!6u-fI5 
z%o|pS@bkdG=MkdkvtGB@{3w_Pwx|hz@gmohRpfrt=2qJRu={N85Zqm`K(hq1&VyfI zb8E^BC-(-MTlj_GKd`wqWg=kLTWm?%&wHj!|Ky~g*+gRjDyT*uLw67`kDb?b{Cv1; zqX%5sGwh2P_P0FP5?~Kotfov3Pc&}!V5@kr_krE&V7W0<;*x-mdccOnTP*k^+CO2% zaiuYH?23DE@jz#*E69q=B2HH!#SNwRAHu_wP^`1ng%FbI;TmGASB3V*K~~%{8a0PR zFbsl^AzvUiB$Vr4eZ-pqXKRm~+5Mmu6 zIgNQC-{lIXC}@zUBtC?^V1=k{00dupgcyXZwnEm>{-G5z&kFhD)9Q|pV+dJeg($8M zJYQKM#JWS%m|L0`J!AttLdqfJWh+DkeIfY9BP5evykxO=(tg+qnQw)(x*^p$AS^-1 zLsp35I>Pg<6*50)l^Dn#^pN%S2sz8;@DVFS1f3xG&LhMJ_F;>?oAx7C$O0?mhQBkN zimXD&V^)aby1)}WW0er>mW1c`AN7#+^9V6iBy5F)6`-H`2(SFPd zS!9K5{N*u6$nK)!d4qtfBt?N&LG+syLa-x5uXsG=AsgTkl7f(Dtq|dRf&arJ#0U17 zaGzI`su!I@m<}voLi<+u&R3Ia2Wbph;>v*EvnublD*xL4kE602<(f;JHW|g$A<5rX zB(aXjNJHKO;Nj}#5xJiEGoLo?F~Zjc{}1>I(M=`?@H`8yL%{oE!&kn>1+q4Y7uv+~ z*9Z8oO?;0{90I$*gYE7~93k=DHnH#xz$dWbMDu2XT|~Re@&SBrwzjv7{9cKBuY<28 zRVYy}!Q5;4TfnhzZ1}viwMN@L_*E478^N6nH=T6e0?JJP=24$gh<-~>(nKYw;crw2kFutF5r6rzGw2*K_Yf^5oZ_XMJt zo*p6l5ppwa5~8+d;0t?%_`u$3v3+R&z!4HI`34v7;pD!BHWfi_Ex;DBx$n2RBMlPF zG!OnNPwr#no@#Ro-x9n&f*dVE4%pji*8m-bZ-X5m-A-tLmc8K)&~27~1RMu=I`9n! zXcF3H;FmV}Tf<$#YJ0$H3&B6#gYD(fmV>sLw5cw_w*jwD)*pikl3 zWCv&jzuOg5KA)+bU^2oFy~z_a6Kw3av~P{Bws69F?sg)neCFLY|3GtgbgZ4`5}9Rr zZQnNQEmYgX*>5IYUfz#;vUUHBd#6S50a8f&M#}IaKFzeGWEE#}Qyb(vCyo^Edak2Q z1FW`z-1SuV^es@77x6825ADathCf@+#9G3c>jwNBbW`NNo0`-Fr{mH?m~L`%F5?A! 
z4|b%jsZOlZ*|Tr5d4<0R{N*;UPOPQ~FW_73QQ8w)T==xkV4YY8&{%-vYD^o>ehiq$ z?unJe{XO8So>S+8oH_@2u%*D(vsmZET9}vgJy@Ry`zY8U4we(E5|;+tzysF1fd*Uf z7qs6-kC`y$VS6#@y2`!RC|H)4^odjT6n7uJZv>C3N33(N;X}w!57$Cly$-Zb3Uc5d zL8FFE1ouPG1Olx~KNbQ!+=5Tje!CU&h!wKw9`{wANeCHbg(z+@Jk6{SVjUr&LVOtB z!?g%5=fo-{OAs=~3K7AB5VY_J$pJjtg8!s_x)t)M6;l7+geyD4vl}5-TOo>j7@k&E z2(iu`kiP&QhIhGwDGIt9GGd(1@9#!PM=L~ak3i7I3ekkh1njp$@}+Q@@=U1V+%(?$o~*;s*qB}|I09&j1J?JZanDl;FSe)oW@c_!3T zV7q&;Wx;mvU?X6=SuCGrGc#<59~(gV^j#YIfE#b5%VF)|^mqu^t?AZ3P}D z#2agp`6RrDxH+%kZ`KF!*baK(*TfV@_2J>)Hz!qVd6kW=z`^5R-tq{odBSSY2PZYj zPv_%9@5J%DUvXx5xbIs@O@qBXdG3gIfaXc)?MGWwZwd5vzo6Gd5)({c54Hu{mhyzL zC0MzFNu~+yQLQy7UQh5^M2h(z#v2rinZ^X+)P1b^He7T|QjNyk-8I9Tk8aBQ0J7M< zO;dc#9o=}Ea=dFJKfh}d@V{wqNlGo+oMwvR5SrC*#u<(LNQuRti`*5p{ z688e#*RW|=s)u(H(V`m7rAeaZx&z&&nvBIDScYQu!f9#}SH_C@AY3^+sd}BrFa2rK`J{7*)yc(O7kE>TIO$Xs4)k;5cF`5ald6TUizF2)nN812(5{|qoEHrXpWl{Lt;un^n_{#52wg?8Nh9BqU+ZQl&d-OS8%=6^-nw1EkCk~VQw$Hwt^JVw9F zN^Pd&c~)iX!8R*+oZ#pE9cZZlmK-TF!%ZsZOAYV8y-0H9%J}H0{~*{7{awB*!00{2 z8Q}rDlFF1$G+77zrV@ebv7Rd4I6PczXHv5uoK-IjfS|%)r2KPYI-^{eF4)aHc9jS7o8t%k zrYM0L%rgvTn8B<@P<`xb0^l+^OPEx(J1gc!jHV>frS(Rq}8P5L3f`3bnn_8yQ`luhh4u^;hkI>>*IGaYz8WIh@ob4S@(#|6zV1ggFY{GR#> z+!ec>$MNc$P1$}1qxybDOjqhFOkZpok8OPmnKS}_v+C}p;pq&wbC(nNt0jWzafb+wgvvB!V0dEp6v6stDUU*TH0Q_GsuJ z)C`|SZ2~_hI%}+J-eMIu$Js`{v0V29+XR)ger>7bdkLx*mTD3|uZ>5QVzQz7)>7#` z6mLO=YID+3$@dCWtt{2`{Jb$9RSA<0)pwTaIbd6%q95jzrIPP;sM=VnoB4T*LlrX{ zN}4Q|(-}*pBHj{k3wRozX2>acD=|Z~kvGJCz+aX@l>fk0u8)aAPk3`XKK3Vb6F-aa z6BTATmh(PeFmO8Q5%_{8Oyjch&`E?+vffXBJXc^tfuyo~8c@(*otEv}{{7jOjj0FS-N10+9Y zlRr<}Qj%-Zx5PfS$>mQcxqMm(3wi8KPVS&dbb4OaWbgBvj{s^6XAn4XV4iKmj#o;Y z;VKcwSJHfHJ0h-TbVMS?V2|ddmW}i#u<9QzA zgRB$r?^`1IwjfTvMV2Q=M^y8>2!|vGgO^*c$xxe3J&!odK@y0)mnmeS^u22 zWA0?hZ(g$ua(02C3!tgkDogMjzc8>>@gM`F9;za7&e^(M1~pElO@LL{EFPRk zzja%Q^S#R(K-`4BzlfTuaIod zL!21DU*JrJ2zNP~++^k#Vycr&2Oln>Xe`1{F09T~-Z15}0BS@3info!2}k&*SVYgH zA{Z?5Dvwpr=P){NLQIoo#PVy3?ir2_b!Wp9IHq#5zLd72@Tlz=ZBPaEo-*?qj&+!l 
zzY0&6(L9Ma_+;)g4r%-R<~hKcfBKHKS`>pF*!jB8<@j}TD1Wf6>zhs6J@6qq_Dk!LHH467V|jHkekTv1oONr&UcQO+He6g zt1ORvi*U%d&hq4rxQ0#Bpg9(T=mkJ3kxuv@Q6io0=U}h%*i#|*Qs8Izr82ACX00m^HKdZR)G9>kbM~b4Vbv29SPdi=Lqm4;6<0r_YXmJAsY_hWlJRAJrK#a(GunM zgvZg27QHocK=E4GBk@v7pdH;=nyzJG~n2-(%d|IhNs z_W(TdMJ!Lesss&0O8}`SIy-zuPXYkuV%a>7*ON4Q@}$M+3Y!Bao%ng!wo72r_xjBf zu3&SPm_`U5r}aM`(9>468bYX+ZyWXmkK^?|vXghnxMV(dJ<}Mn3B*5ZdE|Q#9{ILg zo_M_v?F{mE8LR3~k~M*dzxtr5nPbW1zY8+?c4ALmLX;m-AuB2Y0Zk#ghWNRbN4|&Q zk#CpfnXd8oT$h|>s5g_|1xS~6o!Lt}Gc|UimNoN<(Xc7z3x;44T=!smc^r4x7=n4O zKvMxR%@7#Wnekd|Ar1UUQNwmr!r@%#)^cTCK$~_%we6ry@$<1~2~t_#=cktJYm8n` z6r5Z7hl5@A4&{qX)a}r)h+#;I`U(oFFBzv7?_u6Cv?4?bB1(=d)F6_6-UMJb( z%(!zE-6KPYWcRqqOhvFQNOnE(H(>W$PWgAiDc?dYGm-ko^uQT{9FAi5xn#bofLcO! zBY?%0NWS+Wl5deibdflRClj6rz*L~Jx@jpBJCx*)=;eickmTKn%9$x=-0>Cg1mk-X zG52GKcpPthv-kMT%@(7Zl&`_uO#B1b;Y(nGS%7W@q)RJrPd2US=@fpyMTTGFF8_LZ zE8pGNO2{-6o};|eTR**>=5qOXtFmbg*R8}&wLJ1g;F0fMmnRr|Om~P3M7LW}l>oJY zXexkd*lm_f{@swtCsA4j@rE)0*9JFF=AH^L>~ec0+tR4TGJ!Q3e>#^1-Q`KBFYGXTu8MDlHeNWR6E zNTi+{8b?$JqM#L}|2U^TJa-T`6Ps#`$Z{Y{cgcKdfUbmW z7Ht7bB;Nsuy+S9d=Gy=L1dk)QAltak za6ffLIgRTHEX2>%*b7)>GjA#)ju4;XMrl9hkIuIf@M{8pioXM*wX24(Yw(vxg!~KJ zy1pmyzXCVnI2#f7jchc+hbw_llLT;>uj(tzikMfOW?EEu9#pyvDeyI&4aeYvH?2BZ{06? zMFhBkx`9}lQ<~C=%La%09_Nd1ug6z~PRLh;p`1^^|FEUt_hH{yo^=#UA52_Rcb~JB z=aBm*%hw6)_@VA4;tjCvININ6dDS+7HtpC?V$0$C0Q=VE4S#bysc4XK=MF5v46vY< zfQlhTTk=!ba|9p2JYw7x2&TWqbOSSyMB06y##Ru25VMJ#@6A*meXe1U#k2*JqBE!V zi)XN706vP7D2M6|1f=2Cc5nri=DOxg5#1|$D{Y7!ViR0Nf=P(dHnJ4c6D}WPc||ej zB7g>4(3OBn(t9n!iwpOm}}|5Vgi?ApA!2ymbV8CA7X}sDNalx7(c5{%ljd06n=iB$o~i( z1MVCpGP$~sq|LcWSL35J3a|>xAxfK)?z+^I_2f7@kEemIaBtH6Bpl-=^zqewrXvX- zCT_G%Cf_novgIVZnl@d()V7>9#XqmzLCJ38!S6QN29k9dMH%1x%AM_0WUQOaBmwS3 zGA;E-v5uBh{>LGeZw>b4#T{@aZ3r%2xm2Zmd|8L-oWb}%2B5Pgl5+(_^63srL^0wV zB0of(tSCBXx*+Ot;=@>1%O(F3xa89gdm^4F@|zrP(Yjbwe0t7wg=`7_?v_ZtRS?Oy z&Jx8Fl?hKbFodKu1G9LTJKox5UuR$FNn2HhKy44wroATkCjXp7neL2>@H=fz4

_ z^#Gttfwt^Fx;pQ?GG7qqy%dBt@wF_(5#m$aXwRkKD*X8nBY(YsyL0!!M~alr_`<^& zfB5v4F0&qmy=jy}H7-G+b}b3iD=7cQ2+ToXKO5hQ_=kzthQ(wx-w_+zAwThbZG3m) zpCJBeU7ZBbvp2B`0sxv=z!d-<1)z;;6Luy}0z*76!a2*3zL;(dKb>DZOU)^NfU#`qlKF}P>JC{JaV;#7e5)anZ=;Cdi8m*F@H7Xbs`di& z6rET~ygKm%m|$$Lee|N01#|%LIDlmYZ^nKE5L?y=_pB{#d_Ur!Azqu6;`KybEPoF1 zZHU)q#oNb%rUyNHjt48S)-V6g4bb0%E?iP?W+AtuO_yr5ouf^=WH$C9!hXZFZ)m4v zC_)|X;J$TfE9be>$c_Zl!SdE6X3)?JyseoGSL(bdVl@t?kuA|7ehE@N|DbWU#E#G$ z2-Co*e7tAUTt$zgr!rpIb*d--GxS;~wXN9q@WiWLfKCUiyJSATd&u;H>_q_8ERlRK zK_p+q66GzQTRk5{HNmJO)xn&nLfVFRV7fZ$sRU1joQ{O(NR2amiM@KlJ1R|YlI!yM zGIqHYAb%zTA7Y0jlb+PGm>OX8nZTq-uuDSm0MaCA)+uKb)xO5h5WO?L!6(0F^S)S-HcJ=n& z2T?;n8pry8hSU3X0MwI>B-iuhvAqw_lM>|*!mN=N9)HK?SZKhD{8W#<_p;)OF)PVMp z6j-~rFmHj;KMKnO)R0V~ZAVOHBCOhI zd~?Whc&ynVL3$h^aXFh<_<+MwJ{$`@I7$-jwzI*SVz&Q95raGkIJE0w0q_1S=?|E+2=n zXesG|(VU3lmeCdi3c{5ukYxyqCz5HkdZ#|3N;MX;J4 zYF|5wMSG}wUmYdAJ@nDT{NO5(csOn0n8ZGK-Br}fS(hO;D}O;qwKEm62?czSgj(mv zJPBLl*Nw0@VT7I*45$B?RK3t;z!}Sw>CPK_AIWKw94w40jUzcAb9rxff7Ic$JvIZM zZeWWN!w(;3#!TcN{y<}j;MX-`77hKmM!bu!I6l=>zTKV;q9tuP^dP!HXvd*P!90!- z0mUFF%d{53V7e&M53y)Xb%yj~Q6hGb5N~LAN)#e4KzDzFBigB2bV_?~M=vA7Rnbcg5cg6ObRy=Iy84e_JsjoC6WFkG8+882LW% zh>59W(0Q+rFGii=@jq6K2!(R7$0KJSuuCfHV85M!Tf8v ze4jX6yhNmvz*Sf;%uALN;k%gt=A)!Enc@NWsC^ut$~+S16ETZ7!s3puJ#88}F^@w$ z&H?YE%EG@CM9l{Teu_oQ+?R%aSd?fPZHAtGX%mm8?b(>bnK6lSh>zkW#L)hQO)UIc z^6IJ<{EUC1iTl%_t4S2NjJ60>_zGaO3KatGN;zT4#uEQ3WCw};+`}~pzczj^S0-HF zfYS;K(6(X#>n|E0m7y5bnLPg5CKBLv5*;Eg$CGFner<{{d?g zCqA9{Q#M{=PZFou<7SJyiS4?c81R zIDk$(uEy2p%QyO$fBszrcFLCveugZ!7O7 zwTYi1ZYww~%P+Cs|A7l_;Y+J5ZaHypgZqT}XHEd@^B+J1a6dqerIyUMl{S@(<RYWd7Ahf zV$WkE;>4#Be~5URQ*j(RpTJ>s)VMa;;)a1k1&0+ZM-UQh#LHtIqfVL;x^; zjWr+kaE?$mH({z)e=tq(YkFpfJN}teuKDT1W554xtl18_INmpi8l1rj;P>ya zZTPkIuEAfXp^MxWzW-0|w71b*4&zD?(Pz6-Pd+yG6#whMP9Qa@jHt@Ac4@vRZYqzf z^Ha&UQ_09Lyc&IS{uSb2L?Zh9+78=^2Z?>3PMoG~6u-ymMEJ|U*@`Ndw_WAFb%YH; z`57R)tOWVWB0;{NuxUK5&QB}Bk?*FiVBP^kQ_2=%^d~=79;=_6WOmb5i!QxI+ix7P z)tv-@x9KP}0~6s6cXlvELan@g+5=bvqnYvnebOw^Cc`2!H)9tK4DG_Zb**Spzi2V` 
z#E`M(U3eHG-^gll|!w3e_(g= zxH>;k8zZ)odCiL5PRt*ORqv7!8#AmPu@%j$dTLH;wUUpbR;zeCzo%nUu-x%<)mAod zShX8y9FE%G0laS2%2yt>^8JO)=5cj?UaYpVdDE)hMa)^$s(1NNs}pMc(pEC-v`3rB8c%ojRWYfDhKxBfZH(DqJFyY$2s-g6G3)VZ^1X_$Lfpw$ z#RS!pytA*WNqzsf(fiAci8K3rCiUGnM(;g5+H8WnCKE{sD^MXN{0Cda?<}cK$cLl)Q%j5WBtC+0l>{~@p4r+CR zn8@!JtNrO;P6?S->t;emq4i(FmszdyRYI$LiR8V9$JP1$Vy!;2)N0*F%-?8L-wI+6 z@HoEKs^+;fNso0%E)!f1Rs}vx?B&DVo%0TUKhD)?9#oI;yZ6eplZ|xZ3@&jAm$%N6 zHouAJ!=T6C^qZAd#{n?s&>`K0u!ndYUxzPfR#*Wa5>yuf+K)o~{vQI&g~ZiN!g{}X z-g0~jt^pj{Cg$?H-fen`PFRkh)K@oYkgT#K#{o8mM5}kf{}+ikYM732ykI%L0@n-< zO>bTEXJ1MPCnZmBYMOLNR$G#90k(uh({Ay9k>ug1W#+)K#&Vni*9HzvhllxnevFV{ z^u)oqdW2kVvLJcUB{4q%YzK+PvM}~o9HRId@ zqaw&bNiUD%t7TsDkbRt&O!FRG%{aIkaeURyLuX34CnzcVEJ_wmPf9Ko%&R5MJV@Tl z*}fDydXQU)%Sc#=tmW}NjXi@si)la4!j@voFzsIXD6lwFu>_7G1vc#*^$46iOex1d z(Ypx!{_vTk8%x#Zmwy2+W+PFeBG_|0UVwiQwu0ZiD{~j?X5TVipQ!n3tK1|To*VTT zNopdzKr){ZYVGnAg{vD=VI>?A_I5ZuIXN}>D51p&)&DepswK}85>^b;ex~E>Q&`Tr z1QRBXP1Y!GbcP#KoVdsFr&#=V-2cm$f+bTUB$(c$yoaC9TfV3HJp|wXT)v<~I0iz2 zN?=Q9C}9a(HgJ6<@k`KsfLHVL0e-3vyW!e@Ew?hQZ*QI3|5cBTd=d`TQvaj)!_-oE zJ!$#^*az`HjJdT06`jWKK=$53PC6U&w|O^wf4<~O!KV(7c;>eenFTV-jC=x0P3T%s z_8;h`2jPybGY9K9lDxh|KL0G4^6h~=g{L@q!kNk>@qygZ{4|kdVxESZ5QqDpR?G_? 
ztighlIO{$|V;$N9mw>`QsExt);U zXD(p6nM4mrM_Nxp=Ly+p59R=(;3+;tK1RV+=tmLuIJP2uya0=9>{OoX%Xhf-ySS1< z38B=bEk-UMGIB+M71KtBfNOGH4ty+gLH#R(&Bf0MIfYro)y4MJc|KXG^#Sv{hyj`+ z?4F?)iWkVocO24Im)g-5G<#_jDc?`#+*ZAgM%|Ez^nD7b|F1WDXG)t~fe*JB@#v5d zj~94++K71&X+r6LZC^tF#KE_t86Cx!;C}*(U7Y*ie0K`>%KtpXps?#FgxCTeEHh#u zWHnLM0mdw4C`={n9{gIa z;%m=xm9H`;LC=8I0W6cBE!3+I=+)l4dE?6DJKHd>Zbim-9eO_`oJdRpLFb0M1XUo} z9Bxu5x#EBWz_KY|?T70*2ap6mkJtLmeZWAad`RwRhq*(qk=V@8FQ$z~e|{)e&?Y(` z_7-}!gi91oE|r$>qTf8=X7>FI#h+xZs;q>pfM!+rm9ohdYbskh*`}Ow8UG-3IgARe zjzumoX0P^}n+a3p@tVyUZqhWuZpEJvo^qf_)ycQ;YiYD8q`^RrQZ&~@=T;m;S^VrX zW*fgxBd(0?!;yYAj#$MNAqI$9h6P^c|7s+*B-!-;!v9W3su8z2{9fVYis776$%Sgr zi3B=vJ7FrzNy3)W35C_hc0lQtC3yS=zqyUDXeTz&3H7rEHUocDc|(-TC*|3|rJSYV zD4$N8l&e;UQLeZVnBu(3Lo0d9w$1e=Jn0#HII1RrwXi#AC<}=ie!t4~HOH@~x<`?0 z8_9I17$scJPm{+jKH4qrW!{k-2~fJOwlX~J`rqm z?rq3$$D{!-GMdRWL@VAHRSh6i#dUY7iqC*V%VrXBs<;OBAQe|!D@<|Tiu+#Rt=%@+ z=kSapnMS4&b}fF*J^6oT?ltOyb2t{Ae8))ku7_|rkM+Mi+yGERxR&Inn0$@FXhT^? z^LqCyBp6B0xx2neV+60aI`)!p9XbSSg588)BPjn2Mo_-KnC`dYk6;6n#R%SLWo$;q z8DumCBN-btPMaMa?epqrXy#D-0pN7Zd7XUExFa7k@b%qU6P{8$xm1QIE=-vDtH2gOF;DqK&rwqK0t`s3U?!h(C{rEI=czh)-!8pW!aH)_}&~*CLYt zM;4KMow4W|+5^{qh@*>gIgbOZq0!)|n6rBfWdUpF=#TAdOfbXa2xzY0=lPE~3D$z^ zPs0JmtsQ|Hx81bA>rNd;&9mT|nRG^LtkpalZEMi1;l2Xvh+l=1{~8J>-)QV(D5HfF zp9Ue(Zz6j4%JG$czR<;$F&Y_%k#Qv$$#_o-Zl9y0NnRZ-OhDdb0det;k;7OiIUmwWb$>%UHx^xu7%A~LKo1OqAW-!2?5hLI*?3=t|E^IA`=wU_N1LoxncVIez>19R8 ze;p#^yBa$YD}qUH5}lE5Z91;>o2#sdkpO>1gn*;4la~;YhKN2^#DiegX3zuTJ7T@9 z2>EY7gnZ+$uRJ2O>3F7F8*_Yx-}D2R%9sTJJ{))aTbjOBsA7IcsL01+KVCv;_Da8L z0;V9BA8le4yEz+JC-Vt_LWnqmX}1d9&zsr_#O+gHU?NWbz-^l0*K(Z7eEyjE+!g2M za2f~cnCaivBruPg+VBwJpAp_&;qZP0)hF>*%@yW2+g=L`yARlLU_CADGhjImtWMs= z*3QfU*4)CD06PJ!mm}^B;+!I9flhQry1hBhsI?-T+NG$E_oci2Yl%;DdN;la8SZ|j zvq2mD+WqB!mfc^z0G2~%;_Uvp-*V8}NnZX`Au~z;ue7C5Fua& z_V*=3WFx$q6|oA;>xj^F>y6d4BIKWj2z)_vGY!9AQbb3ShKL#fHMY|MJ`s0p+dEZ^ z9?PJNaQKenYhW(79TM01p=@$eb`6%u;)pl4bLdemK#Wd`ZYWYY9z-t{i?~$Ov4mcrC)lP`0>9A{~QX=I5DfvxD_-3xC+%d56Z 
zE^pBJAMr#y9l|1(m|ZaR=rm4tmp$ly>@AD;)dSWX(D^ov$-|GB6MEw~zNU`vFtIBz z`Vi5+F=7RhtG>1u83XsaKbLw!)t~2Tlm|lrdHMp;H z-*WsM@4~kUbPTNmzmA|&u;Q`&y2vJwAb|vBY=Q`&5hT$0^A_y$m;{+5C~aePJn2TP zjwVyFQ#SUf6%!yf)yC@d(4APF9B#u-6YG4;IvVUJxGcfi&-XJ%E7{L=?>8O$fZ$8* z=lWoBd2q!^dYmK~8Sb+lBf*u!Ulu<3f4zzqD)G(2>{&SIya#9(%X3TY*?|r|9tdFPl_!d4^b?m1)yht4-fDqeJaXEQO({kWBsM|CkAP zurD`LXD+PItmS@F39u&JJc7#ENtbsrwTd&rTF3FVF;%S)aU~)|C%@T2{;A|B$rW;8 z(jlM9sVFQ#yR-t`dz4- zd+q)Ma__k(@DR`36NlTUxb%r+<`3K^5kL7-5XK9H9k5D=0~-Tf%C8`mry-t8DA}I6X?p=iv+qV&d1KjB=Av{{KRSqb)Kkr(GWU^ z>?`ApSBE%0zbRzJimV`FCy;a@rWYwHV7Q%k6s}NpSk@hv`AtD9R#(v8h|M5=5vK9e z{L3A~oMj2-GMhk$g~BAzLE#=u&+^Ahkg&{e3fly_qV*wxE@t;)`INxJAA!H9jn&0! zB(XtVsW{e}0ydZ{m9vXw5?sUv>*6$uU|pCNV+CVk1H=}$!gatXf^Z!!m^P+1vGIh@ zS?V{%bdyz_g!!2~yE5FX*!6&Zf=m}6`O9*lkdL7>g&>Q$06Ag@2_e(~8j5Y78QDrFsl26(b^t>-$+^HDm?pu4$-Fg7qK#0e^S@h_Vg3Oo93S63C)`V&WR?Igy${s8Ngq9h&Mxg z+!Cwpge6|%5W5GOc9!>3c>i9_-PDO!odZn(-W;&bM;^Zhyd}fECf^SDOG_&M5lH2G z3Dc{(F=CyWtpAnx3yb+4%-0t4vc;&99w81U@(gc{2NTpyudemQySdl-RFZi2e&2%$ z<^+N4tUkq!n+O|!;5OgjCtr&EUDI9P*Z4F)zM)}jhI`4rgWzxRYpCTvhCun&>il2+ z61@tb@T3L(0O+Izy#i2}c)jw0`3{U$l&<%EnasM@hp~!a`0H%X&G{K&GM|dB|6}xE zB5HLqxf44BiB3WCzsxB}z9+GAFhCyroM!G_?P;4_*Z6)UpAF`JSk;*1nI!*#SZDlC z5*#6UkiZc3Jq;?A{2$S4`Ci9-ki;ARoTqtvy2bnm=9I;}VKMP46#%mzOi-oLfuTQY zby#=`ON&t(KcogOLDYV1)qY3t_6+xMa5upp*~Iea zkXXLAvAUNK8(@ZhYB9fn`O#u_TTDE$K_8$`tXds92BB65j#XH_7`2(1yy4u6Jx%P6 z4ELaMAF-d=#PWYZV)@>|>R&=^<`cZPJi7n?3g~AGde?&DsSN;%SOG;@njauQhnEMi z?f99WDe_n6TB3V^HQ0W(>1|e=+G^0Ic>2QI8W?{i8f<&Y^Jt;(CbKi8FLL&2KZCZ@ z5|;p6&qI6&;vJS)ZS^fNBkH_gLQ8e1<^3F9-T#;7{=YeXO@_C@I_tj}-~;qTM=N$) zv&)joUkg(CboyxsS&Ud`FAtIN9g8UmrlG~?^dn4+5C`K2vm1=F{&i*>%(&~k_A1uM zs{RHw-k^Fg z&JrJoxB}1T3~b!MnWZ6!-vq32{~hqV8Sd4LRcO{(Qu%*Jtb978=sOfNNj+knIpl+D zgTpd2>%^?-Cp(WSUdH}^8!g?R5FC5HvQi8vin6gPe%3it!H z*@&NfDe?!>UEf>y)Tux8{LdbG#g=WDH)b;py7c}5;w-(Ak5h{2afx1q5V+N1{smKz zUJ0W^iZJmy7k2mh3G4zYrjwuKm3(Awa-Pi!t=DFam_pwO^Qc z0zxQU28IHp1Zx7SP9ja1H?Ubun58sUp$clNK^qg=tjF{o$RXHu7=QIKtkBM0B&u1; 
zJuM^VIZNCE?98DTc)Rda%yP@Cwpp~X2L;VWh>OBD47`yB59u?E&l$#>60>6e#{&R?xPX~~`aK)R58lUGaCLurLUxbngqyvaR@k$n9f~>OY z>!7|CiL@ne#u|eej(Mu$?wTZe!D8BgnN5$iCBKC=A$|mQgPwomuc628G6A&Of*Js- zO(JclTd}4Dj>2LSg%G#KV%mcVkx08~Hr9-Iy)^5oth=(f?7s*msFMCg>E5TLy5>KR zy-b5jD*t%|$hQR>08zY>1`zO)#Uvu27!?&JVlnY5>I3tTuK${7O#sy;k+$M(SZflE z#bSFJLG^>SryarEMNhRAZ^zmYe>E1{(@ZdrfT5o$Wx$joPjs?RPYx!Thb^HROVg+= zcL#O_Wa-!pjKAgtl|4U~$1SD`m>Wr?9aWeP#AjemBDVt$ zM*HsnEGC!=s2qv3%k9EC0=OnlqD&Gkv7oA8ZYGg-Ibk{xKPgV42$(0qXpa5OT-wJR z)A`~x>+)>Fq2jPi%QD%@mUM<0lI~rYC9FEJD_W^i%LpQp8=zuo})wi>8TDUZ(?(- zX!#QnEuT)tW8iVVW;wtw{53gPb1cI6ge)i-0VOR+Cu4!)^)v)#E*Rh`wZN?8xgSl& zC&b5Oyo+EF zRhwr8v;_0AN5BpQEVKgDwwAVyu7IGq5wd$?1Q^C`ffbMjU@Akf$qKlN*qgD~A&4O0 zZkwbonAIfF_WV9Jh4|P`1{ktMy8l;M+5&hLfc9nqZUGQ0Dg@wOE2;wVRj7(~#r;@s z;-_J;Rq-L}9xJLLm>0>Rt$ZJLt3+x4_m~;Vbm5KxpSU@M@*AcHRSMk&1BnS~d zB3rFQS=sgC* z2-OV+t>R$-zm8EHh^b5{=CO~*v#ns^10>ew^`R&6 z3uJr|^9l}-INjzAjsW;wOkzL8QKc%CFYlNcKaT@8vGC(atj*>VPvSKYyd0C5`)xBG zoR13k0~Z;@_-ki7?g6d^^a{q`+)`x%9A_1e0{DH5-~fW6J(S>3tYC&H!wMFD0)n-9 zeXjfe=%jj`jBj93B4=!Q=3s(NJOJSKnAxml6&0$)ISBPW)@r!M_E`9BB-S$hBHCfj zpJpA1^;i_B#D0h-THqjnI{<3geHjIU+W=r=3@!)3lfeOTcFr^Xcm@Q_57-i7b=W$A zWnl?^{-eu;$NXl}cS(;3pdIgf0NU+-#-54;nDeOLTxSz#r~85UBgFoOJsl@LjrePc z*M2aO_;W2TRCp7wYdXs(TJn~t%A{n`#{@X-H+&ba(mJ_X@Op^SxZ|Sprde-yX+}ue z9>)xP+xh(_VtD9N%!G;j!ylc5Zp6>=%y$zauHkrg48a5NX}QZckqXOq0*g+GZ1}ce zR!`9R7MTbo?hjBsngsBlygf1jhA!FxbtvR{tPZJt5cp_En$i(1*`2l{Gl@}01`-qP zNQTo9CM$7f2n~#5ih5Wy-R<3AeCpj41WlrM9}zeVpL*x`=$(9DVbR|0BFWnqdgpX5 zK<9>f_>032#EihFn`LK9{*f`)c6=6l4uaT8mHmj{3eNil{<9&tp|c z?W4e_*`aPig|xQ>-zAyGdIT{l(MW=#C7S4z$W=QAzp8O7$tP2dPXUj{r)tP|E!B|k zYb;ugJ@DZt9Db^CH8@-Cpz)=<3C7}t$FKeVC#s>n{%>qG1hLh~q8c5ES2eCD z{@<8tgh&!is%m6<9y<$w$A0i>Y?jWMk;#K|AROeg%R z#x#;&OEq$EcEzV^$ag)}kndY8T8;g1e-KlRoZzE;H#b0A4pTSXP0$@5JkFl}Gu6;` zeja-*PBqe~MmHO;9q||9wHqd|MmNQYPawXhjSmh2@GAiAg2@0j#{pzL>^D6CR6|gZ z_l4my$A^-814l}VbhjGierwj(R$tEDBb@J2 zO3t5T`Vm*%_Al6p7h6ej3OhoY53%T18hym?B_V{v`7R|NQ>m}Ts&5Oy(l2uwJF3r$ 
z{uekvVVcZ0K7$=2_P;vd2iw9XQ}2(EOFjS1=K4f8W8P#2KywRF zvZDZ$NGo09a$@2GjASb-P8A9xPSyDd`}F@17XsU&C18!0#{Ww;IaT3D>@C0=+t0B{ z%w6wi;lBxgaAk72;6pWfKGLP9X#+zgW>6*!eHyx((Me`duWxzD=7&a}FFESuY*e&) zzGkJfNxlC>kHyJ743=>YWEXQ z)Am`IC+QV{YXDY#j!{q5Mr&X7{1WrlKZlvq)|O;0@n2JtpNank+d{0TB*DBjvArTL z!s{HSy;b}?B=u1&fEfr0Ql;?>x~xnA{^yTX&N>V|n+PJGz@{wjwXU-H?h%fL2vegzYM__~@pR_r5Seh#8mm)w7` zgMj{9ZL~_=O>HZ74J3^bD|`}(f4QXC?xvml{${cniX znoqut_sYx6>tr5=USpU89v^buUp8wq4hMW zHzjwUQlq=6Z#()Dm?m^ox$|THkT?FQ^e~NVN4J7Wq@yZKQS2`;@kgb5&@{9ieG<@F zI;wL@KI|}{|Er@EpN{r4C9K$&;b@9j;R|BtFDbUCNpZwpZT#B+C81WmD~A1hNwqyp zDOYUc-C#NlcQ?4x#8<;kGqR7c(RnX6r@{lea!#3ZpWl>p@)`e5pvk0@zT#LS8)N+9 z^fYB`y1igJkxo0*5C4@830YX8-$Zzs(PWXg^mn{y#6{A1*IHLdw5WfB#D6Kx5L_VX zZL%RrwL%Udqz4>YYiIwzIf8x6aY)Kq61}qA3lc4Xzag3U|0SfaNx0u{%3F?4;OGm7 zrkD1@|Kwn^F^Nt|`k6FH%2|>eNCrTnQBQ*8XGSmXD1yIGk^ZJ592H#-a{`XRaA+t~ zuzYcNa=%IGR34%MCLN**5a~j25~9QbPG_8Naa6(tXMPWGHv1^n_4SYA>t`x^xbj`# z3V+!>xkR751byNV^fr||1O+b;IK6ih_i}OJjy>2szT?o6e=%q8@4t1veIz-gtK2}7 z+ObBL&Rx26EzosBm#R=+&TLcvZsg8MMJjHGO{FpZI`v3Z zJwkkdG*G%*6*z0_$ge9@EM1N19`AHh@oskH7jN%FxN{=;;4G^L)*U<`v*#fX@Ry&E2SgrLl6L3e)@) z7P^;@0}xl7-xYUVh8t6cxJ3LZ7XK5^h{;zLOJ?kA60fgC`k9~QE#K4p?t?F%_>^CS z*Fz{!s(>_yVWrlJm0r7(By{%S4w$E{%A!{d7$@~P2dd^g=o{9S6lCa zt{CB+F=oyuGKVBkzCD?wo-Rm1+EK}%1By;yqkRmUVKKSXu;S&*On&{7N) zp2hP}g)%OD!%_aJ{@>x?Vr{;HqYl*-z6Mq=+@~+!aP%9nH)k?>#TK^CA}qnryPfCr zi3DIgOxX+U&0+4y>!s}7jQm;JX!M_=Z8L46tA*7^M}u%$-{ewh9YcQex0~7bD`EA? zTvJ&IDU9MG;W>knOEn<7dnF)iI$8g+%zpye0<$1>utvGe!E~7aAxs0nj$yvcaFbt7 z*g5>A5hj0Ab^-Y`4I5(mo)$znAGQ6LI2OFqfe-t+arJ0dToGc(?!1~An#+d|h^2nM zBRFe~F!@bw0_%kv^h>UoM}F2kzd25<3iC0s8V1$_Xqh3esHSC+a#|}K^qWTFFdAyOgZLIj;{$- zIW&jXQx1i;z*<7=V5jg$tJ{p~$~Q#iZHB45tOBRJX9!ezHHyb5uL8$niu0D&H!H!MCRWu5 z6Prpkni5wp+;IjV)p|PAvud@~%1ZQ`2zN^+E95snSdAO#+oxz0tQqz_{&J+2 zKiJ>(Nm^S{XYi9B&b4+0KNB6k!6tpS-+Ta0_hVV$uBUIr1#<(bQ$&d`1`jcFLVmL! 
zFwe>Q1-_B~O30O1I{+M{9iHa=)W;wW6080$qrVmCZ!6-ormK*|hxOyJdkqgiFeo{{ zir_iw4KWA6mnYXAp7`C@mH{Nb3)TV-H01eM#~)$5YbOL_} ze{^u0Q-GHE7~7y}1+gkP10q%Ms3q7+XFr2L9cYam!LQ28KZnZ8Hw^0lK7QqgnglBU zAvhIpCAga?o^S)P_TZ8Y4pl(a^UdZ%2?T3qX+a#NO)5KK66u{;yeMcsCRUYxnb^uy zx(#vl!ZQZ(br#X0COR`~RBJbpj+{vdEh*e2i(Z_#>10qg+tPfd3w~uF;GbLxzG_1L2 z(Ty|-^fdxpecgk$9QvvfwZpdKS6}6yPhaH=U|lcS*HI>&zHYP9UjsJ<>EMFS*4ZOQ zdkF123D(3v$izR&#Me&N2Ww4ioQdyzo=KnT-eE^4w+t-wF2)`IE0Xtt-O`=N+we2&2qp94EHmzaAI zcz0V~wav1;FI!${On<$ZTBG(C#8=_@;=23YH^@)r^9z7g{*HhT(-S|yj@Y~SQ|XEP z%k7joOWcK7GSa+fMJc8#R}x^(8|&`lG_serqR^===%Z1jW3YjU8HDjyU*#P-H=Alq z+1AD(_VdwT#&93ta)74*?#|c?I}y7IP;{P0#Uwh*ZM2y)%WvKXr_SsLcT0x5iaTrk zU!kEaglZ_pRSTyXhefYiy_u_>A&)Ni<&fyZZEL{+X{?R0K{t0(^b^NJ=j+{HP+UfW zd5t)&fzGUfMAms%xU~iv!`n0=eAl4ll1(Hy3rlnak1@yDr(U(Ve&F(f>uzx^z_oJL zP7G|U$p-d@g^dDM5LizOYXj_x+$HD9cD2a@_Bt??r#oFbLV2XJ8}=rCZ4dIVh&$Hq zSHVY1PL>E4U$YLJ##p#&46(S7lZLT9&KPa5+OI+TWoYjOpk8>+$mEKX z^HwNe=Ad~!;pND2J2`qX6_jHVHk|6lp9<+Ffi#f&xk>=jRa3C8z`Dn)?09pW3Huf>RkjZ*k5XBUV-IXIepN{R8dONW3D{^TJ&Pc> zK-nnY3PwG?6HG69EX-tV1Q>c8bXHDuDBQ_3!5p9IH(RWTK>&{-LbKv3EDOJeSN_#; zhxh6{obK=jOr{ktTrI?lD?^{mnQ#g9S*TxT)z1ag2lWD7kBxNI=Sk#>&oFZkztoB! 
z0q#S@Yg*_@k>{)k`D-JhcM#__SjHtpq$6Uv6>&G9euxm!jo8>rikN6RB4Q;tAdPi7 zXvT8arJxTO?{aUGIX=OxAdr)+Pl1yrGVuJNZ033V+6V4s`jlq+42D3{rzefQ;!U4x zOd8W?6)>VwM7MODOZ5UiO{uF9^ihU8fpv~sjlUAH^1l>!G{@yBvOAjNqdvZE-_;tS zjAjjFnx7)8OLdjtTI_n%#GRiM=|wP_pNqi^pt!=^gayFFUy+kc0=So*4l#W)0Der5 zw6+FdFIf@t*GGhW*I_qYLPQp14_Oh9ff^3SlrBiYLJgp$fv>j$H>P{!m=Y7Yohs zo5ula5^Vrj9U$@11EJA)NfqDd0g)0ZVMBl&3LmL$!U} zER|%u15@I2oB~!);{Ce$-pM@$q|v)4Em(*Drhs+upMp)#i_P6Uk2AhcSd31{&B5q& zd5Q*@uVOtRi#sP6-)Al6 zJeZKhykRk#6Tyon#Vve9!V35s&`AX7PJc8u$CAl^2W0ZC!+KvrKnSjo#rzBAE{l27 zVi3R^0swnwr615-K%94-)7sd6?m=xTpusklgPJpgLw6*YyR6s)B)Ao^x(UA;n~z^- zEcsJ7W63uii=MIiFtOU`gN5EhmVY`Qj{vC2sFQxJxRY^|Gf&PU;=<-NI*u>hEQCv| zNL(!${{RAR!{*TuXTHUbe;VVz0FZjGbIjHBUT2s)urV>c577I&iPc=$#8y3(-s==O z9$SRJJRI_G$<<6@HgNq&!w)fG9N zSgmng4=2XN8e;Fau{uLfBv$9=IXXVfx4|l>Zh>=V@G%J+Jc8hH1nUAj7n>AA93ghG zjn#?l8e(-Cn~yDs;SCY{0I?ccon>;Iixu_HJB}0LjAKR|-$XOzq&rL^n?zzAiWXvv zV-g2Qe6x+!g>5{sx~Sca-4wkSiq2?1!MECAU4SMKtc%aR*p!&w`-r{8#_D`CnOL1| z?#CXC5n+g(W@ELbXArCHJh+&Dro|BFunMNyV4Yd6C0OT`hh#Ct5n^vAR%O*C`0Kc5 zfJt#GI9a}6O!==hx7oxZyN<-VmOp|miAfwH@eCWQ{rnbUwUs}H&9t#><<6YSBzC%u z)%JZWvD&)B+W%+81P2J7X@j+IPbFA;_W!UuV`2@lci326`>!Wf*Zn6IA0yaD?486G zN7ON5zs-PK%Vioiiw68DYG}qbiOYG}>KHfFLE5zMtF1Y0iqFFCB7ye(Oq=K*w)j&E zS>r3-aPFrwZ|8#tmNy4%XAkcu@TSui6tNn+SmLFYcsj&OE%AAXH(t&C@!XI@9Dq0g zSeuk~wrTWQyV}#(LW_x(K8M)xHdg0?8;I4J;92YmJ@=-)Kf3n)1dk(FCECu;H7 zvu6;TfnPhC{BN_P$+rxfOG59CMvmMKP0x)?uzVlVa0)){q%*N=@K+|kUkxqky`0Oj zMG(f6)tN*FaT6U7Uy}!>o5eh5F)C z*gcmJ5Q6MFE8q(-JuGIW#l#cf2XieL!W-*akk-w;4%|%+9-81E{G+E0Lf7!+@C52M zLV=Sf;^YrkNpn4Z;#1`BIL`ImgipitG0dmwl_qisyAi+Ul>BWGDBmjV-uRuO|0o<_ zL0Jm90M zp0|HCB%wp7p|hcbbWi~m79@y(Aib)g3JM6Q2qNrKq)Ug;goI`R?0^L^AU14>y+KeE zu}~E(2=Di~_iS>rZ2bQ|FCWKb=gc$n%$zyro^3Z8>%rJGt$!`0`LI)mWBsa^p0R!@r}N5VyFyLfTQ>x8+#9e; z`3>OT$V6AV8AvbdRQl_2D&3RFBRGrJpdqT#*J6%=>0~imEhbteg8@K&EP)e%&O*Ss zd;;okok`ydXEV`hg)D$TM75l5_mfONi#Y>kn8j#?2oq5WVS*-;`M=E)_!-bS2j8=#+R}x_UQvls=349CaBm{JAo{O}$&ZKXN zGwHS=ORY1{*6UOt#M7{V#ry!Kv&C$;m@B4$_$C18MoZu@pi>ag^?p9m#yXQefHUcy 
zN0#MAfHQxx#rz27R*QMTVj>9yb>s>FYHJDn0_b-L=n%00xyd?{z8}t{(|TSGfv7z| zHQQOtIWTuwjFz)75mVrPu{4t=yx9^s0_ZdZbTCd7Q zVvd99W-%{WOwI~We^nr3EME$+1bzkd2LyE5S%TbRok>3cXVPgIra~ZE1=2{Slg0cF zW~9Yv848o50=$Rg5J&;k#S-`p(0K^x)UyofY@JDeJIOX>0~#vVRI$Ysu~B zr*>2yJKqcGeCl=Lz0MiJPr_IK#AhkwI2Q0ckK^BQ+|)YO*nI2wY3o=SylEZt!H>{w zgSfIin&CLkB#~=vB0jKxlSnGKmB=;Lr}Q)Nxl;T8GYmX>C0l1&{5WfFL2-Z$0im@au9TfU_1B6c1>TK-llFn5YC; z0XJ9zCz<~X5YT~iEpol}C4CCMf>LTjS#Jq=u7FN9X%J{-LFWOD0VIqz6k)v0q{D*K zOJG`B0(l^P5du0Ju0xWnFX`v@WdE0PC$h;B@HC7|fNcC0lo!wv2neGW5rm0WffO)R zErC;z{u2T^aHb(utS{*Y^<@5~)P}Oz5{RsTA7|Aq=mMZ|5D-QiiZIbCkjZ{h4UC#Q zAEdA9;jWq;C_*%MO~xu2Zo?U5ur7l-guINTsVtm7}Fs{emCOpS>%&!zfN_7X*jF}h{PJ|XkC*7qK?&DFyvv@ zQ93ZpXRg9d=R?`u0+}7c@98Elt7_-xRbfXlms)MK)Ton^E~!@6$r8YmrRMG}21!e((bWsuJCG1~EA_E1$B!baOCK$a%{ERLO!G0$K@37o9f+!v0+( zG9v>ypT$%I(*|<=Q5RY)1-GcgvE2xbX!Uefhnpw|F=ghW;$1Sk%WUfpU2=l~y1_b#Il zP3V1`7V$Ws$Kd_UuJCVmcempM#MBGXWp!hLQv^PZgdJ_9+f&FeuMOM+;D-@`Qz8Oo zQu;8yDR{$pybUb;+XU85`ffNdanBGlcrGHaA#i>Ryj~dt)Xw;x2lzXfKO%6M^h-WV zxFxvHA_%8N5X^8NXYcG+JHLHg48}4T`~dk5N4|LX4GS*U?j}O^XRNc@*uUlRbr4^t ztuKwe&6svT`8tNLJopOg-H))d6r7#2fQkV20no1ZA@W0%;0F7%*1jJ02e8+UC;K05 z0Iz>P_J3fn)zlXI@4DL8|H49g7b%4rwAE)s6Z}6&| z@SjjQZG6G6`Cqtn`6*k`h|*=i^k0CeogM5CUcc4ubpkmHQT}s|Ry?&cz5t*8ygOhn zT6=9g=dstO^DA=xO8a0a{jPwyYysMEE&$MGa|*d|B_Nc9{UvL!ed15-wMU#q{)uA0 zcLX1~#=bNOw8wr7nQH}nityUHdzUKRX|@;=E}5DpMCO?QRq_r16DWCj+}UCyO6-q# ztRDK_1{QuCfweL|a{{|@%jhfQWg`Ly2>hK5+zDW*h`?$#FA+79_py5A2OC)U`v|PY z{lXJicSfs#WUYT?Y;bhU0y_g74^R#HjR&aPBvy(boN+gw_y&hMpdIutrbN5vAIRk> zQCt?2+_WGDnU+m2_ z*;Drl4>h%J#R_s+M4^)Zk~;aJ*s8&w!6orjA`j|To!marUA4kDuhFgfe#3@wzijRv z#{c-I-+%RBTY}Y8SGU+nj0=0pgB@wHO8X@GqD+W?5QN7M<%3&jKSHVgldgiN4amo& zWhLR}ke!tK*f4$&yc%El3CyN^9!A0|z9xifA>sDf&4z5!n9h#@hO2M`nW%SaV;H`z zi!wQcO}Gl9hQWw`oaM}?=*h%CgrE~1nj&GGrYL})ec-}X3Iqqa zjNU+5h4FDXJ9WL=wcp{a4i3r#^xhkWZ#jifnI2mjJNJgkxXWkuTl|CIFMwCCe2?h% zj#2RLtzky+fvC4Ea2mkN05bs{LmK@DFa_Y77B?4M-W%M^kAu7VKX94D`6x9wxF_gU zo|pua+F)vI{@~kIsqCBe=h}otUYq6R2mWftQ$|xLK96}9yQ;QI>)UXmBH#~*Zr`b! 
zWscx?eO>=LelPC&aPKff*$-H(D)bmwRp&?KTHWsbzrZ2N`@RLMN>2k;g?>Vs{U3o- zz`o~6ZovR|1-u5i3=;Z~v1-J3=KV{$SlQ^T(F(`c+l=%zy9^`~XCF8K6HpdkVXq8y zbdbI7vaAY(?G5(NTH>0fU2Ql>G5!nE9NVm<^{gziWC6g>*kH=~Yl10bUE~z3Wkj$L zJ@GvDD!ZofpT2G)KVz@Ot|B@4@kWbZbKZJIP~+r6oI z4yP4pn58&=tTnZ#yG zo3wboNzm*dNOcIVAxP_H9C~4W5=n+J4d;Lh5On0Pyf{*1uIJt{cbgY%l~m$ysFKR7 zE48Ym`;z9E5~{SzR%tx;m#EUOI+gzhK)3S$k1}~~S~Ak?w1n5nL;T2|UWYV#z*= zj~gK?{O`z3U{zhcz~X!ZGJ7sZnU^iwEr8--tL*Z?wj-eEWJj7;E!inxVj-(T@;Rh5 znExOfblAF$H`*jN?cU^?dz+ZuI8)=Tz}f8V#`9!0I!aUBV-oA1YuvDe`^TMgHryDK zSm*i1bw6%wUb7Whhwrw{n18XnfavC9o`%a2Y>%zT6M*tk5tWb^*G=bKD>BNY+lpuj z=cOWAzUL9$zgU~sjmU7V|65TuvYoE4&t^}>JtobvU5}Gw=1&g(M!ErxPWB$N(URQ; zCLdbb_q4=|C_r*{Q2c;n7&uxw`nDgav@kpDue0Lo(fpJd0F&6aE$KH5Q6_;b;;gi9D*TD26nl`8|{}l}e=_C>p zFc~~kHG2o~VS0Q7xokS`N8N@Kt?8aP`7AfeIh@>Yj^bpICGr+TQt+Xr*5`lrF`oCu z2M5ej>qOtT8jKUI0Nt3j_y0uX0h5G}CDzA>_!x!{byv#&>4P0G#wp1J(;6qstrLAC zV+2k#>4R}{*Z)Lhq8W&fWv&nNIX*_>LsMA*85+$G@twWApwBx$51OesO2v_`17G8) zF3$_#V8V8oXYd3qif&>Q-2}73lHT!WRktjW1R35Iwgqbe0d z!uO6DMa4!V_aI}CvB)^&UgSO`T#!Uj2yt_hVpSWh!DlT!z5D^Ek8#S8!$Qb-0KFjE z2id}7?^-Mj*-n#r%iz;*v+o^KG*6?|II9l9-|%&FGuKaHu(gQ%fF)1_?AQlbIF(}E zTRk{4!@NEUSU0dZ1NbD*2iYph*2H@fheROk8}L)VCsl<)GB9~rS$mq77vEeV;l6wl3KCYgtETANhm z^eh2IxFn+cyG};r53J{AF_X*;59W6l6LdgRasi3hEDz|sCy-9I-auaBGr>^Ngm}np zr=qIl5XkV(e?%qeqGDut({$SRfdkn$Z25CZgTJ3^a53-2Lk;HOOx;r$XAd#i%CZzP z8@-a0{=;6`B_GZPJ&&8`CUTO9%)|jtx%E6SA|s{(pC5&InQiG~?hRkGaDNnS`)4;G8-H! z_dZSZ_(JSdE!j<`TCypJEI_ZdCjH;6HR;Ops&$UVHj-@SSch6{ejJWHk?n8_4gr}8 z$QblmYjW6^wI*FfM1*+#+RWuA@60(#l_II{sq&6Ole2FYvuoN__nxTiKCOKjC*3Kp z98^N)0p`Op=04}EgcveQk?$ zyv77f1}$ig_6?W^fT_&ec-c2@m<#g*9gn^YQKe746~E}NM-~!wD4T+&7{6uOlV#vuaQBE{mLc{T3OoTLaE{SUo^)4|?j*(v6Ic$~Gp4|Gk;MdFf+%nq z5<0wxO>S`E79!van9$yS0rMbOC4;%iKg=ytl{~!dHTgv9x~1HP{UHq4g90*W2D>W! 
zQXP*AaNLWmCZb=)??LQJ*_wad#kH%3t?UZPIkFK6u&YnOR)v3u!uO)^Rk72w^9>*8 zud;T*uf2k$F6eywmC5Kbk6s(ln z>(nJk`-)C=WH9>DprxAnDu*PETf5 zgWKu$Nw`g%4`62Sf@zc{3mG5aVSv>m0(YQd3fcf!Lr_nTIL+y!%B{hS02A(w0wkac z*G7h;FNrUHHP=FWRBC)I5Jd!TM6Lt3b?4ZM}jU_;_b@D#g0X znr^`K7EJ5O6c;m@i`nj%jJ0;s@yriW4Vh6u%0lBQ+UrOQUepD0HL?;|dF+P6M6VL% zDakv`O}Rd{x1v{nNWX^u zkgf}|9v{*BW2zb0BM>y*0cwuZ0Cs|-fZdU`0He)udiQ|oW(lh=W*_AIVV5_n`h3&04xCXm< zskf|-Enhp_vN6tfKh5l=BDY!dI+!=WyqROZcn!NqIl8ppDE7%ant84|(@hev0l?_w zhU)g_?c7dJgN^o8y4(5sjsFI|xquxW{j$4f7=!#{*xZg@CEi0R^Pv~ZYmwII)otj5 z=26;B%09?eoJViB8KyOJ766#YcC-bDFH=ihWx|&?0%q{V7;_^w>PXpiX7=T~2l6z& zuk6UXdE+0)Z7kCvfc~W!RG{WavSlj$EQm_iAKB(1N(DXF*N07dw}5F2PV;jO1#e8j zTY{OF`Vj@ME%S7E$2jvd)A%U#P1d{(=2v6h+A%-#D0f53X+Fyw?GZ5Tta)e5n_+&V z%wd#<>kaP4l@m7G>;-l+FipoVBvWdbyGdLJv^{#%3DSSu(>>N~N0E=~d`HGAQcaD0 z!kFx5AR7rc9-AHf)FxA&HX8UQZ!Qeas}DKyK1dh7m|z{B0{pedaUmSvVjXMj8|(OC z>v#eVcUi|3aJ&sgt!v#cG-Yu(?zW^_!zS3VoAzb z7y5-2>3pDREhIZLre4z6t&Hhtbu01`I&Zts=B)O zMqbWIHS1!#m#%MYLyaR;YeIEYs8rv2cU$mu={%D}h^sNzg{CPX#0a;pFTA3a6w8Aa6z#&QIZ5V5`E7!S*tR(^k+1xdFWj zC;dDMC*2U_`K-e6+&0WG#@SmInDnjzbG;27062l*0^WhV6%jm>Ygj8Ad_1;hn7gsG z#kQplF8w0}mu>{|LRN5M>$Qa~;N0K~%|L=D+2HK}P9(U11Ch5Qf@cuiZ-X;z3K2Y+ zU)mv6(T6XfS2GpT>Dj>J_|Wx}X>y+pRIv^oBA5>cT5#=Uj9#Uaej)6o8;R_}fv4v@ z>9)fFA=!Dd+D+nz$Nf0UqQ1f~_z-dwE z^%19%IZqZ-qv&Rs1bk^)@f9?a$xb9jAzwsf7b3e780&JPQ+5ybXq~XfAYWqZxtwI} z(MKU$+y>AkXfFPB`5BA6YW;KhaZmSMS%0N4E=tzFVqAKdd#dh+aY^*Lic9|)7hCD( zB3p1W9(&JK+}**s$R=9MX)uQ^X1>L6J@#IpT>?Qsr7c@M9hePUJrx*_Y_@E}&qdO1 z2|XMVq+iMu=xCyY*`Kjc zW=vD6v1N?);~OdF2|)V@q&fZ3vg6*Rq1#4L&fd+Oub*UgAWiG|W563djxXXkk9DlE zO^#!#V*P5~f3kin<9Fk9dhCPo&bLp}f;f%=oCsr`mus>ny&ICd6FzL>Y;f6JONGJ* zu4Np!Vi{BQG}eqUowY;A&jkGikv~6ytE58i$Y0I3@YomNWu-0w6U+-(^Zp*-BE#HE z>}`PK(W_L_uOK7owj%G~Frrd!_mfC|i}?i17K_UxP-X zd>(l}7XrF@+i45>4A7GTVZYO2q7lew&jx zISX7ZC3p*3N~g`K$nVhfv{crnn@%hJVKF*vT@Pjm_S29*a$@W~;OM`pfPM$0nfee= zF$$%t`!M7Jdd-ydYsp!f|Tq^A__tm~9sGlEq}#i5X`VI{lY<6QAR^1P%f! 
z4gp=^ha-PlXVS03nRG8BpX5dWU;kLl7hs;Um{%+&i-4~9Sp?egIekDXwT_7`DVvUn z(~4I739oKM)If^q?K+XbGKx<+q zV)90>fR74ff>GZb0#lL-==3le`Q9cY{RWbeZXfb>Rt2cPw*r2$`ObpA0`!~(y>3C# zDv+7XNA4|Y9V^>FT1U!Rh%Xn?dv6JtV_?b?-#N?7@8X_gW&`>Lb|Mu%m!xqjIF7M0 zJV$^eGSvBH9P%T2)Ctmm583ebErBb+3C6T)GeI9HHO{0wyRnN!x^Zw0A;BIVQp>jC5>11i~XIQ)Xlq+5ez;zXH6p7PRg_L0S$ z0h4JlYb{1)_Ph+^?lT?vY(AJorfoqGQ2nS&W~uoYBb7?0^L8|q4(5*_3At3NHI+I9 zMwL1VrVNScg!drwsU<1>SCEu$GZOqRm&`)2{KSIJ0{YH^wg3_)T4n(-pIZXIgQ)@m zo$?++KC`~0{}Er(aefXuf6XY|jcOX2(J51iuK&`4{seT=f^>jBW=VS*&A|k~d;vyJ zCvFoUTmKM{1xwTBLwPGjjT8$`asiC*)N{uHlyNVg369w#`D ze3Qa?%_G*tm~U-x9loXzJOpM5@~aKb!OPug3I2_2wf=Qk4PvZI>T={lgmDVSN33IA zPD9x0Qo0KHGs4!7?O|)HYsge=b^Ul0`8UEgMDqssVoX8_1 z1!ax(U`(4wDzXzt`a?Yx>^iT?I`6QXSF!pOb=Ns0UblYBgWcLI+iwr{kA2oJV?ncx zL2Vu@kr#1X5T})_<7znW8)Aw6UCBLo!}txrnqOTXrjebl4QWUoiwT!FgJ#@sZM7{= z$5vbNMkGGVmip_;=g0UhjEnO79~f61#*;4$=OAyQ*PbB#Pc)%)+7t5PBwFKUK=K`n z`3uZxi_xASjP{YN#^o8OL%;xf8<5Ifl#=zL+TQFX1P;0FG8gaD-gc*^cpg#ho{0NR8eCV)1e?MShR z0BIae-@{fFe3Jk(SWViB7a|{`*J_gf0;@^79Z2E4%ztjnNiQLOU>%+(cr~U(>v%Ep zEP73e^gmNx>9m`bhE%jENn?7Sv6#QXoVA!OI`IpXV@~XpqyT!}68IWWSqNw?FGZe1 zPofFZKaaEUW!KQ%J8U40y}+34^{ldtC3lsrTy>iJKyVhP{gVrw%dR2d3)b=XfM54G zj>YjV>sVv^tz$i{lw&1O&pNJw<40EVO~-ZT9lsgO{Z6oYUZo8!xt+V2C1S7dAZNW1 zmDa*%UcwNcei@D>WBsEb=0)_H+iwV74!t(2704@=l=M3xCEZp;e-cYm;mOwNsSspd zwwPmj#pg{6+Gar!nYfJz=v6=*7!yQn)nJ#@Zgj**D1`2Fu8`s9!Cz3V*BGlzk7(=* z#g(nPqtg{M?9Pv2&250HknZ`01(`F@Sm2uQG%bB~>LikF< zUe)@Rs+6Z%+NxJ08_}zArGJscSL?!}r=0p5S<%Xs$+FsDK|cU`TOjQ9lv9{!)k*=g z8I1Ol5)kggUZM@_Iiv`fDoA)Q39@&)^8#p-1yu#~OfQ~l0eAr^3ZN0@A& z>0!uKAX8mv)*)LgY3W~vv~9%{WZEei0!4I;TrGEe+r-_fYeM!VON<^h~xojy%nj zH8mMLjaz0hC%_!E7(I~{COc8)pY76J0f15g5kKJ{KsS=9Hk2L6?ASbJ1!Jm$#@aHb zJv{?C9?u6xaM}*h0kbCC@03xzLfpxHwJ?q6hpVjLJXB*1)d>6L@1byKa_d-wYZ=s@ zF2^TuTpy?HvmHA`GjO=ZVlG0a9b~lS?nQnB(=aC)x4K|fTTCo*))7ZLZcw0;fEq;! 
z6aw@pAWg}41o97aul-MfS&Lpfne_W8uXK89T@ydirX)ah*IUfbU=F#MpwUxnfwJc$ zdrAzTb%6BZ!ga8|rfYcLwmrjgLhXEPk5sx{qW;&7HmiM34&a|Vq5 zC{g5827-X50n!du63{p72im9KK$-!F+`%#-_OK0d31S@}rfv02PX zjTpy>p>16;g3SR$ZaM~1vjFJ8pfg*?cJ6sie>2g0UI;YznVHz($1l5a!zdm9SWsq; zB^i%?1C`J=E|TXUc^!UiC4yQmZdb!{wgp`#vrc5D?e`tzFF-d$%j`y;9L%6;v^~FzxJ|=bNRa7=Fzhtn zMt){OX1+}zF9Vy%TGs+4@HYvx%+5MH=^&tm79?+-$y$5(0puS*fhbuUFbgauADAr! z(iSevMKGP%V3hPB*s0U0FATEY61EYX1Y*_nEr-QWOIffc0cxhh+T#K zXH1WF%R@+bdisMKfP|-YBhwS20{4MaU(^G4lD>%5{D&JbC87^P5s&>me!vEngD(iI z-ScBl;9&&4Ga_&bfyaYWC1b#KqmtSwKSW}K7 z$ju}KW`fPI0hn`6hS>=QNC0)#ZmGS*xd#aE9y&|VU@*yMCz(!)4HoW6A*)nd#$oUAy+PT7knqr^9!!$ZGp?j6Se_@|971w9u?e^25+%#+@?F&KyIS2=;rX0*ADLMTN3D3fK2t9x>>`dowIVP#0;ItWvWXr)W!6BAR6O8|fC~yX` zax4PlD6)IT&*M9B;K#ol9EOZGw9h?cCP3stBs^0a37*-S8>U;_)!;J0X@C04gPR2A zA>mkr&MFiz1KI6uRBn82gKh5Mk@V7Cvt zS~Az-D^IuV>2fZDnFR9&Pl9Q8q8 zOwP7g2M$!n3(#&kVs!} z;pu2U*qx3(=(r~88WuGFc5yFYKjLgC4ls6BL^?lx0sb^lAYl4ifUZ54u-A3x1TrFu zeQWIdS$pl|m$BC_{u^@774|{i{cpuboGm~*so^~d?V_iV`>p`+K~?MrU|)p{hGJjz z26uxxg8fA7%j4p9e%B}Yb!4o?d72+)!!zBT+q9$v_{_77*Cc=g9|5(goI@r@3D}x| zw*$~}>V|z@Z$G3ie>JvpP}8pFZ;e@a+S_+>7t9J=YTAK2(;mQvht8~z_ssf@=r!y8 z2{D^l{}Q7%=rrrnEn?QC`yL6;dMdtG>ZMG^vv%W55=;gdp7tSw-Dz)&PSdVUF@~JA zA)ZI3M9KMROFq|(y^1^v`$USYZ6_0%hJDT=vx`z?=VmscR@{i)0`CV{8KC;~D6;Sh zU^dQxyV(+Swh0Joh5v=jjY4oQ1lwEt=CuhF3qVWzBBDP$7um-H0d)K^b;=vDrGp&+ zQm_QIPxJn~qRfNZwoQr+E7P3w5_bXNW6s!BwQcrV8(y2k_jkxL{I5sUV#9fy#_z>l zAI^PlGMIr*7OU#L1y+?ihOE(a{TJ8|xQhj=f*%4_^?pFs{vUx2*v_8h77lQq@V^GR zgCx|5G3P!v%KY0-ZZ=x^n{b?NGt!wX0}0`*A%H4WU^4bKnMyU^Psl24bM`>?REApm zOcN)N+dGGeqs+AfD9#2QDi17Pu=@zY>CwUZ^f1hp7K>blT_I>1y+HXp9$<$dw++knEBC%$!Xy0p1asA zb2^AI=f({zI7+hIB>u9u(r-sAMHyRR5`P$2>G;%2=2~J@r)CR?(Yu*TEBGtio`h3X zIIA`55_Io5+UBQ+J$JELX08E@jZ^T?nRyla7i0_A|LU4dYSP@+WCVdrQxk2`KO>9P za#=+|;r+6A9h)IPoD7>;_6u>+2ljGs3fY>Iyytcn%guFQs#(&Rom|3NxSG7DkP*7I zPPnboXpa^EY5-V4kiq~|oy*8}j3@F_721!KL0n?X%ndd>EvT|&r*=AtECIu!GLIoz zj8T&#KH04>*W2tC;iNCw$-!A<8{n!0vs2> zRIw#-t_qYytNV9kIUwz8k0ZIxpEG?cO?69l34!}VR`@@W7r|DII(;imHIZfh1M2}L 
zz*gDm3#%^!iavj-rluu35lj`x>NeaWVlXT8exACnxf;-lXHBr1hb9WGCT6 zD`YP|8X!@t8KlJ7*`%4hI7zfl_Tgj-PSkL3{+E*+KGMxme3Z66^b+t?e5eoJ!AI{1 zBEd#XlP)gItB1%2<7*x;WnCxc08VD$M01%Bc{hrq=zcbuB>a@YkFFCR;HMYQ`91Mc zFC2_5d`mbu94O@muYtYHrknB}Cw+4|u{LQgF171Ne26!v%XOxl2RmPPb2 zMf(K#6!{GK9QgwI68Q=V7bTI&@P)Y9#-20fE8w&ikH5g_^E^G5zXHfN*zZxXkS!4l zv;k!ApU%E=tYgvaJ9q5*({Q8ErEuBn@nStzqu;MjXNrV z)r~ujfj!OBOyLS4hr#Lmv=#|m%bz>Ij^~Qzm|jGkGt@OIj3MV>=gu8+@GmdrNLK{$ z-X&)r*&MMB9|e+mjnCu>I|Mfuf5H?+V$c_}gi`9cx&-thv>!jgOn!NjU*d4oh^%@{ zVLQ0fy~&YJ<=wisTpUhR-cvZ=NqOa<7;+Ts+h7kMk>#ZqJon7mWMc7wi-gPg_~lyn zSq{ad2hJQbeK9mqSobbt3}wVxU1LZW2iGX(RnI+hHko*Q z*2bND{z^a*E{=Q+>F*KwYs`ztSwYmP`8}YM9?-EIplpnY2@DPInltg}n$j`_xh)UFy;BSxQCF7~x zvV$)lST=QP*;RR0rI#)0W@6&;vztWZqZINh_=6;KC`v+C2e$_@~!y_AFA+s(3Xkuz*;>EH38C&gchEX5SQ0&*s`*ypk3yEX$> zeHJgz-};qvrtk&0IZC5|xd04Q!gIim_jV_GHHu6o`aEL{297Xxk}~IV)d7`g-Z_I0=8f^a8(M#IBT0 zuWY58Eos9v89Us-4L5eE`IFJq;CI$n~->O7)TD6FjaN-V)`>V z$g~FXS2%|f!PZpgC}!a#nvjIjtb$~s53_Fpd+Egfyk$>tBlh28C-y4UQP|6_HzGSP z`%K=*`Oe1LO?NLCYI+%^Sq=Gtfzmk9z^xr!U1RimEwO(L6=i<@^f=r^a6M~022SPJ zPVnh4st)E%>bGA;l%qA}IF6~xp+@?Na>%p}@-J?kW+@q@8b4ZwnzP1KLj|jeoIoG0 z#&uLfI#uIu(BYfJs2V?Er)sF^nN&k|brIQls}X?wFW8FwWNaURd|WLI&!oQk72lHQ zBLtzRdl(bi@mce9xWoiZ1}_|$bU6NOIj*Nz_24L2ZR9ufWr!{P)4bv+T}MRsj0t}m z8Nbf=Ph5XnOj4bI`4wCuOKcXn_sEU3f+lQ;y%YEpM!0f}9-w?oigPoA$61Fb86$^) z986$Lfv-hk2%HDOq2V)>p_@nW+~T=yS>LVm)eV@Bz=qqe5y@yuuF_jy2N?)g{CZej z#egb)$yR(KPF3-bu~Wr0uScl3?5;*+=dE~%iXXDUUWDkvq3+Do!}e43YCq{a(SG&X zqU?o)`!9iXFMAwrB6tcfD}Mq|Q#gm<4>E=I!DwqaN!d=j_YP1Y(=m^6X7DjHm83tn zJT?=rK1mDK0Qn5PW>ETPnL+6~A$fD1!N-k{8T``HSPqTJ&}axoG|uw9N%OnI3s9nY`wyUB8VjF*Mv0ZYocjy}0-q7n*yyc#ydll25{x=UTC?W-1hpSPB)Pa0e8cfl&3MyPv|K1*U1xG9hj zt~*l5A;BWFD>sXT56~Ny#7;nqSVUTE*C6}RYY|D`p`)v7jXu1F^00>Da2#Hgn@A*s zHS`uZD(37S>sY|rIXWY;4Ke1YQ-!87=53HSG1h{sP3;CTZ!Iv;ycwLwq%JDN;@>G~lFgbr_+@~26Nq5f>Frgg1=dDqgo z8X7~PaXlE(h>L+pUWZ4coIIW}>F_vUajn4(2iMZ#@AAqFsOR;K+02(N{IU5 z2D*984s@+P%G$|~Qj-sGtmeE8Yz=Cxsksh$7QJdLeL6Lku0N89zbk9(ycqMG*;_MU 
zo&hK_^#JyRjDS6n3OUJSHD@{`p0^|hgQ*D#P0S6*bC!hk8zCXxZAjTj5;VPuac25? zlMaa;mP8YP10W$_U!-C#B&I^*1xsQCm|Bp~EVn{-SrXE3hJ@+``q}j=mH%i?Zuy zeF1Kx>n~{TVRmX$Kov@dj13HEaY+9dEg{_?q!tdZTpT8YXVk_=M{EYC9ibJt{ZTt& zhaoAGb4Lj7jH29SHsOp~1>`J+))9$2kou7Dtop2lHWdPoxfz-J!PFrkO=UY|i%m%S zCrC)TAxLc{6l9*gtYVyn_OjVqBVeAeB$5FRf`otrkp{Vt7zl~SEs05Bu7ZRbqdl_K zl8}BYB%~XT1ncBl)>lkYjevR5lIRF<2qXj?jObDntrAI)*ltNo1ydIiYPSx^HcLYK z?T|nhG@}`~DpwM_&D81v^E5!sZC8K?qR#D$P8Gw)G4~3vC5rBG^OQ|UzJ~Q?a}%?n z$hEm-bhJjmECxjHB{vB)~qScow_!5D@#!{2p?rk6KSLuEJtOz8e@ z))gWhze|G_tn=+Oz&hLBi8Rj%&T6!QfR=#K@jwU6^Xi;PkktVjTjHDyeIbx3IfjK`*x4aA$+p{_Ws z{L0$jH#{nF# z1gsv=x%LK{M`g`9-&>>TwmZ6f&_tg;`a30DJB z*>yfmqwG4H-i;Kuq;)jap)5ezAGHB={G5q@9X;5@4+jptEN>0d(#h zgOrR2pf94Vx3)T#EWlPrl5t37Ys-z8GO`b!u3MtxgR0AxYQ<`Kj}ZW1NQZ zRE)KsKT(t?S=YJG`Nm@uk8y7MxxRrJ0tdvrOpql*-4{IifSZoKH2$RT%nFe1LByVc zgQg;D=^w6m-a~LoHNNa%S?idw2-Z4cJb+Y<$S?ruhp{Dgf)1^BQtt5b+(gsOU`%tZ zvEhvAFgp=R#!r`g)L)InhrvQOrLei&btr3n`k0bgfYo$5sWr7tm$R9ki_&1vN721t zW?CZhbr(c*;(JK-@9HqgB9h&7)dFS?AT^!NFpH^z&M%XZ8WC0SQ-#?!fKGHH382%Q z{CCU6|6cqD=K*58p|0Ou+i_3=P#1;wyrcWL&&5vX9obD_&Fdd+V;(_I1rwwnJH*w^ zM<>#{dM<(V9rz3(jUq^=RpS-;g&1pobOam?>l6&9A>DIfeY9%8EU*D|Wn4;tf!I$+ znnVN$(MMyCyHlw1#I;vUp>w!?Daw4k97Xqv8G|#Cm9sIB)xqRpq<=1C(;<5=7*#`8 z&}ER-)pI6twa9AyhnHUfw&QF79TvtBKnI1{$beh|3?#t)Hh`{Z%L$;1*&L)rL^b^Q zzt7t0VzmL=psrN&kn626S1R`-{un=CjdgL_h_NnAk03Wh*k)Aaiwl;x4jA`AT!)K= z$Zfe0KUyVVCLZU>RZ%K%1W-$wN>{PFkO?@`1xWg9xKK!!hP1@lcT?dKzr$<~?l&2T2&Cgi4E2>5aKki{GW6R?=g z788v?hM(^i1OcfzbXD6#iFFZLgtUt&aflL6!8VblrsGqKD>~U^@J!eY#6}G&oAFG8 zPFG8iyKNuu}u^DUq@?C?_r<1rUZfWEA&8X}%cY`ge_5T{+Ysf^`r4_T*^Q*buQV%`E14-sLuTTBEI2jd4b5=;=E4Rx*W-Hu8C=!C+%_h%I4%LN!} z@0Z;~i0U8Pgc*%qd%yJahPb+}=+qw>5V;mxU9u-2x1!fnOaCWHN%xe_|DAKmDx(r# z2(X~H0p%krfu2T$iI!CeOgAuEQM%r*=;B`Mmx}-xe&|x~R)A*i8D9U-LU}2t7M)Bc zAw6)SQ;_sUuXA;)(BFa+oJYQvnf0u;w+*gqd@8{o0+T9#5y1_?dtvL$|6YunQ7T>6 zryzY9P^qN ztC3z2Y%@51_Y+%!@58n^WJB0ZMQ*c!rN0PS>2@J~av`gmRSmG114WqsLa-I+WebYN zHlqSx0sthkIw!1xtj-8)k-iaR4P 
z9I&8w0TqF)9PP1~Xk`6hZnR`|PFM|Doe|QJ+at)P1i5guv26Ead>w3cIGBYb+rZNE z&Y+`v1N|MjkPVghnYI@59+(7+dDCK|kqrg_-DKJ7(6I)#I&f@3QX<&umP_p{**CDg z9r2Suk=GaWz`b?$6c&rjdik)*E^2gr8>|0O~!A1p8f1jeAn@tg5SQXWil-dE#?<6oh?SEA7RFmpofTqNdwaajI;iAW?Rd=>%8_1GQ_fO zZMVuaE*6c&7Dqo}+Yz!lq%20Rwt=N@PhjbEQW}v9*#Lo?TFkFn|F>F@j!FVWW1Gok z_*y_(|5FL{C}ef2c>%dQf@~%m%r%zm32Zw-b}4qtkmfe9^c{%1OzU5#l##ixO(SqK z3;GREHwVP)coq|lYzmm`EP)Tew1$8#Kr4_I=+&gsm%x`kY402+{4~=IHjoBOGAMvf zDHANYo)-hjr z4|VUu^Zt5oJB~A%_m+S)?;iqgLngYKtwfTnQ|U|NR5~3}9=wvQ({Let`7P#iFg09E z&^e_D6hXuR1prkA#Bf7hif?Pj-cOK;D7=T1P!XT0f}IX2vU?Ds`lmxmb@V! ztE+}iNgdSdKR05mtKJ%H`LIJ?SXz64XpVsz~nCMp3|NF6W~AR%}O z(A|8jLJhM6S;hzS>M_Wdssb7t$(Xjvok#$ObCEQJhq^~isOVsV{;}EV7sGVbwT_EW z-`61%_B((^Y+(IrY(Hb{ltJ?%p4#AWKC;0&<_ROBrEy9ck<5@)xCX;hOaZ9>5EZzj8E%_B>7?=gf2A#ioto!x49Z#TA78DO? z44G;>eHFPA+eJuZrfCE!c|6ZjO<8F$Zxcwn>1)Vv>=z@R$~tijm_*_PRi2N@s2%0e zHUBZBw9QodN;s2Fr=*!UidLNv&dOTM*I?>fj7~|yjE_>C0GKkm{)=@1Sd1f6ZN(YL zC_qb*$V@XitdzIaDF@~NfwUFxMMh)442jG%157zEa7Z`_x&FinH|3&OzR_FJI@I6* z2DRnxL&o521@aWaL+(1wU+-uckO@2PL$_D9el_+2W7=}% zcO3p!BG1UL_J15ZbCQC?N)~hq(Cv`Xmijt!FUG474;gnJ!T2oZ0+^iy((^uH?!$gH z;so-<@q^*pcTVMlrvar9NW0vAWITYiQ37R@Dx7#wxSYG2mz#scyHI14~~{kzB}Y$FMWDF10o z_-_dBrefNfMOZuR79?`YG9i3U&;L~xP2ArE(cY{ej{%4zl}3=WHpn@G45KRA6+b|x zV*ex(S(N}m{;-MUCC2Z>&{qCF@;LUp#m-y$06~7YLH;4g9x~B3svy&_-HJqJl0uLR zHb^2tE)qn0zk)o0ePkjj#5iy5g9QPc2cT{DLu4C(E6PiNzbv3U_Ls2NmMi=1Q39l4 z|0ni(=cE$$1$fIA)2xjyK(pG%?j0>svK3jo+CMn(Gc|JXV|RD!JOoN_9O;`Q2MGKw z5_Zh9HsdGo-!`xxT+xWY{)jjkWrJBvoWPls=3g6F4t^rAw$hJ1f!`zO2N8h{fiGHI z6>vX;)6V#b2lpYEgAuq){Uy@N0JYS9XTFEiRN4g(A~P`Fi9|M43Ybe4a}i7iJ)&Ju zn1`|7g+%s92+UE7`2);d#L*@c{D}X}0`PK_K>B-cnHKafpnU|=CM3*k>|coz$Peaw zFq->nRHAgm6c{QDJlqRG<456L0or`N!-3{r4t}MS+Iqi4!gK#Qfxke)jx_fH0{>tG z*9SL{uQPf5YESV#4mokzf2C}Sq@g&5gVOktgJlHP7Wt_s@RtPr8VNg6;4}gsx44Gj zR)EtU^0^214VZ5uaG}yZ^JCC5t^u%IM5!`pDtv@XWiojj4lKd}2%iA&=U3rR5?DLl zH=e+m1U-s`{fKZTfq$}bgN*@>h$vMGj>7;aPy~?_9{*x97Jemxwb^{-3H%*Fk40qc zC-Bdfa4m4B^^Gv?Y)3r69{~Lr!8ilpZX`qdH^el>)(d3~$% 
zSGcK85cKB=#%ZOv2fPhj2Vm8RDGK2jKUq^`A|CsBe9~qt{8<8PnP!F)=KN`XA?R-r z8K;o(M;2HY;O_vn?2d+k;7)=$6@fG04uXrPC$w|M@jj7u&EJssvE`YtLpGhCzJLh@ zlS&25#}=R+FCKt)yVJ-=Q2^;B1Llym*G`ugd+l=PkV8@ITVwwz_Sz3xVW01s?A7xM zF9SHsC*0MHnG8x6evE-L{z>Y;adB0HpW`QqlOMU8-ksc)`3Es|zm|#U{^a-1u;HO! zHj_Ao>7NclU!o_xzdiq3#w-1w!S@Svr74tjYbmgF-yz|SIFG*zh@}&BZkduvOMDGb z3AF~8KW9QF#0-S_E1xvaW96gqufT^B=|v*iD*m>KOu|Nqe2Yyuk%yc_=v5Bx<{Jj| zf`&5eKE%!L2s&lgf&VQhyDxBf7@e|{ZXMZ4_dOEM?jM3&%+Aie9}LOu_4x0Jf5pr| z7f*a=N6yETYd1cRe1(I^o;q40U|t8H3f)NX0udGRdn%#|1w4;cA&u_?AFfbODx|F? zc$r|D>%JJNL~mdeF41H=*J8ULy{gfP|E-`JUju#%ovI;S8r6{Q7!t0AUQhBN;i=n= zpA@R`COBK|pcyvAEyp`3wZgRP=BFCkasNWTk5Y}jRO4;zRgGlq3r18!TTD2ps*&M& z?DRj6-vytmx56bZ^!D3;nYT($Z#sQ4N)tKU`#{1}r>+j6}QmMvau zHKaR^gsTyU@A!ynm>^6(1Zc})#td<*@c}wjLtAAClrlW*ibth4?&XCw5hBen;ByT`@xZ zgcKx95hQ%0qoVv?91K1T$2SvT0$;LN<#rq_d6}P)Liy>h{{n}|??s#67{E7?pIYV@ zB=3LZmsy+-OM4PqG{D`mUV~63$-K;1H76PNm!tHyGSTI#1WprdHU%)w5C9Ud+6+|W zQ)Hk5|B4jG_P-KH0lV7#v%!?{1%fHtQ%LduBUl=< zy2l0+(eDW+lG-H{tW-p-5ca`z0Gcn&|5g3mDx3sxmbTN}mP8_N3~~$oaor|f8lQW; zb7vw}Fok;6Bra)~f7z}1O&UJ6^-RR)ANAeMy0gVh?E0|h7D}(iJ3D#jRA1q^^1jZg z6${gn8-Z76cBTR!)r>QT=pf~wLJK+w{#Z{a;b%Jh-g%yzBfV-i*!)$yW8|;8X`L(o zM8sQbL;jm>`U9}9Px>4|g60f>bN`*c=gvBN%qB~C8cs$-SolAX@;M3Tz&r+~21(8V zQwpL=FpmADVt!gO`aNe}HCrrOwNfM4s*Qe^K$gX1N+VGw%M7Zzylcr;i|%hE4^`&9 zlAVcEEOFSHz;ZxkA*<{PfT^1+*}Z1DC95TU17x+FHTC(yXiKbw z+>l#o-F|r8B#ueQJFa0~vkYhDXe+%3@lUv|Quh{%D-qE``%U7=gYl!epL(iIbQX|% z$y2dok;cT0J}a-A6*kY6V9JrF>QWGC04DmR>>L6UykOvTAm>qPGnFU5(LK|`FZ`JafqW76>vvOad;V+B6cVpspCkJNo7 zRetPTjjvbO+1~kFV)rU#WXb#g(`vC;aUw@VCq?4}4}A zo`cGpE-p2xbX*Bv*VL;^^Af>KP&MgZC9sJnHmX8VBz(W2HdN$Bq%D$+v_oz}+9Nk3 zm^uk3^3xaMhB&un#?8TJEgrv%&)0bFDSrt_N9=2`tZE_lE-vk|)>t>z*yb@fY|%6O zrYHR~;%4KlIs`w)+5OF2N5yca8;5kV1iGhIOyuk}57Q!;evau`o*K%mIC2YSIuNf# z`~~@FK6aQG*{vMv+UQBzBj^(>KF`&hyU>+Hyf;0m1NY9>-=p}*D8Sz)aQy`xhpTWX zP$^^~1I4Vv(e+$iY4jpg7e`(AS)ZS%Ob_xeJrj~+VzY00(&3rd1^7@gzk}>PiYa^| z!co;U1lt&Ki^GRjLRS$VBHN- z5Wt?h3%zf~c`)%FO!r(cB6hzAGccbA)FT4qWquzZN}BL1JcsFk%V|A#9#y(+8 
z_T`YCgzk&!lKE1eYM)v)Ln%Jov8S2_p!tNpTs2qGeOG~=Sh<9mS5B1xvKP|0L z98d9Ff4v=tYjG%$ADO~{mX-qD&(e}E2@#=|IJ$$M0e+$~{mB2W6&!0nK9`+;$ydpSzk81Zddk^9^_}u@Rj(fGLj8ia16svG zI{<5+D>SJBZ8QiduWcJfIU=~c0=;^ETnS{X_oz>{B zzavMh#1>)H7drCN>_#V-dmV%IP1}{fKP-x_j>%{8uEB?ye#_1;y;frgU$!KJzR;O> zrr+{I@3@HYCz6Vqk30K9eFxoD^N8OI6}feMKIr4lay+8u#|e?-uJ`3L_1lPg>7=!h zqeUi6DaxoXwDF(Bq`BKt2X@Jm=+4-`J>m-%d2fFyQ)#g;^!jZJ+Yjs#F-j}tV^p3O zv_CaFlGSyK@|lZMG&;G}t*s(QrUih=$kEaL@|nYXG&=f@*e3BgC#1*=n~E{&3*8uV&m9lG;SGLIe4!$hN}35F zU#MvC`oQplk&Zu&&Bv%5Ul+)Xi5%^nKc88%gHiK-zqoCYIma|9#;7m!&a(IW?K$iX ztJxZVdtq}^iZ3*_Rkems?TZ*K`UW4uyD3GUt2SJ{&s%YNs7ULE`OM8DG`gwjoW>E@ zB6(I8V$>HJRc~|C#lPnqEmWjNqAAnS7y9tT{OZ%oMo=!=hpsQvQR&XBes5n-*qxK( z^A+}$+?9WB`~}~NHSw|aM)ry~6F$3m+1)?%|I3#bynmpT`#6L?6ufh&@s)d!Rjdj5 z(1Fe4uT`D>{+}GMzFm*xfIV@K@zt8a8qJ11urtM46=OGsU0ES;?cvcISYNyWRNujbbro!^Ss_GZ!N!;Kc=7a=>ot zZhTKKVWnlmHa!_@c0YM32gyY}mgj&y*+DSYU^eVnpDUwHuCO%+Y(Y=s`)U>oEF0FY zYn&N(Th5ZaQ|rkbu#ygpm6r`G!;5`7Ze;(veED3J^*LaRyFjuB%PI?2w?}i{-+3fw zB`4OIl>@f?M#0K-Ol>$n{*zs?^W!hp4pv{dBl*+LeUb~UNl$+HnT5$eZ~iH{#fgQ< zpKkppx!>Ru$?qQ@o&4a|`N=cdKahND+iA&DetIc+@PM_+X71|bMw137&wg-o^5P*c zCnvmDxLwMNdy=2~^y}nrD+iMcU09wx?T-b?NpDR}?s#}w^7V7(B|rJ}>f~;TbCSok zUz=R|)5nwd+|e_byrbIsNN(35E&1=2Ta))b{Y-LV`#kL?k4jCxH*Q7pp;K#;_l`Z3ynV>L zosF4(1o_Weckj1F{P zpZcx#-Q8e!tE&&Wuhw?B9s8=fx7Q4Cj~y52e)o-f?uIRY@MzH`54oRMRNX!2)mrYw z{lC?GzD{_qq9Zv{zfSbf4VX z)V-`)BlpyrRo$9z4R^-E8ty8ulxQElTgiQWNjtag$qMclL$7IR1MX;le%#Rg*mytp zS8W2_Ep9s83GJJ@ziL)=|AUG?71r&l>gzfB`ZcXV>gU>rE5Fh9%|5Mtp7o~|`OR7F z&2P_Yt19f(g1`7es}p}xJK+6HtGVJEZBMgLH2+fvwez{3X|1k*tQ~2xL7U;ZswJjH zx-)NI*WPUSp?0gsPOak2ue7)AihKLAkF=+oozuE5xvi}W`Ce;&Zlg9o(C*GWWOonk z__;?r-qO(4PN?L5?nz(wSBvYrpE$Bl3mv{z^BH(u`(xv~+JKk$YB{a&C&wZ|H{-!H1< zULMoX{oK9tTK(UuxGxsp)b12q(blB=spa=k+%ra1aqoLdao2 z!yP=bf_qdG1C6>Xl;yEV-hvn#vu34wt6-denzp=jpf+VdC+&3fv)ZmdHfyWi zoT2TRzd@V6I82*5xSRHUn>L#7{*CBroLg=j6ICY0qYK)eesvr@isnc+J@6)jF02YyQJt)w;gdPaF2~ zV(p8mleK0iR%)|8pP-#inW9ZD_G(2rL$x8ReY8;>;98jJOQVeTFErT6^uJ!XrUI9YD@#;(?ky-EK+ 
zI63ue+3vOUzP|8Jzo~%>Zty1oi@F+)0mK*G#sWR)zTXYof+F7senB?hjbxq+cHD6+ zXkzoK?s&tY2iqKXhA)Wl@v7^R496L&C#M;XG|K-()bo8Di#_GaYf}uzn;0$C495dv zdHp{XYy47nyyn>*Q$1ah4aci=H!ay%P(&O{tdEJAn8~(#)l2Edg3A9~)IGkAnVun- zXiwuk6O08hG`=p;aNPUn8vS28Wm9#pU$sSnIXuSfl9Oqin{D@Y`90If(Sse;F4fGr zNztBVoJ;lbcA1%F7zuiNl}9v^IJ$B*VJG=D8zBAayH+6O2(F~27j#A3lVNYgqwzr zMS&}K@@o*_CLWs0TN4-1GHGfF0b}aiN03ciy53p^hpx+sdM#CXQP+-^lJp zzam#N$30uniY%j_%kJ&GD$_U<;N$K5WH;lC9^Bkf>iIC!=;x#+j!b8pw{u!|;|v|_ zTwecA#hKzWFkb4O*Mdo$g3}bm+qoIGN{Oe_jed_~Hyl%g|9LaI=Ml$D->PD`e8U{~ zD)?SZQr|A}ebm9V<_C>$gWEbTR1BU(ZJ5p5etCxab_K=T{`aox+b4(vlhl?)!JQnH zeEPJkN`7#<`ad4^{AkBKPsf1MIc59sa_7&nk(cJ&88>fHL3pzWNxSA&$ zy_mYmLy3BDjN?L$xk*MZKOZIePq%UM0h#7qjb5YmdNE=O$A~H1g(-Y;8-I3pr)Ko} zgZLoD=v5S~Iv&_;CyRpP9G7f$PQC?GLBUJDCb<5@0F?FM1aK$yV9jx<$evv7&r0v7 zJF44kql$tu91naQqc-q~$^}imRutUBalz(0PUTzQ$C&oMGu-0a-VR?M*Yd9FwSxd( zAr=Mob(Gp%-DA~jS$1#y!6fyX?#g!Ds}jI>2LR2CQ%8O1qu0-NJfVkYiyrjnk}xI3 zIH|h^aNeEP#CMN7Zh0O_Fiy_%QCd`?%Z+#gEFso7*~_l9r~tNv`l)HgNq$`MpNhIC z$1&T}>vu~FCvlWb%ZqUkWvTzCoZPKeR4~x-yxlh|$>3NH;aEDV@tI`&(Nv?F?i%WN z;HkEV;*Z@8{@{=|zOcLTY)j(TlpkaC|Eai#&$-3F^|$qf!??znja^7Hsuj`b?kwZk zCooYv-Kh4`KWL)7b1px<+2agGCkA01?U$GTsrY-@(oR|DGRHz+I>Ir* z!`~qD#&2c>ej(n$(GKFGZbr3s|35ey`iEXHk|jemH;whqG^*VtZoY~;r*qgn$R`-!6rDhK7D3ZUgr!xNbD4 zM~`uYx{iWqK&BE(dfpg^K5%F$0wp-Phf_z2F|-E+pJZd`3}Ev!jG;}*e@_EU(&cVa&Hq3}9V{+4(HS(W-t^r#BuW1^YCV;!IL85)e-JY=ho9!PT?Nxx34N4Xw; z6DhyQ0F)BrXm%R)jgT#*JPw&JvDFjzFH+u2zt&UUCFJv;c{Uvu|evT<;t zU5WRn{0rq?DaJuwkckgTH^iTOjKAO2NN-QQPW^V`Gs#B!hu}FB2faQ@{Lk=@6O}CE z;7#yblZ=CQ@{7nnPi&lOr1OSLd^qK2!F>dNwU45E#yJ+W3q-66mBD}xAbvgFNasEg zKc}m4a5m+xsm8$|$`dieD^>9ux*G@e=<$x=P$R=gp9Ql3S%b)dOn-vr<|2QOGE?E8 zoM5#pt4n(k^Tkb0oePD*pJ5M zLw-R1Cb2i=HNpfP=3S%sV$8DWjsF$=*Od2Do<;cu8hHrZ9m>BG8(`u+>T@UuQ?7?; z-YbnCfi8UAGtu#(Z_iY9ogO{O(ZJF4mdzXUyC@ExAR_F(~fp;%=$ES-DbrbGXnTC#P8G8rYFH|%~G4zC-%rtn=T-Jnx(GuCI1b)0?N(N z+d}L{c?0#;h$8rAQR z9*_KvOk;a>>Wi?$>O8h5&3(Z$iRuas=@vJUzoWvY7f` zx*1ths9%<99N{hL$WY3>TpD?vyvxpnp21mF7g0{R$O^>skT)T^8_-LLtwrPnjd6{O 
z3{Eq$j*=fo{dw|QcjL%vVi@^V@EgDzLw+lrm`=F>d`rsHF#izczVIr7H|R9CEm1zw zN}N?5t}jPK_pWs_j?AQ`)!1wX;z4&KD}gu)#c9MC6kh_jotEcO{(=0j#D?T~Gc)oc zP8ySU(cPaZw@)hT**x%jz&(e<+vq<*PeaK6#PEhtKlf+7z{}xifu3l09>en;xFP7y zLoWd5EPo@T;qQgl6TRb<$HI>xeocM`_>=VLd&*Vtkqj;iANkaC;mzU~n?)Mvy^h`% z`f;CfI(!w}e(HJ2be?CXqsTMUfA+9C1}8z(hImyKv4M<>h(*Cy{VUlr&HP(&W%=M( zRpquEl^Lg+^*d$#!?HX+zO25H)S+zZZj$wRM18`K{zVRxu0zw?mHKxmD<+%SQvaoa zW)>4>oQ8-Vbj?527a3I_e9gbeX6V7!{hJXIuKO1gf^PU{ly&88X{lJU0|Y1khW}T%PbO`M_k_}PEAZy+^#@Gt(+K@<4Sk8x>xnLe-S!XmJt3&6w`s;GO8wRU3(;E8UD}@` zl%nqfSuVcIzU#l%)s^qW_`fFTuHXCc-8b;@Mn#jaN4vjvR@J8gHt3?N zd#GPPyWddXwTo)(qP|lMzwX)YjXD5<=Oa-Iz;pSCs!r7NfA`-Od@)8ka-drjaZL94g>y-TA- zsdi^oF;TL8onJ6-LMu33w$x{6;&((s==pO*hDa61d=HT=YShCF{ac7nr@tRkZjDkM zY%{iMKjKQvTq8ft9QumRrK0gl&enwJO3tvLIgD&e@)ywOeaonCM3?d_IXC(a7oNiW zFcEx^%5WNeim?(sdJ&QsyC5KX4~H|<_c@d1aL)EjN-=WJu^;*^!N_IN9zT`xX|^gC z6O58IlnYagl4azdN;FDX#K-@UWR#5X@y0*X)hPLn^84T(A&#M*=d$tf$wuxg#1=_L zZU{QViFk;w48AtFj^IA0UUwV0TWwz5+-KscrTCglw9FC`ketH@s_hEk4_dN1JcD)>9#^gO3i_mrHcxSD3z zEX0Qc>jll3YLt9H#?b zt;B31KNJwZfzdySSOKH^0cC%1cPLM$6M`Q|qa`%*Xqu5bjxrmB5AW~99WdXO`UU8RP<{?xEd0-D*N5_2j+Pj#hNAPtT*_Rk z;#1HSo&KEqPidzYyv~$g#dj>_LV$C z&4^Sb`l8#K`cfhb+4$Gc-$Z@wN_s(6=ktTNqhCl_V?=_9b?8wXxSim*bj6P&FPeCYWB&Uv1a1Z1B=ZXvWm zzz&h#hujau8PM-i??sNwSbQeU@d;G?ee$fXHgz_K!qzeDqzi0{(fl{9;T z{4w&UrQNO6voVNILWi4We5sFK5GZD!ctF#DsePE_B0TP={4tHx26T+Zk5Yacb7E|N zK}1Y_o(Pn%TZwOqINJ#G6sbaa1+zyyjypgf#h5n*Ux!$PIq|q-CglXmi)m*LaU{=1 zsxkYxw&WI(|D5_9VNQ0=L%VDo;(5MU!u~#f6tNP#YYYy{_W1SqNP#z*`Z45RrhW|d zSKzZ@j}OGpUhq5Uw0ItP0{oBEe~X_$cyALw_hI%`cg}8JBCaK^aZ&*CI8M(}ejTMS z%B;|HYtq6T7C{T>Z%CXS;cMWHpE-?=TbraFWN8)} ze@wl&nB%$aXhg*j0<$LDL52{jG@Yt3`oQvP} znz%4UwR7MSf1*5)zpeFtl4=hC_k5~4E0O%`#A4zr=u99#FkPK>p7<5`-PAkaJHTgw zdjp;I=-(i}AxWJzN6hS|&e@JxJO)mD8-(s^=4{<&)=mJ;)6`j9=o5D&tM&^BT!P3} zHgOk54^sXR0Zxs?J2b54HDmEJOZR=)8JgNUOSRVp*fc}6e@lE3!WamPh&pi_&dyNp zrThW{2WdA*I7P^h`Vln5XXS}KiTZ?xol8Bl;-R$w_BiQxk*!Pm0`U99Y&y)cB=MWB zs{H}D&B%@D3s8Qyr zeBxVny`VXlpuP;3V`B4U*yMN*_+T+YlY>93| 
z@()qJg>pB_2PuC>`wFopdfCEHmOATQVl+Ii7>P&FpC)EY3uem^Dt{%JKqHkPEhfpQ z{fVzJ2n&fZh;vU#+)Vuu%DxQBYD@$ttFw+$zXyIX^(WF)yC^pS$IT?M0iEptt~+LU zu94{A;IiUP{1tqE;)~?F+cEKobD8Hy$de(LQu#Yzf67-8Nro&&v_Cydg>;+|j;1Wc zmVzHm4_P24cA|bd@e^>|91?F(ZiUXDbV5w=4n&>;B`&8vjo7^Gq*admC|?Z;b58Kg z;sTXe0)pF0VpVK@ObcUikPqVo!^0VwSW3Au`8bA!k9QJJQ=d)@h9~kpyQ#B2BtM8@ z@KA0<%VLgSW)9t=9E;-u^mzJZo>Vu9dE`C6>jf>HuO-iVn?f;_y!4Q$BANd6-=XZwTgAfPV#3dR{B%3g2@ub>G&`4vtyA5STf$^aP5W zS^2G5{uE%d6U<*wX5E_jEH=fpV-~VHR>r`w5Li99X(t9#|${6CDkCma*ACezX~ z`Xu^#5l7*ax6^lVvRtO+b2z9)-y+HXAGY{^1sxAHajlr-yt@jthamA*)C`=h#er7rK%ZXhr{H|(DNVV>QY=pk*h=Tcf{Nb zHA97XFhec=h4KPu(`l$7_4}y54DJSTA$e{@QLmBr5Syf{#ctxK~g%}Ml1>VEN z7r?EgzJ#(JT$ubo@Cu*1v~^B(6mMp$b{3HCYUgYhwukS&1b&TvJOz9>@?Rk{3i?s% z>mlEV`p$xp*Uq`#R;&j_IGe>5bBT;frq}EZqOQ_MPPC|Tlv!Ixxl!RR8}%7wE-_JC z$#ZuUCX25VI}-K$2v(lO+&rT;lU@TX4BE$(2UF%?N8QH6pOkk}&k{ZA8ugi!uTbtm z{1|g$BmOdSt%#3M=KdK~A6^pWso(?AIY6H4Mbs`LTahSnP`N%toxjYtkA0S}Rf!sn zJHb6no>M+*J$l=P9{B>|qv&wki*i=b3pzL#WEOJ@M{$aqjn=2pCrNGt%vw3>4B+YH z6Ns#-qntGMI(e>389!iJOuxgFIYBc%E#npI*^$n*(JfGTj2cd9Q#m;q`!M`LK12Ng z@?4psW}w`c7RFQmGv)cTphtIfHgFVQqrqOl^uZmSD?>K|5YHlZ(KPP{MNP-5n9z?B zgFwDXy&l@h8QN|g$ag`sN1A67gJGF! 
zaRdU-!O%lHJI{LRbyu&Pw|TYI-PPgah}$yM;mzz`Z9%3we754%>QX<}$E)?vRIl77 zF6pKYe~J3vvee<+n6yrm=fW#RhxM!W+GFaKD9X#=$AiC2ektX-J=NhHEbUg7dWBoR zwxgSR#b4aAW0=J$6b!&$ko=V7q@L=P@x%?3xd&-+nD~kE`;=pd{V1;>9(qh2&Q?l0 z44;XrwSdP-t+l0b_I(;l-{G67e-CqCQO=|jufsb|JqKQ!i&?frx~B^!P9g3Oco!hn zUfO9uJXF$d(-`XrjZcBDaI4nt5eE@P4?id0kus;QmIjZxp$&!iHRUacbFj5J9eQ6eGGZj=qtALm_o&=ChED?VQ7`oh=e70%W5y0pYXf68_TR-J7a8sQ zbanV{%I)bEcTtUv&~V05izDx%+>`n)v~UB5y(zaOuamD!{EhOjL|!V>`jHpI`W=z^ zuU-8`FNk$6vxVqEan7)?f^@Zz1ybCCR5j!Tu@+@63vpaf3oj5SWvGRRh=-`x^WvPk zBZNh7+-cytE1m~?-vQLa;+@Uvgz#hYala?4A+vqFao=`T3%OjyrKbVRi+8SV_g0o# z$fY{&=`1xQnHYg$b&y;mLO2w0YeD{q!n4Gg#9e}!kl+jrs-3JBa>m3pK!GzPZV8_B z`~>GJIuWcnFA;p*;GRoW3paurPyQQ}*Td7J-Ok;?%V>)=YFt9P8u9}QzG!pNjH@7& zf)kwyzB|NsgA;iw_ysC@P?EFM_q6DYY7^3GVJ7)(^ zO;HOw(=M0ZLS}ByMkMtE8o_pTV;c3pY$8p4?y*OMYAMck`I=2_gM-PMq-#7@W+<#%6fmf>vY8hRGRwkG6XgKL-{8jE@V~Oa%4_^>wIko@K1bp`1>W z3gyj2eL^>uh#Q1!SEjRB^EGw(Ja7+*O$hvfz@reJq6f&Gm0<5=kv@pEJ0$2bvM>jBK}OjUZH*m`LBrdukm}-^MTu%!SMJ% ze@$!Zf1s?V_HYIV@ieXRI1qcYjK*AvVtHd>jh^3wHO`t#6#1NY%_{nLjB3%JM-b-r zAA5!R%H(;P*7#5A#er-O&X3618q2#|YuItbCNMtph`)e;n);tPKKH;KA@YDJb{-vI zGaEY<{T$*3`1jCX_&_h{>6}?-%^tw50RIO0Ex@AzO%#I}{FpN=^cCdWP;(0!SGU-m z$V5>dM?E*!Se{?4(eoa2uJBB!=6y6G(GnxEkFtx%ho_B)gG{9?4%{}BY$zS0t>)x= zU|isuf#by^@!@=97PhhHXlXCy4R{eFx(oa*>R+WCPWb}3-o&RejmGH;SNLAe1)e5| zuo8@2hOO_2{(yE6ONnAyJWi}Z=%}_J6?1&rZFd4^IoSvW#Kh`6%j=ZpJX)7*R)N8J7cXidwguarr9c zWtqmX;dVvk&9`A^sQ;4ueqs&EtBFk>GcFG$9;N&=@m^2kaw5Eh?#AVHl=&e3@}HEm zGL6fd(0Q2puc&_%-8U6Q_w;ox_FV3ZgMjYFu+bo1MX5E2X_TKLswjMgLRVrx@(DeR z%O6r-7X^OSP<;n{8~8TjPT~OSXH)++_?IYW({c&;>O@|!QR8|V!}yGyw~)(zeBuYE zzOWxVo69yxcY7I^d8^eKn&5fN7`9v(XJFoi{uRbQKyE9tA5s54$V}p+*gQoXO!KKI z3?r@qe*zo&gly;1K9?IqJ4KNf7Ss*29t%8(ar_-P@4^pjM*VGM9nenG+>6vVB|nh* zc*b`o_~+pBtGv{1G}o5;B5-SYf~Cm=7`IX8c@HAZt`hh#78t*gEAMuv^Zii_)63d zWEeYu4^J}=aK(-o3!WQFL=<^$G!a4YZi_OFx2EyOFzXLL6_cCEk7gg1!wI#8`YzOq zVdvHu5d;2r@Xd&`hI(-sr`pnV-!VbWGSe1= z{S!_cu{*uxllh1VwDBcv++~zl{6uUZe}#Mup7=mGVh+9QK{=Dyl`?05IYsu-mYzQ* zXr3o+BIzA;r2}-{SxRFiAJGdL9-Kz-lHP$c#( 
zV<_K3w-&gM;nk-8BY2sVyMh0Rn2rAT#BumN1fEZd(pVNp1jAzw5wVi~*x{{#&t5BH zGTnbbHI`z}2=X0Nkd(?|*{1A;y zqW(ujdJwC?=flhhH^+M#aVL$7d5}*0n)oV>9whR@=7Ez8&O!JY#6!ZI4HHA1Gksl% z=)U8FLVedW#(LQJpkrY_@*T_R2-XV+&LhC*y$3c>?!sssqaks*n}~s(2FmyTtQTNqlVU3^iA*&3IUzdmG&+(Qi+= zHF^g5JG9FsEB3cEHMbf0{@rOOJI}ekPOeydPXOKy*d)?xh>wXTgQo;FXkM~}^hwlj zP~C@C8310@Q%m-t!XB$+HRXV0;Dbl8qpD1e9yFRIZ!VYHScT?!{u}!Z)uSNr70MhDNl3c@nPJ-K4?ULmWjpnD{-t z*Au1(h3R9*IhX7989_Dl+T)#%=<(va-s7FMeRy#@J*Jl|M zLv2dR+HS_gE5t?JjobXFPRc9bc+oFq3^+Y?5|6JF6b)~tyc%KN^h=qG0KaK9r6Yoq5&ofvaXXL58-x>csJ96n$}`CG4r59U zbSe_3Q~wM)!@voAAL^sP59?)2oGjK|lX=iFaW5_U&_X=OR`b z9%WW~6Ppk#P!30F3@yf!-wy5q@iQEEr+yncKZ1J`2d61Fqh;P9Oo{Gk+~!v-rZfhh zK)Wthpi`W4Jv-8j+jlAN>So*)J^dQpv6S<{%>v&LoypYG&g}-&SEHS#l%43kL!Obl zokIKT@#P}6hBu117~Ff{FW~32D8v7b@*v91;jOqX*Wy!II`Cdn!LqM`D4m_${BN={9WSvlq;b3 zDLM`!yPhqhiJo*LdEda6)pL z5&kvsHOP;rx$elUB7T#E9Q8lI3?Y7&Vr*e|7{eWH%XSpHg5y>c^AXMI`O|nby5$+t zk@T`L?%51)8A5Cnnro~jH{teXVSq<#2d6!i}*8qc7HLvg1beZFr80Dw)CgS`>^4G$YxPJfU*yUa*^pn zJU}@#*$9s#|9}o~D~(A7$D`$#w#mkpN#K%c<`$7BDqD_G|1i8LaD9oqwh_Z_dkZVp z81{+b`_N&{z2!sdmD_s33~s?q=w2xN+vIDK??$XfWGx}9#KG|S z)Fq}pgTZYu=57`yiDzka1^g$#wWJ&a{}6nBl3~jXu@0CQv|#XOh)*C{g`S@Q+ywD9 zlp<)y2maf<|0ZwDT2chWQyrUN5okq;zqHOpT(1k^N@}L zK8~0;Q7)6@`A$qD~%KM$UGd{K~OWQuL~H{h32&dM+{_58Wcr2`Ki zqR7Ld!`wAhHgB0v5ziqb9s(o*7f(B0LxxSK8rs#!EJo&O{Jh5DPeJl#|KXR2H3AM!83tk;&awy_8~PRt3lFT$#HmzlF0O$TK+)^P8g8Qp%#9z4u`S9B=Ap z-WB)#`OcZa7fC84Pa$%NBzwqAenY>?bv#oKUf{eCy%gDw)O4n%3v#?Wm?><9z^Fm= z!r{YvKW z=T|vv`%HOQ`~u2<_!l(~eg1*ZDO+UG=?6aR8`u^W6*RE-3aBxkv)L<7Y5t6X_#Z!H zH@sQV_15;j4eIQ%^LxTYCI7q?%i`lcijpIK{p=%-WE;QjpW3=b&pmL~^l_*O6Gx4j znlZMGUM0(3Q-9pYRWl%d%;-S_C#deYRufve<61nbx6HDS&|CSqTz|pa>PxfiHS|qc z{NCe%7!i4(f# z4IeZ_B{{xbi?*%w$9!E`Eahxo9}c`P4*3g(d8wF<7n7VZ4W@vjw_9j=657V>>Q$6q{E3a`N5KO} z!W8~+Qxbo75u#|d{2$Lx|D((WNmkj}w)`M-92Z!LevxWB{K9{{?*RX?#QIh}nJzmEK^Vz~ZdsB^|B{bjsr@` z{5Mjs+1p-U&+&5&vwf$((%W9k_Llx$Z+oI`wf?6leXPeiT(xYc^p<_>A^KE@tAn23 z$L`Q~QCoqwltBHW!_`3lppX6OfIzOHLRJpqs-n!k_B%E`r$5v~Id+$RE}Q@HU9kAh 
zR}Tqv`Ra=Y|3@)+usuNEl5G#ty@TzMdQPw_M+gMz=K`7Ah7-n@bzA}Zp(^|@q$b7k z|0U>Sv+aIGAyr*@v{~J?Soc!a0|rCh65vYGLn^y!>(2!6llNvL-pZ~ZJs=R7b3v|b zsb7V<|2O($`(v)WPeJ0lcA_PhK6aS!nqv zMI+IAc(BW*@6184eDqvpUF;?Xm^wfGW05#rI9hkGhfOsPuR~1PN@7~;mFv4!`8=7SZ>sNV#BbI*T;El_S|Qu` z3D@n_nPVYhp4Il~zt(rvuUtb>ygRrVpS#sCvZmg!forLU2YX6=APx8m;j1BEbz~H$ z3>2jnG%;_*by3u*e1#mbDNo5J9D#7SFo=Jmx#`q%zM4&+n^IOMPNsb7@vI8WFT>4x zw{n0&J-#3)mHFbww-v7120^K{t()`xu};MUk@G0PMML7B5UY);2Q>P@g9jbz=$TJ( zPgxw|>QNz`4u$HK8@cLwfQY-?e|Q)Ey-jS&AnCm_k`GDmLiP#iUG(XDyo=LU_#Gm> zi~4-Li+cKEUY><@IZoV)_zOad_6!?*-R?05K{Rk4AoMQxrAwRa%|BOUX5p^b6V zXe>rYNu!;lQAW#0si(imk!d+=Jb$taEO!bq`@96TkY_hzB&Q>sM^eg0xSsw~BUg3L zY$&b&rYI_NJQKL+f`A9hF7P*Alx`N8CnlE3G!SE??BA~hD5?rZ{8iQN;II%N_@mHo zBlSasC5b!7wjfI!Ofl#JIOuOdB?OV47Ln12wEbIzzo%_edRR=>lP1MP?&U?GD2q+mr1+?39n3K0Li zAAB2)b7en9`%0q%*B?RdE1eGf_giACE0DSC&)bwe8D9P^6&q~J@Y!JQb41t&&Q;QFH9nY!m zS{@TL%;NAC4&zCRArk`q5aAf!s5kt0c;yBSxXI*&L6HxU z21PzeYJNcdRbf!%noEa6)%r2w80X=zD?s5;0K3wFM!hfyw(JbbgE??h$qR!X0ca>4+5u%q(cRSlD2K3xG+5!CK+|A82GhX_g94XM1IciO zL9nL5kb^d*jYVs)#o#dv^7u{|^q2w}2#C#7xkD9lTL8kLXrPfaNLhJQiXNiAR2t-3 z=_RFfuK9;Jzm>%pi~uJL3S1`oY19jY;<_p(;kg4grQLN;*;QYFhFBb)#bJjsho(Rl z0vV9X4JsIHFO8gqdrcZ)H|GVWJW}1y9I2HU=_ri|Tn}_I$qOT5=hs+ttPm%W7O`B5 zkqa1!l179;PXxNdDK}!k?IgLq$)A@-xC?lJDNAh*(@3eGUFj^12wYoqdXo1DC*qbN zh9n2EXp2}si;=|`=^~8?fp!S=f>Un93pYk`2b2Fn8fhhsRLSaU8o7s&hSCVQ$CUO6 z^+8A&0bDkD5DT%0Wm}A_z(`|hL^RX^em{#56>cLvu(hjkg(kohz0D+h16!z`*2>k` z(~KG%2ZRQ2{pEm)ySmuSiO3#KLCgFS7ZY)F=FVtd_jCB3$)tqrwJA-}5ED~qn8%NL zfo)DbXHdXcOkI$@50}0FEjv{7eguEd*seSbP|QpL>`03PA&cGvE}NPAZOS7SvHq6c zucY@a$bmQco!}3GSAG~2df$?~q92%SZ{P{zThZUzcx@>AI|TTD(_f{KKDL(4=VS4r z>`NQj7ki`O=8+7hE3LqZt_WN-R)(T2x&l^o#a}6;FM+0?Q&aI1ApO)AV<-I#|9|k4 zgP-c~#Fz_Ss0RZ7L_5`}7h^8kA7B}#wG2}Ya$=Z7t~1_7GTZ+eCj(zWQd9t%NKtW+ zYe~`h)R)N4KPo%Fxm7c>^I>$pCOFY~f$Ks8qogUYWv3Hu)UjwiK7~f;g;*sh8*x~h zq;Mz%R0PIQUw$CHkn55cy%zbV(qJVp4@uD#)L)VYSsr;wDHDHgYZ{EkU_EfcpuokU z|D@0m!zR{J93CZvMuIJ3IhG;ZhQS8pgh3$?kHC013z*O>L1jXV| 
zj9?)KD_XRkuo&EpK?gZuQ1FuolLV4fPgS^fV)PK=w#?c~Zln_T&@g>1DcKTB>!_O*{P<}SU7$KJS3 zX=S@|n-tgY0Fi2(qbPHsR@Mi4?6s;DZm}u1EX=p2=nFmehdi!8yK+aiC}^S3=0g)L zvN-kzuu2#1ynGpG{3o=o zvrqbJul=F8Xb#2$Fkw9R4s4|1}HwNyx8CatP52xzy6|PzyI4?hOlf z9Na<+w-D}i$@Qzj70O1Bxg?4$iIvqoYpEBKv2c4M!vRaO|cl6fRQuOh~O`JoJ++QPDbU(VA>Al82G+Od}p8KnTo7V2i~_ z4&-ko`7QFlOC!nB$jR7y<~=_TBfF##ft!ZTR%rxm*`ayErWh8j@fIVg9-Mq;I#B|I zKt2N7EJhTJd@9LB}M!u3p1a2le+tCqo1gvQ! z2dyv5v^)U;vT&Zca!h>f%uabe_LX+#JNMqszah=P%=lG}^?FVaZ5G;;9Q-lmaz z7qS#y(f#vWA^kuEn;eT1OeF+@;Uz^JceBYWcFF(@o zzQs$Pr5{1`<3n;xg#gk1Pz3hE`D=>N^arxd^Z+gmZOfNwfW$iSziQN=(H*kUK?vWH z0|4C%O*zoSX%2vkDX|@#9|i<&I0g=o7jqMA+1$j$N2V5+1_2sn=|M0(*hy0KK;%Xs zPzKNLt5)lEVL;vN$zjt+cJ(W@;&7-2^gB%Ig+xAt@A73 z#9l<;n&2}BK9d|v?*NgT$BOu8IF)GM+{T4K_FBlpCjRwVn9`0Bm}Mc4hkVN< zv&knK^K43ig*)8Bjf6YL!kqy3R}1%?2l8x54#3$SIed@F;WHjCHuq2WI5_jbiQyBt z#2n5;gv989HTO>nT5~O0C*|mMAs<%Tt}GxY4x`{F<*=lqzMdF9m9HGg^CkHq0AC0r z`mUw+>YnT%yRt~OEmA2tEPkMfwtMjv!&lB+o3hYC|Ju@aHf=vG+ZOz;Ijn2pls9d_ zT`alH$Qz<*^1aK(z@wb}>CMjZRP1C5ER{wCFbpBZR%QgSlE7EUY@4#gB6e6B5mRU} zySXQClnsL5x6EOl!!MsgUbth)i}Ro@`3~k3Vv5ug;>@A(B*h#O$@v^gU&xfpo&?OY zDdQ~U5teBX%`}*3;ZB5GQF24z7Z&guf`vQM!c{$xCtJvqAp2R!2INVS{0GJY=`lAY z@d(~aP?EphTeh2X+Lfu`#0^W}3@&Q^=rASB$MubgDq2%4S_{$glwS&b1MJE)l41@C zfn5l=5D>#BHc>f{pOWNK@&VFFUumSE-UZXhevC|)Mg;C7bgD=rV9m?3f>wT+mM1`f z7Rcim3g8~sXOA=@avviQU@;N`d4@Fd4f(3l2p=kVp)0+|v@-WDJ25g)8WFhd=v0$N zz?w#knJkcVEMoI5Ms8tbkTfC$b|4UBF>-FEO?g6c50bAUjbuwB?HdP}MmAz(h%_Q_ zMd*n4v&5mv_DCl=hz+)g&9fLO#mG=;L zjHqxQliV2cKGKM1pmfr&@>TQLl(M|)BaH~)a)eq)BVbJ<=lI>{y)9zXEJhY%q@Oe* z_$%OtS&S$c=_|P@wYWvwrIRP5kr{<=m`3hlq`x!*P959WzI5e@+h`_yw(4*1_*s?i-m@173 z%yf&9l^BVaMnvu<`0dJ#aIsLlkmDrz2>=zOk-^f)rOGYMITDN!O&Srnl?b($M!=TM z5wsG(Pz;DOKT{Adz$%GUaU~C!!WuDFt`W;OOT2#nAA4PK&uh(BNWM*RTWrm+^l2@9 zN+u`n7Q)IaSm{*WC%7s$lFGQAfYAUd%0AJkm%P$re%S}zVf3jhKylv^xK#*6%RUK% z;*xZ3IxkO1BLd^G7}a!8?`P2ac-i~m2#c-af7SbQ(`-tH zZ1gFFE6EPz$_`Xq-O4;D(R3gkoalhSt;RsC>;PE#$^boRV-fROda#Wigp(6PDFoIa z5O3*$0k^H>&L!_BjSQ7W`j0Bx7o}q4QE5crUPnhm$9!b~aM?UXECP@ijoFNbCP$-u 
zZ}VS`Mj@lo&SGMwr2{+ZKnHT71ES$|G@SHz2Rw?GM%zpBvj7~j2g76!>R%5xhccTU zbOa}cQsCY|C`Aq>*s}SJR-{F%z+z-KMxvwS@+KVM>J7cC{8K_Xd74AI-9$(`M~Je$%*LY@cGwS30?4JiZ?BOxn+ z@arK=1N|+K5jLfvgscXVQSQ}*m^-^1AR$2Pbmvq2&z=1D&h8lN%EsixK@qu9ZmT{1 zJ}An$LA-V$xuxXWF^*zc&S~K#D4%>d#N641v9oIePHasCu5}Ju5}VxF0XBDbIZxP> zP>a~x@}P)ic^J$3X3~fd2uGl|G$M9(Ubsyq_b&NPDSz+ml30~DC&v{jK;#y%IPMGg zubrJS(55_WA?I0^)(MQjBNp;x$k`^DOB)U6z-?jSjyQ|jV#PA7R8okd!NbIh_n!Bq)Dq2~zPRY^hLVhv3t3Yz% zFbaM;dF;dN< zb=qQNCq`;WBk(;+CIYz#lpD!`93;sP(ZDWgBu^TN*}Ti#<^@-?E48E%fqN94VbTa# zbDL-Mw<$F(T1TXjPUKgz&8s7B+hRzBKwAVj5&qieg>ay1OL9~4om2jvP^nC)y5z)! z61jyOS5C#hwt0n2kcTYX(Uv)p%ABZI&hn#Zct8!`xM-< z7H$sQ`jWege0_S%Wl+4M;w30O+uSVsWQafFV*}?A*GK`p#5HalLX3v_LW_C7>_>$~ zOqatK3MP+jo}HW+J;A>We*$oE7{xYEoFYDwyNvup(#U9OWb%j#;#%!d@^DfCpfDnU zXAqhsjQ}<;)`b}HwTLaZ7)ixQC22(P&%)<2`|ZbivGM81JEGR8FGVV|u@x30Jup&L8WH^O;Cn4bLf}@BT+ep^LgbJ< zDUH-K%-5xr{TTU48WFf_2o*>pU`-=luA?`ln83VfF_MpwU!)O{yAGd+Cx0E9!oEDb z{#laW1Q0BZjFU!ulrZy#z8E97r4fPq9w8nFnIi($G@|z59S@7va~31n7`Y>j2>yBa z^Gzcj`A`D#uabNnKtpMSv&)OV@<*+f=Ak)@k>8{dfxC&<Lo7Y2+MtrMnibmn=qR zW8@EMMDTwS53d(nhD5>0?~?o;fO^u%1ZgDcwPU7{l^FR`8WFe)=q!;&z?w!1F>=qM z^}NN%V2nHbHv%gxM!b;EOLAxO)uoY1(#ZBv^UOJN4+_w1!3fpG$I7PK;Q+75fyTYBqx&(l15l8dC^xkC6|4U z>Eh5_kwygWD|B9xM!=TMkzTwJBD6g8T7aJZdq|XEjQl8#h}>QTR$7cG7`ZBq^dw(H z8X=fQ7JgawFBsDUAN518itX>(ulwnqqDlqh{t@hPTXvA5IbZMTWT>9hLNMvh!8l0z*>tD zFWe)NJBoZQY2+zsWc`hH<{XK}$Z=^z;Ep`roo$9362N7X1hHclv1cqs+F|6RG$I6! 
zV&n~r5f$zUJ+?2O3ZDYT%`o5%*4&wL&0W4z-T(!r`NfLPJXum;o%L{>bMp~YVt=dz#v5MRNG&ulgIy^9sL@EPt8lbq@3gB^Eye<0=wrp;) zEdRzLw#?H1X!?JaoH$tp{{(#A?fUDAuF(JQOfH`a&jCif(Qx0YM2KB!h zN+pvQNhEJOyk3$ghX21i1pGpE<#Twz0yLlFOZ5w*?Ll=|A1b@0nt@s$sWwEP-cRoG z3f(s43klf+q+j_l{~IE<-+LrvKal?AeK8^CJC`8|Hsvb`IRfPI4E}DotC6|A3yI@3 z5^`dD_g*lcYgx8;XIW3~BWK`zQw4iDVRw|9We?VUG@w@55vo8$rQL_V7BU9mJG1ll8DNF!oUV=XJ{9*n@d7Ve>33Zdx_Bv*ViwND;7PuY?4C{1Q>HFtDp zF|tD%5x~a~+Aoa&HjRW}WV=P|ti?z$f35c;X+-dQ!9Qp*;)T0Yax0Vn)HI@>-)OJq z$!8PyiELZsBHxcv7VdPoCnPrnZVueF z7VdZpHwW%|lgmX?G&}?H*A}uD@;XW0M!pq27FY9`@}#@-O=I&Sx{EJJb&Bg>@`k*kZq z1&fhF$j?e6&yasa8ksGPjQXR{G?I^z=cEyVtAb96Gy>K%64H)G$}22lYb{2OVMLck zgg{jUE?bPK7Y;O08kr+bZ?X0&Y|E=17O^)i zM($zcWobkR)JLGyV#I*^lH{%=A0~~=l}3K}bgyY-F-BhbzZg3Y_^7I;jh_txM2ew? z&PM4S1gXjm(os~JbQ7faE=_hLRf-^jQ3Om76tNfVXrkB)Dq=5*q9R~#sG#5fx%X^x zv+VMI{v0-$GtbO3bLzdjdt-^nt~`E1KUgLhyF}7>KV+-V*)AWEPa$%9YKaJ+ z0{%B2ks#RXEOs~T%`A~smdI1TZgz=mEz6sR%RMGZGOUE7KP(Z9T_QQH_~?hv*_}Qj zhpEU)OGNm};7|C7 z#KGBGpR=t#BAXzx-Vzc1Pw<9CCbFQVfxW?EJJ;fnT-P#LV~PA%YQI~NG7OTL$GxQ$ zL-QAo5^$uY6iDFGLnuBEp+A=6{(8 z6MoMEywHNvFlb*x*E&JelQ5|Ld3j>=gg*m`3GUyFUdvV24yg-FOFQawoy zis2y1Mbb(HLsmmtpZVroJrzwr&CR4mE$H&>fqy z$@UfC8456lHjPcSl>ndY%RdNqyo=?0G9QCRwkI}NBFjJWCdmkhjI%^!cNUJSSt8iF zL~>dL%tT8>HamPo_CsWXC8D-c;A{AZ&@;u+`;r+#~tQO zIe<3xsoH8XGW8Uy$if$_#OFsW_FdX9pc~3$=0CxO$P_N!2GPX$O;S@QdV%M`x)&lh zQi%>oRs}vW;)6e6dr@cbOV|#tw4sIX48EZatuuHI*ufV2Iqg0A6uyr#L}&1!H0lhl z#&sNp8v~A`mhtgj&!k%aRm+o zZter;03Kn%-_ibt3Wn2-b`Esk?Oi=}=1(SEg`KJ=yP;{E#qgucIBeakr@SOSA>#A( znXO(I+Aj%l`IJhddQJ|8;h;4R)N_smB8~)(SUk}i@M&Hh}ycO@ougqq5+i#xR)hT zhW6gR0X2eK0exxHcv9O9bZcj@k@1vQhL3>wu=D1IywOO)W(tGiQXe=2@I@AkFOvrN z5+C>)-+-C{_A(##IFB$rtmr0;oTM*XVXQGB;}6sL zI;E|^&6xOY!=(bIrO)y8tO|T2_ZfXF7Eb5Wq_HVC>*3T5H)S(VJyR^XV-CpH7Ma45 zGnr;u_0>g}ksjHjVVuW9*V1C>9j@ehN3Rf^C(~?TG@ew!>$qZ;` zy0RT@kwiX8<0Bn~iI6d$=_1Zy{X{eMfY9m9Fd(@3I^6=EBVM z8D)~ob*A|o8>U&G%oQ`r)E}AiHXqaQxjKp~mF*&*t8%!~geDJ@f}TCVrTQ)tPdeo+ zs;znL$7>s2p=P#amVd5Tr76Q-8XL3`+=m1aL5TCvX*0l^N#Gw5*U#r#y`I-`*Bq}? 
zxBEI3jH$}lsD~XUMm~>>_k5Pkykb_F8f9lh7j3G6*z_l#0SI+70VF%K{H!unE4}5I`d<8?p1gF7_~kOM zpIN5HL}HsP89OGNPtZozadpi4;8oHU@C+nl8PlALg3E(6<@*KWw= z{CowX!Q@XgK%(W{L_@|=Z+0V7fvY0J0Tk&mbn1*;GEK6Xsyg!^tjch7Gef}mGBpm_ zRkVbk!1gZ)Yur_^gr$2Q!j&?wpG_y+L2(*#W8|mD+IP&tEWt538o23%tPDhb{yhBF zGLb6rb7iCwdVWrle&JLvNBnqCP2=1ld@?qKb!`UB@}>goo0-CZtwLKFY7f4(8Azj? z)^owJbB+=BtMrq4t@GK8k7wlL#__B^Xd~R|$0N>hxlxH5GR^+K* zJ&-YR^DJ&MUU5tSoW{3?*j$E-hF5K51UUb22{T;G@yv2J@}5O8FeKEHelw%Tc@t?U z_7joF$v*)behtc|afpBTPd^t&ue$0XJ)s5O#7jg+v4X$D{Tzx_9lMg~uup0aFa3uv zcNhnCAT2>72|5Dgj`0BJVZcXb%p@$+_*sM^3G8BwXmbgp*US#T95AKuL9!ZLCS#&E zu5@@w2baj}PDDgnr;My%X0tior1Ooe5+?su$CR}w99H@hShD>>~KNrwRbm}1K_R&Go^+h5b#8`3%QfXc_R(2sd zbkKj6pdOY~9)cy&|7!_K_u3E`UVu^zr|~)iDaz1Ku&haeV7_OL$zX$dfS)~R%F z;I3$#g^J;UbkL*}3ti`5x)n62(x+oZxpPKx^Z8}!vE z`f*&(y~4X=a|!y%5L<*SMy^HF>vwCFfR__ahCexf(Xl$XxOYmqb5t4f+Fm0z1;*%!q3UU>%Y{phmndvjp#jZ%vlACMDW; zj1bMvb<8`K=tl58Au0_0W2XCvMkGM;tJA$$-w_;&^7 zdnE!3_#R5FW)sNb?a5x3L|8NYa z@exTN!OgysNu}(cfD;*pf(bQ(jehtTrAm@OSu0bK=IA~{r*R?OZnBl`CPdjrmDIgmjsRNW@R7 zn#tJbnAbgjq2MW)RL5akfUjAH(rw3~bhjWI&X83qS-lBPS?vLL8Chuu9fZt}n^bj^ z|B_?g(3@Hs^~K4c1|-@6I$%jicRwVgyA=_M_*vC3sbuw*msRjzh}6VidvFh-tHfXq zqx&#=Ff;ygbd%sg8&tq7f(p1ra)!)vpL5J3;FS4;;I1I^4qyf&3*u&8)8vx*!pfA`4f;~-}VyW#t+GM|6^IyTViDcPe4-o9RjVwZYXS%_H1bfN``;RgQ33d_o z`_R=SvlZw*j_y~tThEEHAXRjAcYt?yP_O@a01eKxN-4k&O0NLhkR6bUyRkgynCHN$ z{~rT4fd20S=1OECxY*OVw#ladpS1~vN>Yv#GI9WATN2Ve3<>GBBO(#M|Ld6CXB?Ad z9sUWCbMco7?hbUxxRLIKF`jNX`p7ByTCM-ObS#ZhC%j#VJ`A_RI@1lC^KjNxZI+yL zJ0U0C4&)B-@hX(N&oMi}sY1^{Vh|O&7)%d#i_K6y?3k>KuNL6iNWWA3sf z&LUu4NOY5(B_Z9TkdRJ&DH8E3lwwk;(A{8l{?y%<sXg$E3xfG+Zg2jD92%J@5Qzneh*WnsT>4*V>cU_fo;@5eUwh>?(^ol znFRBI4Wd)>GJ^D>jf0W7E{aeNLGHIf^zhBG1nG<2GK9YtSr{ZU^<8qj5mPP-TI#~# zEB{W zk!}T|P2IPAh1|#3sgOA@@UR@uS+F*s`Z&7`{U+;3y7zG;-Ad~yYH4yiB#fU-Fbv%! 
zk=^JWrp(Rgv?S?@;lqMaLniK8;|`IeBIBLxa&#K^&jC+Y$=r%FjSe|mMQ2;C74AA*;_U*0_NHu0e9*fU*H?w;-KiE+LT4FAI^}IG7AD z8DJP!N!tAr$@KI@Yoeq7@5E8z7^rQ+Hkg3@2&N+gW6``ACs;bcmRgLqn#;jxo5?_4 zii1f3a~+s+nn@{)(gs8r!p}pIUOvQ_>AjkpF*|N@H|j8J+DZjv3fBYMka)we3$3uhr2NVk>=uGuZ-c4rFl_=X zM-)tR_;nlXVS?Sv`f{>H`;L^{I0Ree1v9^aZA38bZH!^F2EEol>94(l``uD*M0Vq> zU;)g`Kg8O!+WI&b+s63NVm%63XL(8gIlQD>fjoeVs3Lf4G)ux-&sVTMm?rqT3j6ie zk96PQM>;Jn@?^B+c@^k>Vlb> z{(@s>fYCx)12Rn^n69xo8pUiH)a7b;8Em0 zDhQ?(zF5r7wVt1lBjv|9lJZ9DC|0S`aJ1NxDg|a4ju;wxV!a8W*KC%4FM9-B+}V7B z3AThbmL5lKkI^Rkb;v%9nyUXlzH&eXs|pE787&| zwrMt?bT9e>o+RMaHlW&GqD}TU*no|A{n*BPlz81j%-Q!J|GHq$NqyciqrHIUY#b~n zp!S<-$XM%G`X_KK-FoC%HD`Y&-N-3H`o$i*70IdtcQ@4fqP!G?g&#~IVuOty}s zJBlOeG%n;Q)?f|eXc8D5uyo+d9pxSN9>%^DxVw=(Ncd4cu7(}zpzNB{^;&=26V?_HvC}Ig|1=oJqOaI*Zp0`8XS3$&~@O5@*`o=Og{CKk1*rpL91PFT^1i3NXsE za~w0sva19zghPs823lv*y@507GOe?CQt*!0YP`P_uw%CsGZ zzBWA}{c*;fcKq9r+wu1r@-3oW|1l^yS8+wt_<+0XJNx-VQXW534(#>t>Crf$w&K1_!p>Y(gesE+){m4fMsUftOn{T*~0 zHfAp}pDyh|TWu0m+fLe+qrV-=#^?>?2Si<(c_g1duBua&zRcF!qAmpW11Oz(vXIBH zehc~5q8JF?ZkfX;?7(RK5W6(He)P3|E=6}IepX_ijXVzi01}D*xaTKivYzAPc7U|# zBmr7OZFQo#9=Q~~)(PqNQ(x(JAn)KPUSH+!bIc{4zhG5xOYyfF%w^V>bnoFyx;r#B z<*X6E_;#p*QJG4|_XDVUrULwljMiYg6L}Zhdq{L;f+W<+Vy*zQ8_ZhlcOeI{KZF!i zL4E3W0ic$E)TxaC9fBctZl@Y9=gz0C4kXm}K5g2|cO%O&`Uv?0p)1T2%#bduJY#pX zbKfjVa7>QgD9wJ>F|F|t#78T9%&O&W&mlQk1^AOf2VV9JKz-b&tqt55(1!$GkG)3d zYV1Ene1TnDf*bHWlTN@kU{$&9V1J`f}*Q8Lc3vIyj0nZ_zc0_@10{CUjfXSw_4cHXSrv%ilcMq}&`>$dJbmP@C+4~&R z1#Bn?+ctpb5>UIGz&8Ut95Y}wbCC_$49w>QyaoGvkxcBrjvKJPNhe^6O|UK4dBkJm z^8RSStpLA?8L);4+JMagcT!5%`$&1? z1l9jq3$>l;RjU$>?uw}2H89=ZS?$innWkq)oc+rAjMy%jolMVqw58xiZNJf`E%ZKw zCc|Gfb8ofs;`$2n<>|WE2eH2(1mGGd`Ts?VgE3) z3;ScpX@kq{>xL%(8OPMO0o#Gq_b9eoZ0Ih4$72SpV;TWc#(e<&u(%e-!ibdUB7 zfA?s~Ez*7b-pmH<2mW6IW_tnMH((11(*Nq;AOV{KP(1^jYtM)2Pb+yI_DR^Qf?9qX79IUYE3GOPij-8#=;Ml&NGhxQVtVlVg>! 
zLJZR9#n4qY$@n^#YRlJRE+Vz@y`lN9_eE;KpgK|;iBusAgMeegDNi}3iUnr^UIJMA zzyTzJtphF?1DpG#W2)ICZUfsoItk+E=tD7)pqzG~BVBR~T|HCP5|OhoM6^G=gG7j& z2Z_2dh-5$Em>QPI?O@wkA|Vb8z8$PujG*;n=xUhimWZ5P2NCTX2Yp29L83tnBB>Cm zWwCcur2fkQYXdpt12+WNCNN2f)X|*a5E$V+(BC!BfGW@JQ?79QGvC7oZf{EV(56^))J7POG0FSKLK=j zkMv=4g6os5egXyp2B1dpT>w{r(|+-R57!DzYa~J>i?#p*IQwzO6tm=~f_>0OKBUzz z2j`Js`aLh|_b|UFSk`jTkKkIUv^z#fw)$QcNiuRJhdyeFUGM76j#uX5w{GOqOMx*-aLr zpcw=`3ova-3jv=$%v%wkV!G=qKzyLgsfO(wR>#uTZUJ`sF9CTvv(+9xb$2#R=aIBr zv>fk^Z-r@r-&CY466vXDaIM1-2WS(N(A#A#elhq~;59s@g|I!G|sE$ z#h`m4WAW6B*I%(IM`lZj(q@UbHvEE4>wt6z#(26M>?7kT%+K-@Dbqe*nbej;%D?MY zUL^QLf~!W|Z19Qvtl+=d;L8crmf%mL|I-GS?j3?lmy1M#XYjN9_=1}>QvU;7B;Ew# zJ);e*yA3!JkOKZ?1FppR1q6h-Ibj1zcaVV6J&!~JUdPY!;}2*E_>V8(cmk?#df0dx zhrM_{f%zLF249dLO)CiLKb9+V$`ZMrpzR>?9J-U1h;)Y_BHar}gve5URscVdyeIe! zFF0nX`|-KO3q1e-qEq}N?dc=u@B`qxkh;i`;yiT@1Acb*{LKU0ZPopAPrU&xg7Kxi z>a(k*uqU%HFJ{VzT`p2}^2k2V;z%|Aw zat<$UJWSkiCmol8=3VOs|K{x}o=w>zHmBnfz=^xxj&#f+AEo^sr4sHW+n@RKBkY?I zE~H-?a03uZj5tthU*GU^_4n-Uj z4|5He=&p48Um3sT1AXHI%?A{_FPoZI!Kg2FMf@u>=ODxyV}GWu6iKc&%`0H(!cg!z zz@6yB_W-|+QhoS1x>YgCo&Yeq4|$e+5uMi+%}sJvuQ!WC29e z_+Fz6X65otp=RbyALyVDv=C72vNtnt_%QGLFkvv!W%s6hGjky11AXKJWdMp@_GacS zFe*O#3i8X_~vNtymfz`~(0o$3Xhu#PLAWD_}36vkRY(|%uC+MH1 zONgN>hVBuYkOI6wLh_%FTnDijgFKX@uX{Y~1HI$}tpXIg+|A8iALdmb=6W!(%iY{O z>chO@V(5RdTMa07xtp8E0jb<~keVHS|Px&xU`Y<B0a_eT(f0X-E zvT?9KQ@Opt-O40yY&&$*prPh&e3YaAgD>Eu3uXTdC%aLq>`$Y+v3IN~+3K{i^WOJq zx&(Ch_-(Bt>>BRTA05$S$&u-G!~peLl?X zKFpn9B3&3#AkBR(2+HJMA83aUbQhr5eb~x8;KS_lVRnOw-G!~p{a_j}{+yS|r~}jO zb4YxH+zrS#-F&N66LX^na$W<|5zH6Ze~avii)m^$fKiuf|G#6b*QJHk4VfIBMvCL4 zP$`|a;}zAlFgIBOZ{n;I1oFTgK^}@rpqbfdF>izE3`YCScgVxS=w^^S4ki~TVVr2rp2W$CN#00)8JjG` zJGpdlM{-eJ`xv@*W~tBDKd1R}=l=yUfNf*w+L-HbrYR<3|DNVd1M&PAxYlMlxN}*) zg_||atH=)|Qz}G0dMkE6@*?sQ@-p%Y@+$Hg@;Xu+V`aD7<2Q)xD z+Xt%d1HE4qB>Y5>Utlv`!q46#fkGGaya07OU4VT~_-s6ACSUR6V# zO!B0?gQpxCJgwxk-v`gdnFd;OoONYfDBBar9P}!;^sm@*HbCh3(q!w|Hk^m)}-Mt z-RG|x{{Gb>^q(4O(88lYd0jl6`a^_nr~O-Avw2lk2g&LU7UZ52uXX!A5N2&}u)P1( 
z_+TcQvnuoc2{S?X0CEKE9sIfrxt#eOS$E#UPbe1#4L3GvpCx;Tx7P1=%xD~eNy>vv z-`U;(-9r1@4A8N(Re)$&g7l|N0TPk#031b>sYqr={Oxwk7>`)+FZ@-7Sb`;W7mJ|# zA)zQ%Cd-0TAT!UQzEzPONZK7dhuPXHp+zB# zmCZTG_tD0;^X(Oz$tM_BAd?ch`R9-5)qaszM5ZriS+mP zRSKXfmYuR+2svfW^J>kHQRK47WV$7H54I&B$KyjGm%07zGF3u{7+e)I_!`#jmv_v9 zXo90n=|U4A#x!u6Rc|teZebP($0JnD73fhp(vP3y=^CT|o!FxpxZ;Xm-zTUqGWmBo z=5laQNZJc-<5br!RA30+4LkrN{J2JoSk$!9X;9B+Q1_>;41wifF>NA~jQk1U1fsy) zlngiT$_>gTr3)S~lu9y#z%uKCkHer8MK~LrittXEDq}@93rTR3?_x3-n**^|@&}1^ zE6K~gG;$?+RF3o$NnW~U$lr03cTDOI#|+M#`Y!Je4h3F?%OR}I)N$VE`-xvBl4mt^ zC;52@HvAftO>0KJ{!B@(@A$SIsrBJUGQ)X4jo z98)QqTt@;T*Wgh=-O05qfD!1}Y#iyn7~|=xpyytT`%J)B@gd@oX|s#pv$6u_DsY-& z8qEv(cq5r-TAN>8Te~Okz*v9KY!t?kYD}XVnqEI+sA`PFMAb<3*-Xbqqx?5(Gtt^O z*r;<~AZQsnmxq6=b4ML4Q>F1CY_gC9w{ufX{vCY%#o8ah{v7P9d-mH;x9@6FvF~c_ z`(ppYc;3eL>_5jo^Z^5I^HK^kdYNwqU2IaobqA-y9-)XmC@hb*Hr>#vu+n`-VWs0q z_yvXi2Ck}szp&2Mfaz&DTnUGt;ZPe4ON@7w^*SK`8Ura`(%nqp4#)JsL8RC-D5QGD zaClX*$MU0!-5VQzb%);hcIe5h7kVY~4WxULNJW%=(dqBdJ5qi^CQ`OUB17a2fa)oK zCMKIN$av1oN8q;LOubbZxfDGrNBYT3E$Lby0Z7JPSGt>wP_|<(0jTc3n$rA6X@Y>X z`o0AL9h%}-riaPj;h4)TemeL+!JljK?}B~rjQE}=l#7FY*1;ki{EdUU)`7a=qcb|_ zWwODyviOzYPl2y*@w%P+*%|S@LnZ?UEvw=Hna}D;Me>!I_P5p;Liuo^5cBV z0NHj0ml5FGApcQCeYR*Zq7Q|fg|818@VuB&8=H3CL;o{|5=?6xaDnPB&DS$6wJf(r z{=z{b##(Z!BhF6#jykp)Oo!>`Ui5!e1w*GBt~JY8q>k+F0LtQ`5rlEYYPr=y>a z{J8*cj=3nNn&M#PfD^mHXATz)u1?)ksJqx)fG;hc@)c(B)BtRSBoWxRzZ9&U`Ha0# z2SCbPEADDC*J^tK@^4h;X*+moH@0MMzQFc2GS|9O1L=ewl_ULBM!s|xAqfPIJ@WgS z{4B?GJj$n(#Q0@^*TGo8wn+InjI&{Up(Qa0%mzqkF{_1iMi1vC>4)Qs?sj+H8SK1W zXj2@uji62T?U4$J^nWtJRANSYxa4h|L7qq}*m~0mc#+L+Ip8bFPT-4?V!rGmU6J6f z(wCSFvP-cg>0Wk9MGP_A{(QBdzkZV76OhU3pwkERELN! 
ziGu!3f&96a{2DMrAumi1q_`#TUYSLLd+aYYsj#nS*{1{C2>T$Q6r`>tA$=Mor0asz zh(jV5Uk$)$fxDgQU5x471bZ!TU+HB2HKmWg!u2z`WKkanbWsu*Q-7h?w~f%j!MTxt zhLf3y^hVB#+bx%w0Aw2ad^K7~|AR42P-(%f3?xEQc0SAfUbgsF#V9Tvr`ln~?3xf{^cWF*i)q)gn5 z2AJ&ae2E1hGa*SAFiQ!R06|@2EM}#vYKf?A8Ev|-xfE%F&lZRlx{q1tYTw8SXnlu{ zAvZIJFS+k@Obx)A1tGwhluGloDN@}!m3|UVrMpaRak5JxyINo{cOKxMu_PzVAf!&w zR6J(t$?JiYOX-eFPeGb>PRJP zALZ$XIyij6!C^B82c2YxAc=rt930%Q_~#jRIVYl*4-ij5^vtpD=JLc4Y^gSlWF8P!0i0 zTE|IX|M5BAiQf{|ui8#pze}v&Civ}~$ox|VpW=AaT;8H9y}^6BA&yG{=DgNYXS%OP zdFQ#Z7?;AHGo0HM`eW{AVaE&z%B~I5N&gYUlt!;TL;Ba*Go(8kohqsG##$=bmhZfn zVYF1S8NQDZgOBfI^1Ta;Rm!qB*hMLIDr$$6L9c2_{|8w~w;ZX2kEm+7zS3^xs}&Yg z9?ZWMvr?E4AX|z^CA@D^-@;&-3kGnSUMm-qgP)_|MiT*zD#8P|3n)z#ZlWg zw8?%BQk6K3NTw^Vno*xo@piY-|GTFbo_11ibIh;S`ENM*-RJx)od05-tL+c#e6Drg z6sO&+^Urbq&mKyA_+j6JhXBIR3a2Alkw9rK6gzt|k0UwTtwo!Ku${=%u&1?iVFSEWlw z&Vfw4bwSq=C&1~vpp$Tgn2T3mxBQV4m&RWXJ9<&2tGm?vgEJi%OY5({S}QWwHl|;sPp|uqz{g-Ks05(U{oLM$CCPehl1;aTuS~W zSe5)IuvuiO(^PllYfD!8A0R7T7^&}1lKAc=GJRtL4qtS;)iH-H@C3krEil6Z<7YY4 zq>$yeHW8ior;&)x`d1-+izZSqWo-E~Mqe+E$r zWRt)Vi~9?l9>XNuQj3d9z`y?wGx?hxa}2Pikk0j`>=Y`PF7C{@EXHiPJcjNHa};0d zR{5GiHFSM28W~bh4eI{|BqQZdr5fJ@Qh)veXb08MIj%2q+-4>HFH}RiWk|C#R3kt& zezds1!6i@);Z|6j``0vl%cgII8g5dl#t$|Fo&RT%fzJM8kf8+`6ipz71l|FoT~24f zb?kCF|4l$zg5iJnBY@iEf;)KY>1`W8XZtw>(7ApbGAvGj009nyQH6d5a|adDS@3e? 
zUG!Aak^VO-B;6{c6~3Z2pGd{|YZsZ0Ib?zV08FGr0$y){Q3J+Tp%Er~lVjckt88`d zyMb(V=9`AJjmkD{YruSft;UwFAyo~$?Dv=9gnINDq0O_o!WB3=Nyd(Gp1Sy4&4-w81CJis2W2dv8?Ap?G`VZeG z7roAU(%)jwdO6moj@uvMb0~)GqkLwqrT#Bir!u_2=k$5&RQg5M>5J%8sdLcXMWu9h z9FFWquS!Y(HojT(tFZ)a0_XtQ zAMcuqvv}2Rn8#*Tdl_S4)Bb`-2c~m=e9byfB&o_i-%0p>)B09h70-9b=m06_jc_{0 zI?u;>^{(h!HY?a_Q?@$h4IHTaryzGX8EI!rLk?KS(w~K6>2!7+guf`#k%cE5fb3g3 z|0&zzFsf>SIy?#(MW_In4e%iXYQXAzcyN?=HtYq)KVb9d%LlR3fR$Z4-`IKty$0-o z5DgRT;CdLPvip}T_K=-+s;iN`=tWZcvmq&+&ThlckX`N;$2@9rC5Tbo;ucw4Bsur~ zUp_(I70}_Gd9r)l68VpS_drA&&RFCz>rnbKIFwGuwozvwk_wS0Ev_U)YFeBYK5(J9 zWln*}6PAenVA{P9(Z)0$dD=RZz8nsv3nOV~AQFJcQxVh&+Q{Q%U+_OyS5(+RfGPbF^tqP+KBxvcDF&iddT7y7bfiXMKV; zd~70)(^t7yzj<7IK5HFU1zg(a_+A`mTgPfEV;wKCj9;aB&0Dsi7j`+XR{ zSztBB0$}eWk&t$*$;cfvXppHPiPA`?v)*_-#hXI8w>oBr#gzgVv^bshgo{_vbSk>b zBN8$R0Ply1cDAX=PU}$m3OJNb$E=BGAd&)+yDaW(aOYZ_j# zu>^_%dH@32uV*5A(5p+NuS7=D={Pm%43)^pbj*DgR~lShi_>XJxadk0-2cnI#WD9< zB2@rB3=thnW+M-vS0$uBi>##6IbcG(N|4n37E>KelEvs8AdIgPA&pYE6b_&p0ckqb z1XK^^nofI>N^Bk*XsZJWwbiFho6-%)#WQpJT}q`6EOqye1V>lcBTXum zxd|UZeALCqDl(d@>u)*f4tQ>%18+5S&x3=^MjN;mpoRq2#n#QO5PaTu(%8ZX(|k%r%I+{vU=R1I?h%;q+&21*B5f2Gp30wb^VyE(NdviJkFP zCYKA)TaFgHNdfDFT}?o3IKuV=w-B*i6p0ri;Fab!8?X+brUcZcauaeHfJJcwrkM-^ zZUL(?(-dq)suCG953*8jqpdb%)K-Z$Ex$J+o$wh(GPVBlsyCs+FZSiytX@U$&L$TN zTdm{HfbZc<7je88$G2O@YP**cTcCIgVVBnYTE71D0E85w~6wMgtPNjIrfC~N~Z1iO}a+VF%O z2rjfFX279lsSTI{s3ifl7u<{t#(rtsfTK--fY;f8jlixWp!Ne{uLQTOXgqz&T!U_y zS#ATK3#c^#wdON`%?JR?;|9Fiq;r+L46OROCD5}C0AB_Rs084Ea3OEwW70Bt`Fx;%LfgCXR)9+f_nVBXP`&Edq-`zzmi$K@g#LffL zhQzdlZ$r|tzg~xc*nKy~u>`yxl3K?F9u00aayrQoW{oA;2253|p{1I= z$xOz64dSnc`^?!eELU4lH$V@Pnbztp$P@r;k?72VVAfeodoa}rqy>!Y4l^D58>0fL z>;}wQi|GaC;pv?Jw1P1!%nSeNk|$ILwD5m^oRvY12$7vKt4)T?6Xt~4|7 zbuRH8IlEz+w=O^9BXTb!?vFtv10u5p273e84PdpzKI_9i0PevU*whVJ5JU+W|1Y` z3Tz2TYcaYVc^dm76(GLby3n$|8Nl$EtcMl^D(r^QF?7SswU)SiZGt$n&OGlU{tP6Z zv&2J(iJygtc{a=!TksaZHv`tXz267U0hnvSerzh(kWFGc*wHbQxH_7!_W$8U2OM;l zSQ2t{3na8Gzw9Hi9|AANAQ6B>I@lDtEfs7zO0VTvrGE+g=vkpZi5q_am3SNKJgo^FI0Zv*$n 
zQ3V2PnO5KfF^FancpUbcF*~r|Lbqw7dKHPxn76^bgG2~qY5uzd?*_+Aw5&$}eu8_m zzD2@aBmBQ>p8J2-{F(qhnc%|rC%D##9lqf25%>@i!R;V>NBuw91|9`)UsT{=R3P-80Ds7JeXsHQ4U(1%98PA4UaEV-=e!oYwy|fP(>Q4D9p)KLYe|6wm=U1t8mL z(r9q|qX?`2{RGu5(=9>S4k}(vJ5TK|YTP6eQ3zfagJQD!?}5y>i|O@B?g9 z!F5F zD5HCkFXCX*!1Mt?w2;+NNs}3aF`9a z5a*W=@E(jt+JMsiNI>Zx^aVUhKsoXU3=(jJFW_tfs%O5k0jt>rufm37Lr{;pG40)* z(7gsVqb!ld1icg@_W@3|M5Ox}BGNtNBXSHM3g9Oa=2YZ>W4&=dI=FD5=l^1Kb?G)8 z0rw7I5mKK$jC_rQ=w}grYfKp5=8G-^+-DJHF)_MY2ReQ2!-2Zs5#*cLc%y#T5v{w$ zX=i>BoGSS!Z^D^x&j3v0dcB(kYD0VzpfcZ!9619nhl6cTaE#$3-4Ka<5m!I=uzFQs z-5g5KWYAZ_!y=k@{m>3|c|F1Dt2c)AY^iweid&_}2>cy^zegeuA^d@#%Y)H~$KE|H z^|Dx1X&YEo=yBv|G*%ZuesCJ4>TAI&)h@u4$vlA^iv~wwbEsAyi&eGm1FK3si5!oE z4MkW70bdGOB~xj>oa-HIb4X}eiDKpw+UnbC)Mm|5C3NZVD~^9e+y@V0n~UTyPA;?d zsz4p=mAy{svj3^jKI8&|WZlmKlywk*GS<;efS(Hiu6-D?t*pH=O~GDS>Yyb1U!v^u z7!NJ6SEZF)0t=1E>%qU~K8)3;zaaxiKl1H87L;&b{<-nM3Eu+;$C~8jV|uMz*2|oa zL!Gwo!eOYlx3#k-xR{MUw&=8!gx19l=%fd&}tv(B%sLi zO8kLTi4I`Yr@D+RK9Tn%$Z8+*5STb!YARumNsg-4{upn9sSkJKAT>t06EKSIyJAr~ zuM5VTYOcY_~Yw8~!p2lxwj(Pd7Ep(}sEKX3HmV0eV8(O z)KBd4jyLD|K&yP9vT@5h-kj^htnp#W#j9?JjOqbWd3Cj2n)2q5O)j$Uf0Wl7coR(W z_+>pN?(AXe;zwoPgP%LP7c58xQyh)~&C{OIZn_wFGKwo+&8u~Cte3&X= zip~uBpZGb-xQP#R%m+#a6nkb&Fpa^e>$GW=qwBO`y?{JV*J-={3xN5Q@-SNL3s~SDF6T0yNRI z@PX$0K-B=nF7rgw+=p4@!_)y&w9JtzPc-Fym?eA(uYg?;Q0y{KG)aI|X5E=7PnmUB z=0)U%|5uqOnTp_5<$J+*qiYFeD{A|nOIe>X!uE7sG3+YX3?9YV1hN*914s&9qsxn$ z>P|A1eVEBUOar|M6}!BXOeG&^nh(?vOziS927H*AK1?Gpv8v0Xt|pl(fK(nG5cZDs z%A>8eaI1`U=$LGhuUyuB=+5q^RgN1ArW!sQaOij(pUyb%!0|5SXbzDr_6+cn_J4nn zs({qbI@496pLL#l1$mi%)(Q5{Gft<;riM-AKJa6U^mC+3Cr6RxQA5RG3#o!B=1r$5 zCV4=&?t{j3H`Q?(B#FH^9ZpF_=xwA0pxFIA+0^o3M)@!;!4&Ck#-FMpmZPT zd_b}Ld$OtH!;JA^+JK4G-%7SN7*$#KszS+>SvRU)Lk|AG$~?scaG>#WKMqRt_D`os zGE<6F6xXb`R~@Tv1XHZrgUdmr2qNUh+DFv({6$(iL(X_zNc`O z$~4kZ?I6+>0={YFn?>VH35(eWrYjhoj^0Bqj)Af3|9Df%f^q;|3`pDZA*5Rj5b7Kq zCeE^%YNOd4GgEK#Dw>qHs#;y-P0-1vv?cHY&bmQBJJ$P1uebyznzJqDB{1E=Xc_(h z=^Ym{$&|I2SHbicNBuRIKScV(1x+?(0I^UdX^^gqnduaBj&=M7Ha#JuLHrTYFRtGy 
zrkwVF=A!#)x>4i#95H}a0Om6lzi0INAF_9xOSHch(Ji~{x6h(2!BIC%HG%6L^RxAH zkl^X~(HQ9--;c>M9-jQCnG_uSWF6=upJQ;K0n-Bq(_#}TaG*b4oC^L|i~j`tc<>rK zJ^wd8L(Y6cE8CN9OnMgNn)Y?BNAPY-X7fu9Xteg83X*_nK3{wKKoILGAU)NPfqn%y6_;q%c6p1)7=nU3*~ z=zr$?QRG=zKfrX`3clvI{DZ=W?saX`?G%oJkPF~_E1yM>lh2T^AvW3u$6@#gm1%eD z=S7}{brNh@u*G$|yswA*9PHKL+&|T^6(;&<#D5rao88E;ANPJ`^ab)S`g5#<3;KAv zJoJfFraAU{2;Z-~j*&xNu{BvM118_+?=t)elaJ(~=ccDiDD-U?Px&PcV)Q%UvAq7t zD=&JqV0aw&0w3D5r|{`ut?FL<6;4#zB*?x?X@&m^83*Vu^#36Kr;KSadjr8FM}7Y# zc#0QF3gY0L%NXr6-h)g?EM~r@QL{j8ho^W%5+M;BO#ETE(zY!G(>P@I5f{XcKBn__<5IO$^N0lq{p3(RrzW$tR zzAXUr)}RkNu_(3(*>8N9NO{2cRBs8#J)z3p$C>EeXChs#X~Cj2<~-YHQT~|l`TN7M_zR~sn6tbF(%s}|x z9m$cPd{ECE_W|1!0?ky6XU4%Lfcw#hyTFH=24+?Pj{euwDa_~^^8-MQ>;|xSoU+5x ze2dHm6yUWO;xGSiWdH6O&IRYB7tWAsJ5%4mPmqjT;Ab0ahI|}BG|R3*_}~0a{P4hQ zxIF59(l^Kg=54@b0iO>zl|@6q?~wVRG{?1OVr;@VJmhoOABO@RN8UrPMMHsp=;G8y2A@c*EMWB*-t%mp?i18QT9YR21e?v0j1k4XP=2I{gZB?#iJg6dnBZHOzDor*j z$qqyxK>flRE2A*R%H|{zj<%i8_mXTTpFyTOiTsM)qRf;UWs^gH;w=kNujNzZT+91Q zOb5eTrhg&JCBoo|2*nE>p(C2ZaNtHwX&oBq%=MUsX^lS)@^wTGKy7CF^-NcAI><7R0 zqtNcra)Alpa6dqOY+*OR&nJ76Bw~o)9Hx;waUwd8%tDh(&%F#5m?P|sT&#;oF@mlp zC?`|*0jBsiS#;b>*fPO_d#FH%5WVpy#;QOaUQ>{$3YaW_sB@UV=2gJT`XvJGBWqz# zAg`cT*3!=;Yw0RLU|r0t?|0oQK1DNUZKja0ljMlfK9?k68Tk;Jbi7OZdz;YcknRh%1udnqO;D zF@M;aUxayW%+I#rnh>g4R5;FT;R-CZ&XsL^^I?fe!Ovdnrw@Ko@KeV6X@QSco}bMB zYLzV&!ikRr*YOI68be()~q8NLL4^k&fue@2Z`@BZ73qW0uHZ zz;z)~&JsBv`?h>b;8>lqWur)hP5wg1JZVXMM!-Rk_z%DnmV|WwKtj5DJ`(--UA^ol zku@h^o(2~gev9a74bu(SY4|b9`QvH)@DxV81lgsSiTuNV_%r)FBIXT(J_`|zfCS_j z^qOSS&t{TIR~OlcWBKF7o#`B*OoH3R876gsW1a)25jC2MG@v3C69}du)sASL5dV-W z+CA5q+=Y&L);gGmgT^?hOfcau1l#3|_@yQre740e1m6sNa>zR93b@-D9V|2HILNXN zR^Xr|4ysvv53s$@h+l3}z~5o<8^NCsK2*ax=nMGLGdfsd@+sjC9FVa4F`3rP03c=; zI<1BORb~C7o3!92B0JI3rAgA)nBjK4d(rYJ7cFC*_aFTBnNRqeFx}9 zFA4K5w$G7-7T!|G2K1V?($8huq6?XZG~9%f*z0B$07wP9=07m ziNiTFpD+9(^8$x;)XQo<*mh(kY*S)#BM!VS@*ZznR2MF?>Unkx%q=!C`MQC`4waLB4F}H)OMpr)1KfA7=j9}U$H-d|PBF`z}haqh4slAqG zd}_f-FxzpUfueKkZ3P2GyIV1nVKY(NR@!u)ZH_#I^T!eX*Mrx?dstfM7k{;2oMbE} 
zE3oQToeV3HkZez={|l$yB3D_oU$2XyTWXeCB67AJB06HWLY^#2q@bed5V#JEx@Q=e z$LStzxuHt@v)npU!$zD**BW^WXR*2`o9Sx%90pYmHIzP zDUtzIMb^-uJ{pI^oiV50vKZa5Jce$CS#1-NubW9o{yQMK{)7tpU!xvo>ug3h14<<$ zfi6b2#?5G*$)4w!wKk(E;GQBOoeQcX>n#cCZ-In#osj1(iPPqPHY9GaB(?&&7!m?? zN4CW!af1m!V&)NVfXxE;G$eGksEN!#uhmuhEs&6|3-Us|`JWDn*?=ko+5xCLBn0Y- zY>!J~y~&@;mzFGv0Q3JfNNAI)jm$;Qfy0qL$aXB<44wb(#$nWf!`sPb1k4-@Dh{Zj z1rUaPA7Bkyo{KP{M<_@4mmcSR3{0#_b2Rsj1h+ZqA z^lfJrp5v;rB3?@y3z(y}3uu$QZn577xke;*fLH$4ZP@V9l^kmB+*`0Jrp$FhX0dht zt@_#Lye7^=*16hxSm*t$^QJg`$2#Za5`O9`x?yS9b)MjEvl~qAOvfz2ho-cy6)H21 zMCJtIt-9`Qhm0ttjz%kE%;{Ao9cLQJa+bwN)wm$ro6CSq14A!3AM?+? zE4-_wOTgR$W|7YS{)bMVf|%e-j8Xa@&mLy|%D7as%uGsg!Ptb^a#~#`>Js#rbsWTy5j5bIl4l zSO4c(=Lz`MO^P06+7vt-CmY|>zyoz}%!M{{oYo0OSH4dG)Fs!yh*q;OXZb5IR#*H4 zBA4u2VAoRi=&?!C8Sw-@l+9m=H?1^-u8*NxZ3bBK@;8KcR9$BpyNNG62y3VLb z0M`92@C3jKWGxaJyuw9gU4TmmHwYXM_lDff5Y;JZ0P>8REd3wJ)*;)07%ST^K^!1k zor2CsuC&3W?@Vy%h9LWFaNjCd7<{&423y=waFXZNo%Ub;cdxyaZGt=HZZZJ| z-cU>CE5L6x6z6Zo4O*x@cHyN-0T{$og)RP>l9Nk4HB4N^KN)$fNP^A6eRoYoP! 
z7(LzMNbf*Wx)I2JOEOl!XU}p>SBpCi?sAJ8X>mm=&HQ%1Lfu-xa6QjY3{J1DOD1dRpA~;QCwKRTjq|k1w)~`pDdD0@EGS!;<+0@C%U9 z8UG@rH+pTWI{*L8w1Pv_{9hA8w>DZA%Hh_y>17!z$OsrJfTq{4c#S)VW;>>@#r+L# zip6Pq2^Vh+r80*4*rdME`TrnU=``vfms*G4;IJDGrAtFzIYWta>9R{K?gwxKEiT>S z;#WA6w=AYP<}%CXSHSxrqbr!MNI&aT`cXKQPV@Y?GgKvYwqsgZ+&|!^S)8W1aM4vM zc&tx=vb3~BegpU-M0DZN4LRRBls*lI(rFg_eg-1B47}DBcM{wTi_31*kR@RFw0i7Fffq>48W08+N z0-?g~7Xm4kfR4(iAfRLN2T=sFt`6{rhCBkY>+BYJuuV2l1T~ynhzVDfF>g!V*NGZBja0=Z#CJF zs1I11RY|b#F}QSey8>y1UfE0EWJckE^cn7oH>3@2j@p{iCVSm&4}D26T@aQbdku(0 zvVSz2RdK;=_pW%5+wD!PB&?6A;A?qDBEVq1kv8QB*9LSP&5uQLL0i5kyh3SLD6EGiQ^@W)^?{ zynMXfuD8_L$UPgs=Bt-%f}~jMqR|k>w|^y?Wh5# z3xpr(Z(SUG4j3G8`C}c^2_sZR)b7MUe|Mpr=4mRQW4og>Wy(%9%C?c6VSvMtI@VE@ zcg9hbjaUEgt(XD-^oxgS7ODkwjD;ouiiwNPV_cYeFcDJS0OnzGu0yK2_EVTW2>+Fh z2kaMeZ_?BSY7SHf=sL!Oj!=`4?X=#HgpY@XCY$kaq&3(OHXnmJMTy-3_Z`B2Wd=Qa zcbg-uy=Yya85q=ddnxiYt$#zp4K6Yn7)-YY8^IP}P+P9peQ>`c{1<0%fvIl|>Vj}4 z_Oy{r(e>XqwEhbTH@Mgou=+K$1{(t}!Ju|CVDl~9-w6N38NA0dum+ES$-$sDtjmxe zX#5Z2+av$_gp%fFl6M4q&}3tXP$dItg=-Ge8;0iI`u+wBQ4)2{a6)w_ zP^Z=HDkPtPQqb7~(E?Y6g{aANY(CtU_Y>*=2=7u?23BM>fIlWO?NDQpO1_t_%Dbaf zWxDlNB|&8QI9b&)wPB98%rwhX2n_os3#bZ^BI^S5%w=VfJ*vko~2I9>ov#{ zB1^@QHa0RoSe7&RB0j@)%4_}-JB>ejVcC|jC*jO^xar7J*gA00Kw7-M$p@-wp&mfb zVNfgHwa7A{y5$XejGpsY14h>a*TPJk6gFV*4;jSs{PSOI3i?y&0h5BV7DOy%i>GLx z=EgCicRcm!<_!Wk(K*DCY#ga=*<9FsSS`s91=ucdU85mz2JnwIiaEenheoj|#Nh=Y zWsA)L>xgtM!4WOh`PLEdu`WGuqGyOBMH5}~vvuSa;Oj#jxhKTvgCS-2nxCvA(zOgn zy!C%cASFlR#IYfcIOAROtA+0XUIDD-cv*n$1$SHsTQJTwzrbn+wS{$=1zMc0LpJLC z9|5W`3#K^ccMBa0^a8U$i_`VUJB+#vn8392Wt&NY9*YSD_o)|=)pS_^6SEXVDG)McPobw^OArFbTCGP+Jh zYSNCQU)1wNkJA0eMszR0O`#DzT3N)hw5dH72*MvGP!HTFK$_A80w0kz#sVqy+818xm@jPmp0sa9 zdo9^&e@2Ld#kBv5_EGgwJAeiNni4t4nIQ&p0CwAmG$S?+_9Kd=UZX!4K$iVIRK7u`F6Ysxc@T?FI^D(%`0-XRF0ca}aBEv!)bZNiOwm*UPEoiSoj z>#yw-Ed~Q@;LRaBD*Z0O;Tdkq7Fuh?E(e22^2^be<8;r@zol&WI^h18E&#up+X=aUOoddlDH1_zg z@%;QTZE7=0CSc-Rj-ofy<|E29%v4r&j9+#q<*{KlgjGa(V!3<;{aM8LX~15t9v`Np zjIjnM&`1V9u?8oh{5%ZaLYvR6L6ud} 
zoum^t|IlZOtRt7vawLx2L90URh{{rML}l}lSgMBs%0SQ&{WX`@;VLsf)U`A0t%%ns zYel!|7`ts4lTCegH!=YQI!c^DYpqe$fvRzO=fwVgHx z#Bknwd{{BgykR*Vw7!B441_Xn)PtRlBWA+ zNom~mArSriwk=l%?u-S9T0tahW4hIEWKk{W$6`IWhf7s63^X|csP!Y@D|f% z%PL!&VU?)|kf{mSXw2&zQoRXSDN|~`yvjd36%x@sWaB;R+SqKgV3|=Sbp3M;il-sU zMh)5)A^8OHu5GUjw4}YF*QJKqUs=}Pe|(q$c-jJrx)p#T)4QG#dFIbO&1w{C!(52M@B8^oP0Kv{=vT+Tfewa5p^kIXM_uWYe4JRB4qLH? zNffRLDE*~OX=$6}7-%`x01OnE^nMF^4tS<#h@k@WEIK1}-4Yxf-d|C3xq{q4z@?85 ztA*#2_wxbB+P4NCAC_;P55VUJ;F-X&2VuaLI;hB=3os)Cj0XE9WVkZfuuSanVfkil z02&p5a)4qF!U!5tCSHJ1pXyq;=>-2e_(3A;29uym0}om#FfYXpTZ>R|?`frdSQVQV2MdTxkt}ahxIfs+CY|~vsYg}qJ~3HAh|e|RX%z?Y~c-%0`p1$ z>K%Y)0fnczz`Pt_P6{w{VdB#44MYX=YJkaviN?IU1t>ht1!f(P(yX~0A@VA!Fa_sS8OUk8`@l=>(DwZWvmTx0ihO))X5IM3ExfLfW zq}R*t600M%$1fV4WXg-F*)4Rkh4=~FE%-t}d*^Ls>G9aaQ4iU8^ zz4=5_fE)vKxb*r1Z>dQ?c6N{Bmi91@e4V^NX?_qr3zy4f5$8@mr{RBR4hxT-Muz~ zM^M(0B~*?)C5Lb(KD~Z+mz#S6Oy>aegto-+^e#7f0jOI5S_u=L-sR@L0Mj$T{0}B9 zyF5N@xp@Fc>Cpky2 zj?R8#{R78pva}jU0?$1PK0YkpECNzL>rB^)e%5*JQREaLonWs=5>KZUW}yuvAO7mo zE3SY^I2N2a`ucida$KPw-6JMFQm=c99^K7@D2))wqbO}gO6AZhu$!Mmtv*t;Q-Vs06h;B-rpOq*?bOA4mEF z{dbxl;VCKpcqu@^4X0O3vtu5KOS87~ie-G|E^21m2p&UO9U_n;Pa|t_COo~5m{|el zhye2{Ou6)yX7>>@C%~kaGUUGyC_KH7m>UC7qf&^g9GLL*K4NZ$(QMNB@3zp{`ZabBkqG!3==W>F8PH ztq{hp|4YnV3q1og5J)@U8e~HVw+W+OeKPHYIdmnYqI!CN#>XpwJNpejg%aJL#$Td@~p7j{N7CjmxZztAc@{P|E|6`^I1=FkoeQsq2 z3N&Cgq96?kaimm%z8g>ie}(1Wg^$8(?7Z{ee6&!XP&K@|E=sONi58D9P*Q7>zqHctUb=a~*{8g9qI}ttA!YwF(*wFv zLv^k6zieAx*%Kkk9yiwql+_JY_E-q_s8?2-f$PrUo>As5?-@CjavTC#Wzv7@9_{kW zSzUgs`r8#Hh} z!{fv1wCr{RI@|I4essP#-q-gsI=`a*xwIXj>+j&BfKu^v&Yyl2?%FSlwfdr@>x+4YQ?LCI6v0UUzZCfK7q@1NP*BKDQ2bFI?&4!}7EF z%Rw~MT}m}PpL@S*^eU1~d8$=#!T`Ulkn-T8fX*kHucZ*4w#G$YRzTkc=o3?fTueDP zL_LQxe(mEIzD9*S8ik^LypHBoDXo)t`JI)Hi2nOimy5uZ%w*nk`GjPOcpdqM)>!lY zv~Mt%^m^c_R;$byc->vpdp3=Es}9ura>~?7$>tL3H0)L9+AleZ_c{~Lc`s>IY$yeu z_2ie;CGmY{Git8c7#LnpoAf{K?()~_F8@^hXKR;n7-)}f8EArWIsOK+8|U{U3bh`3 zhOo+CpPv*4EueyJ0=gAo0 zU2#_iI$>N6tw#6>?y1g;@(F7fvbUi?}1@C7w~t0 
zXsEtc!wczaHQI=rO?j$S(1%q*W$z%tRpL^jIp`_$SBVJvh6nVGMxU5X$hnkjm5`l( zqO1}s+>FR0J|1TN<@Iu26;(P>eaU!OQ|pGZ2ZlX!a>sK}qDze;lw3h-#lMRjg5~1{ z-XBmO!tb%nHbBo--g!9xpLyO9$8q@(#QCm?jzEz{@_H1VKg}P>3sL+DBl$e)>f)*D zE}%{Zwj!n~gBBqx-rHFPyh>rw2-lqNI~JJ_GZx3*vrh5RV#5;~Nv7n43I+JcOtT;CN|`OOpkvVD{%>_~a1n?2<6~JtqpQ8ZS8iUiPtSd0s&%%!ZPX_+P!Z!onk|6pdZ*V_v ziq7WCOz`T;-SAm-rHp@pq~UO^E00v)6d>yV%yalA6OhR(Sipx1&fv$`hYRM8^}Dnt z>;!a-0jbB&gJ{FA5yq|QLo)S`-_2mkHRQg>_VxI_ok9hE8-Cvr=X-Dn=HmC6u=HHi zyh^6Opz9R)r${#{)O9MqjajL(e~>yj65ew!7-zU^x>{$};LK$>vjaxX+=V0aK@<;OgC&Uqy(uEaB>Mw>gu!?Z8c7RrtL+HQ;n5hkt1ju8>hix zT5^_adc$aTzo}2$eo2Zf7&L)s^31q(?$T|Y)P`-QDnQQ`SVcTcZekAR` z_uKzV`?2NQziKjR-r!lu z!qx%b)}2>ZNOGacz*ZaE{#x2+(f%i|JrfsYtB|-c9~intrev6F+QVz;YT2C4(AARp z3sM&!WAkeXeiaX4V{b=eZD-mP77~dzgx`=<8V7f?V#aw#+G@=HMBCl;zIMH@kPeiq zXH|a6)Uux4z@^nu)NvMdRd*_NYTt&&wb0*|p>H}`Lp)$JCwslFJgRz6HOu02MV+kT z-+%`O6mLRt7pquxgRJ6?R&ht1Jj&{AhTfIac|60hIsVtT(AybSb8M29sB4&H#dwtr z+!!b}$ztO-$y-HUGr4EErjB*!Gq`&LW4J40IE^}rUDe&^k708(IyHnl;ZRpzHH4ev z(9+qgLhoLEoa~z<&%o=ZpucPCp+qVEfQnB^v6iX5NL|Y5&a_O*^^kBWiaF>{u$R0t zt^U&KmpyBAa^iPa`sU)4bRC0J-EoF_6%9_T(v3fZ33UXZQgk`swPZ*jSvO4*X6Ox* zO@`9p>M#lB;O)f>-oN0qV6VobXOZ%Qm;TSk+axaoZyINqYwBB%KgaO>*5hNX$8l-& z9QN7uCY$Uvge5cHN6tFW@CRZ8%CsGALSHr+{ThcFQ>NW$lk#70j9{af6 zqYOQ+jND4PWuO<~^u{{|Na*+|8srj+KR&L3yPokOj(|-DT=)!n%VaP4VuSbyHoBcCwDEUq2?wIH991;2h^cv6w&EW^9u&}HG)o)NxJk;tKRRFvv_$+ZCASz{zh3n z0@gYB#sU5rxEf$H^u$)uC9sjfe8q>hwYC5MjOSmGLLEAONB*Ef2L_d&&c3d)lTp$E zC2fHN=LPRF+vStje_Q5vm@_SNie);&1kM5;lf}ow|ANuOF1)q*!3qAloC0(_hII0Y z5GH|`d`>5co>GY zDM-1DQ*Ru`hr?CvAs`L7Pbu6*wzRGOjr>bFMXt*4q5oCZ4_(92lW6%a>F1gfI2OLB zdoSP|A`*Bc;4y&V>-YO6V~A^tt-%twW!7MSYcTPE+iEfxa9_eI@>gLC3H%>@{xuc& z>;#c#Fv7n8(g9VMpo{1xU4ClRHG~NsPz&kPuWVag11+Ylu7B#%HWJs?&~`U%8>;`; z(6Mh0<&;9>Ml^0j9vruH_SeiDWy{O8Y$J4;t|MboCq6BMT1@QH4I}Z@~$pD zFJdqy0enlLjzkoiBggn{&|dEUKM*b7X7DnoJ>a>ZeZa8&nudCj(u%iuWnsqMPlF@%@E~9_jb>7u!4qooEivI;Z zKcM(S6z{Z(Rd<0^tgBurZj0iptm0Pay K9NyHsns(7!1glJG`F)s7vGPVuC*YA5 
ziqH?*@p8^|Od*XG^)EEuL)2Pw>maXN2UOk<2UK<%a=vvSetXUt=$O|m^9Rfj%be~p z^uN*(zdswGH?0G|0`0{Ct>en~8&;Xh2cS%41CR?6I1m}Ymv}7mC(K!v87L+S6y`v2 zf4;#3q;zR%%_m)2R=Xn?#bwKot+#2b`(N5kn}v?}_ss^{XmG2|Y6hQn+h}M0cLNIj z<*@vUmdW6I2T0xU1<==ILCa@4vXOFL(p0{jZ8$dgda_DwrjBA)bt|Y-d)-SLgPl(3 zQqW9U{l$oA&c>VBy}rMzbm=YN+JBQ({tXJ&1eEte`FmEm>YlU8b(cfR+oLqsDsRI8 zyyJe}4SVRfQr#H@^e(&t`VxNMO!{AIdPC%Ct5_9}qF80RQ!y5WabB0M`I$Jo%5vYr zyqEo|D1JS)ixuR3~EF0a6lqtHeaOPX0 z)6Ui$d4+OCr}C9JsWM%DPdZF+1qAo0iY$P<+7 z7_RbdL;bQrlwXXlkYl_ztt;Wyqg}6bAQh zwr>Aok_*t4mN^7-p=ERdDyA(q0^72`!xGR_FgkyvwZQY2E-Opbjil;HDwQhLy-c0< z_`XPI6kd(ohbU9+$kd1&*7(1g`TIPI=ff&f|KQ9|1g0(iC}f^>Qsw92BxO-=@4g0Q zVUgvt4KJ{e!I3N}BlzyWP?U66Yzr^r?Lvy7=O>VI9W&O}+F zHQwp(ntLqxBj6hz$j6Z_muN6%Gl=u7Bb#x4436kP+YPyoa*dhIQ7U#YXjFIxaw(1^ z8Z@VxB=dk(co0W^Mxo9FJ&^k;*O*cHNV23d9ZF`PFD$bzhmu7wQ9%w)Tx21gN(8kH zNv#1|2t<>#B$zkJL~MGUz}EaAbJ{~F zI48L}et%$!NcCcrL{L%#CC}oW_I4@h20W`wfxlLG$M)CF!`5&z%mxf=+a8Scr2T9p zVA$h)dmozI{!!OFWDV8?UV}mH-GYw=yfM_^dXsAnR)X1xL2ac&kdtVCQ$mB=OeO*5 zSbMc#pHu#|oeFjVZw@uM!OXS>D+6u9pmw*RNG9!XNoepRQ`FBjv#h}s*ypjQ4NmMS zaM4>s4Q@1ZtidWk?_yB9*jdP_w7)H(!H-Qg25+PL2YDWPlvm`yuH}x-#3|? zt--25TQI2IYBaRoaWEzg@$#XIeiy(8F;O_R@7;?JQrDo@zqF8+SNeTA~~!NcUp%V z(SAHpo<`gAkm`f6$ym4w zHUe7*_9e!G_64yc;1(fv#S1K0Tg{a)8Vk*|{!b)LTB}DQ*8x0;#HEQ3`0`gQEYun1 zHL|3o`h4Vi+CPNEXDJKjY8V~EbvQ{K&ETf}0@_xmyL62aeViX6CpqR?8tNFXhNqDg z{iEJYr<@4VRQ}iuzwCOeRJ+=dDAmTbIG|KI3r}#&byla^ogUD6gVjlS)U2~gXHZDH zw8{MM)=6P$`yeEhuZNUu~l}#Jm5;Ze;QmElYFB zX|9=R9hnMzSEwWFLyT?3j<(HiS4*aEfcjSVv~SE~@BA6r-EG6^SFaw}@!ft2kvoF_%~;r07|k(1tQA;KbWF z@J^T$nXVZJo56TUhi$`pFbr-KvZ*SgGJX{+KHV{6t=BmK4_EXWr{68NjAn&8{i&H? 
zotB>Ga9W$vjYuqWo7MkYLcGq*bj^6{bz9)hc&&9`UcXQK@~asA$6TLeoxYLwOGBOB z5?J~CA?coMJ=dnuBW}mi{t(=;wD#LeG=I|}^2^}Wzm4JBktuCB7b2fwEWUdq z*qdV8cWgrccLmU9Bj9s@_|&=pLoLu81086urFt~7J;Z=}nq!98fV5Pr}+Kz&wLSX#E#46}y2pr|{Bwcg8cafqW?7(R>kTKVSE{zbY#8__LDtousf zzCdCQMDu0L;K82<{x zdyp9C^?wAf&$Wh62P}y*tn|kK8CH6Ozboo6k%)vGm_D!%zQU%39&5 zPFBAT^ZW5Kl>LkX-M7$DQ14mmeA$7^d|u3wRl~f;r{yzK&$PVGGza0;1y>-yl=1%G zpRx+Pu~2x5W4c*T+kfTZek!K{{#phG4f|t=AiBcQEKT0yxGNlKNhjRrKG%D)Q@* zgNJ9MCZ7!Swo%IKw&Q${b@)}3`b0!9%3{XH@w8X^)t}v22$Yd)kRsZvs}3RRrlDkq zL2WwG*Hd=#)_dDq15Y>m+@v>*F3D*8T823hh3HDVga2K6nhzaip$D>G4?N!Nb59|4 zrH{?xjou1nNtB%s&^fJ4XDOE~PSZK2jMb@-z1XB5c~M6Ctc(gu9qRg^G{She7e_1g z^`o9iq?O51mR`}<8OMu$<~_`>=LDW^_PIGJz*eU) zSHv@lJQ8410!%d+E#U#iza^wpv;gwre`czj=*CKCLP1vV_2SIE%E#TFKck;jS zr};)=nWeW1lgOS72g|r6UB1xof*mHk$`z4ncSlTX8_9hr{rz;`pOYOrvD4ouWbX+!2pz&FwNUdn*d`-v$7@U>Qqd#Q3m*+fN2k-^+Z#=-8hcq zn%}b!O*IEFIMs5P*X=BzdEFjBb6S@OYM&I>K8vZ`k@o6FP32y6qo(l<$a>mq5lV&$ zOiex#fNQ*&?Rm(WWjde$HtjC{H?m*FespLP-;YTlWyzEsg{}zE=Ar8wvMNVzLh9m# zKU>H@(!=0JdH3==*}c9_m$uIa{y`r6gN$5|Lrh! 
zfRr80|Jh^8rczl3%P&U$fZl1+50*4Pd_VJv6t7HO zR_4rhb0o?l1aLpP9z&NLxeaMS0DeEmck@njgvW48SkTG<)DkGXpGjlD&{F}X4NQ1H z?=T}iVaoaUxFdQg$Jfd8Ez zKLgbN+y<%8H%>@d5@po_%Ch5?dC`{F<3JnY-w zkMd3;J%tFvr&Eck5nu)cm{VcGXH$tu2{32)Ow`EGX+YuAsl?O@KtlqCPKT*5ooohD zVbpcHG1hm2e^cyM?Ch@4b>v^uqWmG`3sV~f>a%7=c80Wt(8d^YVgzrZM?TC#5#QyO&ek%;~VQa*hw5;i+!t6hot z-8%3X%7){B*8Y2ta}qkR%lu}Ul`v<+XhFMI9|<`(AyjPsw9r#P=KyK2-iMr*kSQ^L zz%cl{Px{?C&Ob@DqWv5O-zywkUV@KZn08_Q<;=a2E7uj-z4D1P$u)(1T+tNucFo^b z(OQhpN0IjbqY^1XalVQ1yG;rFU+`otO#|C2_yW% z{9_%{r}gf^F)gK;w4WOmVL8XXGR_Hny~OfGdP^`51sXhO|DP1>F&QZM*DBEalnde2 z_aoq6{!a(?noRf-%fGF+1Rg}ey(k#@e^T(Z$woo3RiHOJ^Wiln&WC^HKONX-a^b)9 zdGo&B5?+b|rTl(m^#4i8H>Ln3U!Vj$Z5#iqKgpj`U2M8}VH|zq&7SgQ`^uFu{?zbm z^Ho4u!%$^=%Yi<(C#38vvpb-xNvN*fF^>B`j9OpWmtJ94W`y25JoBr@-qUp|)w=$V6lkG8wrP(VC*H zr0f2#lf&ngK6|?99+bADVi!sa$NLJknq3C)DvgVf&HNslyWMc~s@ZHR15?W$jP%L; zru)zn!MVNY+0fKq5f{PfQ9@I#@$RraW^r%plITLs^w1}z~H9rp}eBPlzP!CtO}}47An3AsF;L` zhiLjRvY&Epka{j8yM9?d<#ORoaxjh8w|G?&dl1KP*dNCE54-bn`BrX|AJEZ``rmMG z=Xhh_qox2QR|3Bc{0KS0hq>G`dGK#WNqugjVp9DBJz+F#!WxR}Irk8p9 zo%uWfo*V#Y0OgbegM;Wh^BG(_97(~^*T?%#F2~VanD=q+100?V^{`*J|`AvXhF`m;#mwl|6<8pP#mkNN%>iYNg-KNPd&b zx_GL}OQ@8ARmiOvxeZaIHIV|Vu%x?d-tZlZ+z4|Yj{T4Ja%vY2mBcyK8>gNc&KO$r z(Q*0cNOHBmes6N4JzVn^iqsz~nz0>_J}G(nh450FdC;DY?Q+@|VVqcF%ap2>u zKbTDPy$z=i|J@1qnw^%hb~N`4E8{AQ>VrQ?o;L?f2Ji-8YPpEH z^?w%E7Vptm-?LI=1$eKhUPJDtaU~iOmG{|7m-E59Hj;TLet<|;(@y=qmp1$20#SuT}OTl855(zW&h^5aly)>XLil_RuBb z3XyqmfiCes-=&#ASFE-HY$bpij}pPN0Jl`j`~Izz`fZp#WvQp}8ryg=jTh7QIooza zO8o(`=#sqg^OGs*?3!n3ude!-_T%X)=`KPR;!s#uWp{DS^Lz5vy`9qHtmj~3Z+qV} z*6*%+;q-;lF=XvQeqKu(evP2g^r!dqPh<22>t_jepTN%-eLtVv$ODdoF%{tBOSDxl ztR`vy(hG{|dE`aP)d?zJN++o7V`LF_!#m+;lX;YDUa__x#rDJ4*0r2Q#P8(tPsil_ zzA+^wT0U977zamQ2GmSlh{5%tGxaBLQi(15gx)XaRXFv!xUcDTIl2ugz-U;n=XP>0 zHtT?hI`Rt8Wi!c`0Lt{a43!;4idFVG!VMztk!VV_dLTAomXoy{-r&uL(`;@<#50Nb z6&OZP=Xc2ViL3yhu>eoqUi0{T+9!FV<5yEcrj}Z+D_kzz8y?4ML4Xdnf{Orpypibh zznLQVCGeWmS_p1pQfm=-6}hL(NBub&JY!NXr?H0O9vV+%C~8Q2ggiv!u%YO5a?LUr 
z4UHE_$RU!Xu6!AJgmMiHl`m&#sH_;tM_2iw!I<m zVO>=M^B9b>djgK%IG&+FdyS`LGF17nS^C2iL`l=5Hj2&|{e>vhJ?oGsDA%>W%2(U* zrZkt0tzXUKw4tfj=5-v#p0@$q!v^p_>J*IlH>i`&Pmz^4_dldEZKHgWIA_rsl=gYQ zbO$IRs+GWssvl9^OjH{I6;)L%R6lG~f0|r^ddl*{;ctcCWciw~DP?@r^P>FV`O9RZ z;7Jt3mX`%AFB&+H;DDAF)_iYynNJ&j^_G_bc6lK|W}Y>kh4DKu{;oBiO5-|Z#<2}l z-uT}p1LF&~00YK=9*P?N(NJ7_X6b`QL!nHKSznPytRcOr{17F!CIkd+9Z@5!UBX3SSO zrW@x`?VGP7_ff%1nkptu^#{@uG~CX`+5^;~+g06U>eRj^Vcds-wzPT9M%9P0q&K>n zuYJ_}YI;7mWbe0n_rXpJ=q*O?BE5GY#j3p0DsE*JcSP|ERx9ZT;DpM)L|zU!q4{5S+`x(1)`^Kg3vfcv z*T@qIoiI)=PRxR1K}n;j>3V}de&+a&MP5bMR2B+$sm@ ze^<&j;-L)eL*B%@a)Sg%9L7jV2iM#Iq)|T==3cr-%umQuFyW)#aU8hYZ9u2P{V~&D zj&+F}y-Qx8P!*eSLWSQV>k~LpfU-NS6Ia9J;e?opKG{Ej@G(1~QH04J`nPMihzm*&6L z_VFrpuw`#m>-Mt#Yn#-`cq!YYPR0KqOM!yh zWKKKB+<*b?aylQ*9pIl0|D^4sw5`buk!Y98YU`L8wAIN;mj*`>gywqrF8bL-J%{=1 zh#DQhV_iG^jeL^8<7~1s6G#s0e7FpUbv8VNJRaw81c$R}t2N|J+UlI4qt|Ch7UjI8 zseD`X^0KYgW!1WvN;Vf)l^;_oT$#j1iWL>4x$`Oh0X%1n8H~= zWk;d-85GAIGQ5j>eJOc@Wxj#A+A>`&(-9+q^Ml6}(kAH0ijrcV}l^Lix%0F#56Di002=Bt1%V%=;hz6HZrX5XX-NyK3)o~)Wp*2O3^v4G2 za`Y6o;X5>NmC@~OnceMP^+`+rGW*9-+x|EvdMcl1szUjPoFTmw_(pD=h-)*NN)P!=~ZOZ64Ca?LQy zy$koX7Sj76Wl7FR>#UT0g0ngs zR>k5E37jp!*-6u_0cD8>RY^mebNl0-kK6*oaXrB04SofShKXQF$@WsH_9} zHYErm>Ts|uYT=qwEw~l%fCW2RFmV($om`^mYn^!qR)3&aC!HUW)2&XGe}PVwbw=m= zWzP85|LkU^qdOaC`djb=z@IGG#e#{QspVwiOh4FKWK<{Q&lsRO9oK?AfYHEIwDvgv zXEH(uT1Y3_?La>R)du=86ynQwFauyTsyD&tYpFU~{feAnomcs8oL5;7bbU}3wYMI4 zLBkn`F38Mq%^(YY2>6Qyds;AY)G1C;GuI4;)q0@g*9z7H9lef(-G)(KO4kE@+UZQ% z>OQ7U&e3*p3xYrQlSAfyC8rypQY&yLS<-R44%(gSVX5VUrHyO4T1JQ3Vi+B0>%k=X zjNkt(jb;50)B`BCN6+Vmg3fs#6N$EHW{bB)cc%@%MyN7_lt%uc#vDz#w&>R}c*RW4 z_%!HAnYL(^^%&!qMR0=a5C6_-Gc?ISaF2Fd@y8?pl~$b*A0e$eA2tO1G(N2yI*;)U zo4vHvpGng}_8@W`rg}w!vZlUfL3Om_E zV0yz;reD!-2HxOj<9Nz+64r6PLr7O+zp`^8jnuDd^aXvS^L{$Uswe2zj5e-00Y;<$ zYnYm7(J5FXtqKdJH z9;It^vT2I!iR+sD){co-XSdN-m)zQ#laZEt`8p|%r>WvsrX0h-`-nn6_VgA&+GJGs z8+8J76yJ{lO~=|~hji;}V>#ze=RV>4*O%_=nH&TUp&ZqEuF) zQp$A{m-6-~O|#1TqI}qSEHFoHC{T-u)8P#(>F@JeWJLLW7^P8 
zvqbIAAaecFENM@4x~__14k94o%KxRe-O3Gbk6&(V$`KmwuGZqN6YPmt42WL zGAtJtP$`$)+KrkHfEu;(aJ(J0M_KPaMwyMP$%RI3?|y1Z=|Q|*Z{u6p#Eg8j%GEH zG|Dw5RDL}JU58q4!0qOCXgw-5byPWnN&&jf`aQ-p3mVY5{A&0nW4$~D5T~Xh1#c;K*;QTL8XE%X~x*{8}p>^a_z%(4u z_MD0|unJZFFOH~8$F_rqaU{2;YZ`kT*Z(DewHXuwb#4=w$dL>jX#_}Tg93~Ul8Lp{jx!rI_SgvQ(f}|oe6HW`))U=NL09YSKbO322j&fx}_kbO zG6nwlFs2Z9ogZN|V-CQilNIg#&5#3>D=R9WMT9ET0VGAv&_1w=5+GnPhl!sp^b63f z7SaJkP*_&-VSa*9Li9(f+c4bj^Z;a5GIIVbgE7Or2H z`|k@fyZ-R?M$?W2e*LmC#dA^ow^gjVH>lHYbpo;$#f?#Vj8!}c#eL4?@Ok%p{)UC( zzhIT2LwL4{3~8TNwBNwihqHsxjwQ~Y90#0(K-hBs0BS;tw3D5L{0!6tC_F=ToNO}m zk2Ppeuo;8e%*6fz+q8l~*{kjxviAPe`L7uUwPR%>zXCN&Xt1tR(%d!wT7!Hq$-Ij} zZCYY~gKZvau!d7&4gQ6_2==sxoQy>O0Ln;cu%1(Z!D4GL34`xpP#cNhKVe&h8m#Gj z2}5;Pot)Z^_D@aewB^6jUD7?0JB(k@kUcjsPd4VGB`emlFIp za4X>Q14sQ>%h>~?y2=J%Pb%8bygT2hYE@PzumI zqSZR?B2@v}Bk|EjB7A<_GEHGNU`WgODM&TicZfHH9sP-rZ(wNd{zHaZk)fz|uKvis zc-I!FVu}>$kxmIW8NLIK$S*X!Plj}JLhK>fcCZyQl;Z5SPS%0Bo4~XbpN7<=edp?| zzZD{D?wB7elmWDn$g~unj?@C`f|N_4B7^xJMt3lAz zypOULL@H%3Pw_XSi*Z2r^qaCA4#0_lNG#%fd~jUel4!mn%@^0 zjP_wjP1Rxg(k ztsA>gqzp_#&ue5r8_#$omVpa!U=$Lc0f!9igpDf7M!+ZGwN@#4eIY=6l=*mFWWClR zb?lJRz1{EUNZ25*8xlkfrw~1|Te~^=?}cHlbOVs=kaQI`bS7{(%X)ncz-tw}*7|Tnh|{$^r@e)Rkse`HTyLG0o((vy z4Q*P$>F6XJm>lAD0bajty{-#DE{;rW9n?7aB^O4RbYy4P@PsX@r&KUvYS7G?-IO7qV ze$N`e9`3E^q_VkLraZ+7I#37{{Jogp-$jY$y_9UkxVGV|0>-bw@U=+H@q7Z)%9lyE zK5ZR84mLUkSR?Vw0DK+b^>LnO@pp7r0qTu|-hjWvWzCIqGK=4zv4+%sDt6SBg9CPE zYX8>^h()LVaWS+Suq8j94(N{a`*aB4m9))dS2`JqF)zbJ=%O{2=?9Yuqpmm$nVx{j z!OpXLC;5g30G$k^EDT3(NC4$?OZ9ncXb{XPFpB6LBqsq=*w8W0!I(U!O6BI!wfq=+ zZ)4$DHeJwM(7wbphFIqC^9!`8O<035F>NN_1EbALlxeK1tPc%Ub{-NN>yad1Pb@F5 zpv#b#7X$Vl7)!#Hlv&o`1Z(gW8o@{K!3>Rx^iKgVTStas^hz8V2K%aYL}i&cqOuVI zM>0?({Xs_}Bzv9bNYs0zzPX>smFU^lkx>{_zf1E~Jk}n{s ze=k68LV@lIoIxmhxD&scGaQq{i}`d_n)k52Vp9C5<#irv3a@S%h1^`mmp!Zx1>QsV zB8=%LET|2(HK5XaAu_iN3>x-0{W-(OEvN0}C^$u)jogxui?RYYaLoSz2_{Xqgkn$m z)6XkyxiT>CN*WO1nKsgvCq3A!6~CudNj6W>uC>k7uOs{{jmGe;7`_uxmWE;AE`F}# z>oBx8`BB!TCoQZr9Rya2vXI;U1I{5+^DV1PRUYYQ=_1(M|AWmUTk~L}idxZqb(Mdp 
zF2t*6iD&_pZEQpxSXLB`E;$yUcpegbsFxv#yKQ>~(UJBFKxbdIpC8xW#qoQ6$D^+L zDh1EgPm_>4XneSo71CYzSYwL16UG#=4q!5Ncbu^Zk>=4}X;+{FCisC|Li;mx!&QBB z52F5weZzGyecXE)U2*FPfv5Y`chYwhr|dGd&3!1;fjI|-{ZI4vwIPz8Y2R=iI_XrUs1FaV_lpv6FAD?)|J#9|or>BU5L#svRTbS#k#g-Ot*fyY#& zI}gL_VG(J$C^$7F-ASZY5e8G599QOdK?5g!-GQ2gX|>ElHjrD;ncg4gap-d7L7Y;$ z<1>c+bSF0eH3~ov0fnbI-N^|sO#{p_n2m3rH*n@yURQ*};B_^4DKgsUqX`xGscq<_zqqRA%Uf!i8?7SE z`EyXD`91|1gCebVc}SQezMe)-`l_^=tGm`TH(NcrZW=Km>R&WnhN21oCq<2&^p#s` zJb9poxd}yDA-&CpMM#U6u4S|c-G>BM$Q%}}TWGH}V-|+*VUT-cD_U;)G^jA(l1#<2 zWXk5EEQ0Y{QTF=jocJ+(1+oIep#u%(wg9uQ=Nv)IEr8#uQEDLu1^XiY3Vf$VS(ze0ALkg_Do?h5F7DMVLtDM$am!RW?L zA@4?n7-;0&X`{If$2Or%qxTx*`Gl!%=v=;Ur`j6V>`e{cs=l-B|& zJ#&cYxR8CpA?o;jAt|IRnX>8DfjiJO2?w;!T#LLO%MSL-IiQqYAAqI=pbZJL)7ZH# zz+4_+-cFdE#?B2e(Flu(yt^_0Z7fYsdFK?-3?OAk2Z|@gl0-OUGX+v31Hl$|%$Kbg!RJ@4VbLc3kAM{${=`p!%o5VHVg1dKMY8;~stnRF-H zGWWrZgwdim6M0|fzi>khoGc*id|KZxnBcGNR{~AJkPfL&A&EDfCQkZ_sWl#*SHoOv zGj%sBlJP|7q%s zL1~1P-ie2=G7c2wb;w6V7(R!aIF|&N*8+gn9BKi_0d^R<7robqp+9#eW zTQ`Ca>+`qY^6B%J+VW*hoT1i%#VES~2ejejAo~(J(8w8LnTKIU!Dt!ItrX>*_=Hen zXPAYS09^>ASuqRwHX+l*ISYmbD^2}(Mx$5*Vm9(aLd8v;;XsUP z??aX=IB;sE+)DepNNa8|=y3zF3)r)`q)WP$H*9~N)hnL?p5&TB-G3{o%Qr`?o|V{+ zK87MKpC2SvWb#c+L378c=bCe@0)1p`CA^l>58)%>js*CKlL0@%^3UoGm#0vm!SnV1 zNkN8_iGuU20=>_@8eV2TM=!i`67c%CI(ut04eh8xRbBU%|T)h>-&=|{)POF{DT}s z{zVQUdYV;bUX+!xhxILRtGg&|N5xi@t{?9!ya}aA0BxCG?Gb&oI5x$Gp12#+HvpKsNFT(`&FA;kW(p7wS2z;e%Y;g1+{9|i&nNOs5UvM z=o?Tm9Tm5N--et{Irlj|7wWiv+3l3eg`>zp6<)jWswAF69QAL)t*MpDp1YNkR;@oe zl*@g%x0_tTN6j55sS4Z;xJO(v>GiYd`(ScbrDQXZI*ohP zeKOg1L_GJsy%Ru^X%ibs!H4zvh3De&(j10YtCoS`)z(SBZdKJ8TdK~iI`cr)0T^gc z24tWT#^v~(h=cRh5k-C;dh+S1U#C72cvxQ>=ZpYbCBW8*mWRuEJ178E3qUpFn9VlA zN<=sqh7_m$jFWpwu`+u!X*xD8vjiM`SYKP`Oh65tzW}eL-xb+iNH@xra+SBfqAcaZ z`2a#!>XdTTy!n;}Wls8KBgjFWRST(uDD5aKHOi-aobCaxU4W}i`?}@0Xc?FUcys{l5CBtw z>Xie75w&xA!f9li2+wdJ?Q(KHasI(Y9N?GTo0nOKWucK(K{fjM+D<^l^{BWP{66G(%C#&=HSgMbWqFj# zg+2t`nAc2R<@W^yJ@Gz1k@vyRWielPpz?|6=zxwo==f|1_w2MR+>eqbz`O`Fqkz-W 
zj7uBiA~`i3a{^!$3;s5Nr?b-d0i-FMHE1prSI6fZlkBmTO;RB%x6^jys>r2a&jJ8UuVK}A)pFAZpR2RvLYh#GIlBqUs?H`5%imiq>En(4^c65vO$0gHw@{BrzD(_XEx3Fz;XK$SqYeRKjpQe>lqOGay zNZtj$%}y@k4iRnv3{frJf@Oq zA9dQInB;ki#}j=Uj3rE*1$hI?Y2$NV)+WI%!fCPpowbfmdsyO1d-c=2+E>+T6m7@v zu_SCH3DTt`v{m3pM?^{J81QupHrr8lIA8rS)5&p?%979_;P>U|SgaW$MgQ%1C+8^O zSTCGA#vjynz&p(IaMjublRPF;Z?O-M7sns@eY~5J1-NR zaZs7)hA0!A1DUvzD7sK~xJ=+rA%_d6P9XYg;2uQ3SQ~uvigU$wc{Sj)-FbDduGQky zdI2Xl;iQzPySk!CPWD9PWL?^^R(7SUO)VBWwb8$n13#?G`=wdsUP&t(=4A}5vMaCa zz$#n*zx{96O59JmIw-YDa*fWT_7O$3&(8QJFYS!f->N^=YStky^^DZt zYpgisL{8P)-@f8NUfR&qKT;1IUCpFjd?@PV+}v&AUrD8r*0^t2YPHlBm1!AE6sQI`(bX-_?fAd*rHCmc4%)ttn}{QAM?`A zN&O?Q+Jq{9@g{Hb#43k)1dsl$JUYcZKHkZB;>ojnK9Ewmy!*8;9#M^2=^x(no4P^k zXQWm||Du`KcjkU=>&Jih^ruc4SQL+CoRW0e=F z;lsC9Cx z&YIb0%fVnQ!%}PH)fiXh4~@F`~DQAu&!exhCr*nIv{q zir8Y`iZCNV1f|wfrM6gVtJIcS%AuB@YQ(PSqD6$-iqul%c^~xd&+mR-ujh~Z_V9YO zpZRW|@AtaSb*^)r>zv8#`uRUbL%9)M%Ic!J>Rp3d2Lye&_dl%V_xJX~sxkWqeYW@C z8~(hK(;wVy!M4%utlRk&vdTZuD_%LCkMi9nmG|Uhwx{;1klH_!Vw~dnAc#dqdRtH3 zN2ec`KG$piFD;#3@LlWYvJ1V-*{i2c|K?~$mUey>&RgJVXf*Yar|!SQnpiE28TFsAOYQ%Awz1~5FNIBSqtlD^k3UkaO-h1@JvpjD z_Df-Nty);sxcs1Lo+sT_89wpkf3vm!&GU52H_Z2ZtlzqnnRu09mZyG|VR?C!3p^i} zTe;EG;H*7ki#V0gUid6nDz@j%5Z6`|i4@B76PMVp;eAQ73+b=)QcDIMuy} zaP;pZ9#0=6HnmzV${8|6pRMynZKEn|u}j3GPuGe)8}h^((eAdf{S`6PW1cvEYk}A^ z__$c#W2zY1(ka%}*(XAFb`lRRoD~@jWpS?02jbGmMPl`Orzp6yM%?UvQPfU91{n?(jzI{u?q?|n>pyOTP9N=raIqGNe%LB9R zoojE}u2nVKjEBqHwpXcO8#Kh%w&{!NwwjGDIz?E~o3;<9RbLc>U6_1dENalmc4lK8+sul!Y@@1_ zw+TaKTjG?;wsM~riElQQvE7{B)TSToVVfUtL)g3A7f*K8w6z`TWjoZw*Vf>+(H7aP zj_pw0{HI@gx_PYJQ{Lb_vEqiP5x-w-TlkYWJmH+!pZr9G{Pd$({qv7vvB!79eDI*C zW<4VcT;GZ+bAA#B>h2caXOD>A(!Le%+}tIO*IpyWIj@N*dx*_Za#O6XxlP!cBULOPR>n4Wu)%g{ z+S|4dj~^BReU=NiZa2kW>o$ol3%(Ow8-FJjSGXqn{<=kY2W}Uw4j&a^5j#cc?VBPx zY`>V7Q6PTsSS_mGIVz%N9~5P$KNO=PZiwcepAsK;{aG|KE)`23{3_bzx!ETCaYWSh z+9zB+|0UAT=0Dx5jfW>UOU2@{m&NjvwQO7S%h+aHYTD*L{!P67u$=93;can0^Qy>= zeIhbC=xpQqm$Mxnt+U-rxGa2KoujyTZp`XA{v+MT4u7 zEj!3$o7?9nVM_P56?So8Ntt4a$4GJ7JweES 
z*v0JY-NZ*-T8eXFv&5diHi*Tm$B6@z*NCyx0>!8vN#fTgO@!ftY?1t9XOYlii+Ehr zTi7mziO5EuiyOPgh$}nZ64gUf#ksUbV$9J~qC!TdQ#>wTTMU^dL}C5uV(iH7V*AF6 zV&stqqWkyDL^rST;#%XuVt{v5F{pkw5!K+djvl8RPu^~gm7vByQ>SmW{ zaosGudoL1kTRMr}3#N&KqehClrx%K>{lmn$*pI}>LYK(z+Dr6Y<|g_#w~D~bo-uK}IA7zj*9vgH z&cR0k&euBn)sfEEd-|#Rk=c(=c=BJF{KqG}e0=h~F8cnZ*P9f>W#3YxsgIs7-^ucv ze4irUH}v`bm%N_n6~C*#7Zvv_Ue)vcuKBv7E%v(Mrc}7;`yk)zx>qH=^44wN%Z5e~ zYHYsYrk77SXZt8M_AK9UJfEn3Hwm1WpnmtR+pEK`*k;lG)%)w&O{OyAu~&xkSw;ty z%yxH$FB9r_(f=PRIs51H+uZ9yFv?IR?R5IL*)0}@ZFOOA? zt10xlP>rSF-}+aDmG3@3UfJ%(QO?#es&Ns`wa2JA`QRY%ZHgFAF`eCA@|6TNr|hdk z#c42(clM-+W+ZPDspeQnzA{QRK7Msc|FcngS#fz)oa)<^?a9_%9qNS%?ylC49BNm^ zY^->tO4r+n?#b$fcsEz;@yTk}ZOC<$YTgyE%=0o<@W@M1yIxePdl|bcL0(2LxQvqx z@mjc#(RjbgGd6pYdSNH*mUij|KQK|LR<`05^DbDuVcQPwgMAd{#r#R1w?V~RuHq%b zM#{-I_!!Gq(l1m@HH^1>7fNoru-sVL?9ZQN_&~^J#pGwaSGF^s_F~b;$mU&2bw8~d zKdqXY#;Lvwck)F0h)rHN*IgH;fQq@6@v*@zl4!HP@sfv*Pp*8A0Ao45eqp|;w(*|c z5R#&H%J->je5^OkNme`gxVu^{cBtnoxVc&lPEyY+=K99#&TS60lY}M$ODnyrl|4m0 zPXk;1t$$TGUVD3hhsN0r7{swu8mn`)s!Lhd!E*^}rw=GMh@yJD`eGDkL*sZud9Aw) zfyT!khRf0N`h3H?M&_D}f{*6+jF&vkBhUuwUCm}E%IiIJu4a$oh)*x%@M{c`f6v^5~+j1g}n-Xywwp?0cOPwtc(~{I#r|689bizvV|1G~NeDr*2r(3E0)e>gg8GVhz zocv<2%esLcxC`E=q8;GWB(-AG{|^=Q`b){|$C9CwjfPPh_SWbDYnT{GO~a{+Sq<{f(nq^qS3&j%|)sdwt~Q za%_!Juk>+uIkJ&s^)5$xf_mi=_J*4Q3FdEn3VK^Mx|S?>kt%C$zv)t#;Uzi32=*1dyT_wa-!O+4)$M3@BnNLkq7uM ziuL1+9$!OIlP8`J|Sn#~|bG4!z9KWkXvIb2rkpz`O#ij$EC3qmbv* z0oTC@(vCr23)&^fL!kK=WwoLD`N$h+*DB;a8hyrKX3Lw=YQkQ<&iYG?dStk}&gzYP z7r9-mdW3hOtbPgV5l$ED(>OJOUwXDG=(mHP#Hb0|@aIq*ak=TNcZfd)>XOwXxAET? 
ztsZg5J{9{s&_7;H;Eh3R5b`YizQKQ)n@({KG3GS&g{&M(GfwEQi^xo9c}vd9flT20J=RIkKSpLK98ofc z($GOz7NGeYOBhJu2|M7LLZNQt7(p(6QSX^N9s5oEtHD$TJ-?1=b*H9i{8egAMn9d* zSCDhb+z44v`(WZDu#ZMQPaMBRVf_mGL{2g1FwXN}dbc^sckF7yL4vzb-nU>Ul1(PK z6#H$kJ#wyA1P$f|D{CRevgoz`iT@AC-y>%se@YU5SI~MN`4L!yBDSFKip*~fTB}nu zuYX$m!b`m39B$lZXcI55RKi9WYZ&X?)4ME>G_l7B<6>u>>TWJeh2EUETa-Fxq)I_;X4!Ad2UTMJo6LFcy_2J6{ zlaSY-uL!0i|An5HZ7ltgU1gk(l3RaqqGM+ToiC0^e=w0E)7h9HKEqjxpBNZ&%XC z7#9OR8ODHsiSRL{g`cH@F>H4j;9CIpB>oO~AG?y8Vcg){UJtht3R{Z$E6hWo-{DZV zS3;jpIjqjZ+mP-cdluuN`xL(e&|N~GN8AJC?w}3qhyQf^yrR|Z!SK{cP`7)bpFkWJ z#_%hYy99kT}@7L!z48#!VMe#-SeP zED8B4K~45^*M+3xw-U5q4*-Mh>hUl6eS}TPYBJMINMGZ<@Now*(=x4^O z#}|SD$oz?ykl(QTxHF)~aaO$rDW_bB2lyMoUqZSMr_Uf;4#{bfF~^0N?P~G~>_gE1 zhFzqn$CrVD*cTIDgSdg%H`0i)$eH-_jg62o6#pi2N8)&kIz*+>T(&g%I6ugvIGxNd z$F=6&NK%iFC(|;@tcyo!ikch=_J?r{Xn~P$_=RjIb2jot?0GL`sa_o2CW zkefw6Z&^0}2k@Iqg(dKRNK5^&|3UZqp`Ua|$#iiz8o;OJ&8B;P!LKL0+3;1Ma+bd# zVZ?t&TpRdKArB(n0{(z~9R6o$(XYtmsDqcILz1Z@13eROGJj4!M1^k!e0j9vDRKhw zGJfBq@5pP%A=$1874uyaDrNPqrg9pDuCdCphBaQ`R50Is&AW`=@yxz9;URBIDz3T#zH@yo1(=G4B=S^AvqNKtX(G|3qj9cCt z@Cv-`9bhJ*bek2tSez}QP5h#BJewm{si_kFqsZHgS`W^ z?;(#S{wSE=-Wi=P$p*n4q(bu4b1skb>agP{>%b=ox)tOEVpv}ZQyl*lLE*Ubay zaS2Tt33d_r1_|sCIyH~@r`|iwTi~dU<_8Ymdw49xQWGrIQU;WIH|Gd+F7>_?wgyM7 z_6y(v9L^A+jq0fsGXR3}^!+I0ryPw@_{C7nTcoaq|J)2|f?+2g@(p^)pf{Ytrueu? zHMHU7Ce^K}K7}EJ!@-7K=oJ{|z{Fuw&m!+5gEGubQt1k&W=B;~&vR?m1NFQk=D<34MCH5m^fJIXq6al`l^tVw^+1oR5Az8Ms#qiGe8! 
zxdncy_yxnja11FXQ%~@Cs@$_YaVN3Q#;*aHrh%P_SI{3vR?_{Yh0c*^J|-eNUhX*x zk0;c|(m$jc_zgG|Od;YDZLEVI3_S-TSAv0q8e*Zc+xTg5EFeP8!^d-#kaJXS2U}7R zTje3;K|S_u#B=aNcKyAt9w3c!CP2ddM@V~`af+a|lpRVj&r;!7&PX^ywa-SjK=0v$qU3Ztramalp=v?A0^n=`7;(dpFwFI1=B=`9Y{ljFr4@*hW z5_uAF*Wsa$#nQI&)llS_#9Q$%#y$giQX9F?HthG3<*T0}??{rbdTT{kC?Xa?#rYt{ zy%1#j^d_z?L|kS%)A3A@&Yj>5*aA}+~Ia%%fiLWNd$2O{GO%`AwYV2}0{LU`aT~Qbinz!WwRj~m z>&@aP$jJ`1_)B=|p+AKFU3gb(cXb`5Y0lz~R8S#B?L7dGMKJLz@bVbs(Vz^&AsFJo zPS_(;)#7dF--3ZxYUHo*Z$i8t+zEC;KLP!E{69nPOy(l|D}mfc$`NhU-Yg~AY=8cg z@S>zl=_HLZ7V8Om&`vF8^HvIobhcG{&(;d3V>S_fh{C^yZX>kcpx=t81ALb<&w@Qj z9}h!sFc<&Rl%WjkEY0Xp>`&NPMAqf<8nTDuZlWI_;m#wU-gVKJKx-uI9O>qxuYHNO5Jbp^DTEY@Iw$P!LJOzDI)Zwi3W7Ck`-E^@l zkUhaw$?EU`y)Jfnk~;hrpbNpC(#}33#i60@=Fb)qxJl#$$QHG7+=oI3( z2m+cgdqStFY$m6hcR}oto z9#3#A`cL5Lji2V<0X^ShjqTk|9X?VUA|p9jhJQy!H!@i9e22`(@N7y%4dfGGAo4Wq z+!e$!D2LYuJ&<|(KXxD)t=PBYcM1HKic`>UhUa(uR#U+_LRUaGHMlERw`r{u- zULQ%x{79PQ++kNs_`$x|JxOYb*3uu~9fX{LUl#r~;TegZJS8>IS0qmzWC^~nu+wuT z{Js6yRn+AJHYTn=I1Rro_+O%)bDB*2FUZ}IxfzU|bMwU`+ELP|m9r#+b409~c7dJE zhmmR;+i%uU$miH}yd0?(*lh45~mjZg1i;KhTuT- zJYlt3W7M?I!3NQ4njbuUK-Lo0viMiUuLXY0e^$k&rft;bFK@2nMfdUOfM+b0h&VOv z8!{{+ViOU^!R6R%5t)tsj!@I?qwk_sivBioBL1I6scDO`7lQ%Fp)dG8r9xIB*8BJ= z>5`;4i+)2~MjE#LtU&nJA-^h*Sdz@KLc4Nz56w?H-7h#N74u_-j}3BBzf1arVT-6tFMT6 z4y^a^dw}0Lzi%94)Q0 zYHE!K(~!C4u*SlxHF`gKF2UAz#I-_RMBU-Yvu$b-t81%2$K($3uh?D4PVlaF^viKV z7UhsZCNuXOTFZ;r6|o+{gXN8NA1!1xW=#U$M0QV9i!Ote5}u78>uKV+ep z9G2<$e}tZ0B>XT*Fmw9{KlS{}(NtiTx;fmLc;{vWRD~);Q`=GAc;f z&Z0I!IR=e2RTL$ToW#%VMh^9m_@AdY|-P zf}3dWLeiebeiHk+7v*k5&laCG8Xj(Xt=HX@OkZvEX^&cJFtm?hoJx(~Bkv+fRh&+e z`~>oPiqrb`E+pE}XK%k~16T)gR&mcx5!NQwIgFkK#1+s+tMMfWUOByS6>wffIwc=hh&Pm>i$UG@5VxeX250)YB2A#tOoOKm-#1c0W z{Xpys&<{j^m3X$^t-jRr9sWCLbXo8;{=cLDnRlt7bore+ZfbL(EA7IEufy+;98F>fK!25<-s$q%S3{pjOd zoglm4b#q0XiIcM;-Ca?RL^-R!-W9bhUUnYxvz=(1GyVNU%fS2cCUcnym&b) z3i}GM5d0jTVc5GR$XUOEhw$Hr-blO=|785WfM*r_x3I5?ma`^mZL1?qFlO;cFKRs= zicejsag(f_IMlbxS)A%oJ7Q$_OAuTkkZsARJrsHbc^d?DPSkx8E9rH)jm=UF^`wCK 
z#>uk#TR7H9l-+*@=MylHfT^GYZl<#H=v~O4LU4ql3^`sfj zEGuCRaQguBCTQQndQ%IN7}vzYWm#16yNvLF+*}CWxqf&=>hG`kC0v z!#4>12=oT@xwLl?@{m|LixWFaAb$w1byqU$GYC7<@mPeQfHi2QpF=nEU z!_NX=aW>#|JdW~DjFBYat|4j~M&8_tT1qEO0WFX-t3+);e;nCBr!1ogbBvsI0{sEv z3(=pp%kG+78$YI#s2Vi3IesY=!`(uZk%PJHh zM?Y#ddOKME`E9N4`0vlxBr^h~Va_aW;iHQ1U}A|XPno;OFo+5=h&WC6aAy~F9oZjy z1l_`;`lxg06F@U@n!QbuoW*TpRCl_;iCl}!+88fp4BbNxr{YZbc#`q#q`D2JV|PAM zG8;)tW3tvG6rx;&lMHD*K^sWMeN)s_^f~A^()YEHkI@a<8JSM8D~an#adYthoT8NU zcceLn3q&f0#!_=*mVp39PL>jh$eESVmpF3hZ4qGnO1Hg{y<7=iu~%qe7u z1KIiiujc*T%>N-O&LE1&8cC*^v`K60Whx3n-cH-Klcku<7pUM3+7^QSf0*&V#cL;H z6KPW5FuR<%h3%}icDe8oJ5NoJ2ZF;B3ao*wG<{cUYWfb39eB))mlL(U zDh6kwlJPFHOJP1DvqK^G?4fCia-vM&kwm%hE^-cGV@XsK{bBUQ_}v1hV4sV9DRw7V zJ3%hAfqSv{!2UTHMqDg$^}tW@TZq00*&V+?BM;E|ca>-8?{O2?~HvV2qn_X0;T09_O*xBSBVGp%TefU}s)Ryhzd7@H>Xg37WX~d0e5=IYe3>)&PdK zXgH;xF~`JEJ`vt{euBOWc4nr~aWHoz!%*~hkSCKt32VVEbKwmVw!=;7(L!1nunq@p zS9uSqdCxO+ETw7#`dyISm(bPdm4KE~K+~0YZo;D()T~6cbEfb;WPhkx#XTDhE0MWw zC)R$C@YN9PWybl^yzDa8v@V zQr&8P{K=4rM=`SB1UFa2&vEh+Mr%ZZ9mjO5w7lusWO)f^YQ)FMvR@1s3}Yoc86tkH zf+CjV`8y1=!13T7Eiy7v3NTfTk(bb65j9|-gCeF=ladiBE!O+-Dxq`*Fxc>$8!s&`?K96L7fY0g$OPGst9*n3ZqAtq4~@wjB}E!`XrFpSvAur~7R3}T zT{oQ5+H8oCn)~`i67)U6T{;|z}u{|(2BnjnLz)I!7~lda6GPnH<6W$1ZhUi+ztd*f}lSEy%N;i zKFCLrYk<7Bk*fqIN=*%Zi3IdWlopzG@v1))aQJL&ZxX{(YX3KpCzDh$CrLG0x6}Sl&}#>>8Gc?MXKOgGVdk>n43DIL zvcbFfe~kVP$LBGA$3gBe!?S4s_cGz5;O`2qA^tJ^Q%aT0Hq!WNxd(852j_C|pMvk*T){n} z)dHT81ka373&OyMiK?BO%-~vyY5}W_;Cm^my)#%RRxRj=Kd&mrH$-MQ*@8 zfJIza4o4pP*66ivXJQSu;Qt8!x?mQ_rks5({yXT_q3}(Em&IZ55bD{D%*K_yI&mLj z?}5xVsa=3+wBrJLL4ALK^O1u=-uSYg*ZA=B(hf&Amb1JeTQG#6T@=AF3N~WwiR?x& zb3!mz0egFp*DecK2nG9~{}R8E6!9tYb&BPr4}KqcE!ZBuufQD~iTUt^Q`}Ab+u_G@ zJvai*#X8Oa|S#c zB8SPPWDGQAJMF_U@1QBo3FqZC`ybG?gN9v~VBXd&IFFqjxB^d*x2p~v z!}v{-j6nZ8B&lFI;@JxiwsE}2fICU9je`X62XGNdkAS?nTfkF?;3LE*f=9LD^c2xc z8gKA{L@^9C1sGP*$4cN()5*Z!`NU!@n5#{}ZxHafpkNJhYYIC_B5k@GP60b1QUZsW zPR{Eqz11X*-NEqN333rLXt*O*PCLWQ{s4Io^3^ywjTx4eid@7D8qRKJniAiarBA;^ 
zxkyRx%j$`(w<5*RPYP4y5vHn2`+kxdc>wK9GNd=Ay1B3jnKAk~_=> z>_NDX0egYE5Z$3U0qq>~p`gP6rFWuw(RT4E_gP6XE_sCleWEOTSkAZ|!3`H5;?E^ile zE0D*_CKk@oJa^<>$ZVNgZeo7`ZbkNhZ!bJXkTq1^0MMDB2JHafgZFLx`l6q%4d?Nu z920X}5SJKBR-zhI2mBPHV$L*~BV+7p&=0uhLO+yrDbOqie~N|<{Y4`Ez;9#KJl2{P zX1F|d9xQSAF*RAfA)S&jhWqTiPcVnjN`I&%5Eh$+?*8b_g^ zL`Kd&OBFo0F|urMqD`6qE){2e1G+AVv*}z+OrNm|89I__0r6c`ruI8-ZU8Y3_mC zpXHrIUyry@{5pa>1+uVq&tnB+VbvLQ7#`MOdE3zIN|elTOv$xrUI6hW*sEYq0xN>7 z&4O;C=j&cU>@ZnQ6PHTd8SHN8ZScC`R|Y$m01Myy2x^R94Eo=YdA6DN5PKVBD{>Ka zzKI+}-7HEh@4;7IyZ%lxjd!-DSmtTVS;*OAvv?7#ja61Uc}3_iP-Jc7!z5XVyn@b1 zz;8Fn*%z|RBYqdygLt;^EY0W)rbWwxWQqiTB&*NMh@+Qb{K{f~HV#v8$c{a9dqlYX{J0V6gP`JKZ_=-bNF@7QVX+5$U z?1kL~?=|oNLsvUQ+FYUy77O;}Fh+wciGr-eSE4{~{94i_+MG~}crKxqb~Hpgi!@xP zt?aV6@A-|O6F8rOOgmAEF><$P>BvYp0dlbNEa?Ak?f(`|lC)J7UqPP0Li;uq`;d&q zU0xYlvYHlYvznQMVWhZmv|t^2ZV`gCv-p3jh!;GHbE34o!!f=^D|1t={erQ)B|&wJ z0-MIF$GW3u+vC`LUMN}+uO55bT_>Nxejd9aQ9afHeKv903tyShszZBG=_mYWASWlP z4kcrfG^5+Gqlnyk9^>U5nZ>+gGA4p> zro)d{_#F>8p3ceAc=Kh9I*ex_xq^{<3J1T-DKn2d6myPrDQpI`Ezq<=(;7NnMQ~_k z`4LeWbP>TjK=L&jYNG!;P#$kT$?l*MHbuIxKc+ODDg~N*l1e*JZ&Al-0=WP>{vd$% z(eYEIg{d->=diF$c9a8KKyikMdGxIlc_=Aj!HF^Iv4>zY7+CVl1Mqi|aw^RIkh77s zMlsXS*VCk1l+gKOHXV<~kScz<6lh3%mUOz5UH{l#JlXMb@b?DfK}7xx13ONRO5h$U zm<3+tSSW!rSSK9|#XS?Z%OF#j<1*&y6r`ljkmi^M(Z~ce{qe{KYf?hK=@aR?{{Q`V XuKngMma4jaQeXSqmaq74WB2+$=Tk%Q delta 358527 zcmcG%4_KAu{m1_tIOl|VFw~=>9N?%&Btrx}DitagH7X)1GAdLoEGjgtQ&FK%or-N_ z)I;5Dk>R5MXtt=_hRzx**U)USMjIIw72C+s#>y?={9d2uIlu|~Ui0_6t{+}6c)vfN z`@Wz1&-3@316ICrapjSVSEp;5c6MIsq7B6j3R=)8y{PQBG>G>X|Mf2UNMQ9GR z6ErOkvLO>(V1;L2d}#HeiiSS~0vO{fmG31vnpU||)5=$9TGet*t7X%5XJ}ge>6+GX znx-|TQFEE5H80h){Yx~h)vIX-7HeADB7V|9TZ*O~UdYd6O^Z&_H0P=GXg=-E)3o9Q zO)Hv9z1f=9GE39;xU~w+8ApLwO>@w%;u#d6_l46bz=-xtVMCM2GX}@WniesU`~>pj zHLYMAc{_QVrtM}tjxh2Tb7-4>9td;9GaYx;r-9bmO|x z8?W2sTX*9vzS3KK>zEpC)60QC7eA|@5Wcr7`sUd9$k}xIxL(Zde!=L4ReCY)O>?bc zroF)mF-7D?da_FDnmxJs>h&cXLOY>tY7PX-nIH!f;h>C3GOei|T*TXx!-N z`%Gu0Y>BUY;v-WzvZ%NIAB#G1&xtPT8?Sy(FXsAp8@(`w`|8_+eT9aEyI< 
zWGY7%_4|%gPGH)umJ?mntLX3ObOw7d*TW7sJlxgq7+Q1Q`u0j<+5kNq>B-2V-tgT; zl}>28_MYgXUPFIJdm%?^7CReF7>m07qwinT60?jkSNjf*E~-ByUQ6BD5dS&lDy4IL zHJNc^QHO|2Oh#Gx>gb|ge-ewDv;RaFHSE`+0~ub_R*o^+Haa@(@Ms6>h>uL=$fADV zkt*|<^ZJP{Y8d?;ozCG!ZRHrF8DqF#zh~&mz?1q)Vp=`9k)Dh!>X;+dSkxRYA18MD zwbS3xUdW>Mu*h=wjr{6B74Q*Bj#RqgiF?2>QTXpV8mZ=^S3vqJ2g$jNv}oVGj50yf;_MY^0ML>B-2VjyY1L6WUQOA1Ah` zPt)JgUa+Wv=3S;6l+EM@Yx zanbAIXB8B}|9w|m$lNKZyqaLi$8tl&1T z6(_cW=g{BLUdRe2vS+a|V+FVBU)iT$m2F0hnc4oAfRUE|6otfnUj_oc5dS9SDy0x# z)n?on(-Go?DrHQC#Dgn%-pQ--1$%13GvrZu2Q#3 zal{#%G0KL6qnE}XIa$qIR3Nj7CS7elgmCJ;X<*a%54z@37>_ zTie9ttc#!jdr@c8-{X2Qx0?fs>c()V{oT+Fk>9>$ENTn6k)Dh!>X^e)I-zai!f;}X znm~U?dm)P&Vcvfji@NsP?_bm;vy3rUTmL$`sF@-0JnH6!_@^mXDV^i1#EcuG+((>{ zG0KcTj4tZ@=`8BR&J$hKH+T>l9mw#a`q{f_-00~0OlKuVzVeBWOy$U;e&3NQ^Er{T zbQeGW_oBYRb>p~R%=Q0qbWvaZhoK8C>#B@JEg(12laWOobEF!JI`K##P|a%yuK)QO z!kf(6XfM`V7P_S8@|r^)ctKu6upTvfVt@Dyr=C9}s-m64g0L}H_OC`4m6+BS%tc5& z;~ziGyBwqZYTBxlf_%BmxUsmK%xxH>l{UPE+|`4=6#4Hx`!kQ(fA>n{YK0K>_4KBDf(M%$J(3p+CoMWv-tYqe!^C7{}F_ zmoZP`yYXew*s+GUbiVG1kDj}WQ+gN4Dky|}$OaF%K-#w&4fBto#-33 z%nm;MHSnFSx!umWPjdn2BK^O&b_M-9erqlcA*vf&%Q+SZ@Y3bTS3f-&9oyk~5xRSp zWA1h?tX-7(-&?$xz8$|sCwnN>jV+$@AM)$4b==lw zIn0|LV{2=fO^!0-tBYQYZpk_)YVLhJ%ADAiPNz@DZ)qT8OEQzgTRKd9bW8ng=~m9~ zC$^>e^y&C5^_n-Q#+X(xgO$NC?I%9GrAzc)Pt@E^e>>5&noXaM+tS=_jt3eywlr;U zbW5_q7IMQ|J74db>zsQHmy;74*G&3#+}37way$%gEtzZO@C@!IKDwp$gy^|zdrx#+ zlj+lOTbkR>u`s+PJLmNX8Bp-6iQMSc4$pJWUHxSs&_lKc3gQ1gqQ}#(XSk(r}f z*zoxyipzzZKQdtWtBGSJWUGmexj9@YPHZ;A=-+W$oz-gI0Gn$!6bKBTM(T*0TiSfe zm2+2c@jbDv{PTOa(#)~I7>upROb)ME6>)Pb>*rrNH@Pnm=%rXK6vF>KgP$i{ z!Jqrh!p7V<^vf4~RFTi=)eXHAlXr-GkOl+cTq3~_S>S-f^tl%D!3AA3+X%&w1bu9! 
z8Ok6%xJ3v1AJRsXn$^_ys9=sJc>~N6rXh?jy7?&=myD1~X`z+$kCWqPl!atV$7HAL zNed%p><%gM?3fbqGkQX@U1PFleK$K6Jux~RZECaA#`g0=vbkfjX8oFw5>;bL#LqYo zlHEThYu4AyJ2GSY$HtV1pOF@lO)>RI@67t8Atj2xQ{tl3_!-ZJWE+mlc7|@y zcg9A~qD&p!wzQlZlFd3UTNRS6{7!b(qWBs6L$Z60%N`5K_8pf^;b0zKuS8P^dnGLw zhhz)KW-A&(vUR2oAKpRP6^$KJezJdSZa5rL{y=DJwI#$Bli2w zC0=d|M$aY1T)JfhGFHwdv-P*H81Jsb$hkz0U`fc&X0xDi?$eK*(Nj_Q|MO`if{QQP zHD*vVwYkCsnJ0bS@~Jb@L&|tV%jjP%kDuWS$(DR4`*HGgJ$*&QjHZwhjo&FD^}9o| zo!`k0*LMw!_R(qT;9e;8^Fp%O-^mWwuMR2U|4s?1-x`u_`A&AY{y<2H{_m8K`d;&r zZ_Hqlse`>6u3r*TqUbv%9$FbcqcJ2~e_XaRB-?&m*2%?wc=a5n4sKgo&JM|D9+&lp zWXr#kO*A1G2b-K{lATk=EptAjh2iltWLDPpru4KyE*R zr-<{;+-E=e;0tA|>p#4In`MR`t>rq^LoeI=U%uzOi!GLa=?yQ5ZA(!(V46)ekdCK;*doZ?A)ulXWfQXRq6v-(3hk4Z>kMmI*I^aLG58@K2_A^!F~k*Rof? zeD(d|Pd|B^{^{y_E#K7c3vYeChg2ES8P>b65SI?cQ>A zj1+!o!~GPVan1b{-f@kxQ24as`zic-@ySy7-RoYX@DHz#ikehvES|AUtdy2kr$2wk zWvA?xWs@&Hn8TNp$v)j;eEdlCKyjG18}`)c^Y2_+QR5{A)r&1!1o#)R2S9nMMLU*a z(R|Ri(4tjBq%?J!Me9kU9JrxnnMKQo{-qYJ29n^w5{p&@5zs{27c(Iq>RX{Z*`l>V zBlwdo6Q;1iXam_V1@2e$wMRfp$Ia;16Jss zPaYbQWPuZUJd}gV3ch7SG<46QAe2KEIH7AcQwU{{2~p5F%c4CCKF9zEbhwFu4>G_3 z9r273N+BI0pgqo_H9!fZL4}oXhhrHr6oVHuXyfI@Iw*n^7>HqAp%w}u1qNmkgE}aJ z6wsh;1~DiGFIb^Hn$Ca^GQkPm(=GbRTUJ&0oOBu-&@qK_Pzo8~fX>Mb4$2@CoX|Ci ziGec6ged6b_TyRbK?XRW<7C={Qpf-YbWWrkltJc1_CY72Ba$GLK^8-h4#Pkb_|q4Iyj*Fuk;$KAP+py-@{Bm z1C&7)IHCIsHU>3N2uX14a|Qu5PymTg(f=7EfNIDG5A=0Y8LA);+|c_e8wWq+f(v>+ zVYZ+WvLPCJJ|+g0kP9y8>7p+9As5`x`w_8^Sbskv7hKSDgvw9}*$@rgAF}g6Ib?wo zx;mL&D1%Ihg3iCN_TYmIa6reOS$imjbclfV511_|0q+MDtU(9U0@aWQF6jOf8vq}q zgB9A|XPuz{Jka|`ItJyC1y1N{ryi6+CPYE!dvpwZkO2_HurKn6IW=K%W<)IbrWfdjhUVu_&&3f^M< zQxI0@c$2AvO2~sm82BwS0F6)v+2Dr$RyGQCPz>qdfUe)LB;bcUNQ7f=&~d1TQpf-& z^t?_7UuXSm5Jiv%4(QrX55W)lkO%|6rV(g>Qb>mgXx~Q+N+1oaaCk2rgJMVl4Gy#r zgF;Aw{$CM;`U<|4LMB8(=P#Lh@IeMRpyLGp1~{ST71qCnZ#7T^Ua&%YGaCdSWP%gAUuJ{ghdl5=|IcX{YM}^{VBjT2 z26a#bUT{FyZk7hhAsbxK_af_m;6-{2rH~E|=-$O13ssN@Zs^@f4E&G_F6e2ZQ&0)n z5DncgupuaiEO0{C^Xywt2AL2AozF4T&s8u=LGe9wTL4&p(Od}LQ3TV*Q 
zKshLeG_XSZ6U3khlAwP(dp7tX8=TPjI30l!@WMbnZ9zpX--;j!1|DNQp%(JN1ARZG zF{pw(a6|7;SPJk%F1Vnlj-`M~$cAX>RxAaSLl!up>rrM7${_nu*58fje}tU|>Y)@e zAsTvXIio=>6hk^hLAOo^pc)Fn3lY%yFdGCvbjkO5K9aW{j3QpkV^=(vl?gHp(ZXy~nElE4qS z;DVkCIs}!F4KC>2O6Q?^E2aRE1Or=`9cTa_WPuC%%GpJr4oV;coX~S8jX(_)K^i!q z^9}|8<&X_7=)Ik?Pz?o;1OsJkuYzy&Py*@TfcD#%Dky~va6sp+i~_14A3V^13mtp-)f)$5}|)H`yW(8K6s$7l-@%X$54qrio=wES54qrm-i^ed3i7}WeI>L9RVA!{9>NX1H&77#kP9y8xt@_gC1gW1 zbYI5~59N>rPUtFT%ApK0AqqOLWuE~bWPk%Yt|10K$b_hCSpTjKREBcM1P8Piu`Eyo zNzi{aJqABygA+QhVwu1P>EM9&^~9hA(m{i^LY4@MAO!~2u@q1Xg%x~Df@5o0OQ?YY zNQC}1ECp0UK6s$7fQ~>FIdXz0Cy`cMnSkPcDMeK|7?)lgB$H!nm$ zXFgL4e#nO;khc-rFJs!E9CE<}$5t~kXaFB%feZRBWgt)oC6ED5=*gqwPyb@>fqcGzGZ+DdREa>sDUC#0|#_vFazL+Jn%sON*aM$C<3pt?6u6P zC|}NGKs5BEQw{u304X5vqPCsEN&RFm16^ZkPcR8n@>k7_*MWO=skssP!5^kfc8YD35p;Q`sdLxsD^y-KwknK zgDS`aH}uY>W8jBea6ylU_MsB8AsQ;W=dce$Ib?wg`erjDPzNQD0Z!bF6f)i3_u-}KsrQ0 z*EB{7e#nPJ7??^M&;X^72~Oy7(m|+(LhwQabWWjz;D`JQz9qrHWTpigzz3NS4Ly_C zAXGv&L_>EJ4L~_$f&A8O*U5|y${+)xpkpF2D1i)cLQf=12vv{=Q{8;)bf z>g$#{%S*@84~T}|ajXK=LNTO66m&;W6{?{SybuAM>Y>;9kZ_}w;XnpNL1#F71jt7^ z@JMYCK|a>O`wm(LL_?2-p@Sa^APELo%L7mcMc@UR)m$xufiAbHSyyenZ^Nc_ zH*eO@ubmp1Mef7loNjq~#)Q zY%@yPY`-21YkPo;b}w%o0byh*2{ULOpU znW@F3(y>0bp8S(}HdLvZkZsgrOWbPPTh!T)E;ds#wqEQ8w|;~=Zwv;m66J|V)iN>b z+-k)EYLuc2%*0||D;W%2?$)>c)F{8&3@_38318w?N2R>`H-mv(GqFs|#$3p3KQ_mP z%3-#DXPIrUM_)iIW;ENa_DQst==oG6+Bz6G&#flDO*9>SE~Bb9qFHXWR-)BJf50%z z=s9k+OQM}b&vxsFi2imkkm*)K5=}y%<<{dKH=<{{RnG5;mJ?m&);F2a43=JSnNQJcL$GY zIc^=POPA8xZsOzIYU?|eMtNeiv`{&+R1OUi#j|Q*f*w-tU)^Qy+$Ecs|fSUF@#6&JLr?{SoJ{qBF zE0KpsBW$*p$U~zMHtl?WFi^vJ;LkLlj(#v+Kh!kaUWD8x(&%k9?g7(Lr5RTpukYJw z^tKcCW7Em9{)y{yyxRU3noC0eP%^XadB`fWkSrwbe$&Y+;_fq@EFtb*)5-eb{PAjA zC(XG!1_SrR>$`T%wr3*m7HRaZ1b3I|=v^(Y(sXR91y^A@Hr0*W8n3o~NOO)41_N8- z_3batwtJD~@%pyi#()ZNcbbj?`EhrcPL>UKyXj=vaAomo(-E2*K;I^r*>=yL2Lrd7 zg&0sa?iSNAARo>buWx z5pI*|*iWMRBI+QoU@ zkG_I4?Z@b5bRMmU?ndV@eWINo4hFK=Z$xLJ&x}_`MElU`@oM5H=tlI?c(p=wJ358A 
z5^X&)7?@8NM0?R5juz2H=(u<_B)S$I9j}r`y3x(({y3E*x*L5oPHh$K>>3R8#Hn4PGtr;KsUxC&=*~Db@iTNI`u#Yy zLUcR&y*Rb$Gs_~Ide9Q&o#gxjqz|JA9sPSlG=oZVk@e8#mvf1G$aYA4DpUxI7W2Z`6C zUNGWPz76-hzNY0=+W_i0lvzIMb2-!RC0>AfM&dRpU#)L`*{Lts8)IuB{*=VWv(sz6 zxW+hr>)!b`*B5e1mxPQU8~3v~{m9<=_HyKoIDO*2`SvDcL!7<>*@=82PTz!d^l%Q3 z)3+njk&nmehma-6`Z#?ES&w`yPEY>ze0v-6r*ZmPu`7yC5&|JL1%#f3V*cp?z^`2pw6A zE*(=|?p^j1zrozbBCQXz!Pth{-Nyyr-8&dqKc62?oX9AENi!F-cRN3M_z^NmbnzLE9FMa*|A^NmbmzL5jS1m+u=baXJ_rf0|k z?|*&(X2PUif#%sFx*bB^pqMlk0{*FOdWVa)k&=i9Q81Nud8cpQt;xNt7k$_ad} z-}Oeqc;jZZ3EMZC_iCL4{$}RoU~u$tdpeq5!be&L14qofELaJ)V>G{1t0(XWef@!X z@))hP;o3&i%d`OkZyIS?ilndP=->O+JX=2Mb^Whz&9POY_Ue-k#M>KDEwO6F*Yenp z{uT93wQ2v9`?!Y=%(Eq-UerH65Tm-j=6O07y9*m{_n~&is)^so9WI&=s2-Sm(o*dJ zdb^={7)94fN5*q0(LDcRH;z^NzL5tvv>vPeCbi4Z57HUeIJ#Sp*%qrOznx$^fP6q| z>y>X?ZO3r;8|`{EPd}Guqg^gvS{|+Yps}^SheYv(5N2&4r3e#%xQt z9K9@7pLlq-y%D*T{b-1A8+u8szLs!5G9^}R8)BgD|8fY%>PHTnYmkji6q{f#Lwd}N ztU)8zOjc7+OCkx(=A0MkWS;DwY zLr2i1_qa_PjhwGQG#`sQ{n%3)!) zNXHQO8)J&ezMe+>KNMRSS%Uiam=ZFgI^zA(kp51)EwT;QH>MOfo!T+t|BzDV#XTu7 z82D0;{3Dl3R4>ZBj8^0Ru5TpXg8G{gmyPt|{;D4&?$U+=Jw{wck&XL8U+_Mcd(`I= zHy-1gaG#FphMf62h<~g<@cx`g%`y}?5?aEM>Xma}67f!#+Gh=$9hryzAhgVc)WvcF ztR(&?{X<%L7S$eF!jZaI&VlX3|KL&+ZDBmRqYvu~{=}nu*ihh*ORbe?Ho8sULez(P z+og6%v>tsxKS;C{^`=V=NwgQ;s>gQBu{*AL6NaI6#w01fCcP(33m@+eZS7pq#+tBLT+U zNT7Zsz*Myncx)uV1osp8=}3TOah){8>%Eyv3r;Ox&N!DT? 
zHIiK z6Z%1-?g+-(j@c$gCTjrmfJ@(c#CWpvj2jA6n~qt^#{F0vrx)ap*kK*A!=iskeU~Hn z2;WMeY9v6#UIO=z1lXi={7~RN2^dfPY&M<1y)J#zM_e(GewW%F$qB6*eUF5VC*)?_ z-KL{TC+;qnzP8Kgsdd6opwe`*p12BeTsx3kU25Aznkz?dF$>8G;>t}Y%ZIzubh381 zJ4`1F$8Rs)?$UEU=9wK?=2BZvrnwUIZ4x$mSBtyVbo8zTcZ=!RR5#9NIyU8q917g* z(pP-KGduDom)aCXa|P(lEK-)|SnEoe}bkr%r3BgjtVxwP>)&pVTb0%tLI$aLg#b_Zk$a+yotj;u$fy7WWHHsq-;eF!;# zoXhaP;Qo8^P#~TyBMXo-T>3U-HFAne--m2LM!NK)$X?`045f#A?QV3N2T^UP z-|K^@0n`C~!QU1~dZ!Kre#={{A=m6ha#blL-l|*vK3{+EZ??!<-2O48m~(AE@n1_R z^QHKH?61ZYW1%$nG`Rv$Oa-F30m#F?I;I$_t5p(zg<=bB&8V04ciH;7zuRm*xR=Bk zFU4!F>74t;s&{6D*=)Jko!EsoKk5a2M=!4@^xABDaLBkV{oJ{Vt?iBXHK>; zYFA9y{K!i5BV)>N9BEC&YmG7q#`UKY|InCn9DQ2EOwOb+YNCs_($Ei#DZ@d;@7{3c zG}|(6cWUuJj!~PaV{b+O$k67!Vn6n zy)kN=w=puA^j9MYO7JVoE72S%yBSvi(-H*PVF^TrXan6iUL!$H0x3N7p+K;|9 zMy(ayjJ}1ji0(%FV$?p-&Unt1j6-xL`lc8aHw*1UZ)Oal8_}gPYOCmW^o=ok@>>a$ zWE!k)PJ&~WM{W>Oh+h+y0^esnl1 zfVMTG2X#yT+zF}1T~asx8$Go@VZ2ukAg5<2@YP6~H;M6oH`6lFKI~^B=|x&2{v$Ij zhfF*6gOT)N%{q4|@Sd5Lqs5CoIFk1Am(YiJquJ^_hUIf z1$(|!^CZZLNNO85;(6HTM$*ePKmJKcPcZJ4o3T4)s%?p33AQfucKxOQ&ap+zlbg2> z{u^&gLj6P^{4cL%pz5Tee&M&Ak#G;|vj#ll7Nuy-m}(k1B`nU~f&TGKm6L3Vkxya` zV1J|^{C1AblgRN(jkzbCrsbmVM9;I8qi!*~BL`;__U4)T)`0|D2eOpj%!(j!3Rm`- z>d*7pfCqiUOf@v09_OO3m)h#WQ#lbrz2O-)F%!l*o%=Di!`5=Yq2>q)uyB{J~x0q&rC^Ez1VYR>ieh^eJVS)nUUUS zVpq&maSNzaiay;;$-Z5WT{ctS6iBciKzc>W>lOW&)S2p#RBl<$LBM zaXgWiArsB`5`N2+YxzvIHJJ*B(X-72uR&pE&3r`O==Lla3b>h~WVsPS$I~YZaUVL4 zwOL5K9v#a7CEkj5%~abZ-iwZz$rCkkXYx>B=1g@|;_2uaGkK&YUWAUGsaB*AuSQRw zsn$xo89i;L+9vT%^wgPZm&C0LhXT%->WIXX&{OD}#PiUTXR72>;^pW_`WDWBpGA)| zw7D4VcpGh{GG;A>I}tNyJl-VX2i3dbmO098<@K_B{C^BTPL{-v{kNH}Toh)t?Z$tl z?y_3u**cMZYNyr0y-F(gFCxu{@O11^6=@?=g8b6V$lBLqd(}qCv?2d)W@JMH*uSZR zl1W;`J&>7^r7ysKp%&OJJcS@Xo1xZvc|pKquuD=8QQ!sJC|@9H$$yhN-h)qRxl@<^5Nf{ zp|(q|5&hd>PKMQvZ=IorBxhYZ6!?vqGhWW{;`fJk$&s2Wr_Ms+`9i$n% zAGwpBQry;renG{Kmt{mgH$&y5$ugp!Q5z-fN#mxDF-tfb{iNC}VIOkG40Tk(_2?&5 z|#FZ2=KAlj4s~uBHrn!!|ZkA$E_v32D zl#+d?m-vHbDb~$-Iw$xsrDQoXi2qo<+nDKHVw~aZX+uunNw 
z@L;UAL|Nw8yvP!f#w#8LxEoBz$o;tM)y7HWnBOMcb%v8S4cl?W?9eM^hels38Ls3j zxqXxdMqcyCCbD5PLRBA;qR|MGUQgs|BQpH5M=O!5I8kKKd@p*vS~HpEof+JaiZptg zj$3CqS@0su@(yLyI-JCios(3jF-C)ciibe^G&*FRim4h1d=TKbuZy*OyuVhQ%5pk<4- z*xVUv;#suWg1(SGPo>RnbPn5^W{%Er7WdXc%jmq=?4V^n3$W)0t*j&VJkCkdW)u2c zBQ@LJj?M}eV{`-99|SF<^JH=_LrV3tGIK~ z?dXnZl_T1EE*ITswNj{SO`<*L z@fvcp+AcZ|{cN;4B-)RDDq0Pl7q-aWg8o^wp7Zpq@p8q|dhk2c`dDrh)}0?_v%Ajc zg#t6rO7M_|ub2EndpWYsOtEIXWsT=rB2|AKYq8lo@j6|9k9RF2vbpy+a^`y(>G&Fz z8YlN7+1!Sg;=s-Z0LR1HYg4E%lK`5Xq)Z>HA8 zQ}tQ=-G(=-cH%3|{5uy=H7Z9g1@B7LOk}y4TIi-~8U7B#PcX*agum4sEW0mnir{Zi z8{L*u?GYDp0+v*Q&5PVD(Rt>FEwP)-N*sN>v4G#G4oao{$m>~yb|$A6%^8$h#s$ii z%PCYXn8j^jHu74L=1pcf_8RqoWSWp0%nV1e)`=}rUrNStk(@MY&J8 zAbmVbbRIziB>Bx)K1tLq3 z7pW~G>yf!?r^q(sh3Z3*1IQc|nSe~Xl#`%ZF0ufbtu`X<)yOPkF($}efwl*K7Dq~= zMPAbA!7V2{kf7Z2_z-&ZYR)au`u1lM?3qX}hxMf#IzDu=QEa?4(ukcerDjbqj-JE# zxxqGB@ymFQ;`mFnB&dwlVODzzftYB0;_C_a0^~HKBrm0+ql|LAmWm!9tqvvf7HKcq z%07a&x$=hsL+aysmbnw;J(*N3o4_$ODUoHCqpuwM&(ZuMt%*RdnU^K)#C|@SU#vMU z9}0AtdD)@Uv41x6UgOn_5(4dJ-q=~Nheq?MS{s1_W?uH~0qh&2`9)gN6+?j*GtXr~ zE5N=wnqRC{6WDF$WhPs&&yVK4{GQ=ZpizCbfbpG1-gPA}G>qn#XxRkn%)Cr}Iribv z{8FunK((2d9k&yE-)MfB<|vSJfziIqUpjWHx`55uijjA!%?mBFPV#CsnA_Bj1(tYw zGxFBy>d=+)x(E6eseP&~YK`1b%}SRL#0pc6jO3cv%!LHg~$pS;H0!(HBUs=Gdx`=SrVqA{?=jAuPp-05mXA@y?6lW1d+MPBGA$4xXG&-dsF!OUWQBbKZ2bahnfXjk#1AIvP# zl5k|V@$a1)*v0mOH$=RUa|KVbMry`%@Q-O(kDVI34{jb-@zCMaX|nQ(Glmjs8kKF5zZm-!yed!ky@Us4ofItQ+KYgjGv< zJtL!t-7W?Hx%hdu0^}!Via%^-wN>Gd7~Z(lG-Lm4ru(F37yfqoyG%*kHjxt1sYX==qa zagr6c%dENXbnX#4@y`cq8rS!T>v@_t3Iy+R@Xx5nPvtoXvFYp^Q6+Mk zl+ink-yqY{2|UV-UC)dap>=b3M?SFIYOoKfjZ$q7a+_IgnQ>+7CQ!|cN;Stuxg*<4 zIlC8GHBB9rZ~^*06}epIAL(b6Zy?-+zFV!4a67Vcn%XAe0rXb2Q^KB2+6Tw#%EUhv762ML$j#wzEK{3sXxb- zh1?jdYAm#mK#5t^xWr+vGxGYxf7@*bu*GUt23u~}$d>!?8-mr1)pnK2J(m7R^UnuE0wmRgRU{&MZqm@8`Syi^&i@j1h5~KES;$-67Eca7}@51a z!Jn^U&tg}oMV`mrxtU#|1%0lBjfeVfT$bsWGKWttbdj0nOAub%Ii}-YqX2idIQBzi zCi~${vLB+)k_JXzf@mjl=4gbf14LGhMp$OgEj*cyM%Z*Vk(Fvtn)e}BaQ42L=IhbR 
zC2aJz6_;*0s`TQ{P+y*H_SShTFY}v@O{U{c6USK}nKn(u`Dm^hz054czS@jiYC6sY zowy~Y;}Edk#_g%;I2R=07OM}>p}9QdqG@X4Ei_k-PL;6HyGC4!>F8Y>ZlUSeR6j1+ zaPnfftBm*2xZ?bP$LdUU5~pC=vzMSxoyH@r`656qc7D(@rWWieLCYL=V-tf`*6wz0 z91P3HInW85KW?SX0`%NqF~;S`dV-dLHDTvW<3ZTmVmo$r(8|JLXBn2aGVb8fp3})~ zw3&^L4;Eu^K5SgjGP-(fY|ygBR;(*%*5;6<;btpf^1|H^6zSm z$WG*E>H(3CEknG*zgJ{B@&omy$P(mxD)s_oJ@TNsKx7;8Ewx4D0P=OUQ)JRsUg=gJ ziY!3Bq9SvU)yNmsa*-{_=hQ}#y~rok<04%Za%DLvG8?JXpvZFMLu$c=$R^~E)f$nV z$a~cTA{~`-&fj~XWw9+2d6#-9m&5Bz z>!}w}^&e6-3(sq(c(ZCb{uaZlbrn?IjlWquzwdzDtWqzg>aQ=NYV_T5lP=z@nvLIR z<`-5{wGw}Wcs{a$yv|HLcrjIb@YkAEd=#E)D*Rjs&7 zZeY-bhBj}Cy7B9#s_l|<`gy(%<~SgeDglWRv`5zI+F z>%CkTrmB6C^P=+&ZPqKoUp7_6-9xSxy*ikadad|Nr>d=z>qqAWb22{9eOwEssw0xi zLth-s$$f?&e-ZumlWRujPF0&kccU+4Y@(g_bMl|64vEf0UqF9F`_S1_Rr0;)M)diN zNpw5Yh=b_J>swUpgJNAC`D)sv1!Ix*6vCAnofoFv-beekTa(3RRA93E4Qm5K8kjX}g z;LBqr1Wu7+i9DZZ^|(1wGGT)8o=_V;p0S+H^V%`=OtU1Xz{DSOo-j(Pz!khUmXD7L zHf-F+_z6rfi?c$_I9sq`nfPuz^|$|!K{%`B$>XJ~EOQ-;4dXlbq`MtNG=NUUVW8F7fDEF1Lm@AN4cwZf5);9tX7-9~lZbf;m~LZ2UOdlzL@oyP?hPHR8irMxA*`yVIrE4|57_j1!n4 zi~qZN{aQ=xNhw-Xojj*w;%#2!U(NU;Eg#ck8h$Ma^Mz@=S_9^Dbx|=jT9Kcb@umEc zK;A_#4XNmLOYH>!F~gq>qoveMeX_tn{(C6t;sc1ouo+)WmwU7yT%UAi4 zw}KZhoDFijwRaOIlyqd3S#OE)h+a(KJ~}P+YS8zX^$wNrFxrB@OGVzu#C9Vq=*Qy} zbL^0tuQd|(BJX6bBwT>LUG0>xA9)+2k#H0G7Bwj0cI3@dRPuJh1L)1_f>QeA`5A9` z7@O4}e8gtga`7AGaF}B&LtbyDPFAEE@z+Vpe0pxfUSoFn!QXO$)Q%C@VB}Tl6CBj; zC*?&`SzCQ&9rPrc?A`?0CpCR0fo~M@IPT2VZ&#WB05>7|YRS!tG2tU4&+Mx=}RmT((7M#bJqxF0!_`F@VD zYbPJJQyV3miHv5x3ENB1)9KZnJgnBDr!nCYZ$VF`Tjj*N(N1RkdE$;;eAbSBN!*K` z%#=&K06mG0N!*V_Fz-kI7_B_SM;yZcHeGZxt^>JN%=-} z`(!mFx*dIJvP#}bU8{O%E9EP8QrD|q7yTmIUi1=|q{(U@W$d-+S0}5ZqFd3sC#yL5 zMZkQ=v>@(aeNXLS!X6V=RAvh*mRtc+HuvUlRL?m<+ag^ z?xIo>veHa&PgFdz2<@p z)%%1c%J!Q{n)aTuG+6GmKO4pWI>WEwn)wC#18b+K-UiA9_@i|Plx2tIPFp8`tL`Ip zm&nD)r|K52Pv@X4@y zmcG{Tvi)$ls{C`<2bQGQuJ}vXGx3)DUkm#-?6Rn-8E0k9UzB{aSIfw}W#h)% zFDy+}rIC*7)i;rj$Z^B5#jh<$4*%4`|5nH!u+uD+7Jga{9ab4`{0y&DNh#r-mT)yL zH9ThGJ1H(GXWMz>e3HXaQNu%k{Eg8)XKUJ>G{zsv8~J(@73#N;|B-Bv 
zFCWz1$cGlI`RvnbW|M;a82OU2q5i+*{{Vd8PkR~Knwf@A`CE+{`FzHcx#9n28TlGS zhx*&-^=vqk5*^?fU=KNi{~6>$O&d?VH>B=nHWKRl8Ts9C1NHs`O{ekSoBWvn>SQH< z8PQsyX^&B1_^TZq>eEi5w=7mI1*Ah6CH&VTZ}Wej@Fy~-(C8vMKK$jD1rG6N@+S*_ zPW`7SkWAg)8jI%sC;vZ6vZigM?(v5NJE zPq#Fw!6}X(SQ^w8aeLJor(?RMdOpIRPJax#<%E4;~gFeIaEV} z!y$cjJeo#-GhB{rwg0qm_y31Ys)lpJ+s4bxziO$$igE-9@wFcIx71`m_!tTsb ztFyxwrrdvDartnXqTjAD|DQ&wRzaMdHbZuouXD_Jr+Otjd|lZ6tJSm%!s9CBVN6a! zayX_~!?mla!oC`{i279=w+Y1Y8fQ;!|6z-E6?y5uQ++s zI`YGvyEfQ4X}dVs=0qmK!Td;v`K6>vO;d|ry8yBwKKpEmuQs(os)!Ll!_(?Q* zeMr@N*Qjkd;nNoNPYl;K5@(T(;$NI?{70`E*9VR33I9h4|JhNn-bKvQ8<(s1bHbA+ ze`(vV-AJ5rQ4*hcnVNQC_|mWmYgEpK;nONw=*C6lr5geApVF=zuxv-04Y9EQ9sbj$ zVB1p0&v59m#$SBl7h1!$OUOyTB-c%SmUpCIUDUleShtV-_~2B`F{VOO-(DC#uA+%f zTpH|z6zZapr8FX)prFx-LK?~oY3QDiPIS|W%YvPd`X5tY9&?8$r;@s>gLOY9KQ7n_ zPp}h@{XdMI3B1(v_y6D1w2+ots+-nnPl}>_ol?%_;EC5&N;7hUhnf>KJU-xGc$KEB$h*Bl#!6r z`vmD|BybE!I0B9I0?qYE?1aQgMnd=xz<2gY#K4ZmuE8Sv={8t26UKfW)eq;oy!$Ye zGIotZc5Y(4PFKE$@5VrIj1iRIt0d@(URA<3Aeek9;>H<4`ONkRZiL{;Mo_lbz{kOb zRjR@n32=7H(v8kJfFh(YmIBtoh-vO`(Kqb~(t&KRa$9mX=mv|eAdst%O-vHke zNTz#YV0#(%IqYSu08PRIBwUwje{@j=Wq+g7<a*$^X(Z`flbZ0W^1&m%hvvd3VfumS!t@I%2RGL?z} zKH9)zN${5uIl_okIH6xaWG_V87!mnxB+zgpf^XPk+xh!%Uaa#yBJ&~A&WM1o=AIzH z2!I(P#emzU4jUb9P_+Yosuvq`_Y4HmEwITOFapC ze_Jz}TQhze(=cepKAO=HKh;xyn_zJgSPeXUgJvXQ(#4B4^PgrU+(O#X8KVlS0BHhX z-`|WPz}*Zy1G|hQb1jTWwEm=kNI6DfS0f_7EyNjzpNQZa5Q!73o)_zav1~Lm>-AEI z#EggnY$d>WfSGz00B&I5IoKmcgneTYH+RpGO#&jdAX49m$nPoQ6k-<)L>D@UHQ z(37Wr&O|!-l8B#>kGP{!PYq*W9|>MDH#N|-gUWj?aj z5CnLR0OtbCbYn5#D%eF(3C=Wvymk2&^hOS%Mv%Qj5|ewm^&`QO-x-3{@DoA#y+HgK z*hR24Te*N?k|c+D!Dju#(5-`Db!;N20NV+0KHv<&BCs{Fi=gb2jUdOahQ+)UeMuw8 zC2JCs`(b9RtNxM?Kim>^wQ&?fIle@K3($)p&H=$VNosk)F7^mM0KvN0L{Rvb!CwR} zL$CmB9qb|~`&mX%GddDUy~@%TajBoQ0!DGuKsg-K%QjF(rKX(}t&{3^QnW(#MuvfkoN{mZ($+L zEIALbi!O~#4V0}14V;H#rh!HGMBKrK{TurLwx(mEjVtdgA9V+zF9%NgJ9?)UjEUCi z9$O!Ahj{)^TQS+JCAIbE&j-YnJBjSei@LIwTQTlryKrf0PN#DDi8x`(TbM&iI@iIw zDrwEEEr*Sbe4cEVv+=*2aN5}Gsw8=L#8vdL3Gbqz3eBzHVb2A7rD3b_8Uy>?*U`Q~ 
ztrR@zfvW@l)&uJg+br_Halk)V;%Z>urS0WY<2#pYP`Q7lsQVt1$|cjf-l^ZmM(b2A zSQl|Wd5|y1rkb4`tUI^FE2BiSdiYh~fKQkhWfz za)&0?k@jaxE7LuMxfX~nO1dN3p{DPOxZgbZUhA=L*p@@%SEC{Pz2L7yZ!}zt)PESZ z1@@gvjlD<6ER{bldRU@d<*57JNXTgeK~gx01diHmCS4^LFW`L~N!}50e|kZ`GD6z# zZ->x6BcxdO6YECwBBX&|4EArscEH|Q%TQ?yx}0s#!1)WK29AtZ(0@0f&kUS`+aqq5 z2cF~geJ%RF*u%~MyTY&X zd2#tiJf%NG;FspXePoI8j^lM)W+pa_m&-*~~U z^@vP>$k#?h0V)yTE{{kI?5l>|f&DNea=Z~Kzy98U$a09hVnpOuzIUqrDbdP_qO}qC zx`)5U%RPqNuUT$kDvr zR1r8CzUSc=d%1g(`(4W|Oeu0d?Bz~_ec!MfvFBUvewO>Q(v5@Mv&g;Ea?7s_aURVS z0q39y1qA!h3wE_fq(4MHFd_g(A|FZVEVZ?W9M9766*UhX*9r?9i9$cv|14UOhkE`fO)szXo><^B8Z zQcq9dRP5-hXDm%mLX{uO2X44Cu#>&d)9AHYNdE;(#8z<36gL2=jTEnmxaU0V0*~T8 z_CC*I!#}|90sj=3%wpl7_<~^v6TgC$XJA;KgnQ}dkwJa-LgaZPBFDW1dWJwEf^$Hm z2qG_f!7lZPl%!VMjfn7Hfq%{;k_P)Sb`?fvwYP%ZPin)YX#GTbWyHN?amRqIXe0+& zAHLY=kbvVpINpb!iY32q$g!PR;)riJmdJCj7wa;QV>vk9k4?oA{#)=b=}IM2tt8+L z20k5!ys@d5PNSE~F?b%d#L|Bc@D)ou*b>LUKHy<5_Yzkj@xzu__#eQ(1}>8~4)!6# zPRCx!3NR$oORvtI98{ndL>@IFa{P%vJB$d<;c!{aUxxRBUEvX_1(C;%i10syf72u4 zAo3V?^^(@6j9|BU!4`Q$T0>;55fT1x@E?QA5Q&4m1G_p(>%@DR zj+&~Ds$0t49C5d&-kHQT&0Xd;3bzjfH{I2#bOskJ$*UvoPBV|gyvxiJLvtMS-DW-& z^T%eMYUss0!!z@ln0K4GR$xxVG2eq3OWw1kxjE`x(QIV1wH})~ShmqL?@Ju_H}BGj zTVmMnu@9s)+OrO~ftEM1e;3`K-p3Wr)%a=4F24tRv+|&4+nNY&lXc)AeM!V!>jitp z>a4wLIX2(d7!d_{hydT97m;AE3idj~{)N4pca77Tt?E*2Wk@dDmFyV51IIDfI0?3t zfKwiL-v0e24?(HG^&a?qz&{wc8n5a35qG19o$q0rg1y1R&IJ3Dhb;zsQ=WkjhR*vc zUnRQI+J_80UM)>2v9|B(0Qg0W4 zo%T(*Q0&Hj1XwMqa8L4*aC2*a5?;|jt|1w>L?rm|UHNABsCTedjZN|yU1%Uv{^N5vj~wU_%7a-V0p zh3^ht8^=r$(qJzz>@(O8w<54;hDB)GATPMASPqeyMnsNX2vpUG;2ib_!7lQGt@4QM zgvf16$8>>m*tmao{I<_#!X& zc5D2jD-+6-H6DsBY_$j5uC%`m=$qn zdcjtDM0P;r3?m|ZC-6-?BE?|OO6@v_J;Y?(wAGkL*su1V=EqNu#@fSnswHSgklMy} ze8|?FRDMs;V>{UpO~FrdSAK`3rk%^(h$K0Nc<}2zezV{=7@MZ6AT{9E!sAy2cBo;G z#a_o!pJJ(J7u_DD-b?BeEVcY<5$C9|P9q%7;b*kx*@7 zI&zHkV%^~JTL{0Cu&GeO$H2FZaNjPoxWoXDHtkzf5( z_Zf6jnva%^@!&UlxtEfAoaGjz0l7PRwJ8RBvSBA;uV<;*PbSr@VV8vuTx*fKyQP+2 zBjR)pYjAS#SgSV9NrJ__V9Pur%OP^C5mA7~1nBA!DFWLAyE^H7Vtrx5cw&lSTsIM; 
zuXEp{F;)cyY&^m;aDQoabB#OJeq5u*&CbsO8CRU zAIpllzpn}a_c3q+hXzJuq7fN;R(OTc86w9U5&0cKpk77<-++iCRzEM+a*xP5i1alg zi8Aa~^>@jS^9mLNJix$LlAxgxIo*iV`MPCzsRNPzMnrx`5~q(5!8ah1zA)kjd9hY_ zL>_?1z}Ch@0gfU-KaWT;;8xhxa~BivOY6C*>bY1s9>TW73Lx*;;3iC}_{4H0D;#5p z%Rzj@U6`5CCe=^pnY*bMqyBFH5Ewg?tD)kF5!c4dx`F<~{yZHJZLo`S%a1w9>1MkCL%ntQJcBl*4&(}wfK2Dl{TE2Sikk!@XeNdqieIq^S`R{#o#+dPEAq9${F0BJHDqNNUSP>`VGriMnQ% zTej!HGG>D!OeS|vBt4Rc?jHUJ%dH`^j@->Hx9~53pJcf;WMT|~qYPUY`|;k8IbK8N zNQ@dXGG0Z`oe4NIWQrJkl|1m}UblCq+w(l^#bD1eta?2T_D~Odg@=uUtsGzzVdB|< zr+VN5z*PM@JT_rsH=jX)^+)vMeC;#hanMs+@LnE#V1Bw zz87nQRZkZ=V-h?lt%gx!NCBo0;9LUmHs}IPzrzfC1aykYg)s2Y! z#uG=MGZ7JdgXOdcA~n5O>xq>pfl*94L!^cgQGinjaK1+*4Y;;}pTpkKh)gvibFMll z*p;kHeaKwmQkG-uC zIopV=c=Vfq$Yh8(BO<>E#9`MN5WzPfQaqe%x2PBF9*;;*h?FoQ3UDd`aQ~-kAX3V( zZ)5LdMA*wEiSKHss|G|iLZqY-k>9z*;YvRsGL186*f%WGWxQYyctj>cq_h!HfawI7 z=MhPRJ=n0Tu(vZJ=NOR^u6ZyK{WR`?@s+6Am6>@V$76&7f#Pr-h%(9M9iFrG7v zXWMz1X`3%*d|~X|6Sz_=X)vs^7d=YdxC#*ev~uO{T?^!)**Eb)1nj$(82r-!}GOI(ur?6bte4*-9oSD#|Ae;c+3 z_M@x-=Z6JIxb=PVg0Z^~B7YeXISwMwO-2OgpaMyP?ec=%?hz>mkzykv{9y1aJR(J4 zKQ`<@>@5Q#siWs{iY&)){m635b^_R2EcZ;y9S6VL!{6cMu0rllEVuAO!LRgk7l8fD zu%ofJvLaj%79sV=JU%8}3lg6i2{{cX$Z8{jV>ko|^o1AbPLD(_NPKQ2gg+7dtsaRO z*e?xx8up`&#D$rjxb2(PK~FS?$Q~mi$B_iO&4}O}5J{iF6Mrw*T^^B!5ZP-)gdYX| z4v$DN*l)0FuG~hv--7)#YxzXJ$`-9#xoB|2eQmMN0e=k7MCa=v4!mAPuQ?+9NyOzd zEWt8+k?~3r`xOtn&g0b_Uaw=*91(sDcs{DMe~J`=ea*15vA4DY&B_$$>A$`V7M1=G zdBcdv@nizsYeaAk3KS>U4lmf<9+B1%dCQ0hKMwo`k4OR7H?gZgcN6cAOo66`1xokl z0l(Sr$Np!#RPP1RCMEmwHKx?G1>Bc;&#aGP{X0PG+0KwlF}-W1O_(A)E2+in#MG_@ zTde6o+@*!dbbFFjx}WU0@hMytrdU=#<{5e5Hps%^=!CTc^5p}*^PaG zO_i7J3NCma_ZpmR%fouZzKwnG*wmAks>iw7Nux)VC;f{Q>Irb#zh8{#YR9$VLA{54 z&I;WTQ**9%wpd=_Ujo0$@&;R1uumEG6YTxGt!sa6UAJM>)>X#E>>xJ-&TL%^TJz}M z17GQFKYP;Pr#)-}>{i286KSx|dDw*>_IR+*23WSPN}L4zln0Ihe!;-sVE>pF>#AdR zI8NP`om+y7j&*GOpT`fEJo&vs>z^Txs)uiI(Gfp}NB3T=AFX=2`q|i;o&R=>>NW*< zl>pBXfFT=P{uBXz*}%VH|3pMm^-kwHwOrMxd&zRk^g7TNEVp(@=T;0_2wJyh!WHLlH7!f(XNuZaF2+qOPPeIFw 
zd(aE^vk}n{s&!b@JzzwHe+&GpMnpp>4fbKfR>nRs7(z7hFpPtF68I2C4Ivq?rYBzq zoEbuKZZ|&affsqhpg+Ul5f3{D>>GyF5Gp=0;vV;~S9#dM3BZqe;JJX`^1!;!`GkSr zg;;-DtgD|nHiWKTd_uU7<2mX^{IulCZ!N8Vhd7J~`GzNDh-~s=Jw>dJnTw7eS?Zp| zs3D{Pw-Mky0%!oMA={GWqBO?4Y;J@^U#K0~w?B&>dheh~)5k;6x?yD`gY)ip@Ww{q*iqM=V`5u0g zmwOwzud&?1UkCmhFLx2x>kNAx_T#Jwmu8Ca@s{U+`Bd{~4Mi}f8Ta^GONWm^XJC(C_VrUSs=${5y5MfE% zziI)#6bIGmRRWetEm<0Ek|=5#aSJSF8}fEDeped5zYpFP6lx#*F2_%Wl3#-GB?(W% zgAcUV<~2@=%e+{Rc>ET^?@DYcl<-OLrAzqF3IJbW;O;oYjmSbHQu?>=10v-ZR*Q^? z{N@tqAR~frIIoGd(2Mn`M`S5P78{Z7-?NEQiRstud-3GA()3o2@O)G1pQk)UfG;rcsW=>KM0hNbB)&Vi?A<}nb%w}HBO<>m2~^dH z=n+P3Bu=c0yjUAOBI_V>p%FP%J-3hm`Ch>a0AFn2X(Z@rM6Na>m%I~><^B+vWklq6 z6>$zTBKQVG9I+B!tS3Ao4?tvgrsou3F#&3NL}Gx)16I$yPeA>R2lX6(K520@R-c$V)4d)~^c?Q3VQ zfX*-?3h*ugnivsnUtKw#GU%4?GX>5e5#nuVs0_?_sa@up_|^_OSE8w(zj}l+e&N2L1?UpIZ5@vGVo0eM4{s zw4KfW3HYge^4r-vH6<0T)_1sv{l%))jbb!svp)=*x=sG?Q>|83tvL21q(>NbFZR#a ztr9QMyjs+qXnCan5Zp1~w8Om?(bZ1!;D{ULVSn}Vjv?eNVz4I}_BZUG zTYHy=?XBsWUm5ADtDS`q8Er)5_)+iF>>D6B)&u`$i8VYHl6Z_ImjA~j?qG>EJc`Qj zt#`wg%HyfLH#|l%JWj@@;UU{K^in6V;qc(OP-(vO9%2)L!xFEH$I!)b4}1yWE(X@{ zC@RIb-o3=vc-R84Jv{6Ju-!at8f-7a@?9(U#n{x5Gt1SeyqDtiM6ceJ{^a9QPu>`< zQ{gzzcg>j8o*VTQIdb$yuaP6)YfntoUly%jp^y32U^_AO#MIX_%T=p8UM<@Duarao!_58RwAC>HjEyH8{aqK#+=+|RU^K&8tJnXG3ELt2=$PLKO zGCWwvp5eBX_V)p=(I%f85k5Is!41L*Wt{75*CEy6rf9>KukbZ@w54d4w0}X1bD3*} zj_!BTwJZ#O@I(FBGPUTYXny%)FjIseQ2W$+qf%Ebk5)VE zb4uSHeP!&5am=5pO!NJ7R-OT()0_r;c0^J-TxD=i{zRm@}p=M&}W!b5=x;s`PT) zbvAc>`T4KmsSPWl4Qk=t4YgVkMbZ>SovF_1n)(ipHuW%9A&JcWhlHl>CAo`CloOFr z=yk@6;aAnj)J^rdnVWgZ7kTiFn)b)E!|g!(E2f@U9C05TCM+<_-$!@mDF`NN1IooUt3^Nn{}P^`|wm(21H^!4Q+0DRPHV0Q5^{E zjzXVlXkQvCofSz5RWOq=F7s5V%G4Pd&+#dU%6==dHfws6q@g8(G$ma3XX7rPIIzxp z9Lz#WErUbYG9D7iEBjg2`DIIkf;@}vFCZ|Yi1z6N@l71#HlU+$nU?a)KW7_#d9L|#uv z4&wbCG?N#1Nb0@?d{4PH(25*tAZ(_1>fG8$E%cdz(2v$ir-5)XVKfjX$_J@y_;=7f z0bci=>LPW}tLoBEqw3OiK~Ce?t9t6BRnbGLlynga903B`yv?{=HH2UKmP%p@bM1io}*o_ua*Rg@hYL!=>oy^sQY_YaZOkZZVYUDkN<9R$~=ki9l;jgj)` 
zdC8Oh^`YT8;x0y0<_ED#W4xXHP&xb<8gcpUz)#q@$f=~7%jq-UDqKd?k!R~Rd+oXcR!e2C)RL zG`~XxkMHr&VA)NUO|Vsj!;zoRs|%%{K^ICl7@5g&*oBFMU3a?hM;tZgU&L@t&PAn6 zu+XdrycU5;TJ!i9aO!&DmeBP|&>gvmW54T*==z_HiB`oV!J{@moH8Ms!*^_bKJ|8_0!R^qzLT<73g6IXVScSP&pGHwV{ZTuU(~7U6mX zs}e8nV}naq2Y5$xYRx&kPEXBQ!3U8K3{}yMC)7v4G(MN$`XJR;5n3X<(dSbq>BrOa z(oqyQm*d<$pXVa<{3qa8Ec0~koNdEZc^R4s8=HsZ7_a4W=*qj#2&OTtV2Obt31Yg- zIQEB8jA8VtH6(@YAsV9Lbu{vYH6&Gn0bNK-q-0HT^Ki`Fl1gs#Q*rmXCD*AjNpj(a zBbR3-FJ{PnX~|b$dzj?M;MW@2W67mIH9>MI3y}FJawo6s=8*g=OMVu9b4V`SDC7!` z{g$UmzSolT;Yrt?wh#h2?quy**r zYJsHtfI!k+X>q3T>aCzboZ>Bfx?peg+KSj#lj{-Uv{(OgR&lQrOan`XiBv?oGmuL- z&Rvm1U3V(-8aNHCbMc#h;J~tRRUyY1su<{QCH`8M+Os}dHt_~=G^P}1E|pdOG03$X z`;{%AvODlqBksWVD2>pZ>V&*yZIFHvZIDidSio`aHdJw&x5eF?d!wfW$+dpYBe{U% zk)<5_$zvpc$B1fIdJVpv@tc5LXTEBb26@3#e4YVc?L4)YDxDg#Hd-}N(!FN^q`Z~@ zU2vX=EXxWIBfz`(s--*&cO7Y|HVNI3_t9$%N&hxoFWo}q9Ae}eLPd1_PQ_xf#dtXF zIuR@m;6n=}T`_^ATZAZ(N|!x`3JCOpk*W%&GjY1(*8|yXv7~>OSkhgETuiJCDcaA+ zUJ#WiMzC}kEXJ)I*M(rmqTgzPr2B|K(k(_5C>N>p=7`$@M%$#_U_NB<^u&Lv1Ptft z%uYIy*-8heoB6JUdkUaVOWI!z<+Q|qPJ_@tEfw4oaoh02B__Wq>Tv#Pfqw>lDf(C+ z8*9O$kJ&wqt}?#T)f*B17@-#Z^JJYFtom6ekg9IiGjaEv#kv*TQN#k`p0!xgH6+&Y z=oIThUT5VxfmCx5LcIV?Bj+AmA1e$g9LMqV=rzx!KXVYbZqO;zDvoo{k$g9vIr1Vn z&GQ6)mmN6IGbfO$IVO5FZ=z>5L8aYxf@!iV*nFmdNSuk>%CQE>dPD=Hm?`kGHRM5T zkJAvfzYp?~HRM5^K+d8iQqD%Sg5_>Wb+>tI+}&r%wX$78a^Z9dur@1s5y|g0U+w6Z z;@cO$bCKK3_g3DwB7g7^m-~(B_ZYjvR6DEw$OiPh<``DE(VZM(Hjy4w>3lCsDG3$!o2lon`Ds(Eog$9@BnG+=F*)cu657fDCqHyXLa z5=*~~ywW9B^1sC>aAv)mJV26VmPEF_ z*yKMCQ4&=}_kYwK8=OmCKcAc1HN(}th)S;rbGe#P2H^mbO~mnZWVr>!;65d&bh>=F z=|JYeuX2fa0l{twgT?BC8Az;2=x??-(tS-F>2&#^I0uqr5Gar%&>TRl;CVX9?i-S8 zKpvW7Kvr={a2k-pT`@4cw78m(FL101XsiTPCda&xtos$)AjqDnK`2IZjj;?xFqZCq z@57|TO8@`C606tSe#sAEa3=rDIm2#7Z+F30y$oX zZ0Fc-fn!s<0N;byQ))9hpjWsNEyC|${O>^SMAG;2gBbYX5|iK52`Kre6YFK@nd*7c zKYTL%FXfdeEAsjqfrb=@dzNV&Ux8j{F6md%8tEQF?&LUERni;zL5wg~EJA`4h;WVoLei^E+=5v$xRY$~g4 z4`7>x{#xW=oYdFvS@Qc?<=&dlZLm(^wz7z-B*KI-ek;I@B;m#Q&$e9BZB;JKeqkV?HN)a^m0d@a|v_TR^e`u&SUgB}(M1V0BlBk18miMO1J$ 
zF%zr#--i}>J@2(r6>*+F1!t8j3Yk%b+AWY3In_u?{}3dly8(HO7+DnwS3Ss02xG49NI;G!+EJ*-(9b&q}xs$>F%~TxwdoC5q#BI;0?QU_KB`{9MsMS(|b&mJ|`RSj) z!5}IiAWs{E^6^<6H@-03=v)uxSoEr%^iM)ky7kB_#K=`WNB8vzW5p7D$KXhppb(Net&@hTKF#Ls65{RZR{V(F5<9Pedng(Ufw*zp$Y zSiXy;KMQs&h=Z_c=~S>!ani-BAFPOAeeqWX8sZ|HSGbFnL zX)pu8XeJz_{x2*He>?hS6#X%%FZml5E=@4$?nmAwP_Di#pbG|t!D4*u)lDGQE%*<# zIMTgB9O*dw266nZOh~D+#0b<1kXq6l&`e5PgkCLql;ocy+1*l1OSnZFU>bp`Pa-YD z4$6MWN1d@(<9R%`AmZ?|~f!czpOY(ML zN-$P*9(WY_lH}O~&(Vku=6@XiW%0j_`~g7EZLBms0W|3Uayz+Y>={J+hSpa}oY z_-jlx#=jv|U5D>RCtNw(bYV z1zE?A;|Abkv|RY7PY9Ry&BWIyTr@2;&MIsEX9BIp7^5ZittCfcn@#Jr1gj<6V5h6> z?AE7gNh2dW49q=QWMh!=7p)}YDsvp%MJYnj(AX-#v#`~2yamay)#Z;q72=b1kZl64 z8m&A9+hlMXz-;nxdJd}RSeb;Yv8DO*6ph11CgM#oHl@Gr$M7WblKKE$c7Mfb<>4VH z-V%F~%4jMI^D>wSy6nmnfN2V*HeGxo_6=E;Nk7i{Khq+ug2hzAKaO*J1OZ~SPywE& zg%9HQG?J-IZS=aAl)EkPO+n=(U;2kPWV)k|aLL*tWtrm(q&VR^)ZqmPVId z+8CH4!Km*iV4p%Vt$f?OWZDzxtmyZBoW8GQjXX{3fAOGy9Mq~bgDL|Ex+Lp3$uSQ+ zg2V)HXOmbf+%sNcjVYBq6K6FxN8&>*@#)xh2YIPK?}PFe5jay8C5~qu7jRtJ5-Y$t zB-YCJJd!D|_If&d`iYYd^RqzUVuT!x?Imzp%w7~ucM-H0a4K|K?J@*bV=RJk6=OUG z+ml)2p@=TX_y?8CI*xOkZ;6F}g~VD%UQ%Mqn^1r20yGn7HO3@~s{yRZv^g2u)~v(@ z1kSXOK2H)Weh|1cxWg==aIcXtn4VrjZPD5wmanW^jj@;#RZmTOIojahn&2bkoyl(q zNAs_}x@O5l4OI2G1OnyG?-wpFy7+33j6&Ce*V_2-5<}dy3H*;U@cuiB25*e=hCuto5XxuBnblRw(+mTFBb-|_+b|w(RGpWC5 zq8cTU0-75kMkz?(<_b>^$vSkUiK{b48;I#_@g7qo_@n{O!K7TtYb4II#)slAY5*%#+>FNbu)tJhARihj} zybzZ9pX-&QA_i3>iFHyTCu;x|(5V{IeV0`YzxAp{l160uS=A`w_&e~bhVV0~hGM>o z1l7NcU4#HPrf_v8%PRUr1|( zc^%30z4p!n5o>HBIK4Q4zX8m&c8<07S9D_0og$2lJk@Nss=~>KBJOu>QB_WzIy%rAoi;v4v>axa zb3EnzlZ5gggS{jETJ66?TIcYO;J?rOwP1I`|7Co?Ms!IN#Mk^^iA3Tqx$6Nw(P=GO zibI*s;k0=JyRK<+v?!=v(Ss59m*JMD7~Kt*19y zk4;)}a0m5xnjZfIzXsOh_jG5SME@2t1l%Y@y{*nX38{>p7_I_ROmBbo6(87c09eI+ z7_bWa4laZ= zq|Us2Vu<(ZHw``Y*>_}~X zH(ImNTR_D=I5KnSOcLuj5P?3=e?%LDagP23A-4F zWv=P+ub(mMhU-STcZ~VNV9qM!zCAu~qtl7>9&{5ky=C~Y#5GS0xTH-P2Ce1tuW2)C z>F`lGY_(nf!Z$`<_0vfA9>LYGN;r|SDB)+w6pkCD#=px4QER$)aZ(}DD0@=~)$M2G z4375~f@8lA-Up*TyaAt{RAe{y0lfeJRU~XtEw>ZA`tXsafTk33tCIj98rw(Eoeo>+ 
z_8?O^hV4#z;sfKl71eQYRbzfZrg5B`YfZP=xN2G+2Ukr=?RT|E6(axDdm-1_E^S;N z<=1cB)Ix4D5@4%wMd>ENRl2W`vpKfP)^=Nr?b9fahpig(D>9wq+-z⪻Ke8K7C>P zCH_Ns|Nmnf4hz1UciD9#uXu2z+orn^D)D20&o1PHCj@#1od(+mbZ2A+TdLj%(L;x% zF;_>zICWkA+%_ZU4O7SgnA;(sPI_^1~G`yZe z*@q&iO1~lJbDX;fwcWE|G`zH?44?>FPloYc%eF3r|96+wbuWNd+aK5PnpViYMFKo; zY#&E=CTyi+J#!avY;5bg7me+9lzm~V=KPLaED#c`3LL9c;Q*=Qwj0~qu@8o=b_yeS z|9@i}b1xg)jbeLFAvYWe@RG6Jh;A}$rTYn)#j&xCx%-Um%P9N7R?YbXNphTMtGKJGRcvDX0ghnSe}L*{Xun^?>}XawHDXBh;v2A#wE|00msio-vN>$^E5e!zmf z4{!xRe!*u3@8ABfRvZwcfm=z8hb_h~a7D!U6`z^^j*-}ZUfy~@kcMtEK_0Roy8*5y zh=#l_{a*b)5ozRh5#v#d@ddcG#Lz&Tvp)v)|5u+JK%}vY+!J??Sdgy(-a!zJkNLb` z`F|qP#ML9l<6#W<9k_MGP*?s*es}5rIZ#vAoj{KfNDI(U;C>wxPN%I#I-E{TgDG~P zu1OAEV^{SFFW4WMV5xc^^GTfp@eOm}8@P?YwNDb4zcTOxxYW;qtLHX>Yr=vi+^iEQ z1em{&gIN&r&@I*lv^F!QBzJ3crfHm*2UCM8l^9ax3~g z!*^;O>Z0iRZgB7o)c&Nr0^Cg&Zw2oi@z#X#x&{HK6EFY)ybVJo3%H;;fzXvgL}(#c zozJi0Rc&}W)qQugO3yKG3||XM1=HUwF2Q0fL}_FN-dCf)2FWhiGpuAUUdopBvC4rcGkjQ?VZ(>tK}3z&#zZ`ONSF^q*nPn!`0#*(uPeb9I>k%6=TebR z*?Zqk7>VKF65T z`rhcB9(J&Y)ds@DCie3cx4S*i2_9(WfuP_{iV~FsvknZE&5J>@6qQv)mqO%vB)g(7 zK=$P6obXG%(f(QN(PSc$%s+9)@R8RIA6;hjgTwD3m_|_(g3TxlTYC_h)}v?K2UU8r z@BWqt_lGhqUV;zMJYl~a{!b+fO7SCm*qTvE+4f>n%!85J60lv1=p?OjZiiw1I4*0R z%Igp^gkbL(!;ZghSl=>z9~^cqfi#ZvH=YlowF<%y=eRq--++D@lHKYTiTV6At`GiP z5_VCN$=!4Kuxp5wXauV^1e-#+6^Nhz35N2WM(8ELE2nbJwCrWCe_1sEYhLQ_MR#Ie z3dipqxx2uXg@op1{Pu`jik~Vfzw-)1pYr&n&@(R+L4dAIOerfM{=`&}brw()v-q}% zyWR_UJ^_WPh}?)?^HPZxF)yXdLqzCiu)6Ln;#F0CnMQ5DoC~Z;O)l;bj9Qf>|B@j$ z5QN=^i{LeRL>QnFLDr|bd=afVWL&>zYH73cxggb!=`Y$v9BwudsFqk%pCik5R=GUSy^#eOPJe(FH*H2 zd$6%k)?I7kZn;JN1xOVl3sMc)us?F0?knT&7TnYWXx;0DVe%@t-TYN*;2wUrV-EHz z5R`oY3yEy{NrwASoJdP*p?`q)N&Ko>LmCeX{i@?DzjFAfu&a^cTltVQzSJ@HF>y|# zHC6FbYwpCp4jDvTJ2#{juZWg9plx$#+gt*uYM&FJGHp|Un#e;039FVEzbfu7!7bCa z*9ybt=i@dXeWpzVX_Itn)BV8Jrm1jHn=ZvqZBhvv(I)wwfXMHFHm%@e)0XRF8alI( zZ%C-|has0^s74SOdb2o4*9763CHMu0EhKx;3%G{k*h+qa5S)s%gXD!2sRkGoX+8FP z|J~)|*T&tIKvWhDv^>fpNL}PHU{;pytK#koql(tOS{PQY7H*5sXUZ~&vPh@0JdC5t zG7*23Wg&hli>lv{vdFIUK* 
z@vX>vOzss3s@^eV2Kr1f2a`xT6;l^-D&{EsRm|yoxN)s&s?!?K5;-m%o(*BFz&2V+dSK}cqdy-hM*lV#$q;W&9Szjp|(GV5&YUXqVykTn3b zKE)ptu9;9#Rw5;W-e~Stu8O;vmVY~WUL?QV8zL8>&nLh1m(vF6dLX*vk-IlqxO()) zh00HQ{TcDGq;CX9gX1yI03*KPM$dsHj&u=7TntECn+Hg?mdf&y)q~A*=#D%<7dGlB zw`*nG%_5$9Tk&qAw?+O~gc~479?;#chz|l8<7Z)90OO{{_(@2l_YdcAAr3~*(W_|EUrEuV>xDe^pG9lw=G+o@1xBGK6uLoypD_-Kwv8N5Q@D@! zM5`o{*Uh8rPMPmbs8%jQYYRcwAZu}H_Aed7rKl|t9wjRm7MbrF2VZY|ol%0yoqSF6 z&RU!GHOEuHt6Go2`gW=%*1eHuz(dI924}(aE&RkI&T9JGIKNyNR_kz_C!kl;rC&tT zr8^#Zo*=o$%h9eoO`mAV)8OtPxo~}uXAem3_@k>NKh2WAkMo5j*Q7cEnS@@GMfy0C zMY?uKX0osibejq2PnI;3<#dZzipuRIo>qhw$eHMY_zc5W1)C3_rzn`}+X@+hUIml>0tzPGaOBPZELc0Yc}3iv2u|%A1};VWgd2do z^H2N2`#Sp5DWfd;6*xalay9T6WTYjR{vwhGI`ljLGkJSAhvX+&^3mXKAh~dZk$3+| zPH_|L?3g2?E&0_rKSOd=sSPsLl1qOv$)y{E=#E$Jq0+&1C;1p~I$allTb6UEv~}Y| zko<8u-3UVkKN0!(0ETow1W(5KAbO;@G!wTMA9TR)Mf?t?yIAgmEvWwPQg{4N$K*F- z0tWu!%8%fM!)H3fpdZ7a14<2_Qp_{=Rxo$!m56J|baZnV3dfp%690ktcancJ@_hjBZ*U95z?%vXPjei**pg1_8#pA-J9EVP$RDbRLqQYsy#g*d}vfN@`r?{m~b-P9TBZ`7rq19j<6K|cF8 z^@J-C^m{kA67GX6`DB1MkzByBh;C^5$zvqvIxFaiRro$na`k&xq?IL?ejdrCI~Do- z-^t0%b4uFcx|6)6B|it?a*_*JfP9^myl4dj*~o ze6>-Goset8+mhqg@w(ngIf?qm|lSo`L+C)ynj8K7fv|T3H5SAJ9te>5fCX z8DZ($Ls+^b@=|sq$saI|!?>$~4gnN5&^!a>YNZ2H&&X;|csFFVADoQ*l|?oQ*_e?% zi1;5uR(rriU;fv?63ft$#L~@2Udd)_WQ%U(bL5682d0N%E-_3lvISro8G+r9-2nl1 z8wuARP(uvF&XeJXq2Xpwd$vc|vo*0qvi*on{+A)|8M#snX^rYPsc^fS*j`U92=;>d zDW1j__baebNS44)aW@k8@E~r2FFs)`iBW+sx4>#a84LUtf&XA5zvz<BaE!{KSNf!E07QWO*ZKEWP)JL4D>ai(hw2oN&{sP3C9YU7GQL` zd4=--!s$lWt<#V#B>Jz@O;6h2+=zcht~Vi`(57Sva+Hyf;a8YTw-C{jF-=Ra%mE1p zi6ag34VZ%byx8SQi={ZPVmTwH-D)?%=e2(v1lwBLG5lFtSjI|uf@H=If@+#z$ z|0sKs5>_+Jw_wUrc3~D9CRf?@_Yn^>vf9TML3Skm=OKwoS!^9_tK+Qgp7t`MdWHL# zJrMG@d&Lavn)soP$?v=gG>?D8bT!bECr|pzY~NHHohtAJhPSD}XdF+{{I7*U6_8;c z6+r2(M!xxv3dFDHPqiB6J22&_fG|r8ld}R~>VVNW(Pp~x{&8|Z|98kSP`c*Ob#oDd zX`Cq7O3FEg^p246SDQhCjVK;Z?sm#n_A_#pPBq}5;wKSUlyLo7@Uc3EpR2k z{k*_C2>gQumaV@9zSsh5fHt$h?-01o5}p(+?hu@M3kdu@ST&>q*e_^^_KA~_pDd>I 
zy|w?7Qv1K(|IIe&ni#=;G*D$g10f=e_J6`;5eYEG4CG&oKt(WnAaDl$XCgmaEa{IQ zM*mBhME?E{0tE#7%|QC_^I!-Fqpg`Rxd_C-{AvX9z`izRz~L(f?k6h zZGD>=oN-Pff8(n@{R!V5J>j9`n?ioS0zrQlu?~D;puYe$Hjws&0%f%>z!ZS_ z+z7l4sf!^n9se1~m*}-ul>YAyp^hswSAkf3DD+=y9YYQfQ z4T4FhT~qDMbkpjV8*ZzTEz0vD3xBII=oCVee}NvBOq z>^}$;umO3^FcAneH_Q^PVgltP0O$=sR5Y&t@)gj z-0M;9@e{KL!hia-c=D-!EbomD4VPyc_Ljv>v;ln6@P7jLwYnW-!bgrV0%MKOFL@!2iWF-JW?7P+<(FPwt+eTx|xKk z3MAb=K>AdMmoS_mVBRxK3oyr%D2YGgEMRuzzZ3CuYy7w_7){aN>9*a4+E>v3a{;}N zL331wI`oZntP`#QfpU%LV#=}8FeM;xq+wY015B>&jDz{mNOu8KL__DPrJUegghWT> zYAK)(4AcrxUs|ej1-oG$iz48$FicKYb}7`PV$`FV71`d-1jW*<*`a z8|;--BNI1XD#719w!ktj#Hf>q0v|-QcE}0;1P&^tIrf}ks)1<>8Lj85kc08nhh;o6 z;edt2vxcby<|^`NIbWszFAJa}lAS0G=mi7i18PqqEwihUa`<=7k*EO7^MwX$Ny37!Brq4BF2a z%%~jgboZjyV3xkV_J309;@}tpXN`ulzZm4v|2jAmhUZ6J&{n)eUpIG7D!G?*KM z$v-h1bjwi8XN7$bTU|&rPGGE#5r20(ax4K)M#|xcDHuF4vOQqI+M=xM1-p!34_h$V zV%V5{L9AZHDaeWyC)Pve-wgjm4FFo7??8IzNM20c9yLHm0QK?L(k%buawJe`9x?wT z@vnuymgPH<^^6HEy~5PR zzb}$Ko04FjFk)T7G=$hqTK~nYAAsyqB_X!a07nC;13=5}I;21T|LP|EH{q|1rM6n9 z_Tu(CzMb&(_mu^#TunSn4BBGG0kGgZVw(FJ;r?+E_BDI>LShV(3h*XZC|Y*cBf968M}5u$F$Kv0 zR}&{q;@d3oao}FgN}SG0T#$8~+N24yd3b{sx|kJBgjSPK>416ru{{c$8S7G{!o7Q%l)r8E|F@>mb98Z8_S zZZxz!ihjYMI#R0T%$e6yd-9B`yZI)EJ)xutFB$SXQOtSq*RyP8nh0=aX2A z*P~%x>d!pCf{a%pnG&myVkExa5)T3Ran_)UWmP6kX*0xwF68(|Yq9Vjkys1b6JCQC z>i*kRpfW%uPLucsfM9}!smq{nQF#k2kJkTg0Pe;46=X&Zz??;Vl-U59|8e~9!}oRMf*k(!@L!I94A-Ife?rSQ z;G0G=vuQE7s}W7*{e4|T%WnawHT^Vz{oDN0C6D5#X_M(99cwgaj@xpaOjNY)<3D0ItspY(j@;-lH4k7me6ADflWJbWsWYASqY7jp57 z&s^gu-J#xi^f)Gh-IGYgJ(3yHmk>Z}h5{_0ItuzElBv#(B)$pBH109tIEpjh$V>y5 zfXqW+F11+FT}CYFHfdjx3AUU-E3$$Wv%OwmtvCbV!0bwsER#H_talu=lH<$4*Ru-G zp?X>ZipZntP1Sit<+}_gj$??sV1k+v?kTQBuYHE}v&MzGE6}Ob=Rzo1$kR3gE<&eD zNp}UUlWsGTY1~S3tU@wmRjD+MODqJaaWom=psdzuYG(jJ7kS4)7je88d_8MjlGbUx zSVJPUj_rHUxU2BtFlgL`UgM^HX@8k+2Fd5plE(m_&9?&766qGw66v-enU>s2@U>Yj zDWWCkg3DCuQmfQC=m--OdbVT~}^JZU&%Re1oZzzMij* z=!bMKFK$=2p}{YK#AxMt7T{6>G$Ftyu-lMZa^r*F-H0#Zv(QFV>r79GronuOB(BfF 
zv>r@nTB5}Ojv|o;(^JUm|B)!oRqsWHOMz<#PF?pba$9a(*^nW?3sVE?m&{LG44yXU zsY17n3V%wa*b4E{<)eZh*m#zUsag0nuu8t%JuKqWWBNd91KH0saC8wsj&*@Fb$U@a<(^OH!NRul#R7gsA9)s7PQYJfS2qqF88+kv{I9`R zUGxy5?(x4Qxr=UY)$a>t&&$6Czh}XePcsi){@71?jVsyPP5d%?t=6g{g05&0PT`MI z&owMrbJR!V;u(ro2_VP1{84LqowTi2Fv}0L^>q0|d-fVWp_e-ij*Y10?S#9i9Ul;b z-^z!6A3?ld)Nlnf|8#5qJp50j`7h%85^^oR z|D*Zdci4}0XBxdbN;4{E!A}8RLpJ*Stlg-CcAo*LF>Sh+!~S-kVZjPgXmRcDHx`Ndsw@#9G&H{?dc{P+dBzzF=Q3s4djXcB-_(XHL~}E z83@}oeFE@l(8c?f= zRg@V}2UI5F-yeU5v1A=f##Zjw$-Wc0 zBU{M6ZwFaC;ShA1dBLrFDzd+suTaEbcV#rmR z&ZfU*yAx1v*ebhY$bLYvo2`=>WXY}sGa9nGx_*v43MNmon*Z|I8Fw)QE!)R&5{9iD ze2u*2u#H8wiy30cz67QZWR=}<;6V97aBZMD~7_ zt5RLf2%GiGVEU4^s&oQ504DYnbuq(j*6YAbAn9*(O3niGIH3P8Ys@0N7-+7uWS_*z zT*wOl9da;Fve%fAmh3(-{UEF4P9h)cycU~nSCj0rbv^|3a*nYaZ(6SM?ggFt@>i17DF=n&3g6S&4)Z3(;sXgdV7gT03R ze-JP^fxDX|0(Y=J^gi{o_+ZVOxAD;mA&F3|EIdwnm}H!EvQ7@--nj@E}>*L)ElS|J~3|G(a4^8YyLY0_}g#X9*CCwp+BhWiBk|K_8Y$;3xz>mv&v zdqX%;BYcLFqs(U>^XjQcZ?hLC*SJp15uCh=56$Hd$QS>!pFZX=enMSwqzlCf9989+ ztS+NRkW2)BG*F)yy52E#y-atHuNpCZ_4MN!5a3$B5?v3|&Eu?g)LHtO`tfB#x%qW- z;74ixg3EX>?9@r;8}0F79hdr=#NtPSCF7Gz6)IId`C44o;s~K!m+}Nm1^W?+-b~{= zigyG#iX20ZBPWoP$oGh?jlH`;m@k*tJJ_KIPHP+PnS;}!>?v~ilN zo`6dLz5uvw3m5pa901hmdOf0dVH;ytl3!&vcMKnM#O5^82z`RZCpU9-XVAyf2aqzl zr^Z=+{;WMq4j(`==JQnqkHdL56zD9{1br#%a8*55_X~OvItTVNKY!z=%6fu|jJuUh zc7yFaW^(fuA~*545l_zW;Y%+i_wj?Bdrd?`w7^Xr za+RB|gb>AFksqM>3u3X^_r(yc)*ehZ59XYUalcDPz-&-HK&?EW9v;xIzCd2$SAwC4 z31uKxiXy6Z=ka?1@z?Hq5`1t5YhZV1v%Q-J_o^t3S*i~W3vBUIEeg({EetWKM?%R1+1fgj2$htdKM}R(KS)T7egfQ|a=hBQaAU{>#!u(WHY)j!Hcks%>C1Z2$lX0+LyXyGRQ`>X+SU;$!>)>BNap|R+#a9ab z4DNXj7?n#bbPgDCL2g+wCunMc1D5a(xLRC{w7%9NOPKnaj4>Geld%SjIqopX2a)^; zMKBCGe2SX(Cl22$fY5sD%Ka6)+HC2UV;?PBbPXe z)c{O1m%qrx$sJ57^wC`UYGXsETngG;rs(jcZ-gaas9aRQI^-h5AR@zDx!lTE9c-{q z3iGlv!IX0$q%`_+z@$&@=ISm*&oc$*W*K#9qCy^jo2X4>W+Go<0I1@;LGYCnr!b(n z^rRMUw`CV6j^dQXRK-!3)uuQyEshk+qd13e3z|gBR}=atd{xXMNICS;;`F09(y2I? 
z*y4=mqXyfY zfS`(A0=XQ0w21u)B%O*_93RnJWr%3p5B63W&i+6qGAxhC&|AbH!Hf;|A;j(?n2Mc% z^@_3nezrsVX(nvgd&5HDy#-b4`2348Gc68NUeQO;JosQpDHr`y0hla%GVs z=v6G~H&QI=nj(pL7Hfz}qFBXk=$3?@Lg>n1RIGX=*non~YZ_lFbcQumnyXQ8P&3b( z8fr4x;EUm)B10ct!WF3vvx9@`NKu?rg6MdD*DIUw(wR2Q5A)dNOS12lQ4+ig^*9C* z3MJyrkP6`K@^C*lkV2tKV65!l#`t%#SD_M-Gw7AQ^beE0bS;p|d1gP%9G)IDr)}_O z!0jcta4nIFIl=jW3&*1%!GE#A4`95A;96G|khAEuUZfw%dXcUM5?wD%DW%?3impGc zmlW2^uh!om;NnvW3Z@cr4m~Cb(y#2#j-%t*Hsl0{=-(NvOfU_uVo-pINNEC0!lt@q z{tKmEFcaL{HQdhu-^}gmt>g1x*U7Q<`y+n;w0&amoy}RWQLwv+hv(Ae za~APPGYW#|aj4#u!zXAD5owLowY89+#)v)?|1Y+xK`&}>D>VpUry4XTNXxkNxlQ9s z*N}ZC;>I0mE?CSkFf+ka6(;?jt629@(|it{>r66q4r8u*&4A8N)JtWqf@Gmry`;aF zdP#RRQajIjrI=B-2F-WilwKEbPm-Q+S0Y#FtI}F~(fl&#)^Ba_*%+T9xbjOvj@sbT zuOzs1x(#K6JcExiNd!M)gI^2oDS``k6;jWsEiE4n9wzv48+;zdKNDPqs)ii1!KJ^C z;L>$N8s-^%v^jiB(3}Lvl`5ehxb1!13%ufGU$;Z%I&;D@l+n{Ll=F^AbBAFLK?A`b zu*IyAcENd_+#RqAc06>R$#3On#z^ixk6nI>C%aj!dHtgs%undmJ>%(wS#(12S_D%2 zv}3frvxFLppt>iY(-LD$5-stg#oY{U4!AlN)&W?@Sg=$R2kd7H3js?5c7=s?2G%te zY^+J+VsXmC?glm=SbYoY2CPS{bw1AQWnF#%j8qbo^sy8-;REQ@%P*o^%#3QotciSt zz9Ky-eaSYit_%8VIEZmM$^QDvhpxYoW)ahH8iy)iZRBGH5^*SfIxQmIwMZ)*#@1SFAUJI%L%{8bwn+Nn4jf`~b&5&NwS7>D2ru)SPl0P~3BH}?QP{4?RfP2}tt^iE zpSax3GHlO4U6WWB`OH$6eht*62f+pGi3EMYg9QK5249Em zF9cUZH9)?x!KHtI;L;68u2kY7e&MIHLF8NlCYoe|e+^Kxb_2j&(OJVbdtsY#4mS58 zy?+cHXGxogd_7AdBGCs4b@64QFJ48gv7SQg!&pxr`XgPftpYYiBDchuSFEiLwy$8T zL#bP&+M{amo6n}k%tiSHob1kT_({i?>5s z%a>&eS-yp>viu9%f@G-!U~}Y6OIZ5V5SDJT=0A`}mRVEz8ifU20F+@tQ!OS|mKk97 zTe3Ppz6Mzx9)}{mePjcW4ZUmG{*7@V*y_aC3VFu{mVPaPrJIIS&4X-^!0%hkzhE|4 z%*_@Pi!3*O2$}aRfkS}Kvut!GX@fkCo+>6t-w$V~oP*;b4tP)5KpN}MnCzz`^)0zU zHq&1@-+jN3hTQzDbH0n*#5W?W-z>0W^fij8<1cZ%-8$A_s&zcWI#vcltmEHsyz4q% z1)3Lj9EWlIELhF^XJG#!5uHBTB0H>4=?CIdx|v90ktM$Iuyr~$fV1Z<=rEvh7Id2h z`G`1}tSNjw0*o#*&%RdBKv9elR4Y zyB%pN<}rw8K;lJ<`4-Fsi=Y9fW#;Qd#`?m<+R7z>h$_NwmP|{h_uf(!ZVmmY)fkXzrgrkvetR56OzGzc0=ih zQbOtOKw9Kc_5c;!XffY`nP@SyEha|UO~z!tIRPf5%yis+0k%5orXpi}Y%^f{FvgfQ z(phF|C*Bajt}S*v%sv!`NAuMP?DDgx$?gvJ6aAy)<`MLmCrH0E#no-LA9plg;+eCD 
zbEvom$B$yH3P?YkY^0lmB;z4g<7M5*Hx(@A2$)G0quo`Qm=yrCO)oL2QFP|68?#YH z$LNXiF?18mRtTt?^0kv{>g+xanFIlEp9HAp9VK_cR>#N*$c;X>L6+0KmhIQr=7X$G zK)sOVHn8+51eWd|q*ESb!vtPwF~`8%q}wP6wA6xPu?+%RVcF{N_#$LEIGQQQWFOfu z7oqzs*>AAT4_TdZ`XZ}rVChFgR=SmXIig!0Y<1s*bPGBTXqpAB0w_!@vXLA3>^~Si zAJbv8QOusY-mJk!y>g%C|0OcgY4jFkrZ1xut_7`1ptmAmV+dPiX;ds4u ztg*+e;~T7FEzLWu<3I2_Vl+>MABZ@9b@zaWz$%eAu%$?Zb7II00{ozLs)5Zol}_is z9zM2^b*^?1nljB|3V?aSVs!o!#z(}#gu#R@fqY;}LtwDxe+Zy?)|m#j;7q#7NS{0i z7@WBiZkhS+|oTj)o9WAidG;6 zXA3Q+5SZ;2qvMt^u?PgfEC8eCTu0wEp*fv*E6)%h=PEeSEW$|LDWlyqr=G2Z5#2B< z9$VS#{GY`|Qk#HQ$~JIHe9kaGC|30kgnjbS4zWmz{$# zU~aVp-iFi?&KNp9jX`cfud`xE164RIN_jVug0s91i@NRS4D0ZFNE9I>ofpO;(=9pa zXF^UoofRkKkx+y~%WWt4x`WDa7SLT5q{E^>u`&t+nrR9A0H!Debcr30%(4Weza3xF z>C`kO4+24a-F|}WzX<#SXt4$9*d$Oa0uefXwk7Z*m|_snRr7iUvjn8S17FhV&@wF# z0=i>OC}ctB04=c~?Fj!x___KNqQi zzgfsCBp%zpSpk)2vi@5?RLfZbhH|7>$F%`JGa$!t58jX;V;yU32V>gv=OMLlI2+MD zy5#sgj(d&fW!0A`Iju9s?ITUBc1JWA(i76<~eCl9v6EY6HdPDj- zOpSCpirtE%SW7g_)Ld^dzk*q6F*=F~6KguNLL4$CSk{eT{Q{ZlBC-hS2Ph4>C{qI6 zU@_Ifj3!fED;6UIu%C(D?&^7-#`HKVgxdPZUm(2oFbzVR7Z!D zTaiiVRUPTWWGbD`ez)N#R&_E+bc)6N24=a72{~s!fnt>>1<+(b#7`&;>+WPq4Ch8N zVY+gf z+xuC^8r#H}E+TR~6o-qDO*xL8=F7lgUyHc}Oi##YJ710r!}e~(L&j}bFaxyyl|d;$ zn+c@-T%h3qmLUE>DS-N0Oc5}>38ZcIUSuTpOJf8wUTMtzMjvASWH+b6DrUG(aO5P&aE} zE}bdeO5{!)#p>n&jXB(6eh0I{V)WcVm{`r6MMbXzqh_uE>4pR8IX(Yhi!7(G;|1@#Wk` zAq!_$Tfk)ingG!LoQ|xH5j>222Wwv+`!?8XYnJ^QUjUsGGqCSu?US%?hP}4s)jI#L zjS;{A=mlJ1y_UjS{@>X$ zYYeC@%(V5w%sc?VTPkG|fY>$Gz6SO!u-CS`7I_f+i>3)uU9o4{8tJ4pyOVp$x(8bx znzNdJy`N@M;!Jl8b+DA-9J*8gbg=A(9{U98UrBLw*IE+I(Om@zZQ~;G5QcuzS&-;~ zy{cIP`@vLGTlG35bTfbq#9z*g32~;E1(gHz6y?;@bb)TcJ`?emGXkb37`>v>3e5Mt zv$tQTX~V1lyB$Y&P4~T7_UG&^u`vz;FqMO2JVVj8yB^VNe)`t$%LI-fQAZoWrjTJH z8@LU)pM8N-e1T2&W9skBD~hHzupFEqu(sg`Jc0KT^c7#=Ac31$Tw7%fPTT5(9^9*7 zUi0Cys8n+iCVs+I0Dto_4$^4R3`H=-_c%=8W;U?!KN48m>_eWwZxZw^U*I5tTY^(B zlmr(VN;7Du%Rr`M`~>3fg%m(7EGQAsc6vcOoiH=7e-iQcLKsYIWk`ZSFn{|J%pd_a zv{0@c&AY>8k#Q@Vp&XndL+$U6BGF#hPvG~EXr^19bv^DjW=*r2Ip 
z0oq8q0MI6~2YJH_u=YvV*R=Lp|6Q@y+TV-39m9TaYS7fdK0pGUu|G$}Gq4@|1pkXJ zr^5ihMWQ9tTnEX#HbAZ8Apm|+lg2w^IuSjn+Y_|lZ0!9VhFk)`Mti!W+^sWJl|WJ@z$muHYM>q8anrk;qZ=4LrYP*cAj?h zP;|E%amw;=jB;*(18o^{u$tn?>0_SS`~ZockZ5kZ@RLD1ma=40!7YW%CNK%smvn3K zCEepX>O~8E3P(TtoP}{##%3`BU|)ZsJ*1t&dLBDj^0+kkXoc^gdfEZb5=Rww?yR8_ zUy2cr1BeNw@<0CRm{Arz!@9u>7GQQ$^C+p zbBMQ6DI^!F08njo9l(CR?6k6@0B4FkkDV#vae463?3R$7c8lK#r0h7nJN;7$8y-5z zg+0kt)JsjuWCFpLlgSpqm)T6Ddyq_|d&-l^?>PU%mq`#JmBFEQDs_*oR26jE9dxO> zhQ+AM)T_t`81f&x1M{EBFL7G!tj5bNKpSs20NQllK)%QZIQIJVo7GqLf`eJiq3`RKGIs%@qRM76`bi-b$64K$1<+O9Ih9WUb!|TD?lPWeD$4VDzu8%)c}K zY{j98V?%y(33k)Msdcf+l=b; z;1xcc>e^Na$GUZ-GAe@Y9wZBA#jL%OsEfT4(4|WD1$_1f`;yp)R5Hy~p$_gm?E(;o z@qaN70xn^LDeEf;ri`_;6s({xSeU*zgT2bGL~j`ECbCmsf;&QqHAjU3`M;!Ycz5XM z12|9J;JMYt4JNVTGmR^M)YzQHsWz{bIQ`guSv@Yj_#|$%5f|rv8J`U>qNo0eTWy@R>8o`0 zNMB{s!dLo5eCg{C2K;K%pN{=x($|^eCFJOTXYaY6z$A0d(p!HCZMy<~au7ju_xo&q zHgA47dH)WmCOK^+@BsE_<#PZ@02BMW;y0MzELkvqRvlUVGdMkRh}Wd6dJ8tQ(09m;N~w8?xXGk4QWF^L6lZd7=s{^}D{ay8pc ze9FIRqxgP}%s;jU58`eWH4v@WkqUX%V6r)HYw#$TLDWDs`$(_J=2d*gc-xD^)T(D#x!pD7(75*(mw^7Z$xkkQR?KQ<5wrn2*G#IwZ z?jWK&M8+mN*<@L=3&7kCSzTb?ME?FSvb-kbX|J2icb07iPVR@T9K3_*Hl?xG!cFE| zOZG`HLm;c{K1QnKNp^}kYRQJQ(r3d~tNd-`JZ!a5mO}C>t=nExP2%|v8(uiu&>X=T zSNViB`1&H+UbzyR8l#`4nZ!R*{rvN8A{fw z(kDpOJhQ&ZoU~al1rsLecXd#H5758=L)Ih|VWT%EEZGe>Sq)j?KR{~cNp_m~!IFIz z%rM9*xlfT=x{;;_-KL;zE=-H=wEI*f(&#ot# z*ZMl&ea#mX;hRn7h@kn&5_kau!yzEThmcU+JPAxQKUxAW1KJM(?OBQcO~7PA#0`9l z*-PM`tq;9Z{VqOO^F}ujU+gAv@y#{J!sFyta~LP5tdlozasVeFT+-}nFQ|DtG>C*58j(YH9);5xp zx!du#ySntJZ{czgogH^*ra9<=_lgOh5yR#57`j`{rygH@WBR(qz2QJ^5;r@};%WZi zom`KSzj3|=A0O5MX_iUczrEf&AJ%)X(0ga=eL{d*92%5s2G7A%u+Nd`%{7`+Jl!9> zCDICMjp%mXS0Zf@=Of$-o=B6&g}MF1$qfxY!)a|EAH``Rdx{)>f#~+x1F;>9toE(c z0+8)7o0HhbU*+6fCI+hRC)4VGjSp4#JY*{l=C~jS-y&B7P65ovKrbpUz2LdI#w@cJC-_JxMj4VR zD{$xYK`Feg8(~cSud#7`Tr$EtQ%q&^(L*9A-CM+W&S&vfSs2#6%hF~UDb9^3u!*SY zUcqS{5|z^q1Qg-#5dB4qPKX?W%?={WJ`bj&2h-V$(Q(1c>@^Rlvj^103v!s}5Pua6 zB~2&@xyxu+RWcPDo-bZh$xBGE%RJs+p0VFaFy!3r;y~&8A2z9fu+YK2_20mmnyEO> 
zRx@(A)edejNh|2!b4SfPbiI8|>b+X%)xP!KbTct!a1P7O4GAfGG55i==@{ycI z+jI7cxA9d2QkC$vmP*RcaYW79%MvQaR|vw7&c%7-3b~fve!yCKRRFhV=?QoO=?xH- zv-HA)`1=vqsampIMYUve5_t!`mY($8SbEZZk9e0J?*f}XIxfhemRIr6-4vvh<|;5fLFiyJGtClTU0pWiC!dzN5@Lip_SD($ZUAy3W)1&{Nx8_;_WI zi!bmKPWq+qzMXGcmH_u5OLqAA<<4;sQ zvv~TN**|EW#auOhtSHBIUX_#i&&Uq+iA0tD`ChKBaDJYQ5Oy@<>bR14+|18>K@;Iv zZ1xiVd2r=nwhG)qyFoG;GDhNaom`n8GZOe5tbma zO#p(*{|xd1`ndGjlZscl{zZOeTM5c!Fflc(+ZGJO-9WiTult1W!tPRA&MU}CcE2F9 zD~1i5rrC%D2k>Vduw^;}Z&AkADdSn}`lbJKhdYz*^`43+F@rB*s#$su(+ zd}YD%y&n?0;QK3}e(8H>!&l|c#4MjPO>;~#)1G1DZzs-WR8{7`Asf+a+NIw~iKHt| zT~i1dd)h;06!~lbr)ggc?vvcp9{o=A9P=c=NOZHhx}Lbgoglnu^`n3N?69g$J@IJ+f@X{5ND3wij+*qpkZtIr z{WyqXNT+@ri97Y9-Yro-ZpBWy{0!f}J-HMkm&X}XE*d-U=F-_*O(f+q8Ps@ws%WLi z<*}KZt6RDs#0`+kV}O;*26EX=F6RlVTo#bPU2ZP9n<8wIn8YW6K`8hYu#PmcN|29` zQtG>mfPoZ#f@T~lFt^j>V$^e+@OL(SzBOuYUG-{{`1CJuc0+NaM(D1*R1V@dTtAws9Ag-X*w> z5dII}m+MJ;4Cc5B_O zCkk;>xYHz&!UACG>{Mu#rPhj61X+Y$wU+)7YAszuWHv0KwU)olU|oL;%;9T;<}Pq5 zNi%S-lcsQWklS^GYSly_EyWCKa<>gW5#vh1EbdSL!dY)m6GGGSAvZ z_nH*uaVfSXNtR(#O>Zn~qbZ3@J(;s^2f$Q1EnEHJ2`yVT6=&Jrgk51iAE4cHQ72Nv zZZbAHF|k`ZpOXu;;SU>N$523aX~fY#hRrneYNQTy#U&5Z7NxN2m)>rEe5E?M+u<$~ zM?2hlGXF~1_r!h+=5pFEJq`25xy=`u!|aQ^iw~xyK}7J`;1z)lJN6IZCKXl}2WvPo$%z-$pY^*8*8cp0QWLQj)aLHVVq?|@)2e^hHkN$jWaDP#~Dp05{;0hkf3d{S5=S(y~8Floxq7CqB$&! zglr_9F2tEp1o}wf;4Lizh!W2vB zIhu3z7?!#&Qss~_4Crj2ftH*Nq-&1cisKk(gY27ujJ6K%fKYidRNIwD#-i6$N&h4X zN_Q2q$d_PfGbPLCY~;<#ukh1*r1&0JxtF1#F2d_hlI7Vm96epNnk;f~$EeBI9gu z>7ORJbjiqKUvOf#u)*&&$=!qIdVp$_z5w6L-6&ea5k4O4_C$K-y}Y!Uv!Q9|9y7sm z7yNCwD{dQPHF3SIkplM{Ecd&xtq6CmpvuTZ%U$|s;4WQP%w<1<&G4(+67}BX!d4-8c8sXt&5|rnmJcn=kRFlW=Dw!}@b}cD=Nj!ilUO z#wGdv7mQnxO#s6rq%V5Z3DV~uK%4Pkn-rXu1~8`MqQ>GGlYIx|4gyTZW`hk>mNUnV zX>6e%-|l{4hy?muzoFj&7xg$Ui{nAovBrv7$9m2p$CGfp$vVCi$HT)cnEsbJ&%c5= z9tc*6oCDjMM072yh77Sjr7wt2={VZ*PRPYBKj z1bzb300KI=*Fc6^U()}JZRrtuE3DJ-ia6e+B}!kwfN>`241$4Wz*x3<{8nge|!zZKmaD zxDlM53lD1LG+cmyU997ez`l~>m^%;WH=V6tjlF99jVN)XBXk@|-2DYe-{)7#gsrCH*e8HL0486^$k6)qv8! 
zKn0|msP%s@PPCRibL=?F>dZF|Ea+oEZ(7ie0EKbqFMIkNOa_>ymaIIp@8h>LR9~#x@nDG2PEkqALZ&=q0`A| zkkpx{719p9=3e@jsEl+sBlqiBT&%f|bmH^+7W65gcP!{u7Zh{u!(iH5vO24N09hSJ z2O<$4*%ZiL<&b6lAHcW{Y<0@H5=pXwrQbtf>82y=a1xVj$2b$Pm_uORvzQqc6Lapt zR1*PR|33g!7Xmuzv_q;|XVOP-Cf#i0fxHM@%}3`g<})w{EM|_y#3B#|Qyq+RcN!gV zKcrk&0T_Yo^_45am8zE5LiR&!uYjyhMnR;e4J`dW0!tS{9?FAkfWUPu=5xKk_Mru( zSx_vtSsnO@JRt2}IuLh`+12;PXy6s*3UF#*;XWcs9imf^cbz1ue|86RF{z6&NhTC! zIo;gPy-GccVhI zBe|5wV`hWXb?whK}Q zz1Ft$h4Cfba3m8yno7?h)bSO@S80p+Q}dr-K{{3mGzpOBfa!n&fG!23dDN4NgUq9z zP>e_30}~G>I*k#obY(5sx3FD1#yx&@!!{AUuAI`(<;vO3IsY*by>vdyR=5?1RL0qY zH}NxtjxA>${s!<$ve)se2T}pOvX{OHiAbkomHwtxtn9PsvGNx47nsr^3(~Plpjg?5 z0aXH|bK+7!-C}nCN>dRVRY*1;Qz1R;xdHjoSD`Q$p3AV+bcVjbxDgrYdfp4E%mDfX z=|5s=MxSu#<=HBXDO-&lWK8xtroCXZ{n`eqNTcpwNHd>W=A5R2ByhR)`yJTta{Tft z9**OzV+|g$j>lTZsz{b~TnWebtY+;#_?zR{;MjmwCSQVWOd`70_d)VopVEJVPw8|_ zdodST)<10B&ZKejS{s7(1CT=2;a8AoLPomW4nPW8a?&4yoOB$a%)VS1#83B@Q~7v5A+6Rt(p(<{Xot3p{d)|D}B!BdeBaCj0y?R;23 z2hsp%f|Ax(Wd?eAd}ZLPgwB6*ror9}YTK2wk8t#Z&sh*>XKe5U?7Ly_Y_-@Qj1fG@ zHU6}<4`AOHd+oB*kdLGO>A!5YDFA)}po>fy05jRowUyq2Yyt2I;@{6xAaxduR=_uu zxEa%>OZZ6SSM*x0(tkcYbo_cBM^V^xEan85Qx>D+moTwN2f+O8kPew(BCKzx)Y`XiM;-_CIpQyM2B5zjkO|fT^C6i! 
z&xXM~f&CYVKhrRnf51?>HTDu?+D+wmJN^zMe_FpG9J}ithlefb zJY>c}M%!f=c?RQe5f2#$lL96SjGFH-m=;W3A~TL#EoUMS2IA?~rJw(L}?LK4LNd=(Xz!WU6g+ zF0vEAQN*8V0MKzj%Jf^vwIowrS|=gLENSV#g|u`!0e^)bPo~+41|faYV!jUnI%`2X z0Sgo>Qv>LPWnF+wBV?+5T%>n_IfnQ%4MO?{i@89Yi6pA+dLHr;_Q#QsKhun4ZaZ#4 z1qrl|K-#Yb+6~}Dj6e}EKY`KJ?sYI-xZ2%>{SxFe?6a?SzKdN)!za;y1gK}LR{|W^ z`{Ml@b@mP!WaG&7nC0~|_;Lgmegh{XZQt_|9qAJIS+EfAq#{w=M*O8f_>>cu^JhCo zhWP?(E4Itd>GTXr>3tmG@dq}r@S6y%eSM)PaAATL@deHZ#+d`)wB-L~N+yMv6Ya?h zkXI6^>jcuaE6f|%pYsHYF24YnPi->Qb$5XN zWTT6PvN;RDf7vyZ`V#oQT0ZM|3w>XBTU1T=SzTUkT`%hF%8-_??K)q_<6)X4UzV7<{OK-1k9UE zgLX|}-opMLBs$;GX~+WeHJFeB(Z4=08lpgn?WB#Kh$zUG6*+h8!nq?~oe=&#JZtwMwy^O#CAK4&eU$wZ#;Ld~7 z?zzH)O9E3>xZ0FW1UB+B1IBMy#ytS`Z=SPj?Mxr=RD6{pJ5^zk za$jUM>$`1p2 z-gg38i9{!-CjM$63~N(%zwjG?_qzbq7EKwc=&N)V1@RWvkbreylu!oa?|=^wSojMB z)?T&76SyuxuRx-HHi8Sb5Aj)GfYA!9r2^W4^{4h!f$|qR1lK?Kn_b<=`>0 zlhgZ=XjvOTvLO=1?E)9RiW{EW!0j2h&J$RE4gKq!-lP3%AQCN)3<5mfVFSy-W&&%I zUF8Yfh@ee8fj5H7BJlIz0uXL4M(hmQ(bsx#O~EuP#QaAC3K#+S9KdKvXHZg=d8f@H z8E2z?Sp+=2Aksk|M}6vXgx-3=CL#P5lF-hb?vP>rHF^4LRIPkj1R(PwKwUO;CRxX2 zL+6nK*m*A-!4}*c9Xo~svRmNkkKLAdOZ3(H&UNV8UhXw$6Szcd3F0J>${@-d{^#enePwn4MO+G}&{jlDL;Cz0wg?9;G+82hRuFc$mk+PWLkdhF{IqW(Ht z_U6F^wlvM_Wjs;hhsk)VpL3suly-aw83$Uwy>T$myQyNAMTJm1N6Z!iGHubv+MG20 z2>2ws%dTOM(F*De*;cw>PQgf<%w~*u=xj0zJx%l|`jY5};Pt^VoL?~7f=>H^bWhOH z(mjYo`=Sf}x*|~`+7ApwwteE7BtdxSROK#DRh~ev zsthCeLu1@8tlW!ERgvx~sU?{@00s zeG#pIp*-%#@1j7hNfC)S*jTmVW~#kfOG@+UnB#NTniSy3Ci%_q3-2)H;PpS^P2}4t8TNe z--Bt#=O(>+KN!dT5oOPb#JPV)7-#Ek2D5QCgACM2k03pBXOJs_@Ri&_-3C$u4G1Li z+RPMZkS|UoM8*#SsAfx5g^X3E&B#EE|0`ny$uzK~Rs)T-Bn4nx}2ZP;CFJgh8Eh+sez>6A0SB0FQ_nW(IcFS?{DA~!u(?|;7YLIhg&WBlSvs(jZB-yDN5o9cw z*vZ{z?y+P;Qvof3tuE?MAsqqb$<_vTTQS`vzLi<|?Y))F5}c`v7vU=*r;9y_rN?O3 z)h6+k1t4xdWM$XGPM@Qd^PrV6LMgs^DJa z`im;U{JZmgznOZ>?M|(PWmH6~;2A`J_E8(eXyl^#%QY3N%}w~w!drolZIBiIdE`d0 z)nZLw$gDOu15$TB1Skcz%5EPr1yJnyOE=Rj+1tRZfUGWm&mq0R{BLB}m|O7?65EwH zc?`C4unU=-7hAK&++xXY05b}*%I+0pTApNAn;Dj@R?>Zt)hgPF^eJRII~#}oHez0- 
zb(?LiN!*!P>BYU3%ygV_SxZ=gvog_UOP{)v&&TH84%eB)XGc|fKCP0u4T$nw1>|w^ zRP5cz%slg4Yi8O!9|kj;JXM!hkz4c3b4|$1vWcbvT1}$5Y`=sIP=3gN<>|KJI&-@v zdmlcYfUNK@Be&&AcAc5+kmdc6Er7;yCOVaQJu$ zHhMk5doioZb?L78Y%QBdwaTQI^98lJt~bfegC-;bI$>>tKq>^p_;qA_&s2qu;zY~*CYAaqx>lcmW*ztn~+89H(A%-r~jPv-qErzf3J9ftxzBae_hhlho&|L5FbVm$N z9J*ax`Ukt&SE|wZ>u%r))6q)K_s`?QaS-2R5*MbGUzAz??m~C(Ek6O5wO9a3bbDX7 zT5nRZ=$$+6qH>Fn#fbjM>JsE0WGS)?i54c20>y?dy?XGd|XjT}ghIUB(#K<{j*&pii*)Yn$=1FS>UT z@67`CL4B3=_W(Y!n)1gDTz?_QVI~d*dJpNt0ME4?LQmCmb?>7Wp;vKqA3tB`r;1pU z{QJ+L^~bxnS#gTFGPy!;eCQS2pCMb5S_*#vxgYGCVD}?#E!oHPs*#*A^Pc;3Y%=|D zf;!<}oLor}c_QO{#D92ve$$sRwYtVq`sCo98D*~W(J@hsf-m)w zDb90(4P)FaKP1z^44_V6z?|W|glNep707RfF;A96x;Xt|72Db07jhQb zti6dNa8?6SC2;l>m6V^45jE=uOX!O9H1 z#gW~{F|N(0*o{Q5g(iJ(7MgU25br`e2=z_YpB7rA0pC1u{e>Kdn{X)5XGo9%Ei^fN zzMiZ59K8sAf}@A|d5E7XYCZC=vy%5(-fWd~SB{=%%3h5RJy@s-l+y%b-#h9d2) zA2eO8w*%M>$D8c3kgZW~WojhV4w}xGX%-kU-BaCwU-RG^^yHKfN{{QFKQTRPXYu&! zyP_)#LD>#rGmxFW<@qQsqu=s;jNiLscd1RJNiWy#TWn>=sxn29jYx!Qcf*##23~^Z zZ=~}LcAL`=zv@kA6zN=xor<#;yP>2byQ9bx+3AFmsZI~fqUm%a9c6qN>4iQ@eJIqW z6ZLJD`V{DhdQa>`T~%ugb=g(6cDbl?MQUtgy+pS^JjM((%JDm-5d-CLqJfNVuI>za zPW{f!H~ylqk9%sdiQtC4q$xPn;&FoiLM@Jf*_>YLb#E<_sYMg)R0}oHU}_<|lgRd* zS|m}6=9V8L<{E0DavejOp^w&L7`2d2wRqCjV!W+IOYBq&mAnzPklhc6?7X!ILcImH zqCOkj1W^`Vr_<>?9Wm5*-u($;yYhZN2G^MCLCK^jt^0)j&Kz01kwt9 zBC(}^pV-nBh2yitj(&p=zh-l`zs)A8R?u7tt~_nK5ZsUC#+nY9s3G=F;5Halz`2Z0 z^>m&8Mu6vWP7KGsF+a7%wzC8oHeDg8I)9I}MISBVZ~{rEB0hr;tp#34bsDPuF+Sm^ zDr&MCl8Fq@Au{wxFbQK5|ZKwnS?(q2AMDUav zaV7w+H01~^BKY@o)K37lWu2lFr`>PSt2mk1=W}M|5tGKORI@xDCSE6!7VH#K6}@If z`VYuOx?=En5tp%NWs8YpR%(a`H131OPtf?;(m2ZpUd=g&N4wlS9yN#AoT`J<>Ej;a zopNW>H<^!UhE0R?r{9V%Gax$8ZYo>NUR>0|h2~l=e(vc~E)L~AQ07PWTxW5_s%dFv zLi02<&x&Sx(tfvxJ;(d4rVgehfp5cYXDXzOP9wF^t3uL$M1`a)Nu1qzR%n|URWoR= zu%Ry_^!tSVL!rT(XLc@dlaCko$Crw0nEvkm`0^nNS{L(*gxLv)@ohLmV6JqY2BMe8 z$I}pcDUUZFO(T{cKg{lN$FuK3QXjmkvJ#r7sfuVOfPdLnl?ePoQYazf@; z00|6eNoZg#OG3IhoJ5yIezM<*!{`#*LbTTJj~;` z9*z^OV~q`GOn$pqzmxI%j`iCFzbiTZKRIq#>&7rK0UVbDtVLCamidsD`3vyo^r`RA 
zt(>krYx)VZmzJr3xu$UtvD%WjI_yuRJbKNd^e;1u(p?URSBV{6)AE`g^Y6MLs za9W$Sz8J4dvtOo^I?*iG9H4EsQqe zAf0d-wo11->3&SQ=anvY1(@hU>3s-ML*|*7yV0LAl`SM0$RQvX1WCW<-HR&pv`K?a z70gwk5wK}Tg;dbLRSop2ko0@0kaSgG`CgtC+HNwd2TdF}Wmg~EH)JPVBDfEtpIeBg zmqlN4b#;P|!T2hIE4_b^{5H7suMk|isyI2AXYgmtD1zs+!JC51BDioB!F}kImL84< zj}W|o4Sqew?Fp_z{forg;L^WFaOrB|weM5@##R2T#7?9l}645_~ zO;Pk}o@?oZFX)5<`EkJ3et;W+M7t(mP(Ays8AU4;vGzl-|62AkCthiqt*p~*KG<_6 z8CY=(8w>0(utFAA4p?~yRws9~=gr=lK~v1arUE+x>=K9EPmuGi%D;zA)^HRLUliE*LGz9q$J*I&p?VRnM4^dX>l zE9)|bghuXzThMt zy|c$lCar4FoCl~m>;v%om~;4oGhGeRKlq4O7^BBA@`%ut=3mQQ@S{E5xb=zqlaG54 z?iZ}B&i`kz)!Dxha?0Aux2_BZ+r!pYr_x`r)rqu8Jm>zOea2b5@|A^gNpRm_JRp@< zW-u;+e1~4wcPs?%B#&e3e6u$T|-`%rcgK zFpQ<^hP<9fmH`N#w3s7cQY_|Li7F*3qZg5*rvet z2eHM`VQdGbx@W<%*nV#VOFxvr()C2%%!6#0z&}~cQT;KN(H7Lpf+ojc8wB*DC9sD; zJyN-$5BBAdpV0$JkiHDQ*u|X}M$2<_J7oiDtSn=)?~8n7$^D~u;dyL$2{@egEWqvu z+^-B0@B{1kb->j;jw|E%fOV{~>eg{T>v%H${G~s|X0y#E>&#&a&OWx7H^9`im;n~!BjR8JU=D)O<>paX zga&cF(X-Tc$Swlqb-m$R80J&UJVLOZsqRsw61Yz+3F*s2Lb`#-XL%6M;*(3CxtLJ! 
zO+d9RXpjZPVx9r$kR`AS%%faD^`g&Z$QS7KJVW{id7dHNRY>%C#$`0cQ;_jK&q!g_ ze6HvH+AMXh?r+z*F8;clxI5yLF|KXfW9XhSUqM1^U%rm@bkFClAyJ4k5kIo8cR^Ou zm)3Sf2;<`z>!^AqQrKr~F#g77y%*bFsm27bOG3Uzuia4kid0a#!N`|+ls$zC?zNb= zz|;j35@?78P0ro9?rwM)pT@Urb-X>%(>>Y-!Cc}ao5d5JSFqI~P-mIUPVPD8TL^g% zfx3<9KI}LIh5|BNoJwT=QF8Md13EZKzt$d{UPZ@4vkwBjvDGo62J!}aRY3a6WFy^B zY##_W?FF?wQW4BamCCce}s z@^v!Q(><(rfIw+qIa4U-+ZZeJS1|6A>Yib0W4qsGA^qiKF5MX9WFDEPki@$d^EQ|U z786Rfpvf^ZkMK?EcPv{SCBN^P`29`cR zVCk+$e$aC)UAO;%HIKCoJ0%Yj_-PAz2T&smn&^VeAn25&t3r2GB-J;3S|ishsfvv_v*oXVOo^nRGgC z{gnrS3_AT`i}?x6G>g$`OPG*nF0w0-0%#K;y*iKvXmrfxe91h5jha(7KlOBe)bisV_WeChtvuFw5zo010ffem?`!V&0Z~+TOyxAq_ZWWGogt1k~7ZV zoe#ofB_L(>H2>Sm(Mrds_Q(qKIxI?m9cM-9#v=uBme*NPx2L$z`a1xLp{Y0klZ>pg z&oD2caoVsfma_JUy2vAot5=Li5Vf-2C31@_cyWbM{0L-vdp4nr6 zHL}L~lD>_u|5EDMbZH(0!Z=%PL7xHY1_5DoY7!jH3~J9GiM-9Gkc87V2oFtbRzOMmq3*8_yW2C>c+fg7K~H=K87)~k9?sHAvyL@( zgfZ>;*CFrXw;Dd%TEA`Z+rB6Be__S@j^i*{hAlWB;H8k#-kyT|0=Rl^QciEAfSG48 zrNJB{p!W7r$T=`IatCxUK`?VIrl6jGFDH=p@X^R0fNI7F6aaK5AYz!0sMv^9-ayB` z2QnYMHYw?^f`D{mkV^QOjJ?01!%WRWi}^z5|DG14qnJRk<}-^!#sbT_94t!FM3YaBe zh@Wr@%q%jEHsx}(|MhX&72&bDk{%qokdN?GEb81mE6h1< zsdfAt;Ex~^bv&4EpL*PNtg(ZP=^7%(dQYz*(k;iYQ{4>w-DEL8gPEgm_Uj@s35mnF z5#k}^b}XQqE$DYZpAbmB#w4XVM<}&x-j$aFI@(p znSj*I9|O8Bm7NX12xJy|b+hyxa3-Ce8(e|2$=LhXdVt2f-D181)7N73+&~!T{O37r zIMa|tNoNDXU&6m4z^6ylbZe0=Jq@`MhXavf*h!o#Ayo@mUnR+C zyvNsL_zKOj&NMiILG9IY)(%I5ea^CQHp<%P$9^>S+MaJlu8I*njD3o=S2-tQudP}3 z?Op$&Tx>G{jIn@108#;HTfPMe#t2}rAC0}%_AgXp0X?Cu_g17F_Q{BUHR;ZKV=d-y zo&UaOPPFw3ln9^$;_rz7pj1mN5n_`drma`RuEyS9sQ|>rTl=EekHcQu?sTLh_Sw@! 
ze0QD*Fb)7?jdav{oU@>gSxr%R&w}9+aprpLbQYA|l2jb~N0K-9_oNFV8?K0O*YPOP9fYXD0&Efm6ZXAMFOd5uK*sJbtpMrnYMN zsfdp*h`*W<{7eGFD=MLc%W+hf_bIT?-XOxP0Po`r-{T;Ur{Ew!P~i(gUfb?WM6dZJ zP@b^_9*0EzY{Y*GSx_h#d23jsDzCgzJIKbmRHn1ENCa|{Q+dP5C6ZCpt z;2?o}TU=FeMZjrSyW_y6Q0-BCQ0}5y$JlGO$1a3YbN;Z@zvl8QYmdr1A4ie(kgHhux00Ts*}mjxS$Egg)R-64i=yd zWgq};Ch5qe7=X;OLDR|FYaV-fOjFNvgbNT=AFUmRr2-<)I4WZ?8MGc(dbm`aRhrFmvEFV zPy{fzCVkZJ@zp|I@sq@bkgdr)!b^i%k;{nJ3dIm~EBL*;$7VA&T0G0G&3J3m4I8Kg zy7IqmV@MX8Yb{6V?)CJehojq*;*{l*bT~#y_rO6piX#UvX$9f*E+krg+!gL!qI&`XJMQ*wpm;aaHzl19@0)>J&&C% zdE5wmw89TlJ?#LiiK7Zn(V0XgZi*3)1BkgNmH+Wi=Zq%kbp()psXYQTM+cWc5B~S~ z7@h{;uo*g4O1cQyNq09AO>PY_*7}l5A-PaXfa;@80EhXq)6$NX)|n#DV`qwZ+yZ-L?P9&nj?=r7Tx)Ei$t_Jq$v-96>Nww~Qzkw6-xFjqAMlkn6Y2JmiF8YlXeJNf z{6SwPL5Q>g7p>G9Td8*Fv_0rrHKdn&oyx%14Djw^r2d&=6ExvWR%2TW(9Sy)fOg%D zNLn@^$37YR_SRne)-dd~XKh9n#jsDp{wnONLTE7dp|&)$I{F^uegJxjuQzql=TvnL zZOap6ewaf~B{(-uiIj~qN!CFM4i@5I5!j_jdThLN7lt2FmZ2S~CBdPyL9 zHxB(?-Rf1ctwY?qakR6Qse07|tvEE9ES_D}!mgSvZNm<3RhJWZBZ0Rds_Iax`zXJc z27?cFMCLz0^VGCpm9-^cmGoX@(|>~5M3|B~7OP6N2dgTrL^l6F*c7m}qt6$Wof+mn z_uS2|n+oPPSKzq1&8Tq?wp^U*;`TU>w<5}@2)28WES%M~_DZ4&_DVq4D%o%I*&FQZ zV;@q2 zMDEjj5{og`96f<(e!Smuyo!0gaubDF-1bV|AkiKW;d9$2aYds`R`tHbG{C7guiJ3i zse!wu+gr?xK6kbehG}*%TJMZGw<~b{>@kUp(=S|zFA~bFf7v9??_GES2ia2iuR)S& z_}|^qA#Bz_{291XIA;$D5-;Id!IfL{f8%i}q@FgKL$)46WT#g|~;77{~WItLTkS2S%gqg3p(1-~8LQYf@#s z?Q_T(K-)!Mtz+BqZ`=NjO{%IlUL2^)X}mZ{7qMlNBu-K5+5YpM+9mnwWT%3$3$h}A zDVs*_zur;1r0Nc`{A&gCNE!%R(YhSA-6Z|%J*7$2bh3lVSqWKj?OnpQh@Ag7vb!aN zPIej?yKS%)04vyb{ad!X#rLcn!|G#_(jY5Zx1Md^za_g%GU;T+>2nQa#nHW-?MrgR zDKwX@)c@#Ob&|=oLVY8-Tm9I1 za?zVKpPaX7lIVgBZ2QUi*Zsd+Dxo)NG&wmmNlXz|inH`ml8W2(KboYURg2DidnGrW z>>$etQk=foO-_f|9?89kt>gWtCtZq8 zTH8qQW~;@;ajNN`FDGG~s*ST)NqRUrXc1UOSzH?SNi{rW$)Xcj2!X*65QI0e9TNm> z0R2w_dnFH@z*3T)K|q{{zW&by9MjFB!rw|^6fUV_h^NZW!4L=Z9x$3YHKL^RMKMgh z>grm?=BekcAy4XBbZCA1m%M*UWB2^o=>5_pYJ2M#Vu8^Fh8Xb&!06*t``=*L=(z*Z zB0#)!kj;R&0V2lOk^eUkFb+z)!1z_i5O46C!4SLtI2b>c_;-6!ecoS{ 
zC_QjWUtFC2Pca8{$X)*v@;m9VQ~I)hnf~p+D7(*L3s-BnyZYN60a2IBqG-Tg?wUj= zY+)<@hbb=5As0obuw7!i%yxzCD%($Njt0ZH0u49V`G+ZbU2%N^Wnmf9jhpyy$k$Z;RaBf6J-dAjS-rW2 zZ6ojx>^?d`N=w$Zux)2^{sw&mT604O{2UC;gD(s?;pKoQa(1vi$6iXu+t~=4xQ+Xo zO)zQ%&`o3$r05ZaR+XQ|6lu82pWmR5$#E$Fq#pjCAlsbo75O{aZjl>6ZgV!hdtdf$ zIJ;;+IS7rDqf`h6s*mjfASrY!IgU572lFKM1x_)jg>PI^5g7sC*v9}E0T2X*?Eq4M zy6845q_g?<&Ew(AgPaRq1+zXa7nq%>CkTHdE)YM%`Zk*YB>80#!SaezQkOrIr2K!A zBZ&R%l+(>A=gwbpiiq2anEymlJzAU(yFj{BbY)Arq}9LZN`8vPzSzzHs^#BiKcr1| zgS|qBL>KFDm#7j^AG%xz3mEAk|69+#rB zoR4sdgk5ZRA@YFjA)Dx-D2n&_kX^Pq{mVFQj<^iT#fYpYSfkWV1x>5k`VH1P(u}&yPeb1A5 z#!W)Sk>*Nh1V@?(+spPFVMRw8b=>%d0&!52MBD~CN5t%7`vp5DM}8L$G-3Oe&G|qJ zf%_93PaJ4=SH7iCglBUAZUj&yeaDuMLmX+M&_0edVcX9p2!#Ul6e%M``}wUy`&*x2 zJNfX>_LogG+yeu*51}tt7R2Yf*z0ku`GCEVPTue0&$$Lq2LQv_p7GiF@+aH%JmbuTgW{CCy^RMTGYQw<*+bGMtdKFjO5R~gmes?N;E;>v`8qzISAGtU?ChU z75ucWoqric76+&w|2rv`&5Dy+IwckSbCM(Dgx+{juy|QTd_IlcPcOVBhU+lZzGoB5 zY=z)FXM5p~hDlXp_E0 zw)5B+lK~>8DYio{xu1Hw#v9ZLm0f++`5sGlS$%k2%^ zybOKRNkMLfhL-B2Hi*6Iq_&7o>ZD-Az}!h8ZY9clm%d+4s{D?@{{8Hd?KanOQ31N8 z+LPVfy|*3r#8YT9*gf0TIB#_F1f+OL_0ol>O!&ZO&K}><74aDC>&BK1fA+Bxnqf-r z%r2DmmWG?9kEf6;9Sqxg&MCtC*_sM?4EEDc4~CuZ>grT4Go#!~&jUZ_oSSVuU!ooJ zab~iUpYkm;Nz|Ds&v*8Cm1$G~kHLQK`Am7yYV~RZQw5cdJLzb(*Rb&wUwXSI{Ke}% zo3T@jdVhv9lW|<{O1~@mWJbHB_RgLUN0rB8uwR%tv)$q6P8DUcFA0#3N9vgxo6q7dFdh40JPkubEg@>M_J%@3?eLtH@C2!u9#F zxE@19^6FM^?REC3mpr6(^F-%#jdp9{_GednDUZwJG1wQK?;8EqzP}XoP90a_rFW#o z88N?y`6c3IqqC>OP7mq)bipD&qQq+F#{K@hJRX=2zZ0?g(qC$MZQWPurFZ-8zj(b* zHnzM(Rd+jes#oJCw5^k)Z4G8O|4W`%hZWvmqc?^s@2<*FrbUfak~&OQY&XAB`n{H=7-oH{w0O6#GV7hC%DjHt zlv0OETKjC>scbrZLAhMrrj&g0u`=cH2TINFCMmI(CM)e`&Q?~f`&3D(FjE;Fm8DcT zy+Zlsjm|b@ORX=I8EY;nOTvaKbz|2l{g!N0&h+i4qzp+@o_{c0+1zf4;_Wh7d27o8 zrLI|1B4e_ZKjyDe_N>cMDn^;C6NjkEXxC4allMMT_KY~8tnD{j8PnFLtf{kCX|toJ z^6bhvWnx2FxtR8ba((PVWz`y+a`eGU<$m9v75~+_6L+~84YsYD>sebGo-0fHHn9#~ z=wWR;z|U%}a9mllvySyx>oQjN*gMMUy0xu7uH~$z6nCrJN2RS}v%RgK^fFlo|Nfow z=ERoP8LpMB4L@?T4teXbGGpd0rEZUh%A}#CtiO)Gq+EUKYYq6Nvh{XFv(@8!Z|nB* 
zKGu;VOIbIssA{d%_-C6EnpfG{Z+02$^o13yb9!A;T+T*Y)sPp;&~NHn9}K8tz15Yg*qe?`^fZl(QzzDrYVAd7iR! zV+rg14_aDHBV4UnweBd1uRl==ch<0W8|7~OzF8@2!~15dwN)MK_jPk8?r}4^+JaZC zS6)<_p@i*TsZ6-FS=sQ}0p-Y@MamENHz^Yu&s17}IZ5eTGfTPWJw~bc=tCvw;OEK_ zX|6J-dX^Hh{|jZ#xmilitdErF(=Dx0<+dn?ma58`jjNQ6*E5vVS((a_O>>lew?`@$ z0>4!DmtLjJjec8MUh9%_^xy@>{h{5aOrQO+l5%2$(r>k0abN6i9e(3m<=TS2N?60m zO3tI1%E@8#l^yG+DKj?AQ+__RRymhtS1vxCulzRiJ>_&@zA|^j`^uaZ%arZzA1Lh; zzgK$Io2wk^IY()JWPx(E{7fZK+pe@W_g4;Io}&21Zct{7w=3nsKUBUrvQ4@5dhWyn zVk}Hv?P|T7W3av!)6}}zDp{vLtY=M?PAOSuJgtLTnXNmXxL7})RNLC`xQq3@wm#N5 zZ+lr!S)VF>!zxyVoy>~X5xr|#-(T=TiRkf>a(uJV`o*x4*5T)VQBsmB zTfOr9t%D~#Q>r}pU1@j5-MVw6pY_91^{fjnZ?P$@M!HzPO*30RxK+;DrPFW9{%U2d zrr_gB-`*$#l>!t=3tUY_V zTRZJ?w=Nh}#%gbTQn_`lnswTt($?v7JgnN&n@YWrr^@~{){n};^z+KzoZYU&=P@!g%u@!E_b+b@bWF36hj_^z^|##hS2PCFFW`v;UY z{0RA1bH7$**S(^|eDGLV>UUMCcV)FQy_CtCc+6x?ZL`m&oNBBoi$|BRW{z;N9>}O> z?Q`OL#XoI{V(4>MdA53^((AK>${S4&DhqvnQPQq%R?LmIDIwn*09bm_(}2m-OGCG=zZmn$+wi{ zG0&BW-6gAih?n*7JCgND{B@;_-OHLa%o7gFURb-F{K}T=KP+IPPg(FjlfoEx4dGe*BA4Vb^6vJ%3$kQLRwf z_@bn>)~N@|zR~xTVV6r-7cM)c)cMWd`lMVX>xVDQ*6S62R8m?zRyOP_X*FnNts!ob z^~c`FmHdT|6~CO^iAUYs47OprS}UDH2Pv*Ul~ooGi&PA&%UT~A`Y5|nVwF_OWMz@- zSmlfCMw^}SgE|1(MTEJFH!laSu@4;jp<6t z#n+UCHk*~_c>@*ewGc&V{JC=Xn|GC)+p8$mf;%baQyVK2j-6IYO_*#`o_p6;MrA0< z(fS`K6UX*dwr}`Z8GEFG((llh%A4+!l-o^5D8oD|D#PpdQ9AZ$qjdh*U#WK@SotWS zqjF;8NaY`UMk(4RyV54#SMeCQP>J2#Q%Um3Aaf8)Lv1Z;oj;DZ1^&;7UdfbH)+{4~Rp=@Txwv6aZ3aLL`%(^jE_oVnQ~ zg?k}Q1}VI{BA1=X`ekyIT=rw^uS)JNTJTd#+;-3G^GYj;7N`Zc-RGFgzMx@!6**JQ z%y%zvTNNp1PB*Fx^WEQ6bMoEYKe_8}clkJ6&dg1}>+WlOZCbLnq#rK{zY3`q+;?B{ ziYxr;KD|<{6dKSuL94YBIX_NY@~KgZu*GY&ag0MBz{Ov^KMD&G zs&@(V1e>E`B1}Fq*~7k6w1ZLq2TacW=3pS@zfe8g&Erd%@9}%CTRLhpUq>#mYGyUl z-F(kr&P_L)%Nkr(;Zd_Z%umeb+6KF9ah52Jl+->t<79WbK{~lb1rf?dNoMc6>a8g zCcAtiUNe_`by6+#GEcJgqi8FNckZCg45j$eNNr}pt1J4~MyZ9~kVx#RT`9v?sLv*8 zy?Yz%?RzI_SJF-P_RrAo8}03j{jW%>g+7odpQv5APf=5{);pG>laODb*GbfR|NMVJ zqLg{A>$()}%FpWOrOljeN}Jt3DPx}D(vjX!Gs>8s*m{djiG$}3Iw_P+^Ft<}m%(1F 
zzbZNVfod#AKlS#Y*tTTt%3eBUJS4tg&s9TiLB6Wu#kgais>br>Nrv9J>E+GdWemP* zW_|O$Qoa_W6tXx*_Vwqj?$;GLOU-OxepIq2pLXMsqUEfe>gNqauQs4p8<}U9^4(#O zLXKGFEUw`ps#W$?vl^KTTnrsZ&T4GF=2{pjXW4S6HZgk{48FOUP0f!CF0E6v3%ME1 z%ms$BA0=xSJWcj?UnFT4e2n&XBNDZqYE}z#zHKWO3zl$XJA=Jl97*h`cE$ErB`1jo z##{|b1afkTrf7_0Z&x=_yO57OAFuU%1G)i_>i6o&sD**%NiJTZyV6^k3tV0DqU6T8 z=^^4Ga82Q3Ry*@GH{Y>X8yW1aK1z_+xJvd`&pOI$en9q#lGo&BbugDOc3pStJL7JYqMuu}GhE`M>vh!53{^jm5+gW@5gfw^?z)LD8SPNCu74n3#As)7vtrDJ z21D1}tXT66L#3{(NR+HvS3CZ4igunkE1tNnYF2{zhAr)e!QOUSvV7}xd}}{)cZ!@g022T0Vt>V3d826Va_q6{ zk!+r5J4T~Nn(S@EkzAMCMkmW@JtceFV*9I-9g1diYKatPacNs=c~Fv^wh(EiCziq> zFjl@*7iO=DK{0Mkc1I`kG?&{912vnU&!tdrk&|(k{Y;jjO1Z|Cr<0i zuS2{lsTRIvPPd(T=2YP$Vco3zZ+(HE-s z81rZwKXqx3$Vt{pKSVxB)aGqR&PKNUKVUTA57qcKQ@eRqigioUN8&G9gwQ&<^Q?{FKD1(b|oLWDM!34cL#o0MM_LzEV8!ap1TCrJNTEuMGkA_CCo5%)FmUm7K~aDsVeOCP|zFjk4xjvNO3 zE=9RcL`85H{yW510;vRc1G))LQN(`( z*JSMPQ~3t^VsILxD^wamei;6D&@Yg88@V24Q#r&6Cvu#(q<33mY!au%e@F7`Gvah5Vs*&URDj+iM*M}-RLg(eVH zP7X2=eI@qN$ie6Z*wertlqfHI13j`pHBL4!beRzkoh6ga?Mp4=wkdQjVe@eJN2-q| zn=>2D0-!mGlORxwh_4{%OGF70?_lqZT#l>(VRh3K^C{ojvD&uB278z}THCgcHM&Cy z+O}=z-(Xjlq?>EC?24r{miQEH+ZyDYSgkGRhp+~5THF3)Op4dGoj}hggK1OPh&VC} z(mAtulDP>>dos8y4dcMw#(^AmGC^xw8Ce6JW`>PLUyXlLqSlr-9K%`*guPSEbq#IR z{Hf+I5E;|VwQ8k<$0;rBES276CDR24pCS8`{{Z+uwv>{nL!D%Ka4SAnDmHZ6s4-&N#G94a++YyaY0q$v=b-skHG#D zekDaq{u0>`{}<#}!#@oFhIlRcJ@iTFjnUshuZ-TEJU8OMq0wBnf_fxt$^3i%YT-=I zFUg{LcM`SaNmN<{&ALRqNYPGoKn?-nU1U2DKP7G(m8YZsg#R~W4g9=d9dwOm`{TFJ zoe$6hqqGyEWz&iOmbi~;*<!4q|-gL9d*;b7c|xzLDfr!E9Z{>F zdDL<_1=rkD&pcQMkkwNK9(!GUu&7zjJwnQk;*&f4Ysj+3(HT4rspdcTIOQ@!!0dhD z@m&r1S2|!M9pO(vSrTd@tzoj(09jQuVRfO0ughtXdZ>^x^5#R9??kaoqotNhx&Yc? 
zwD`Kr6Q3>`ElWIO_$0b*k;u+4Svt6M624O=%jIafy3rmSLibmt>~HO4`F#Sbx0mH_ zvCo9YkJvk+@50_TRMxyq_Ta-Lu)r2Pqn)gA_6RPGR0~`zn|!Yk@dV2t2zMiM29aIS zPm5h~$klSRk30o{jiIt!Nk|HskV`^jtt--lwBDpurgf`G3n%V->?eUqMKUn7Gx)m` z-va#|@{b`0BZnXtI*7PgzNjz1N7yFPTLUu!sUUsZ%i1VpIpC|&o{wmsy0?TS-**Uw zEm%_Fxs%Ka6pRDVCL#{GTXy(@rB%gpmfRLZEFz-@@`Wh5#B6EJ9=K?>TnqUJfr9&G zmw|%Q>e7(#J&ZAc;_pyY zv}ghSE;_FaA?m3z4D3utS{X~aZA^@ocZK_*ryaCBChZa9(9d(La=n9=x)?n>MoXQG z|D8xJl}UWWA5mKBTSj}thaI)l%jjE(Yls|%ohL34;n7;&=g0==e&7s@((>RCQIhzI z#I?b`5Bn!pEpLOtt`?THq^G4mA)*_R+>bO9#9FwgwF`)CwfcrYQ5ehy#EpBCk_a+@*@WumSlRcD~LNv69|D z1K1?|ThTp`PtciPJMv3WG>;<9;wj&39Fq1gKs8G@-Zsn8w|C#win!19<&MI1pt zjlPE#X5ddiRz^1^XsOqcrQ;prC*m}5pP_H1C7&R}*exFrzW{p>`dN01H_`>BO|j>L z&qOw25e(L1uUJAgRyS8!in2K9Otfx5%|TrfgihZ87=SvXDW8l zLmILk_PzMOqos54AH)A8dVP5C3?L$g9##t~Tc+DmI{-T#xLKt2CF~gPgUFwe?~(o# zyB!*w$08DuJX?-}mX`@Slgipt~MBHwF=r z;Bc{w$TzA+UorZ`0!=lB_HmrEY2zXEuPIWIpi{KvB>EcqO!VzT{9@>r#LnGHL~V+2 z8zBynR4?>Q`dBQ-9Y%^iuYhiQAna(Kj$$Dx`!4F9Lthhe`-J~!+UrReK}#BV3A zB=QI3ZS0reQ;NJb$bCjeUsX$b{Zw%-X$sP0B2U5OBKi`L8lf{oPc2V{sbr|Dt69#} zNHqYoThUVcBOj4=3pYzEdFzlPX<+V>>Xyr%w(IP@pmvG!bsk3tbxV@_v#=5L2z|3; z5BeU-DJduqnTL$RUx>eTqTHYRo1mE~@^zje1oc54#($)f+<%DC9^{!UU*{?jR2w^^ zAm~Q2-2WW9nY_Wohj*5*kHVgquNoU$CZ%2H*bOR7lCPg9=!Y(H{|uZqiqY#qYXD&d zEGRlf?jMQXhPY0|1p>l#KIkUJXk^fdPV#k*prBLuKO(LHq&XskUL#+{ega+1Z*0l3 zjRWa@GN#AM*T)m_0(vZ{1XV_UjU0_kA>$eub%+C@|1k72=pQ57!|XnBB9Gg$pbJ#s zMUOU!M-?7HZ4RSNr!VNb#Nyzfx zy(ai1%chOUVDdO)M4rI@t{5#X87;>#{)RIG5+z8=z{z_&k&EbrS;$bxb4`g{3#k+6 zCOYFwNca+e68k>#k77T~&J{ZQ$i&4YvO0}zO`I4dtZPJ?*}2SkBOepr3%LM)iU|^d zmbtcHNFGaaKE`JRdmw)SB#LCwqrD&zOVVld5Oe`HhxnlsV}cmj7W;e1Z;0dK5P6S1 z&;*?46caIZ-x!g}*P%Z0I$?5o7!&L-P@O81~8F^`kq* zL3JN}Fw*u+H3nH0MVr>39L30E(@2?Ou_Udf*mjD{CTS-24Unmcew=O)N8})iEF-TU zBr=K5f|QycWXW{7Os1ON#?snsav|Xq{h;ne<;W^%!}Mnoh;7MyNJZwYk@KJ_&K*-g z49EVA1Xi~rJ0e*vdfB|+}{WnsULZX&Lc{y^{+&s{<-y zj0Rr-Ow9OShos(gsd8!5peb+X2ey zF`+$j3dx7C??Go~8q8_==rxMnLas!AM7}q&17t14)y2*sAG{2lZ32({$;jqNE_=Zi 
zSJfD9nVFQpAsoyho*;^f;WPrd5isV;VuvNXCqU7eQwLiB<-LXA>lFDJ(jxviItOUN zZsaE7)r;YlB_Uj0k9NVrA?;{RbTstIFq82i_ScEz%oIEk*@Frru|Ghc0TXpcgr&MU z;SL2E|3}qp5tgi4s|XNl5gMg=OD_05N{Rv99D7+JS729{bgTmY`3ha2C=QBPTFn4*s6Jx6tb&{lK}0PM)E&4R_?hwcPRr#`uoED5emlt*- zh`)({4*ImtavD2JVZxWjrC-^>Yat%uk{!h;?ueohydURooMQ_rn3*Fk?J{@=z!5z$1AOKaYGDjlj$3aNv9qh3#&NC8 zr#ajKD$T{dfc_?sImswj(Oc8!TtyXbgl^H7N(6pS^v)o*r@}oLbwh6q0~LQ2 z;(kZ}4e5iwCw|ebmywKr<(G%5G1fBIkgaCMS{gOti1cHE6h1Rn_WJ=@Azserv=AN| zC;MGPzMUXvA44+X@>BC;Evh-2NpJZ1SUFqGh+|>z3W4hCI7{71*?kT6@MqDoU%JsA zez~LU$LT6OK90crILnfjtCM9vPSxS>C(GH<$UrKUA(FX^AG;!a36Zx!n1`H%+$A#i z##?HYtpp}VOn7xNIYPqcBv7&--tq;F$V#x>K%^%U$E|?hYT`!0`A3jfkf-iQwCwer z2YcqI;T@1agWv*bPMYCv0x2tr73m$~)3qd41^0uZW+q$mT`q_=r>9t2m&)!y(jio) zd*R~*u|p}WFlMU-DVB8KcVa-JZaS!p-0(wm4%fQy9`I1JI?+kB1?G-Umc1?OP-g}) zQ_1;(oMd!%OL%#D<2FSOkjr5n&IK)-)yi$}fI16u{ zBKvJXwvCaqa|=3KGL0?s; z!@`MI_jaTIco}JNDv@Cc+Tt6?JLqabH_MzF{@qC|6RRyALc)M}%|8wO7=YE0JnZ#X zS9iCxbg53Fdab)9%eRnEuLJQBIozIy@n~ytW%L=4QL}njs<-iHaEARs?QY0@gas1D z-2Y!pX7JxtC(wu~M zsK-zV`;NwP(hakUx)*!0E6z9?)zh-PyFd4~VfRQ3Cy{4%i@8RJ%|d>U96;jR*m;7r zm}R-Jjo7PG$c+3FKhH(|8(;^{|2Fyym`y~V>VSXEGQE_49;z4NT(-iNiu#$oSVx`+ zpt_?M$BVxmVoksqNj^__{Uecmgu9@ZWqP!KIn=t)SU^6{GZ!-j2^*NA`IkUG!2UV* zZG;~{DwN%f9lHMg$m4;&e^cx~qpOErxA>N2HEr=oA`T>Li#Zd8X~I+RI&&QV8>r_< z_=5JG!Yta;0AMcvVYg^NN&GKjH2>$=#g1%6TnQvcYZ&it`E$n+7D3!}a@yNtOJ+CAbNl_&dp~2p&q%XwjKjy)BJu zWdYj^%R|a?b_?4|9*KSikaD^Ltw|ZM<5ucncUoV2PeMi0+Ae7d?&0SbfKB z0>J>%DeU#}chci{;=Gs_kHo|I#Y}9&t{@Mh{{t?fM|a_Wg#B~$0CXOe`gcRVlcf2_ zOPt~RSZ3O45yVU|Y%a7eBh7$pN8Uz?Vetk@jC4DUuZ2o;|)(aRk zfvzWv8lumIK@AvqqcbHAThUQlyzLb*d0(HcIjbiP}Kk7?B4hYXeIe zB)JOuFX(fVw1H_RNnT6b1?>CrA3~Nxeu=EzMY}lwc>@3Y$b!z=%}DY(q-Zynq1OWE zIeIcUYr(09{Q&mn=>HI}yY#bU*ar51fj6F^M0^fXQzG6?(FVSQloQDSp(98=@ptH? 
z4g3mwl&yAP&q%(hopyD{vd2^ELJ@MDkAx%FUs94%wIT zu^@NICIodw>- zenXH*W8QXn7M64vFeeTiL-RN@2i_vGI`-@+Esm#wfnRsj4wplJ5U(8$N5&9e0{fe> zFd)8JoEFC!J8%T?Tu=go=}s;*f#s#&04 zsvgJkVIWt^xL=W3l~Q$Fw;NU)CrJ#h9%H<8R)J&^18 z;cm!UG@5&vz>*YQOWasGWj^{H;yG;x_CjBU>`rH_NA95W7J$Pe(YU+BcY_?0;lOq@ zW-|5n4xO>pSb?myC63qt6{&3JPV&~)Tv0ly2v@SN5Xy)J}g)Sb^({gpuluG=N$4) zI^#338aj-gs7=x%f|!EEiSs}z$-~eeW3NEbog`JnzLTUR^dyRXjqC;ZRpclde~fq@ zPabCW99We+ZZ!hA^v9XVTTDK;WP#(@vxUeuG`2JHM;aL=NDPO>IY76Qycy6Sl3NhW z9jiEb#l>TvfSt!ffd$ybXs=BC+t~8~>4fw~=b2|KC^9kf^slX&D=dsKp(z~Jb^BHXaz~!TfIDU7N0@? zmr^Yp2?;U4ndS!`q1aCpsY&;)M3?FQQ?yK+hQ~niYe#-j}=ax5>MUb$! zfWL$MI!ey_fV?x5y@%eNGIThvEXsM`1JAu?-k0d5q9J^3EO(Y(WU86tnEK~&tqGF= zWF0*066PT!oD{7-h1o$x|ck8^URJs+#qVvB8Q;+BCn!zP=>Wdo~8OO@;GCJaZ8^1 z3vw8(xdToWSg83ESWd_jw~!H{dd7P!_~%Uqg?D`!3GZ>Yv{&@WqxURJ`Yop$Z<2Wi z2BIDPLH-IU_Tt?Cl2i*OS~6T%o~6fa7T-?ab(B+AQuDuC!)8g3ot>g=yQYqRP-Ybn6kfe<<8Ksz|DcWN<)svm{Y_d<&UD9N+efSx6i&dd3VR zPCaC28T4@o>Y1+E81<2zDeo}?7XomO0@(obuNXXTN5pvwuS5S5VBYkMnMNG{o_0)I z0LKD+t&=upFp@V&ANR&?6gcQM{Jg^&Qw|(A+JdGxwe<{i?Qkj~m;{^Imcev$jy)2MORP4P_phlYm?uujO!88JZcR<#4hI%}vk( z7V{E0i#Gvv(YewFn9z5k^OQUEF8-&;E#$j`w;Q_|`5tm8aw2%!kuBo2fNI3Oh5Z9@ zT}tPYa2RJpjFBWWwG5q{pykv-&L?OsvI|Mf)I)nwgy;RCT&HqMLLw?o%Q=g_82c#5 zB!lw>@?sRlu>S;}A97EO7Qos;=u`5xffGX<(~r=v!DYIk7EWbM1bj$D5Z&Mp@-&(^ zh=}DxvI-H}7+C>-6=Z)9fP6)hNalq(g1IMlPFJDj zh-XzZbTo{aD})XLM?tpZdF6D=+&%%d0liJ1GusMn+mU7JCXMTej|1( zct+w%U}q92>gO~iE*kq!=)A}l@H>9~>6Cy7^y_p-W%K}ea|09Ff_>vH=IS#UdZCcz zx)6F6@LYo3NfzUr7svzhuwRCx*e3@ml7+sM_;}*@YD55!fkHn<--+x?{zhagI)k}I z=+k6KA}`RGOo2ySV{}pP82Q{t1WXi@!dSm1w8p;htfY_%s z>4_K^Pe88^2kU>Bz`##&A&akMZe3i%d26G47yNbq8K|8Ov6LFLP{K;Lar7!k?*rec9o=)1fK#-9A`Iiva}e=&0x++WIxD+VlVEA|CXF) zk;I=`Vh$%}4&EFdr-W{zSP5Ffy0YVh>dQVfLyi+_z-sKtkQ66@{}u@yM=hLVS<*d; zZ~Duuz=|JmmN&&~$HyaE#%Ra;VlRl-k`}Ns^I5ERyc#PvXV5R;cS+EWcgH>ryI47! 
z6r~*(D?UHspM#!+UoH61GN(_{F;q@N$GKq3H50U?caWDUBi5gykYb@N8yLP!BKvpL zj`L(*&H!!^GK#!7>=J&lu+}q9JI+@%+CvK|!yzv}w_V>78TwdLfk!$?J*wv>T!%nh` zLyljN5dY)*muKqb4z-wiRIhZ;#6LJDjQrX8Qj_?bWR#@d;vYnBXnxaB^m_vVx#cXL z<&Ci^gVXxLSIhj_Oy+)7>V1GJ=0fN1D%gxSzlec#$ls7XRh(!C7CAKZlL~q^WH3Bz_`o zA-^e+#*coniy4P6|2`wgS?|NJsKwTRh#hR%C-?=Nvn34D;Guxv;GW~(o>R>v5zFa_-_uMo%P*}!3k{H1h`P6;xK++_=G_^7SFj^ z0yvQ3#eXIE?5sDZ|1Ol^$FT^gM)Hc42Fb66QQ8r}peFa4Q*IAFJL`SS{FWWtujDo- z_tX#kFWD$1@_o6RD$SPN5B$#6KJIsV>KUoYc1&9v}BKec*GQ#sQ+ugz5p%Wy1iPE->x82(Tj zm-Vb*TDEwsY7_d`YA>PxRHl}?uBEA(`f*v$)+K#7y6xauo`#2%QjeGQtWvVHZW?m7 zeP|i=*Rr0~ZKaAzI{wf1tQ{Fw+oyywGGG3c)9tIPWrR4#m)Gr`mNoyU_zJqcp!(FF z8L1Ah>*=Dd_V;xEq^hT^-VZdr_CHM-s@BHjtJ-UNwsaMH%0*otWU69tQAb3XTB!G1 zn_Sdu{+>P{xc`}~X4UcZQM02=K5Fj{ri!ZU>)B29Q%nOz5l<24uI3lj^(n%4P}%G2 zSyBD44Oyt3s$9vlL#|xabFe`e-Bm3FPfn02O8qdzR93w&19hy9XIVA71D~#h2$~f= zd#iFALatQg1iAud>J?wAWs9Z?qUvA;Le2~VRB8nMNxrx<0{&tOh&B!WZ}x+uzy4qQ z;r|aD|L^uy@o;dA371bQdb*Tzj7`R;82;k4(Mp_J)WCY4Ul^Assz228tXYSPgx!ZD zL0m~hxSh?9sm18KrZOBDe&XP0J6Ww(-?LiDs<^i^gSzslMo=|1y1wTe8#>k(5eGfY)N=FIqZX7njbOLsW$|A-#xX6By&!PtL599Nb^^Xb5Lq zQq8|`c60$xoBQgVO9B<5a~5!K>YPPe*29@{4&MEB&cf~vXJMxe0^h+|?;puv=a&TQ zP9NFtuBok~@uI*6fJ9+YV1N!J>=uB8T_o3K6OH#{^H;An^sLZsS5sL^C0sO_UFi4| z5VdF%b)5$_38VfSNe$R<7uY=1U`EZJ)W$kGtv1=Y4XRdS6|ZS{B~c%-~7 zN&EvH64zp>4)hUnHovJoOwVaUjyrQv(WSN7TGFJU_=xurzE0#f#jfW|+BJiegkQ8r z_(S#fa1E0ZoiLm8Zv=p86Wwb!NQVwdK9VdGRME{Or8pI6M}cN~fqH_V9@m>f$xeVR ziU1wE%NLB!P8gwJwD=Q4vXk2hzmfV5mhE+x5SF^?pf^!=a{nie&-z6@BhB7;G^9Z> zVf`w`&oE0X%qFP|dBpmNJf#oQ9rBoHh+MH}US9P?yaa*nq7KK0BcZi^6gDabrH>ioo<0E)qH zr$~eoR*)0H`w$GrCkTrCX!6^$|6el9CRrgsFcbmddKY>D+(8H3i+#IJkeR9-m*l#> zl%og7K`@dyK~ThnP#_GyAV{o(pdU4?PFO8%P7II1FdC;|C;-|45J6&b7s@0@;THsj zKSC!c&dMEiP&bYwQ(Z& z6oRq%1VI513qT}!#RN5S6YvXy!rwtB=t)exj(H5?J9L80sc*+8wJZNeJF$;vz_1yC zf}sd(Pl0Icf*`>~T}Y7tC$K;#f@dMv0-qo#@@4WlcfBMiliM7>ASnFGWcI)Kv#!i4z1xTs#Hh@C$n#A1SWl0)zdg2ErE6BGs7>!97RZ_o*H z7PRA%<`~Qlf;ZhwQYdkPAaORS1_e43AOOTEQ*@ypMQS*KHFP341A>0|1VK@#Ciz{- 
zDJCeB>yKX$6#gikAO)qGI%YrY8yy7M|E+X}UdbID44=ZVHi3en2&_ec?j#F>#5xFG zxnhuNIe|5DBKRQ$>*5mxMSgAady-d7P=a6`{DNRVaU&P4GZdH5`a0-v?3?s1;24AuZ0Lm5*ookL2sXec2-*ZdJpkSyv6!Jo za$^DnL*b9n35q#DBOP=C_RTuMAf2G8W}1UwMF=(}P7oAv4Z!C=2SH+s1|cmXPU zf~6eeAy*J2wrCJSu(}ggGbe&uAy`f9f5A`yv;<%niNy>xk_{vag2ErK6GUePr)WmA z8v9o|L6+O>!u4>RpQ8u6LC~ZV6mhMsh5f4^kYRAUdEoo5mBChkAG`!u#mq;~n8u z372~BH%JfkkPt%p>LKs`8RAFCV?89Ckp6l|B}Yhqfuen9kRA~t&QqQ&wRK{=i807b zUDCMEc!Ez15#d|en}uNP#X}@pGf2PbxvlUU=~yutYOjygIklIylhr=2aU9+(X_9^? zl%4J)LQB0VNz(`wlRLttFunbLe3se#zTqe4ouJ>8W;RIVc{(18k#LeGxd|at~gs(HhbrvAv%2kKWzSAHT zIOVoM)>(dfeQ`T6 zAD<7!8#+Z1*Bs1D?1Cb(f}*7DFi3gD*H00&te6+hr;xg(lM=ob0DOX7Cnfnoy+HeX)x7eFdY}kdTtoG3-#Q3NQvBIPPwg}avPC*(ka&#PI9(WvLDGO^yHv& zl5||ZJjLlE2B5opAmoLF#969bN8X~i|@*C$)STTC*#|ok)D3epd*Qb2?EJplPlvtsuSV1$4;2^%Ue&1Ztx-UqfSJ` zjRR*TenCXsB8eXH1M8v_R+RJo^Kgh<)`@5piR{S&Lr~A=W`e%QZ+EF+7`=d_+*>%PwIp1tNQNBEt7J068QU6S+e2 zZk@;l2Jb`*b2AE)ARwIuT+85u4N-u)R)T5l%#+Ao86~L;$=6 zz$Pamc5=Vfb8q1PUMG^Q6KVXZmt!t94k8D1A|h@uI9nV<*#87Oy5!0hgS6iXtb-Gg zM2LK^6A=JI0NCn8M1sgcJ@+C0Lpl-eJne)@H{9wth|GYCH4=?DqURRk=XY!!bFNN0k#|qDaS)jgk@dvc z#PKTvM*_4HAVGv+2NAxOD&;tVS)GXVhR8;ph{zvBzD8a#5slmpdhWaUck4tt>qM%( z;UkFHq?^ohHxnp`h`=O(b^|1c5bPk*8?a4IV8KpA+#!;y6A}5z9#Gy5uQDzS4<^z)k>ts}mvELBtQRtxjMePDFelvRx-4@;j5i--(D! 
z?lwL5Bm8>>5$cP(e>*}tT-)Lx;?BPLS|=g`y8?6&zvvNSi^d2)2(rToteq2)iV*om zCnEB@k$;H1;$0&_WT&1R_XPp_^e*Y96FKzmR0olY5Lu)X5pg{LI-(OHwrGq%B%2si zA6addVyYl+$)v@YsOJ;q!;K{;9WkpO!zSc0Hs^Z;an~zWLVIs8NMGu_M7dC>Hu=$} zrTFLrAK~i>l@sI?w@HJ>lA>Ij&ng1G)!WovZ`1L~gB@+EN1K)tD2AJe>jls$fJEmK zTQuANTc#5cG3}j*)Pu-BbRxp{I{8015wUNw8Ke~ih}Or``ZT@u;|cmtt@oq#tMsC| z0Dq@9poiXo^(Tw&3xjCDO5#KVL|ktOoC8}lfLN)a7*c+;V2u-)?9_q)TCf_Q=t=?5 z2Y?IY6nCXe?m9hp5B~i+k)Aq{7wdc-JsAa&wK@?I*B6{0!LbP@1Q(4{zVJBL2`tQs zNDxFm)QJdyegIr{A|gTLBR%&h{{N4$^MH@4dcOGGfJhUg(xfgO1Qbz`rfd-DARr9WLH`uUK)L_Ga4H2=T*cVa6h6Pb<7!W)5-v8h8-fVKSZ20~8IBqg`&Y5%P zmUrK_yktT~nUI>to)icfj*ulLgt)ky1kp~2B!u8_o+9LWkF2{VB!-adOo$Rq2LFjC zBnh^Fb`9xJhV*nBQYbPEX}|DAEY0ZX{SkMA0Y?IyVL;7;ea(gB+K9UmoGbWfYPp6r zmKjAZ6lWO4IR*Fiu39&^PfXk%aW`8CUpVQ`w2-+aAu&R3vXJS7%(jquK}b#AC%P38 zcdLb*MaVgW3I_J6+BhEELl#EdEwt&dP^%VO8zZ4ru7liWPL#LNrs<@%=W?CfgTHt> z-4t=l4Vy!IIjUBb_R%(q*^~Pp7vuMeJYA7bTorf}S8E(kljs7WvpK=KgLKJdd?dvq zyTC@VJ8iW&++SouL{J-o@9|4Wa2^M{z_0^oKZt>!fjfF<>ULD%G7o$L;GY99PfhZeEQq*EJ!~frdok5zSZmV7|+FRBx=dPwr z!zg?Q@PFcut9-q&T)#dU<}Z}1XAY#=1}hKRey z<67kj8IF)MzYB$ckGoC~{0lH8BnfyH0h)|wkYJGs$u%MG{&0_O{@i-z!))SI5yjPk z$3;RR#Ohv;rw$|TERSo23F%JTLhkR*`7RzR5kXxDvLHxJs084%4Lpzb>fVIv!Gt=G zHccqC4QE`-gXQ;NKDnh4cdm!+?9GXe%!#=vY#i{3fO%hP0M>*mToQ3}Ja88e+!gS= z6c}tzuoXRQ672bgty$Z-9k!|FS@k+J$qvr$tE!Z9^N9-%hzh)uh{_OYQ{~5CQ;k7( zfk$?#l~4P^7i_9=+BBhrzY7o_swv(Va=~6`*v}C9z6qho3E-~s*cT;)zmbfRJOXvW zQ{asdRf9-EG^DK+4Oy;7cCsgAGeX9j5aDkEpX~{Wf*ohrW(fV*gp3b_u>Oaiad=RY z^$3}0LKL_PqJvEc!GVzE;)t8zkq!5Re2;2$^9*6n8H~bxa7cfsmp_yp_|)6f?pT@;gG#Fd=GN1HPUo#4V1*-An`j zi_#BG$RraIdF8<12(S<#gH4FyZiA?S2_ZJD2to#VT&H+Kwj<+< z1mGbCJ^`gWO~_;u(r3rZ;nN}rImLu1ZY@NOO$f1JMXu%D8y?q4Pl(Gd=T0>t!r!kO z#HN9gP>4gwFauX6;3E?<#e|Id!UYvshL8~^L~$$OX>LM@4TKaTWVpvQ%o9>XMNXql zr@3qq+zvrYPe>B*NCRI=$=)&{Q%%Te2YeMw)-O1njW!{Q%ZKMM6GCh_G;odbxcZd{ ziMxd;8B3#th+qK(Z9E|fz+()&g!Zi_WSR-Nuy9o%vgxEkx6t0-Z z)qfu2pGBi~v_%Lx!Gwt55(wIPLSlfs8+bYGZ<~Up7th$ z*l>r214S6DCzBy4fKQ*BBY-Q5y6!Zbo7KI!S*%mqqM(cLS~qdp$o$o)mtKD zfC*9D)$nuxY2(KI_ 
zpm(EZg!eSP5ps-#X#5qp6r%1XL@zU{B1Kn6+|eG{U{A<;gdA%^guf1a4^N0gNC(5d zPy2f&WR?kO@NK)GA`^$Bq?42&M1eOz)YF6z90(~yNJo!sh$ke8kS-=f_yX{~Jt0Z3 zoeT0#uG%Di9AO-;vgdI7ao(Wt7Af_lL0=%}kxo_x(?Ke@#kzp^a~SSIaB~eemv-G^ zm4z#u7jfMTcQLqmgE&kGfb@L=i_VX@;|+KP!1|Q>5!qX$}V2|q_t6neK7jidMgEkGL@Lj==10QFd1-EtjI8Ai}UqZl*CSgIQ(iHP6 zVgn%#vhO{z37(Ka2>Ho`2>%-RgeN2l_D93UPbJ`cE6H3FQa>5KOLQ|rela16`v9T^ zCWP2PNMa&yD={*~Or2L!kvRzY&4j4!L+}@SLJB8xTlcGh8xgR_gz%7npVGDo4j&5< z@`njg-0KisYC?z&D}s>UJ+6tKkl_gV%Y+F32Ka@65bCel8U_5PfhQ60g9(8nL9_d- zR@-1`wj<;p6QZ~s5M5zHhz%>kcKEl)HI0VU97!PLKNBMS$8iX*3Y3IG3IYFX;I;&O zV?r)4A-z|;9SB*5kewz(ac{xHNiLYI#0ElQa20u6lRP202>FDz;I>W#TOnBF3CRV# z%fR>4ac-9hnQuZmTokNd_XWqoPfdv89)xF!2_ZHd8n||QT&I-?iMxs@`J6^=BqDeS zg6ljXQNW)WcpdGZn2-xi$P+Uz3+Bl02>H^4DDDw>ZZILl211hKdB2FqHOjxO%SOo8 zCPW15AXw%JDFXbJfj7{;+k`NM67ajVeS%jqT}6iG8xx|q$KbiiglM^IA4$NqM_f3v z6*Rh}B(63>zB3_eTMxl4o{&Po-G4G3A~2`K{mzF~i${R5~A@};6{j6n-GG-Ndno29$B6zq$5H;HX*`44}Og&q!8>5!~R42mnP(56SC>w zuLB`dhofYJlpsWbn;=?iLI@6o#2`z0WaB&`F@$V1A;P~1{sB)&F4*U2cLhi0^DfO7 z_>ob|Ud5frWp*c$y0v=h~d_PyWDWP1RaW%g^PiE(QuD|yE6UO zt}I~UB;HP9z$XA+Z3&fZp9#UeRB*sVZtXT3JPG_-gP&6{f1=&m750y~S1e=`A&V_! 
zZb?Y2A8#eGkXH#=Y9aGVLL4ElS;$+2TyG(V1|h+%U149|LqbS7YCgI?FK96jQWvA` zoF!kUO*^mJe$3@PB;L>^PT)NxhP{mT{TO!L+FfFEXv3CC$Hn-)H@9{V5?2Kt#Wfkn z3QgVG5$tj>J!$cPM|Ppjq3$$w0e(IJhqrbP?4uqw$HNW<`?!bg4|bD> zjm815H}GutNZ{^dCjAJ?8Pn#x`@CSnb5Pa%tTL^ZoVb7;sPnAi?6(;1H{817TcbieN zm_Qn{6<7tL9VUd}@Y({|B9CmYC**goElW*^@KwPVc|vl*E-~!swEt%pxhn%9Tw0HN z=8B*s>k)Fj2~l80h<2F}f&(EAvgC*%u++-yRGuK@lFPe>B%O@@uP zA>dCd$<-$0*cr8glh71|+-gD;_eUPL=q7~NaA+XA#mE#h+Y_=GA-9>3AP@LGo{-$m z+}bTSa7Vy@nUHHt$c(=VgNn>SNWKYC+}{v=YeI+(gd{uhZV!*^Y){A=2)W3F2>%cG z?*k#!U$=HefEO6JhZ2~OYfZ=}&%GQ7S%{EJOo-zCgXl*SLTo6cBk%t3xX$r}Y)8mt zCPa7_XKa2Bl$a2}ml}AGl$wx5Cgh2qt`CGPL&)VOL~&X0{Axmo4TL1%TIg||>j^1B z$d$AOw{{{Z2f^>2kV3#$82D8-njcNbViWS`#(x4KUvMXCCv(gb*9f5xB1M zxK1w-!Yg(txt2z4BqG=a!QY;cT)@{DcsuPsnUEzW z9lbe=1V``KF}zE}(=*fSOCf!kPn(J=4PT?7ve%bF(qCZMzjYk=)jD&%btZRe_<{45 zbS6%qE)xX0Zy>5_ogp@yXOLZ3Lgvo!gsehHo(U2DTkzF9A-Q04_wWWD^?nDvztMW1 zx`8OG_m19=x1<$WNWWPJZmwx0EN5DbW0b;}X8AruLk8GCLg9qrr z1nYqaetGv};IrGo&}$kU|mMqU}ATy}}u-SJ-D7@GF3~8ZdQl zw>L1v6&}g=fq-)b8?NK6n1hVsdnj%*igOAU&$j!!qGl0yvW5Ig$O;RYTM|-8$Pf$p zn~>WrWL`-~Zj*>R#X_=N25P(0n=-DrdP`D>3H7vuLS+~18Lj!hw3ajnL264qCa=njQo zmxC47;XNcCS?U8V-D&I1f$}sHB7(ypXlX)%<2cxnhFwm3lYS-lcf&c3kETt>akY)+ zRD2lN;`=+j(P)$h?%^G&bE&{s4}22fHUT)izbibHw~Tn$6Fuw%usI%f5ZJ>#Y%&fw zX5f2Jc8itoCM(~GtHRr=j{HfP6Npp!6xVAUZx?|`)gv~zttzU`TSh#t`)nyNX)?A#JXV9M90Y?cCq>Ga-s=15Za2 zLTqqC83GB>gvtfn-@vcZ-rSo|qnJ=9(WVKd zwlR!rSFrpZ+}{;d=Pe>0wx>5IrZ6XtOkv}I2LnFdz%3!j1>DX9_wvBg0UwnDgB=1k z=3x_H+Z*;>+Lv38Z>4&vPr_CtM*{nup^P6xuqvS7P2;%nhfEb%PK-V_n+i@Gz}rJS z;t}i-TIaXfuw-{?-L=q?=tGY&+FXRg34)IWr-`c^V-ehg9GbXb<8FlrJV25}9V}5J z_Ny)ch<7VfT zW&0OqSMI3s*RCgJx0RI2b!2jMfjm{2m&P$UA+*Z4SXJIhPb^}Atf^VF94ABSU zKOSE-9>r5HKfIMjylaGC9gOjF78eM`=;tW@1NpIn0G0a{3_QPT^W$QWH}S}yfShL6 z*dwnGxvD_VTr>P|2IBJy9z4Hl!?t&#ck(O>@cN;hRP9KDFB<#ft}~h%?HBMp4J|O& zqTrwNtJaD)r%4j^{Uqn{)UCRD7c`o#`hF|gTyr3v^3;zJlH30R43W`?z< zjrzvaR-HB_R1NutO?NUz4Vg>4{kBws^2G^u?uxm?;Lv!jha*Yfg=rghhHrmjzP6_~ z2Nw*USG8`=YIs^+%nQtqcI=F3mle}&&2ss>Vy=zx==+#9!Xq6=7>|58&{QwD{DP`= 
zXVCZHW2fq4j`FkWl;ByGgDj=KVQCX7 z)%|QsDc>=qtWj|0{Hk?3p*%>L6)5K23Ak4*xepo!s;o5un){g2_Rt}L}6WkcJM z!18kCO@e&&k-BzTqE->o3^{)B2X(r`Y^jV)P%yoYyANAtLhVk)IfarGKo#Fx3dw{$Q31Y-o@#3HG;wNN?&n3Px z@zv1wBb~b`AGtg_wgq=U!8aH3mdUEL*QA4L-#9&NiqU)ue5YgDRaaw8u&iL2S9Cjf z#@vD6YSH)i!Ck{R`El$lNtLl0F}2}V%mK!IEIc#d)s75r+%@KhBDS3$2qrG}Hzlsg zuFD*Dov~pAr|r7oHohz7z6YaSR^QR|0Rvxw_@D6cYw*2IQ3k#_J}sm%JmzBlLb^|1 z7qtp!R?xFbE}!Y}qmk-+m3Ba?p8aB^@->09Rl$vW&!);Q5BSdp*S8++1g?($W^nnM z18;**nkMl$1uLz|msG{&!1*h94dU+<=w^1J8vM+l#oZq?X!O+Z0u|=x;HGLg!h(|ix_Iy(3!|F1(#p4*QRoyTY>N*Vo4LL!vAVB zomnfvw=Rt!SElJ4p@5%&(J=oD<`#w-t- zNg9xx=j>bq>}&k!%|0KJ3vHHYr`M*%z$LXaXp)rn?eD$<$JEaj--)xs8N1e%978LB z`Mx|~aq$g-^facxo&+lCnOHt5Tk&`ik4kVq;Cfu)+EnP+IY=IJi52g}+#c{6!PN+S zm|iv{wjVamt4>N>Rxp4EyF9A%9k^;tb9X3-8`9$_KR3d@HRbZ3f*^Q;uju%ho76W= ztlEarruaFS#J1t_QWN_CVqaOzHvH|Sog_9ZNPLLPe>3LZP-4;@3`aal;>HAQHDWn? zLoDB1YypI6RR~K@?{nz=TYJhaRqr35_v`4rk`2Trf=lgErF&o9<^*rZ zXYf^V@0mCy=!-ZdVDoTS?xR8xDzqJ(D)cBK9-~6d!3@S`sY3cb1YbZ_FitgHBTDpv z^{6fho6w^cfZjI|@{L43F6QFeuDxyYL__mS~c2ip|Bmf+$a8ljy1A(V4IcGW(5 zmA@_KJ_e^=Jw>k`r&pRIL$Nc}tBjrE4vkLjFcJ00*DRmQ++lz=m66IskNL#ubu;_UOh*zo}^bwekyiW znZ2sxcD@&LFPMn>By53*wqTw&5%P^egnS81B8odDRjgHSHrI9e^y)<;tO>X!gohKe z37=;3GQ=&+IGYbBHJ2hzyJNKsz60G$CN8c7gHWdgY%Q&4_rK z3h5*<5<9!h3e|HPsL(6cp$6n@MWS|KUe?W)wq7}N5h3RSOd^U)#P6jmRNv)Lq1TMC z7TCieJd(Iqtyl65flt1Ruw`X-D((^+w#MA+)~hY_>REc#9?)p)oHBdWz~#`ZEheHN z30ouLC@>G22>He#LcWVJiP)=C8vm?d%{O!rdi8)2)&|@L!lP+_*a+nt3ZZ;gVmFrA zE0@TpQxAdDP`yp3HqfbKz)Zl-_c|5aG?iQ+qHe?Xm|JHeq9kmKhz@`rF%j}jK!ki( zVG^-dukdFDooeKAUXQuQLOypS*u&vFmiCU=qZtJFDbMcNJH<#{qHBh^g%Y z+61@^n@K|Ldbjgfp2r7SRRhLy?=xsir^ftV#Cd1Eb>JAVN06`+{wMJ%mwXo$=c4|L zNcMy!Q{%-n3a}7UGHo7rTC#^o_UsbMx$ev$Ss}@u1`90K?O+c4{lXiYTrg*VslsCf zyBCXb@_fof$QL|)r}5V>&(gq&CwSVS9JdW!@J+_{qjeQTld&n-RBRe1;%rRBg}jUD z853~=BBu5WkE7#gn`f~p5rywX+zMi=(bgW-@pk#)OIgQ@hz6Bh4HurpvvT+@$L2zk zR`TF{kPF%EAz8Eom?PopM*K?Sk?&G?v#nl9B za-u$|XCHK>EbXhB!I$msfU7ng5!XS?{1~{Y*nMqv%oKSLXubEk?IBWB*9Y#_bDmA9}VDW 
za>je|OD}AVC04@%63e#;OO(-rD2eY6$)X*>90OTz;@29Fd{@9D-(us*(1ExsdXp~! z0;KWMP0%6Kz7GJf?tYT{<5l!7?+YWg4)a7eBg-<*lY@dekveqJ`=p*3S!y|it4@x- z@Smeo!TPi$Ns=rfSY>KN@Dh4PRf89xuE(!UME;4hOE#?E*+Z_QO&2q@O`@$om@~1* z;cvs^lRRnyKTJ{Iy_h3^7mnuS4{yIXC>DfCqpcvgf}{gzyTKC5cMha*1or}elki4M zsJ3%yQ~VrE33bYO+7doO!XBq_8h-54;9ejS<1c5t8*|IR$JNWm5G*C(Kz=?6yO{

=3AX5Buv1buSRpNeGijQJ>RO42Fh>IQu@JhrD z0(6U!$u|cw`E-9Eva}u-b4BRsZ6TLy0`5Av27?)bE!XQs8Z$LagHXmeHo=oD;$5z0RuLiuzw->?sH zD>%enV&d9>yBfY>U@kT;`7VGKH@Gjam@hU0GUq1!?8>8YmVrod~L=# zQYXWFsF&`GCw0Spxv5iTn3pAr(>`)zX>O>{AWQZ-$mHV`w+KtYs}38r_~id0^lMD z#{!ydWb!S5Og=WMfUGQW`SkRxkj%xH3%9|Q3nmXc2Y+=|m;C3<41Jg2zZ-GP9a;FK zd4ylqV2Po0QF)-%_n0TmeP8ulFs}Z??<^rk) zK@WOVNKeP{^LT8I36TFxxa7MOTV2xA;!Cjv6uupC^MqmiV*;I(2FldaB%t%XM6=T{ znR;3TX1<3xI}H=h)Kj;OH|YVYhSHYIg&XPV1b&`~#qsO%tDA}?8P|QZ_;9*TE00%c zqu?rHI;dUCi)V+v ztBk9x2^(R;jt{vUpV#Iz3fW{poVwjO8hFUQaAMt6<+6GHd1<2~4vA01+W>DlnjRwT zeWZSbox|^%#}jDh*QnYDE#QCri(r%SpKgW!yu;O=5jNEX$fZtMgX%fOv%Jo4QDk9>4E;PJ)9Q5O(J zAsS+;dLPUi_#o0jn^UMe1;6$>`Def+-__V7h_bz|cud#ecUyUno`;#4hVeFlcrexi zMFE}afzC(+CC$XyBIM~VP9d2eH?5zPHU4^9veJ=o)y!S%) zz|oniG8zslHxhq6?U8C&gjddMu*V?HG}c9IF{4AWXg4rjAY*m8Q6hpzzGd*pcdhaG zW1Sl61UzF+RJ<#ot`KRxpNow#GWnN4Cf_3Li84gRAj>gPF)+u$bsq6C5CPW*;5aujf1zg&KtrNPj;@jGOc$J-}KI@K%QST;kh#@eg?MTQkHbh(F5W<5KjH2Y5RJ zpb$WN1L#0^2k|-q-hyq*5TAHG;*Rm+@ABf`DIFgV_VGdhM|*%(9^l>5fFM3e{IOpA zYA=3!hWH}lI}ooue=x&#^I54apYgwiYk4Oc8!fBhK(5kCp>N@h~S z>ER)v5aP}TJ_Yb{z&Z!qhTRKrcWF`KC7h$}afX=)W*8V<^9!*x#DA)UCHVqOJeNas zS3nKveQ!Y5%?^*OcL)gJ11$4uXyx)=kGXDOqhJSteT&s_6*ys6g8LBLd-*kT$AhV% zdSbr3d(O zF90f;OpS@xd>KKn3n{ZM+bgk$h~I;G^CcV_FwG1z70hrjxSNg%AL zfkp#b2}qaXJF#`df14puE|?aEnE_@b7+rSn!X735yRXgJvNzJUmJ{jTW7;@_?~n|>Caxu1Qll8xL^w{{dZm4D!L{6Bfe`MIMRqs`yQKhjU@0Md z%G}k-AY|v}m}_gjcm-@bu)6+#<6-N9tCtD;Ld>y6A z6*mDCj$?}qzWVN9$TXoKb4ODF9aMh9Qp4B?QO&UQ3gl3M8an?8{4U@QfNuo&hX-yB zutf&&&gWyUy7l4%u$wY=qGfT?CUAzGXc%nwTu0o~stkWdG8W~R@|Z3rwX8QH?T8>9 zO8&x9y*UgqtuqK(fuNdTH8i@UN2Y`ajC;VQUT6DH@~Z7G5Xpe(E$##9IhQusn{<7}Wp`|7fMJkrBK|gFwUmw`HeGxXGrWPtPa=K` z@r4$DbUMHGe@E#e05o>G#r}uVufY8PmMT3TFLZn=&GDV0!P6vKUa8V2z&r4&mM%zR z28Y!ut_~L;zdE|Eq*g<`2}I${Y~{+C>XDFaMzzLnE!LT&d-d`azIV^!rxE`K@wZ!i z7vj^as4v#o&qRJq{43##iR;$cyZLBTYFu?S^OfsDRUVSYR7xbBSs{0T{~JrKkgg=} zhNXI#+VOH&A^V%)F<@Ut@SV&*z4tBn;6{263+Z761Iz^Q7J$2mReVl{9wv#eWIfC$ zz70LpFcDfjsr}zIuc~PLQvsSd6(P_(5W%0b!bzz+ 
z&{gp_2E^$m$cOdQj8jlJ%cme<89}ZWIh8k308Y|&fp2}E_?;%VC zI|z_+CT!4?7{Y(>1l*2*duh{iL*IJ>9zZ}p6EKf93HT`xFg<)`bBGBL@sBhxd9o$! zkPAXlzu=RoGZ6JVqLknfOr+_e5)9+d9>-yDi1)W|3aVYo-xc|lHekVLhz>h19F}LC zOAx}p#IeuvztrS2EN`r_GfjhEx@XFPyY=#{XT5;xF);qr(5oRM8fpT{|^Z zh5$YpOW`!da>4xxPG5kUy$E2Zw8TkEoSLNC)c&zViq}(bT6sw;*7&(8yRROvKlI6b*2gu4Ky z=Hv-BpFXuD2f!60iRPn9y&J!B$p0HTxis`Mp(iIl&;iTnbeDt8fL{-$z|X~%sWqTUdvZuO{Y zpt6W>ar)W-3jzEKKzoo9_s@B_(~f9h%BDt=BYD{ddv z(|DiFDQ1ra-b$eU(KD4Rky?V~`MnKs-x1f0o``VH)G*0ww2daqSS%OQu&Fc8+p^Pg zjABWIerrPG2SSj8QcWO@@0&DeyyX9b@sjT)%*HEtx^IM$M-1Z_t~bD`v{y^`IpGo{ zu{(v>^vcE%ur-X2$8G~~Gys)p6Q&-dW-&hoLs1CeEmK^b@$8OZO;k1~w+(+&63c)4 zkkI!s{*iz)Y(~NGvGKUK0|DG|EkQsQ0+N7U#D-^3>=5v-3Amm3yAiN~SjN!3gI_}- z|6GPbzSpra@MIVYmkZZ>L2?(nBMzto$pw1FKi0jQVt=;giiawD;?VyEq+ zmp$m^2PS$Ic~&7>L-GRlzKM>j;e1rf`6f2L4AoIoe`Kof0n-uD!fY{2y68ege`rZG zNv|S@hIcDAp(IE6sx-d`$_ihKxQ|WoeSq#kv4&(bw!_Hep9h(I+ptL`Wd4kIMR08} z!E3>EMzAn%7$y|V`d9LGSOUdK04j4+%KR{8ehL3u*f@g9noJi{=8fQ3UhOmq@6Y8p z2WBFeN<1bp%@wziuVN&QB^_1VDGTtC-$k$&XxNVpLJd=Fw|X95GsI|OT}_WQKel62 z(t4~8Xk24-;6)Rnd9p4yoFh}ygd{ic{ffi^$u4Mmf7N=CmkM(3=TmH(adJx1(bKJ; z!kqw*dHgaPqk8Ua9$z6wJy6Ur^?-lsz{~j60r?wQ2VTRc4rq>DLkBchim=lO@>hm1 zbbwmWgI7Hfk0C-!X)+NKF_)hu;&o5Nsh)@}o`_B+;z1Lkd2=lyH2$ApGvO&Mg8V9D z;fs7#q9@`}L}+D910WGPvPs0lo`~U|h=)88!%W0Fe3~DcS5M}KbLtE@{P~d^Px4iX zMykLO9_eF7D*qZIeH5QAHV@OY6Idx*`T1?^as1lC<=>CHH~HSdQU{d;KTjvQt`9tD z83)HQePbOzVJw~Xkfi3qIx3*0HiKwYAh=lZ_#`n|HWx;EVxPdDnhR%|*r)LU37)lU zTXN0}-iWZ9Gv!m?lpA(Pb5w6c^(#X|#qW}I4k#_XxtI=>Dj6F|f(JK!d$;3J&pM5` zXDxwV_zd3gi(5ZGQ!xJ_J}bY%%RkEUv$Z6Io_cGgPHlMHgN{PtT9}M`NAf31@V}3p zPx^T#;Q}7#V+Y}fBlx^QE)s4Bs{vU@7kV%tTCDG3E5G60*BTKu?9YhExeH5;2(j)= z2)z+;MNGpxJ<$)&2~Pu?5uG{>I9A@Q3gz4##I-chr`Hy$0$M19?sSlCV z>U7umE}O=?yQ~6x;5w0BeMrnl*ga7im^RtFX&B!#Bx64d$o_6wuFI5@jD z;4BUaE_3qn_a)paqY?k=Gs2hyh)v;$3v@zV?8^ct?S35Harb(XriPN@!5*6;E0+OC zElMQ_3((rtI(pdn4(E<{t09dt5)#<6ci8z)!CVCJVoXBVRuelur@()#?2^i*BHovJ 
z{a4^V9{3V~mzIEwd!~x4@e=m;FqaX(Fat)_xZlH^ZJ>;&*!EIiEiTiF~MovHOXHCXDcF@FMLM8CAAcMXXq7B&=h^-`cU%kr(dx>EsawAv?{0Y0=uyG~JbahRl^kTr;fHZ^q zpB}!#)c~N%dIy$vo$`jHjk^r2HlT08_F)o#OVUg6X(E1x@8q7rJSe^Y^EZit$?*!o zsePJ1iLsZj9KOpTi&91<*spJx;Ae2RA}sR;)W$9JFckv~>tCQ%fHH4DZQPX}sEU{9 z9x$0Vpf>Ib4^z#wdy>YSMQxX zcNH8e`*(2krR?9saV4g zI4-%R=7YFO``WrQzv7e@WelZu0A-TeU$F=1b>=Z_>t=lwj=L~}Jj{b&G7n)}H`~LU z>|q`%NBuL8VOw{W2O8#q9!?)a1gCm_4j7uV6Zld`V*PwJehsaj8e7Sd*KT+lfunRW zxP)Jx;4yXNZ(Nc+HVnD&l4F@Nuf zvi{Gtc&-15#Q#L>UlxBC@xl79j6<7Ef;B%EKr0gb31H!raCMZUdyf$8?cJruc8>Kf zo8H|?5^dLo#E-%cv zp(XL!6`#T~ZBF58Y3FKHAC_11+&nk`D_;NAa`_&Te)&AN56J}(Yq@-ZuUm#0&M@w^ zGI#^lUcM}RagXc73|!2=(!fyRk*-#D&Afv<9rqs@dU`9C3hSAQA1El4l zm*G!gEdQXf6@Yz>CE%n!1w9e0hVTcl{TaeP!A`)ZA^Z~GjfmKLBRkT3tpC)7hHpA7 z6&MahdsnOSqq$Yz$#s*hXG-)Yq)MbNY(i+}vLERtd6>67%!^=xs%!t(wwMy3@tx{{ z-t|DQ0m@wVBi$4a^S+1K0wzP*srr%bbTGXCz_runFr2b$V`vLNhl8iFOiL&%dwVwx zu&{a=_B616gYEu5uwg}yaHV*gUHA^23Wukp3bP%(W>KYa!84$#0? z>i#ba=P4f9Q5neiO3wgT`dS`tVx@gYx}oYJ9hBHMOiGE(qWm8fm!D@KOPBvBcd7?E z!UOFnv;0T7VIHQvhxxe7@*m|!1Q=e8kh)_$P*IuXKgtaU)QTSITfVNC!YhjOXg_+i z6HMl-V>@@6VakEI5{&kY3fL}Tbp6-ZY{5$If{t_}0o7qlH8(y@yJrKN)I=4objgo$ zqb*qll3hhI&G!AVFUt_f{A=$<8K@GVs{v_@Dq>%i0n&sT3r6c-PoJ-v%1tQo&k#Qg zy98lr2O961f3#b{C4WqakGHE0!MzaZde64>{~$QVZGa%h1n7D9wcs@jpMyW^e+uZ} zc7l%uc&@)u2p)t$WAerSi{MzEgNwNnOn{ysegwSA{5AOV{-=PBt`Yd|!c%|O5`xDd zP{}J{d;VV}om>tiCqkmN-v*8gW`uJ|&)=`Yph|K=2~g*TW-Ay)m&g zutPDh1b0jZ-_foYT$;cVwjiUhBZP|wuQrsB9qxJpY|Ii9u<@B;MJlT@^rr@u`3d_O z`vvW|M}Kuzy|>C4!BK!zIl|E1F(Mq>S%rkmitV^4Iqxa-zn~-{4k~( zasBZhX!ryA@eUe%2V&*nKAiXgh`5r+|BR1V)@sC^1TJ|0rw9i@C{Q*wkcMm{oYF1y z9fV(kt`@j_`0izu`Yn1jn94QHo)h*eLCJTYGUiA)RM`P=x1?S8gRuzM#rT(ER5o5* z+N}`1M$Jb*QK8e@{Oo@P?i%*n@`*^2*A!i??bnZff8FR0_y2I)=yoL0>>o{%_sL*j1+NdGBUv8I zv0$in_Nj<$O|@0!>k(Ff`SX|8RuYpgpzmXgj^R=sunZJ^Je43!KB=bal11{4(l|8l&IZ z|E*(2clNlx_qZy;QPsGD4?#gz+>r4MSiW#197HdY;AYTt=H z9&u3uH6Lb?WJrD(b0~35@N3_ZKf50CXXw4RIfguSFf@?_nA7sq#`%JFrq0OtVCC(1}`{I3-R^g 
zs0H?Z{10M-Sl_7~o!83}@eNd<`I6Sbvn`Pr-yrf>%(Z|>Gc+5by61)ybS2M6uc=ITTY*CGs9jd>bB(xmKaDXh$&TA*_Lkx?LB+I|)ov6<)#tN7lz& zOG|hh;Kn2rxDi&jjD$H4^I-`~*b21p2`RF}mC{DQIv6wr|5*`jXN=-BxW*^S&YdIJbfvhk($?2|}s|i@WE=Lz1&JV#YAUG7idM*D)2#_xaYgJ~iV=lrZuWiD=MtDub z9SyF9gvUrE0aBi1($ytc6I62{mtoaBI09=-a1~-w6I6d-wGOeafcyG}@%sGOfLon% zggnEJt~uQhh~7DR4!*YhT#p#~6|Ii`q4~U^rWkznNhjZdlS5xO{5%a3d{pL5u;NLj zev01f9 zP@Ixa1;Ou+Ps1bMAqX3L?K4&TL6-|tMH4r1;$rxhd!DRUiV$A_xOyXYNoDDEFIiF@8Xg;?gF7;k} zP0q^1Fju3Rz^5(XfBYlN9S{m}l}P$XemFmlq@gN)Trv4Sp3c=l&VkqwkY(P3db*wW z$J~JcwKd#EWlp9t-NC3buaj&m=Fb<#+Z(1{E{DogGyJ{ahk@@Qd>n#p1nXE?HbHOK z2!bDs;4uhBK+w|!a0Uu4ab@HCxCr>~4ZjimDDb_($JGO=E!wAmzHY+P0UU0`?b6W`A@0{Uc9H~p->=3S1AFgT zj6ahnxF5O$;y;X7jcbhfY$I+>-0?+ z{~Mq70r?JM>B%<~OYH-3-5F>}m0Ei2)BgcvJI?-r%gO4Q;pO8aVw+&#lU%_{U2sER z_rRz5R1$ZB`xh=zGztf9!f*bTIrHUhY*JHB?;KPo#@CR?*YNG_ z5P*%XL*XkZ6u~LjkzUVICCv)<{Q)kIo_%JbngYJl6IBsWUz#Yj-DRR?nW#31>2IRq ztlh+4t9j~U_d}heDoilYtymRvUqGT6T_2Kr^TP{MS4h6bueot0f|uFcNC~7lNpUmG zgA+5%m*eB^D^sc@%TTHW=U@Z(E?+pQC|e#?`vx44*prBOjj9QED%QTts-5I=sM;RO z*9`Eg*^Iw{b1b1>8trN&6&d96sK|GKRgngO*W`y+$K%0%YozkuL?`5%i=DKOPVBrl z##dxmFE-GN*Xf0Tr(s8z*^9w$=h~Qi16)&jUyHl(Cx`G_E|}5S;o$VXo(C z#DFP12YO}j^>*7VvAF6?XZ`6QJO&${o|xtMHo;n2%?Z}=N!2+YQ(4O%sv&OY-F&S) zSPfM}u=nMMm+KS2zGIy7-=guCa~?JV(zNwjJmGQ}tM>qciM>e0-lSr}N*UxP*fl5+{KqZJ{xr1O zLuPnvTh06E4bvUW1BSWSFd2_+@p~ffbwFALx+7VeaeM6<%=pqEv6+IMiD>>S-qv$DFFr_Y@DfQkf;Z%|tr9p0d%$`j|7v<6-z8Y@GI}1P6Av5a z1TYU7=2F9C>Uq(shh=cZw2rKydgb{z?t65E0+-YY}GyV)~{J&M>2 zSG?dBULRR|#BfJ|+Y0VvFlS=N1UUBJv`b|G)Q7MhGhjo28v*Lrd=}Py9|@la_b51B zc-qjZNAts@+%RyDMV-T|0a5}mPm)4PD+)i*StOGs3J(nLIbVhQ`) zJ>E6&-f6sQdn#_kXB)9LmP?HIbBG%(=2rRIPQl%4E+?WptS5&9UY{SH6-R-+8^7j{ z{7+CB`E&qHz*Sa7BUC1SmjOEfeA$3HVhLDQNgDyY2ONjXb~=D=$PEvl!@yLc1D{|U zu;^`kSw3-8HLAE#3vluedUr4WYK)vak;EJG!!zJmT2|AbvdjM{)sgR9>@>*J${uXH z(Y1UzzG3=+dB!lhTNWm*q5-Ce{bUW8{m7p!Wqs#|s_qX}xg#KYJU=|&7^GtGs(-Zjk|x1^BArbdVITtfATf zZmD&u9k?g*!{gp~a7*yVr9=(SKq#jUkyr0SNFxZZHz6ItZ84nAg2I(0WCghEOvsVo 
zp3DzUS$vH!zyc#wLlQ#y;@CC&5R$(-=58=tCva~VPDe1|$`Y~@+>KwATgv2f?Q63C zK9wIHl_mpR2BCH|9iwi{IL;$7jOl6aW)spLAAxF_ar(*2)?pfkX? z45;ImfMxX{haTJtPCe)e?)m)i8b1}>E%>#1q{FQBA> zbZQYOC*YD#XNU7L3y8a1$PxzX2k1EiU233A0vwo& zOu$iKp3V=?i8H_~z^{tR|2!3yPlwMN5U}^^V*GUp&EfEQi2=I+eA|FJL<(5eP&8r; zF9xT*zCXBa;B*aHk5%W=dl_vF5u>(uXwxPAYOF7u7ht#P%H>}+vtqhzj&>UmbSYqM zu{!a*mdiI51Dr#ACCF1-Z0^kwcR6v|Vih;q+hP~ukCJ>S$#>T0C2mVT0FDbSZ@l#N zdZ4=!5Q4HrHqUm&D@;gmghGgpPjl0RxSJyGY7?TkF`kgCj8py_jq}=&Gu}?es}~{F z(dzs((!yIJ?iwRi+*ptF%&)_;kD_-QnKM)%DEk9kVrS9Th@PwMdD?W@z8pIa{)@3Y zG5+JzP;r;o)RQ5&yngruF_+lcMm!zxX&&(f5T9klY8yqHF56dN-C(;EyW5DrBw~rb~5n`Fkdw8Bxh*mv7y9YhFNFh#RtZ$VCIi?5qdk%5)K3W z2?=!|5qK29g&7kLapzmYlfZmPLS0c8VWWw^wj^O%zId+NaCgkjw}htx-c3SXRs|jl za8bsDL)`_Ia1fXsB-E92F_ugG;xZHFxqR})E#Gk6|9?tCT|NaK2XINogu~p0mT)kb zA`)uuEX5`fzqHJR<6I*W=2^lKU_T?Du9CvWrvSVzW5VGs7f>U{Z~~ysOcq^8mSATP ze0)MB@(O(nhnOPl0z7moXQx8kmt8)qs<0{$G8x)KU}I>2Sv z-V<%4n`k1>29t|OUAV5pW)XiAmOi12?uxhxhMA)4|I2`Mg}NR)2f)o(de4%8CL3rD ziN=A^1?mRuT;gxZkSG_-BrsY9Cxh8dHFP0jqvXAJe3lO@WB;Gva_)+`seq$^PXqh~ z)zHOA;F$oIV|!O)l$!!Z2UYESxr0-?T=}c z8x6tMbHZ(PC2gy?noXmv34+x2Hf`EcmtjvslEkVISHZZ7?&OaS7}pHqKk&HLz%@PO zipSLWA&uHbMfMCt&!v$$N_mDQ&m;aF;ax3cNe8;YmiSDFc92-Rr4ql8L1-a~2NBzU#8ek1XE$xZL^ z-30DMEXYOv;V}#Sam5g#t^CDM2=5N0B(EZ3O9mnND`M_c1HTWr zJ78_go4sDV4(^Q%*hXN7StoXY-I1{q@$ge==emi+nGPwK{>U#ZNw+F{`_SVWdfaMs&@g{l~MJ$3}qf(YHe$m z8)>4&(-YC!?_c*sZzp&By$q`JQ9asJp98oE)!I^}`hDWlmuC#oqfGP^;>Qmzz42)G z1XjH9XlJyQX3zfy%Wu3JYe}a<@;*tmC36IHAA#O$gSjK(##sDZ;&&0R?O5?UGKh4D z%rTLl5&wBmS=L|au^(A#IEn!I!=sHT28q_UNP1INU7+5>QtSI;lJCM&d&w$#9*J>} zK~+xx%T6A<3TzhGWx9sn>VZE2xEo6)T?aUci0-C*F~AtgQ|;Qe6KNs~`8}1KiD#SJ z1b3n(7e{%LYZHqv_mY23;xE#Y=R(%Ql79v8>x>g}R4GBxG(j+$>>305RaJ|#2Rz!Oh z^1IX(%b~KjB@&<_i8PrCy+q%Ww-SY;x|$3;<;w2tZFZ+yh`!eyDyW@kiosM$hEGU@ASvw+8zep0Si8 z7t=(Q*ypiA#I^&YO05RdMSsRIRjLG~I?9x(u`9wv$gO@r_y?q^S-%dl7-a2D*jnOW z%nz%C+Z|)Vy;qdW_itA#@ z<=5A5$j7nI{gx>?x#LM{+}V;p4yZfH1>#xN0F<6QN%G??iN<{qNi;+cVt=M3(aKVF zbLn7L!)QnrgHdH4<_{dXzsmqc0d+HwO1T7(DzXmyw+u`XGx>Nhu3&Wi@(tsS_^|;` 
zpfKOu^HajP`8&|rIOwJ;&t4NHhA9|TOzr~y$3HT-CirO%-mBJYe&}oL@oj`pQ|d3{ zn+ryK%}V&Jj!a!WQ_nfHYc5>7RREQq({(DpWZ%$ztLn~CdBs`mb!3$0VqPxQ>*+1elKEr9&Ut5a3kPj+PFusgN#r9 zCh*Dk1eW59e+fZW8et)X*+%#rz>)dklzJ54Ax0>FGYI8-(j)u|zH(`N+B^>i*PM=R z0r&OLuv1Uz+E#(!^l#aLBms|G{=tOc%jTj8s$qhjhu|~>X%0M&RX0KMw?L45Ni5aF zui@D*O;8crf2}|eAOF>f`zI8sEKg(mXBHH^UpdCgu4z5dIrlGc>isiV#eLukxjr2V zF4)z%$4z|(akXCCsBH}@&FJ%3Pbfxpf(N(x{KdidD_C@}xXTpT#PX z_y8>RO+MB5`Jj-4?_z<7>eVr)UNnGAlSs$Pa;Jpj{0i}v ziOn#Ff+5q{_1bi`Q)4wH`obYm(P=_y|_BLFG;y z>#yfe`e%3eI<+6qb2qL$q)|%y8$uiQ<{E;6Em&<-T}+Q9?puUU-_GkF(H}|) z-(TaO0yp-+hk{Cd(*odPf$CWWFJUteQwL1yn-+YGdS1!HwDd4_HU6n@TJS-tL`6Uv zQ$011$C$oAuQp+?l|823C-^71D&RGQ--B=2vv>%LJCjBAl)u#2@wYQejb}Inli97F zANbAgfv&Rk`lt7&8Y^KwsgEQPj){TUeA3@2CwHmUdG<~ zANa7c@hPs>=NAwB@{WP-dx$h%KR|R|hVkOr%~C@{j$~E_Upag~Stb5}>tH6FI`kIS zjLvwg!o8~T7ZKXsj~=Lo2WkT-^Ds?zzj&D19;Pjr%&TFt``N?9>w2KW0c9Si$?i8m zs?LuZCNPASnz-zfd-)<6d|CK@E#Zp0X&K1Mr2)gSn(AupY&!6h9s@HZn(F?rUj2!{ z{ovFb*oqy6(o!WauJ{!9``5g;CCct4P(=@PG@#5SpW^=V5>@dq$AHOP@+t054^z#< zbkHY{Gnagd`v;J&_qy7B!ev)iy0@|FVD@#r&*g&jH(0GIUBITXDs=IB4SR=Gq04qn zOsgvHU6H(HHP!tGfyVnM2pVQs4O#p>UN}n4fpX^4PO6I<`I5S*{cFAd3$EXLg+jK` zc&y%Al6h>Vx}6^AFAvm_teJ;qsw?s^|9Y5iU@{L4_kW*wpa^55GD%$wQ0AeT>UM!? 
zMaX~j=$a|v74KazIbfFCRc}jL6+Q){A<G&i-kx)AdRUu zqK%WoF)dx*aCJuWJnU`nCjjzOf@h|!7w-CZWpb?$^=bwTLH0b zf;Z7UfiNwzp2T0ys?lN8|DK`mIB5RC4gR)O&4Z8fXT{XH(_Q4on0w23>XQ6vcr=Vd zbp9%X$op*Kbk_)iZ6SeEg=VpSRh{>-PZ6Rmc{rBopb{?M8Lrm57Y}&vjsfm%cr?_% zz;jVN!;)ir$gqC*_GJ_JmbHwu_J~FG}Cs?9gfV~^bGd`m_ylrvNaA*PR_LDhGWX6;we7U?Ia<* z!MF3S;xQ}OWX^IM@IP-tsw3ojg!~_4=K&v8^}PMF3B4KV3c@DzqN1P_+Xhi-Dk3&S zO{DkU%Wi0bBBH3+&;$fQ1Z>#R#GeHfdsoB+5LE1lilDyV=ial)$>uKa-;cv4bLN?O zW=_3#cW>0Kdyt9Ah(sy*>o$9Roa{BS*D`t)fV}^2`6ikig1%<)ZNR?)UMp(<{}X<^O~q&*PtQpsCdn2a87g z-4HLiA7oOga{Ic80lIPKO~=+ijWiY5jtm$|^tGz^zKZk4* z8BRuKBD0X$$Q(pR`WV9h-WX}jYbeG;v0mxddpkZ`^Lr*f(^!c1YX9AbqThC`gx|zU%Z6ZRKL2e7(z2 zZ_S{d56$nnzT)}^dp?O%4aZ?P{b;OD09^5tp=2*ot}dlm$9D2Djc~{>hw^Eh(V3nz zZt9s7+BPA>%B7m8Xlp|S6zF`~MEqxD5#Db?6y$dZ*QvvMd_%8v9xFE4JnLY)JJ`h< zdk(mmwHQ9*fO3 zc%|pYJv}#9++42Lb2!s*8jiDTbNp<7A>*y+RdVT9*^=Y1m^_P#R}EK};^;~UPl=ir zaHf*WS<$4Dk`Lep`b)GmpyX=XMVo^EiYy_kWyo@b|Mm3g*a9Y9)qnPSW%tL)R8#NX zS9+}6)8m1P50vZqqT_2%u`l}nW<1IiT6-z?r4~;pnOb*0*kcub z!c=N1e*R#2k&gq2rsrJ<|LalNv4@#WsZ)&r~QC$qc=_&ahpsQB_y zcpKl-!A|2rcGume@U$+eYn7Z{jQjGBN zhrrwis4B13kv{sHKk=*gzj35LCpGZorNxgG%eysVHsDADv;mHqOz{WkDjdJZ0DYLY znh;f6Q`!{ZZ)7Dw?njhqMPxk^itv3<{B87!h4)g$+w?S=e*uWqF}xm&xQ{$xMU0&_ zlWW=W-JNIbOfv<`BIXh6s2RcY@MBo5?prawQT2LJaw+}9+#etTh;bypHJ#XBg^G9eh>DTOHd=dPWUBWE$e`Av$T3 z9mKSIf95x`Fl6p??C*5!*ZcPVpHz$kvH(2b0Pb=C4+jK@VSm44e~)ATNPvAV_Bq(s zfMA#*^+683t6Bda5~HCmnW^ExC|?&Qvs@g-pTMBG&;&{GI%JMXXPV6eM_tn@fqRh- z!Q4AP$qu*qe51k4#fD-=WOIV`eH^}@9Dp?kPY;m6YxrjV@C?RU%UXkI&RS+LY=Gop z8?=_?L1I1_P4)X>5~GK!V}IX@f6Df7)D+OeH(2wBF`tL|F*3(}M+kRD+-XyKgv3l9 z_=Oh#6!?YUkG1$Sz@AkaAN8JZU$TJYm>e7|#({dFIR*JJ$6wo0aIy%!Izjq_6iT{u zWHsc1J7KN~v3}iT3B3%V#Sqe`Zj{hDJOj*o2uSjb=9z2=-Dn-WgM%eFsACB9x|wDP&NN))tdXrkBM5CJBX_hEvP;}z ziGKm{TOeM~62Ab_J&sP162bXo?m^%tV07xHiDXKG0-%e@@xR*u+KM&ak+g2F8#Vh~o3MKaa4czcvyQTO607dg7 zW(Iim>t_(%O}4_fLN)-3Ps=#T6mM-@Wb&xe>~A?(iq!=KKIpJ&2&q|?l-mBVq-mN^XHIhw#{@WnSMj2!nHv*Ce8FllS8IiHeh#%{AJ_yRyDmAvc*EvKVNy 
z$OOEdEVO#pK(4n=rT-qM(&>@pZyqIyOp+LElh{KN(S2kg;EBl7NwZjDvdLnQ4cLQ# zf7^f;*?`HDxY=Zq#88_=SFmrBg!Ys)&Ho`Zpi6rWHctkQ@>sx#&I!=XGs7)G1^EGj z1TYsPpCqja^S@wT#0&$d5&S(QzM~=no{T(`v?5DQCKVZJ1NJ1~5gSk|ssbjj$So#B zMMi+Fq4obC83i52yB$kr?t)@G0G;$Y-UKh z2lzd*(EeT*xeWdB42SKIc`k5p&JP%f3rs&tM*e;yd(RjCc9qWF{GSrXs}Iiy{65+1kWe4F z(mIv?ce0S~8f1537ETfcIT3TUO(F%{uOuPd8Aw6dB;wz}ywilpqQ4Dz5x{a}A)v0a z6fk)fD@@L8zBFW$xDMyp={h4XTjThxhQb>{<~)owKC;MaH>K5KvoX>Q zy|R`59kP|~Y9t>YLDkE}*98_c2+SuFSpNkYU_osN;*`i_@OuQ%`GBxb)5X`z|MyE>lwUZf}evNfVJD#xsWt4h+pN0p?@LS9joH2)I^nZE0Mp#=>F^qB=^TToDy za_Q6FU^IRvlinD*RHxo^kyi;+l{oP(&F4DzVoNr9JpsOftxl)Sk&A3#=|3Q_bk`%V zi){eeECQcmF+;$7VKDIaU50-uJl`t)LqKcPxG)wV`Dpx2@z zeR&otRAv(r=HS|qHl3!`R)IFz>oolz{~b<97xh9V{wS`aa( zTjy8cppN6bDb71v=W45Ko$FLA=Q_BSv(AevfzNDwb^@nF`hfeCM099vi$u?`4%JWz zhf?Y!Y;csows@n;;}F@!;;sZ&-{N$l6)qW(Oo*IiiChBivmEBX4xT3=XVQQ!O{db8 zj0Y0(E~QKQb4h@1X@G8-ImePykSdUqf1Qjgl}U>J_X4O)WOcR$UIp-Y3)Bf$z+_pa zlU2ISsxP?DDX-3s?U1nbCw(>iNv8v8(p^RTcMCXJ~$jRty<4^j*V|?8e=zpTjgzzN37d*o6(Fg~RlN`{H z1dxB5yJQ0qC=1N74rXWqCX0Xy<_3y^Npmp6d`#5D*=fT_1myy%1*itjCXi7fnd*?z z1*vJBNuP}~>8?b6Ns?*dT)yk%V6OKt^nV-_l&PKqQriIyOb8T|X&6jB2Qw%EQz}v7 z`!~Y@)df_8-!G>OUy-Sf%x5CUq1PCZzBXe-IvvAn9c6G8)AOW>`BTakuk?xSh>gJS9U0s*h@k>?nRJPo&ObB4PSC{LFet` zFL0Jm0i<)A+IG+;fUXVxzTQN0PiCc9Y(t%6|7U~cj8kz z9mW4FOV(r%BcD@sWiU}?I112S3(`?spadcwC<{;}OJEA!Q3L@UFuNgD(QDz6{u)NS zbk`w&;|up?rgnrwZWZ1Ci-WFB00m9QEI`$rK-VQ;vIwYP@iC#rDh8&SgUKp|Vg8kx zj{pS91#}D`_2F1NHe>qW+k41YO9o{P+8UFs+M3g*^W7k1JK3hveJ3OQSUj^*A|{`= zPE8T_eSQU7gSO#dzl~Btoq8`oexX6#p@u@bLpo*j51tYRoYx&;NFK15(Q;-n*%lMj zh51a=-zFeN zj(3YW0w$`GP6Tuk0d*i8f$RWuOwxe&nW8xn^Pdek6YOCE>P#r?POxc#0dF<`+5{(o zX-7PcYEf@v_!>~{qybl&JOUQkfU^J}A)tk)FX356~z8phwf~cK|m!0U8I`hq3=2do9~h zx@|uT65PFI`xUm{A8AXcd(`#=ZCB}oMPsqpPoFl&VNb-p2NY(<`@uTA1<*jpVGM^q zTZd{JM4PtpambJOYl*Mk)?X@Hx!$U6()?X-D=*~U!cW2yKQa&SPk^;wk4Jt5+$w-f z7H7owZNTMVhJev_JpuU*OzYACy*(nAQ}26VG|i`g>A)<|*>MW84@@)+D0u07z~s%2 zm=6Jm0nY*aFS9_~vcTT~P6x|N@$B&zvK8h7FpMehEHH~M(>Y)w(gyp4Sr9daoY_8h 
z0COBb+p+*YqQc+iaO`7_{fPngVeCJ3?B_c6Csm;S35-kxz$XASP;MaWUnp}H>?a|A zk?tu3NgO9xU_Q5)6<~&e(Kb98IRxfZFwR);x+b5*KhykIpql{w21wiOROB$A_5p!% z0exvP_ktM#M%(Q)HecZ^ z1JZKVd%VAOZ-f9}g7F@y)*ljABk=LYhCbdA;@To-YQ*fYL=J*IFq-)vH!2}=Q-JQC z0Nu^zYfD7VE`o^m^vw>D0g$*hfJiz-c6!*Ti5vpl2e9_%7aj0*0JDO?lOm?TX7LZ$ zznyZHByn#5;ake;qUJu6Z?llIz7WyAo$nAC2#G-fL<%PI4G*w7Om+mDo#S5_4F~%& zdS+`x`Zhy3Y@&-=pG{IZJ<;b)@#(Gq6>;zaqUV;rYBLM*w%31wncZ483$rpnx7@s9 z6O*rgB&PGhHcLW}-Wyzj{vQ^QSq_=KZo%aNUk+G%{hJPWIJgl3*mSV3*(551t-yJ| z>=?V>>X`o}eRfCj+P{@zXzhQ)l8~b-A))<$yF+3$1jYoAD4Gy4+rX*|bQU;77w8-? z6N$e?(-qvgy8qV=ISY8=mbrI3LUfNOz=10=|&0d=orX zPKz-HcC+dMHLH+Exqx|_HWH6S)wqX7?Sj*hu{fG&omIkEm58u_YzKg8ns&PX`uMvf zyPDUo-7LOq%3d3levh|n6MEN@Sq6T837M#Q5;pHxHfsEVMs1-YGZ`Y&63Aph<}qtO zAN#%7YuB8CObH+#rV@`@``fYq3486BvY)E;KQTZAz!Lz}ce~#`KtIp`I8aPc>cue)SnwaN8w3s=+$N2pz zoMccb`RLCK(b?c_B)%7DfX_wZ2^Q}K%#dew{jX8D1Qy@JSUX=1GA|&*!s#LNjLq;q z?0>^vyPoXl2V|%RH$RWP65Px`y;;oEP41xfp1~JaXkfVp8aU74K;u9Ts*s%qf-Xr2 zoKJ;b0;m0ed2fDTKhT0R3waTJ{MOtX!?@Z6t6Ou?=@@Eh&BK6~uz-W-@(=& z2hlTly!&}=xSxlqc?g|It%Ot>q!eHwBBzO@@)-wf9UoooJ;C+pnzH{yx-fFT?CWoj zi*Pa&Cy9U5ryS>Z5t=dJ)-wxuEeDNtIG}gZp5>o@34N^T7#|awIgy0lHJCSmKY{*_ z$UzPII7aYWkoe$Q4w2iDxT7sN$_4l^Kscwp2=1zcz#8@LSjwkMHYyNtWS>SH&6wM4 zoT&EUI|#BOAyAA!j{uAhrtj%Xt$_28&FJHU>2><@F0dL*`1iVU6Yc!!jisS;QSx61 zxfQ*-Z57>ClWxOT)ZC1v93=Kzj7*kU0-f=EJp?rH79h*fDH0G)!(nsL@*nNyZW_c=`ayQI-i40D z(jJ3MElP(^Z!e$3?s+e5Dy!OR)20RLCJ90Bw*>i2k$0Ou48eP>{ZrUS>jKcivK+Z3 z0FXZ|WL5&uIM@#0rySq;6726o-;BW0$A+^Yp?g4rANLW03jPXW2>yWebrzUGdjD7b zzZAKj2KA{LPNCe=-HXKg^dd@gkIg3Drx6_HfK&F*gR4(6nn8C;0+NZ5ObmNf@@?$T zolO0)^-4LPQqHr^)YuJY%IGdc89iu&)3^RR9yptey(-g{t{O~bGzf1+7NBnmDd~5! 
zwn%p$axXqgtu4ea9u!(s=34^!5I7zJnx}V50Paeca0<=i27NhwvbG3v5db2^ecqbCHEjptuyK8L0GznBniIagu@UX|D8Wz zcYx^`g|R@ZEGUt2K8(kGm%3bK?n{{uP-e}F2ZcFGndejHi3HL@q(S&~mOlX3f~o9u zM>H~?kIZ9;zpxC4&!v_BLdazFT8N~N*@b8ly7<7lgN!vk9!AzDWE`(vj0C1yA{r;( zWcg#{VV_9N;?b`^5i!&mB;J99Cd39yLWd_+BFiB$${{h(Au-01 zxE>u@w$pIhpXCpwNAckfi`?iKo(X83%5b#fbdYr_{VUe#P;^=di^yyU6GRhgEiweX zR(0vmVpW%J9TGp|^kJNBBzXLc^C~{_nQ+nJ02|r(y5|BsoA zlK(VrN1~7S|6)sU1iI!B`4w>0%X|{+0Y8J!#7A6~Cw@%M`fcfRwircVG#UpCs5E`` zF8-KN{XEL*<@j2DPs6@~JuX)Hx`p?Y(IadvW3g*t3GDCBgdzAN$g>c54pCKR5#V`# zK9+S+PQ1sA<`R4iV3l__L55OZ&A1K7CFoUN>Cd6O(mjF1%X<|u-Y^5Zk3;&2 zY5pF#4gbq3q?$`=JZzi>XL;7Hs8>v}*63(U9LEo6m&gJMI7LvT=78a~b};>(ZJGuC??*^(c}JC`q3> zkEY8r{lTl;x==Z>ho4O8F3BrX(NroLPgTMdI_hFD2Q6V|PAB*dxvb*c{*?9M)oW(;Dy$vK8Ba z)y=$0@WP3F4BQgg2W~h-v>9zd&Qt%3i5kv@h?LJEaUz#O%23>V0 z%eH32%g8&}20t+<*IWRGFlk=*5=&zss=g^11EmLxzhJB~^_JD@d|Ug@Wd;&|1J#^w zi8O=A3W#XDyoP+5lt`ZG0j7mt#i%*Z-!I|wfIb3Lc8xAsq9bJ99dL8Ny4l@;E>O3< zfP4rxSr|pih?=A+bwM2t+7Hf$} zzJy#4R+Zg}BIiQUW=yO29i^?j~-oZTTV0M8CUQPEc>mu#n;%2FAf0f_Q?w{<>Xx@lGW1i(t&)7!j|q@&L5$CPVuJR!ik+0E$?K`!D%a4 zx5VjPET@_(A0fX(Ab4G0WzGhpik%JSV~A_{{Q~}amvDQQjC{?-*+6D)A6I27} zW8`4cpsP*T4{B82xy%BU_6=kt7_D@_A&HAkNoCfUCg3%J((2-1K##JE4bR~Tm<45Z z{IPA5fwM4GYlN?53^WmW5BUQU!LwkEY35QL*_&Wb6+ur{L2Fh?z@LfTL<(&F^DZ2Oz{5j zkU1HQx=!oJBDzj%$UDe5x=!2p0i<|lX#eLr%{hXtGbdRGdaLgy9J~$qU1UL!0|)Py zd%dYQZF$ETPjocL;zxa7A3rr5>NGsD4DD(R-u>p$PAG&LR2YzK5q(r#L3>NuE~gGo!;P3uf;Fy8Ysk0YZW z0!DL)tP?Ad5}->(SI-hi$Je|I{OKqG!;00*CsVfZq9}#Gw1F8!scoeTU$2ox( zJD7T4f(P+>Q{TZXaWM74M1x21deZ=q29H+9$Ak)6!0o_}FZ6@KF&T8p!?ev5a?d-D-3H(XhI0PEbm>DL}W;RB)Uv zFXK$OM*?sg0(1|X%JRkh_n6%o=xl8O@Ig;km6LB|0dECme(gNs@ljK+=Mx=zf7s!o ziWeQ|P>BE;Ogm-Xe7s+|FR5Pq0kCbTVze#)oQRx+oQ$+XPC-sZ;-zYAIG)FN0_^bB zr&q;kYknVu)4FUlhY#i8jy?xwHu&0Bo0=Wl~+);cV z1v}mUk==+KW&t)|BgddW#=}?T-Sl3*vT()w19<{$N9%)jzOfTBH5?x!@lgoA2>BH~ z_nfnH58!}uh;RaBGw>}8)osb)_Px*Jf`yUiKqR74;6DJ{5nuf@KLiH zC!GOj10IOD)uh{<2WCEM4&Z=dr`?2ubNI9vQ15$HgUY3v-)PgoSKE7IeImkpz3E+2 zKI)N^?qhJgDjtAjl=S{hu6?Vk>)n*%y40aKFAqvj7V 
zlsIhxBs)@KRdz6`4n^F`lCW1f9UX@|cF(12NZ1uni!#25&L*r}^! z7ag7B8-0)AZW^>ssG(sm-XuY{A1TLVISIgp{5+aWdRvEDGr~CBEIN2#v+hg*p)NMzNPwZL-?EjC%{!W|ut4HLcbudvK!bi_Q#$cSiUQe!|H` zIN>&i;Wl-tCkNKS&j*wFr1+6##Se1L91=16@S}lzHh$Vn@dxrM;+u~c$$M$62~#z; zrBMNXMlL2uA4Hi}M%E*th=~;pj+pO!V&TeQUWV8KONzIy^of*Gr0POy(=CjR{HvV@ zxjw;{x^VQZb<`b4@8js8b;M_^^sR(aj>yhCIEh3N*!^JjaE>Kl$I@jY{R?slq^nxe z8+grPdHvqTse|pS#Q6iel#42^Xf(2V-PSO4PGnD-qCpY!qXk?6;3EKkdBj%i#C}LA zBH0o1gSAb^b{n>bJX=2Agx|#pNtBMY#S~>l%+D4-1pKGq5Bsrx8dbe+)QuG))=$`K z0N;#lg}HvhzheJB`q~sj`m4sUXQBHI=|?eGr#y)l|@hV`C}dm@;(h;?@J^aBTTbpj@<42Cfp0ratDx}MqjFiiio zOqav*j)V%=W4w**&w;&n)ERG2n>+}An;0&E8!9TUZw-9_I!A+`PrWFLf zyKHc0C_Q6x2>J?ce2=lp3mq+q_4`6S2}$%vM+u{^0QWRjj1NQrbM(6KWhq@cwd9lHCh{RpNygZ zT|?|Ke8TL+h)+C5WYlbw@4N=ihv2-N@f~OIn$OfPz*g(qg_Qp!);BFN`W(xp*aod{ znS)^ihJ`m=pc^xf(&s4vST1&sWlA!@oIZZeWM}cE5es-5`zNsHQ07gg2e8dU`VcgJ z{ZtzNyvYRrvc-Q6{weSkEPgfEHKp-U?|Ibg$S%bz+Y2Tg2e0Bl-Oz)I9M16FVPAvZURx)#PRKT@g#@8wfoMp-Ps}LwdgJ`qGRN2)$@R5PHoz_!S2) z;Gl|iumSAGqvBsO1=mH)8{l=oItAjNF;$C)0r9;9EN`urD2sJl0(4KAZ8+0-k+bHE z7ga%Xa%{qQi7{T@vc&&^_)8G4W{Gcx^kYb==ew5f%Y2gq^EYvz0XKjF*Ny?Ft)nJR zE>AMxiom=LM%{NY1drtSTgczwwxd_~Nq-F8hc0Rw;_Ql~{a0Xeu8Ej;0V?B;09%l; zoYez2E+ONrYeVK8OL!!NPld4dpX0E-GLSIOcg!QeW0vt{1o#KW+C2V29<_|6KNiN) zoq)6cNeRDfvhnwXC43gRmJk;1cyQwr2`*}#d)slD<->N#5|7=4Zwl;I6y_U$%T;TtZ(T% z;X9NXg+2{4kCJIRnQjHBOhW+wAyYXEBiAL(^i`8irZ3olBf)j|0`j>H8_!EdmyIed z*?9e$;+5$&lTH>d*(?SD{+%qeY^NeGTBp)?!Krjzky%GcqUg$q$+JmR1ot0F2p7%Z zpW#Wfc-@4^BG(2S1+b?Lc&-hY&?Qb1+f31=5wqGRF&OMYlF(j4FPl~98QjsduNXYr z14sAsr3anSZuvrh?nSf45>${gA*cY|kVQwSNH!IDQ17FwU#kNQF~0?Dii}8FkvB}y zfQWg>1{@8pw+(on4Vb(lZ<;(Rves5)2-rWOltkNOMP$8os)n<1D&6_W;-h4dP8RFH zDG5CsB$XtDYmSUen#5bCfFw46i&Fi_7=RbqfEU<+$+OsQvdQ9Mo5fJDf0Bf@)XK<4 zn}qZcd`j0HxiL5i-bZFJ30)B}kJv1Z16ZCc1Z;_nPMXErCXXyO+kj)i^|1ka*nr8C zc*mrZ#3uE>7K&kj{~`s66kamtRF_A}-AAaHQL6fh9;%`!_y{<=a&zIr0J2WM|W zQL@jwJYtsH>>C5DNcIA@M#dz~{#}z!_P5!9H=G*(s#qDbbXOKl4U^>*(9;TCeaLBWs(q1AJCqdG>P}4ChzKqxzh$553rvN 
zc&QDTJc|!ZI$7KeR!PhSTV*cyEWsUv+=ZT(H0cjaDB1D0u>##o8|uj1hOGa;(gatOo6POcuyHIpwo3-T=X1cg$2wk!hr^EJ?Koa(9jon#^{Wqe$!{C* z_glZmN9Z z#Pi6iDkWB3Ya=oADx>s&kgarmkO}x`fxS~jqa-8vnrAULfcaO?cNORo3u=RrQz8#k z$Yt4FKspJX3+Rh+%Ch8gE9C79&j#qW2IvaR0(|K#DPOJhd{14+bwJj@+P&P0Q0Dp8 zSQi|vNBI{Ur(?S|!8VsG>l?6DmF8nxjVkGYS|3@69#us8zo?RQmmyPBCE}MUQ!f3p z*n$=TG828!xr8@O?LZ%X^DsJLknngu#vSgzR2v8liI-NE^ zZnS}={~NZ_U4cv&+W@k9OYmlkSqvrxvcg}cN2J#Sr z+-}I{KEW**tN(QjFAVJem%YA@A2$6l@xK7wt7fSmI3izd)&DxrpM^Y{5IC2>Xsz)kmwM9n0Ch5@L9XcJ@_4eD#1Qr9ppRI1Yb`g(xwjR4(SW~L>l82uq9 z|2io@SDY06Z~LN{T?lqFEN~XU5f-QuuYk$2DgZaz&&o^&7a}R0C7U6$tUu`o;7_`4 z$n!e?Co69bXS=8c&IUNj0?!lhC`7U$GRG2`0xkt2IuEu$vaLVquf<0cLQ|qo57Zx zf@DEX0d(m6-%*lE=gc{1FXz7o40L^*c$@|5@F-xitn$cesLg5`xN>Brqgoqeh$SNZ z_4t!c=f78y5z+at;PQwWZhc4q`z#8 zuj`FI7k`w!_-1yrGQT{66mlbCgaf)f0Tgrx+;C_a?O?7*z+|PA1d7KAgBewV;rU(( zprAX~5kO-B)qv<7WK@YvbyztW8DpJEe+j;%yAWBGkZGJi?23@Nq!`2e@0$P$$}|^H zpJEWVTN5xrndXDJ)WP&iz<7yj|Bvs@UN->h3#bPA>5PT)WU3?dDad8$Nirh+AjXJv zI+nkhY;ZBQau{3vEG`G!B#YDOTDZ%T4Xz@9S6Cw0nmQ2C0kb`FIeOjI-U6FbIcO*W z9WJ*ZI$U10cg2iR)ia`3V%G>e;XmQiM;{STAC;KQ?;rE?)wJ`gwjeVW|N4)u+w@0Y zheV&ke?yEF^BP3>odDtmQ@mWhA$Se;tm0mrGpVS24A^V`DCLc0T?6Mj;+=+EhcPc{ z(x-A*!JoHvzsK?YTG~)Y)K-o*+3T`k9f8|m_bIPBo1DPzs`p#hd_2QDy%#b*Gp*mH zV5@q5VX_J1k8m8Zj@4MrI@Wc89Jj#cbL&{|A!P7-E)LHHqhW9pm}5vl=gy8uSL;jq z3iy&vr|$LmC`B@Aon>+CKF@+~0aU|+bm|r;h=2jp%@VkqZm0|aoh3UX7oe|Am88Fr zK`mWB1WXqFreN{0pve;k)4c=}HGL95folt( zUVum-?KY}2nHhp_@0oySa^mVqTVt|S+Z5V#l)DORAKEU=r-Bz-%rg117+n$w(}Ea@V<;;M z=pswtW8s#OIvw+$NJ1ctuXYx5E0~%Vqhr1>K?I_Oob%fQ3X|zQ zfU2_?=&-8ta8d-q-)}#SHHX#Cbv;pq}dkpb(OceHpip>8P z0|Ra|r`v#c0GdTWodpLVH(~!$(tw|s0&ZG$wgFdyO(UR=gTmenF5itud@Y5P^i9*r z2D}r{90KadcO9}6Kta-gpPC#3o?!#t5B69B>ewgjGH@@u0gK~pH(kJZ&yHOIC`LdX z$Fh)H0lbzp;AbY2fM?l&4}h&nK%K~hT@LQ`z<}?VGi`!*ftgD@PF+!x4d_k)+mZ(S z+=K{tjt!U#SUZ+ZUc%l5?v22J@0qi~Xp_Z(^T?c+ZZ~uIQ~yTuY`q zfH4l>tzv+`i^@KNeb})d=h$x#u+PH2iDN%O_EF2|?EpXwK!yeAn(O%w&z5K&x*orIF^l$NTcci6t{5aT=+ysd`o{!@; zw8^j9M$x8id@%9={=Pu|wSMdBz2yu7#BkW!g4P163mNU#Ly*-Ne~CC`JdC+AWKOi0 
zr@)LMkhbfg$b;DLKuQAnaUvWY+k(-`c{iB(QKo^;ha-`@0epZ2FP&eQOzxzf3^)up z4{&W}!KD^^FSrkpvP<;`<|Ht66p%4AfG*QHU>Nch_ECQtl#J@wCH!3o2XL(e*a;xf zJ{S8m$3Dxk|2n`vANyJ*_EB@a11Jmt6auITK>hhJS=Xm)v<(kO9>#t*Ql@Jdcy%o1 zIWXhs8g0WPkPX=HLHw?v|KsPmECSWGpiO{|Cy=(=QOHIB-vk7TfT?FOFM^psAZ@p! zkxkg|Ee=!?Ck&=87*0lMwWHi1uE7bYY`=XNpnD}i_mMdcXAI&AkskQQtDD54ct!k=KfWO$78dtB(V8+e`yXM82+th_>_x z9U?U#kye_B|K3tTuZTI`f*S(P0<8V{2?u;Ez?!AOUKR!2BjyB~L>TPxfk}K4K=@5` z^nX24YEqopr>3DLB4^h_MEmv@he#br)D0k#4UxuRYv8LtV_|HLe?@dH*hc7a7m@yg zp;QOmB>B|e&d?pNCm`_&gC%}cegP-B7xR}eY-UYCHxA5fdq8HN1?b*kH2UL2z6Oz) z&Ii%0VqphLJ+eAJAhUEbOSRyZfQJCi0{4=GZ2+!e0Jh+Qh)J9D)ItPr?XA9zQ$NWzD z4*QWpuSeETbwN2@1Vo7{Zs$aGFxWu)w{Du$h1@bY1 z8KQH=%Se2RpF;2sNIXHlqh>b)rpQ6dcr%RuCPVFfQ;^dDBxV?c@bA|CCG1nkP`jS& zPshGELz`eO_J7(0Kfr!~-au#lDK)CyQLcgI{;q-Z2M#n2EbY0UKiFXnLLm1h?4s;4gQ0k1i}gDx6Ws*l-V=!*E5*ou8&@$UqeB8Flg z4m7yrpe}u^xjG+-_wji|J|BrY+5#>DZZEjw30wqjctYS@8z?@uv>1J71Ur%WHC7fI>oSdt&1!%x~rkZD5Y zzigmEffW^fd6IxXAErHz-Xbk~P)1^qSfNTnThs^&hf#0#ON7twu7b7pDSJz0tfUc45 zPNZ)XM?rlPA)8li@PmLZBDg@e2^5fhZm*De1$$*Vnk=i5rDn%+40=@-IUy1S6el8`o#e#4Uf8%!Ta3v-9XljZx;eEa88_90y@_$t}p6)|d1*;7hvukgF1W#b-c%Z{B}-+cN$SP+u4e zw8DZC8D~-E?bxf#lPPm8%B)#&H`4DYW!9ry-vx(BI}O4=v$)F&=1TQ{WwK&N^9Ehv zd3-t^LuNr(hLgxn|Iut7L$8HM`g`m`^eDRcz`C1^H9j6du1?4}UcE30JYk7woSg7a zmOn@aBoN8%#)sZlWZqr9NpuVL>@V$bYjzCdqlf%(DdyVdsh1hT&rf0_&a#>8aDE!S zI7@%2<@^*nan`W92hJKEYmsa5T!+$PBH`C@l##I@OVU&vHDr6jv#vo#T#`QED9`4~^H#V`L#^dTZPec9E z(9edp;%80qUBEKB6m-jQs*}2WhI$tJEXJjl#>F4wy2I4#^>Wh-cQq~JG-P&|F&YAoZ#4s@_VUK8WEOL@P&w7_qRr@GBuD)(CTh42B2sQf;zXW@$gl(=1|l(Vt-zfJ?*3u^YQ-=yQ*CLW;l8t|2wYl(En=>rha zczGUKoRr88vj9vBzlu>)i#1v$d=yy-sO%c;t@GXzT}UbC16CL4M)w(Xfx7K6WC7S@ zT~K%dpANS;-CjNyoT~h|ZqeLu6kr74LVzuOwr1>bpXd{SwM^fPxXW}d1Q*-vM6dzL ziQF^DQf$j4M||yK{sshikp+uo6TpuHd=|MS2<%{UA$k*7ttA?qx3iXrGWALII!xe*5%ARTeAHgGZds>~QjWqfEL#m9v|W_Fr-r(`r~-?fRk8J}UQcMd*> zU*HpZ4pKKlhBd{1jH|o+YQQaVK%*ScP09pt*+u0mbBlu+>tJpN6TGH;WtQsWr*U=@ z9MBzrf)|yq%=mqNne`UZ3d+0%LQf&}MGgrnbIAPL<24MR%?Wl3NAGo$nhTe@pme?l-WH1*oNoN?l=3^Nm_tzg{? 
zr@6!YY~O&yS_lNM>tC4}U{tYeFh4?E%kL8GlT^&CBi|ISQbi`3-IWi`?@L{-pcmmF zkDwYrZz2yR4Z73J@`D8%Kw8W7j_gXx`x4pYBaHwZt-Sv7yUWyTF*l>tsthw2KN_T6 z@Y99h?ozFFh4H_ujFOx9WppX%206|m0nSnra6Xycre1nRMw6}?WqgLr?x-1R%hMHx z{c)-+UqiN#X>i}|GD959H4f$}Fu`57%M5ogS;ZLge;QD5-|aHP9MHgG5Q_ov1RpRCJeG0^RU6-~zw9wo#q`K_Iv~drV(p=>Ld7*EyhVfPzQy9&?!! z=z0h92AJUL>@ocu%wPxeCYa*KxRzdk_LwUGY4B)CtvW1dck`Fx(z~0#Zj{#THkac| z`^zQxT7@qy&2J&^R^t2@ysH(Ot1PG=pw)o1CTvIEgLN=wm$}knt^o5O7!A<3k@u5g zqPxx2fOrcpP2I45Xi%nmOn>XRKQ?P{tiF8*`8cWLJ?0uP+Lm=k_ZPNhZR6{)ABYqm zWTM>T(RAY^{oyF@`@E_n+@(;idb_f(d z#jvf#@al?>zz;7WQ1V2u(geOU zp>rerbrXLnosNV5aG;XEi+obriF+P&NPKV7@o_FbG>~WG;~w6@b*s(&M;#cyPf$wt zU4ZUe)75ddvWzp~_66Yf2I#&q=gAlI-(!{==&UFJ__e2N&dK*)_J-cyH1MABS1uMJ z_`^aAGUt$m!AV>H;J}z27^x->s^QBMclE*Vv z`yhNie!|3$+=8QY?m)}Q2S~4q6wwB63Ap2OZk@kyeyQg(_5fU+%$9>awYQJ^5bU?$ zszB~RB$CN@QLxkfAK6XIVen$}kz%0N;~AIh{!imxzVc%l+~+g?k4(O^4)xT+!U%uT z#c?H?CZ&xrjE+0=1nz z&L<+gz9Qh=|4U);h@F%lTU|M;YYpS=^SC*6O(kcr{b1@H%x?T=PUAxr59KvJg#c}+ zfCBU-xQKs_d;|F(5&3R|pPJQpP1zcMrfW1`qqdy`?&pAiDh)O++4c_RatHHs0>(|| zR4`OHZ5<@fqQd+27V+C;{4U~_mW-X!{$Nf6sDZQ@;EWtUvoFZ51A1Q4q~FJ@>`O`52ifvjg%!UQKD;0skV)xe{%YE{)pfG`Te7*_f2-A zeL0Q3tN2}Bqtgk{fRZb~w*(iVFOgrVz%R%_#QG`@N@tkP4leq=1N;rZA4z~I06RIr z9~|5PFn=cCN)r0foB>V)dkaJxQ+AQuf&8WY|9!Im5OK?YBH8c1ozEm5{7VX>i-YG_@WU$ zlRwXbq54{eGjn9b3W=P9UW*Qyp<@oVa8iaA8i;V$PUzs7e_NkFffZoHf>k*H~I14aRN zBA~$EAr+DY%p}8BU~1W_3}ig0B0nRkU{nps79@Nc-)FJ5N@xhSO6FIjyl2Y?4&uhK zQ$nVV&159Tp^=Pk3{B+~_dm_kB37?0)k4I<8Cc$oyi`!KQ(&Pf%e`WR+FWzmQ`HN^B!c2CwFHzR!X!Gc4SR z3LK0w#eTq075EvWUlDhD38>6@$!8HZH32K@e#(xlh5Zq!gI-xn-;AuKt4QRUfmzGf zucqj*PnV$*pB$x~MH?UuO0Bn|@Y(9+qQ{{F+75`>HOwE}mB8xbhm+CO<>&g?kYIQ)d}im6GR6(iu|`Iu z8^{Hfbmd8o`xD+L1}lS?pE!pX@D)2XWR7>@eLak^rWsP%24pAo0-oR)6+%h_x0A%}qb}VAd-qs-6P#lx<|VDPWPOEUscq5Sg`s?{gcbK znqtZ90bK~0R7)lmK>4GR`NL#Fro1IHlE6{O{7&FYWuJ)YPtd05H9r1CDxt4S zWu$L0I_fKH!=^sY$}UElmVcUpQzND-Kn=KGsLXCEQ^{7QBGIcN$-C(r&P!nwJqvHQIMeQS|n#Jo|hd+R?0iM6G zL>U3sKdOVnCJzV4SO;Mo{EUM%OW=604UdXHVnX0+fajc(rmi|a=5-VEAEj2$o9T~} 
z=K4$YAuN7D&r6#04JP_$l}*^~;6mpBZM@GAQCmaWWZxMw$C6A7Y?|>ZZtazKtfwtNnk3PY~;9EXYY zJ6I*M6zs_)q80lv@~8DFeItBIcP2h-i!AY7w%&$R$o~I_1>FXyxdokNK?y`WOfHy1 zK7sI1Ffpc{4%>gB(>fsCV5XjQ$KxlyOh?n`H%%#b>SeLY90sTho2~$V8xyqi{#I(| zWyF2QcXzA*Zpq(#+Pj+7Vp1I^a(9x-wc4yb#*|+Tf!xKAzy- znvQW1dT?pdFW{nxqIs8q5f09Uv^9X3+7{BLj5;7E5ug>np319E_l&*R=4YAr8+UYW z!tbF7hw|vj5wjZy%B%toYLS^7G(e6A7|$$T(iCq8``2WW*)B_JGPpY&Qr#f+jU}bF zJ1wcxEvdF5V@X9h{KP(HPwl_9tH;U@@Sh2_iI_b&(TE;{lLk~Hgo6}hA9{_A9jpU| zc5K86`h9*dU>y7rFkXI+n!T2(f-F)N1ULif2*Xwccgp7vD#})HQp9`*PF1T0t`1cb z?gXR(xZtXpP&!rn*2bFx?ygBuA9%J6=$3}c*aN9Xb|_TPI%4(%Rz=1FZb(J6JC;Mf zw@#(MktC!$2kCT_Br-|j$EeMsCct`RAz(UkLeeZ!LItPr_aneHCHrAvGQN}n2Bry8 z51c+A(+AO;-<2TSkuK)NY#p+%_`5bggV&UdUPRziM)}9_#yIGV173=!WIn_8CydqK zI>`P$E@&<9lpbBiO2}$-62w zlu1{82v%J+4(thZmA323$OqP^^h?zLQf4AuaoP$%(16Q3F=9Rjr~y|W;Bj=7aLti0 zxZnX-E|g7IePjbp19u-u2-nrdYfJp%xmbF@l@Eo;;!~T&c)*RwLWhQ`$S2mR^hb=}EJw5XvNr&uqZy0PnW}yV-!ryP{&KpjE_tX_J@$ zwlPWQ;8Gp=Vjtfg(!lzih2-17h2-ymL3B7kcf@>U2`b1k2r9sNNbjRmq@ZmyVs_Y) zGzEA9B@wVSl999`l|t!MAE8q9wm#SHf#zuiPqp6lZ0?>ktRu#s1nL<7ctTQ*?=93a}!;nbLyJ( z)5bHKq?%Jz9pxG#ThOyBr%AttL7IZiCgPj!oY$VDQ9GR)*U~6}4v_;0rV%`dR~;fd zv6o*ljTP?wnZ<|5bY{TQ*7*YPTjI{K?~U)LtZ%hFWqs=`DBsam7!9$`JLCM)D+zSs zwqobmIDZxg%4iM_T9A>BS&fiq(ChA;j$n__eIi);9B4vibt$3eEg}3x%{my#na+H} zAd--6@%m77Qp7xGfeQ)vjK%5bCR{=y#kfoo*lH8F1Kg=3po2~rd0`(P4OO40p$((f z>l3or!?oZ`fHeQr)|NH_bdeb$m`3y{UisB@p=upgQ=Ns?ylcT2C#Dy{%J1D^+dF>0 z#c!VVtF{hN>sS|=a;$}AjCIW03b6^@c$=@z*Pi1DC!$=NiS~__WReap16hq;<45|3 zsf=_wfDXr3vWjL?(da4*yaC{I78tR>R*-SVMsZ1V0ImVY;j-PZFejR9|L}PZ!T4VX zPHz^)_|Wr%*iqH6>}En(|Iut7LeH3rnmY;Ho(7!(n;~myP}!wFiRwtF%jIr(PAI## z>}p9{Z!!0PIo)D(wJc0RMLkSDtKT{>dU{Nnn9Yj$tTyxTY!~z!qe8J+nT3jni8q`(k zbk$~?R1Ub-B&FkCD`XRT?=MrRVG|Ce)FE=pQHbO~>kk`W0(WSJ#0 z7hD^NaLVE@yaQZr9jf6`97?CN+ti~FDQe5#Ww5v#!R1?=j$pziBa%r1cUU6x^wxY^ znCPfwRW8LR-;fdsV2pN!mvUb9E~cFY9nbZEH-e?hY+8sh+RuLa!;sFMZh)FM#O z^ofAE#}c>;%xMtNwtWh+()yDARD4ONvqM%e0p9E3AaI`r-3O?P1?li0P!NGEFb`M) 
z^TC`50i6@uBln|MMWug|ib|)$r|z_ton2A?@R`TqGsgm#0DQ#)b%+$ORg&qI(Z`jq~ynCCNUZn92w%sK|AI${k>aH=m?-)No6ZjR&hX7nomMzTADu|oyC^8dhA zyM(rMGFRJ~v}yak7-^3G(a6%uQGX9Vi}D{ckfrh98gHx3VynHyI^GI+;)D{%i*UTu zI#$~x+O&Q5L0aN;EV2yYS93Og@4SGIAMe`bpAFb}#_#dR#Qk8mY5mv9rytS<&3Uv;s2`S&HzhAt7(43-d0@L@??~Rc=+x@5%|* zPqY9>Q^-KuWnZK<4kjT^2L3jSqsf5q5m^hSDFaDIFA+Zz`;mxCn)vyI3QYoodE^29 zd6Q&yD7g$d3;S`p{~MfbS}2=rr`muI0d7V>9ZUp15AgWFfT^J=HefE8?F7`$dO31F z_S2Bk@uKmmc5En}fYWWjb%2`_P}{4(-2qMy3|Kxi%?8W^^9}*EgI9T&ao^v0B!|K?T7;R1DJ!99cWcU*_Pz1V5TvkwSo0VuEKsE zk~p9vkj&b{`Cpkm3#coZX{WjxxfZ~DBr&t_|0C`^z@wJ6f&RO>1TuKQvAtQ1#n>FB8sdc1 zmLpxP|6@3w9+lQ0DIhQ4HcKb78Y?iaOf|$wNf>8fHKr9-97mLQ5I1tO~XE^X2;9UI{(q?d0>r5LbDMhq-(I%GE z*EyB|W*J8SdVw6mmqPYkO<&)jAK>C^hqtFLbupcZ#y}dyGAf)c#nJK{&f4Ovo2kDB zeINA2>h}i6C%FVa%Y&(Jp)V>S7CK+E5+ZW}PO7Fm-xOyJLgWTF;~aaur)62v+h*Wx zI0`1PSS>~1&vF4O0kk{%r71wk2)dxR^OQRdeeuy-0ZO|HyVWk||LljOww6_-o+j!t z7}x5NTOWty0#eRJmRIg!0ujD?07NX=6HFkwK(ZDjUvO9h=@W^STjAHn0Mfz)QUT-o zxdQRbC1_|aGH*G>1R{L(0tidFE=_bP$vQx+&jrZ2GK!RD8dt%%55{6ye&5CRE9^Gp zVmkucR5OX{*fx493EgozG`Q=R3*tO7{=d`ggax+ZSwF%bbDbyC|{JDVkbAYx+xLD<0HbDzN z0{|_S`wvXej{e^YfNi;;J{O@#txVLrK%E!VVo7bsu^s)K({nzcUok<86}4xJrxzZv zdSdEWc*OK$WpsWu`01$@Cbu?&Zp4W{LB*2Xo?}-ol~M@$s;U1S`uWfo%dybkoeRje z=)Y!ytRJCsHp{iN0dme&%#{}lQ5&3y`CXr~zC&5X32GjPHNW=|d@o1NS+#(wzJYCV zY)^^%HAOHMD_#c|6qimL199 z3PO%Hg9{&eF@lR#tfNctg9JX5Be;Qc#S9)v#)-LBE$FGL*>V7y4+ zTLLd|34WNsM>#Cy)?yS**_xQhFN3@Uz{No9ie$K_9EnnS9z#tfa_tCF667=YH;b@M$9GMCEdtIHfKFCku z=MBoymO~6ok=zariR7ALgILe|0HZX5TcygRRLL|Jz@cbsNt+1L({y+qr`9N6hp+i$ zl4JtQLcc!1M3vs+sA&Qd@^JtYvc4Rrvw`VPBWsJZ8fI`Qf|Crvg;6ilC}(g5{n}=5 zFZ5p|xF~sdj#_4LAsiT|W5IIC+0_r~ZIf(xVGrqMIO`3hbgGq@l6 z4G1o}uLnncGq{jXAh?hX;5h3VoY>;kK#o3%pnvO`!Tm9MmEgjtpJ|jccm%;4m_bCp z&nAfIqJbR0b|e zg)h6lbisc53O|n&g)N)r8Vok+N*DGH*qy_!F9LFue&_Fo=rH~wgpN)X#XnX{sWEbI zrn8U_vN>gqT>M?epBPUUOn-yW6aGp)@uy2R7l#op4u_Zy%OESqU>99AudB1-!*O~Q zN3OHSs-k!cM`;%jmjEHg)g=?e5EDc>2M8W;73fE}fGFz%;(ZfD1s8`GaVW;h71N;@ zA|k8u4u?*EBVGJea`E?}>8~O(u~dZU5dZ|C=nw$cAxPXUzx? 
zo~9v~HN|)0L}3V2HXVjyHxhqh$PDJFX8IHIf%p?Ln~Oia6b^sO;ZVY1RnuX93`gNm zjHjU-)lG*&J_v_GHp0bWY5ZNw;jb;p1`^0Lt$G z#Jdl~b~_D3qAa60ZsZQg@gAf346b};Ce^WPjh)DTG{?>V!%n>4rT}*2m{;7#v|j4A zQHKdL8$@Z^ns0Se%nA&hxADcQ5|@5Ho>Dk^70eP=X-z*9^j(fy1ir%|syCDffAe=J zHu!M9F^K9FG;Kw)wXqe6j^)t*`VYq9-6VxgW07$r#v70 zX%EM05-NIu54bCu8TIMv%tl-TKEUx^4v~>R%HML#2S^c|i9saxlpQF&=;tXMf1;bq z7;I1eg^U1pKZkw?5FSlfG)%F^vYbc=x662}buCJbmD}$e}I!4KnohGcpfkvWw40rcd3~&@+c2Q-}cJ za(M8~ym>=I9eYk!eZF473j&-sJ>vnI9Ouk>I+@G#+h{WO251bx|GY*8r_OhIEnPv` zQ}jP}iZD)fG4{ZC%oF3MGV^c=nC@cZiOqYt*oe%%U2I-)vGMY!|E-rJfHFIfsDu|r zqEE{JZW(v5oA{DzNtxwO^_{Z| z3d!M*Z-hvpc_kk+lcqRrLM8%`ox}U-a%Tq--RI|Gqq^AmV58?==5X29#VFpzDDWT4 z94-gA*d)5x1YwhVmBV$}A0ts_F*jFGW-%`(acrc_VsU!Rkz?w(l(~=`WZGB3eiioO zrpIKCum6L+v$A?&IsE?WkcS6Dq(B^r{%V4w8&9TOb}3oQp6VJg{XBA!c_IrjE0KV& zx>Q0W^dUzum2jCA()XhOo%6Vm9PDEBqKi=hjB@X$Lh?-)n@AU%g4pCf8w$xcTx|44 zE=J)P<=#()H_F zw`zpsvJe%KZ<|>q0L(Hn|cVBQua+y{|To4MvmXg?Kjt-?8MM3>aYW*uE3?lH~~ zyVV_jPxpz-Cc)vrIr@vr;nxlZU%wG7oyC#p8Wl&G@wrXKqR(`V%rCu*Q?cv0#siuD zi;J_Vxj18&<}iG^Pn{>Jr)7oY^Y{Xj(wzTQWL=p-aS1=NNnuyWWH=TX@ba|ai>C-aj6nt7%RQ}Bb z&U+{Oa)yBVYnK|Upj zn9oZ&8s!qi8Ka1N&9r#~o8{OrHh91hqsISeR8;;0Bc@G_E*N>d<(y{K(eDGWa$Fy5 zWV%^1CYFt+$9Ssu=Sasl`#ltqTrv4v5kF7e{kgTJ<{T`TvU+` zIUPu1IsS_IE2=Y}V?RK|k{k`F+%_uC>03e$ztk)E%H&|_AbvzwYxvowC*(GNOFT1I zWqu;^+4);SKH}nRPcFVZbJ#h-l$68I*9g9l66}2Und_p!wTyfir+`zE_+L?`ipcU~ zjyN*ReIA#PkGa^CaIuNUCiiJvLOyEk|3#ouE=El;%6%S}kdM0rD(hmCfKBexxP<%z zHe%@&ch$e9>lTpALXP9&`tSehJLkwPDWAZB=(9K+e3kHQmw8aNoZaJT%ATH{CFCE? 
z1d{RfNRM|SSpu3wGRVEWCFP%8jHISdY?@({dwEOBKe^a=y4a**V^vqYdp~=JE3!D{ zV&vmul!j66O90cG%tX9f@ z<)D-tjs0otD`CQokWTnq2@uh(pK`SMCm=c{-)?6A4o1ObFP70J7>xgq1GDku=SclHx?Y+z+vk+)yvX%*al%c7hAnfGak{va1}>)4Eax5%yG6b-+)0e zn7@VMyCa-BtI_)&d5GztTq5`KkbTbK`WB!nB=WZD4_(QIzXLA*#^4VdU0Q?Tej4(c z4(B)KiUrwP4gshdj(R|=Lq$pB$g1ENHYmNkoLPy|8hHah$46z_mjPCxw8H)ij-J@o z#I`nvYh~l{({7kf#$AiKyk4rj9R5X0z?Z!OzRL5}DrzNg(Gy zEbkaQt}+kliR_rE=Y?!7zUWzpuLdd3JE4v*xU}{1l-AH#1>hpkOEe1P>o|IYz8{Ck zyeWw1({VR@Ep?6sM}L-=*SXj>bg}L8v~4zM0q_gcMi&7axfu1$VdR>@dW?1!MtGH^P+m)?>OQNO0GDDf51 z_ydh1_T#f9RtqGCL3Je$l;%^_Fr+(|Fb-w-SFS&aOU*& zql>Qr_!#`Ouk1uT@UgkOi{Vc$h66DglEd&xW)o`x6=6_rA$wd5Qvi~>u!G*Gi|A`j{2_G9c#AF^RuJ}b!lQH(2 z$)dS)##3w40*uAl^gYIPS(=35MvhS!^PgjBGQwnOK6+wl5_<13dW6o`9E*^PrAf#u zCOT!CkUv?P?AN`5ee}_$Ke03^_*>}WZyf%F&1Q~|k>@r2EoEgAvTry9p2ig9U1&5^ za^Q}?HGUQbpSaHQYb{;ypn>Bq;S~s%hFN|eN4mg z-MMLlvcWNyU{;_O!Y{&*_IuTAT}K;T#Uel3f>rw zar|vvIi=#x#%E8f_LY4p&2-JF1eeSo{CY|rHaS!g9DBSx(kB-@r{Oz7_}+@|t)}lp z{5Ey^e)3vVoJ%-$I?4CM$#MV|B2VGOMxMgSc8>7?gNWa)m0Aig)JK33Kx}^eLXnHz z2p78vPua1w8z`?tSWN5D^1#FHp{xdu%m{n9u=7ns9vw9A@9c1`E&i|ou*~x7U zev;fO-RZzlqTM*yMoPkoONm^||HMhi6DOa!WIB>eg^!&alb_0z%8=+37u`|l>X!19 z9LfsHlTj8cqv%iT!$|S>H00IHE*ZG;XynCRj!-L-q@cgahZrsAVA z@g1f6j?5=v@SjR2`#;J01E&K=<@RB}0|3Iuf2y1u@WjV-ackEaT%#$6aI%|Yh6{QI zmt^yiB|8Bv-FIDd?KyM}bZ59!XdD$1-QilHsn|>tc8ovA2&P2r!(?eDhT_EL811~qnz8p3$i?Z@WnPb$OWp!zD`1{aVC5OL*RB0ALU8|gZYxk+?m*+`|W;q==%6tR| zyNDp1{Kqo;KJoDpKCH@(p>o2<9*((Up|=o=GE0 z&0b*yZKCT=()M}c6R>oXBd7&l@WPkii-`U&$|05+8xCF|G2wg@&IO?FIA#N6u?dDn zN^&^#kfl*B`ro+d&&fsKhWqMJw1ps~A|24GAYjduX2E zGOv%Lsk-?%(w!7OYQoV{{H^4m2c*xS@1cW5gjlaejpTp)6CE)DIV-tiO)E0ODI1TB zeL`NLQ*^}ok3zghdfSH<@h)cn&_6iLyn)!*~7swLZW2jI~wvUTP~66695~ zMbL|OS{{0}lIog&4XPH>z$sf+IKTOq=fJ||Ulq1Sn13ztE^YoLUfJxl5HDhN{v|HP zoPUX}kK?bUHq}R7YHQ--xzrZzBd@pB5ApHz?~u`o5wy41JHOc{zZi1IZ$&;yVMF!S zO#7svK7We6k8~mG=V4qBjoNwBKDlAY-H;k-!BWN1S-Nb?-=xcZ58?c(-PS(DKd2CL z*|ut6Sd;r7csb4>a(3JHuQ#la&*Yu;Kx~c zq^Kbx{@kc5Cvx}+j$K)hR@wHh-@K^i+j6vqT*)Ys-8v`8+e}xW+d3&E+iSZ{u66>0 
zr1WI?HQLqfp?z{XWosW6D1A0bl!9@y((kk07cHCWl z^j|qzL;m=@NOr5OunX>x;az;Qy-q7*^(yh#k4X>l@Q}~h$7O_Mh2*C?WV{b+-xS+17biWa9uwk`kgSc8KZ zuAtI}70GV765(nyYmtApx0{t+qWOQ&=9BU~kGkdKJC1UNc$Ud0{W?&@{VTFXV^<&j z78a&OwpE#YJ96(M%ZJD%|Dq=?v#z?dI0j#~w=J&arBp$`^*W!-;|GmD@XE>x5rgY@ z$Iy3lZnsxOm({KI>Jb-zA(IQ2lDfSr+bSj2tM;InE4mwPi)FXeZ{*q%H8PkMr?xt9_f5jfn5jt`$?Y_~Z1O>nHaJxhscwa$7znti)EyxE_~en`Dz@`vTSHfWz$5jhHDYMcA^gpN)29$AG8I+7BV- z)6pW=JsYjfxDTE(+nMl8w9>hleYWXov(wL-y}ka~Xk*Jyc`Dkp_-CS>824;$OPhCp z%B;eGXL9Q~^;xqwO0|Bs&HF2$Ejei=GS^p5tzIv?BQFcLI&HS?386wrFT;;U7=^^Lmm zxTlttP*S_TBt*LyQbv3API2vegs(RDaB)oy3D>@QFTXbNqX4bqlrp+Dq*-aLUUCJk z^}IkWI_odBdac^p<5oqrqpy2v3zA;Yu8%9EefvQk?bWjRwe?f$Y5i*z)n3W-w_2@x zDeZAwE$#KV@><^yD{5td|WvDj!Zjd%0GE1EtlULi&D4!PB zyrlN9d@1c4t%z2ocv0=QVmd!8weW}Mcj(VEII(q+ec4It)FVl+sQ$ASs2f&KQm=e` zMQw0(l6qwM1ND^-SJl0jd#Hn!XR0IP2dc@jBh{f-HmL2}%vK+d`$(-mq=Pzk@W<+; zjvLjG?}N10zxqb~{MgUx&n0y=;O`IA5x3u0BX*Ba6Hbp*Bgaipm#z9pZ4o|B?HxZ` zEpY4;UETCXy81=gdFs0>Ppk8zd#Gibty0@9UaKDI&{0i!yQ_Nt{a)&tsKsiK*GRR? zmowF}euktyT-hdul`8wyV9pXQ+q%n57!M534IWPEdQlq^qBn-=Wsq z)LOlB_Jlg5ilYA5^$qp>z`5$umAd-YVsnwCFHT~ThE z`d4IiZQ6SUw78`n+M1Z_>b{F#tB0CgQ|H$V($XIvSFd&|q)q8uP^(wOTkF8+Kl4j~ z)~{}mHaBl!?Va~e>gwk91vR5lh&H@-er@;p<7(=i7%lkd9ktTFvf8ZoOKAn#l+r3! 
z&8M{(URLW@po&)JH81Ve)Eny0B`RuZ$4hCGTNT#|{^qYOU6EJ&?uAfo>xDCFpDmtR zo@M}>C~Nk}OR86;8)`S-;@W^gUfQZVfm+SqytQ_(m(oUG)PpqX?5*c_=+84av3c9= zt@Zj2AK63C=a*k=`1Uz9=sfF`aYenc&PVI`#d`J155d~@=~vZx<&UWYd)$b0YhB@IRcm)m9m)`nE)%Ma``k~<4Ea)5YuP-sg4aB>##{U}`;0^CQY~0J zIANRmzEVI_&zw}>E##&3{L4f0@(t8}X4K?O`cn;l@2J}2K#(@h-&>oT>ZzSQdrqD0 zeNjDCIY8T2>|3?L-CJsAQ6H^z`$Ou;UHjGHvsSCaZvCMKc9ON|_I_G~{y;4^vyj&B z3;n(-1>e)2+o3K^|Mc@dq)4HZclxr);o1lU2^)Q zI>-AvwaD(>YIw|Xb+2u^8aDluy1l{{)%V1H^=#+uYSruCs0Yh^st(bARn_J%YDu@R zt4m95P;a%|qDtQ!CRZFOETOeMyjLyVb+MY&_PTm+#agw^tnbvR)r6!xyhc~XPTEm^{^r8FIa^Xe|NHmUflYo_>&`i*j&FNft?Rcy zop<+~`toWIZRE}UYB}E>>hMl`)VEIPn^dV{{8*>5jX z@8$8+N_6wqdN#_h-BrD{LNiCG_9_8dhT^ByAL^xbTYg_X^ia~;9Prn^tN)u?^7OCj z&AfhE_0B%JR;%iE^>i$=yNskYjeo3Gy&9m6s}ZJs*!7fJu)DwZo6oOmyVJ$AJck0c znFA&5-G@J`)o%Z)hHpKi+JCyJ)-I8yu6-P&l|6b(-P!LCwda{U+T0~a)$(`BXnz+d ztWAFGr(G;`N=>VMTV1m=NRy2)t)Y*kooahXy*AhLwi>a1*a4qBvfguReKoyNH`V)m zn7XKEoGPyf)Bcp(saw;UshtBxstddas>eN()aaYd)#=yTse@jBN&TteRCU{fRqFg@ z!`0pIeX0)2tf~&`n4+GqSyT0XW2~BXvW=SDV4eEtQWsUb&`?#Y%~7v!8KPeKx~N(r zvZeY{=W6QEgU8g+p(AzmQAjzpZ>FjqsPw)%d|(Il>ovcq1NT=}JAS`VZSOlm{jGX$ zwWog}wO6HfYT~O6)btO^s1*-Ks?(Aa)x&-IsLQtZRgEufYJ+RVRR1n>)n@Bjt6gVh zs(S_vR4e{4TOGZ#pZascJL{D^yfPIU;kWxzSHmZ?D;<_(KsV} zN*Vt$)&K2)IpXR6dd5!!(Et6CKLtR44t?$lKz|;C&IN$}Tt@v90Q&QpcG&>*ob&x` zQ)Amu58Lbi64n2$K>rYj^Z&n;Y3_0Kf3MWJ0?_}(zRnGR{%;PpTmb0*>S%u!fd21J zw`Tz8|1|o$CMk(4BwMB`l5Jcq6cq}Xs9X^ zm(ucjtP;fUY(*VORDw<+AD^TI@e|4JqV}xQzL~ls!Uhn8gP;2FAzAt-)0>D1|9Ayl$k3R4F&}%>}bshbSua~TU%b&L>%dTi;;&woGpiF$5H>zZU zGVuUl-A1!`?u|E-;;2%XealG$!RAqW=52?wI%@pyg+is()9y@9z zW!!PrD)I@v@CD-H-hRSA1jc3<$E$*JLvQMEqLuf9V){)*> zP8PrDDaB+qH_A5gkYZLgGs@a$-|)ShZ}Div7t@Tg6Zo29t7gXHruI2Ed^^}z-|+RF zam#m6tHpd@BL7SjQ9>1UVuzU}R+@A&$D`M0l4)@Md5>AgK{ z4d~MJwVt*HGvNK<--cJAFC-}GKFHlfeIl71Dsv$>KUhulq+T08(nH*#(U3MAj$*n<~e4g3?VC6pujhDN&D@ zqcDP~S3tGheNl5}1B%C>IH)P=SxFxbDmPJ3WPxH! 
zoN}xoQEw?qdKPGYWmNtNbyL)we&q(!{cO4&5HApPAH!fSDW9N&wm{P;$*ah}Yo;6% z!^TaNQplhv&7yRR!hZ%DZ=!90U+li9IkN$U2PkGT7=6fnF2mvj&_u?tlv0Ckpo#@e z6;K_4cM}DLCn!d-P`4oZaE8TfqHR-^V_$*FO;L0D1%($VdNU3Rk#hi3WhH3FK@Wpw zKQ%fIy~r{$&wUgW-k|72Eyj_%6|F9Obs2YGfA9cLE|P0iaemG&FK4s=vI`f^iXAL(=kRl_BYg$yNQCr7Ze2&j2>&) z*K6@ck4n&damMvMTw%0}H?Fg1ta2Ewd04Pc(8rNSW zcPmccioBR{Q^$f+7$Ua~KbYi8`7dxk1$C)aEL=XMt`i^hlD?gRe=s zi-IBu6c>ng6BJgogP`IYW>Hh)j2>HE^k>TdsaTb5qg0%FcGX-3!`mqu$>_Ihk&Y^VVuo`x*3c;E5F}}!gUBP>XwvZ zwA}=l##AN(nqr?B=6AW+7V73t^ViTF)NwG)^T1XTYM))e?}EM-2b@0}S4L8k7C*9j z0F8xKKyEX$aTB#?74#depWOyzE5dF%wLAzGfvgtTExEhs^{aM|LVm;b+nUn6``yTq&?e$X7!s7*ouThESNey#kIIoV3Nt4=616WVHfhRjc9@YfRpoXg(6~9e zHjIwOT;Vv!=WpqmD@Ji7TE+;?D-?vY4$$}?O%xPmK+%^u!3#bj?^1&y)F2&HOWhYWXEvZH3yR9D7pKX& z0BE{{rb-i~e=I1Cg2AW0G!p+ew|DdP{ ziW_mpTt&7;RgX31a=#>MMyyd{Gjw{KQ6k>M7Uhv(%w^qh7iG6y2^2jwV{SMotZ2nm zqXaj3qkhti5(A)?+)We|l|d0pH16U>6(#yva=)V*bMwK!mS~hH28sy@Mu|bx!c7zu zRX|Y;6g7$ZEj=*@nhHJVzNk5~0Yz0%EN5J_BIn=e|Lsr-o@<_{Nzh@S`mVV#x3v2x zD5`xZ*WstbpQ1h5(`{6ao>3DN%Uc*L-X&_E6l29(9#XVQ)IfMQN4NZgq82FL zOfps+BG->VGn?FZr5ROQL06_2RnLQFUxu-w1H79kC~AYk^8O*USVJuu(i7W=elgXk zI>&v{t*k&%2NWgAxew9OsljHV*N49kf0yxb+C^@)|Bops>Vo1tIrEOZ=&qnx0j=B2 zsLGwI=rquj1l9jIYEFNks0WG;M4L?X4;dE`(9Mh!g`QcSWK<0Y%>vrF-|8+3iu#~< zhjFls==Z5f9Mp>19rxx-?IzTHZvXE>WxX)r`93WvN<^1fwo@ z`Ko=JWYjI~J}Ot6(Et?TvBtVB9#5j(jWg=@1J#G1S?#{)R(7Cx2^2MnHj$|BPz#as z>I7q5Lu#=s-dMLCbd%DIbvz&DE((fBP`pC)-bA&cy_#sOTLLOiYVwEsqFeDm(GV27 zG-KTXat+jzHrc3W_LDw51k&29X`5F+CCnniZg% zCu-!L=$2nlGy+8$m0v_HS~NB4^4UYR3~Dk4RBx!px<&A-nj7m{!n=urqA@5=H8s}p znL~CIJ^@+xH>irxGd2Dd6bdMQjWX({5|v$V-FA$F&5BVsAJIqCBdtMqINqo`3Avjn zwTK2qJt}{SsJ)_%x?6~Ph8lQ-%1u#o`UOQ3P>iAm-w<^q<6;D8;>dj(Xd1AZU7!Z{ zlZjnf+7|aB|-5HIXB|GA=ZnQ@QBh#oKZ&}Y=&w?X zHk@O{s@0Kk5rw=pJ(LVOH&IZ;f#PfC#z<<>A;wr&ifBqBV_hNlMa`KFDB?j85vv^6 zJ#BG2+ zO#@YTP`Np}(6kqLheJG7i8WZ=z&e_*YOQf#SVrC4*<6;(nzjv#7}pP;ti~ZWT2-0h+q> z7@uIdi&BeZPz=+YmA9hJh7X_y`>Dl7s3mt3mE*ol@w55rzeF3UFYx*~xv7!LOZl~( 
znsN9CcrH*<`6V2!W0G+=kNc>s&1gk2B8mEo$CGGBRpapcMC}W04=VSgcoh^~dXk@gE#?Ot?hnq`VG_zF?9-31+Lms#c6IUd_St=ruaGiQk5=ZjHt%0U=Le)UaYmN z7c@53(0a?Z^4;SNtt#|jb3^Oz={72tpV1W*H;Jwhtq5q=K=VLj-4``yR-kwb6n~KW zeWJIAP9yr!7-JXT9VkC5!PwOTx(AfMBX<)8MK@4XOEk3ir6*AWKyw6CGc{wE;lAir zR-ot(igv1@?IG&c1ViI#rttR zsv-Oq(7f*=cOQx#ptw)8QA9gQE%+R`d@p*UJ~iQGRJ*=|_GDaeZRsvbEqa3D8YsR6 zg%xc-Xn1O+{6uIi7rE8`Kc=AQ|TDC zL#NJ?Z81k+`=GYCnUT*vy|>?Jk6M0?^P=6>$1gyiIsu*E(u}D=m@Y^&@>PTW2>-ID z`{*tn;uimw-nRSq^&6x+f?a5COkGdV9puv<5QUPAsXQ#_b})gyc8`8MCO!2G!DNEZ zrXmwljC{wSFHwmlfU$UY(fbebI5#aH+x`0c4c4ay5_EN{k?&9FUBK`?$CwEz#?(6h z0+=@+*gXb_if9CTrG=62E_4+j-h_^4jQkGeQ~s$_sF1s;B!xH!$xr4t-h0|Z2a3vw zpuf>gC80MNBg5!6cY(0{&U5^FB|j<8fPKerfLywX^>xrJ}6lUT!h)8 zBqfvk@+n)Pg9zV*uzhg9F;NMe1$`Hu7d)gC1;!}!79>02HY5D!@KMc_z?W%1i~Uu? z|3v%G$lG9l2fi!qJ>cu$r#t)w^m@v`$!(#O1%&6P+*3|Kxd2Kz2wc9gld=k)FW#rT56_Q+rEvEu zurxg1OU>kBF=Zuic|IoP1=bj~GtYN&R{IfFJyy@EHll{$jiZAR@^~3WPpN zH1^j+Zci|hHbN(%zXAO!@HP)A=4Ml4|1qePY9!r2e+_;-J*6Ts<8d?t#WKx|Bz_e$ zrUnL6QQTy|H`H&e{%NwYKLT-H`~@Q}+RWI$5C7MR%LkS*?C>2zgfpPs@Y5a`V~Fz*emwq4dGI=hVJuvn zZTHVYIETiD(8^%`8fW84!UJamFs=+0{QI>34qq5&d7v`9C!pfd--11#B*jc5_+|J- z1g{1kOH$nYh=~U5VECVi$IH-SKBt{mE!eY$1ENm6vA;gTYGkk*=Gw`5W-X;EN?0`vc&+ z;um{+mJW!q$x3;4$&I!pDdi(2Tcf+pmGV5}+b9b;_j?<)gN~$qVKb%t4CLP=DNEji zkAwz5AJcvsdMQy^G7GvhMOpF}de!ik5Be+gICNoiWy!D5@%Z7n`$jB~OJ4D`>5blC z`4aykJe>4xCy`hU9sh25n*x)xHpllpQ-7nm`XBamDvUH2K8t@bk5g{Zn&!oS*H9tX zC8-J-Hw4)$P`}mv{Rw-3*lFg>0xLG{A@-3B77#iH}qZnPfk@2-N3!(undhX&=9O%(V5k2~vvLPmA;_N}LI)ww z?>6e$Oj&XbK-I}G41|GDU--391Nt(ViA*+wyd?BG`g@_np&ik`LnSG;fo;u3%(8f=`>{tTkl45h?4?8h6-&H z733BD^ZLEtPjHKu5vz9qGhz*m>cv1$LX@J>fSt|L556}0 z)Z*8eqko}U8B2fa1E-xueW{SRd-m1pY_(x#;(&{UH1$Qfdm%={lV=n4VZ3!5|E7 zBAAB4T55WdnrX<{o+R$R?u35DoX8gKY?ffWIqWUKJPYUe7nRpZdtY$c9}IBvR>H43 zrUyftt(@auS+iy*&E9uP`5|h6Yy(IKzHZM$BZ)Q1WIaHq@(!5^0R4zcR3)QERN`0K zKPKaEXg?z=guFERZ$KG5=}Vz9eR&t!4!>iG?Ef&(#Xu9-_y zpqXi2b1Ag=A03%UQblt?uqtNp56w*GIo~rS4KSCs9{6=kZ7~|GYz4JS+u)W{UJ*b% 
zK9|@K#GOD+m!;<=(^@IKbOHH#_`1+{$#f?P{Dt0Y=p{h+A}@mda`@%YBG3)-%CWBK z^+;6obUqGA+{9~luAo=|1-dnnv&ykL1n7d|cIZrg_)-9~hIO=gQDgP_5;OA~W>;Ez z+s&oU8Y4&fT+;Pl#9iI#c!oK_t;|Jf#6M4deN)h-sm9JRl2qsG=Ely~ph1boPWE|q z%D|tKr8iuamF}qmyE;u2Fwd6b$~9wxI0RL7&pYpb*PyA*47C!`I$#|PpC1%; z;1i(?>pG{1#`Q>@Fz9^zd_;>5b9fbRL-BSI{w(r3^kiZ5UjzME(H)$kPeABkN$-iUKCW>PJ{xAQMvSPpXm%m+Cc-3pFlXf=g?`R9}%S~D= zT2Z3VZ(;IZH*rReX=RfH2r*Q(i>)+PtEqGrhY+_?q4P*9j`F_+ka>~N2U|Zmcgb6 z5IE|4!1Q3XuQxs0%=T&~A}g~D)6YuNZY6s53xE4B*27JU_9m@jYF;quXQt_!=HQP6uVaAy4 zF9<-_?)%#dJKRtV2OVoRdqC~8ANV&ctC@x;&6awmW+#(=Asjt?;6FMr51IvlG*2MR zeCS`yYreu?D<1k+D5daDpUBZ*3W6RZ)&)l2P19vtFxvlm=s(Gcg~SZL(iX{JH7cQ= zS2WLlw*c*#PoFU>+;3O+yd9olO^J&BeVdP=}~_{e`%3q=Hfon`^1 zAL8QB_n-nyCGt22{RXp&$R#L`M@6=Satk}M0(PO$M(9_8UU9g6>^~`5xrtgIP?td* z1{%Kg6{%rX8Co2-2Wihku6!mDY0cn@$RlZ|7KZ)%Ea!TW2#R;zht)Q&5Zt;UE^d(E9{l&Si*@tsb}s{D=P znWo`z(~+22yfQ7avMHx9jsS;DM`f{VKszJK5wM!sKGo#MnE-m4{FLR6H9)Fwt_2|; z0i# zZlw1Le&54S2K+HmnD4yK|jV6^@XhTPAH+y$VoFziPDzrrg3 z;~F+{4ef;mC9o%SF_sxnfAslwVPp{!Tuz`Sc;1WrFzub8oWG3Yw6_8X?^uZ(g#AHi z_b5eq8#)*MSJ`II@(GyI@`z9dUJ}(ChS3)|tB=#(P!(E?s)<)?6d+g$^rjKKEP6L^ zB3>tO8M}S(g9&D*y`AW^3wZ)&>qCIw4~B@AB8W2KUpF@-+MlA}@l@q@VlM<6lB5~~-sd!gf?f8dK-GLZ+UYZ2PT z840wJ%di)nc?QZYpvdFsiOt9avh0B#XC7q`{9{4UW5c8r#ZM{p=`VYhf53=3U&B;J zxf90KP>#nr_W~k2QF6X59XS->8vGA{TneT075f(f0o4mDE%9{;-=aQKaPuEfnsoMT+I`H599(SdVxCB-0Q1Z}{zsqsyWeDgt?PF=!qJ229eGF*L8oT_xO%L9Gqc2lJEAV#o(VN5c2P>>c{817SV{ei`i6 zkfu1BgyQZJdSTS&E_?&vN0ggj{04NhsMU(#fa>fmPX-6<4s}4;iYxA6MmA^A*}n)0s1T~`g#CivP9|bk zA-)z8z)tiLKEz~t0lI*Mu0e+Z@I`1K>fED=yi1uyzf6MOhnB=#yw8Sm8K0ilWg%PGn76d&@Z4t&@#v?(r%Y<(QTq6L+Tvr0HyQn6FDtjENH#q_?4S)?SGqhDLyLelw10j%$siAy`A!Tp-!4=-qLDk-scIf>XcVI@AUVFtwPX#E;M$1Yd}ZbuyFDYX@C`oYz9c2a|ba?B2(Y zRXx54dRw3c@w*;*(H2UdQRt0;avq64i~T^xQ~U(;)>y20XK9$96KOf~Cc#?smUG+{ zC-R?5I@>)9IH{!{Zr-%}%qf-zo^k5`XjXKTc{juIcZ+!BFG<7b^ZX#{1G0_K0m;fO zCR6-1c=mAd_sQe}^d_0Ha>t)dQ*I@rcN{tq-bRLBKu1!kLdcgRFNOU~^qI8r>ob&F zZ$Ss)?=*hOvIGP&6|XS&0EgV=558bm7S>!eVGQ9v@K_zCQ|0#aF@n=K+1@fkheoi~_ 
zn{^zlLSd^p3(g&IW^gXhnb8LG-kLSqUoYsCTNSd#Kv(nT{Kw`kH>;9X#rB!EW35?X zjTyZN_r=7&pHme70kRY~r-kAl4BeEhj9=s-#WhJ$#@CdjxZl#0@%ND5gYxKIoF{rJ z@?TSx@yFp;qZdKD)Iu2_gM3^|W&F?30q7qW)mj^wKC{!x0 z8hj^DDee;}H~!*+K=%^-4*W!67lr-1=t(UVf364k;zI%`!qA#UYe%Atp!IFEQ{eBB z>xWQo$NO_HC@u!C`DovbmsgRWqn+FLac^O-LGxqZ9eK}mWju2*t~7oM!vBfhSnNf% zZ9(OYoLa;&%KX1|phyG-W6YnmE-n)tK<3n&7s{As}0Q+|GLQM)6U1A zajZN3qM+5VACI1h@;!QY@pBOV8_-sVKLamnGZVD);rRe*{3vJw{9;+uq8TU_r5PC= zJ#4WXQjO!BSz;%r7{^)7ViQ^z8B?MCQjCm?v=4#ym2I(Gp`DDW-Uw|J*d+SsqqYi#j(v6HN4iue1v6 zcLFlG54#AO6e32c{eSKc`((E`j(<(F=rD$N$^VO!TVYv1Ci`l%~`g;bBYe4&|mvGHc|TI`H+Owc)3>P-?vnEt#sU zVdf>jpQ_Ya3XMj;33}=9Txus@&roW81^+pEFF<>|tkhyuCrio7nx8#w$=?x=nV3Ag zg|g-cacMp~V1&MhAZ$*PKOjg0l-|VIdw^mguEoY9`6AA)z#mId)-;C?CulqL>fuBL zokIKD$VWg+igxVSfh8Y+XG4OeyMdf3lUxURH{>Ufvvsp)&0$Qed4PpX3sW<> z5Qt;yDG3iZ>&I_QPv~xQwrggm0`9t^}#3_MZi+JM_#fa&t8J*dY>oJARPaON1 zJLo2|V=p7EG-;Z-8>(e~1Z%Y)cdALp%xjD+bGK#beQ91(SnVCn{iv^bjUhXBQ|ujk zgP7qEvy)7dK(-mWoecXxH-eZ~GFaCgwZMxYP@W8*#w+ikT2= zTzZ6jE?s>YM-LeSqChvyp|IGzHnMu`CG#_DtEVSe53`t8iw#WuA?7tuX-DGp+#lx9 zdDkqQmAOT&A!H4$kIg{81DHE!G4CZAm+}Dk5S3^NZP?Vf)D3!!48Mf(YVOY7P%jcX zMIyIo-vp2TrH49#0T}usC`2W=WQh5!nbCOwd=)Z}gx^95AE8$lUV`36?+e=Jnq&EI z^IB+{*~3;>T0L%!ac9OnX=Q%szK$+7uX>z)YL(O)^j0aYA!ALNE#~Fr3ji0hu_}O* z0rVbCJLV@Q#ozE_kaH^`rU9kpCRX;W_cLd`mAN%YYnpcwV$7?_%jQ@(Yw8~h zGV`|t5oRf^F|o@W?baCRYfk%7=4EDUfbrZ=%rJm;1h~YgzY4I$WO@Rg9}T#~mttaO z(bpy5^HHf^p}#gaF0pRKoa48_FU>>1W1F3M4_(YOrpa^+n&Ge^{6l6#K4<`r>mr|v z!xb3*LFv~Zw*Y@*u5TZh)7SzUYG!QB4r|O<3F(#tYgAhe)&gji&|2`U09Gltkx>Rf zc!aa_Q0RFQ>IlFe=~8y{F(1%XC86DEk3(Jsc^Zk#Cc#_W^@)imGj8t2L{kCogvIP) zaO{R|MPUzy%i!6t$J{3KCGZ~5pP=26jn1J^F^a`MYjCzP*FkHvTbWyf_%m}XoOX02 z{qJu+iD->>tIw@L^qB7=ihow=QUjd46*v&!P08#VGM@tQiez{cV27cXq0x+yEaZHl zu(Nm;z9F;{^cIP15%Zr1M=!NPk!jI6ABr2H>!2HOypYU4g^I!CO$jMkOelOG%aRBn%Md#!28<2~^evt}@bHHn2{;%baqx5xWGLEKXqECx5J=nX%*ce=5 z7Ia5m40?c*A|F}^!%Ok-i3DbE6l3*mKJz4HRpewzXDzIRZkS8_MDwIq)61OyKbj{? 
zD^sh42h7vGHJEai(Asv4Aj59}#!vlT;+i+c8^EGVD^r1Rcz>urbO7xE$Zs;J7f`7H zol@{5VoYZo^h3dihW<3iT?03nex>X9RWY(y+GPUNtRYHr84tCb4hIy0q0l>w6<^qZEc|}Y;=r8cq z0bT>1r(G`bC}zwM^yUEA+$ZSPAOGLf+UHnZHrrhLt$mPn-rsCaUnhL~O4Wx8LO}We z%8wHjyZo3;y6-w*M^J690%A%)+d%^vk9-F4|Bcn!B$>2YjP-1!)jd|PSSK`VW{N$C zc&f6c{#Alfo59<7Ahd)mHNMtVSv%N6YTO_}S^FpQ%JE9=H`m zyfU;K6?qxj37)Hf#ym<}djWdh*xg0`HFknOihK$*m<)M}vN5k8TDt>%UW8tII+P05 z_Cr1z{tMw3{;nv!HOH(o%2jjqu}0h??mLQq!QoD7jXZ0lwl}xh)^^HT#lp?ah1Ka+ zg{=*!uy?FmqAz>Uce_E<5M-@D*b$_0WS$JbFfv_8J1@;%8-iS>oxW?_h>Em_7DX=- zdII??*r%bt6FLa~D)ybQ<7-Ea3&VFnZ)u#ewlw@lJXN{2Hv$W>HB7C{EV;Eab>=Tg z)-{SXN3F48EiP6i#+aXPSaaB_h^6;ImhQYcUYE?5fhZu=T0-&;F_&a<$eOn%+LqL8Ax#;Ul@H}H&yC9kPlF1!-M*Z3;wDXi)!F7}nUZ=C?HqtTAL= z&smkQC`~GyP|e4&uAH?_AY}I?TQ_(WHqC`;-qc=y*aov{|;l( zNfi19g@&WyYU*E~&p zdEk%suZiDKG&Vihf68@kGHCxNV3_vb0at%bf;5wbLiLEE zrLe|zEvppeTDi(Is$w48sG2{jw))1g0nzlagaJR zGn+jq{yy}DAg9XNFCc#h79R!6!6(3(@Yhow#|hc5kv|y|7QnATl}(3UhXvO|&(EiwL;nrD>@PE_?xc{$Z}n(b#G176++2Jwdg7$)!RXl? zyc0c*K6P>E%iwPGoB8+j`b)L`WM7-?gneU2HKd+b9@dO*dJQw#zOFcu?_{*%D|>Di zCxc%d3AHVBN@9HXj<+^54A#wcO1SkR0=yb|qufI4{ZnyTD+s%u4w%@B`w|kWS zs@8O9{HN|X5;t=J*~!X( zYGOc-Bw~Pl^Se|(QrE&J%mLe6sm%T>_deXK<@?PSi*fjb44J7 zH+VyzQI}PbXHYKT2qbin{F}fd$gztXVnLO`$~{zx92fCIzvH2nL({L({8IQX8dpc-ozyX#ouQAqI3*kgg?c0R zGW;ocj7vbFyOHYzJr_QAb3=DQKLS5axvk;H;J*i5!SgzU>BR_4k)Z!!1*Z;jIbBOWU*rA0^2HzbziCu8bz^q9?@`q1a*(aR&>2|7QD|sBb>0>EUy<7ZJ&9J7PI?ykIr0>@(L zuicysHq1i2u~>4pY5zClIu7kZihdK7=HFs#nG{TM)6x_RvRqNFbhU;T?NU=7P~8u= z9-flyUugfza5EY;LXqiM;xm?0XPh2}E~Cz00UKlSI{0Xw!Igs0>$y(GPAuF_|4{>c z4!NJ<$4NKz5Td;nM4F zPpF4e^vhDkkUXi@qE@8X+GKk|gSs)LoCzD6k+a6LeAP=>-SLa;Yf5<~JxN?V`Y`3)Y zmea9}li<+9Xm~N$6Z{E_c>6jTq?cm}dzYcJvE(Y!H_#8v#4?8YlF8r*EI0CFB2Ec6 zcSFrE>~-kslw-z(xA71+wL{PHoL4Y-Rxt&e;qn&*XCj~;OYKx+fPz$`pDaT>4U?2fLBU8fhb%?JqWW0((!>vb#5&cKe{5}+}fIka9$JC(> zSoR?DeQf#%q_eLWI!Zs#8+i|{$;<<8g#T~yJxWY`i2NNqhp#*ohuTokKoVZXrHe^W z-AZ%Stzu!9eJ!=O9q&`^I4w;c>1pQ^YBd^N6%&e)lsxgT!Q^Q>E(H8I%)&tE*pNR*|X!$g;zRD1HI_f3T~Ysk!0co*`Ev%f 
zbB%JUJB@z&@=Rrw)Fk$D6q|=)><}e`(I^&yW*E;f9nDOz4uG{xBFvGY|JkrpzMo6*d|oJ=2|cF3DWA>N^>u|#`FQBGe5ag) zb$_>fCwu_89mr=x(>lY?L!UstD#&*=!gIN6zmjt5lHMfjggIsQ`@vn5mq$JhXZ?#Q zm#Z7aezh9SFEqPW>}d9_{za`w`IN36STiq;Me3)Nu11y`&51gO{je8}2cQ{KK>yun z#+=f>9X@kIe@&1>760$xYgl-7ffGIkosWDY==<=@+vGDIc@~^O`W@g%%9(*&fP90I zdmOo5pa;)z4)5=ZN1D>={E8*bsx{wOY{v|ZzKT8dC!#f0sZ}1bGcD7qObgZ+qkMW^ z4jaJ9R@VrPxzgzW2#u$LVKij^;itz9uSDbF@GnQvPSo)%a1IzVieu4J$W6fVpTXJS zeE8KsZk6~SB3~M~8hQE~|0d))TlK$1`Se%*(K$|Wc}E_oI#Lg-nUv~MnrmnnVQ1Bv zP1ie=vQ+d8yDm`=tX8V>i|n$u1+rhty@|siL z(qdfHi9giwsb|#Kqgk{p(A=@-zLdq9kH|A_W~$FtzG-&dqL`>*M`f+VinUnf8%X7Q z(9{Q-S6fz~*muwkv7{?>V`8B#n#Zv44X`&Bj3)g(=rQ0@R-=fi&#WGo7ES*_Ur1sRy4`L(W4hb51weXHFon$;gDHZ;b_GL@qaPpIeo z(f&0#4Wm&L6zPcvZl_9C;{irf|7R%3T+`17L&`aJ_P2q)6Fi9}{~>({=s?p7!sn8Y zkGzy~h0^q(;mbh21`}?L+&=is!2HJ3UqLTN{yEeB+fpFQcE2>1LTaa7?5bLxP%NnK z)uU;DLY~v8EL!Un&0kE6B-{Tk+Z?*0;TtHj6^)rW`T3$x*jUc~K-kn-CK|I6Gci#9 zB7D}A;d1CbSh|7qMctipFUUHj{8#W(JhKmcgZ$kmyuc|oLEvYR zTHb{Oc|cKqiG9IDGcEOms(aO`)*p(ESq)RIK>uq#B3f%y_k?Z#8s<%(eJ=_zDkkfE zG#Xch?v4fAq43Xy&urSC0hV9^tMqUKunaj~>G99UvH_%@2et*jz++}ic%3S4N^G=7 zZXO67X6MDf%C!Gx2waC?H40FVr8<&js>>#JPfC`^6Pk0Xt|c!yjSB9CZ2p;fr}Y9;FNw9=A= z@|b#7jWwwlApRww#cDq+HotnzE?T%d3bD!#pTM$PQP9j>Ognx6oQs7f`ftP%?ojzz zV}^}oXMm@Hk6`(9@B`9wC}%CW0-8Om@=w5dl-rG~Nq**rVV3j$cSzt4w_ojCbt}y^ zQ&L^46{#HcC+i!hB%a%y8avEbkZk|zLe)%~qj@fh96>SD(|wG^CqQ;k z!gIl=u+YqGo&?8&8a@9829v|?pEJr5MXgrNn&BumjD@M28Y}|KS-1OHxrB$}OH+wu zAn(YP_r%#zOu7eT(>L7opQ-#;8n^L}vjeLJZiT6)vl(^?^<4GTyqoV)#2G)`Te~}% zguUA_&&k}8G0siTcjAmP?x=hx^8?Ud;AFCQ<6hat$?Qq`J<#+xZY6jY_!4qlesW(3 zJDI_>ac(d2UkJ@tr!vn3p9Oi{(tR4agV0wa7hvA)%Eyn}J)Lqp+iYt0eEZlv+=F)g z9^w9+0DcR$EOg>V!?U5U zfW80=e@GwaE`a|I{D-(v zsLO}F;yuVW7Eg06%w>-#ZrYtY&F*rg*{YL;FW5btX|}H&W<4NF+uPl*VRqfW*6w&N zv-^1}x3w)N%kKG#-phENY=u#ow< zTaCKquCn_|ffN52Iv-1T^~jw7W+Hz#9{C-dMm}bv?$fzWCVjSh7|%3?zL)210zN;7{%!?9SHOI>{rj)DFLOWA;MSD^V#U|X;wd{!Wt^tCQ?vv@l!{~VgRmzxc~ z55GE5&PJxoN+bRlNOusw9{ynDS*5xz$jrn&#Kb)LU}>m2Qyt3kJMG*{t>AHclA#z8 
z%7cm(jaKqxnq8BroZ>OIIrK#%-n@62O=eC)<3(tG4p=+aiGK#R!g3Sk&vc_7K#tKQ z^9uMq=?@NK(NXBvKvt#kJ>WT%$DWbf8UsDxMAB!I-v_@T$QDnWl})kRga>^HW47IJ z>|=MiWr6Ctgg(~js`mSg?H^R9qH9<=8OC{c@}J60wR=vKlQBd6%jMsqDSNffBg#6UYq3Wo;H`1>=q;>e+#CZ-bu? z&FB^XH|1OhZ9JHRN8+aae~AY;6mq5Udv>jKo~>i`3)SsvQvJhH`+D@Z_KZh8p85r~ zmOE{_SM8@avOq#)vZ+HQAkVpTNR7ASalaW8uGm+(zWSG5N_itD6&N7U~wkKL$Pw|0m=h zfsTV_-JO|327)g>9mY;OYHhGRo-9_6^`IRi%54uQ#npdYV`E32P+ixvJD=(iRc`T6 zyXI0yae%sh8qN6tXM7?GG55+Gje>))&{*05i`!$#Xe>ArECbnPa<4~jCHc-GUw`Cx zf|r>5q?<9L9{E;)2jN#D$IeA&6Lz~@wl*@^d2pGaXD*~5^;AdfuC_FlXB5R61>*-& zb*lcNneFQpJ0-T&s9bp@6>G%*HFnt4FuM^=H=}7CEHf7V(MF_{H%5g`$(e zY~AY#r`cod=%x0*-d@=lic8th zbJ^U-FHNYd_Bqlf!@XycthxEm+f>BkXNvFDObws9FrxptI7+x+O zy61v?9Kfvsa-Gj*x7SNsbva{+66)^bVaLmLT{o}5i5fX}1fmgedZFXxQ_JoEyWQ}w@qH?vMxn3lDls+A6L)*DpgkHN$#r+bdYr3 z;C8oR!DH}QX?O>Kh8_(vafogNIVg-SM6LstJ&VVRIybEL4_job6;i^0eaC(3x@mXUt~G%*%!2Oi=Z;L$U1iRRhraWsF{noE7IW}&KMjnkqPJBlgE ztL`;UYt)+UP1>K9#tkCxr)XG#W|Ps3-JB?&)NlvF-voY&XR6@=c73C}kz0!;lTrL4 z(mRqqomRrdMYlhebFJ5{jb}Q8#UN4cjsUkKzZ+5RvX3glm-o1CXcG`)>=z_v7S?`VBofxm$&!b-@q(1zZ^@JfI~sPb702PWc0p| z=AXeQW}-{5#MHg%FD`&KmMw(dO!~PP#&uqoGY{|C;M>rv;WPR~x%_HYvtEyB|4h`Q z7r@YrKjr{S)IQ|E%hIom8vIHKVm$ZjDIzY z_tBht#qRAW)EvwtTGf~bq26jAica#^CNSGv|v>Os}>X>O+3 zr}Am6&~y46vHRU}8K*Q}Q@4zY(NhGahq^M++Jg4Y1!T7n{z4S)OCLTEK0Er+XTZLA zWElBsV;SGxaGwN!gXW9(-Ui?k&^^e%0r?w9e~0I6fPVw)ZZ99da0ikwhl15U)ox{h zEEcMrYbK%|PODdWLK6Xv0h$0Pc8c{NWn777&Cv8Db;#9gw;ENm53InF&%wJ;cmj&j z&Xc1YJNVxH(2tVNn!sfjH)_U`R`6d#o|TN3V@r1p{1woBOjVQ49gFB|#Gh+|q7cSM zBs9l0T4hQhMYZZss2v2&t3S2ly*#2SRV&h{D$7$nDeqnn1x-}n56%G1 z42b<9w-bt*o{V{TluPPnZzZ~j^wU5iHx;?1q~Acg4?GUv%#^m^G1EHOdGJmLN8y?G zdA>UchH+tCT(l2YX)I8?Rvl_St-f77pt^941*&U}nra=bE=ct(Y6XAHyGPk2j=Idv zXl_QoMkr)PJ2SK4RV=p&8eau&f;K(n7LXShOgm0C zu`F!8J5i|?QC1}FTyBrgq0~>?9u^8pe!+)akUuk^tA5$1@1I@MzU4^l_ z98;+&mA}hR6nv6C&UbrGqM%pCINu|=iGt4H@o=K->9ldaX5mBu_qu(2guj40e7=@> ziGn`J*Cxws-v)3v@{Qs9F@F?v zJ(Tzbnm6DJ+VSw~F`_p4^HK6O(uY=yr4QnhHm}3ahTjxpZUM8A|1a_vfh7hhm+f@l 
zD#{%}dDAJE&qVm{#z`!*eRFvJ`Q+<|Uj3oJr2K2ZBGSKxe@#K6jMcU831}a9l;>=x zyp_n+G}SSPPujdm9o|i*H6&6;zFH)5_U=n3^GPzjg~xk={JvJ%BCrk*eHFQTDU^E| zzC*}A2lmBDi^1NM_d2)==RH9=L6qaf+1CrB*6{EvF=hh%DEZkLi=7&5Vxke8WoU3M zl7lEQ0ewI2G49%cWM}XIIWys3N;BUcOZQWILU6IFZ(f8d#LCBDrK>WLp?1cWqh5%#9M(2_TU*!kiCaztmhfD zl!AuPi%CB~dNw)@M3+gV_kiw(m0h4s3os$_F;y5lIn*$?uxRv7Hq4)-1OVBtm4gPkV%wC%BI?83W z@0&=zZD2o4eUkDf{T@pnPQpM-D1xFPbR;2)$AZXo)+(CtXS7MiV7-zp;0 z#6clSYz3d;p&P*|DES4>*@%*1@Kd9l`LNbqp7B-dP)Kaz-Ja%E7vZbelDj>hwKV}X z&1B%gdnlkO*1inwpcn@M$?k3>3h;Tdve=Rlp3jP#=Qx!$(#HkgEO3grW)b94b5W$q zxZrx|n$SNMI+b*+!APD{^aXOq3Z0_&pxJ6HqT>y+C@Z344RVRPs1n?j?^NcK-Wg=` zG|2E**_3>23{<`hy%{+gTaY)KDyfgyse3#VFE8TiTyPgpqPYh9B7qZw?LjUF1iu44 z;5bUC0y4lB%_Y4T8pc+Q^fYSTmlR?p7(mjLz#^jPS9Ey{dMNZ7Fm`ezAzXx)gWN_c zdX{8foGrS62Y-mFouJ3U=gwY`9q-CDJb>9-A&XT67n6xw3ejv z-LRtBV0{$p0*)bHE9AeVT=t`bRmjg7L(qiqy`HIO7F`N4lFSQ1-uxvIA}h) z6#RlRet_MZXHk(scJGQlrp&7;a{=@jxlU1#XRmGlt9?J8zpIBl!)$OR=|=70s5zeU-sCxtL(jmx zHl!~{ZDZzn$Qymv!Dst6I0x7514p3;yQ;yXl+%p#v7|2q=Rjwp$2!X6BXYst>}2P@ z^yD8g?`Y3m#k^K%K3Nu=Or{xRdY3}q1;_BP3~&|3y+`4*q2B@LQ&<%W8$#jhNN?Je zrT{)f{s8zr`3E9rj2>zbd3wko)1*qeir_^#PBM^xhHhe7>1fZy0Y%20A5elZcNL9> zPmBeZVK&2TusIr>&l5S5sjNymcO#0p6%xDyYz1yWZYTIK8n#C6TFN;}zSntj9V3rh z883po?NS*#^nmBZ_C?Q-+>va2sw8*~Xj;s0iXQ^MfMPhU3l5RUU-?5HH4-6w`T~k)J5#k%2lni7fmYU~W|!2A32jO55>ka&z+% zSyw{86;5QaCmwi&^p?;JE?F26Sd*W~3L$rx^kd+1(s{=%z^9V3YJp=w!ZI+H@>YT! z3KLluAU6>HOjE-XJyYw2$i&{My%2ny#Mt~vo<^N>3KFGT5no90BY0eQvOfdv z$e}p+d&rpyzK@z5!v>B~!aJ0Z$Kzr@Px3?uWbysMKqstghSrnOnv=P}izL%H_{GRX z={cmI2i{J$&O9ax-4_LT4JGgnk7Irl=#3f=KzrdY0%yagaRyFIYHoxE+`0kcp}3^;A|8vBFEb(N~8x)fYZr|$yp7M3nNGP1jb?5tKbimcn^zfVyP*ItTLr3MKoQ0C0?VMA$>kZb0Zi8A+K-c0qA5+c&)ZvOlAnlh}`c&o+%`uOQ zvw@#U=cTPc7_1Adby>_NWINl%A=E#;;oR|7tC7W4elcF5(z-vgb8 zTX;1x&=dKqp!cF@L+C8*VHO?efEziQjh&jx93p_SOe_Lh5q=Bdn)u$t=&cl934a$# zwt)UO519}BJo1Iey-Paxrvr1L>B_G0A$KvfdETew zcj1pU1Mf7?)c#B`OS#b$I34D8gj-N}d+75hY!1TvvFsN3CZ=1!??>pEhA;|cnUDq! 
zL+?kf6LQ8QFM%8=2F{|~@4#b(ZtT#*p7~ezgOiQK^Z0;eQeZDRUFeaN{0dL_x@#)- zxAEi#@W+E3PX$hq?`G&5kdJ*bo!=qJ;wCY_3z;Z2$n>!^c5*tO_`C%Uok!qVR4|p& z6sPB*;)BqwC}1|}^d5mq3R+9LX|AO__--C#`VbQ=0n!&zVh%VE6R#qFT@YYg_(x3pTx@$W=T=H8SIZT3?+Y9E35x0 zPEMVX$7)1b8PfVnPvc>%MoMd7$xCQ#db37U2Dd9sZsIw0sJ)dVh7T>p&(HafPR7ee5S`)Lb@09lb!`;L9;CvJ2jhz9_o^)a46(@9@2@# z+eu_g%V`45M+uxCauO96l1@NIT0*m?uBd|?%hd=w+)j5M_APW}UZR2%C#Mp69rVj6 z`wi&?c!VQ1=ODDRu zoQJ0dQ&?9XG81|+h1aKMrgcq3&V_$~d~brRY$Lajel7W$kzSMXIg)X9kj@UJa~z`v zK;K9CgN!F0MTwaddO4Xo!^}pg0iH3IH^#F1U@@6rhyNW)@P!PgHy)^tddq@leDx(E*_8N=$XE0U7Hf^~qNR22rYcqnWbG@oK~euF*>e2+|zfcuGRJ}~6` z%0oHrc9xU=9QoMnFahRNAZG&BG<847AXc##+{eTAB0mEq`B}`Qkr(ltHdO0*raGQL ziD78Sp1w1fOjE!n6lyxFF(8M54r6O1gTk;V(iT*c{C~pOsd*^Dl(gaq5B-&g=7RM> z&gY$_SauWiaMDLW-vQ0``~SBRZ=IT0Z+x{E<9ysGHtuClgY+NUyfu4i^&e`!1?y<_ F{{eXQhc5sC diff --git a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py index 7bebf513658..a75e67933e5 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_runtime_evaluator.py @@ -133,7 +133,7 @@ def _init_runner_base_cmd(self): base_cmd = " ".join( [ f"export LD_LIBRARY_PATH={self.qnn_sdk}/lib/x86_64-linux-clang/:{args.build_folder}/lib &&", - f"./{args.build_folder}/examples/qualcomm/oss_scripts/llama/{self.runner}", + f"{args.build_folder}/examples/qualcomm/oss_scripts/llama/{self.runner}", f"--decoder_model_version {DECODER_MODEL_VERSION[args.decoder_model]}", f"--tokenizer_path {self.runtime_tokenizer_path}", f"--output_path {self.device_output_response_path}", diff --git a/examples/qualcomm/oss_scripts/llama/decoder_utils.py b/examples/qualcomm/oss_scripts/llama/decoder_utils.py index 5380ff5220d..184eb857661 100644 --- a/examples/qualcomm/oss_scripts/llama/decoder_utils.py +++ b/examples/qualcomm/oss_scripts/llama/decoder_utils.py @@ -317,13 +317,9 @@ def retrieve_info_from_pte(pte_path: str) -> dict: 
pte_max_context_len = pte_max_seq_len # FP has no scale/zero_point, use following values, which is equivalent to not performing dequantize. - if kv_io_bit_width == 32: + if kv_io_bit_width == 32 or (logits_scale is None or logits_zero_point is None): logits_scale = 1 logits_zero_point = 0 - elif logits_scale is None or logits_zero_point is None: - raise RuntimeError( - "Unable to find scale/offset. The .pte file might be deprecated. Please generate a new .pte file" - ) assert output_vocab_size is not None, "Couldn't find the vocab size" assert pte_max_seq_len is not None, "Couldn't find the max_seq_len from pte" meta_info = { diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py index a8e28f96b71..ce0b7a80cfc 100755 --- a/examples/qualcomm/oss_scripts/llama/llama.py +++ b/examples/qualcomm/oss_scripts/llama/llama.py @@ -21,6 +21,7 @@ ) from executorch.backends.qualcomm.utils.utils import ( + generate_gpu_compiler_spec, generate_htp_compiler_spec, generate_qnn_executorch_compiler_spec, get_soc_to_chipset_map, @@ -119,9 +120,15 @@ def compile( # because the encoder is quite sensitive and quantization can make it harder for the model to distinguish # between images within the same conversation. 
to_skip = len(args.image_path) > 1 - backend_options = generate_htp_compiler_spec( - use_fp16=to_skip, - ) + if args.backend == "htp": + backend_options = generate_htp_compiler_spec( + use_fp16=to_skip, + ) + elif args.backend == "gpu": + backend_options = generate_gpu_compiler_spec() + else: + raise ValueError(f"Unsupported backend {args.backend}") + encoder_compile_specs = generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.soc_model], backend_options=backend_options, @@ -131,27 +138,40 @@ def compile( skip_quantize[modality] = to_skip compile_specs[modality] = encoder_compile_specs elif is_multimodal and modality == TOK_EMBEDDING: - backend_options = generate_htp_compiler_spec( - use_fp16=False, - # x86 emulator does not support weight sharing - use_weight_sharing=not args.enable_x86_64, - ) + if args.backend == "htp": + backend_options = generate_htp_compiler_spec( + use_fp16=False, + # x86 emulator does not support weight sharing + use_weight_sharing=not args.enable_x86_64, + ) + elif args.backend == "gpu": + backend_options = generate_gpu_compiler_spec() + else: + raise ValueError(f"Unsupported backend {args.backend}") + compile_specs[modality] = [ generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.soc_model], backend_options=backend_options, # x86 emulator does not support shared buffer shared_buffer=not args.enable_x86_64, + online_prepare=args.online_prepare, ) ] * len(TOK_EMBEDDING_GRAPH_NAMES) elif modality == TEXT_DECODER: # compile spec for text decoder - backend_options = generate_htp_compiler_spec( - use_fp16=False, - use_multi_contexts=decoder_model_config.num_sharding > 1, - # x86 emulator does not support weight sharing - use_weight_sharing=not args.enable_x86_64, - ) + if args.backend == "htp": + backend_options = generate_htp_compiler_spec( + use_fp16=args.use_fp16, + use_multi_contexts=decoder_model_config.num_sharding > 1, + # x86 emulator does not support weight sharing + 
use_weight_sharing=not args.enable_x86_64, + ) + elif args.backend == "gpu": + backend_options = generate_gpu_compiler_spec() + else: + raise ValueError(f"Unsupported backend {args.backend}") + skip_quantize[modality] = args.use_fp16 compile_specs[modality] = [ generate_qnn_executorch_compiler_spec( soc_model=get_soc_to_chipset_map()[args.soc_model], @@ -159,6 +179,7 @@ def compile( # x86 emulator does not support shared buffer shared_buffer=not args.enable_x86_64, use_mha2sha=True, + online_prepare=args.online_prepare, ) ] * len(DECODER_GRAPH_NAMES) @@ -172,7 +193,11 @@ def compile( ) # perform compilation - multi_modal_mgr.compile(compile_specs=compile_specs, pte_filenames=pte_filenames) + multi_modal_mgr.compile( + compile_specs=compile_specs, + pte_filenames=pte_filenames, + skip_quantize=skip_quantize, + ) def inference( @@ -529,6 +554,14 @@ def _build_parser(): help="Number of examples in few-shot context", ) + parser.add_argument( + "-F", + "--use_fp16", + help="If specified, will run in fp16 precision and discard ptq setting", + action="store_true", + default=False, + ) + parser.add_argument("-v", "--verbose", action="store_true") parser.add_argument( @@ -592,6 +625,12 @@ def export_llama(args) -> None: pte_filename = "lookahead_llama_qnn" else: raise RuntimeError(f"Unknown model_mode: {args.model_mode}.") + + if args.model_mode == "hybrid" and args.online_prepare: + raise RuntimeError( + "Currently hybrid mode is not compatible with online_prepare." 
+ ) + if args.decoder_model == "stories260k": pte_filename = f"{args.decoder_model}_" + pte_filename pte_filenames = { @@ -740,6 +779,7 @@ def export_llama(args) -> None: def main(): parser = _build_parser() args = parser.parse_args() + args.build_folder = os.path.realpath(args.build_folder) try: export_llama(args) except Exception as e: diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp index d8d82fece33..9b8cdd7999e 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp @@ -210,7 +210,6 @@ std::string get_formatted_prompt( return formatted_prompt; } -template void start_runner( std::unique_ptr module, std::vector& prompts, @@ -219,7 +218,7 @@ void start_runner( gflags::GetCommandLineFlagInfoOrDie("tokenized_prompt").is_default ? false : true; // create llama runner - example::Runner runner( + example::Runner runner( std::move(module), FLAGS_decoder_model_version.c_str(), FLAGS_model_path.c_str(), @@ -298,26 +297,8 @@ int main(int argc, char** argv) { FLAGS_attention_sink_rope_path.c_str(), executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); } - // Using 8bit as default since this meta is introduced with 16bit kv io - // support and older models only have 8bit kv io. 
- example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; - if (module->method_names()->count("get_kv_io_bit_width") > 0) { - kv_bitwidth = static_cast( - module->get("get_kv_io_bit_width").get().toScalar().to()); - } - - if (kv_bitwidth == example::KvBitWidth::kWidth8) { - start_runner( - std::move(module), prompts, std::move(attention_sink_rope_module)); - } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { - start_runner( - std::move(module), prompts, std::move(attention_sink_rope_module)); - } else { - ET_CHECK_MSG( - false, - "Unsupported kv bitwidth: %ld", - static_cast(kv_bitwidth)); - } + start_runner( + std::move(module), prompts, std::move(attention_sink_rope_module)); return 0; } diff --git a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp index 29b6b9d7ddc..c9c2bd19940 100644 --- a/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/qnn_multimodal_runner.cpp @@ -137,7 +137,6 @@ std::vector CollectPrompts(int argc, char** argv) { return prompts; } -template void start_multimodal_runner( std::unique_ptr encoder, std::unique_ptr tok_embedding, @@ -150,7 +149,7 @@ void start_multimodal_runner( : true; // Create multimodal runner - example::QNNMultimodalRunner runner( + example::QNNMultimodalRunner runner( std::move(encoder), std::move(tok_embedding), std::move(text_decoder), @@ -289,35 +288,12 @@ int main(int argc, char** argv) { FLAGS_decoder_path.c_str(), executorch::extension::Module::LoadMode::MmapUseMlockIgnoreErrors); - // Using 8bit as default since this meta is introduced with 16bit kv io - // support and older models only have 8bit kv io. 
- example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; - if (text_decoder->method_names()->count("get_kv_io_bit_width") > 0) { - kv_bitwidth = static_cast( - text_decoder->get("get_kv_io_bit_width") - .get() - .toScalar() - .to()); - } - // Start runner with appropriate KV bitwidth - if (kv_bitwidth == example::KvBitWidth::kWidth8) { - start_multimodal_runner( - std::move(encoder), - std::move(tok_embedding), - std::move(text_decoder), - prompts); - } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { - start_multimodal_runner( - std::move(encoder), - std::move(tok_embedding), - std::move(text_decoder), - prompts); - } else { - ET_CHECK_MSG( - false, - "Unsupported kv bitwidth: %ld", - static_cast(kv_bitwidth)); - } + // Start runner + start_multimodal_runner( + std::move(encoder), + std::move(tok_embedding), + std::move(text_decoder), + prompts); return 0; } diff --git a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h index 888e9acd421..b714f737de3 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/decoder_runner.h @@ -8,6 +8,7 @@ #pragma once +#include #include #include #include @@ -56,19 +57,36 @@ class DecoderRunner { inline int32_t logits_to_token( const executorch::aten::Tensor& logits_tensor, int64_t pos) { - auto* logits = logits_tensor.mutable_data_ptr(); + std::byte* logits = logits_tensor.mutable_data_ptr(); auto num_tokens = logits_tensor.size(1); auto vocab_size = logits_tensor.size(2); static std::vector logits_f(vocab_size); - auto* logits_last = logits; + std::byte* logits_last = logits; // offset to the meaningful logit we want for prefill model. 
+ executorch::aten::ScalarType logits_dtype = logits_tensor.scalar_type(); + size_t logits_nbytes = getDtypeSize(logits_dtype); if (num_tokens > 1) { - logits_last += pos * vocab_size; + logits_last += pos * vocab_size * logits_nbytes; } - // Discard dequantization (converting uint16_t to float) because the + // Discard dequantization (converting std::byte to float) because the // relative order of elements remains the same without conversion for (int i = 0; i < vocab_size; i++) { - logits_f[i] = logits_last[i]; + switch (logits_dtype) { + case executorch::aten::ScalarType::UInt16: + logits_f[i] = reinterpret_cast(logits_last)[i]; + break; + case executorch::aten::ScalarType::Byte: + logits_f[i] = reinterpret_cast(logits_last)[i]; + break; + case executorch::aten::ScalarType::Float: + logits_f[i] = reinterpret_cast(logits_last)[i]; + break; + default: + ET_CHECK_MSG( + false, + "The scalar_type %s of logits is not supported", + executorch::runtime::toString(logits_dtype)); + } } return sampler_->sample(logits_f.data()); } diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp index e5c12068bab..7288ca5fbd1 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.cpp @@ -7,24 +7,105 @@ */ #include +#include #include + +using executorch::runtime::MethodMeta; +using executorch::runtime::Result; +using executorch::runtime::TensorInfo; namespace example { -template -KVManager::KVManager(Metadata metadata) : metadata_(metadata) { + +namespace { +void fill_mask( + executorch::aten::ScalarType scalar_type, + std::byte* buf, + size_t size, + bool use_pos_value) { + if (use_pos_value) { + switch (scalar_type) { + case executorch::aten::ScalarType::UInt16: + std::fill_n(reinterpret_cast(buf), size, 65535u); + break; + case executorch::aten::ScalarType::Byte: + std::fill_n(reinterpret_cast(buf), size, 255u); + break; + case 
executorch::aten::ScalarType::Float: + std::fill_n(reinterpret_cast(buf), size, 0.0); + break; + default: + ET_CHECK_MSG( + false, + "Unsupported scalar type %s", + executorch::runtime::toString(scalar_type)); + break; + } + } else { + switch (scalar_type) { + case executorch::aten::ScalarType::UInt16: + std::fill_n(reinterpret_cast(buf), size, 0u); + break; + case executorch::aten::ScalarType::Byte: + std::fill_n(reinterpret_cast(buf), size, 0u); + break; + // -65535 acts as the additive "very negative" attention-mask value; + // chosen as a large finite negative so masked positions effectively + // zero out after softmax without relying on -inf. + case executorch::aten::ScalarType::Float: + std::fill_n(reinterpret_cast(buf), size, -65535.0); + break; + default: + ET_CHECK_MSG( + false, + "Unsupported scalar type %s", + executorch::runtime::toString(scalar_type)); + break; + } + } +} +} // namespace + +KVManager::KVManager(Metadata metadata, std::unique_ptr method_meta) + : metadata_(metadata) { + Result attention_mask = method_meta->input_tensor_meta(1); + attention_mask_dtype_ = attention_mask->scalar_type(); + + // inputs are [input_tokens, attention_mask, (sliding window attention_mask), + // (input_pos), kv_caches] search kv_cache in inputs + for (int i = 2; i < method_meta->num_inputs(); i++) { + Result tensor_meta = method_meta->input_tensor_meta(i); + // k_cache: [1, n_heads, head_dim, seq_len] + size_t tensor_nbytes = tensor_meta->nbytes(); + size_t expected_tensor_nbytes = metadata_.head_dim * metadata_.num_heads * + metadata_.max_cache_len * getDtypeSize(tensor_meta->scalar_type()); + if (tensor_nbytes != expected_tensor_nbytes) { + // Not a kv_cache tensor (e.g. input_pos, sliding window attention mask). 
+ continue; + } + if (kv_cache_dtype_ == executorch::aten::ScalarType::Undefined) { + kv_cache_dtype_ = tensor_meta->scalar_type(); + } else { + ET_CHECK_MSG( + tensor_meta->scalar_type() == kv_cache_dtype_, + "Currently mixed scalar type of kv_cache is not allowed"); + } + } + ET_CHECK_MSG( + kv_cache_dtype_ != executorch::aten::ScalarType::Undefined, + "kv_cache_dtype was not detected from method inputs"); k_cache_.resize(metadata_.num_layers); v_cache_.resize(metadata_.num_layers); // Calculate cache size size_t cache_in_bytes = metadata_.num_layers * metadata_.num_heads * - metadata_.head_dim * metadata_.max_cache_len * sizeof(T); + metadata_.head_dim * metadata_.max_cache_len * + getDtypeSize(kv_cache_dtype_); size_t cache_out_bytes = metadata_.num_layers * metadata_.num_heads * - metadata_.head_dim * metadata_.max_ar_len * sizeof(T); + metadata_.head_dim * metadata_.max_ar_len * getDtypeSize(kv_cache_dtype_); total_cache_size_ = 2 * (cache_in_bytes + cache_out_bytes); }; -template -void KVManager::init_attention_mask( - uint16_t* attention_mask, +void KVManager::init_attention_mask( + std::byte* attention_mask, const std::vector& attention_map, int32_t ar_len, int32_t n_past) { @@ -33,38 +114,51 @@ void KVManager::init_attention_mask( "The size of attention_map (%zu) doesn't match with ar_len (%d)", attention_map.size(), ar_len); - uint16_t neg_val = 0; - uint16_t pos_val = 65535; // Clear the attention mask - std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + fill_mask( + attention_mask_dtype_, + attention_mask, + ar_len * metadata_.context_len, + /*use_pos_value=*/false); // SMART_MASK requires special handling of attention mask - uint16_t* past_ptr = attention_mask; - uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); + std::byte* past_ptr = attention_mask; + std::byte* new_ptr = attention_mask + + (metadata_.context_len - ar_len) * getDtypeSize(attention_mask_dtype_); // All inputs will necessarily attend to n_past 
and itself for (int i = 0; i < ar_len; i++) { // Iterate across ar_len if (attention_map[i] < 0) { // If negative, attend to only past tokens - std::fill_n(past_ptr, n_past, pos_val); + fill_mask( + attention_mask_dtype_, + past_ptr, + n_past, + /*use_pos_value=*/true); } else { // If positive, copy attention map from (relative to 0th input) parent // Parent token index const int32_t pidx = attention_map[i]; - uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::byte* parent_ptr = attention_mask + + pidx * metadata_.context_len * getDtypeSize(attention_mask_dtype_); std::memcpy( - past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); + past_ptr, + parent_ptr, + metadata_.context_len * getDtypeSize(attention_mask_dtype_)); } // Attend to itself - new_ptr[i] = pos_val; - past_ptr += metadata_.context_len; - new_ptr += metadata_.context_len; + fill_mask( + attention_mask_dtype_, + new_ptr + i * getDtypeSize(attention_mask_dtype_), + 1, + /*use_pos_value=*/true); + past_ptr += metadata_.context_len * getDtypeSize(attention_mask_dtype_); + new_ptr += metadata_.context_len * getDtypeSize(attention_mask_dtype_); } } -template -void KVManager::init_attention_mask( - uint16_t* attention_mask, +void KVManager::init_attention_mask( + std::byte* attention_mask, const std::vector& attention_map, int32_t ar_len, int32_t n_past, @@ -75,30 +169,44 @@ void KVManager::init_attention_mask( "The size of attention_map (%zu) doesn't match with ar_len (%d)", attention_map.size(), ar_len); - uint16_t neg_val = 0; - uint16_t pos_val = 65535; // Clear the attention mask - std::fill_n(attention_mask, ar_len * metadata_.context_len, neg_val); + fill_mask( + attention_mask_dtype_, + attention_mask, + ar_len * metadata_.context_len, + /*use_pos_value=*/false); // SMART_MASK requires special handling of attention mask - uint16_t* past_ptr = attention_mask; - uint16_t* new_ptr = attention_mask + (metadata_.context_len - ar_len); + std::byte* past_ptr = 
attention_mask; + std::byte* new_ptr = attention_mask + + (metadata_.context_len - ar_len) * getDtypeSize(attention_mask_dtype_); // All inputs will necessarily attend to n_past and itself for (int i = 0; i < ar_len; i++) { // Iterate across ar_len if (attention_map[i] < 0) { // If negative, attend to only past tokens - std::fill_n(past_ptr, n_past, pos_val); + fill_mask( + attention_mask_dtype_, + past_ptr, + n_past, + /*use_pos_value=*/true); } else { // If positive, copy attention map from (relative to 0th input) parent // Parent token index const int32_t pidx = attention_map[i]; - uint16_t* parent_ptr = attention_mask + pidx * metadata_.context_len; + std::byte* parent_ptr = attention_mask + + pidx * metadata_.context_len * getDtypeSize(attention_mask_dtype_); std::memcpy( - past_ptr, parent_ptr, metadata_.context_len * sizeof(uint16_t)); + past_ptr, + parent_ptr, + metadata_.context_len * getDtypeSize(attention_mask_dtype_)); } // Attend to itself - new_ptr[i] = pos_val; + fill_mask( + attention_mask_dtype_, + new_ptr + i * getDtypeSize(attention_mask_dtype_), + 1, + /*use_pos_value=*/true); // mask by limitation of sliding_window int32_t available_context_len = position_offset.empty() @@ -107,87 +215,73 @@ void KVManager::init_attention_mask( // if available_context_len is less than 0, it means we need to mask some // tokens in the past to avoid exceeding the sliding window if (available_context_len < 0) { - std::fill_n(past_ptr, -available_context_len, neg_val); + fill_mask( + attention_mask_dtype_, + past_ptr, + -available_context_len, + /*use_pos_value=*/false); } - past_ptr += metadata_.context_len; - new_ptr += metadata_.context_len; + past_ptr += metadata_.context_len * getDtypeSize(attention_mask_dtype_); + new_ptr += metadata_.context_len * getDtypeSize(attention_mask_dtype_); } } -template -void KVManager::update_attention_mask( - uint16_t* attention_mask, +void KVManager::update_attention_mask( + std::byte* attention_mask, int32_t ar_len, int32_t 
n_past, int32_t n_update) { - uint16_t pos_val = 65535; - uint16_t* cur_ptr = attention_mask; - cur_ptr += n_past; + std::byte* cur_ptr = + attention_mask + n_past * getDtypeSize(attention_mask_dtype_); for (int i = 0; i < ar_len; i++) { - std::fill_n(cur_ptr, n_update, pos_val); - cur_ptr += metadata_.context_len; + fill_mask(attention_mask_dtype_, cur_ptr, n_update, /*use_pos_value=*/true); + cur_ptr += metadata_.context_len * getDtypeSize(attention_mask_dtype_); } } -template -void KVManager::update_attention_mask( - uint16_t* attention_mask, +void KVManager::update_attention_mask( + std::byte* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update, int32_t sliding_window, const std::vector& position_offset) { - uint16_t pos_val = 65535; - uint16_t neg_val = 0; - uint16_t* cur_ptr = attention_mask; - cur_ptr += n_past; + std::byte* cur_ptr = + attention_mask + n_past * getDtypeSize(attention_mask_dtype_); for (int i = 0; i < ar_len; i++) { - std::fill_n(cur_ptr, n_update, pos_val); + fill_mask(attention_mask_dtype_, cur_ptr, n_update, /*use_pos_value=*/true); int32_t available_cache_len = position_offset.empty() ? 
sliding_window - (i + 1) : sliding_window - (position_offset[i] + 1); if (n_past + n_update > available_cache_len) { - std::fill_n( - cur_ptr - n_past, n_past + n_update - available_cache_len, neg_val); + fill_mask( + attention_mask_dtype_, + cur_ptr - n_past * getDtypeSize(attention_mask_dtype_), + n_past + n_update, + /*use_pos_value=*/false); } - cur_ptr += metadata_.context_len; + cur_ptr += metadata_.context_len * getDtypeSize(attention_mask_dtype_); } } -template -void KVManager::init_cache(IMemAlloc* buffer_manager, int32_t ar_len) { +void KVManager::init_cache(IMemAlloc* buffer_manager, int32_t ar_len) { cur_ar_len_ = ar_len; - const size_t max_in_cache_block_in_bytes = - metadata_.max_cache_len * sizeof(T); - const size_t max_out_cache_block_in_bytes = metadata_.max_ar_len * sizeof(T); - - const size_t cache_in_bytes = - metadata_.num_heads * metadata_.head_dim * max_in_cache_block_in_bytes; - const size_t cache_out_bytes = - metadata_.num_heads * metadata_.head_dim * max_out_cache_block_in_bytes; + const size_t cache_in_bytes = metadata_.num_heads * metadata_.head_dim * + metadata_.max_cache_len * getDtypeSize(kv_cache_dtype_); + const size_t cache_out_bytes = metadata_.num_heads * metadata_.head_dim * + metadata_.max_ar_len * getDtypeSize(kv_cache_dtype_); for (int layer = 0; layer < metadata_.num_layers; ++layer) { - // Allocate buffer for key cache and value cache - T* single_layer_k_cache_in = - reinterpret_cast(buffer_manager->allocate(cache_in_bytes)); - T* single_layer_k_cache_out = - reinterpret_cast(buffer_manager->allocate(cache_out_bytes)); - T* single_layer_v_cache_in = - reinterpret_cast(buffer_manager->allocate(cache_in_bytes)); - T* single_layer_v_cache_out = - reinterpret_cast(buffer_manager->allocate(cache_out_bytes)); - - k_cache_[layer].buffer = single_layer_k_cache_in; - k_cache_[layer].output_buffer = single_layer_k_cache_out; - v_cache_[layer].buffer = single_layer_v_cache_in; - v_cache_[layer].output_buffer = 
single_layer_v_cache_out; + k_cache_[layer].buffer = buffer_manager->allocate(cache_in_bytes); + k_cache_[layer].output_buffer = buffer_manager->allocate(cache_out_bytes); + v_cache_[layer].buffer = buffer_manager->allocate(cache_in_bytes); + v_cache_[layer].output_buffer = buffer_manager->allocate(cache_out_bytes); } } -template -void KVManager::rearrange_cache(int32_t ar_len_dst) { +void KVManager::rearrange_cache(int32_t ar_len_dst) { // Don't need to rearrange if cur_ar_len_ is equal to target ar_len if (cur_ar_len_ == ar_len_dst) return; @@ -199,75 +293,73 @@ void KVManager::rearrange_cache(int32_t ar_len_dst) { cur_ar_len_ = ar_len_dst; } -template -void KVManager::rearrange_key(KVCache& k_cache, int32_t ar_len_dst) { +void KVManager::rearrange_key(KVCache& k_cache, int32_t ar_len_dst) { const int32_t src_cache_num = (cur_ar_len_ == metadata_.context_len) ? metadata_.context_len : metadata_.context_len - cur_ar_len_; const int32_t dst_cache_num = metadata_.context_len - ar_len_dst; - T* k_cache_in_read_ptr = k_cache.buffer; - T* k_cache_in_write_ptr = k_cache.buffer; - + std::byte* k_cache_in_read_ptr = k_cache.buffer; + std::byte* k_cache_in_write_ptr = k_cache.buffer; + size_t src_cache_nbytes = src_cache_num * getDtypeSize(kv_cache_dtype_); + size_t dst_cache_nbytes = dst_cache_num * getDtypeSize(kv_cache_dtype_); if (src_cache_num > dst_cache_num) { // copy from first dimension for (int i = 0; i < metadata_.head_dim * metadata_.num_heads; i++) { - std::memmove( - k_cache_in_write_ptr, k_cache_in_read_ptr, dst_cache_num * sizeof(T)); - k_cache_in_read_ptr += src_cache_num; - k_cache_in_write_ptr += dst_cache_num; + std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, dst_cache_nbytes); + k_cache_in_read_ptr += src_cache_nbytes; + k_cache_in_write_ptr += dst_cache_nbytes; } } else { k_cache_in_read_ptr += - (metadata_.head_dim * metadata_.num_heads - 1) * src_cache_num; + (metadata_.head_dim * metadata_.num_heads - 1) * src_cache_nbytes; 
k_cache_in_write_ptr += - (metadata_.head_dim * metadata_.num_heads - 1) * dst_cache_num; + (metadata_.head_dim * metadata_.num_heads - 1) * dst_cache_nbytes; // copy from last dimension for (int i = 0; i < metadata_.head_dim * metadata_.num_heads; i++) { - std::memmove( - k_cache_in_write_ptr, k_cache_in_read_ptr, src_cache_num * sizeof(T)); - k_cache_in_read_ptr -= src_cache_num; - k_cache_in_write_ptr -= dst_cache_num; + std::memmove(k_cache_in_write_ptr, k_cache_in_read_ptr, src_cache_nbytes); + k_cache_in_read_ptr -= src_cache_nbytes; + k_cache_in_write_ptr -= dst_cache_nbytes; } } } -template -void KVManager::rearrange_value(KVCache& v_cache, int32_t ar_len_dst) { +void KVManager::rearrange_value(KVCache& v_cache, int32_t ar_len_dst) { const int32_t src_cache_num = (cur_ar_len_ == metadata_.context_len) ? metadata_.context_len : metadata_.context_len - cur_ar_len_; const int32_t dst_cache_num = metadata_.context_len - ar_len_dst; - T* v_cache_in_read_ptr = v_cache.buffer; - T* v_cache_in_write_ptr = v_cache.buffer; + std::byte* v_cache_in_read_ptr = v_cache.buffer; + std::byte* v_cache_in_write_ptr = v_cache.buffer; + size_t src_cache_nbytes = src_cache_num * getDtypeSize(kv_cache_dtype_); + size_t dst_cache_nbytes = dst_cache_num * getDtypeSize(kv_cache_dtype_); if (src_cache_num > dst_cache_num) { // copy from first dimension for (int i = 0; i < metadata_.num_heads; i++) { std::memmove( v_cache_in_write_ptr, v_cache_in_read_ptr, - dst_cache_num * metadata_.head_dim * sizeof(T)); - v_cache_in_read_ptr += src_cache_num * metadata_.head_dim; - v_cache_in_write_ptr += dst_cache_num * metadata_.head_dim; + dst_cache_nbytes * metadata_.head_dim); + v_cache_in_read_ptr += src_cache_nbytes * metadata_.head_dim; + v_cache_in_write_ptr += dst_cache_nbytes * metadata_.head_dim; } } else { v_cache_in_read_ptr += - metadata_.head_dim * (metadata_.num_heads - 1) * src_cache_num; + metadata_.head_dim * (metadata_.num_heads - 1) * src_cache_nbytes; v_cache_in_write_ptr += 
- metadata_.head_dim * (metadata_.num_heads - 1) * dst_cache_num; + metadata_.head_dim * (metadata_.num_heads - 1) * dst_cache_nbytes; // copy from last dimension for (int i = 0; i < metadata_.num_heads; i++) { std::memmove( v_cache_in_write_ptr, v_cache_in_read_ptr, - src_cache_num * metadata_.head_dim * sizeof(T)); - v_cache_in_read_ptr -= src_cache_num * metadata_.head_dim; - v_cache_in_write_ptr -= dst_cache_num * metadata_.head_dim; + src_cache_nbytes * metadata_.head_dim); + v_cache_in_read_ptr -= src_cache_nbytes * metadata_.head_dim; + v_cache_in_write_ptr -= dst_cache_nbytes * metadata_.head_dim; } } } -template -void KVManager::update_cache( +void KVManager::update_cache( int32_t ar_len, int32_t n_past, int32_t n_update, @@ -283,20 +375,19 @@ void KVManager::update_cache( } } -template -void KVManager::update_key( - KVCache& k_cache, +void KVManager::update_key( + KVCache& k_cache, int32_t n_past, int32_t n_update, const std::vector& selected) { - T* write_ptr = k_cache.buffer; - T* read_ptr = k_cache.output_buffer; - const int32_t copy_size = n_update * sizeof(T); + std::byte* write_ptr = k_cache.buffer; + std::byte* read_ptr = k_cache.output_buffer; + const int32_t copy_size = n_update * getDtypeSize(kv_cache_dtype_); const int32_t iter_size = (cur_ar_len_ == metadata_.context_len) - ? metadata_.context_len - : metadata_.context_len - cur_ar_len_; - const int32_t out_size = cur_ar_len_; - const int32_t past_size = n_past; + ? 
metadata_.context_len * getDtypeSize(kv_cache_dtype_) + : (metadata_.context_len - cur_ar_len_) * getDtypeSize(kv_cache_dtype_); + const int32_t out_size = cur_ar_len_ * getDtypeSize(kv_cache_dtype_); + const int32_t past_size = n_past * getDtypeSize(kv_cache_dtype_); const int32_t n_iter = metadata_.head_dim * metadata_.num_heads; write_ptr += past_size; @@ -316,7 +407,11 @@ void KVManager::update_key( for (int i = 0; i < n_iter; ++i) { auto wp = write_ptr, rp = read_ptr; for (auto ind : true_indices) { - *wp++ = rp[ind]; + std::memmove( + wp, + rp + ind * getDtypeSize(kv_cache_dtype_), + getDtypeSize(kv_cache_dtype_)); + wp += getDtypeSize(kv_cache_dtype_); } write_ptr += iter_size; read_ptr += out_size; @@ -324,21 +419,25 @@ void KVManager::update_key( } } -template -void KVManager::update_value( - KVCache& v_cache, +void KVManager::update_value( + KVCache& v_cache, int32_t n_past, int32_t n_update, const std::vector& selected) { - T* write_ptr = v_cache.buffer; - T* read_ptr = v_cache.output_buffer; - const int32_t copy_size = n_update * metadata_.head_dim * sizeof(T); - const int32_t past_size = n_past * metadata_.head_dim; + std::byte* write_ptr = v_cache.buffer; + std::byte* read_ptr = v_cache.output_buffer; + const int32_t copy_size = + n_update * metadata_.head_dim * getDtypeSize(kv_cache_dtype_); + const int32_t past_size = + n_past * metadata_.head_dim * getDtypeSize(kv_cache_dtype_); const int32_t n_iter = metadata_.num_heads; const int32_t iter_size = (cur_ar_len_ == metadata_.context_len) - ? metadata_.context_len * metadata_.head_dim - : (metadata_.context_len - cur_ar_len_) * metadata_.head_dim; - const int32_t out_size = cur_ar_len_ * metadata_.head_dim; + ? 
metadata_.context_len * metadata_.head_dim * + getDtypeSize(kv_cache_dtype_) + : (metadata_.context_len - cur_ar_len_) * metadata_.head_dim * + getDtypeSize(kv_cache_dtype_); + const int32_t out_size = + cur_ar_len_ * metadata_.head_dim * getDtypeSize(kv_cache_dtype_); write_ptr += past_size; @@ -354,13 +453,14 @@ void KVManager::update_value( auto wp = write_ptr, rp = read_ptr; for (auto sel : selected) { if (sel) { - std::memcpy(wp, rp, metadata_.head_dim * sizeof(T)); - wp += metadata_.head_dim; + std::memcpy( + wp, rp, metadata_.head_dim * getDtypeSize(kv_cache_dtype_)); + wp += metadata_.head_dim * getDtypeSize(kv_cache_dtype_); update_times--; if (update_times == 0) break; } - rp += metadata_.head_dim; + rp += metadata_.head_dim * getDtypeSize(kv_cache_dtype_); } write_ptr += iter_size; read_ptr += out_size; @@ -368,8 +468,4 @@ void KVManager::update_value( } } -// Explicit instantiations -template class KVManager; -template class KVManager; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h index 06fe88517a7..3b8e67dd38d 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h +++ b/examples/qualcomm/oss_scripts/llama/runner/kv_manager.h @@ -8,6 +8,7 @@ #pragma once #include +#include #include #include #include @@ -15,17 +16,15 @@ namespace example { // Structure to hold key-value cache buffers -template struct KVCache { - T* buffer; - T* output_buffer; + std::byte* buffer; + std::byte* output_buffer; }; /** * @class KVManager * @brief Class for kv cache update, rearrangement, and buffer allocatation. */ -template class KVManager { public: struct Metadata { @@ -36,7 +35,9 @@ class KVManager { int64_t num_heads; int64_t num_layers; }; - KVManager(Metadata metadata); + KVManager( + Metadata metadata, + std::unique_ptr method_meta); /** * @brief Allocate buffer for KV cache and set the cur_ar_len_. 
@@ -71,7 +72,7 @@ class KVManager { * @param n_past Number of past elements in the cache. */ void init_attention_mask( - uint16_t* attention_mask, + std::byte* attention_mask, const std::vector& attention_map, int32_t ar_len, int32_t n_past); @@ -98,7 +99,7 @@ class KVManager { * @param position_offset (optional) attention mask position offset of */ void init_attention_mask( - uint16_t* attention_mask, + std::byte* attention_mask, const std::vector& attention_map, int32_t ar_len, int32_t n_past, @@ -114,7 +115,7 @@ class KVManager { * @param n_update Number of elements to be updated. */ void update_attention_mask( - uint16_t* attention_mask, + std::byte* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update); @@ -132,7 +133,7 @@ class KVManager { * lookahead decoder */ void update_attention_mask( - uint16_t* attention_mask, + std::byte* attention_mask, int32_t ar_len, int32_t n_past, int32_t n_update, @@ -152,10 +153,10 @@ class KVManager { int32_t n_update, const std::vector& selected); - const std::vector>& get_k_cache_() const { + const std::vector& get_k_cache_() const { return k_cache_; } - const std::vector>& get_v_cache_() const { + const std::vector& get_v_cache_() const { return v_cache_; } @@ -169,15 +170,19 @@ class KVManager { private: // Helper functions to rearrange and update key and value caches - void rearrange_key(KVCache& k_cache, int32_t ar_len_dst); - void rearrange_value(KVCache& v_cache, int32_t ar_len_dst); + + void rearrange_key(KVCache& k_cache, int32_t ar_len_dst); + + void rearrange_value(KVCache& v_cache, int32_t ar_len_dst); + void update_key( - KVCache& k_cache, + KVCache& k_cache, int32_t n_past, int32_t n_update, const std::vector& selected); + void update_value( - KVCache& v_cache, + KVCache& v_cache, int32_t n_past, int32_t n_update, const std::vector& selected); @@ -186,10 +191,14 @@ class KVManager { Metadata metadata_; size_t total_cache_size_; int32_t cur_ar_len_; + executorch::aten::ScalarType attention_mask_dtype_ 
= + executorch::aten::ScalarType::Undefined; + executorch::aten::ScalarType kv_cache_dtype_ = + executorch::aten::ScalarType::Undefined; // Store start pointer of k and v cache for input and output // input: layer -> head * head_dim * max_cache_len // output: layer -> head * head_dim * max_ar_len - std::vector> k_cache_; - std::vector> v_cache_; + std::vector k_cache_; + std::vector v_cache_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp index f7e44292f26..298fc1ac9ff 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.cpp @@ -13,20 +13,19 @@ using executorch::runtime::Result; namespace example { -template -void LhdTokenGenerator::prepare_io( +void LhdTokenGenerator::prepare_io( std::vector input_tokens, std::vector input_pos) { for (int i = 0; i < metadata_.ar_len; i++) { if (i < input_tokens.size()) { // Prepare pos data - this->input_pos_.data[i] = input_pos[i]; + reinterpret_cast(this->input_pos_.data)[i] = input_pos[i]; // Support CPU 4-bit embedding, which requires int64 input. // However, for QNN embedding, only int32 input is needed. // Therefore, we need to cast to the correct type to write the data. 
if (metadata_.use_int64_token) { - this->input_toks_.data[i] = input_tokens[i]; + reinterpret_cast(this->input_toks_.data)[i] = input_tokens[i]; } else { int32_t* input_toks_ptr = reinterpret_cast(this->input_toks_.data); @@ -36,8 +35,7 @@ void LhdTokenGenerator::prepare_io( } } -template -void LhdTokenGenerator::init_attention_mask(int32_t n_past) { +void LhdTokenGenerator::init_attention_mask(int32_t n_past) { std::vector attention_map; attention_map.reserve(metadata_.ar_len); // Initialize attention mask with current position @@ -73,8 +71,7 @@ void LhdTokenGenerator::init_attention_mask(int32_t n_past) { } } -template -void LhdTokenGenerator::init_lookahead_branch( +void LhdTokenGenerator::init_lookahead_branch( const std::vector& tokens) { for (int i = 0; i < metadata_.ngram - 1; ++i) { for (int j = 0; j < metadata_.window; ++j) { @@ -91,8 +88,7 @@ void LhdTokenGenerator::init_lookahead_branch( is_lhd_branch_initialized_ = true; } -template -void LhdTokenGenerator::init_verification_branch(uint64_t cur_token) { +void LhdTokenGenerator::init_verification_branch(uint64_t cur_token) { const int g_cur = ngrams_pool_.cnt[cur_token]; v_branch_.resize(g_cur); @@ -116,8 +112,7 @@ void LhdTokenGenerator::init_verification_branch(uint64_t cur_token) { } } -template -void LhdTokenGenerator::update_ngrams_pool() { +void LhdTokenGenerator::update_ngrams_pool() { std::vector ngram(metadata_.ngram - 1); // n-gram pool generation for (int f = 0; f < metadata_.window; ++f) { @@ -170,8 +165,7 @@ void LhdTokenGenerator::update_ngrams_pool() { } } -template -void LhdTokenGenerator::update_lookahead_branch( +void LhdTokenGenerator::update_lookahead_branch( const executorch::aten::Tensor& logits_tensor) { for (int i = 0; i < metadata_.window; i++) { lhd_branch_prev_[i] = lhd_branch_[0][i]; @@ -189,8 +183,7 @@ void LhdTokenGenerator::update_lookahead_branch( } } -template -Result LhdTokenGenerator::generate( +Result LhdTokenGenerator::generate( std::vector tokens, int64_t start_pos, 
int32_t seq_len, @@ -427,8 +420,4 @@ Result LhdTokenGenerator::generate( return pos - start_pos; } -// Explicit instantiations -template class LhdTokenGenerator; -template class LhdTokenGenerator; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h index 796dde88014..8fdffb8af72 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/lhd_token_generator.h @@ -15,8 +15,8 @@ namespace example { * @brief Class for generating the token using decoder and key-value manager * with lookahead decoding. */ -template -class LhdTokenGenerator : public TokenGenerator { + +class LhdTokenGenerator : public TokenGenerator { public: struct Metadata { int32_t context_len; @@ -34,18 +34,19 @@ class LhdTokenGenerator : public TokenGenerator { LhdTokenGenerator( tokenizers::Tokenizer* tokenizer, DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& forward_name, std::unique_ptr>&& eos_ids, Metadata metadata, - executorch::llm::Stats* stats) - : TokenGenerator( + executorch::llm::Stats* stats, + std::unique_ptr method_meta) + : TokenGenerator( tokenizer, decoder_runner, kv_manager, forward_name, std::move(eos_ids), - typename TokenGenerator::Metadata{ + TokenGenerator::Metadata{ metadata.context_len, metadata.num_heads, metadata.num_layers, @@ -54,7 +55,8 @@ class LhdTokenGenerator : public TokenGenerator { metadata.use_int64_token, metadata.sliding_window, metadata.cache_mode}, - stats), + stats, + std::move(method_meta)), metadata_(metadata), lhd_branch_(metadata.ngram - 1, std::vector(metadata.window)), lhd_branch_prev_(metadata.window), @@ -104,7 +106,7 @@ class LhdTokenGenerator : public TokenGenerator { private: // Bring base class's virtual prepare_io into scope so the overload below // does not hide it (-Woverloaded-virtual). 
- using TokenGenerator::prepare_io; + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp index 14a93104e1a..de8d1bea0fe 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.cpp @@ -13,8 +13,7 @@ using executorch::runtime::Result; namespace example { -template -void MultimodalLhdTokenGenerator::prepare_io( +void MultimodalLhdTokenGenerator::prepare_io( std::vector input_tokens, std::vector input_pos) { for (int i = 0; i < metadata_.ar_len; i++) { @@ -51,8 +50,7 @@ void MultimodalLhdTokenGenerator::prepare_io( } } -template -void MultimodalLhdTokenGenerator::init_attention_mask(int32_t n_past) { +void MultimodalLhdTokenGenerator::init_attention_mask(int32_t n_past) { std::vector attention_map; attention_map.reserve(metadata_.ar_len); // Initialize attention mask with current position @@ -88,8 +86,7 @@ void MultimodalLhdTokenGenerator::init_attention_mask(int32_t n_past) { } } -template -void MultimodalLhdTokenGenerator::init_lookahead_branch( +void MultimodalLhdTokenGenerator::init_lookahead_branch( const std::vector& tokens) { for (int i = 0; i < metadata_.ngram - 1; ++i) { for (int j = 0; j < metadata_.window; ++j) { @@ -106,9 +103,7 @@ void MultimodalLhdTokenGenerator::init_lookahead_branch( is_lhd_branch_initialized_ = true; } -template -void MultimodalLhdTokenGenerator::init_verification_branch( - uint64_t cur_token) { +void MultimodalLhdTokenGenerator::init_verification_branch(uint64_t cur_token) { const int g_cur = ngrams_pool_.cnt[cur_token]; v_branch_.resize(g_cur); @@ -132,8 +127,7 @@ void 
MultimodalLhdTokenGenerator::init_verification_branch( } } -template -void MultimodalLhdTokenGenerator::update_ngrams_pool() { +void MultimodalLhdTokenGenerator::update_ngrams_pool() { std::vector ngram(metadata_.ngram - 1); // n-gram pool generation for (int f = 0; f < metadata_.window; ++f) { @@ -186,8 +180,7 @@ void MultimodalLhdTokenGenerator::update_ngrams_pool() { } } -template -void MultimodalLhdTokenGenerator::update_lookahead_branch( +void MultimodalLhdTokenGenerator::update_lookahead_branch( const executorch::aten::Tensor& logits_tensor) { for (int i = 0; i < metadata_.window; i++) { lhd_branch_prev_[i] = lhd_branch_[0][i]; @@ -205,8 +198,7 @@ void MultimodalLhdTokenGenerator::update_lookahead_branch( } } -template -Result MultimodalLhdTokenGenerator::generate( +Result MultimodalLhdTokenGenerator::generate( std::vector tokens, int64_t start_pos, int32_t seq_len, @@ -412,8 +404,4 @@ Result MultimodalLhdTokenGenerator::generate( return pos - start_pos; } -// Explicit instantiations -template class MultimodalLhdTokenGenerator; -template class MultimodalLhdTokenGenerator; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h index 7494afec6da..6ffe285e536 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_lhd_token_generator.h @@ -15,9 +15,7 @@ namespace example { * @class MultimodalLhdTokenGenerator * @brief Extended LhdTokenGenerator with multimodal embedding support */ -template -class MultimodalLhdTokenGenerator - : public example::MultimodalTokenGenerator { +class MultimodalLhdTokenGenerator : public example::MultimodalTokenGenerator { public: struct Metadata { int32_t context_len; @@ -37,19 +35,20 @@ class MultimodalLhdTokenGenerator 
tokenizers::Tokenizer* tokenizer, TokenEmbeddingProcessor* embedding_runner, DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& forward_name, std::unique_ptr>&& eos_ids, Metadata metadata, - executorch::llm::Stats* stats) - : MultimodalTokenGenerator( + executorch::llm::Stats* stats, + std::unique_ptr method_meta) + : MultimodalTokenGenerator( tokenizer, embedding_runner, decoder_runner, kv_manager, forward_name, std::move(eos_ids), - typename MultimodalTokenGenerator::Metadata{ + MultimodalTokenGenerator::Metadata{ metadata.context_len, metadata.num_heads, metadata.num_layers, @@ -59,7 +58,8 @@ class MultimodalLhdTokenGenerator metadata.sliding_window, metadata.cache_mode, metadata.embedding_dim}, - stats), + stats, + std::move(method_meta)), tok_embedding_runner_(embedding_runner), metadata_(metadata), lhd_branch_(metadata.ngram - 1, std::vector(metadata.window)), @@ -110,7 +110,7 @@ class MultimodalLhdTokenGenerator private: // Bring base class's virtual prepare_io into scope so the overload below // does not hide it (-Woverloaded-virtual). - using TokenGenerator::prepare_io; + using TokenGenerator::prepare_io; /** * @brief Fill in I/O buffers with prompt token and position. * @param cur_token Current token. 
diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp index 2859e16a42a..f63a431791b 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.cpp @@ -16,13 +16,13 @@ using executorch::runtime::TensorInfo; namespace example { -template -MultimodalPromptProcessor::MultimodalPromptProcessor( +MultimodalPromptProcessor::MultimodalPromptProcessor( DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, - Metadata metadata) - : PromptProcessor( + Metadata metadata, + std::unique_ptr method_meta) + : PromptProcessor( decoder_runner, kv_manager, method_name, @@ -33,7 +33,8 @@ MultimodalPromptProcessor::MultimodalPromptProcessor( metadata.vocab_size, metadata.use_int64_token, metadata.sliding_window, - metadata.cache_mode}), + metadata.cache_mode}, + std::move(method_meta)), metadata_(metadata) { // Set input_toks_.size to 0 since we use embeddings instead input_toks_.size = 0; @@ -41,8 +42,7 @@ MultimodalPromptProcessor::MultimodalPromptProcessor( metadata_.ar_len * metadata_.embedding_dim * sizeof(float); }; -template -void MultimodalPromptProcessor::init_io( +void MultimodalPromptProcessor::init_io( IMemAlloc* buffer_manager, Result method_meta) { size_t idx = 0; @@ -66,8 +66,7 @@ void MultimodalPromptProcessor::init_io( // [I]: attention_mask Result attention_mask = method_meta->input_tensor_meta(idx++); - attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(attention_mask_.size)); + attention_mask_.data = buffer_manager->allocate(attention_mask_.size); attention_mask_.tensor = std::make_unique( attention_mask->scalar_type(), attention_mask->sizes().size(), @@ -83,8 +82,8 @@ void 
MultimodalPromptProcessor::init_io( if (metadata_.cache_mode == CacheMode::HybridCache) { Result window_attention_mask = method_meta->input_tensor_meta(idx++); - window_attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.data = + buffer_manager->allocate(window_attention_mask_.size); window_attention_mask_.tensor = std::make_unique( window_attention_mask->scalar_type(), window_attention_mask->sizes().size(), @@ -120,32 +119,29 @@ void MultimodalPromptProcessor::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_in_ : v_cache_in_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->input_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].buffer; - cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].buffer, const_cast( kv_cache->dim_order().data())); input_tensors_.emplace_back(cache[layer].get()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].buffer, cache[layer]->nbytes(), kv_cache.get()); } } } // [O]: logits Result logits = method_meta->output_tensor_meta(0); - logits_.data = - reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.data = buffer_manager->allocate(logits_.size); logits_.tensor = std::make_unique( logits->scalar_type(), logits->sizes().size(), @@ -160,21 +156,22 @@ void MultimodalPromptProcessor::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_out_ : v_cache_out_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? 
kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->output_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].output_buffer; cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].output_buffer, const_cast(kv_cache->dim_order().data())); output_tensors_.emplace_back(cache[layer].get()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].output_buffer, + cache[layer]->nbytes(), + kv_cache.get()); } } @@ -186,8 +183,7 @@ void MultimodalPromptProcessor::init_io( } // prepare embedding -template -void MultimodalPromptProcessor::prepare_io( +void MultimodalPromptProcessor::prepare_io( const TensorStruct& prompt_embedding, int32_t num_prompt_tokens, int64_t prompt_pos, @@ -208,8 +204,7 @@ void MultimodalPromptProcessor::prepare_io( } } -template -Result MultimodalPromptProcessor::prefill( +Result MultimodalPromptProcessor::prefill( const TensorStruct& prompt_embedding, int64_t start_pos, bool dump_logits, @@ -301,8 +296,4 @@ Result MultimodalPromptProcessor::prefill( return cur_token; } -// Explicit instantiations -template class MultimodalPromptProcessor; -template class MultimodalPromptProcessor; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h index fcfc07c9590..c2769ed9f50 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_prompt_processor.h @@ -16,8 +16,7 @@ namespace example { * @class MultimodalPromptProcessor * @brief Extended PromptProcessor with multimodal embedding support */ -template -class 
MultimodalPromptProcessor : public example::PromptProcessor { +class MultimodalPromptProcessor : public example::PromptProcessor { public: struct Metadata { int32_t context_len; @@ -33,9 +32,10 @@ class MultimodalPromptProcessor : public example::PromptProcessor { MultimodalPromptProcessor( DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, - Metadata metadata); + Metadata metadata, + std::unique_ptr method_meta); int64_t get_num_heads() const { return metadata_.num_heads; @@ -74,34 +74,29 @@ class MultimodalPromptProcessor : public example::PromptProcessor { * @return Total I/O size in bytes. */ inline const size_t total_prompt_processor_io_size_in_bytes() const { - if (metadata_.cache_mode == CacheMode::HybridCache) { - return input_toks_.size + input_pos_.size + attention_mask_.size + - window_attention_mask_.size + logits_.size + input_embedding_.size; - } else { - return input_toks_.size + input_pos_.size + attention_mask_.size + - logits_.size + input_embedding_.size; - } + return input_toks_.size + input_pos_.size + attention_mask_.size + + window_attention_mask_.size + logits_.size + input_embedding_.size; } private: // Reuse members from token_generator - using PromptProcessor::decoder_runner_; - using PromptProcessor::kv_manager_; - using PromptProcessor::method_name_; - using PromptProcessor::k_cache_in_; - using PromptProcessor::v_cache_in_; - using PromptProcessor::k_cache_out_; - using PromptProcessor::v_cache_out_; - using PromptProcessor::input_toks_; - using PromptProcessor::input_pos_; - using PromptProcessor::attention_mask_; - using PromptProcessor::window_attention_mask_; - using PromptProcessor::logits_; - using PromptProcessor::inputs_; - using PromptProcessor::input_tensors_; - using PromptProcessor::output_tensors_; - using PromptProcessor::prompt_all_logits_; - using PromptProcessor::is_bert; + using PromptProcessor::attention_mask_; + using PromptProcessor::decoder_runner_; + 
using PromptProcessor::input_pos_; + using PromptProcessor::input_tensors_; + using PromptProcessor::input_toks_; + using PromptProcessor::inputs_; + using PromptProcessor::is_bert; + using PromptProcessor::k_cache_in_; + using PromptProcessor::k_cache_out_; + using PromptProcessor::kv_manager_; + using PromptProcessor::logits_; + using PromptProcessor::method_name_; + using PromptProcessor::output_tensors_; + using PromptProcessor::prompt_all_logits_; + using PromptProcessor::v_cache_in_; + using PromptProcessor::v_cache_out_; + using PromptProcessor::window_attention_mask_; /** * @brief Fill in I/O buffers with embedding data and position. diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp index 32e3baf27a9..32575994222 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.cpp @@ -74,17 +74,17 @@ void print_performance_report( void save_logits( const std::string& dump_logits_path, - const std::vector& prefill_logits, - const std::vector& decode_logits) { + const std::vector& prefill_logits, + const std::vector& decode_logits) { std::ofstream outFile(dump_logits_path.c_str(), std::ios::binary); if (outFile.is_open()) { outFile.write( reinterpret_cast(prefill_logits.data()), - prefill_logits.size() * sizeof(uint16_t)); + prefill_logits.size()); outFile.write( reinterpret_cast(decode_logits.data()), - decode_logits.size() * sizeof(uint16_t)); + decode_logits.size()); outFile.close(); } else { ET_CHECK_MSG(false, "Error saving the dump logits file"); @@ -93,8 +93,7 @@ void save_logits( } // namespace -template -QNNMultimodalRunner::QNNMultimodalRunner( +QNNMultimodalRunner::QNNMultimodalRunner( std::unique_ptr encoder, std::unique_ptr tok_embedding, std::unique_ptr text_decoder, @@ -148,16 +147,14 @@ 
QNNMultimodalRunner::QNNMultimodalRunner( ET_LOG(Info, "eval mode=%d", eval_mode_); } -template -bool QNNMultimodalRunner::is_loaded() const { +bool QNNMultimodalRunner::is_loaded() const { return encoder_->is_loaded() && tok_embedding_->is_loaded() && text_decoder_->is_loaded() && embedding_merger_ && tokenizer_ && decoder_runner_ && prompt_processor_ && token_generator_ && kv_manager_ && buffer_manager_; } -template -Error QNNMultimodalRunner::load() { +Error QNNMultimodalRunner::load() { if (is_loaded()) { return Error::Ok; } @@ -298,19 +295,22 @@ Error QNNMultimodalRunner::load() { sliding_window = ET_UNWRAP(text_decoder_->get("get_sliding_window")).toInt(); } - kv_manager_ = std::make_unique>(typename KVManager::Metadata{ - context_len_, - head_dim, - max_ar_len, - max_cache_len, - num_heads, - num_layers}); - - prompt_processor_ = std::make_unique>( + kv_manager_ = std::make_unique( + KVManager::Metadata{ + context_len_, + head_dim, + max_ar_len, + max_cache_len, + num_heads, + num_layers}, + std::make_unique(std::move( + text_decoder_->method_meta(token_generator_method_name).get()))); + + prompt_processor_ = std::make_unique( decoder_runner_.get(), kv_manager_.get(), prompt_processor_method_name, - typename MultimodalPromptProcessor::Metadata{ + MultimodalPromptProcessor::Metadata{ context_len_, num_heads, num_layers, @@ -319,7 +319,9 @@ Error QNNMultimodalRunner::load() { use_int64_token, sliding_window, cache_mode_, - static_cast(dim)}); + static_cast(dim)}, + std::make_unique(std::move( + text_decoder_->method_meta(prompt_processor_method_name).get()))); // Initialize EmbeddingGenerator tok_embedding_generator_ = std::make_unique( @@ -333,14 +335,14 @@ Error QNNMultimodalRunner::load() { static_cast(dim)}); if (eval_mode_ == EvalMode::kLookaheadDecoding) { // Initialize TokenGenerator - token_generator_ = std::make_unique>( + token_generator_ = std::make_unique( tokenizer_.get(), tok_embedding_generator_.get(), decoder_runner_.get(), kv_manager_.get(), 
token_generator_method_name, std::move(eos_ids), - typename MultimodalLhdTokenGenerator::Metadata{ + MultimodalLhdTokenGenerator::Metadata{ context_len_, num_heads, num_layers, @@ -353,16 +355,18 @@ Error QNNMultimodalRunner::load() { sliding_window, cache_mode_, static_cast(dim)}, - &stats_); + &stats_, + std::make_unique(std::move( + text_decoder_->method_meta(token_generator_method_name).get()))); } else { - token_generator_ = std::make_unique>( + token_generator_ = std::make_unique( tokenizer_.get(), tok_embedding_generator_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, std::move(eos_ids), - typename MultimodalTokenGenerator::Metadata{ + MultimodalTokenGenerator::Metadata{ context_len_, num_heads, num_layers, @@ -372,7 +376,9 @@ Error QNNMultimodalRunner::load() { sliding_window, cache_mode_, static_cast(dim)}, - &stats_); + &stats_, + std::make_unique(std::move( + text_decoder_->method_meta(token_generator_method_name).get()))); } buffer_manager_ = std::make_unique(); @@ -409,8 +415,7 @@ Error QNNMultimodalRunner::load() { return Error::Ok; } -template -executorch::runtime::Error QNNMultimodalRunner::generate( +executorch::runtime::Error QNNMultimodalRunner::generate( const std::vector& inputs, const llm::GenerationConfig& config, std::function token_callback, @@ -561,8 +566,7 @@ executorch::runtime::Error QNNMultimodalRunner::generate( return Error::Ok; } -template -Result QNNMultimodalRunner::get_model_version() { +Result QNNMultimodalRunner::get_model_version() { if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -571,16 +575,11 @@ Result QNNMultimodalRunner::get_model_version() { return model_version_; } -template -Result QNNMultimodalRunner::get_encoder_method_meta() { +Result QNNMultimodalRunner::get_encoder_method_meta() { if (!is_loaded()) { ET_CHECK_OK_OR_RETURN_ERROR(load()); } return encoder_->method_meta(kEncoderForwardName); } -// Explicit instantiations -template 
class QNNMultimodalRunner; -template class QNNMultimodalRunner; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h index 5407d5712b7..363ded0f055 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_runner.h @@ -66,12 +66,6 @@ inline Modality modality_of(const ModelVersion& model_version) { [](const auto& model) { return modality_of(model); }, model_version); } -enum KvBitWidth { - kWidth8 = 8, - kWidth16 = 16, -}; - -template class QNNMultimodalRunner : public executorch::extension::llm::MultimodalRunner { public: @@ -139,11 +133,11 @@ class QNNMultimodalRunner ModelVersion model_version_; std::unique_ptr buffer_manager_; - std::unique_ptr> kv_manager_; + std::unique_ptr kv_manager_; std::unique_ptr tokenizer_; std::unique_ptr decoder_runner_; - std::unique_ptr> prompt_processor_; - std::unique_ptr> token_generator_; + std::unique_ptr prompt_processor_; + std::unique_ptr token_generator_; std::unique_ptr encoder_runner_; std::unique_ptr tok_embedding_runner_; std::unique_ptr tok_embedding_processor_; diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp index 2ed8ae51f1d..e3f6f8e214e 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.cpp @@ -15,17 +15,17 @@ using executorch::runtime::TensorInfo; namespace example { // Constructor with embedding runner support -template -MultimodalTokenGenerator::MultimodalTokenGenerator( +MultimodalTokenGenerator::MultimodalTokenGenerator( tokenizers::Tokenizer* tokenizer, 
TokenEmbeddingProcessor* tok_embedding_runner, DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, std::unique_ptr>&& eos_ids, Metadata metadata, - executorch::llm::Stats* stats) - : TokenGenerator( + executorch::llm::Stats* stats, + std::unique_ptr method_meta) + : TokenGenerator( tokenizer, decoder_runner, kv_manager, @@ -39,7 +39,8 @@ MultimodalTokenGenerator::MultimodalTokenGenerator( metadata.use_int64_token, metadata.sliding_window, metadata.cache_mode}, - stats), + stats, + std::move(method_meta)), tok_embedding_runner_(tok_embedding_runner), metadata_(metadata) { // Set input_toks_.size to 0 since we use embeddings instead @@ -48,8 +49,7 @@ MultimodalTokenGenerator::MultimodalTokenGenerator( metadata_.ar_len * metadata_.embedding_dim * sizeof(float); } -template -void MultimodalTokenGenerator::init_io( +void MultimodalTokenGenerator::init_io( IMemAlloc* buffer_manager, Result method_meta) { size_t idx = 0; @@ -73,8 +73,7 @@ void MultimodalTokenGenerator::init_io( // [I]: attention_mask Result attention_mask = method_meta->input_tensor_meta(idx++); - attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(attention_mask_.size)); + attention_mask_.data = buffer_manager->allocate(attention_mask_.size); attention_mask_.tensor = std::make_unique( attention_mask->scalar_type(), attention_mask->sizes().size(), @@ -90,8 +89,8 @@ void MultimodalTokenGenerator::init_io( if (metadata_.cache_mode == CacheMode::HybridCache) { Result window_attention_mask = method_meta->input_tensor_meta(idx++); - window_attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.data = + buffer_manager->allocate(window_attention_mask_.size); window_attention_mask_.tensor = std::make_unique( window_attention_mask->scalar_type(), window_attention_mask->sizes().size(), @@ -126,30 +125,27 @@ void MultimodalTokenGenerator::init_io( for (int cache_group = 0; 
cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_in_ : v_cache_in_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->input_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].buffer; - cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].buffer, const_cast(kv_cache->dim_order().data())); input_tensors_.emplace_back(cache[layer].get()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].buffer, cache[layer]->nbytes(), kv_cache.get()); } } // [O]: logits Result logits = method_meta->output_tensor_meta(0); - logits_.data = - reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.data = buffer_manager->allocate(logits_.size); logits_.tensor = std::make_unique( logits->scalar_type(), logits->sizes().size(), @@ -164,21 +160,22 @@ void MultimodalTokenGenerator::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_out_ : v_cache_out_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? 
kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->output_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].output_buffer; cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].output_buffer, const_cast(kv_cache->dim_order().data())); output_tensors_.emplace_back(cache[layer].get()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].output_buffer, + cache[layer]->nbytes(), + kv_cache.get()); } } @@ -190,8 +187,7 @@ void MultimodalTokenGenerator::init_io( } // This function only considers the case where token_generator_ar_len equals 1. -template -void MultimodalTokenGenerator::prepare_io( +void MultimodalTokenGenerator::prepare_io( uint64_t cur_token, int64_t start_pos) { // Generate embedding for current token using embedding runner @@ -209,8 +205,4 @@ void MultimodalTokenGenerator::prepare_io( *input_pos_.data = static_cast(start_pos); } -// Explicit instantiations -template class MultimodalTokenGenerator; -template class MultimodalTokenGenerator; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h index 9eb9c79aaa4..2d0bf9385b4 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/multimodal_runner/multimodal_token_generator.h @@ -16,8 +16,7 @@ namespace example { * @class MultimodalTokenGenerator * @brief Extended TokenGenerator with multimodal embedding support */ -template -class MultimodalTokenGenerator : public example::TokenGenerator { +class MultimodalTokenGenerator : public example::TokenGenerator { public: struct Metadata { 
int32_t context_len; @@ -36,11 +35,12 @@ class MultimodalTokenGenerator : public example::TokenGenerator { tokenizers::Tokenizer* tokenizer, TokenEmbeddingProcessor* tok_embedding_runner, DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, std::unique_ptr>&& eos_ids, Metadata metadata, - executorch::llm::Stats* stats); + executorch::llm::Stats* stats, + std::unique_ptr method_meta); virtual ~MultimodalTokenGenerator() = default; @@ -54,36 +54,31 @@ class MultimodalTokenGenerator : public example::TokenGenerator { override; inline const size_t total_token_generator_io_size_in_bytes() const { - if (metadata_.cache_mode == CacheMode::HybridCache) { - return input_toks_.size + input_pos_.size + attention_mask_.size + - window_attention_mask_.size + logits_.size + input_embedding_.size; - } else { - return input_toks_.size + input_pos_.size + attention_mask_.size + - logits_.size + input_embedding_.size; - } + return input_toks_.size + input_pos_.size + attention_mask_.size + + window_attention_mask_.size + logits_.size + input_embedding_.size; } protected: // Reuse members from token_generator - using TokenGenerator::kv_manager_; - using TokenGenerator::input_pos_; - using TokenGenerator::attention_mask_; - using TokenGenerator::window_attention_mask_; - using TokenGenerator::inputs_; - using TokenGenerator::input_tensors_; - using TokenGenerator::output_tensors_; + using TokenGenerator::attention_mask_; + using TokenGenerator::input_pos_; + using TokenGenerator::input_tensors_; + using TokenGenerator::inputs_; + using TokenGenerator::kv_manager_; + using TokenGenerator::output_tensors_; + using TokenGenerator::window_attention_mask_; // Additional members specific to multimodal TensorStruct input_embedding_; private: // Reuse members from token_generator - using TokenGenerator::input_toks_; - using TokenGenerator::logits_; - using TokenGenerator::k_cache_in_; - using TokenGenerator::v_cache_in_; - using 
TokenGenerator::k_cache_out_; - using TokenGenerator::v_cache_out_; + using TokenGenerator::input_toks_; + using TokenGenerator::k_cache_in_; + using TokenGenerator::k_cache_out_; + using TokenGenerator::logits_; + using TokenGenerator::v_cache_in_; + using TokenGenerator::v_cache_out_; // Additional members specific to multimodal TokenEmbeddingProcessor* tok_embedding_runner_; diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp index 59744d488bd..0cb52246a39 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.cpp @@ -17,12 +17,12 @@ using executorch::runtime::Span; using executorch::runtime::TensorInfo; namespace example { -template -PromptProcessor::PromptProcessor( +PromptProcessor::PromptProcessor( DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, - Metadata metadata) + Metadata metadata, + std::unique_ptr method_meta) : decoder_runner_(decoder_runner), kv_manager_(kv_manager), method_name_(method_name), @@ -32,33 +32,41 @@ PromptProcessor::PromptProcessor( k_cache_out_.resize(metadata_.num_layers); v_cache_out_.resize(metadata_.num_layers); // Calculate I/O size + Result attention_mask = method_meta->input_tensor_meta(1); + Result logits = method_meta->output_tensor_meta(0); input_toks_.size = metadata_.ar_len * sizeof(int64_t); - if (is_bert()) + if (is_bert()) { input_pos_.size = 0; - else + } else { input_pos_.size = metadata_.ar_len * sizeof(int32_t); + } + attention_mask_.dtype = attention_mask->scalar_type(); + attention_mask_.size = metadata_.ar_len * metadata_.context_len * + attention_mask_.getElementSize(); switch (metadata_.cache_mode) { case CacheMode::StaticCahce: - attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); window_attention_mask_.size = 0; break; - case 
CacheMode::HybridCache: - attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); - window_attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + case CacheMode::HybridCache: { + Result window_attention_mask = + method_meta->input_tensor_meta(2); + window_attention_mask_.dtype = window_attention_mask->scalar_type(); + window_attention_mask_.size = metadata_.ar_len * metadata_.context_len * + window_attention_mask_.getElementSize(); break; + } default: ET_CHECK_MSG(false, "Unsupported llama cache mode"); break; } - logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); + logits_.dtype = logits->scalar_type(); + logits_.size = + metadata_.ar_len * metadata_.vocab_size * logits_.getElementSize(); }; -template -void PromptProcessor::init_io( + +void PromptProcessor::init_io( IMemAlloc* buffer_manager, Result method_meta) { size_t idx = 0; @@ -80,8 +88,7 @@ void PromptProcessor::init_io( // [I]: attention_mask Result attention_mask = method_meta->input_tensor_meta(idx++); - attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(attention_mask_.size)); + attention_mask_.data = buffer_manager->allocate(attention_mask_.size); attention_mask_.tensor = std::make_unique( attention_mask->scalar_type(), attention_mask->sizes().size(), @@ -97,8 +104,8 @@ void PromptProcessor::init_io( if (metadata_.cache_mode == CacheMode::HybridCache) { Result window_attention_mask = method_meta->input_tensor_meta(idx++); - window_attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.data = + buffer_manager->allocate(window_attention_mask_.size); window_attention_mask_.tensor = std::make_unique( window_attention_mask->scalar_type(), window_attention_mask->sizes().size(), @@ -136,33 +143,30 @@ void PromptProcessor::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? 
k_cache_in_ : v_cache_in_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->input_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].buffer; - cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].buffer, const_cast( kv_cache->dim_order().data())); input_tensors_.emplace_back(cache[layer].get()); cache_inputs_.emplace_back(input_tensors_.back()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].buffer, cache[layer]->nbytes(), kv_cache.get()); } } } // [O]: logits Result logits = method_meta->output_tensor_meta(0); - logits_.data = - reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.data = buffer_manager->allocate(logits_.size); logits_.tensor = std::make_unique( logits->scalar_type(), logits->sizes().size(), @@ -177,21 +181,22 @@ void PromptProcessor::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_out_ : v_cache_out_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? 
kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->output_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].output_buffer; cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].output_buffer, const_cast(kv_cache->dim_order().data())); output_tensors_.emplace_back(cache[layer].get()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].output_buffer, + cache[layer]->nbytes(), + kv_cache.get()); } } // Prepare the vector of EValue to run inference @@ -201,13 +206,11 @@ void PromptProcessor::init_io( } } -template -const std::vector& PromptProcessor::get_all_logits() { +const std::vector& PromptProcessor::get_all_logits() { return prompt_all_logits_; } -template -void PromptProcessor::prepare_io( +void PromptProcessor::prepare_io( const std::vector& prompt_tokens, int64_t prompt_pos, int64_t start_pos) { @@ -232,8 +235,7 @@ void PromptProcessor::prepare_io( } } -template -Result PromptProcessor::prefill( +Result PromptProcessor::prefill( std::vector prompt_tokens, int64_t start_pos, bool dump_logits, @@ -339,7 +341,9 @@ Result PromptProcessor::prefill( prompt_all_logits_.insert( prompt_all_logits_.end(), logits_.data, - logits_.data + metadata_.ar_len * metadata_.vocab_size); + logits_.data + + metadata_.ar_len * metadata_.vocab_size * + logits_.getElementSize()); } // In the last run, offset to the meaningful logits. 
if (i == num_iters - 1) { @@ -369,8 +373,4 @@ Result PromptProcessor::prefill( return cur_token; } -// Explicit instantiations -template class PromptProcessor; -template class PromptProcessor; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h index 599f7050d83..5317a8a77e1 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h +++ b/examples/qualcomm/oss_scripts/llama/runner/prompt_processor.h @@ -21,7 +21,7 @@ namespace example { * @class PromptProcessor * @brief Class for processing prompts using decoder and key-value manager. */ -template + class PromptProcessor { public: struct Metadata { @@ -36,9 +36,10 @@ class PromptProcessor { }; PromptProcessor( DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, - Metadata metadata); + Metadata metadata, + std::unique_ptr method_meta); virtual ~PromptProcessor() = default; @@ -55,9 +56,9 @@ class PromptProcessor { /** * @brief Get the all logits generated * - * @return std::vector& all the logits generated + * @return std::vector& all the logits generated */ - virtual const std::vector& get_all_logits(); + virtual const std::vector& get_all_logits(); /** * Prefill an LLM Module with the given text input. @@ -79,13 +80,8 @@ class PromptProcessor { * @return Total I/O size in bytes. 
*/ inline const size_t total_prompt_processor_io_size_in_bytes() const { - if (metadata_.cache_mode == CacheMode::HybridCache) { - return input_toks_.size + input_pos_.size + attention_mask_.size + - window_attention_mask_.size + logits_.size; - } else { - return input_toks_.size + input_pos_.size + attention_mask_.size + - logits_.size; - } + return input_toks_.size + input_pos_.size + attention_mask_.size + + window_attention_mask_.size + logits_.size; } protected: @@ -105,7 +101,7 @@ class PromptProcessor { int64_t prompt_pos, int64_t start_pos); DecoderRunner* decoder_runner_; - KVManager* kv_manager_; + KVManager* kv_manager_; std::string method_name_; // metadata @@ -114,9 +110,9 @@ class PromptProcessor { // inputs and outputs TensorStruct input_toks_; TensorStruct input_pos_; - TensorStruct attention_mask_; - TensorStruct window_attention_mask_; - TensorStruct logits_; + TensorStructRaw attention_mask_; + TensorStructRaw window_attention_mask_; + TensorStructRaw logits_; // layer -> TensorImpl std::vector> k_cache_in_; @@ -131,6 +127,6 @@ class PromptProcessor { std::vector cache_inputs_; // Unused by default, only used when dump_logits_path is provided. 
- std::vector prompt_all_logits_; + std::vector prompt_all_logits_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp index 0a4a8b9abb5..7257e869dcc 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp @@ -66,17 +66,17 @@ void print_performance_report( void save_logits( const std::string& dump_logits_path, - const std::vector& prefill_logits, - const std::vector& decode_logits) { + const std::vector& prefill_logits, + const std::vector& decode_logits) { std::ofstream outFile(dump_logits_path.c_str(), std::ios::binary); if (outFile.is_open()) { outFile.write( reinterpret_cast(prefill_logits.data()), - prefill_logits.size() * sizeof(uint16_t)); + prefill_logits.size()); outFile.write( reinterpret_cast(decode_logits.data()), - decode_logits.size() * sizeof(uint16_t)); + decode_logits.size()); outFile.close(); } else { ET_CHECK_MSG(false, "Error saving the dump logits file"); @@ -85,8 +85,7 @@ void save_logits( } // namespace -template -Runner::Runner( +Runner::Runner( std::unique_ptr module, const std::string& decoder_model_version, const std::string& model_path, @@ -152,14 +151,12 @@ Runner::Runner( ET_LOG(Info, "eval mode=%d", eval_mode_); } -template -bool Runner::is_loaded() const { +bool Runner::is_loaded() const { return module_->is_loaded() && tokenizer_ && decoder_runner_ && prompt_processor_ && token_generator_ && kv_manager_ && buffer_manager_; } -template -Error Runner::load() { +Error Runner::load() { if (is_loaded()) { return Error::Ok; } @@ -275,13 +272,16 @@ Error Runner::load() { if (module_->method_names()->count("get_sliding_window") > 0) { sliding_window = ET_UNWRAP(module_->get("get_sliding_window")).toInt(); } - kv_manager_ = std::make_unique>(typename KVManager::Metadata{ - context_len_, - head_dim, - max_ar_len, - max_cache_len, - num_heads, - num_layers}); + kv_manager_ = 
std::make_unique( + KVManager::Metadata{ + context_len_, + head_dim, + max_ar_len, + max_cache_len, + num_heads, + num_layers}, + std::make_unique( + std::move(module_->method_meta(token_generator_method_name).get()))); if (attention_sink_rope_module_ != nullptr) { attention_sink_rope_runner_ = std::make_unique( @@ -290,11 +290,11 @@ Error Runner::load() { attention_sink_rope_runner_->load(method_names)); } - prompt_processor_ = std::make_unique>( + prompt_processor_ = std::make_unique( decoder_runner_.get(), kv_manager_.get(), prompt_processor_method_name, - typename PromptProcessor::Metadata{ + PromptProcessor::Metadata{ context_len_, num_heads, num_layers, @@ -302,15 +302,17 @@ Error Runner::load() { vocab_size, use_int64_token, sliding_window, - cache_mode_}); + cache_mode_}, + std::make_unique( + std::move(module_->method_meta(prompt_processor_method_name).get()))); if (eval_mode_ == EvalMode::kLookaheadDecoding) { - token_generator_ = std::make_unique>( + token_generator_ = std::make_unique( tokenizer_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, std::move(eos_ids), - typename LhdTokenGenerator::Metadata{ + LhdTokenGenerator::Metadata{ context_len_, num_heads, num_layers, @@ -322,15 +324,17 @@ Error Runner::load() { gcap_, sliding_window, cache_mode_}, - &stats_); + &stats_, + std::make_unique(std::move( + module_->method_meta(token_generator_method_name).get()))); } else { - token_generator_ = std::make_unique>( + token_generator_ = std::make_unique( tokenizer_.get(), decoder_runner_.get(), kv_manager_.get(), token_generator_method_name, std::move(eos_ids), - typename TokenGenerator::Metadata{ + TokenGenerator::Metadata{ context_len_, num_heads, num_layers, @@ -339,7 +343,9 @@ Error Runner::load() { use_int64_token, sliding_window, cache_mode_}, - &stats_); + &stats_, + std::make_unique(std::move( + module_->method_meta(token_generator_method_name).get()))); } buffer_manager_ = std::make_unique(); @@ -360,8 +366,7 @@ Error 
Runner::load() { return Error::Ok; } -template -Error Runner::generate( +Error Runner::generate( const std::string& prompt, const llm::GenerationConfig& config, std::function token_callback, @@ -370,8 +375,7 @@ Error Runner::generate( prompt, false, config, token_callback, stats_callback); } -template -Error Runner::generate_from_prompt_or_file( +Error Runner::generate_from_prompt_or_file( const std::string& prompt, bool tokenized_prompt, const llm::GenerationConfig& config, @@ -500,8 +504,7 @@ Error Runner::generate_from_prompt_or_file( return Error::Ok; } -template -Result Runner::get_decoder_model_version() { +Result Runner::get_decoder_model_version() { if (!is_loaded()) { stats_.model_load_start_ms = time_in_ms(); ET_CHECK_OK_OR_RETURN_ERROR(load()); @@ -510,8 +513,4 @@ Result Runner::get_decoder_model_version() { return decoder_model_version_; } -// Explicit instantiations -template class Runner; -template class Runner; - } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h index 39ce62c2d9f..5d03a12f61a 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/runner.h +++ b/examples/qualcomm/oss_scripts/llama/runner/runner.h @@ -46,12 +46,6 @@ enum DecoderModelVersion { kGemma2, }; -enum KvBitWidth { - kWidth8 = 8, - kWidth16 = 16, -}; - -template class Runner : public executorch::extension::llm::IRunner { public: explicit Runner( @@ -121,14 +115,15 @@ class Runner : public executorch::extension::llm::IRunner { DecoderModelVersion decoder_model_version_; std::unique_ptr buffer_manager_; - std::unique_ptr> kv_manager_; + std::unique_ptr kv_manager_; std::unique_ptr tokenizer_; std::unique_ptr decoder_runner_; std::unique_ptr attention_sink_rope_runner_; - std::unique_ptr> prompt_processor_; - std::unique_ptr> token_generator_; + std::unique_ptr prompt_processor_; + std::unique_ptr token_generator_; // stats executorch::llm::Stats stats_; }; + } // namespace example diff 
--git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp index 8ab82d932e1..098fcf9efa6 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp +++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.cpp @@ -17,15 +17,15 @@ using executorch::runtime::Span; using executorch::runtime::TensorInfo; namespace example { -template -TokenGenerator::TokenGenerator( +TokenGenerator::TokenGenerator( tokenizers::Tokenizer* tokenizer, DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, std::unique_ptr>&& eos_ids, Metadata metadata, - executorch::llm::Stats* stats) + executorch::llm::Stats* stats, + std::unique_ptr method_meta) : tokenizer_(tokenizer), decoder_runner_(decoder_runner), kv_manager_(kv_manager), @@ -39,32 +39,37 @@ TokenGenerator::TokenGenerator( v_cache_out_.resize(metadata_.num_layers); // Calculate I/O size + Result attention_mask = method_meta->input_tensor_meta(1); + Result logits = method_meta->output_tensor_meta(0); + input_toks_.size = metadata_.ar_len * sizeof(int64_t); input_pos_.size = metadata_.ar_len * sizeof(int32_t); - attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + attention_mask_.dtype = attention_mask->scalar_type(); + attention_mask_.size = metadata_.ar_len * metadata_.context_len * + attention_mask_.getElementSize(); switch (metadata_.cache_mode) { case CacheMode::StaticCahce: - attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); window_attention_mask_.size = 0; break; - case CacheMode::HybridCache: - attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); - window_attention_mask_.size = - metadata_.ar_len * metadata_.context_len * sizeof(uint16_t); + case CacheMode::HybridCache: { + Result window_attention_mask = + method_meta->input_tensor_meta(2); + 
window_attention_mask_.dtype = window_attention_mask->scalar_type(); + window_attention_mask_.size = metadata_.ar_len * metadata_.context_len * + window_attention_mask_.getElementSize(); break; + } default: ET_CHECK_MSG(false, "Unsupported llama cache mode"); break; } - logits_.size = metadata_.ar_len * metadata_.vocab_size * sizeof(uint16_t); + logits_.dtype = logits->scalar_type(); + logits_.size = + metadata_.ar_len * metadata_.vocab_size * logits_.getElementSize(); } -template -void TokenGenerator::init_io( +void TokenGenerator::init_io( IMemAlloc* buffer_manager, Result method_meta) { size_t idx = 0; @@ -86,8 +91,7 @@ void TokenGenerator::init_io( // [I]: attention_mask Result attention_mask = method_meta->input_tensor_meta(idx++); - attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(attention_mask_.size)); + attention_mask_.data = buffer_manager->allocate(attention_mask_.size); attention_mask_.tensor = std::make_unique( attention_mask->scalar_type(), attention_mask->sizes().size(), @@ -103,8 +107,8 @@ void TokenGenerator::init_io( if (metadata_.cache_mode == CacheMode::HybridCache) { Result window_attention_mask = method_meta->input_tensor_meta(idx++); - window_attention_mask_.data = reinterpret_cast( - buffer_manager->allocate(window_attention_mask_.size)); + window_attention_mask_.data = + buffer_manager->allocate(window_attention_mask_.size); window_attention_mask_.tensor = std::make_unique( window_attention_mask->scalar_type(), window_attention_mask->sizes().size(), @@ -141,31 +145,28 @@ void TokenGenerator::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_in_ : v_cache_in_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? 
kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->input_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].buffer; - cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].buffer, const_cast(kv_cache->dim_order().data())); input_tensors_.emplace_back(cache[layer].get()); cache_inputs_.emplace_back(input_tensors_.back()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].buffer, cache[layer]->nbytes(), kv_cache.get()); } } // [O]: logits Result logits = method_meta->output_tensor_meta(0); - logits_.data = - reinterpret_cast(buffer_manager->allocate(logits_.size)); + logits_.data = buffer_manager->allocate(logits_.size); logits_.tensor = std::make_unique( logits->scalar_type(), logits->sizes().size(), @@ -180,21 +181,22 @@ void TokenGenerator::init_io( for (int cache_group = 0; cache_group < 2; ++cache_group) { std::vector>& cache = (cache_group == 0 ? k_cache_out_ : v_cache_out_); - std::vector> cache_ptrs = (cache_group == 0) + std::vector cache_ptrs = (cache_group == 0) ? 
kv_manager_->get_k_cache_() : kv_manager_->get_v_cache_(); for (int layer = 0; layer < metadata_.num_layers; ++layer, ++index) { Result kv_cache = method_meta->output_tensor_meta(index); - T* cache_ptr = cache_ptrs[layer].output_buffer; cache[layer] = std::make_unique( kv_cache->scalar_type(), kv_cache->sizes().size(), const_cast(kv_cache->sizes().data()), - cache_ptr, + cache_ptrs[layer].output_buffer, const_cast(kv_cache->dim_order().data())); output_tensors_.emplace_back(cache[layer].get()); buffer_manager->add_memory_info( - cache_ptr, cache[layer]->nbytes(), kv_cache.get()); + cache_ptrs[layer].output_buffer, + cache[layer]->nbytes(), + kv_cache.get()); } } // Prepare the vector of EValue to run inference @@ -204,14 +206,12 @@ void TokenGenerator::init_io( } } -template -const std::vector& TokenGenerator::get_all_logits() { +const std::vector& TokenGenerator::get_all_logits() { return token_all_logits_; } // This function only considers the case where token_generator_ar_len equals 1. -template -void TokenGenerator::prepare_io(uint64_t cur_token, int64_t start_pos) { +void TokenGenerator::prepare_io(uint64_t cur_token, int64_t start_pos) { // update input_tok *input_toks_.data = metadata_.use_int64_token ? 
cur_token : static_cast(cur_token); @@ -219,8 +219,7 @@ void TokenGenerator::prepare_io(uint64_t cur_token, int64_t start_pos) { *input_pos_.data = static_cast(start_pos); } -template -Result TokenGenerator::generate( +Result TokenGenerator::generate( std::vector tokens, int64_t start_pos, int32_t seq_len, @@ -306,7 +305,9 @@ Result TokenGenerator::generate( token_all_logits_.insert( token_all_logits_.end(), logits_.data, - logits_.data + metadata_.ar_len * metadata_.vocab_size); + logits_.data + + metadata_.ar_len * metadata_.vocab_size * + logits_.getElementSize()); } ET_CHECK_OK_OR_RETURN_ERROR(logits_res.error()); executorch::aten::Tensor& logits_tensor = logits_res.get(); @@ -374,8 +375,5 @@ Result TokenGenerator::generate( return pos - start_pos; } -// Explicit instantiations -template class TokenGenerator; -template class TokenGenerator; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h index 7f9264b1102..6945d907a76 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/token_generator.h +++ b/examples/qualcomm/oss_scripts/llama/runner/token_generator.h @@ -22,7 +22,7 @@ namespace example { * @class TokenGenerator * @brief Class for generating the token using decoder and key-value manager. 
*/ -template + class TokenGenerator { public: struct Metadata { @@ -38,11 +38,12 @@ class TokenGenerator { TokenGenerator( tokenizers::Tokenizer* tokenizer, DecoderRunner* decoder_runner, - KVManager* kv_manager, + KVManager* kv_manager, const std::string& method_name, std::unique_ptr>&& eos_ids, Metadata metadata, - executorch::llm::Stats* stats); + executorch::llm::Stats* stats, + std::unique_ptr method_meta); virtual ~TokenGenerator() = default; /** @@ -58,9 +59,9 @@ class TokenGenerator { /** * @brief Get the all logits generated * - * @return std::vector& all the logits generated + * @return std::vector& all the logits generated */ - virtual const std::vector& get_all_logits(); + virtual const std::vector& get_all_logits(); /**    * @brief Generate tokens. @@ -78,28 +79,23 @@ class TokenGenerator { bool dump_logits, AttentionSinkRopeRunner* attention_sink_rope_runner); inline const size_t total_token_generator_io_size_in_bytes() const { - if (metadata_.cache_mode == CacheMode::HybridCache) { - return input_toks_.size + input_pos_.size + attention_mask_.size + - window_attention_mask_.size + logits_.size; - } else { - return input_toks_.size + input_pos_.size + attention_mask_.size + - logits_.size; - } + return input_toks_.size + input_pos_.size + attention_mask_.size + + window_attention_mask_.size + logits_.size; } protected: tokenizers::Tokenizer* tokenizer_; DecoderRunner* decoder_runner_; - KVManager* kv_manager_; + KVManager* kv_manager_; std::string method_name_; std::unique_ptr> eos_ids_; // inputs and outputs TensorStruct input_toks_; TensorStruct input_pos_; - TensorStruct attention_mask_; - TensorStruct window_attention_mask_; - TensorStruct logits_; + TensorStructRaw attention_mask_; + TensorStructRaw window_attention_mask_; + TensorStructRaw logits_; // layer -> TensorImpl std::vector> k_cache_in_; @@ -128,6 +124,6 @@ class TokenGenerator { Metadata metadata_; // Unused by default, only used when dump_logits_path is provided. 
- std::vector token_all_logits_; + std::vector token_all_logits_; }; } // namespace example diff --git a/examples/qualcomm/oss_scripts/llama/runner/utils.h b/examples/qualcomm/oss_scripts/llama/runner/utils.h index bef6b1a2017..df6dddfdc6e 100644 --- a/examples/qualcomm/oss_scripts/llama/runner/utils.h +++ b/examples/qualcomm/oss_scripts/llama/runner/utils.h @@ -8,10 +8,16 @@ #pragma once #include +#include #include #include // Template struct to hold tensor data and tensor + +// TODO: Refactor these struct to use TensorPtr +// see https://docs.pytorch.org/executorch/stable/extension-tensor.html + +// TensorStruct whose dtype known in compile time template struct TensorStruct { std::unique_ptr tensor; @@ -20,3 +26,38 @@ struct TensorStruct { // data size in bytes size_t size; }; + +inline size_t getDtypeSize(executorch::aten::ScalarType dtype) { + switch (dtype) { + case executorch::aten::ScalarType::Float: + return sizeof(float); + case executorch::aten::ScalarType::Double: + return sizeof(double); + case executorch::aten::ScalarType::Int: + return sizeof(int32_t); + case executorch::aten::ScalarType::Long: + return sizeof(int64_t); + case executorch::aten::ScalarType::Byte: + return sizeof(uint8_t); + case executorch::aten::ScalarType::UInt16: + return sizeof(uint16_t); + default: + ET_CHECK_MSG( + false, + "Unsupported scalar type %s", + executorch::runtime::toString(dtype)); + break; + } +} + +// TensorStruct whose dtype known in runtime, and raw file is used +struct TensorStructRaw { + std::unique_ptr tensor; + std::byte* data; + // data size in bytes + size_t size; + executorch::aten::ScalarType dtype; + size_t getElementSize() const { + return getDtypeSize(dtype); + } +}; diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/attention_sink_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/attention_sink_wrappers.py index 48386f181d8..de857dfc17c 100644 --- a/examples/qualcomm/oss_scripts/llama/wrappers/attention_sink_wrappers.py +++ 
b/examples/qualcomm/oss_scripts/llama/wrappers/attention_sink_wrappers.py @@ -13,6 +13,7 @@ import torch from executorch.backends.qualcomm._passes import TagQuantIO +from executorch.backends.qualcomm._passes.build_quant_io import BuildQuantIo from executorch.backends.qualcomm._passes.qnn_pass_manager import ( get_capture_program_passes, ) @@ -460,6 +461,7 @@ def compile(self, attention_sink_evictor_pte_path: str): alloc_graph_input=False, alloc_graph_output=False, ), + passes=[BuildQuantIo()], extract_delegate_segments=True, ) exec_prog_mgr = edge_prog_mgr.to_executorch(executorch_config) diff --git a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py index ef72e0765fd..0d5052c89bd 100644 --- a/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py +++ b/examples/qualcomm/oss_scripts/llama/wrappers/llm_wrappers.py @@ -19,6 +19,7 @@ import torch from executorch.backends.qualcomm._passes import FoldQDQ, I64toI32, TagQuantIO +from executorch.backends.qualcomm._passes.build_quant_io import BuildQuantIo from executorch.backends.qualcomm._passes.qnn_pass_manager import ( get_capture_program_passes, ) @@ -607,23 +608,28 @@ def quantize(self, request: Request): # noqa: C901 ): return + data = request.method_data[TEXT_DECODER] # check bit width graph io fixed_point_type = {"kv_type": torch.float32, "io_type": torch.float32} - if self.quant_recipe.get_kv_io_bit_width() == 8: - fixed_point_type["kv_type"] = torch.uint8 - elif self.quant_recipe.get_kv_io_bit_width() == 16: - fixed_point_type["kv_type"] = torch.uint16 + if data.skip_quantize: + # already init as float32 + return else: - raise RuntimeError( - f"unknown kv io bit width {self.quant_recipe.get_kv_io_bit_width()}" - ) + if self.quant_recipe.get_kv_io_bit_width() == 8: + fixed_point_type["kv_type"] = torch.uint8 + elif self.quant_recipe.get_kv_io_bit_width() == 16: + fixed_point_type["kv_type"] = torch.uint16 + else: + raise RuntimeError( 
+ f"unknown kv io bit width {self.quant_recipe.get_kv_io_bit_width()}" + ) - if self.quant_recipe.get_logits_output_bit_width() == 16: - fixed_point_type["io_type"] = torch.uint16 - else: - raise RuntimeError( - f"unknown logits io bit width {self.quant_recipe.get_logits_output_bit_width()}" - ) + if self.quant_recipe.get_logits_output_bit_width() == 16: + fixed_point_type["io_type"] = torch.uint16 + else: + raise RuntimeError( + f"unknown logits io bit width {self.quant_recipe.get_logits_output_bit_width()}" + ) data = request.method_data[TEXT_DECODER] audio_turns = request.method_data[ @@ -906,7 +912,11 @@ def compile(self, request: Request): # noqa: C901 # here we use a mechanism to make sure the encoding align correctly and # save AoT quantization time as well. # --- - if self.prefill.decoder is not None and self.prefill.model_args.use_kv_cache: + if ( + self.prefill.decoder is not None + and self.prefill.model_args.use_kv_cache + and not request.method_data[TEXT_DECODER].skip_quantize + ): self._encoding_override( decode_model=self.decode.decoder, prefill_model=self.prefill.decoder, @@ -973,6 +983,7 @@ def compile(self, request: Request): # noqa: C901 alloc_graph_input=False, alloc_graph_output=False, ), + passes=[BuildQuantIo()], ) tok_embedding_exec_prog_mgr = tok_embedding_edge_prog_mgr.to_executorch( executorch_config @@ -1009,6 +1020,7 @@ def compile(self, request: Request): # noqa: C901 alloc_graph_input=False, alloc_graph_output=False, ), + passes=[BuildQuantIo()], ) exec_prog_mgr = edge_prog_mgr.to_executorch(executorch_config) data = request.method_data[TEXT_DECODER] @@ -1127,7 +1139,9 @@ def compile(self, request: Request): if self.control_args.verbose: print_delegation_info(edge_prog_mgr.exported_program().graph_module) - exec_prog_mgr = edge_prog_mgr.to_executorch(ExecutorchBackendConfig()) + exec_prog_mgr = edge_prog_mgr.to_executorch( + ExecutorchBackendConfig(passes=[BuildQuantIo()]) + ) data = request.method_data[self.modality] with open( 
f"{self.control_args.artifact}/{data.pte_filename}.pte", "wb" @@ -1223,6 +1237,7 @@ def compile( self, compile_specs: Dict[str, List[CompileSpec]], pte_filenames: Dict[str, str], + skip_quantize: Dict[str, bool], ): compile_request = Request( inspect.currentframe().f_code.co_name, @@ -1230,6 +1245,7 @@ def compile( m: Request.Data( compile_spec=compile_specs[m], pte_filename=pte_filenames[m], + skip_quantize=skip_quantize[m] if m in skip_quantize else False, ) for m in self._modalities }, diff --git a/exir/passes/spec_prop_pass.py b/exir/passes/spec_prop_pass.py index 9adbf65dd90..73f943e55e0 100644 --- a/exir/passes/spec_prop_pass.py +++ b/exir/passes/spec_prop_pass.py @@ -11,6 +11,7 @@ import torch from executorch.exir.delegate import executorch_call_delegate +from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, ProxyValue from executorch.exir.tensor import TensorSpec from torch.export.exported_program import ExportGraphSignature @@ -18,6 +19,14 @@ from torch.fx.passes.infra.pass_base import PassResult from torch.utils import _pytree as pytree +# register llama.fallback (optional — only needed for QNN/llama sharding paths) +try: + import executorch.extension.llm.custom_ops.op_fallback # noqa: F401 + + _llama_fallback_default = exir_ops.edge.llama.fallback.default +except (ImportError, AttributeError): + _llama_fallback_default = None + # pyre-ignore def make_spec(x): @@ -75,9 +84,9 @@ def get_spec(x): elif node.op == "call_function" and node.target == operator.getitem: value_spec = pytree.tree_map(get_spec, node.args[0]) node.meta["spec"] = value_spec[node.args[1]] - elif ( - node.op == "call_function" - and node.target == executorch_call_delegate + elif node.op == "call_function" and node.target in ( + executorch_call_delegate, + _llama_fallback_default, ): # Note: We currently rely on delegate node specs not being regenerated, # as the spec is set somewhat manually when adding the call delegate node. 
diff --git a/extension/android/jni/jni_layer_llama.cpp b/extension/android/jni/jni_layer_llama.cpp index e072694f913..b9215f978bc 100644 --- a/extension/android/jni/jni_layer_llama.cpp +++ b/extension/android/jni/jni_layer_llama.cpp @@ -206,41 +206,14 @@ class ExecuTorchLlmJni : public facebook::jni::HybridClass { data_files_vector, cpp_load_mode); std::string decoder_model = "llama3"; // use llama3 for now - // Using 8bit as default since this meta is introduced with 16bit kv io - // support and older models only have 8bit kv io. - example::KvBitWidth kv_bitwidth = example::KvBitWidth::kWidth8; - if (module->method_names()->count("get_kv_io_bit_width") > 0) { - kv_bitwidth = static_cast( - module->get("get_kv_io_bit_width") - .get() - .toScalar() - .to()); - } - - if (kv_bitwidth == example::KvBitWidth::kWidth8) { - runner_ = std::make_unique>( - std::move(module), - decoder_model.c_str(), - model_path->toStdString().c_str(), - tokenizer_path->toStdString().c_str(), - "", - "", - temperature_); - } else if (kv_bitwidth == example::KvBitWidth::kWidth16) { - runner_ = std::make_unique>( - std::move(module), - decoder_model.c_str(), - model_path->toStdString().c_str(), - tokenizer_path->toStdString().c_str(), - "", - "", - temperature_); - } else { - ET_CHECK_MSG( - false, - "Unsupported kv bitwidth: %ld", - static_cast(kv_bitwidth)); - } + runner_ = std::make_unique( + std::move(module), + decoder_model.c_str(), + model_path->toStdString().c_str(), + tokenizer_path->toStdString().c_str(), + "", + "", + temperature_); model_type_category_ = MODEL_TYPE_CATEGORY_LLM; #endif #if defined(EXECUTORCH_BUILD_MEDIATEK) diff --git a/extension/llm/custom_ops/model_sharding.py b/extension/llm/custom_ops/model_sharding.py index 6838b0958a2..916b13a90b8 100644 --- a/extension/llm/custom_ops/model_sharding.py +++ b/extension/llm/custom_ops/model_sharding.py @@ -7,8 +7,9 @@ import re from typing import List -import torch +import executorch.extension.llm.custom_ops.op_fallback # 
noqa: F401 +import torch from executorch.backends.qualcomm.utils.constants import ( QCOM_PASS_ACTIVATE_KEY, QCOM_PASS_ARGS_KWARGS_DEFAULTS_KEY, @@ -17,27 +18,6 @@ from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult from torch.export.exported_program import ExportedProgram -from torch.library import impl, Library - - -fallback_op_lib = Library("llama", "DEF") -# registering an operator. -fallback_op_lib.define("fallback(Tensor input) -> Tensor") - - -@impl(fallback_op_lib, "fallback") -def fallback_impl(a: torch.Tensor) -> torch.Tensor: - return a - - -# registering the out variant. -fallback_op_lib.define("fallback.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)") - - -@impl(fallback_op_lib, "fallback.out") -def fallback_out_impl(a: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor: - out.copy_(a) - return out class SplitGraph(ExportPass): diff --git a/extension/llm/custom_ops/op_fallback.py b/extension/llm/custom_ops/op_fallback.py new file mode 100644 index 00000000000..e94c81db51a --- /dev/null +++ b/extension/llm/custom_ops/op_fallback.py @@ -0,0 +1,29 @@ +# Copyright (c) Qualcomm Innovation Center, Inc. +# All rights reserved +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# pyre-ignore-all-errors + +import torch + +from torch.library import impl, Library + +fallback_op_lib = Library("llama", "DEF") +# registering an operator. +fallback_op_lib.define("fallback(Tensor input) -> Tensor") + + +@impl(fallback_op_lib, "fallback") +def fallback_impl(a: torch.Tensor) -> torch.Tensor: + return a + + +# registering the out variant. +fallback_op_lib.define("fallback.out(Tensor input, *, Tensor(a!) 
output) -> Tensor(a!)") + + +@impl(fallback_op_lib, "fallback.out") +def fallback_out_impl(a: torch.Tensor, *, out: torch.Tensor) -> torch.Tensor: + out.copy_(a) + return out From 75fb249849b905c79f243f5f1ed2efe6620f6876 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Tue, 26 May 2026 02:09:16 -0700 Subject: [PATCH 016/103] add cuda allocator to cmake target (#19764) (#19764) Summary: Pull Request resolved: https://github.com/pytorch/executorch/pull/19764 Reviewed By: kirklandsign Differential Revision: D106332819 --- backends/cuda/CMakeLists.txt | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt index 217c893efe5..d56e994eab4 100644 --- a/backends/cuda/CMakeLists.txt +++ b/backends/cuda/CMakeLists.txt @@ -103,7 +103,7 @@ install( ) # CUDA-specific AOTI shim symbols (dynamically linked) -set(_aoti_cuda_shim_sources runtime/shims/memory.cpp +set(_aoti_cuda_shim_sources runtime/cuda_allocator.cpp runtime/shims/memory.cpp runtime/shims/cuda_guard.cpp ) @@ -180,8 +180,12 @@ install( # CUDA backend implementation set(_aoti_cuda_backend_sources runtime/cuda_backend.cpp) +if(_cuda_is_msvc_toolchain) + # MSVC links aoti_cuda_backend into portable_lib without relying on C++ + # symbols exported from aoti_cuda_shims.dll. 
+ list(APPEND _aoti_cuda_backend_sources runtime/cuda_allocator.cpp) +endif() -# CUDA backend implementation add_library(aoti_cuda_backend STATIC ${_aoti_cuda_backend_sources}) target_include_directories( From c5e3e2bb0e8d8591b316d9d9b26ddc3967ae3a6c Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Tue, 26 May 2026 14:50:16 +0200 Subject: [PATCH 017/103] Arm backend: Fix missing init in VGFSetup (#19765) As documented at https://vkdoc.net/man/VkDataGraphPipelineSessionBindPointRequirementARM .sType of VkDataGraphPipelineSessionBindPointRequirementARM should always be set to VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENT_ARM cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Erik Lundell --- backends/arm/runtime/VGFSetup.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/backends/arm/runtime/VGFSetup.cpp b/backends/arm/runtime/VGFSetup.cpp index b62a6b2ec23..307d0ab266e 100644 --- a/backends/arm/runtime/VGFSetup.cpp +++ b/backends/arm/runtime/VGFSetup.cpp @@ -793,9 +793,14 @@ bool VgfRepr::process_vgf( return false; } - vector - bind_point_requirements; - bind_point_requirements.resize(bind_point_count); + vector bind_point_requirements( + bind_point_count, + { + .sType = + VK_STRUCTURE_TYPE_DATA_GRAPH_PIPELINE_SESSION_BIND_POINT_REQUIREMENT_ARM, + .pNext = nullptr, + }); + result = vkGetDataGraphPipelineSessionBindPointRequirementsARM( vk_device, &bind_point_requirements_info, From a89f1b4b2ed977caea66376daa023d0b9bdfb461 Mon Sep 17 00:00:00 2001 From: Per Held Date: Fri, 8 May 2026 15:00:45 +0200 Subject: [PATCH 018/103] Arm backend: Enable CPPCHECK for Cortex-M Enable CPPCHECK for Cortex-M sources and headers. The Cortex-M kernels are registered through generated wrappers, so cppcheck cannot see direct call sites for the exported *_out entry points and reports them as unused.
Keep narrow unusedFunction suppressions for those registration-visible functions. The scratch buffer context header is linted as a standalone header but currently exposes helper API without in-tree call sites, so suppress unusedFunction at file scope there instead of dropping Cortex-M header coverage. Keep the quantize and dequantize context parameters non-const to match the generated kernel ABI; changing them to const changes the mangled symbols used by registration. Signed-off-by: Per Held Change-Id: I3bcb6e5d3f125ae400005d1b033b24a07eb7924f --- .lintrunner.toml | 2 ++ backends/cortex_m/ops/cmsis_scratch_buffer_context.h | 1 + backends/cortex_m/ops/cortex_m_ops_common.h | 4 ++-- backends/cortex_m/ops/op_dequantize_per_tensor.cpp | 1 + backends/cortex_m/ops/op_maximum.cpp | 3 ++- backends/cortex_m/ops/op_minimum.cpp | 3 ++- backends/cortex_m/ops/op_pad.cpp | 1 + backends/cortex_m/ops/op_quantize_per_tensor.cpp | 1 + backends/cortex_m/ops/op_quantized_add.cpp | 4 ++-- backends/cortex_m/ops/op_quantized_avg_pool2d.cpp | 1 + backends/cortex_m/ops/op_quantized_batch_matmul.cpp | 1 + backends/cortex_m/ops/op_quantized_conv2d.cpp | 1 + backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp | 1 + backends/cortex_m/ops/op_quantized_linear.cpp | 1 + backends/cortex_m/ops/op_quantized_max_pool2d.cpp | 1 + backends/cortex_m/ops/op_quantized_mul.cpp | 4 ++-- backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp | 1 + backends/cortex_m/ops/op_softmax.cpp | 1 + backends/cortex_m/ops/op_transpose.cpp | 1 + 19 files changed, 25 insertions(+), 8 deletions(-) diff --git a/.lintrunner.toml b/.lintrunner.toml index 3ee436f61e8..02380ce1356 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -112,6 +112,8 @@ include_patterns = [ 'backends/arm/**/*.cpp', 'backends/arm/**/*.h', 'backends/arm/**/*.hpp', + 'backends/cortex_m/**/*.cpp', + 'backends/cortex_m/**/*.h', 'examples/arm/**/*.cpp', 'examples/arm/**/*.h', 'examples/arm/**/*.hpp', diff --git 
a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h index 4672f05e777..656309abcee 100644 --- a/backends/cortex_m/ops/cmsis_scratch_buffer_context.h +++ b/backends/cortex_m/ops/cmsis_scratch_buffer_context.h @@ -1,3 +1,4 @@ +// cppcheck-suppress-file unusedFunction /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. diff --git a/backends/cortex_m/ops/cortex_m_ops_common.h b/backends/cortex_m/ops/cortex_m_ops_common.h index 4c0f83d6eb6..2e3f49dd861 100644 --- a/backends/cortex_m/ops/cortex_m_ops_common.h +++ b/backends/cortex_m/ops/cortex_m_ops_common.h @@ -113,8 +113,7 @@ inline void validate_quantization_params( const int64_t shift2, const int64_t output_zero_point, const int64_t output_multiplier, - const int64_t output_shift, - Tensor& output) { + const int64_t output_shift) { validate_single_quant_params( zero_point1, multiplier1, shift1, "Single quant Input1"); validate_single_quant_params( @@ -346,6 +345,7 @@ inline bool prepare_cmsis_pool2d_config( // https://github.com/ARM-software/CMSIS-NN/blob/main/Include/arm_nnsupportfunctions.h#L1625 // multiplier: Range {ARM_NN_Q31_MIN + 1, Q32_MAX} // shift : Range {-31, 30} +// cppcheck-suppress unusedFunction inline bool validate_per_channel_quant_params( const Int64ArrayRef multipliers, const Int64ArrayRef shifts, diff --git a/backends/cortex_m/ops/op_dequantize_per_tensor.cpp b/backends/cortex_m/ops/op_dequantize_per_tensor.cpp index ca648f74695..136bce297b0 100644 --- a/backends/cortex_m/ops/op_dequantize_per_tensor.cpp +++ b/backends/cortex_m/ops/op_dequantize_per_tensor.cpp @@ -100,6 +100,7 @@ F dequantize_val(float scale, int32_t zero_point, Q qvalue) { } // namespace Tensor& dequantize_per_tensor_out( + // cppcheck-suppress constParameterReference KernelRuntimeContext& context, const Tensor& input, double scale, diff --git a/backends/cortex_m/ops/op_maximum.cpp b/backends/cortex_m/ops/op_maximum.cpp index 
fc76f5c8c48..936ef273684 100644 --- a/backends/cortex_m/ops/op_maximum.cpp +++ b/backends/cortex_m/ops/op_maximum.cpp @@ -1,5 +1,5 @@ /* - * Copyright 2025 Arm Limited and/or its affiliates. + * Copyright 2025-2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. @@ -12,6 +12,7 @@ namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +// cppcheck-suppress unusedFunction Tensor& maximum_out( KernelRuntimeContext& context, const Tensor& input1, diff --git a/backends/cortex_m/ops/op_minimum.cpp b/backends/cortex_m/ops/op_minimum.cpp index 5a75cb8a1dc..3324a4e39d7 100644 --- a/backends/cortex_m/ops/op_minimum.cpp +++ b/backends/cortex_m/ops/op_minimum.cpp @@ -1,7 +1,7 @@ /* * Copyright (c) Meta Platforms, Inc. and affiliates. * All rights reserved. - * Copyright 2025 Arm Limited and/or its affiliates. + * Copyright 2025-2026 Arm Limited and/or its affiliates. * * This source code is licensed under the BSD-style license found in the * LICENSE file in the root directory of this source tree. 
@@ -14,6 +14,7 @@ namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +// cppcheck-suppress unusedFunction Tensor& minimum_out( KernelRuntimeContext& context, const Tensor& input1, diff --git a/backends/cortex_m/ops/op_pad.cpp b/backends/cortex_m/ops/op_pad.cpp index e59f986c37d..57b5257873e 100644 --- a/backends/cortex_m/ops/op_pad.cpp +++ b/backends/cortex_m/ops/op_pad.cpp @@ -19,6 +19,7 @@ constexpr size_t kMaxSupportedDims = 4; } // namespace +// cppcheck-suppress unusedFunction Tensor& pad_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_quantize_per_tensor.cpp b/backends/cortex_m/ops/op_quantize_per_tensor.cpp index 7809db379c7..d8bb34c6eb4 100644 --- a/backends/cortex_m/ops/op_quantize_per_tensor.cpp +++ b/backends/cortex_m/ops/op_quantize_per_tensor.cpp @@ -97,6 +97,7 @@ Q quantize_val( } // namespace Tensor& quantize_per_tensor_out( + // cppcheck-suppress constParameterReference KernelRuntimeContext& context, const Tensor& input, double scale, diff --git a/backends/cortex_m/ops/op_quantized_add.cpp b/backends/cortex_m/ops/op_quantized_add.cpp index f607977aa48..f93bb6c1be9 100644 --- a/backends/cortex_m/ops/op_quantized_add.cpp +++ b/backends/cortex_m/ops/op_quantized_add.cpp @@ -13,6 +13,7 @@ namespace cortex_m { namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +// cppcheck-suppress unusedFunction Tensor& quantized_add_out( KernelRuntimeContext& context, const Tensor& input1_int8, @@ -49,8 +50,7 @@ Tensor& quantized_add_out( input2_shift, output_zero_point, output_multiplier, - output_shift, - out); + output_shift); ET_LOG( Debug, diff --git a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp index fc04edcc82b..0d22971f89b 100644 --- a/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp +++ b/backends/cortex_m/ops/op_quantized_avg_pool2d.cpp @@ -12,6 +12,7 @@ namespace native { using 
KernelRuntimeContext = torch::executor::KernelRuntimeContext; +// cppcheck-suppress unusedFunction Tensor& quantized_avg_pool2d_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp index 345753ca8fc..fd0859e8b00 100644 --- a/backends/cortex_m/ops/op_quantized_batch_matmul.cpp +++ b/backends/cortex_m/ops/op_quantized_batch_matmul.cpp @@ -63,6 +63,7 @@ bool validate_batch_matmul_arguments( } // namespace +// cppcheck-suppress unusedFunction Tensor& quantized_batch_matmul_out( KernelRuntimeContext& context, const Tensor& lhs, diff --git a/backends/cortex_m/ops/op_quantized_conv2d.cpp b/backends/cortex_m/ops/op_quantized_conv2d.cpp index 8af374c03f8..3d4f19e10d0 100644 --- a/backends/cortex_m/ops/op_quantized_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_conv2d.cpp @@ -98,6 +98,7 @@ bool validate_conv2d_arguments( } } // namespace +// cppcheck-suppress unusedFunction Tensor& quantized_conv2d_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp index 21d4f257501..a8e1fc21ed7 100644 --- a/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_depthwise_conv2d.cpp @@ -135,6 +135,7 @@ bool validate_depthwise_conv2d_arguments( } } // namespace +// cppcheck-suppress unusedFunction Tensor& quantized_depthwise_conv2d_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_quantized_linear.cpp b/backends/cortex_m/ops/op_quantized_linear.cpp index 5d018cbc0c4..7448058de8e 100644 --- a/backends/cortex_m/ops/op_quantized_linear.cpp +++ b/backends/cortex_m/ops/op_quantized_linear.cpp @@ -13,6 +13,7 @@ namespace cortex_m { namespace native { using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +// cppcheck-suppress unusedFunction 
Tensor& quantized_linear_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp index 181a29c1b65..ca1b00ff340 100644 --- a/backends/cortex_m/ops/op_quantized_max_pool2d.cpp +++ b/backends/cortex_m/ops/op_quantized_max_pool2d.cpp @@ -10,6 +10,7 @@ namespace cortex_m { namespace native { +// cppcheck-suppress unusedFunction Tensor& quantized_max_pool2d_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_quantized_mul.cpp b/backends/cortex_m/ops/op_quantized_mul.cpp index 524e74a6b9f..93ce2303d64 100644 --- a/backends/cortex_m/ops/op_quantized_mul.cpp +++ b/backends/cortex_m/ops/op_quantized_mul.cpp @@ -18,6 +18,7 @@ constexpr int32_t kInt8ActivationMax = std::numeric_limits::max(); using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +// cppcheck-suppress unusedFunction Tensor& quantized_mul_out( KernelRuntimeContext& context, const Tensor& input1_int8, @@ -50,8 +51,7 @@ Tensor& quantized_mul_out( kZeroShift, output_zero_point, output_multiplier, - output_shift, - out); + output_shift); // Extract quantization parameters int8_t* input1_ptr = input1_int8.data_ptr(); diff --git a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp index d2b66b18802..e7ecbc7c7b4 100644 --- a/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp +++ b/backends/cortex_m/ops/op_quantized_transpose_conv2d.cpp @@ -83,6 +83,7 @@ bool validate_transpose_conv2d_arguments( } } // namespace +// cppcheck-suppress unusedFunction Tensor& quantized_transpose_conv2d_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_softmax.cpp b/backends/cortex_m/ops/op_softmax.cpp index c07a538db84..97d78d07a05 100644 --- a/backends/cortex_m/ops/op_softmax.cpp +++ b/backends/cortex_m/ops/op_softmax.cpp @@ -36,6 +36,7 @@ inline int64_t 
normalize_dim(const Tensor& tensor, int64_t dim) { } // namespace +// cppcheck-suppress unusedFunction Tensor& softmax_out( KernelRuntimeContext& context, const Tensor& input, diff --git a/backends/cortex_m/ops/op_transpose.cpp b/backends/cortex_m/ops/op_transpose.cpp index 7fcbc034283..9ef144296b7 100644 --- a/backends/cortex_m/ops/op_transpose.cpp +++ b/backends/cortex_m/ops/op_transpose.cpp @@ -22,6 +22,7 @@ constexpr size_t kMaxSupportedDims = 4; } // namespace +// cppcheck-suppress unusedFunction Tensor& transpose_out( KernelRuntimeContext& context, const Tensor& input, From 0bf018f3cce25add0608e6fdd44773bf10cd4209 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 26 May 2026 18:14:17 +0200 Subject: [PATCH 019/103] Add Yolo26 to matrix of tested models on RISC-V (#19741) ### Summary It relates to https://github.com/pytorch/executorch/issues/18833. It doesn't add Yolo on baremetal, but it at least makes sure that it works using Portable Kernels and XNNPACK backends. ### Test plan It's only adding a model to CI, so the CI is the test plan. 
--- .github/workflows/riscv64.yml | 31 ++++++++++++++++--------------- examples/riscv/aot_riscv.py | 33 +++++++++++++++++++++++++++++++++ examples/riscv/requirements.txt | 1 + examples/riscv/setup.sh | 5 ++++- 4 files changed, 54 insertions(+), 16 deletions(-) diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml index 14b9ad62047..a7a5273e2b0 100644 --- a/.github/workflows/riscv64.yml +++ b/.github/workflows/riscv64.yml @@ -28,21 +28,22 @@ jobs: strategy: fail-fast: false matrix: - include: - - { model: add, xnnpack: false, quantize: false } - - { model: add, xnnpack: true, quantize: false } - - { model: mv2, xnnpack: false, quantize: false } - - { model: mv2, xnnpack: true, quantize: false } - - { model: mv2, xnnpack: true, quantize: true } - - { model: mobilebert, xnnpack: false, quantize: false } - - { model: mobilebert, xnnpack: true, quantize: false } - - { model: mobilebert, xnnpack: true, quantize: true } - - { model: llama2, xnnpack: false, quantize: false } - - { model: llama2, xnnpack: true, quantize: false } - - { model: llama2, xnnpack: true, quantize: true } - - { model: resnet18, xnnpack: false, quantize: false } - - { model: resnet18, xnnpack: true, quantize: false } - - { model: resnet18, xnnpack: true, quantize: true } + model: + - add + - mv2 + - mobilebert + - llama2 + - resnet18 + - yolo26 + xnnpack: [true, false] + quantize: [true, false] + exclude: + # We only enable quantization with XNNPACK + - xnnpack: false + quantize: true + # We don't test quantization for Yolo26 + - model: yolo26 + quantize: true permissions: id-token: write contents: read diff --git a/examples/riscv/aot_riscv.py b/examples/riscv/aot_riscv.py index 529e2b1e767..edc30c2653b 100644 --- a/examples/riscv/aot_riscv.py +++ b/examples/riscv/aot_riscv.py @@ -114,12 +114,45 @@ def build_resnet18(): return model, example_inputs, test_inputs, False +def build_yolo26(): + # Mirrors examples/models/yolo26/export_and_validate.py: predict() once + # to 
materialise the predictor state Ultralytics expects pre-export. + import numpy as np + from ultralytics import YOLO + + input_h, input_w = 320, 320 + yolo = YOLO("yolo26n") + yolo.predict( + np.ones((input_h, input_w, 3)), + imgsz=(input_h, input_w), + device="cpu", + ) + + class Wrapper(torch.nn.Module): + def __init__(self): + super().__init__() + self.model = yolo.model.to(torch.device("cpu")).eval() + + def forward(self, x): + # yolo.model emits (predictions, feature_maps) in eval; keep the + # predictions tensor so BundledIO sees a single tensor output. + out = self.model(x) + return out[0] if isinstance(out, (tuple, list)) else out + + model = Wrapper().eval() + torch.manual_seed(0) + example_inputs = (torch.randn(1, 3, input_h, input_w),) + test_inputs = [example_inputs] + return model, example_inputs, test_inputs, False + + MODELS = { "add": build_add, "mv2": build_mv2, "mobilebert": build_mobilebert, "llama2": build_llama2, "resnet18": build_resnet18, + "yolo26": build_yolo26, } diff --git a/examples/riscv/requirements.txt b/examples/riscv/requirements.txt index 273e7156a1d..649696ae65c 100644 --- a/examples/riscv/requirements.txt +++ b/examples/riscv/requirements.txt @@ -1,2 +1,3 @@ torchvision transformers +ultralytics diff --git a/examples/riscv/setup.sh b/examples/riscv/setup.sh index 955c8ca3386..48d5ed27642 100755 --- a/examples/riscv/setup.sh +++ b/examples/riscv/setup.sh @@ -33,7 +33,10 @@ ${SUDO} apt-get install -y --no-install-recommends \ cmake \ file \ ca-certificates \ - qemu-user-static + qemu-user-static \ + libglib2.0-0t64 \ + libxcb1 \ + libgl1 if [[ -n "${GCC_VERSION+x}" ]]; then ${SUDO} update-alternatives --install /usr/bin/riscv64-linux-gnu-gcc riscv64-linux-gnu-gcc /usr/bin/riscv64-linux-gnu-gcc${GCC_VERSION:+-${GCC_VERSION}} 100 From 6128a45130a0e6504c48b8bbdf01259f28ad964c Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 26 May 2026 09:29:07 -0700 Subject: [PATCH 020/103] 
Convert minibench Java files to Kotlin (#19760) Convert BenchmarkActivity, BenchmarkMetric, LlmBenchmark, LlmModelRunner, and ModelRunner from Java to Kotlin. Differential Revision: D106195816 --- .../pytorch/minibench/BenchmarkActivity.java | 136 ------------------ .../pytorch/minibench/BenchmarkActivity.kt | 116 +++++++++++++++ .../pytorch/minibench/BenchmarkMetric.java | 74 ---------- .../org/pytorch/minibench/BenchmarkMetric.kt | 54 +++++++ .../org/pytorch/minibench/LlmBenchmark.java | 123 ---------------- .../org/pytorch/minibench/LlmBenchmark.kt | 91 ++++++++++++ .../org/pytorch/minibench/LlmModelRunner.java | 110 -------------- .../org/pytorch/minibench/LlmModelRunner.kt | 91 ++++++++++++ .../org/pytorch/minibench/ModelRunner.java | 99 ------------- .../java/org/pytorch/minibench/ModelRunner.kt | 90 ++++++++++++ ...xampleUnitTest.java => ExampleUnitTest.kt} | 15 +- 11 files changed, 449 insertions(+), 550 deletions(-) delete mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java create mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.kt delete mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java create mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.kt delete mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.java create mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.kt delete mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.java create mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.kt delete mode 100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java create mode 
100644 extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.kt rename extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/{ExampleUnitTest.java => ExampleUnitTest.kt} (55%) diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java deleted file mode 100644 index 5e1dd48926b..00000000000 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.minibench; - -import android.app.Activity; -import android.content.Intent; -import android.os.Bundle; -import android.os.Handler; -import android.os.HandlerThread; -import android.os.Looper; -import android.system.ErrnoException; -import android.system.Os; -import com.google.gson.Gson; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.List; - -public class BenchmarkActivity extends Activity { - - File mModel; - int mNumIter; - int mNumWarmupIter; - String mTokenizerPath; - float mTemperature; - String mPrompt; - - HandlerThread mHandlerThread; - BenchmarkHandler mHandler; - - List<BenchmarkMetric> mResult; - - @Override - protected void onCreate(Bundle savedInstanceState) { - super.onCreate(savedInstanceState); - - try { - Os.setenv("ADSP_LIBRARY_PATH", getApplicationInfo().nativeLibraryDir, true); - } catch (ErrnoException e) { - finish(); - } - - Intent intent = getIntent(); - File modelDir = new File(intent.getStringExtra("model_dir")); - File model = - Arrays.stream(modelDir.listFiles()) -
.filter(file -> file.getName().endsWith(".pte")) - .findFirst() - .get(); - - int numIter = intent.getIntExtra("num_iter", 50); - int numWarmupIter = intent.getIntExtra("num_warm_up_iter", 10); - String tokenizerPath = intent.getStringExtra("tokenizer_path"); - float temperature = intent.getFloatExtra("temperature", 0.8f); - String prompt = intent.getStringExtra("prompt"); - - mModel = model; - mNumIter = numIter; - mNumWarmupIter = numWarmupIter; - mTokenizerPath = tokenizerPath; - mTemperature = temperature; - mPrompt = prompt; - if (mPrompt == null) { - mPrompt = "The ultimate answer"; - } - mResult = new ArrayList<>(); - - mHandlerThread = new HandlerThread("ModelRunner"); - mHandlerThread.start(); - mHandler = new BenchmarkHandler(mHandlerThread.getLooper(), this); - - mHandler.sendEmptyMessage(BenchmarkHandler.MESSAGE_RUN_BENCHMARK); - } - - void writeResult() { - try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) { - Gson gson = new Gson(); - writer.write(gson.toJson(mResult)); - } catch (IOException e) { - e.printStackTrace(); - } finally { - finish(); - } - } -} - -class BenchmarkHandler extends Handler { - public static int MESSAGE_RUN_BENCHMARK = 1; - public static int MESSAGE_LLM_RUN_BENCHMARK = 2; - - ModelRunner mModelRunner; - BenchmarkActivity mBenchmarkActivity; - - LlmModelRunner mLlmModelRunner; - LlmBenchmark mLlmBenchmark; - - public BenchmarkHandler(Looper looper, BenchmarkActivity benchmarkActivity) { - super(looper); - mModelRunner = new ModelRunner(); - mBenchmarkActivity = benchmarkActivity; - } - - @Override - public void handleMessage(android.os.Message msg) { - if (msg.what == MESSAGE_RUN_BENCHMARK) { - mModelRunner.runBenchmark( - mBenchmarkActivity.mModel, - mBenchmarkActivity.mNumWarmupIter, - mBenchmarkActivity.mNumIter, - mBenchmarkActivity.mResult); - - if (mBenchmarkActivity.mTokenizerPath == null) { - mBenchmarkActivity.writeResult(); - } else { - this.sendEmptyMessage(MESSAGE_LLM_RUN_BENCHMARK); 
- } - } else if (msg.what == MESSAGE_LLM_RUN_BENCHMARK) { - mLlmBenchmark = - new LlmBenchmark( - mBenchmarkActivity, - mBenchmarkActivity.mModel.getPath(), - mBenchmarkActivity.mTokenizerPath, - mBenchmarkActivity.mPrompt, - mBenchmarkActivity.mTemperature, - mBenchmarkActivity.mResult); - } - } -} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.kt b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.kt new file mode 100644 index 00000000000..b1d69c5f24f --- /dev/null +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.kt @@ -0,0 +1,116 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench + +import android.app.Activity +import android.os.Bundle +import android.os.Handler +import android.os.HandlerThread +import android.os.Looper +import android.os.Message +import android.system.Os +import com.google.gson.Gson +import java.io.File +import java.io.FileWriter +import java.io.IOException + +class BenchmarkActivity : Activity() { + + lateinit var model: File + var numIter: Int = 0 + var numWarmupIter: Int = 0 + var tokenizerPath: String? = null + var temperature: Float = 0.8f + var prompt: String = "The ultimate answer" + + private lateinit var handlerThread: HandlerThread + private lateinit var handler: BenchmarkHandler + + val results: MutableList<BenchmarkMetric> = mutableListOf() + + override fun onCreate(savedInstanceState: Bundle?) { + super.onCreate(savedInstanceState) + + try { + Os.setenv("ADSP_LIBRARY_PATH", applicationInfo.nativeLibraryDir, true) + } catch (e: android.system.ErrnoException) { + finish() + return + } + + val intent = intent + val modelDir = File(intent.getStringExtra("model_dir")!!)
+ model = modelDir.listFiles()!!.first { it.name.endsWith(".pte") } + + numIter = intent.getIntExtra("num_iter", 50) + numWarmupIter = intent.getIntExtra("num_warm_up_iter", 10) + tokenizerPath = intent.getStringExtra("tokenizer_path") + temperature = intent.getFloatExtra("temperature", 0.8f) + prompt = intent.getStringExtra("prompt") ?: "The ultimate answer" + + handlerThread = HandlerThread("ModelRunner") + handlerThread.start() + handler = BenchmarkHandler(handlerThread.looper, this) + + handler.sendEmptyMessage(BenchmarkHandler.MESSAGE_RUN_BENCHMARK) + } + + fun writeResult() { + try { + FileWriter("${filesDir}/benchmark_results.json").use { writer -> + writer.write(Gson().toJson(results)) + } + } catch (e: IOException) { + e.printStackTrace() + } finally { + finish() + } + } +} + +private class BenchmarkHandler( + looper: Looper, + private val activity: BenchmarkActivity, +) : Handler(looper) { + + private val modelRunner = ModelRunner() + + override fun handleMessage(msg: Message) { + when (msg.what) { + MESSAGE_RUN_BENCHMARK -> { + modelRunner.runBenchmark( + activity.model, + activity.numWarmupIter, + activity.numIter, + activity.results, + ) + if (activity.tokenizerPath == null) { + activity.writeResult() + } else { + sendEmptyMessage(MESSAGE_LLM_RUN_BENCHMARK) + } + } + MESSAGE_LLM_RUN_BENCHMARK -> { + LlmBenchmark( + activity, + activity.model.path, + activity.tokenizerPath!!, + activity.prompt, + activity.temperature, + activity.results, + ) + } + } + } + + companion object { + const val MESSAGE_RUN_BENCHMARK = 1 + const val MESSAGE_LLM_RUN_BENCHMARK = 2 + } +} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java deleted file mode 100644 index 66ab50550a4..00000000000 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.java +++ /dev/null @@ -1,74 
+0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.minibench; - -import android.app.ActivityManager; -import android.os.Build; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -class BenchmarkMetric { - public static class BenchmarkModel { - // The model name, i.e. stories110M - String name; - String backend; - String quantization; - - public BenchmarkModel(final String name, final String backend, final String quantization) { - this.name = name; - this.backend = backend; - this.quantization = quantization; - } - } - - BenchmarkModel benchmarkModel; - - // The metric name, i.e. TPS - String metric; - - // The actual value and the option target value - double actualValue; - double targetValue; - - public static class DeviceInfo { - // Let's see which information we want to include here - final String device = Build.BRAND; - // The phone model and Android release version - final String arch = Build.MODEL; - final String os = "Android " + Build.VERSION.RELEASE; - final long totalMem = new ActivityManager.MemoryInfo().totalMem; - final long availMem = new ActivityManager.MemoryInfo().availMem; - } - - DeviceInfo deviceInfo = new DeviceInfo(); - - public BenchmarkMetric( - final BenchmarkModel benchmarkModel, - final String metric, - final double actualValue, - final double targetValue) { - this.benchmarkModel = benchmarkModel; - this.metric = metric; - this.actualValue = actualValue; - this.targetValue = targetValue; - } - - // TODO (huydhn): Figure out a way to extract the backend and quantization information from - // the .pte model itself instead of parsing its name - public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) { - final Matcher m = - 
Pattern.compile("(?<name>\\w+)_(?<backend>[\\w\\+]+)_(?<quantization>\\w+)").matcher(model); - if (m.matches()) { - return new BenchmarkMetric.BenchmarkModel( - m.group("name"), m.group("backend"), m.group("quantization")); - } else { - return new BenchmarkMetric.BenchmarkModel(model, "", ""); - } - } -} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.kt b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.kt new file mode 100644 index 00000000000..7bed1ab05c0 --- /dev/null +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkMetric.kt @@ -0,0 +1,54 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench + +import android.app.ActivityManager +import android.os.Build + +class BenchmarkMetric( + val benchmarkModel: BenchmarkModel, + val metric: String, + val actualValue: Double, + val targetValue: Double, +) { + data class BenchmarkModel( + val name: String, + val backend: String, + val quantization: String, + ) + + class DeviceInfo { + val device: String = Build.BRAND + val arch: String = Build.MODEL + val os: String = "Android ${Build.VERSION.RELEASE}" + val totalMem: Long = ActivityManager.MemoryInfo().totalMem + val availMem: Long = ActivityManager.MemoryInfo().availMem + } + + val deviceInfo: DeviceInfo = DeviceInfo() + + companion object { + // TODO (huydhn): Figure out a way to extract the backend and quantization information from + // the .pte model itself instead of parsing its name + @JvmStatic + fun extractBackendAndQuantization(model: String): BenchmarkModel { + val pattern = Regex("(?<name>\\w+)_(?<backend>[\\w+]+)_(?<quantization>\\w+)") + val match = pattern.matchEntire(model) + return if (match != null) { + BenchmarkModel( + match.groups["name"]!!.value, +
match.groups["backend"]!!.value, + match.groups["quantization"]!!.value, + ) + } else { + BenchmarkModel(model, "", "") + } + } + } +} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.java deleted file mode 100644 index 0c0436d2676..00000000000 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.java +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.minibench; - -import android.util.Log; -import java.util.List; -import org.json.JSONException; -import org.json.JSONObject; - -public class LlmBenchmark implements LlmModelRunnerCallback { - LlmModelRunner mLlmModelRunner; - - String mPrompt; - StatsInfo mStatsInfo; - - List<BenchmarkMetric> mResults; - BenchmarkActivity mActivity; - - LlmBenchmark( - BenchmarkActivity activity, - String modelFile, - String tokenizerPath, - String prompt, - float temperature, - List<BenchmarkMetric> results) { - mResults = results; - mActivity = activity; - mStatsInfo = new StatsInfo(); - mStatsInfo.modelName = modelFile.substring(modelFile.lastIndexOf('/') + 1).replace(".pte", ""); - mPrompt = prompt; - mLlmModelRunner = new LlmModelRunner(modelFile, tokenizerPath, temperature, this); - mStatsInfo.loadStart = System.nanoTime(); - } - - @Override - public void onModelLoaded(int status) { - mStatsInfo.loadEnd = System.nanoTime(); - mStatsInfo.loadStatus = status; - if (status != 0) { - Log.e("LlmBenchmarkRunner", "Loaded failed: " + status); - onGenerationStopped(); - return; - } - mStatsInfo.generateStart = System.nanoTime(); - mLlmModelRunner.generate(mPrompt); - } - - @Override - public void onTokenGenerated(String token) {} - - @Override -
public void onStats(String stats) { - float tps = 0; - try { - JSONObject jsonObject = new JSONObject(stats); - int numGeneratedTokens = jsonObject.getInt("generated_tokens"); - int inferenceEndMs = jsonObject.getInt("inference_end_ms"); - int promptEvalEndMs = jsonObject.getInt("prompt_eval_end_ms"); - tps = (float) numGeneratedTokens / (inferenceEndMs - promptEvalEndMs) * 1000; - mStatsInfo.tps = tps; - } catch (JSONException e) { - Log.e("LLM", "Error parsing JSON: " + e.getMessage()); - } - } - - @Override - public void onGenerationStopped() { - mStatsInfo.generateEnd = System.nanoTime(); - - final BenchmarkMetric.BenchmarkModel benchmarkModel = - BenchmarkMetric.extractBackendAndQuantization(mStatsInfo.modelName); - // The list of metrics we have atm includes: - // Load status - mResults.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsInfo.loadStatus, 0)); - // Model load time - mResults.add( - new BenchmarkMetric( - benchmarkModel, - "llm_model_load_time(ms)", - (mStatsInfo.loadEnd - mStatsInfo.loadStart) * 1e-6, - 0.0f)); - // LLM generate time - mResults.add( - new BenchmarkMetric( - benchmarkModel, - "generate_time(ms)", - (mStatsInfo.generateEnd - mStatsInfo.generateStart) * 1e-6, - 0.0f)); - // Token per second - mResults.add(new BenchmarkMetric(benchmarkModel, "token_per_sec", mStatsInfo.tps, 0.0f)); - mActivity.writeResult(); - } -} - -class StatsInfo { - int loadStatus; - long loadStart; - long loadEnd; - long generateStart; - long generateEnd; - float tps; - String modelName; - - @Override - public String toString() { - return "loadStart: " - + loadStart - + "\nloadEnd: " - + loadEnd - + "\ngenerateStart: " - + generateStart - + "\ngenerateEnd: " - + generateEnd - + "\n" - + tps; - } -} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.kt b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.kt new file mode 100644 index 00000000000..5c75519f870 
--- /dev/null +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmBenchmark.kt @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench + +import android.util.Log +import org.json.JSONException +import org.json.JSONObject + +class LlmBenchmark( + private val activity: BenchmarkActivity, + modelFile: String, + tokenizerPath: String, + private val prompt: String, + temperature: Float, + private val results: MutableList<BenchmarkMetric>, +) : LlmModelRunnerCallback { + + private val runner: LlmModelRunner + private val statsInfo = StatsInfo() + + init { + statsInfo.modelName = modelFile.substringAfterLast('/').removeSuffix(".pte") + runner = LlmModelRunner(modelFile, tokenizerPath, temperature, this) + statsInfo.loadStart = System.nanoTime() + } + + override fun onModelLoaded(status: Int) { + statsInfo.loadEnd = System.nanoTime() + statsInfo.loadStatus = status + if (status != 0) { + Log.e("LlmBenchmarkRunner", "Loaded failed: $status") + onGenerationStopped() + return + } + statsInfo.generateStart = System.nanoTime() + runner.generate(prompt) + } + + override fun onTokenGenerated(token: String) {} + + override fun onStats(stats: String) { + try { + val json = JSONObject(stats) + val numGeneratedTokens = json.getInt("generated_tokens") + val inferenceEndMs = json.getInt("inference_end_ms") + val promptEvalEndMs = json.getInt("prompt_eval_end_ms") + statsInfo.tps = numGeneratedTokens.toFloat() / (inferenceEndMs - promptEvalEndMs) * 1000 + } catch (e: JSONException) { + Log.e("LLM", "Error parsing JSON: ${e.message}") + } + } + + override fun onGenerationStopped() { + statsInfo.generateEnd = System.nanoTime() + + val benchmarkModel = BenchmarkMetric.extractBackendAndQuantization(statsInfo.modelName) +
results.add(BenchmarkMetric(benchmarkModel, "load_status", statsInfo.loadStatus.toDouble(), 0.0)) + results.add( + BenchmarkMetric( + benchmarkModel, + "llm_model_load_time(ms)", + (statsInfo.loadEnd - statsInfo.loadStart) * 1e-6, + 0.0, + )) + results.add( + BenchmarkMetric( + benchmarkModel, + "generate_time(ms)", + (statsInfo.generateEnd - statsInfo.generateStart) * 1e-6, + 0.0, + )) + results.add(BenchmarkMetric(benchmarkModel, "token_per_sec", statsInfo.tps.toDouble(), 0.0)) + activity.writeResult() + } +} + +private class StatsInfo { + var loadStatus: Int = 0 + var loadStart: Long = 0 + var loadEnd: Long = 0 + var generateStart: Long = 0 + var generateEnd: Long = 0 + var tps: Float = 0f + var modelName: String = "" +} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.java deleted file mode 100644 index 3a345d3465b..00000000000 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.java +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.minibench; - -import android.os.Handler; -import android.os.HandlerThread; -import android.os.Looper; -import android.os.Message; -import android.util.Log; -import org.pytorch.executorch.extension.llm.LlmCallback; -import org.pytorch.executorch.extension.llm.LlmModule; - -/** A helper class to handle all model running logic within this class. 
*/ -public class LlmModelRunner implements LlmCallback { - LlmModule mModule = null; - - String mModelFilePath = ""; - String mTokenizerFilePath = ""; - - LlmModelRunnerCallback mCallback = null; - - HandlerThread mHandlerThread = null; - Handler mHandler = null; - - /** - * ] Helper class to separate between UI logic and model runner logic. Automatically handle - * generate() request on worker thread. - * - * @param modelFilePath - * @param tokenizerFilePath - * @param callback - */ - LlmModelRunner( - String modelFilePath, - String tokenizerFilePath, - float temperature, - LlmModelRunnerCallback callback) { - mModelFilePath = modelFilePath; - mTokenizerFilePath = tokenizerFilePath; - mCallback = callback; - - mModule = new LlmModule(mModelFilePath, mTokenizerFilePath, 0.8f); - mHandlerThread = new HandlerThread("LlmModelRunner"); - mHandlerThread.start(); - mHandler = new LlmModelRunnerHandler(mHandlerThread.getLooper(), this); - - mHandler.sendEmptyMessage(LlmModelRunnerHandler.MESSAGE_LOAD_MODEL); - } - - int generate(String prompt) { - Message msg = Message.obtain(mHandler, LlmModelRunnerHandler.MESSAGE_GENERATE, prompt); - msg.sendToTarget(); - return 0; - } - - void stop() { - mModule.stop(); - } - - @Override - public void onResult(String result) { - mCallback.onTokenGenerated(result); - } - - @Override - public void onStats(String result) { - mCallback.onStats(result); - } -} - -class LlmModelRunnerHandler extends Handler { - public static int MESSAGE_LOAD_MODEL = 1; - public static int MESSAGE_GENERATE = 2; - - private final LlmModelRunner mLlmModelRunner; - - public LlmModelRunnerHandler(Looper looper, LlmModelRunner llmModelRunner) { - super(looper); - mLlmModelRunner = llmModelRunner; - } - - @Override - public void handleMessage(android.os.Message msg) { - if (msg.what == MESSAGE_LOAD_MODEL) { - int status = 0; - try { - mLlmModelRunner.mModule.load(); - } catch (Exception e) { - status = - (e instanceof 
org.pytorch.executorch.ExecutorchRuntimeException) - ? ((org.pytorch.executorch.ExecutorchRuntimeException) e).getErrorCode() - : -1; - } - mLlmModelRunner.mCallback.onModelLoaded(status); - } else if (msg.what == MESSAGE_GENERATE) { - try { - mLlmModelRunner.mModule.generate((String) msg.obj, mLlmModelRunner); - } catch (Exception e) { - Log.e("LlmModelRunner", "generate() failed", e); - } - mLlmModelRunner.mCallback.onGenerationStopped(); - } - } -} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.kt b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.kt new file mode 100644 index 00000000000..29b9b177fb6 --- /dev/null +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/LlmModelRunner.kt @@ -0,0 +1,91 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.minibench + +import android.os.Handler +import android.os.HandlerThread +import android.os.Looper +import android.os.Message +import android.util.Log +import org.pytorch.executorch.ExecutorchRuntimeException +import org.pytorch.executorch.extension.llm.LlmCallback +import org.pytorch.executorch.extension.llm.LlmModule + +/** A helper class to handle all model running logic within this class. 
*/ +class LlmModelRunner( + modelFilePath: String, + tokenizerFilePath: String, + temperature: Float, + val callback: LlmModelRunnerCallback, +) : LlmCallback { + + val module: LlmModule = LlmModule(modelFilePath, tokenizerFilePath, temperature) + private val handlerThread: HandlerThread = HandlerThread("LlmModelRunner") + private val handler: Handler + + init { + handlerThread.start() + handler = LlmModelRunnerHandler(handlerThread.looper, this) + handler.sendEmptyMessage(LlmModelRunnerHandler.MESSAGE_LOAD_MODEL) + } + + fun generate(prompt: String): Int { + val msg = Message.obtain(handler, LlmModelRunnerHandler.MESSAGE_GENERATE, prompt) + msg.sendToTarget() + return 0 + } + + fun stop() { + module.stop() + } + + override fun onResult(result: String) { + callback.onTokenGenerated(result) + } + + override fun onStats(stats: String) { + callback.onStats(stats) + } +} + +private class LlmModelRunnerHandler( + looper: Looper, + private val runner: LlmModelRunner, +) : Handler(looper) { + + override fun handleMessage(msg: Message) { + when (msg.what) { + MESSAGE_LOAD_MODEL -> { + val status = + try { + runner.module.load() + 0 + } catch (e: ExecutorchRuntimeException) { + e.errorCode + } catch (e: Exception) { + -1 + } + runner.callback.onModelLoaded(status) + } + MESSAGE_GENERATE -> { + try { + runner.module.generate(msg.obj as String, runner) + } catch (e: Exception) { + Log.e("LlmModelRunner", "generate() failed", e) + } + runner.callback.onGenerationStopped() + } + } + } + + companion object { + const val MESSAGE_LOAD_MODEL = 1 + const val MESSAGE_GENERATE = 2 + } +} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java deleted file mode 100644 index 915496a25af..00000000000 --- a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.java +++ /dev/null @@ -1,99 +0,0 @@ -/* - * 
Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.minibench; - -import android.os.Debug; -import java.io.File; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; -import org.pytorch.executorch.Module; - -public class ModelRunner { - /** - * @return list of #BenchmarkMetric - */ - public void runBenchmark( - File model, int numWarmupIter, int numIter, List<BenchmarkMetric> results) { - long pssIdle = Debug.getPss(); - - List<Double> latency = new ArrayList<>(); - - long loadStart = System.nanoTime(); - Module module = Module.load(model.getPath()); - int errorCode = 0; - try { - module.loadMethod("forward"); - } catch (Exception e) { - errorCode = - (e instanceof org.pytorch.executorch.ExecutorchRuntimeException) - ? ((org.pytorch.executorch.ExecutorchRuntimeException) e).getErrorCode() - : -1; - } - long loadEnd = System.nanoTime(); - - final BenchmarkMetric.BenchmarkModel benchmarkModel = - BenchmarkMetric.extractBackendAndQuantization(model.getName().replace(".pte", "")); - - if (errorCode != 0) { - results.add( - new BenchmarkMetric( - benchmarkModel, "model_load_time(ms)", (loadEnd - loadStart) * 1e-6, 0.0f)); - results.add(new BenchmarkMetric(benchmarkModel, "load_status", errorCode, 0)); - module.destroy(); - return; - } - - try { - for (int i = 0; i < numWarmupIter; i++) { - module.forward(); - } - - for (int i = 0; i < numIter; i++) { - long start = System.nanoTime(); - module.forward(); - double forwardMs = (System.nanoTime() - start) * 1e-6; - latency.add(forwardMs); - } - - module.etdump(); - - // Currently the result has large variance from outliers, so only use - // 80% samples in the middle (trimmean 0.2) - Collections.sort(latency); - int resultSize = latency.size(); - List<Double> usedLatencyResults = latency.subList(resultSize / 10, resultSize * 9 / 10); - -
results.add( - new BenchmarkMetric( - benchmarkModel, - "avg_inference_latency(ms)", - latency.stream().mapToDouble(l -> l).average().orElse(0.0f), - 0.0f)); - results.add( - new BenchmarkMetric( - benchmarkModel, - "trimmean_inference_latency(ms)", - usedLatencyResults.stream().mapToDouble(l -> l).average().orElse(0.0f), - 0.0f)); - // Model load time - results.add( - new BenchmarkMetric( - benchmarkModel, "model_load_time(ms)", (loadEnd - loadStart) * 1e-6, 0.0f)); - // Load status - results.add(new BenchmarkMetric(benchmarkModel, "load_status", errorCode, 0)); - // RAM PSS usage - results.add( - new BenchmarkMetric( - benchmarkModel, "ram_pss_usage(mb)", (Debug.getPss() - pssIdle) / 1024, 0)); - } finally { - module.destroy(); - } - } -} diff --git a/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.kt b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.kt new file mode 100644 index 00000000000..0f292b0d900 --- /dev/null +++ b/extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench/ModelRunner.kt @@ -0,0 +1,90 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package org.pytorch.minibench + +import android.os.Debug +import java.io.File +import org.pytorch.executorch.ExecutorchRuntimeException +import org.pytorch.executorch.Module + +class ModelRunner { + + fun runBenchmark( + model: File, + numWarmupIter: Int, + numIter: Int, + results: MutableList<BenchmarkMetric>, + ) { + val pssIdle = Debug.getPss() + val latency = mutableListOf<Double>() + + val loadStart = System.nanoTime() + val module = Module.load(model.path) + var errorCode = 0 + try { + module.loadMethod("forward") + } catch (e: ExecutorchRuntimeException) { + errorCode = e.errorCode + } catch (e: Exception) { + errorCode = -1 + } + val loadEnd = System.nanoTime() + + val benchmarkModel = + BenchmarkMetric.extractBackendAndQuantization(model.name.removeSuffix(".pte")) + + if (errorCode != 0) { + results.add( + BenchmarkMetric(benchmarkModel, "model_load_time(ms)", (loadEnd - loadStart) * 1e-6, 0.0)) + results.add(BenchmarkMetric(benchmarkModel, "load_status", errorCode.toDouble(), 0.0)) + module.destroy() + return + } + + try { + repeat(numWarmupIter) { module.forward() } + + repeat(numIter) { + val start = System.nanoTime() + module.forward() + latency.add((System.nanoTime() - start) * 1e-6) + } + + module.etdump() + + // Currently the result has large variance from outliers, so only use + // 80% samples in the middle (trimmean 0.2) + latency.sort() + val trimmed = latency.subList(latency.size / 10, latency.size * 9 / 10) + + results.add( + BenchmarkMetric( + benchmarkModel, + "avg_inference_latency(ms)", + latency.average(), + 0.0, + )) + results.add( + BenchmarkMetric( + benchmarkModel, + "trimmean_inference_latency(ms)", + trimmed.average(), + 0.0, + )) + results.add( + BenchmarkMetric(benchmarkModel, "model_load_time(ms)", (loadEnd - loadStart) * 1e-6, 0.0)) + results.add(BenchmarkMetric(benchmarkModel, "load_status", errorCode.toDouble(), 0.0)) + results.add( + BenchmarkMetric( + benchmarkModel, "ram_pss_usage(mb)", (Debug.getPss() - pssIdle) / 1024.0, 0.0)) + } finally
{ + module.destroy() + } + } +} diff --git a/extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java b/extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.kt similarity index 55% rename from extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java rename to extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.kt index c6a6a76a4d8..b98a49e4bf9 100644 --- a/extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.java +++ b/extension/benchmark/android/benchmark/app/src/test/java/org/pytorch/minibench/ExampleUnitTest.kt @@ -6,20 +6,19 @@ * LICENSE file in the root directory of this source tree. */ -package org.pytorch.minibench; +package org.pytorch.minibench -import static org.junit.Assert.*; - -import org.junit.Test; +import org.junit.Assert.assertEquals +import org.junit.Test /** * Example local unit test, which will execute on the development machine (host). * - * @see Testing documentation + * @see [Testing documentation](http://d.android.com/tools/testing) */ -public class ExampleUnitTest { +class ExampleUnitTest { @Test - public void addition_isCorrect() { - assertEquals(4, 2 + 2); + fun addition_isCorrect() { + assertEquals(4, 2 + 2) } } From 043c404bf8146391dbc8ff89e732d2479f8c7bb9 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Tue, 26 May 2026 10:21:55 -0700 Subject: [PATCH 021/103] Cortex-M backend: enable Cortex-M0+ builds against Corstone-300 (#19731) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary Extend the Cortex-M cross-CPU build pipeline to Armv6-M by patching two upstream issues that block the Corstone-300 target source and the CMSIS Cortex DFP from building for `cortex-m0plus`: * `core_platform/0003-*.patch` guards the `HardFault_Handler` in `targets/corstone-300/target.cpp`. 
The handler uses an `ite eq` IT-block in inline asm and dereferences the SCB CFSR/BFAR/MMFAR fault-status registers; both are Armv7-M / Armv8-M Mainline only. The patch wraps the rich handler in `__ARM_ARCH_7M__ / 7EM / 8M_MAIN / 8_1M_MAIN` and falls back to a minimal stub on Armv6-M / Armv8-M Baseline (M0/M0+/M23). * `core_software/0002-*.patch` fixes `cmsis.cmake`'s handling of the M0+ device. The Cortex DFP names the device directory and headers `ARMCM0plus` (lowercase suffix), while the device sources (`startup_ARMCM0plus.c`, `system_ARMCM0plus.c`) gate their implementations on the `ARMCM0P` preprocessor macro — three different spellings. The previous `string(TOUPPER ...)` produced `ARMCM0PLUS`: the include path lookup failed and the source files hit their `#error device not specified!` guard. Override `ARM_CPU` to `ARMCM0plus` for the directory + filename and introduce a separate `CMSIS_DEVICE_CPU_DEFINE` set to `ARMCM0P` for the cmsis_startup and cmsis_system compile-definitions; all other cores still drive both paths from the uppercased default. Both patches are layered via the existing `patch_repo` mechanism; the `corstone_utils.cmake` TODO is updated so the deletion plan for 0002 and 0003 is documented together. ### Test Plan Locally validated end-to-end on the Corstone-300 FVP with the `qadd` model: `cortex-m0plus` build links a runner that includes `startup_ARMCM0plus.c` / `system_ARMCM0plus.c` and the patched `target.cpp`, and the FVP run prints `TEST: BundleIO index[0] Test_result: PASS` with all error stats zero. The bundled `libcmsis-nn.a` reports `Tag_CPU_arch: v6S-M` and `Tag_THUMB_ISA_use: Thumb-1` with zero DSP / MVE / saturating instructions, confirming the scalar code path was exercised. Authored with Claude. 
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell --- backends/arm/scripts/corstone_utils.cmake | 11 +-- ...-Guard-HardFault-Handler-for-Armv6-M.patch | 49 ++++++++++++ ...irectory-case-and-compile-define-mis.patch | 77 +++++++++++++++++++ 3 files changed, 132 insertions(+), 5 deletions(-) create mode 100644 examples/arm/ethos-u-setup/core_platform/0003-Guard-HardFault-Handler-for-Armv6-M.patch create mode 100644 examples/arm/ethos-u-setup/core_software/0002-Fix-ARMCM0plus-directory-case-and-compile-define-mis.patch diff --git a/backends/arm/scripts/corstone_utils.cmake b/backends/arm/scripts/corstone_utils.cmake index 58ce4f9a919..34f04ba1225 100644 --- a/backends/arm/scripts/corstone_utils.cmake +++ b/backends/arm/scripts/corstone_utils.cmake @@ -50,11 +50,12 @@ function(fetch_ethos_u_content ETHOS_SDK_PATH ET_DIR_PATH) WORKING_DIRECTORY ${ET_DIR_PATH} ) # Always patch the core_platform repo since this is fast enough. TODO: - # examples/arm/ethos-u-setup/core_platform/0002-*.patch is a transient bridge - # that guards Armv8-M-only MPU init so the source compiles for non-Armv8-M - # Cortex-M cores. Once the same guard lands upstream in ethos-u/core_platform - # and ${core_platform_base_rev} is bumped past that commit, delete the 0002 - # patch. + # examples/arm/ethos-u-setup/core_platform/0002-*.patch and 0003-*.patch are + # transient bridges that guard Armv8-M-only MPU init and the Armv7-M-and-newer + # HardFault handler so the Corstone-300 target source compiles for older + # Cortex-M cores. Once the equivalent guards land upstream in + # ethos-u/core_platform and ${core_platform_base_rev} is bumped past those + # commits, delete the 0002 and 0003 patches. 
set(core_platform_base_rev "26.02") execute_process( COMMAND diff --git a/examples/arm/ethos-u-setup/core_platform/0003-Guard-HardFault-Handler-for-Armv6-M.patch b/examples/arm/ethos-u-setup/core_platform/0003-Guard-HardFault-Handler-for-Armv6-M.patch new file mode 100644 index 00000000000..57a27cb3dee --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/0003-Guard-HardFault-Handler-for-Armv6-M.patch @@ -0,0 +1,49 @@ +From 380045853a133f298cee1bcf0c959b93ea94f9a2 Mon Sep 17 00:00:00 2001 +From: RJ Ascani +Date: Wed, 13 May 2026 15:42:13 -0700 +Subject: [PATCH] Guard HardFault_Handler for Armv6-M / Armv8-M Baseline + +The Corstone-300 HardFault_Handler is written for Armv7-M / Armv8-M +Mainline: it uses an `ite eq` IT-block in inline asm, and dereferences +the SCB CFSR/BFAR/MMFAR fault-status registers. Neither is available +on Armv6-M (Cortex-M0/M0+) or Armv8-M Baseline (Cortex-M23), so the +file fails to compile when the Corstone-300 target source is built +with `-mcpu=cortex-m0plus` to exercise the scalar CMSIS-NN code paths +on the Corstone-300 M55 simulator (an ISA superset). + +Wrap the Mainline-only implementation in +`__ARM_ARCH_7M__ / 7EM / 8M_MAIN / 8_1M_MAIN` and fall back to a +minimal `printf("Hard fault"); exit(1)` stub on Baseline cores. +--- + targets/corstone-300/target.cpp | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/targets/corstone-300/target.cpp b/targets/corstone-300/target.cpp +index bda2248..4aa3eea 100644 +--- a/targets/corstone-300/target.cpp ++++ b/targets/corstone-300/target.cpp +@@ -246,6 +246,11 @@ struct ExcContext { + }; + + void HardFault_Handler() { ++ // Armv6-M (M0/M0+) and Armv8-M Baseline (M23) lack the IT instruction and ++ // the SCB CFSR/BFAR/MMFAR fault-status registers, so the rich handler ++ // can't compile or run there. Fall back to a minimal stub on those cores. 
++#if defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7EM__) || defined(__ARM_ARCH_8M_MAIN__) || \ ++ defined(__ARM_ARCH_8_1M_MAIN__) + int irq; + struct ExcContext *e; + uint32_t sp; +@@ -267,6 +272,9 @@ void HardFault_Handler() { + sp); + printf( + "%11s cfsr=0x%08" PRIx32 " bfar=0x%08" PRIx32 " mmfar=0x%08" PRIx32 "\n", "", SCB->CFSR, SCB->BFAR, SCB->MMFAR); ++#else ++ printf("Hard fault\n"); ++#endif + exit(1); + } + } +-- +2.53.0 + diff --git a/examples/arm/ethos-u-setup/core_software/0002-Fix-ARMCM0plus-directory-case-and-compile-define-mis.patch b/examples/arm/ethos-u-setup/core_software/0002-Fix-ARMCM0plus-directory-case-and-compile-define-mis.patch new file mode 100644 index 00000000000..96dcdd9f29d --- /dev/null +++ b/examples/arm/ethos-u-setup/core_software/0002-Fix-ARMCM0plus-directory-case-and-compile-define-mis.patch @@ -0,0 +1,77 @@ +From 1ee9cf9c956ea6a266fc79dfa62071131f162510 Mon Sep 17 00:00:00 2001 +From: RJ Ascani +Date: Wed, 13 May 2026 15:48:07 -0700 +Subject: [PATCH] Fix ARMCM0plus directory case and compile-define mismatch +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The Cortex DFP names the Cortex-M0+ device directory and headers +`ARMCM0plus` (lowercase suffix), while the device source files +(`startup_ARMCM0plus.c`, `system_ARMCM0plus.c`) gate their +implementations on the `ARMCM0P` preprocessor macro — three different +spellings. `cmsis.cmake` previously did +`string(TOUPPER \"ARMCM\${CPU_NUMBER}\" ARM_CPU)`, producing +`ARMCM0PLUS`: the include path lookup fails and the source files hit +their `#error device not specified!` guard. + +Override `ARM_CPU` to `ARMCM0plus` and introduce a separate +`CMSIS_DEVICE_CPU_DEFINE` set to `ARMCM0P` for the cmsis_startup and +cmsis_system compile-definitions; all other cores still drive both +paths from the uppercased default. 
+--- + cmsis.cmake | 20 ++++++++++++++++++-- + 1 file changed, 18 insertions(+), 2 deletions(-) + +diff --git a/cmsis.cmake b/cmsis.cmake +index 7f2b93f..c49f205 100644 +--- a/cmsis.cmake ++++ b/cmsis.cmake +@@ -23,6 +23,15 @@ endif() + + string(TOUPPER "ARMCM${CPU_NUMBER}" ARM_CPU) + ++# Cortex-M0+ is special: the Cortex DFP names the device directory and headers ++# `ARMCM0plus` (lowercase suffix), while the device sources gate their ++# implementations on the `ARMCM0P` preprocessor macro. Override both so the ++# directory lookup and `#include` resolution succeed; the compile-definition ++# override is applied instead of `CMSIS_DEVICE_CPU_FEATURE` further down. ++if(CPU_NUMBER STREQUAL "0plus") ++ set(ARM_CPU "ARMCM0plus") ++endif() ++ + # Set CPU specific features + if(CMAKE_SYSTEM_PROCESSOR MATCHES "cortex-m33(\\+|$)") + set(ARM_FEATURES "_DSP_FP") +@@ -50,6 +59,13 @@ else() + cmake_path(SET CMSIS_DEVICE_CPU_FEATURE "${ARM_CPU}") + endif() + ++# Macro the device sources gate on. Matches CMSIS_DEVICE_CPU_FEATURE for most ++# cores; Cortex-M0+ keys off `ARMCM0P`, not `ARMCM0plus`. 
++set(CMSIS_DEVICE_CPU_DEFINE "${CMSIS_DEVICE_CPU_FEATURE}") ++if(CPU_NUMBER STREQUAL "0plus") ++ set(CMSIS_DEVICE_CPU_DEFINE "ARMCM0P") ++endif() ++ + target_include_directories(cmsis_device INTERFACE ${CMSIS_DEVICE_PATH}/${ARM_CPU}/Include) + + target_compile_options(cmsis_device INTERFACE +@@ -66,12 +82,12 @@ target_sources(cmsis_startup INTERFACE + set_source_files_properties(${CMSIS_DEVICE_PATH}/${ARM_CPU}/Source/startup_${ARM_CPU}.c + PROPERTIES COMPILE_FLAGS -Wno-redundant-decls) + +-target_compile_definitions(cmsis_startup INTERFACE ${CMSIS_DEVICE_CPU_FEATURE}) ++target_compile_definitions(cmsis_startup INTERFACE ${CMSIS_DEVICE_CPU_DEFINE}) + target_link_libraries(cmsis_startup INTERFACE cmsis_device) + + # CMSIS system + add_library(cmsis_system INTERFACE) + target_sources(cmsis_system INTERFACE + ${CMSIS_DEVICE_PATH}/${ARM_CPU}/Source/system_${ARM_CPU}.c) +-target_compile_definitions(cmsis_system INTERFACE ${CMSIS_DEVICE_CPU_FEATURE}) ++target_compile_definitions(cmsis_system INTERFACE ${CMSIS_DEVICE_CPU_DEFINE}) + target_link_libraries(cmsis_system INTERFACE cmsis_startup) +-- +2.53.0 + From fb3f6eba471ad2f59003b3cd7cb0f5396f0060cd Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Tue, 26 May 2026 11:07:31 -0700 Subject: [PATCH 022/103] Harden against concurrency violations (#19734) (#19734) Differential Revision: D106026285 Pull Request resolved: https://github.com/pytorch/executorch/pull/19734 --- backends/xnnpack/runtime/XNNExecutor.cpp | 52 +++++++++++++++++-- backends/xnnpack/runtime/XNNExecutor.h | 10 ++++ backends/xnnpack/runtime/XNNPACKBackend.cpp | 45 ++++++++++++++-- .../xnnpack/runtime/XNNWorkspaceManager.cpp | 2 + backends/xnnpack/targets.bzl | 2 + .../test/runtime/test_workspace_manager.cpp | 4 ++ backends/xnnpack/test/targets.bzl | 3 ++ 7 files changed, 109 insertions(+), 9 deletions(-) diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 103a8812931..1cba33a91e6 100644 --- 
a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -23,6 +23,28 @@ using executorch::runtime::is_contiguous_dim_order; using executorch::runtime::kTensorDimensionLimit; using executorch::runtime::Span; +namespace { +class InUseGuard { + public: + explicit InUseGuard(std::atomic& flag) : flag_(flag) {} + ~InUseGuard() { + if (!dismissed_) { + flag_.store(false, std::memory_order_release); + } + } + void dismiss() { + dismissed_ = true; + } + + InUseGuard(const InUseGuard&) = delete; + InUseGuard& operator=(const InUseGuard&) = delete; + + private: + std::atomic& flag_; + bool dismissed_ = false; +}; +} // namespace + /** * Initializes the XNNExecutor with the runtime and given number of * inputs/outputs externals_ is resized to the total number of inputs and @@ -71,6 +93,21 @@ ET_NODISCARD Error XNNExecutor::initialize( * delegate->execute() */ ET_NODISCARD Error XNNExecutor::prepare_args(Span args) { + ET_CHECK_MSG( + !destroyed_.load(std::memory_order_acquire), + "XNNExecutor::prepare_args called after destroy"); + + bool was_in_use = in_use_.exchange(true, std::memory_order_acquire); + if (was_in_use) { + ET_LOG(Error, "XNNExecutor::prepare_args called concurrently"); + } + ET_DCHECK_MSG(!was_in_use, "XNNExecutor::prepare_args called concurrently"); + + InUseGuard in_use_guard(in_use_); + if (was_in_use) { + in_use_guard.dismiss(); + } + ET_CHECK_OR_RETURN_ERROR( runtime_ != nullptr, Internal, @@ -142,6 +179,7 @@ ET_NODISCARD Error XNNExecutor::prepare_args(Span args) { return err; } + in_use_guard.dismiss(); return Error::Ok; } @@ -152,6 +190,8 @@ ET_NODISCARD Error XNNExecutor::prepare_args(Span args) { * After which we then execute the runtime through invoke_runtime. 
*/ ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) { + InUseGuard in_use_guard(in_use_); + ET_CHECK_OR_RETURN_ERROR( runtime_ != nullptr, Internal, @@ -160,11 +200,13 @@ ET_NODISCARD Error XNNExecutor::forward(BackendExecutionContext& context) { xnn_status status = xnn_setup_runtime_v2( runtime_.get(), externals_.size(), externals_.data()); - ET_CHECK_OR_RETURN_ERROR( - status == xnn_status_success, - Internal, - "Internal Error: Setting up the runtime failed with code: %s", - xnn_status_to_string(status)); + if (status != xnn_status_success) { + ET_LOG( + Error, + "Internal Error: Setting up the runtime failed with code: %s", + xnn_status_to_string(status)); + return Error::Internal; + } auto error = profiler_.start(context.event_tracer()); if (error != Error::Ok) { diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index fa7c8360be4..0af8b6056b0 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -36,11 +37,20 @@ class XNNExecutor { std::vector externals_; std::vector packed_data_names_; std::shared_ptr workspace_; + std::atomic in_use_{false}; + std::atomic destroyed_{false}; public: XNNExecutor(std::shared_ptr workspace) : workspace_(workspace) {} + ~XNNExecutor() { + ET_CHECK_MSG( + !in_use_.load(std::memory_order_acquire), + "XNNExecutor destroyed while in use"); + destroyed_.store(true, std::memory_order_release); + } + inline size_t getNumInputs() { return input_ids_.size(); } diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index c20fa985f46..a02cf98771b 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -16,6 +16,7 @@ #include #include +#include #include #include @@ -129,6 +130,17 @@ class XnnpackBackend final Error, "XNNCompiler::compileModel failed: 0x%x", (unsigned 
int)err); return err; } + + ET_LOG( + Info, + "XnnpackBackend::init delegate=%p workspace_id=%" PRIu64 + " workspace_ptr=%p program_id=0x%" PRIxPTR " weight_cache=%s", + (void*)executor, + workspace->id(), + (void*)workspace_ptr, + program_id, + use_weight_cache ? "true" : "false"); + return executor; } @@ -138,13 +150,23 @@ class XnnpackBackend final Span args) const override { auto executor = static_cast(handle); + auto workspace = executor->get_workspace(); + ET_LOG( + Info, + "XnnpackBackend::execute begin delegate=%p workspace_id=%" PRIu64 + " num_args=%zu weight_cache=%s", + (void*)executor, + workspace->id(), + (size_t)args.size(), + executor->uses_weight_cache() ? "true" : "false"); + std::unique_lock lock_weights_cache( weights_cache_mutex_, std::defer_lock); if (executor->uses_weight_cache()) { lock_weights_cache.lock(); } - auto [raii_lock, _] = executor->get_workspace()->acquire(); + auto [raii_lock, _] = workspace->acquire(); // Prepare Inputs/Outputs and Propagate Input Shapes Error err = executor->prepare_args(args); @@ -161,20 +183,36 @@ class XnnpackBackend final // Convert output data types if necessary (e.g., int32 -> int64 for Long) err = executor->convert_outputs(args); + ET_LOG( + Info, + "XnnpackBackend::execute end delegate=%p workspace_id=%" PRIu64 + " err=0x%x", + (void*)executor, + workspace->id(), + (unsigned int)err); + return err; } void destroy(DelegateHandle* handle) const override { if (handle != nullptr) { auto executor = static_cast(handle); + auto workspace = executor->get_workspace(); + + ET_LOG( + Info, + "XnnpackBackend::destroy delegate=%p workspace_id=%" PRIu64, + (void*)executor, + workspace->id()); + + const std::lock_guard lock_weights_cache( + weights_cache_mutex_); #ifdef ENABLE_XNNPACK_PROFILING executor->print_avg_op_timings(); #endif if (executor->uses_weight_cache()) { - const std::lock_guard lock_weights_cache( - weights_cache_mutex_); weights_cache_->delete_packed_data(executor->get_packed_data_names()); } @@ 
-183,7 +221,6 @@ class XnnpackBackend final // the same backend instance. Make sure to hold onto the workspace // shared_ptr, as the pointer in the executor is freed, which includes // the mutex referenced by raii_lock. - auto workspace = executor->get_workspace(); auto [raii_lock, _] = workspace->acquire(); // XNNExecutor is not trivially destructible. Since this was constructed diff --git a/backends/xnnpack/runtime/XNNWorkspaceManager.cpp b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp index d3550da5cc7..e115074a108 100644 --- a/backends/xnnpack/runtime/XNNWorkspaceManager.cpp +++ b/backends/xnnpack/runtime/XNNWorkspaceManager.cpp @@ -61,7 +61,9 @@ XNNWorkspaceManager::get_or_create_workspace( return create_result.error(); } +#ifndef XNNPACK_WORKSPACE_ALWAYS_LOCK create_result.get()->disable_locking(); +#endif return create_result.get(); } else if (mode == WorkspaceSharingMode::PerModel) { return get_or_create_model_workspace(program_id); diff --git a/backends/xnnpack/targets.bzl b/backends/xnnpack/targets.bzl index 868e68e5b8c..b3af589df10 100644 --- a/backends/xnnpack/targets.bzl +++ b/backends/xnnpack/targets.bzl @@ -14,6 +14,8 @@ def _get_preprocessor_flags(): if native.read_config("executorch", "xnnpack_weights_cache", "0") != "0": preprocessor_flags.append("-DENABLE_XNNPACK_WEIGHTS_CACHE") + preprocessor_flags.append("-DXNNPACK_WORKSPACE_ALWAYS_LOCK") + # Enable if not disabled through config return preprocessor_flags diff --git a/backends/xnnpack/test/runtime/test_workspace_manager.cpp b/backends/xnnpack/test/runtime/test_workspace_manager.cpp index a7689966635..a239d19b415 100644 --- a/backends/xnnpack/test/runtime/test_workspace_manager.cpp +++ b/backends/xnnpack/test/runtime/test_workspace_manager.cpp @@ -116,7 +116,11 @@ TEST_F(XNNWorkspaceManagerTest, DisabledModeAcquireDoesNotLock) { auto [lock, ptr] = workspace->acquire(); ASSERT_NE(ptr, nullptr); +#ifdef XNNPACK_WORKSPACE_ALWAYS_LOCK + EXPECT_TRUE(lock.owns_lock()); +#else 
EXPECT_FALSE(lock.owns_lock()); +#endif } TEST_F(XNNWorkspaceManagerTest, PerModelMode) { diff --git a/backends/xnnpack/test/targets.bzl b/backends/xnnpack/test/targets.bzl index 812986a12e6..d690e1c9dcd 100644 --- a/backends/xnnpack/test/targets.bzl +++ b/backends/xnnpack/test/targets.bzl @@ -96,6 +96,9 @@ def define_common_targets(): runtime.cxx_test( name = "test_workspace_manager", srcs = ["runtime/test_workspace_manager.cpp"], + preprocessor_flags = [ + "-DXNNPACK_WORKSPACE_ALWAYS_LOCK", + ], deps = [ third_party_dep("XNNPACK"), "//executorch/backends/xnnpack:xnnpack_backend", From 50ee05ec1533ac61724ef0d3e4913b77af04faf6 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 26 May 2026 14:00:32 -0700 Subject: [PATCH 023/103] Convert Experimental, DType, MethodMetadata from Java to Kotlin Differential Revision: D106394605 Pull Request resolved: https://github.com/pytorch/executorch/pull/19775 --- extension/android/BUCK | 10 ++-- .../executorch/{DType.java => DType.kt} | 26 +++------ .../pytorch/executorch/MethodMetadata.java | 34 ----------- .../org/pytorch/executorch/MethodMetadata.kt | 12 ++++ .../{Experimental.java => Experimental.kt} | 7 ++- .../executorch/annotations/package-info.java | 2 - .../org/pytorch/executorch/package-info.java | 57 ------------------- 7 files changed, 31 insertions(+), 117 deletions(-) rename extension/android/executorch_android/src/main/java/org/pytorch/executorch/{DType.java => DType.kt} (77%) delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt rename extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/{Experimental.java => Experimental.kt} (68%) delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/package-info.java delete 
mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/package-info.java diff --git a/extension/android/BUCK b/extension/android/BUCK index 110b428575d..bae5579b2a8 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -8,17 +8,19 @@ non_fbcode_target(_kind = fb_android_library, warnings_as_errors = False, required_for_source_only_abi = True, srcs = [ - "executorch_android/src/main/java/org/pytorch/executorch/DType.java", + "executorch_android/src/main/java/org/pytorch/executorch/DType.kt", "executorch_android/src/main/java/org/pytorch/executorch/EValue.java", "executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.java", "executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.java", - "executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.java", + "executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt", "executorch_android/src/main/java/org/pytorch/executorch/Module.java", "executorch_android/src/main/java/org/pytorch/executorch/Tensor.java", - "executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java", + "executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt", ], autoglob = False, - language = "JAVA", + language = "KOTLIN", + pure_kotlin = False, + extra_kotlinc_arguments = ["-Xjvm-default=all"], deps = [ "//fbandroid/java/com/facebook/jni:jni", "//fbandroid/libraries/soloader/java/com/facebook/soloader/nativeloader:nativeloader", diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.kt similarity index 77% rename from extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.kt index 3aca4871d64..a58baa34b60 100644 --- 
a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/DType.kt @@ -6,17 +6,17 @@ * LICENSE file in the root directory of this source tree. */ -package org.pytorch.executorch; +package org.pytorch.executorch -import org.pytorch.executorch.annotations.Experimental; +import org.pytorch.executorch.annotations.Experimental /** * Codes representing tensor data types. * - *

Warning: These APIs are experimental and subject to change without notice + * Warning: These APIs are experimental and subject to change without notice */ @Experimental -public enum DType { +enum class DType(@JvmField val jniCode: Int) { // NOTE: "jniCode" must be kept in sync with scalar_type.h. // NOTE: Never serialize "jniCode", because it can change between releases. @@ -68,18 +68,10 @@ public enum DType { BITS16(22), ; - final int jniCode; - - DType(int jniCode) { - this.jniCode = jniCode; - } - - public static DType fromJniCode(int jniCode) { - for (DType dtype : values()) { - if (dtype.jniCode == jniCode) { - return dtype; - } - } - throw new IllegalArgumentException("No DType found for jniCode " + jniCode); + companion object { + @JvmStatic + fun fromJniCode(jniCode: Int): DType = + entries.find { it.jniCode == jniCode } + ?: throw IllegalArgumentException("No DType found for jniCode $jniCode") } } diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.java deleted file mode 100644 index a46b27ab39e..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.java +++ /dev/null @@ -1,34 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch; - -/** Immutable metadata for a method in a Module. 
*/ -public class MethodMetadata { - private final String mName; - private final String[] mBackends; - - MethodMetadata(String name, String[] backends) { - mName = name; - mBackends = backends; - } - - /** - * @return Method name - */ - public String getName() { - return mName; - } - - /** - * @return Backends used for this method - */ - public String[] getBackends() { - return mBackends; - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt new file mode 100644 index 00000000000..2f25f32c92f --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt @@ -0,0 +1,12 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch + +/** Immutable metadata for a method in a Module. */ +class MethodMetadata internal constructor(val name: String, val backends: Array) diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt similarity index 68% rename from extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java rename to extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt index f5f36fc56da..1a38bb13b99 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.java +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt @@ -6,13 +6,14 @@ * LICENSE file in the root directory of this source tree. 
*/ -package org.pytorch.executorch.annotations; +package org.pytorch.executorch.annotations /** * This annotation indicates that an API is experimental and may change or be removed at any time. * It does not provide any guarantees for API stability or backward-compatibility. * - *

This status is not permanent, and APIs marked with this annotation will need to be either made + * This status is not permanent, and APIs marked with this annotation will need to be either made * more robust or removed in the future. */ -public @interface Experimental {} +@Retention(AnnotationRetention.BINARY) +annotation class Experimental diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/package-info.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/package-info.java deleted file mode 100644 index 2173a04c69d..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/package-info.java +++ /dev/null @@ -1,2 +0,0 @@ -/** Annotations used by ExecuTorch Android Java/JNI package. */ -package org.pytorch.executorch.annotations; diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/package-info.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/package-info.java deleted file mode 100644 index 7a5ed0bb5a5..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/package-info.java +++ /dev/null @@ -1,57 +0,0 @@ -/** - * ExecuTorch Android Java API. - * - *

This package provides Java bindings for running ExecuTorch models on Android. Use these - * classes to load a {@code .pte} model file and run inference directly from your Java or Kotlin - * Android app — no C++ required. - * - *

Quick Start

- * - *

Step 1. Add the dependency to your {@code app/build.gradle.kts}: - * - *

{@code
- * dependencies {
- *     implementation("org.pytorch:executorch-android:${executorch_version}")
- * }
- * }
- * - *

Step 2. Load your model and run inference: - * - *

{@code
- * import org.pytorch.executorch.EValue;
- * import org.pytorch.executorch.Module;
- * import org.pytorch.executorch.Tensor;
- *
- * // Load your exported .pte model file
- * Module module = Module.load("/data/local/tmp/model.pte");
- *
- * // Build an input tensor  e.g. a 1x3x224x224 image
- * float[] inputData = new float[1 * 3 * 224 * 224];
- * Tensor inputTensor = Tensor.fromBlob(inputData, new long[]{1, 3, 224, 224});
- *
- * // Run inference
- * EValue[] output = module.forward(EValue.from(inputTensor));
- *
- * // Read the result
- * float[] scores = output[0].toTensor().getDataAsFloatArray();
- * }
- * - *

Key Classes

- * - *
    - *
  • {@link org.pytorch.executorch.Module} — load and run a {@code .pte} model - *
  • {@link org.pytorch.executorch.Tensor} — create input tensors and read outputs - *
  • {@link org.pytorch.executorch.EValue} — wrap inputs and unwrap outputs - *
  • {@link org.pytorch.executorch.DType} — supported data types (FLOAT, INT32, etc.) - *
- * - *

More Resources

- * - * - */ -package org.pytorch.executorch; From 5d36c7c953f58eb7807a0ef45c83b13ab8881da3 Mon Sep 17 00:00:00 2001 From: roman-janik-nxp Date: Tue, 26 May 2026 23:27:14 +0200 Subject: [PATCH 024/103] =?UTF-8?q?NXP=20backend:=20Improve=20docs=20for?= =?UTF-8?q?=20NXP=20eIQ=20Neutron=20Kernel=20Selective=20Kernel=E2=80=A6?= =?UTF-8?q?=20(#19772)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … Registration ### Summary Docs improvement. ### Test plan Docs only. cc @robert-kalmar @JakeStevens @digantdesai @rascani --- .../backends/nxp/nxp-kernel-selection.md | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/source/backends/nxp/nxp-kernel-selection.md b/docs/source/backends/nxp/nxp-kernel-selection.md index 3ff61323694..307f06d1d02 100644 --- a/docs/source/backends/nxp/nxp-kernel-selection.md +++ b/docs/source/backends/nxp/nxp-kernel-selection.md @@ -1,25 +1,25 @@ # NXP eIQ Neutron Kernel Selective Kernel Registration -The NXP ExecuTorch backend supports selective Neutron kernel registration for `Neutron-C` targets, which decreases the +The NXP ExecuTorch backend supports selective Neutron kernel registration for `Neutron-C` targets, which reduces the size of the Neutron Firmware. During the backend's conversion to the Neutron representation by the Neutron Converter, microcode for the Neutron accelerator is generated. The microcode consists of kernel calls executed by the Neutron Driver. The code for kernel call functions is -distributed in Neutron Firmware. +distributed in the Neutron Firmware. -The `eiq_neutron_sdk.neutron_converter` optionally generates the `*_kernel_selection.c` file, registering -only kernels that are required for a particular model or in the case of ExecuTorch, a delegated subgraph. 
This -`*_kernel_selection.c`, when used during the application linking, takes precedence over the default list of registered +The `eiq_neutron_sdk.neutron_converter` optionally generates a `*_kernel_selection.c` file, registering +only kernels that are required for a particular model or, in the case of ExecuTorch, a delegated subgraph. This +`*_kernel_selection.c`, when used during application linking, takes precedence over the default list of registered kernels in the Neutron Firmware, and allows the linker to include only the necessary Neutron kernels. -This software is required for deployment on an edge device (e.g. `i.MXRT700`) and is -distributed via the MCUXpresso SDK. The MCUXpresso SDK enables building of a final application that is then flashed on +The Neutron Firmware is required for deployment on an edge device (e.g. `i.MX RT700`) and is +distributed via the MCUXpresso SDK. The MCUXpresso SDK enables the building of a final application that is then flashed on the edge device. For more details about this process, see [eIQ ExecuTorch Library User Guide](https://mcuxpresso.nxp.com/mcuxsdk/latest/html/middleware/eiq/executorch/docs/nxp/ugindex.html). -By default, for Neutron-C targets like `i.MXRT700`, all kernel implementations are present in the Neutron Firmware, which +By default, for Neutron-C targets like `i.MX RT700`, all kernel implementations are present in the Neutron Firmware, which is linked to the final application. This enables an easy build process for any model, but increases the size of the -final application with unused code. In the case of limited RAM, you can link only kernels that are used in the set of -models deployed. This way you can reduce the size of the final app by linking only selected kernels, used in one or -multiple models. +final application with unused code. In memory-constrained environments, you can link only the kernels required by the +deployed models. 
This way you can reduce the size of the final application by linking only selected kernels, used in one +or more models. The feature works as follows: The Neutron Converter with the appropriate flag exports a kernel selection file for each converted subgraph, the kernel selection files are then merged and ready to be included in the MCUXpresso SDK to use for @@ -30,7 +30,7 @@ a selection-only build. ## Export kernel selection file -To turn on this feature on the side of NXP ExecuTorch backend, use the parameter `--dump_kernel_selection_code` in +To enable this feature in the NXP ExecuTorch backend, use the parameter `--dump_kernel_selection_code` in `aot_neutron_compile.py`. An example with the CifarNet model: ```commandline @@ -43,7 +43,7 @@ This command will create a `*_kernel_selection.c` file alongside the converted P ## Kernel Registration for Multiple Models -If you want to use or experiment with multiple models in one application while having reduced kernel set, you can +If you want to use or experiment with multiple models in one application while having a reduced kernel set, you can create one kernel selection file with the script `merge_kernel_selection_code.py`: ```commandline From cedfd486dc6bcc7fef3015d1b949c958a247c4ec Mon Sep 17 00:00:00 2001 From: Per Held Date: Tue, 26 May 2026 23:43:37 +0200 Subject: [PATCH 025/103] Arm backend: Validate TOSA resize parameters (#19757) Re-upload with BUCK changes. Share TOSA RESIZE parameter validation between upsample support checks and fake RESIZE lowering so invalid nearest and bilinear resize parameters are rejected before delegation. 
Change-Id: I57c267aca96d733879ae90329267e44adce399c6 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Per Held --- backends/arm/operator_support/TARGETS | 1 + .../arm/operator_support/upsample_support.py | 82 ++++-- .../misc/tosa_dialect/test_tosa_resize.py | 26 +- .../arm/test/ops/test_upsample_nearest2d.py | 11 + backends/arm/tosa/BUCK | 11 + backends/arm/tosa/dialect/BUCK | 1 + backends/arm/tosa/dialect/ops/resize.py | 62 ++--- backends/arm/tosa/resize_utils.py | 259 ++++++++++++++++++ 8 files changed, 389 insertions(+), 64 deletions(-) create mode 100644 backends/arm/tosa/resize_utils.py diff --git a/backends/arm/operator_support/TARGETS b/backends/arm/operator_support/TARGETS index 8f6721bd911..a2fd054d472 100644 --- a/backends/arm/operator_support/TARGETS +++ b/backends/arm/operator_support/TARGETS @@ -6,6 +6,7 @@ runtime.python_library( deps = [ "//executorch/backends/arm:constants", "//executorch/backends/arm/_passes:passes", + "//executorch/backends/arm/tosa:resize_utils", "//executorch/backends/arm/tosa:tosa", "//executorch/backends/transforms:remove_getitem_op", "//executorch/backends/xnnpack/_passes:xnnpack_passes", diff --git a/backends/arm/operator_support/upsample_support.py b/backends/arm/operator_support/upsample_support.py index bd03a4d2b4f..42e88f08521 100644 --- a/backends/arm/operator_support/upsample_support.py +++ b/backends/arm/operator_support/upsample_support.py @@ -13,9 +13,53 @@ SupportedTOSAOperatorCheck, ) from executorch.backends.arm.tosa import TosaSpecification +from executorch.backends.arm.tosa.resize_utils import get_tosa_resize_validation_error from executorch.exir.dialects._ops import ops as exir_ops +def _is_upsample_node_tosa_supported( + support_check: SupportedTOSAOperatorCheck, + node: fx.Node, + tosa_spec: TosaSpecification, + *, + align_corners: bool, +) -> bool: + input_node = ensure_type(fx.Node, node.args[0]) + input_size_yx = 
get_first_fake_tensor(input_node).shape[2:] + output_size_yx = get_first_fake_tensor(node).shape[2:] + + try: + scale_y_n, scale_y_d, offset_y, border_y = ( + RewriteUpsamplePass.get_resize_parameters_1d( + input_size_yx[0], output_size_yx[0], align_corners + ) + ) + scale_x_n, scale_x_d, offset_x, border_x = ( + RewriteUpsamplePass.get_resize_parameters_1d( + input_size_yx[1], output_size_yx[1], align_corners + ) + ) + except RuntimeError as err: + support_check.reporter.report_reject(node, str(err)) + return False + + # Validate the exact TOSA RESIZE parameters that RewriteUpsamplePass will + # emit so support checks and fake-op validation reject the same cases. + validation_error = get_tosa_resize_validation_error( + input_hw=input_size_yx, + output_hw=output_size_yx, + scale=[scale_y_n, scale_y_d, scale_x_n, scale_x_d], + offset=[offset_y, offset_x], + border=[border_y, border_x], + tosa_spec=tosa_spec, + ) + if validation_error is not None: + support_check.reporter.report_reject(node, validation_error) + return False + + return True + + @register_tosa_support_check class UpsampleNearest2dSupported(SupportedTOSAOperatorCheck): """Provide the explicit TOSA support gate for nearest upsample.""" @@ -23,9 +67,11 @@ class UpsampleNearest2dSupported(SupportedTOSAOperatorCheck): targets = [exir_ops.edge.aten.upsample_nearest2d.vec] def is_node_tosa_supported( - self, _node: fx.Node, _tosa_spec: TosaSpecification + self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] - return True + return _is_upsample_node_tosa_supported( + self, node, tosa_spec, align_corners=False + ) @register_tosa_support_check @@ -37,33 +83,9 @@ class UpsampleBilinear2dSupported(SupportedTOSAOperatorCheck): targets = [exir_ops.edge.aten.upsample_bilinear2d.vec] def is_node_tosa_supported( - self, node: fx.Node, _tosa_spec: TosaSpecification + self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] - input_node = 
ensure_type(fx.Node, node.args[0]) align_corners = ensure_type(bool, node.args[2]) - input_size_yx = get_first_fake_tensor(input_node).shape[2:] - output_size_yx = get_first_fake_tensor(node).shape[2:] - - try: - scale_y_n, scale_y_d, _, _ = RewriteUpsamplePass.get_resize_parameters_1d( - input_size_yx[0], output_size_yx[0], align_corners - ) - scale_x_n, scale_x_d, _, _ = RewriteUpsamplePass.get_resize_parameters_1d( - input_size_yx[1], output_size_yx[1], align_corners - ) - except RuntimeError as err: - self.reporter.report_reject(node, str(err)) - return False - - # get_resize_parameters_1d() returns the TOSA RESIZE scale fraction for - # each spatial dimension. For align_corners=False, this is the effective - # output_size / input_size ratio, so the 1/16 boundary is checked - # directly in the same representation that RESIZE lowering will use. - if scale_y_d >= 16 * scale_y_n or scale_x_d >= 16 * scale_x_n: - self.reporter.report_reject( - node, - "Bilinear RESIZE downscale must be strictly greater than 1/16", - ) - return False - - return True + return _is_upsample_node_tosa_supported( + self, node, tosa_spec, align_corners=align_corners + ) diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py b/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py index d9d8b89feb6..0a90de5c0c0 100644 --- a/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py +++ b/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py @@ -33,13 +33,14 @@ def _expr(sym: torch.SymInt) -> sympy.Expr: return sympy.sympify(getattr(sym.node, "expr", sym.node._expr)) -def test_bilinear_resize_rejects_exact_one_sixteenth_downscale(): +@pytest.mark.parametrize("resize_mode", ("nearest", "bilinear")) +def test_resize_rejects_exact_one_sixteenth_downscale(resize_mode: str): with TosaLoweringContext( TosaSpecification.create_from_string("TOSA-1.0+INT") ), FakeTensorMode() as mode: with pytest.raises( TosaValueError, - match="Bilinear RESIZE downscale must be strictly greater than 
1/16", + match="RESIZE downscale must be strictly greater than 1/16", ): exir_ops.backend.tosa.RESIZE.default( mode.from_tensor( @@ -48,7 +49,26 @@ def test_bilinear_resize_rejects_exact_one_sixteenth_downscale(): [2, 32, 2, 32], [15, 15], [-15, -15], - resize_mode="bilinear", + resize_mode=resize_mode, + ) + + +def test_resize_rejects_scale_numerator_over_tosa_limit(): + with TosaLoweringContext( + TosaSpecification.create_from_string("TOSA-1.0+INT") + ), FakeTensorMode() as mode: + with pytest.raises( + TosaValueError, + match="RESIZE scale numerator must be <= 2048", + ): + exir_ops.backend.tosa.RESIZE.default( + mode.from_tensor(torch.randint(0, 10, (1, 3, 4, 2), dtype=torch.int8)), + # 2049 violates scale_n <= 1 << 11, while 2049/2 still stays + # within MAX_SCALE so this test isolates the numerator rule. + [2049, 2, 4, 2], + [0, 0], + [0, 0], + resize_mode="nearest", ) diff --git a/backends/arm/test/ops/test_upsample_nearest2d.py b/backends/arm/test/ops/test_upsample_nearest2d.py index 5781e4ed29d..d8bf4d7dbd5 100644 --- a/backends/arm/test/ops/test_upsample_nearest2d.py +++ b/backends/arm/test/ops/test_upsample_nearest2d.py @@ -198,6 +198,17 @@ def test_upsample_nearest2d_vec_tosa_FP_interpolate(test_data: torch.Tensor): pipeline.run() +def test_upsample_nearest2d_vec_tosa_does_not_delegate_exact_one_sixteenth_downscale(): + pipeline = OpNotSupportedPipeline[input_t1]( + Interpolate(size=None, scale_factor=1.0 / 16.0), + (torch.randn(1, 3, 256, 448),), + {exir_op: 1}, + n_expected_delegates=0, + ) + + pipeline.run() + + @common.parametrize("test_data", test_data_suite) def test_upsample_nearest2d_vec_tosa_INT(test_data: torch.Tensor): test_data, size, scale_factor, compare_outputs = test_data() diff --git a/backends/arm/tosa/BUCK b/backends/arm/tosa/BUCK index 46ff6648c54..81d1f62437f 100644 --- a/backends/arm/tosa/BUCK +++ b/backends/arm/tosa/BUCK @@ -41,6 +41,17 @@ fbcode_target(_kind = runtime.python_library, ], ) +fbcode_target(_kind = 
runtime.python_library, + name = "resize_utils", + srcs = [ + "resize_utils.py", + ], + deps = [ + "//caffe2:torch", + ":specification", + ], +) + fbcode_target(_kind = runtime.python_library, name = "tosa", srcs = [ diff --git a/backends/arm/tosa/dialect/BUCK b/backends/arm/tosa/dialect/BUCK index 4e7f5837766..5081f5d6945 100644 --- a/backends/arm/tosa/dialect/BUCK +++ b/backends/arm/tosa/dialect/BUCK @@ -22,6 +22,7 @@ fbcode_target(_kind = runtime.python_library, deps = [ ":core", "//caffe2:torch", + "//executorch/backends/arm/tosa:resize_utils", "//executorch/backends/arm/tosa:tosa", ], ) diff --git a/backends/arm/tosa/dialect/ops/resize.py b/backends/arm/tosa/dialect/ops/resize.py index c48ff508afc..8a2d4c5e60a 100644 --- a/backends/arm/tosa/dialect/ops/resize.py +++ b/backends/arm/tosa/dialect/ops/resize.py @@ -8,6 +8,10 @@ import torch from executorch.backends.arm.tosa.dialect.lib import TosaValueError from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op +from executorch.backends.arm.tosa.resize_utils import ( + calculate_tosa_resize_output_hw, + get_tosa_resize_validation_error, +) from executorch.backends.arm.tosa.specification import ( get_context_spec, @@ -50,23 +54,17 @@ def _get_output_dtype( return output_dtype -def _validate_resize_parameters(scale, border, resize_mode): - def in_int16_range(values): - return all( - (x >= -(2**15)) and (x <= 2**15 - 1) for x in values if isinstance(x, int) - ) - - if not in_int16_range(scale): - raise TosaValueError("scale is out of the int16 range", op="RESIZE") - if not in_int16_range(border): - raise TosaValueError("border is out of the int16 range", op="RESIZE") - if resize_mode == "bilinear": - scale_y_n, scale_y_d, scale_x_n, scale_x_d = scale - if scale_y_d >= 16 * scale_y_n or scale_x_d >= 16 * scale_x_n: - raise TosaValueError( - "Bilinear RESIZE downscale must be strictly greater than 1/16", - op="RESIZE", - ) +def _validate_resize_parameters(input_hw, output_hw, scale, 
offset, border, tosa_spec): + validation_error = get_tosa_resize_validation_error( + input_hw=input_hw, + output_hw=output_hw, + scale=scale, + offset=offset, + border=border, + tosa_spec=tosa_spec, + ) + if validation_error is not None: + raise TosaValueError(validation_error, op="RESIZE") @register_fake_tosa_op( @@ -88,24 +86,26 @@ def RESIZE( f"Input tensor must be 4D, but got {x.dim()}D", op="RESIZE" ) _validate_resize_mode(resize_mode) - _validate_resize_parameters(scale, border, resize_mode) output_dtype = _get_output_dtype(x.dtype, tosa_spec, resize_mode) input_shape = x.shape - scale_y_n, scale_y_d, scale_x_n, scale_x_d = scale - offset_y, offset_x = offset - border_y, border_x = border H, W = input_shape[1], input_shape[2] - # RESIZE first upscales the input by an integer value, to "upscale space". - H_upscaled = (H - 1) * scale_y_n - # offset and border are provided in this scale, therefore adjust for these while in this space. - H_shifted = H_upscaled - offset_y + border_y - # Then, complete the RESIZE by downscaling with another integer value, approximating multplication with a fraction. - OH = (H_shifted // scale_y_d) + 1 - # Mirror the same computation horizontally for the output width. - W_upscaled = (W - 1) * scale_x_n - W_shifted = W_upscaled - offset_x + border_x - OW = (W_shifted // scale_x_d) + 1 + _validate_resize_parameters((H, W), None, scale, offset, border, tosa_spec) + output_hw = calculate_tosa_resize_output_hw((H, W), scale, offset, border) + _validate_resize_parameters((H, W), output_hw, scale, offset, border, tosa_spec) + if output_hw is None: + scale_y_n, scale_y_d, scale_x_n, scale_x_d = scale + offset_y, offset_x = offset + border_y, border_x = border + # RESIZE first upscales the input by an integer value to "upscale + # space". Offset and border are encoded in that space, then RESIZE + # completes by downscaling with another integer value, approximating + # multiplication by a fraction. 
+ OH = ((H - 1) * scale_y_n - offset_y + border_y) // scale_y_d + 1 + OW = ((W - 1) * scale_x_n - offset_x + border_x) // scale_x_d + 1 + else: + OH, OW = output_hw + fake_aten_tensor = torch.empty( size=(input_shape[0], OH, OW, input_shape[3]), dtype=output_dtype ) diff --git a/backends/arm/tosa/resize_utils.py b/backends/arm/tosa/resize_utils.py new file mode 100644 index 00000000000..6c716bfa59c --- /dev/null +++ b/backends/arm/tosa/resize_utils.py @@ -0,0 +1,259 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Sequence + +import torch + +from executorch.backends.arm.tosa.specification import TosaSpecification + +_MAX_RESIZE_DIMENSION = 16384 +_MAX_RESIZE_SCALE_NUMERATOR = 1 << 11 +_MAX_SCALE = 2048 +_MAX_SCALE_LEVEL_8K = 256 +_INT16_MIN = -(2**15) +_INT16_MAX = 2**15 - 1 + + +def _as_concrete_ints(values: Sequence[int | torch.SymInt]) -> list[int] | None: + if all(isinstance(value, int) for value in values): + return [int(value) for value in values] + return None + + +def _concrete_int_values(values: Sequence[int | torch.SymInt]) -> list[int]: + return [int(value) for value in values if isinstance(value, int)] + + +def _first_outside_range( + values: Sequence[int], min_value: int, max_value: int +) -> int | None: + return next( + (value for value in values if value < min_value or value > max_value), None + ) + + +def _max_scale(tosa_spec: TosaSpecification) -> int: + return _MAX_SCALE_LEVEL_8K if getattr(tosa_spec, "level_8k", False) else _MAX_SCALE + + +def _validate_dimensions( + input_hw: Sequence[int | torch.SymInt], + output_hw: Sequence[int | torch.SymInt] | None, +) -> str | None: + concrete_dimensions: list[int] = [] + input_hw_ints = _as_concrete_ints(input_hw) + output_hw_ints = _as_concrete_ints(output_hw) if output_hw is not None else None + if input_hw_ints is not None: + 
concrete_dimensions.extend(input_hw_ints) + if output_hw_ints is not None: + concrete_dimensions.extend(output_hw_ints) + + invalid_dimension = next( + ( + dimension + for dimension in concrete_dimensions + if dimension >= _MAX_RESIZE_DIMENSION + ), + None, + ) + if invalid_dimension is not None: + return ( + "RESIZE dimensions must be less than " + f"{_MAX_RESIZE_DIMENSION}; got {invalid_dimension}" + ) + return None + + +def _validate_scale( + scale: Sequence[int | torch.SymInt], + tosa_spec: TosaSpecification, +) -> str | None: + invalid_scale = _first_outside_range( + _concrete_int_values(scale), _INT16_MIN, _INT16_MAX + ) + if invalid_scale is not None: + return ( + "RESIZE scale must be in int16 range " + f"[{_INT16_MIN}, {_INT16_MAX}]; got {invalid_scale}" + ) + + scale_ints = _as_concrete_ints(scale) + if scale_ints is None: + return None + + scale_y_n, scale_y_d, scale_x_n, scale_x_d = scale_ints + if min(scale_y_n, scale_y_d, scale_x_n, scale_x_d) <= 0: + return f"RESIZE scale values must be positive; got {scale_ints}" + + max_scale = _max_scale(tosa_spec) + if scale_y_n > max_scale * scale_y_d or scale_x_n > max_scale * scale_x_d: + return ( + f"RESIZE scale ratio must be <= MAX_SCALE ({max_scale}); " + f"got y={scale_y_n}/{scale_y_d}, x={scale_x_n}/{scale_x_d}" + ) + + if ( + scale_y_n > _MAX_RESIZE_SCALE_NUMERATOR + or scale_x_n > _MAX_RESIZE_SCALE_NUMERATOR + ): + return ( + "RESIZE scale numerator must be <= " + f"{_MAX_RESIZE_SCALE_NUMERATOR}; got y={scale_y_n}, x={scale_x_n}" + ) + + # The scale values are already in the doubled rational representation that + # TOSA RESIZE lowering emits, so the lower-bound downscale rule can be + # checked directly against them. 
+ if scale_y_d >= 16 * scale_y_n or scale_x_d >= 16 * scale_x_n: + return ( + "RESIZE downscale must be strictly greater than 1/16; " + f"got y={scale_y_n}/{scale_y_d}, x={scale_x_n}/{scale_x_d}" + ) + return None + + +def _validate_offset( + offset: Sequence[int | torch.SymInt], + scale_ints: list[int], +) -> str | None: + offset_ints = _as_concrete_ints(offset) + if offset_ints is None: + return None + + scale_y_n, _, scale_x_n, _ = scale_ints + offset_y, offset_x = offset_ints + if offset_y < -scale_y_n or offset_y >= 16 * scale_y_n: + return ( + f"RESIZE offset_y must be in [{-scale_y_n}, {16 * scale_y_n}); " + f"got {offset_y}" + ) + if offset_x < -scale_x_n or offset_x >= 16 * scale_x_n: + return ( + f"RESIZE offset_x must be in [{-scale_x_n}, {16 * scale_x_n}); " + f"got {offset_x}" + ) + return None + + +def _validate_border( + border: Sequence[int | torch.SymInt], + scale_ints: list[int], +) -> str | None: + invalid_border = _first_outside_range( + _concrete_int_values(border), _INT16_MIN, _INT16_MAX + ) + if invalid_border is not None: + return ( + "RESIZE border must be in int16 range " + f"[{_INT16_MIN}, {_INT16_MAX}]; got {invalid_border}" + ) + + border_ints = _as_concrete_ints(border) + if border_ints is None: + return None + + scale_y_n, _, scale_x_n, _ = scale_ints + border_y, border_x = border_ints + if border_y < -16 * scale_y_n or border_y >= scale_y_n: + return ( + f"RESIZE border_y must be in [{-16 * scale_y_n}, {scale_y_n}); " + f"got {border_y}" + ) + if border_x < -16 * scale_x_n or border_x >= scale_x_n: + return ( + f"RESIZE border_x must be in [{-16 * scale_x_n}, {scale_x_n}); " + f"got {border_x}" + ) + return None + + +def _validate_output_shape( + input_hw: Sequence[int | torch.SymInt], + output_hw: Sequence[int | torch.SymInt] | None, + scale: Sequence[int | torch.SymInt], + offset: Sequence[int | torch.SymInt], + border: Sequence[int | torch.SymInt], +) -> str | None: + if output_hw is None: + return None + + output_hw_ints = 
_as_concrete_ints(output_hw) + expected_output_hw = calculate_tosa_resize_output_hw( + input_hw, scale, offset, border + ) + if ( + output_hw_ints is not None + and expected_output_hw is not None + and tuple(output_hw_ints) != expected_output_hw + ): + return ( + "RESIZE output shape is inconsistent with input and parameters; " + f"expected {expected_output_hw}, got {tuple(output_hw_ints)}" + ) + return None + + +def calculate_tosa_resize_output_hw( + input_hw: Sequence[int | torch.SymInt], + scale: Sequence[int | torch.SymInt], + offset: Sequence[int | torch.SymInt], + border: Sequence[int | torch.SymInt], +) -> tuple[int, int] | None: + input_hw_ints = _as_concrete_ints(input_hw) + scale_ints = _as_concrete_ints(scale) + offset_ints = _as_concrete_ints(offset) + border_ints = _as_concrete_ints(border) + if ( + input_hw_ints is None + or scale_ints is None + or offset_ints is None + or border_ints is None + ): + return None + + input_h, input_w = input_hw_ints + scale_y_n, scale_y_d, scale_x_n, scale_x_d = scale_ints + offset_y, offset_x = offset_ints + border_y, border_x = border_ints + + # RESIZE first upscales the input by an integer value to "upscale space". + # Offset and border are encoded in that space, then RESIZE completes by + # downscaling with another integer value, approximating multiplication by a + # fraction. 
+ return ( + ((input_h - 1) * scale_y_n - offset_y + border_y) // scale_y_d + 1, + ((input_w - 1) * scale_x_n - offset_x + border_x) // scale_x_d + 1, + ) + + +def get_tosa_resize_validation_error( + *, + input_hw: Sequence[int | torch.SymInt], + output_hw: Sequence[int | torch.SymInt] | None, + scale: Sequence[int | torch.SymInt], + offset: Sequence[int | torch.SymInt], + border: Sequence[int | torch.SymInt], + tosa_spec: TosaSpecification, +) -> str | None: + scale_ints = _as_concrete_ints(scale) + + validation_error = _validate_dimensions(input_hw, output_hw) + if validation_error is not None: + return validation_error + validation_error = _validate_scale(scale, tosa_spec) + if validation_error is not None: + return validation_error + if scale_ints is None: + return None + + for validation_error in ( + _validate_offset(offset, scale_ints), + _validate_border(border, scale_ints), + _validate_output_shape(input_hw, output_hw, scale, offset, border), + ): + if validation_error is not None: + return validation_error + return None From 29c3a232ca7f1db4140b1ae653f88750ea13e704 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Tue, 26 May 2026 17:53:22 -0400 Subject: [PATCH 026/103] Fix cortex_m test failures from D106339880 Differential Revision: D106408368 Pull Request resolved: https://github.com/pytorch/executorch/pull/19783 --- backends/cortex_m/passes/BUCK | 1 + backends/cortex_m/passes/convert_to_cortex_m_pass.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/cortex_m/passes/BUCK b/backends/cortex_m/passes/BUCK index 4e49c8cd319..f1b7b9a201d 100644 --- a/backends/cortex_m/passes/BUCK +++ b/backends/cortex_m/passes/BUCK @@ -36,6 +36,7 @@ fbcode_target(_kind = runtime.python_library, "decompose_hardswish_pass.py", "decompose_mean_pass.py", "quantized_clamp_activation_pass.py", + "scratch_buffer_sizes.py", ], deps=[ "//caffe2:torch", diff --git a/backends/cortex_m/passes/convert_to_cortex_m_pass.py 
b/backends/cortex_m/passes/convert_to_cortex_m_pass.py index e61ddaf63bc..5704645caf8 100644 --- a/backends/cortex_m/passes/convert_to_cortex_m_pass.py +++ b/backends/cortex_m/passes/convert_to_cortex_m_pass.py @@ -12,7 +12,7 @@ import torch.fx from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor -from executorch.backends.cortex_m.passes import CortexMPass +from executorch.backends.cortex_m.passes.cortex_m_pass import CortexMPass from executorch.backends.cortex_m.passes.passes_utils import quantize_multiplier_aot from executorch.backends.cortex_m.passes.scratch_buffer_sizes import ( required_cmsis_nn_buffer_sizes, From ae4fdb5fda63dc7ef8f5a34e55b2d8233ba8a941 Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Tue, 26 May 2026 16:19:58 -0700 Subject: [PATCH 027/103] Set test seed per-test (#19744) ### Summary In https://github.com/pytorch/executorch/pull/19651, I added a global seed for pytest runs. This was intended to reduce random tolerance flakes, but didn't actually do so in practice. This is because the parallel test runners don't guarantee any ordering, so random state is unstable between runs. I've updated it to set the seed per-test. This should hopefully make the random state invariant of test execution order. 
--- backends/cadence/aot/tests/test_replace_ops_passes.py | 2 ++ conftest.py | 8 ++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/backends/cadence/aot/tests/test_replace_ops_passes.py b/backends/cadence/aot/tests/test_replace_ops_passes.py index 170da6deb09..a73ef02c996 100644 --- a/backends/cadence/aot/tests/test_replace_ops_passes.py +++ b/backends/cadence/aot/tests/test_replace_ops_passes.py @@ -1250,6 +1250,7 @@ def test_replace_conv1d_with_linear(self) -> None: inputs, "ReplaceTrivialConvWithLinear", rtol=2e-5, + atol=5e-6, ) # Assert that conv1d is trivially converted to linear @@ -1294,6 +1295,7 @@ def test_replace_conv2d_with_linear(self) -> None: inputs, "ReplaceTrivialConvWithLinear", rtol=2e-5, + atol=5e-6, ) # Assert that conv2d is trivially converted to linear diff --git a/conftest.py b/conftest.py index 19d777a74e0..be0e6e4ea3d 100644 --- a/conftest.py +++ b/conftest.py @@ -1,3 +1,4 @@ +import hashlib import sys import torch @@ -13,5 +14,8 @@ "backends/apple/**", ] -# Seed the run -torch.manual_seed(42) + +def pytest_runtest_setup(item): + # Set a stable seed for each test based on a hash of the test name. 
+ seed = int(hashlib.sha256(item.nodeid.encode()).hexdigest(), 16) % (2**32) + torch.manual_seed(seed) From b4d62edb4b1f941e84d9a3d675e2a082bd09c2a6 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Tue, 26 May 2026 16:24:48 -0700 Subject: [PATCH 028/103] Collapse Experimental.kt annotation onto a single line to satisfy linter Differential Revision: D106430647 Pull Request resolved: https://github.com/pytorch/executorch/pull/19790 --- .../java/org/pytorch/executorch/annotations/Experimental.kt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt index 1a38bb13b99..42a5980d6ba 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt @@ -15,5 +15,4 @@ package org.pytorch.executorch.annotations * This status is not permanent, and APIs marked with this annotation will need to be either made * more robust or removed in the future. 
*/ -@Retention(AnnotationRetention.BINARY) -annotation class Experimental +@Retention(AnnotationRetention.BINARY) annotation class Experimental From 034b044382d95894eab62f1a258fc2fec6f3a34a Mon Sep 17 00:00:00 2001 From: Ethan Ng Date: Tue, 26 May 2026 17:15:16 -0700 Subject: [PATCH 029/103] Handle out_dtype in ReplacePT2DequantWithCadenceDequantPass (#19743) Differential Revision: D105630451 Pull Request resolved: https://github.com/pytorch/executorch/pull/19743 --- backends/cadence/aot/replace_ops.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index 4b60feb2121..50112a4eb66 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -162,14 +162,31 @@ def targets(self) -> list[EdgeOpOverload]: def maybe_remove_or_replace(self, node: torch.fx.Node) -> bool: ns = exir_ops.edge if isinstance(node.target, EdgeOpOverload) else torch.ops + out_dtype = node.kwargs.get("out_dtype") + kwargs = {k: v for k, v in node.kwargs.items() if k != "out_dtype"} with node.graph.inserting_before(node): new_node = node.graph.call_function( ns.cadence.dequantize_per_tensor.default, args=node.args, - kwargs=node.kwargs, + kwargs=kwargs, ) - new_node.meta = node.meta - node.replace_all_uses_with(new_node) + new_node.meta = node.meta.copy() + if ( + out_dtype is not None + and out_dtype != torch.float32 + and "val" in new_node.meta + ): + new_node.meta["val"] = new_node.meta["val"].to(torch.float32) + if out_dtype is not None and out_dtype != torch.float32: + with node.graph.inserting_after(new_node): + cast_node = node.graph.call_function( + ns.aten.to.dtype, + args=(new_node, out_dtype), + ) + cast_node.meta = node.meta.copy() + node.replace_all_uses_with(cast_node) + else: + node.replace_all_uses_with(new_node) return True From 79fe3a30148d4cebbff9a2f89254469787e74256 Mon Sep 17 00:00:00 2001 From: Daisuke Majima Date: Wed, 27 May 
2026 09:25:09 +0900 Subject: [PATCH 030/103] Add coreml_compute_plan.py: report which CoreML ops dispatch to ANE / GPU / CPU (#19252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Summary CoreML decides at compile/load time which device each MIL operation will execute on, and coremltools 9.0+ exposes that through `MLComputePlan`. The recurring question on the issue tracker is *"why isn't my model running fully on the ANE?"* — for example: - #4091 — `llama model is not fully lowered to ANE` - #11541 — `CoreML model is crashing on iPhone GPU, but not on iPhone CPU or macOS GPU` - #8439 — `ANE compile OOMs on certain input shapes` - #8445 — `CPU Overhead After ANE Execution` Today the only way for an ExecuTorch user to answer it is to break out Swift / Xcode. This PR adds a Python wrapper around `MLComputePlan` so the answer is one shell command: ``` $ python coreml_compute_plan.py --model_path my_model.mlpackage \ --compute_units cpu_and_ne --show_non_ane === my_model.mlpackage === ANE: 412 / 480 ( 85.8%) CPU: 68 / 480 ( 14.2%) Non-ANE op types: 32 ios17.cast 18 ios17.gather 12 ios17.reshape 6 ios17.constexpr_blockwise_shift_scale ``` Inputs supported: | Input | Behavior | |---|---| | `.pte` | Extract every Core ML partition into a tempdir, then analyze each. | | `.mlpackage` | Compile to `.mlmodelc` in a tempdir, then analyze. | | `.mlmodelc` | Analyze directly. | The PTE path reuses the same JSON/named-data extraction logic that `extract_coreml_models.py` uses, and is inlined into the script so it can be run against a plain CoreML model without depending on the executorch package. ### Test plan Added `test_coreml_compute_plan.py` covering: - `_device_name(...)` for `None` and a stub `MLNeuralEngineComputeDevice`. - `_COMPUTE_UNIT_CHOICES` mapping (`cpu_and_ne` / `all`). 
- `analyze_one(...)` end-to-end on a tiny `relu(x @ x.T) + x.sum()` mlpackage built with `coremltools.convert(...)`: returns rows for every dispatched op, with a `main` function and the expected MIL op types (`matmul`, `relu`, `add`, `reduce_sum`). ``` $ python -m pytest examples/apple/coreml/scripts/test_coreml_compute_plan.py -v ============================== 7 passed in 3.68s =============================== ``` I also ran the script against a few hand-built `.mlpackage` and `.mlmodelc` files on macOS 26 with coremltools 9.0 and verified the output matches what `MLComputePlan` returns directly. Authored with Claude. cc @kimishpatel @YifanShenSZ @cymbalrush @metascroy --- examples/apple/coreml/scripts/BUCK | 13 + .../coreml/scripts/coreml_compute_plan.py | 236 ++++++++++++++++++ .../coreml/scripts/extract_coreml_models.py | 15 +- .../scripts/test_coreml_compute_plan.py | 161 ++++++++++++ 4 files changed, 422 insertions(+), 3 deletions(-) create mode 100644 examples/apple/coreml/scripts/coreml_compute_plan.py create mode 100644 examples/apple/coreml/scripts/test_coreml_compute_plan.py diff --git a/examples/apple/coreml/scripts/BUCK b/examples/apple/coreml/scripts/BUCK index 164feb8d306..42a97ea893f 100644 --- a/examples/apple/coreml/scripts/BUCK +++ b/examples/apple/coreml/scripts/BUCK @@ -16,6 +16,19 @@ fbcode_target(_kind = python_binary, ], ) +fbcode_target(_kind = python_binary, + name = "coreml_compute_plan", + srcs = [ + "coreml_compute_plan.py", + ], + main_function = "executorch.examples.apple.coreml.scripts.coreml_compute_plan.main", + deps = [ + "//executorch/backends/apple/coreml:executorchcoreml", + "//executorch/exir:schema", + "//executorch/exir/_serialize:lib", + ], +) + fbcode_target(_kind = python_binary, name = "export", srcs = [ diff --git a/examples/apple/coreml/scripts/coreml_compute_plan.py b/examples/apple/coreml/scripts/coreml_compute_plan.py new file mode 100644 index 00000000000..c0ca08db831 --- /dev/null +++ 
b/examples/apple/coreml/scripts/coreml_compute_plan.py @@ -0,0 +1,236 @@ +# Copyright © 2026 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. + +"""Report which CoreML operations would dispatch to ANE / GPU / CPU. + +The CoreML runtime decides at compile/load time which compute device each +MIL operation will run on; that decision is exposed by ``MLComputePlan`` +in coremltools 9.0+. This script wraps that API so users can answer +"why isn't my model running on the ANE?" without writing Swift. + +Usage:: + + # Analyze a CoreML model directly (mlpackage or compiled mlmodelc). + python coreml_compute_plan.py --model_path path/to/model.mlpackage + + # Analyze every Core ML partition embedded in an ExecuTorch .pte. + python coreml_compute_plan.py --model_path path/to/program.pte + + # Show ops that fell off the ANE, grouped by op type. + python coreml_compute_plan.py --model_path model.mlpackage --show_non_ane + + # Pick which devices the runtime is allowed to consider. 
+ python coreml_compute_plan.py --model_path model.mlpackage \\ + --compute_units cpu_and_ne +""" + +import argparse +import os +import sys +import tempfile +from collections import Counter +from typing import Iterable, List, Tuple + +import coremltools as ct +from coremltools.models.compute_device import ( + MLCPUComputeDevice, + MLGPUComputeDevice, + MLNeuralEngineComputeDevice, +) +from coremltools.models.compute_plan import MLComputePlan + +from executorch.examples.apple.coreml.scripts.extract_coreml_models import ( + extract_coreml_models, +) + + +_DEVICE_NAMES: List[Tuple[type, str]] = [ + (MLNeuralEngineComputeDevice, "ANE"), + (MLGPUComputeDevice, "GPU"), + (MLCPUComputeDevice, "CPU"), +] + +_COMPUTE_UNIT_CHOICES = { + "all": ct.ComputeUnit.ALL, + "cpu_and_ne": ct.ComputeUnit.CPU_AND_NE, + "cpu_and_gpu": ct.ComputeUnit.CPU_AND_GPU, + "cpu_only": ct.ComputeUnit.CPU_ONLY, +} + + +def _device_name(device) -> str: + if device is None: + return "unknown" + for cls, name in _DEVICE_NAMES: + if isinstance(device, cls): + return name + return type(device).__name__ + + +def _iter_operations(block) -> Iterable: + for op in block.operations: + yield op + for nested in getattr(op, "blocks", None) or []: + yield from _iter_operations(nested) + + +def _ensure_compiled(model_path: str, tmpdir: str) -> str: + """Return a `.mlmodelc` path; compile from `.mlpackage` if needed.""" + if model_path.endswith(".mlmodelc"): + return model_path + if model_path.endswith(".mlpackage"): + dest = os.path.join( + tmpdir, os.path.basename(model_path).replace(".mlpackage", ".mlmodelc") + ) + return str(ct.models.utils.compile_model(model_path, destination_path=dest)) + raise ValueError(f"Expected a .mlpackage or .mlmodelc path, got: {model_path}") + + +def analyze_one( + model_path: str, compute_units: ct.ComputeUnit +) -> List[Tuple[str, str, str]]: + """Return [(function, operator_name, device)] for every op that has a plan. 
+ + coremltools 9.0's ``MLComputePlan.load_from_path`` only exposes usage for + the default function of a multifunction package, so a multifunction + .mlpackage is analyzed function-by-function by projecting each function + as the ``main`` of a temp single-function copy. + """ + function_names = _mlpackage_function_names(model_path) + if len(function_names) <= 1: + return _analyze_compiled(model_path, compute_units) + rows: List[Tuple[str, str, str]] = [] + with tempfile.TemporaryDirectory() as tmpdir: + for fname in function_names: + projected = _project_to_single(model_path, fname, tmpdir) + for _, op_name, device in _analyze_compiled(projected, compute_units): + rows.append((fname, op_name, device)) + return rows + + +def _analyze_compiled( + model_path: str, compute_units: ct.ComputeUnit +) -> List[Tuple[str, str, str]]: + with tempfile.TemporaryDirectory() as tmpdir: + compiled = _ensure_compiled(model_path, tmpdir) + plan = MLComputePlan.load_from_path(compiled, compute_units=compute_units) + program = plan.model_structure.program + if program is None: + raise RuntimeError( + f"{model_path} is not an MLProgram model; this tool only supports " + "the MLProgram backend (the CoreML backend executorch produces today)." + ) + + rows: List[Tuple[str, str, str]] = [] + for fname, fn in program.functions.items(): + for op in _iter_operations(fn.block): + usage = plan.get_compute_device_usage_for_mlprogram_operation(op) + if usage is None: + # Constants and similar non-dispatched ops don't have a plan. 
+ continue + rows.append( + ( + fname, + op.operator_name, + _device_name(usage.preferred_compute_device), + ) + ) + return rows + + +def _mlpackage_function_names(model_path: str) -> List[str]: + """Names of the MLProgram functions inside an .mlpackage, or [] otherwise.""" + if not model_path.endswith(".mlpackage"): + return [] + spec = ct.models.MLModel(model_path, skip_model_load=True).get_spec() + if spec.WhichOneof("Type") != "mlProgram": + return [] + return list(spec.mlProgram.functions.keys()) + + +def _project_to_single(src_mlpackage: str, function_name: str, tmpdir: str) -> str: + """Re-save ``src_mlpackage`` with only ``function_name`` exposed as ``main``.""" + from coremltools.models.utils import MultiFunctionDescriptor, save_multifunction + + dest = os.path.join(tmpdir, f"{function_name}.mlpackage") + desc = MultiFunctionDescriptor() + desc.add_function( + src_mlpackage, + src_function_name=function_name, + target_function_name="main", + ) + desc.default_function_name = "main" + save_multifunction(desc, dest) + return dest + + +def _print_report( + label: str, rows: List[Tuple[str, str, str]], show_non_ane: bool +) -> None: + print(f"\n=== {label} ===") + if not rows: + print(" (no dispatched operations found)") + return + by_device = Counter(device for _, _, device in rows) + total = sum(by_device.values()) + for device in ("ANE", "GPU", "CPU", "unknown"): + count = by_device.get(device, 0) + if count == 0: + continue + pct = 100.0 * count / total + print(f" {device}: {count:5d} / {total} ({pct:5.1f}%)") + + if show_non_ane: + non_ane = [(fn, op_name) for fn, op_name, dev in rows if dev != "ANE"] + if non_ane: + print("\n Non-ANE op types:") + for op_name, count in Counter(op for _, op in non_ane).most_common(): + print(f" {count:5d} {op_name}") + + +def main() -> int: + parser = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + parser.add_argument( + "--model_path", + required=True, + help="Path to a .pte, .mlpackage, or .mlmodelc.", + ) 
+ parser.add_argument( + "--compute_units", + default="cpu_and_ne", + choices=sorted(_COMPUTE_UNIT_CHOICES), + help="Which devices the runtime may use when planning dispatch.", + ) + parser.add_argument( + "--show_non_ane", + action="store_true", + help="List op types that did not get assigned to the ANE.", + ) + args = parser.parse_args() + + compute_units = _COMPUTE_UNIT_CHOICES[args.compute_units] + model_path = args.model_path + + if model_path.endswith(".pte"): + with open(model_path, "rb") as f: + pte_data = f.read() + with tempfile.TemporaryDirectory() as out_dir: + extracted = extract_coreml_models(pte_data, out_dir=out_dir) + if not extracted: + print( + f"{model_path} does not contain any CoreML delegate partitions.", + file=sys.stderr, + ) + return 1 + for path in extracted: + rows = analyze_one(str(path), compute_units) + _print_report(path.name, rows, args.show_non_ane) + else: + rows = analyze_one(model_path, compute_units) + _print_report(os.path.basename(model_path.rstrip("/")), rows, args.show_non_ane) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/examples/apple/coreml/scripts/extract_coreml_models.py b/examples/apple/coreml/scripts/extract_coreml_models.py index 685b6b594f3..8956550eb4d 100644 --- a/examples/apple/coreml/scripts/extract_coreml_models.py +++ b/examples/apple/coreml/scripts/extract_coreml_models.py @@ -9,7 +9,7 @@ import shutil from pathlib import Path -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Union from executorch.backends.apple.coreml import executorchcoreml from executorch.exir._serialize._program import deserialize_pte_binary @@ -22,7 +22,12 @@ COREML_BACKEND_ID = "CoreMLBackend" -def extract_coreml_models(pte_data: bytes): +def extract_coreml_models( + pte_data: bytes, + out_dir: Optional[Union[str, Path]] = None, +) -> List[Path]: + out_root = Path(out_dir) if out_dir is not None else Path("extracted_coreml_models") + pte_file = 
deserialize_pte_binary(pte_data) program = pte_file.program @@ -44,6 +49,7 @@ def extract_coreml_models(pte_data: bytes): ] # Track extracted models to avoid duplicates (multifunction models share partitions) + extracted_paths: List[Path] = [] extracted_keys: set = set() model_index: int = 1 @@ -95,7 +101,7 @@ def extract_coreml_models(pte_data: bytes): if model_name is None: model_name = f"model_{model_index}" - model_path: Path = Path() / "extracted_coreml_models" / model_name + model_path: Path = out_root / model_name if model_path.exists(): shutil.rmtree(model_path.absolute()) os.makedirs(model_path.absolute()) @@ -104,11 +110,14 @@ def extract_coreml_models(pte_data: bytes): coreml_processed_bytes, str(model_path.absolute()) ): print(f"Core ML models are extracted and saved to path = {model_path}") + extracted_paths.append(model_path) model_index += 1 if len(coreml_delegates) == 0: print("The model isn't delegated to Core ML.") + return extracted_paths + def main() -> None: """ diff --git a/examples/apple/coreml/scripts/test_coreml_compute_plan.py b/examples/apple/coreml/scripts/test_coreml_compute_plan.py new file mode 100644 index 00000000000..83f06b7a2a8 --- /dev/null +++ b/examples/apple/coreml/scripts/test_coreml_compute_plan.py @@ -0,0 +1,161 @@ +# Copyright © 2026 Apple Inc. All rights reserved. +# +# Please refer to the license found in the LICENSE file in the root directory of the source tree. 
+ +"""Tests for coreml_compute_plan.py.""" + +import os +import shutil +import tempfile +import unittest +from collections import Counter + +import coremltools as ct +import torch +from coremltools.models.utils import MultiFunctionDescriptor, save_multifunction + +from executorch.examples.apple.coreml.scripts.coreml_compute_plan import ( + _COMPUTE_UNIT_CHOICES, + _device_name, + analyze_one, +) + + +class _Op: + def __init__(self, operator_name: str, blocks=None): + self.operator_name = operator_name + self.blocks = blocks or [] + + +class _Block: + __slots__ = ("operations",) + + def __init__(self, ops): + self.operations = ops + + +def _build_small_mlpackage(out_dir: str) -> str: + class M(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.relu(x @ x.T) + x.sum() + + model = M().eval() + ep = torch.export.export(model, (torch.randn(8, 8),), strict=True) + ep = ep.run_decompositions({}) + mlmodel = ct.convert( + ep, + source="pytorch", + convert_to="mlprogram", + minimum_deployment_target=ct.target.iOS17, + skip_model_load=True, + ) + out = os.path.join(out_dir, "tiny.mlpackage") + mlmodel.save(out) + return out + + +class TestDeviceName(unittest.TestCase): + def test_none_device(self): + self.assertEqual(_device_name(None), "unknown") + + def test_known_device_classes(self): + from coremltools.models.compute_device import MLNeuralEngineComputeDevice + + # Don't construct the device classes directly (they wrap proxies that + # may be unavailable in some envs); just confirm the type-mapping path + # returns sensible names by mocking the isinstance check with a fake. 
+ class FakeNE(MLNeuralEngineComputeDevice): + def __init__(self): + pass + + self.assertEqual(_device_name(FakeNE()), "ANE") + + +class TestComputeUnitChoices(unittest.TestCase): + def test_includes_cpu_and_ne(self): + self.assertEqual(_COMPUTE_UNIT_CHOICES["cpu_and_ne"], ct.ComputeUnit.CPU_AND_NE) + + def test_includes_all(self): + self.assertEqual(_COMPUTE_UNIT_CHOICES["all"], ct.ComputeUnit.ALL) + + +class TestAnalyzeOne(unittest.TestCase): + """End-to-end: build a tiny mlpackage and analyze it.""" + + @classmethod + def setUpClass(cls): + cls.tmpdir = tempfile.mkdtemp() + cls.mlpackage = _build_small_mlpackage(cls.tmpdir) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir, ignore_errors=True) + + def test_returns_rows_for_dispatched_ops(self): + rows = analyze_one(self.mlpackage, ct.ComputeUnit.CPU_AND_NE) + self.assertGreater(len(rows), 0, "expected at least one dispatched op") + # Every row is (function_name, operator_name, device_name). + for fname, op_name, device in rows: + self.assertIsInstance(fname, str) + self.assertIsInstance(op_name, str) + self.assertIn(device, {"ANE", "GPU", "CPU", "unknown"}) + + def test_main_function_present(self): + rows = analyze_one(self.mlpackage, ct.ComputeUnit.CPU_ONLY) + self.assertIn("main", {fname for fname, _, _ in rows}) + + def test_op_types_for_relu_matmul_model(self): + # The toy model is `relu(x @ x.T) + x.sum()` so the lowered MIL + # should at least contain matmul, relu, add and reduce_sum. + rows = analyze_one(self.mlpackage, ct.ComputeUnit.CPU_ONLY) + op_types = Counter(op for _, op, _ in rows) + # Op names are versioned (e.g. "ios17.matmul"), so match by suffix. + suffixes = {name.split(".")[-1] for name in op_types} + for expected in ("matmul", "relu", "add", "reduce_sum"): + self.assertIn(expected, suffixes, f"missing op {expected}: {suffixes}") + + +class TestAnalyzeOneMultifunction(unittest.TestCase): + """Verify analyze_one walks every function of a multifunction .mlpackage. 
+ + coremltools 9.0's MLComputePlan.load_from_path only exposes usage for + the default function, so analyze_one re-projects each function through + MultiFunctionDescriptor to surface plans for the rest. + """ + + @classmethod + def setUpClass(cls): + cls.tmpdir = tempfile.mkdtemp() + single = _build_small_mlpackage(cls.tmpdir) + desc = MultiFunctionDescriptor() + desc.add_function( + single, src_function_name="main", target_function_name="prefill" + ) + desc.add_function( + single, src_function_name="main", target_function_name="decode" + ) + desc.default_function_name = "prefill" + cls.multi = os.path.join(cls.tmpdir, "multi.mlpackage") + save_multifunction(desc, cls.multi) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdir, ignore_errors=True) + + def test_reports_every_function(self): + rows = analyze_one(self.multi, ct.ComputeUnit.CPU_ONLY) + fnames = {fname for fname, _, _ in rows} + self.assertEqual(fnames, {"prefill", "decode"}) + + def test_each_function_lowers_the_same_ops(self): + rows = analyze_one(self.multi, ct.ComputeUnit.CPU_ONLY) + per_fn: dict = {} + for fname, op_name, _ in rows: + per_fn.setdefault(fname, set()).add(op_name.split(".")[-1]) + for fname in ("prefill", "decode"): + self.assertIn("matmul", per_fn.get(fname, set()), f"{fname} missing matmul") + self.assertIn("relu", per_fn.get(fname, set()), f"{fname} missing relu") + + +if __name__ == "__main__": + unittest.main() From fb420f302ee73d2e1abebb18e423c6dff20309ab Mon Sep 17 00:00:00 2001 From: Gregory Comer Date: Tue, 26 May 2026 18:50:49 -0700 Subject: [PATCH 031/103] Fix bug with mixed weight cache + workspace sharing Differential Revision: D106412035 Pull Request resolved: https://github.com/pytorch/executorch/pull/19777 --- backends/xnnpack/runtime/XNNExecutor.cpp | 2 +- backends/xnnpack/runtime/XNNExecutor.h | 2 +- backends/xnnpack/runtime/XNNPACKBackend.cpp | 36 ++------------------- backends/xnnpack/runtime/XNNWorkspace.h | 9 ++++++ 4 files changed, 13 
insertions(+), 36 deletions(-) diff --git a/backends/xnnpack/runtime/XNNExecutor.cpp b/backends/xnnpack/runtime/XNNExecutor.cpp index 1cba33a91e6..5a150f92b6b 100644 --- a/backends/xnnpack/runtime/XNNExecutor.cpp +++ b/backends/xnnpack/runtime/XNNExecutor.cpp @@ -93,7 +93,7 @@ ET_NODISCARD Error XNNExecutor::initialize( * delegate->execute() */ ET_NODISCARD Error XNNExecutor::prepare_args(Span args) { - ET_CHECK_MSG( + ET_DCHECK_MSG( !destroyed_.load(std::memory_order_acquire), "XNNExecutor::prepare_args called after destroy"); diff --git a/backends/xnnpack/runtime/XNNExecutor.h b/backends/xnnpack/runtime/XNNExecutor.h index 0af8b6056b0..2d709678c1c 100644 --- a/backends/xnnpack/runtime/XNNExecutor.h +++ b/backends/xnnpack/runtime/XNNExecutor.h @@ -45,7 +45,7 @@ class XNNExecutor { : workspace_(workspace) {} ~XNNExecutor() { - ET_CHECK_MSG( + ET_DCHECK_MSG( !in_use_.load(std::memory_order_acquire), "XNNExecutor destroyed while in use"); destroyed_.store(true, std::memory_order_release); diff --git a/backends/xnnpack/runtime/XNNPACKBackend.cpp b/backends/xnnpack/runtime/XNNPACKBackend.cpp index a02cf98771b..9eaadda86f8 100644 --- a/backends/xnnpack/runtime/XNNPACKBackend.cpp +++ b/backends/xnnpack/runtime/XNNPACKBackend.cpp @@ -16,7 +16,6 @@ #include #include -#include #include #include @@ -101,6 +100,7 @@ class XnnpackBackend final lock_weights_cache.lock(); weights_cache_->initialize_for_runtime( context.get_runtime_allocator(), named_data_map); + workspace->set_uses_weight_cache(); } auto [workspace_lock, workspace_ptr] = workspace->acquire(); @@ -131,16 +131,6 @@ class XnnpackBackend final return err; } - ET_LOG( - Info, - "XnnpackBackend::init delegate=%p workspace_id=%" PRIu64 - " workspace_ptr=%p program_id=0x%" PRIxPTR " weight_cache=%s", - (void*)executor, - workspace->id(), - (void*)workspace_ptr, - program_id, - use_weight_cache ? 
"true" : "false"); - return executor; } @@ -151,18 +141,10 @@ class XnnpackBackend final auto executor = static_cast(handle); auto workspace = executor->get_workspace(); - ET_LOG( - Info, - "XnnpackBackend::execute begin delegate=%p workspace_id=%" PRIu64 - " num_args=%zu weight_cache=%s", - (void*)executor, - workspace->id(), - (size_t)args.size(), - executor->uses_weight_cache() ? "true" : "false"); std::unique_lock lock_weights_cache( weights_cache_mutex_, std::defer_lock); - if (executor->uses_weight_cache()) { + if (executor->uses_weight_cache() || workspace->uses_weight_cache()) { lock_weights_cache.lock(); } @@ -183,14 +165,6 @@ class XnnpackBackend final // Convert output data types if necessary (e.g., int32 -> int64 for Long) err = executor->convert_outputs(args); - ET_LOG( - Info, - "XnnpackBackend::execute end delegate=%p workspace_id=%" PRIu64 - " err=0x%x", - (void*)executor, - workspace->id(), - (unsigned int)err); - return err; } @@ -199,12 +173,6 @@ class XnnpackBackend final auto executor = static_cast(handle); auto workspace = executor->get_workspace(); - ET_LOG( - Info, - "XnnpackBackend::destroy delegate=%p workspace_id=%" PRIu64, - (void*)executor, - workspace->id()); - const std::lock_guard lock_weights_cache( weights_cache_mutex_); diff --git a/backends/xnnpack/runtime/XNNWorkspace.h b/backends/xnnpack/runtime/XNNWorkspace.h index b7ef442c460..e1b452a0a8b 100644 --- a/backends/xnnpack/runtime/XNNWorkspace.h +++ b/backends/xnnpack/runtime/XNNWorkspace.h @@ -59,6 +59,14 @@ class XNNWorkspace { lock_required_ = false; } + void set_uses_weight_cache() { + uses_weight_cache_.store(true, std::memory_order_release); + } + + bool uses_weight_cache() const { + return uses_weight_cache_.load(std::memory_order_acquire); + } + static runtime::Result> create() { // Because this class can't be moved, we need to construct it in-place. 
xnn_workspace_t workspace = nullptr; @@ -80,6 +88,7 @@ class XNNWorkspace { std::mutex mutex_; uint64_t id_; bool lock_required_ = true; + std::atomic uses_weight_cache_{false}; WorkspacePtr workspace_; }; From 77df9b79ae212c6a538ff16f3538954a5bac10ca Mon Sep 17 00:00:00 2001 From: Andrew Grebenisan <33402477+DrJessop@users.noreply.github.com> Date: Tue, 26 May 2026 20:08:12 -0700 Subject: [PATCH 032/103] New exported program pass manager and exported program passes (#16986) Differential Revision: D91725222 Pull Request resolved: https://github.com/pytorch/executorch/pull/16986 --- backends/arm/test/tester/test_pipeline.py | 2 +- .../_passes/recompose_pad_maxpool2d.py | 7 +- backends/qualcomm/_passes/utils.py | 33 ++- exir/BUCK | 12 + exir/_program_utils.py | 104 ++++++++ exir/pass_base.py | 58 ++++- exir/pass_manager.py | 201 +++++++++++++-- exir/program/BUCK | 1 + exir/program/_program.py | 163 ++++-------- exir/tests/test_pass_infra.py | 243 +++++++++++++++++- 10 files changed, 671 insertions(+), 153 deletions(-) create mode 100644 exir/_program_utils.py diff --git a/backends/arm/test/tester/test_pipeline.py b/backends/arm/test/tester/test_pipeline.py index 7e7f576e35c..86a5f857e58 100644 --- a/backends/arm/test/tester/test_pipeline.py +++ b/backends/arm/test/tester/test_pipeline.py @@ -48,7 +48,7 @@ from executorch.backends.arm.vgf.compile_spec import VgfCompileSpec from executorch.backends.test.harness.stages import StageType from executorch.exir.pass_base import ExportPass -from torch._export.pass_base import PassType +from executorch.exir.pass_manager import PassType from torch.export.graph_signature import InputKind, OutputKind from torchao.quantization.pt2e.quantizer import QuantizationSpec diff --git a/backends/qualcomm/_passes/recompose_pad_maxpool2d.py b/backends/qualcomm/_passes/recompose_pad_maxpool2d.py index 81b4836f251..6a8374cb66a 100644 --- a/backends/qualcomm/_passes/recompose_pad_maxpool2d.py +++ 
b/backends/qualcomm/_passes/recompose_pad_maxpool2d.py @@ -13,12 +13,8 @@ from executorch.exir.pass_base import ExportPass, PassResult from executorch.exir.passes import dead_code_elimination_pass -from torch._subclasses.fake_tensor import FakeTensorMode - - -def add_fake_tensor_to_node(padding_node, input_shape, padding_args, dtype): - fake_mode = FakeTensorMode() +def add_fake_tensor_to_node(padding_node, input_shape, padding_args, dtype, fake_mode): with fake_mode: batch, channels, height, width = input_shape pad_left, pad_right, pad_top, pad_bottom = padding_args @@ -114,6 +110,7 @@ def call(self, graph_module: torch.fx.GraphModule): # noqa C901 input_node.meta["val"].shape, padding, input_node.meta["val"].dtype, + input_node.meta["val"].fake_mode, ) if quant_attrs: padding_node.meta["quant_attrs"] = node.meta["quant_attrs"] diff --git a/backends/qualcomm/_passes/utils.py b/backends/qualcomm/_passes/utils.py index 542fa1115a6..91a7cfdc69a 100755 --- a/backends/qualcomm/_passes/utils.py +++ b/backends/qualcomm/_passes/utils.py @@ -137,7 +137,23 @@ def copy_nn_module_stack(src, target): target.meta["nn_module_stack"] = value -def merge_decomposed_graph( +def _unify_fake_mode(node: torch.fx.Node, fake_mode) -> None: + val = node.meta.get("val") + if val is None: + return + if isinstance(val, FakeTensor) and val.fake_mode is not fake_mode: + node.meta["val"] = fake_mode.from_tensor(val) + elif isinstance(val, (list, tuple)): + unified = [] + for v in val: + if isinstance(v, FakeTensor) and v.fake_mode is not fake_mode: + unified.append(fake_mode.from_tensor(v)) + else: + unified.append(v) + node.meta["val"] = type(val)(unified) + + +def merge_decomposed_graph( # noqa: C901 remap: Dict[str, torch.fx.Node], target_node: torch.fx.Node, target_graph: torch.fx.GraphModule, @@ -148,6 +164,16 @@ def merge_decomposed_graph( [torch.fx.Node, torch.fx.Node, Dict[str, torch.fx.Node]], None ] = None, ) -> None: + target_fake_mode = None + target_val = 
target_node.meta.get("val") + if isinstance(target_val, FakeTensor): + target_fake_mode = target_val.fake_mode + elif isinstance(target_val, (list, tuple)): + for v in target_val: + if isinstance(v, FakeTensor): + target_fake_mode = v.fake_mode + break + def default_output_process(node): for user in node.users.copy(): # remap @@ -170,10 +196,13 @@ def default_output_process(node): # replace node map from string to graph node remap[decomposed_node] = remap.pop(decomposed_node.name) else: - remap[decomposed_node] = target_graph.node_copy( + copied = target_graph.node_copy( decomposed_node, arg_transform=lambda x, remap=remap: remap[x], ) + if target_fake_mode is not None: + _unify_fake_mode(copied, target_fake_mode) + remap[decomposed_node] = copied def is_float_tensor(node: torch.fx.Node) -> bool: diff --git a/exir/BUCK b/exir/BUCK index f00b3f1c787..d70900c02ae 100644 --- a/exir/BUCK +++ b/exir/BUCK @@ -259,6 +259,16 @@ fbcode_target(_kind = runtime.python_library, ], ) +fbcode_target(_kind = runtime.python_library, + name = "_program_utils", + srcs = [ + "_program_utils.py", + ], + deps = [ + "//caffe2:torch", + ], +) + fbcode_target(_kind = runtime.python_library, name = "pass_manager", srcs = [ @@ -266,7 +276,9 @@ fbcode_target(_kind = runtime.python_library, ], deps = [ "fbsource//third-party/pypi/typing-extensions:typing-extensions", + ":_program_utils", ":error", + ":pass_base", "//caffe2:torch", ], ) diff --git a/exir/_program_utils.py b/exir/_program_utils.py new file mode 100644 index 00000000000..d0d2039d93a --- /dev/null +++ b/exir/_program_utils.py @@ -0,0 +1,104 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +# pyre-unsafe + +import torch +from torch.export.exported_program import ( + ConstantArgument, + ExportGraphSignature, + InputSpec, + OutputSpec, +) + + +def _get_updated_range_constraints(gm): + def get_shape_env(gm): + vals = [ + node.meta["val"] + for node in gm.graph.nodes + if node.meta.get("val", None) is not None + ] + from torch._guards import detect_fake_mode # type: ignore[21] + + fake_mode = detect_fake_mode(vals) + if fake_mode is not None: + return fake_mode.shape_env + for v in vals: + if isinstance(v, torch.SymInt): + return v.node.shape_env + + shape_env = get_shape_env(gm) + if shape_env is None: + return {} + range_constraints = { + shape_env.replacements.get(k, k): v for k, v in shape_env.var_to_range.items() + } + # Only when we have an unbacked symint, and it's used as constructor inputs, + # runtime_var_to_range will make a difference compared to var_to_range. + # e.g. [2, oo) -> [0, oo) + for k, v in shape_env.var_to_range.items(): + if k not in shape_env.replacements: + range_constraints[k] = v + return range_constraints + + +def _get_updated_graph_signature( + old_signature: ExportGraphSignature, + new_gm: torch.fx.GraphModule, +) -> ExportGraphSignature: + """ + Update the graph signature's user_input/user_outputs. + """ + new_input_specs = [] + i = 0 + for node in new_gm.graph.nodes: + if node.op != "placeholder": + continue + + assert i < len( + old_signature.input_specs + ), "Number of inputs changed after transformation" + old_input_spec = old_signature.input_specs[i] + arg = ( + old_input_spec.arg + if isinstance(old_input_spec.arg, ConstantArgument) + # pyre-fixme[20]: Argument `class_fqn` expected.
+ else type(old_input_spec.arg)(node.name) + ) + new_input_specs.append( + InputSpec( + old_input_spec.kind, + arg, + old_input_spec.target, + persistent=old_input_spec.persistent, + ) + ) + i += 1 + + output_node = new_gm.graph.output_node() + assert output_node.op == "output" + + new_output_specs = [] + for i, node in enumerate(output_node.args[0]): + assert i < len( + old_signature.output_specs + ), "Number of outputs changed after transformation" + old_output_spec = old_signature.output_specs[i] + arg = ( + old_output_spec.arg + if isinstance(old_output_spec.arg, ConstantArgument) + # pyre-fixme[20]: Argument `class_fqn` expected. + else type(old_output_spec.arg)(node.name) + ) + new_output_specs.append( + OutputSpec(old_output_spec.kind, arg, old_output_spec.target) + ) + + new_signature = ExportGraphSignature( + input_specs=new_input_specs, output_specs=new_output_specs + ) + return new_signature diff --git a/exir/pass_base.py b/exir/pass_base.py index 8ab0c675240..f93dd75d156 100644 --- a/exir/pass_base.py +++ b/exir/pass_base.py @@ -6,10 +6,11 @@ # LICENSE file in the root directory of this source tree. 
# pyre-strict - import operator import traceback +from abc import ABC, abstractmethod from contextlib import nullcontext +from dataclasses import dataclass from typing import ( Any, Callable, @@ -27,9 +28,7 @@ import torch from executorch.exir import memory - from executorch.exir.delegate import executorch_call_delegate, is_lowered_module - from executorch.exir.dialects.edge._ops import EdgeOpOverload from executorch.exir.error import ExportError, ExportErrorType from torch import fx @@ -37,6 +36,7 @@ from torch._subclasses import FakeTensorMode, UnsupportedFakeTensorException from torch._subclasses.fake_tensor import FakeTensor from torch._subclasses.functional_tensor import FunctionalTensor, FunctionalTensorMode +from torch.export import ExportedProgram from torch.fx import traceback as fx_traceback from torch.fx.experimental.proxy_tensor import PythonKeyTracer from torch.fx.graph import CodeGen @@ -182,6 +182,58 @@ class ExportPassBaseError(RuntimeError): pass +@dataclass(frozen=True) +class ExportedProgramPassResult: + exported_program: ExportedProgram + modified: bool + + +class ExportedProgramPassBase(ABC): + """ + Base interface for implementing passes that operate on ExportedProgram. + """ + + def __call__(self, exported_program: ExportedProgram) -> ExportedProgramPassResult: + """ + Runs the precondition check, the pass itself, and the postcondition check. + """ + + self.requires(exported_program) + res = self.call(exported_program) + self.ensures(exported_program) + return res + + @abstractmethod + def call(self, exported_program: ExportedProgram) -> ExportedProgramPassResult: + """ + The pass that is run through the given exported program. To implement a + pass, it is required to implement this function. 
+ + Args: + exported_program: The exported program we will run a pass on + """ + + def requires(self, exported_program: ExportedProgram) -> None: # noqa: B027 + """ + This function will be called before the pass is run and will check that + the given exported program contains the preconditions needed to run the + pass. It is not required to implement this function. + + Args: + exported_program: The exported program we will run checks on + """ + + def ensures(self, exported_program: ExportedProgram) -> None: # noqa: B027 + """ + This function will be called after the pass is run and will check that + the given exported program contains the postconditions needed to run the + pass. It is not required to implement this function. + + Args: + exported_program: The exported program we will run checks on + """ + + class _ExportPassBase(PassBase): """ Interpreter-based pass class to help users maintain the IR spec while writing diff --git a/exir/pass_manager.py b/exir/pass_manager.py index b812ccea7b8..351e98651dd 100644 --- a/exir/pass_manager.py +++ b/exir/pass_manager.py @@ -5,28 +5,46 @@ # LICENSE file in the root directory of this source tree. 
# pyre-strict - -from typing import Callable, List, Optional, Union +import copy +import inspect +import logging +from typing import Callable, List, Optional, Type, TypeAlias, Union import torch import torch.fx.passes.infra.pass_manager as fx import torch.utils._pytree as pytree +from executorch.exir._program_utils import ( + _get_updated_graph_signature, + _get_updated_range_constraints, +) from executorch.exir.error import ExportError, ExportErrorType +from executorch.exir.pass_base import ExportedProgramPassBase, ExportedProgramPassResult +from torch._export.verifier import Verifier +from torch.export import ExportedProgram from torch.fx.passes.infra.pass_base import PassResult -from typing_extensions import TypeAlias +from torch.fx.passes.infra.pass_manager import pass_result_wrapper + +logger = logging.getLogger(__name__) +logger.setLevel(logging.WARNING) + +PassType: TypeAlias = Union[ + ExportedProgramPassBase, Callable[[torch.fx.GraphModule], Optional[PassResult]] +] + -PassType: TypeAlias = Callable[[torch.fx.GraphModule], Optional[PassResult]] +def _get_pass_name(fn: PassType) -> str: + """Returns a human-readable name for a pass.""" + return fn.__name__ if inspect.isfunction(fn) else type(fn).__name__ class PassManager(fx.PassManager): """ - Class to run multiple passes on a given graph module. The PassManager is - callable so to run it, we can just call the PassManager instance. + Runs multiple passes on a GraphModule. - Private Attributes: - * **passes**: A list of callable passes - * **params**: An instance of PassManagerParams containing the result of the - flags set in the constructor. + This is the legacy PassManager that extends torch.fx.passes.infra.pass_manager.PassManager. + Use this when you need to run passes on a GraphModule directly. + + For running passes on ExportedProgram, use ExportedProgramPassManager instead. 
""" def __init__( @@ -34,14 +52,11 @@ def __init__( passes: Optional[Union[List[PassType], List[List[PassType]]]] = None, run_checks_after_each_pass: bool = False, suppress_check_failures: bool = False, + steps: int = 1, ) -> None: - r""" - Args: - passes: A list of passes - enable_debug_pass: set to true to enable the debug passes - run_checks_after_each_pass: whether to run checks and linting after each pass - """ - + logger.warning( + "PassManager is deprecated. Please use ExportedProgramPassManager instead." + ) # Flatten the passes to a list of callables passes = passes if passes else [] flattened_passes = [ @@ -52,6 +67,7 @@ def __init__( flattened_passes, run_checks_after_each_pass=run_checks_after_each_pass, suppress_check_failures=suppress_check_failures, + steps=steps, ) def check(self, module: torch.nn.Module) -> None: @@ -65,10 +81,9 @@ def check(self, module: torch.nn.Module) -> None: node's spec field is a tuple) - Ensure that the graph module has type torch.fx.GraphModule """ - assert isinstance(module, fx.GraphModule) + assert isinstance(module, torch.fx.GraphModule) module.recompile() module.graph.lint() - # TODO(qihan): use verifier.check_is_exir for node in module.graph.nodes: if node.op == "call_method": @@ -76,3 +91,151 @@ def check(self, module: torch.nn.Module) -> None: ExportErrorType.NOT_SUPPORTED, f"call_method `{node}` is not supported except for backend delegate.", ) + + +class ExportedProgramPassManager(fx.PassManager): + """ + Runs multiple passes on an ExportedProgram. + + This PassManager is specifically designed for ExportedProgram and supports + both GraphModule-only passes and ExportedProgram-aware passes. + + For running passes on GraphModule directly, use PassManager instead. 
+ """ + + def __init__( + self, + passes: Optional[Union[List[PassType], List[List[PassType]]]] = None, + constraints: Optional[List[Callable[[Callable, Callable], bool]]] = None, + run_checks_after_each_pass: bool = False, + suppress_check_failures: bool = False, + steps: int = 1, + ) -> None: + wrapped_passes = ( + [ + ( + fn + if isinstance(fn, ExportedProgramPassBase) + else pass_result_wrapper(fn) + ) + for fn in pytree.tree_flatten(passes)[0] + ] + if passes + else [] + ) + + super().__init__( + wrapped_passes, + constraints=constraints, + run_checks_after_each_pass=run_checks_after_each_pass, + suppress_check_failures=suppress_check_failures, + steps=steps, + ) + + def check(self, exported_program: ExportedProgram) -> None: + """Validates graph module invariants.""" + graph_module = exported_program.graph_module + graph_module.recompile() + graph_module.graph.lint() + + for node in graph_module.graph.nodes: + if node.op == "call_method": + raise ExportError( + ExportErrorType.NOT_SUPPORTED, + f"call_method `{node}` is not supported except for backend delegate.", + ) + + exported_program.validate() + + # pyre-ignore[14]: Intentionally overriding with different signature for ExportedProgram + def __call__( # noqa: C901 + self, + exported_program: ExportedProgram, + override_verifiers: Optional[list[Type[Verifier]]] = None, + ) -> ExportedProgramPassResult: + """ + Runs passes on an ExportedProgram. + + Handles both GraphModule-only passes and ExportedProgram-aware passes. Will create a shallow copy of the exported program before running passes. + + Args: + exported_program: The exported program to transform. + + Returns: + ExportedProgramPassResult containing the transformed program. 
+ """ + if not self._validated: + self.solve_constraints() + + exported_program = copy.copy(exported_program) + + if override_verifiers: + exported_program._verifiers = override_verifiers + + self.check(exported_program) + + overall_modified = False + + for _ in range(self.steps): + step_modified = False + + for i, fn in enumerate(self.passes): + pass_modified = False + try: + if not isinstance(fn, ExportedProgramPassBase): + res = fn(exported_program.graph_module) + if res is None: + raise TypeError( + f"The result of pass {_get_pass_name(fn)} should be type PassResult. " + "Please wrap it with pass_result_wrapper()" + ) + + if res.modified: + # Not running _update_exported_program_graph_module here because it is + # possible that the verifier will fail upon new ExportedProgram construction, + # and we should only run verification after each pass if + # run_checks_after_each_pass is True. + res.graph_module.recompile() + exported_program._graph_module = res.graph_module + exported_program._graph_signature = ( + _get_updated_graph_signature( + exported_program.graph_signature, + res.graph_module, + ) + ) + exported_program._range_constraints = ( + _get_updated_range_constraints(res.graph_module) + ) + pass_modified = True + + else: + assert isinstance(fn, ExportedProgramPassBase) + ep_res = fn(exported_program) + exported_program = ep_res.exported_program + + if ep_res.modified: + pass_modified = True + exported_program.graph_module.recompile() + + if self.run_checks_after_each_pass: + self.check(exported_program) + + if pass_modified: + step_modified = True + logger.debug( + "Graph after pass '%s': %s", + _get_pass_name(fn), + exported_program.graph_module.graph, + ) + + except Exception as e: + prev_names = [_get_pass_name(p) for p in self.passes[:i]] + msg = f"An error occurred when running the '{_get_pass_name(fn)}' pass after the following passes: {prev_names}" + raise Exception(msg) from e # noqa: TRY002 + + overall_modified = overall_modified or 
step_modified + if not step_modified: + break + + self.check(exported_program) + return ExportedProgramPassResult(exported_program, overall_modified) diff --git a/exir/program/BUCK b/exir/program/BUCK index 7d9642efdb7..11f62edd99e 100644 --- a/exir/program/BUCK +++ b/exir/program/BUCK @@ -22,6 +22,7 @@ fbcode_target(_kind = runtime.python_library, ], deps = [ "//caffe2:torch", + "//executorch/exir:_program_utils", "//executorch/exir:error", "//executorch/exir:graph_module", "//executorch/exir:pass_base", diff --git a/exir/program/_program.py b/exir/program/_program.py index b3d94c8ffd7..485d72bbe45 100644 --- a/exir/program/_program.py +++ b/exir/program/_program.py @@ -5,8 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# pyre-unsafe - +# pyre-strict import copy import io import logging @@ -38,7 +37,8 @@ from executorch.exir.operator.convert import _pybind_schema_to_native_schema from executorch.exir.operator.util import _QUANT_PRIMITIVES from executorch.exir.pass_base import PassBase -from executorch.exir.pass_manager import PassType +from executorch.exir.pass_manager import ExportedProgramPassManager, PassType + from executorch.exir.passes import ( base_post_op_replace_passes, base_pre_op_replace_passes, @@ -88,17 +88,11 @@ from torch.export._remove_auto_functionalized_pass import ( unsafe_remove_auto_functionalized_pass, ) -from torch.export.exported_program import ( - ConstantArgument, - ExportGraphSignature, - InputKind, - InputSpec, - OutputSpec, - TensorArgument, -) +from torch.export.exported_program import InputKind, InputSpec, TensorArgument from torch.fx import _pytree as fx_pytree from torch.fx._compatibility import compatibility -from torch.fx.passes.infra.pass_manager import PassManager +from torch.fx.passes.infra.pass_manager import PassManager as GraphModulePassManager + from torch.utils import _pytree as pytree Val = Any @@ -131,93 +125,10 @@ def 
wrapper(*args: Any, **kwargs: Any) -> Any: transform_op_to_aten_op = {} -def _get_updated_range_constraints(gm): - def get_shape_env(gm): - vals = [ - node.meta["val"] - for node in gm.graph.nodes - if node.meta.get("val", None) is not None - ] - from torch._guards import detect_fake_mode # type: ignore[21] - - fake_mode = detect_fake_mode(vals) - if fake_mode is not None: - return fake_mode.shape_env - for v in vals: - if isinstance(v, torch.SymInt): - return v.node.shape_env - - shape_env = get_shape_env(gm) - if shape_env is None: - return {} - range_constraints = { - shape_env.replacements.get(k, k): v for k, v in shape_env.var_to_range.items() - } - # Only when we have an unbacked symint, and it's used as constructor inputs, - # runtime_var_to_range will make a difference compated to var_to_range. - # e.g. [2, oo) -> [0, oo) - for k, v in shape_env.var_to_range.items(): - if k not in shape_env.replacements: - range_constraints[k] = v - return range_constraints - - -def _get_updated_graph_signature( - old_signature: ExportGraphSignature, - new_gm: torch.fx.GraphModule, -) -> ExportGraphSignature: - """ - Update the graph signature's user_input/user_outputs. - """ - new_input_specs = [] - i = 0 - for node in new_gm.graph.nodes: - if node.op != "placeholder": - continue - - assert i < len( - old_signature.input_specs - ), "Number of inputs changed after transformation" - old_input_spec = old_signature.input_specs[i] - arg = ( - old_input_spec.arg - if isinstance(old_input_spec.arg, ConstantArgument) - # pyre-fixme[20]: Argument `class_fqn` expected. 
- else type(old_input_spec.arg)(node.name) - ) - new_input_specs.append( - InputSpec( - old_input_spec.kind, - arg, - old_input_spec.target, - persistent=old_input_spec.persistent, - ) - ) - i += 1 - - output_node = new_gm.graph.output_node() - assert output_node.op == "output" - - new_output_specs = [] - for i, node in enumerate(output_node.args[0]): - assert i < len( - old_signature.output_specs - ), "Number of outputs changed after transformation" - old_output_spec = old_signature.output_specs[i] - arg = ( - old_output_spec.arg - if isinstance(old_output_spec.arg, ConstantArgument) - # pyre-fixme[20]: Argument `class_fqn` expected. - else type(old_output_spec.arg)(node.name) - ) - new_output_specs.append( - OutputSpec(old_output_spec.kind, arg, old_output_spec.target) - ) - - new_signature = ExportGraphSignature( - input_specs=new_input_specs, output_specs=new_output_specs - ) - return new_signature +from executorch.exir._program_utils import ( # noqa: E402 + _get_updated_graph_signature, + _get_updated_range_constraints, +) def _transform( @@ -243,13 +154,13 @@ def _transform( ), f"Expected all passes to be of PassType, not list or Verifier. Use override_verifiers kwarg instead. Got: {list(passes)}" return _transform_with_pass_manager( - self, PassManager(list(passes)), override_verifiers + self, ExportedProgramPassManager(list(passes)), override_verifiers ) def _transform_with_pass_manager( - self, - pass_manager: PassManager, + self: ExportedProgram, + pass_manager: Union[ExportedProgramPassManager, GraphModulePassManager], override_verifiers: None | list[Type[Verifier]] = None, ) -> "ExportedProgram": """ @@ -258,22 +169,26 @@ def _transform_with_pass_manager( Args: self: The ExportedProgram instance to transform pass_manager: An instance of PassManager to apply transformations. 
+ - ExportedProgramPassManager: operates on the full ExportedProgram + - GraphModulePassManager: operates on the GraphModule only override_verifiers: Optional list of verifier classes to use instead of the default verifiers. This is needed if the transforms yields illegal graph that the default verifier cannot handle. Returns: ExportedProgram: A new ExportedProgram with the transformations applied, or self if no changes were made """ - res = pass_manager(self.graph_module) - transformed_gm = res.graph_module if res is not None else self.graph_module - assert transformed_gm is not None - - if transformed_gm is self.graph_module and not res.modified: - return self - - return _update_exported_program_graph_module( - self, transformed_gm, override_verifiers - ) + if isinstance(pass_manager, ExportedProgramPassManager): + res = pass_manager(self, override_verifiers) + if not res.modified: + return self + return res.exported_program + else: + res = pass_manager(self.graph_module) + if not res.modified: + return self + return _update_exported_program_graph_module( + self, res.graph_module, override_verifiers + ) def _update_exported_program_graph_module( @@ -1324,7 +1239,12 @@ def collect_named_data_store_outputs( def to_edge_transform_and_lower( # noqa: C901 programs: Union[ExportedProgram, Dict[str, ExportedProgram]], transform_passes: Optional[ - Union[Sequence[PassType], Dict[str, Sequence[PassType]], PassManager] + Union[ + Sequence[PassType], + Dict[str, Sequence[PassType]], + GraphModulePassManager, + ExportedProgramPassManager, + ] ] = None, partitioner: Optional[ Union[List[Partitioner], Dict[str, List[Partitioner]]] @@ -1359,7 +1279,7 @@ def to_edge_transform_and_lower( # noqa: C901 2) a dictionary - only method names specified in the dictionary will be transformed with their corresponding passes - 3) an instance of a PassManager - + 3) an instance of a PassManager (either a GraphModulePassManager or an ExportedProgramPassManager) - all methods in the given 
EdgeProgramManager will be transformed with the given PassManager instance. @@ -1604,7 +1524,12 @@ def exported_program(self, method_name: str = "forward") -> ExportedProgram: @et_logger("transform") def transform( self, - passes: Union[Sequence[PassType], Dict[str, Sequence[PassType]], PassManager], + passes: Union[ + Sequence[PassType], + Dict[str, Sequence[PassType]], + ExportedProgramPassManager, + GraphModulePassManager, + ], compile_config: Optional[EdgeCompileConfig] = None, ) -> "EdgeProgramManager": """ @@ -1618,7 +1543,7 @@ def transform( 2) a dictionary mapping method names to lists of passes - only method names specified in the dictionary will be transformed with their corresponding passes. - 3) a PassManager instance - + 3) a PassManager (either ExportedProgramPassManager or GraphModulePassManager) instance - all methods in the given EdgeProgramManager will be transformed with the given PassManager instance. compile_config: Compile config to use for veriy the correctness of model @@ -1637,13 +1562,15 @@ def transform( # Cast passes parameter upfront. 
passes_seq: Optional[Sequence[PassType]] = None passes_dict: Optional[Dict[str, Sequence[PassType]]] = None - pass_manager: Optional[PassManager] = None + pass_manager: Optional[ + Union[ExportedProgramPassManager, GraphModulePassManager] + ] = None if isinstance(passes, Sequence): passes_seq = passes if isinstance(passes, dict): passes_dict = passes - if isinstance(passes, PassManager): + if isinstance(passes, (ExportedProgramPassManager, GraphModulePassManager)): pass_manager = passes for name, program in self._edge_programs.items(): diff --git a/exir/tests/test_pass_infra.py b/exir/tests/test_pass_infra.py index ded3c0e849d..7df6b76b93a 100644 --- a/exir/tests/test_pass_infra.py +++ b/exir/tests/test_pass_infra.py @@ -9,14 +9,22 @@ import unittest +import executorch.exir as exir import torch -from executorch.exir import to_edge -from executorch.exir.pass_base import ExportPassBaseError, ProxyValue -from executorch.exir.pass_manager import PassManager +from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.pass_base import ( + ExportedProgramPassBase, + ExportedProgramPassResult, + ExportPassBaseError, + ProxyValue, +) +from executorch.exir.pass_manager import ExportedProgramPassManager, PassManager from executorch.exir.passes import ScalarToTensorPass from executorch.exir.passes.pass_registry import PassRegistry -from torch.export import Dim, export -from torch.fx.passes.infra.pass_base import PassBase +from executorch.exir.program import to_edge +from torch.export import Dim, export, ExportedProgram +from torch.export.graph_signature import InputKind, InputSpec, TensorArgument +from torch.fx.passes.infra.pass_base import PassBase, PassResult class TestPassInfra(unittest.TestCase): @@ -216,3 +224,228 @@ def test_rejects_implicit_symbolic_scalar_coercions(self) -> None: with self.assertRaisesRegex(ExportPassBaseError, "converted to float"): float(ProxyValue(sym_float, torch.fx.Graph().placeholder("x"))) + + +class 
TestExportedProgramPassManager(unittest.TestCase): + def test_runs_graph_module_passes_on_exported_program(self) -> None: + """ + Tests that ExportedProgramPassManager runs GraphModule passes + on an ExportedProgram and the graph is correctly modified. + """ + + def replace_add_with_mul(gm: torch.fx.GraphModule) -> PassResult: + modified = False + for node in gm.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.add.Tensor + ): + node.target = exir_ops.edge.aten.mul.Tensor + modified = True + return PassResult(gm, modified) + + def f(x: torch.Tensor) -> torch.Tensor: + y = torch.add(x, x) + z = torch.add(y, x) + return z + + exported_program = ( + exir.capture(f, (torch.randn(10),), exir.CaptureConfig()) + .to_edge() + .exported_program + ) + + pm = ExportedProgramPassManager(passes=[replace_add_with_mul]) + result = pm(exported_program) + + # Verify return type + self.assertIsInstance(result, ExportedProgramPassResult) + self.assertTrue(result.modified) + + # Check that all add ops were replaced with mul + self.assertEqual( + len( + result.exported_program.graph.find_nodes( + op="call_function", target=exir_ops.edge.aten.add.Tensor + ) + ), + 0, + ) + + def test_updates_constants_on_exported_program(self) -> None: + """ + Tests that ExportedProgramPassManager can update constants + in the ExportedProgram using an ExportedProgram-aware pass. 
+ """ + + class DoubleConstantsPass(ExportedProgramPassBase): + """Pass that doubles all constant tensor values in the ExportedProgram.""" + + def call(self, ep: ExportedProgram) -> ExportedProgramPassResult: + modified = False + for key, const in ep.constants.items(): + if isinstance(const, torch.Tensor): + ep.constants[key] = const * 2 + modified = True + return ExportedProgramPassResult(ep, modified) + + class ModuleWithConstant(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.weight = torch.ones(3) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + self.weight + + module = ModuleWithConstant() + exported_program = to_edge( + torch.export.export(module, (torch.randn(3),)) + ).exported_program() + + # Verify there are constants in the ExportedProgram + self.assertGreater( + len(exported_program.constants), 0, "Expected constants in ExportedProgram" + ) + + # Store original constant values + original_values = { + key: const.clone() + for key, const in exported_program.constants.items() + if isinstance(const, torch.Tensor) + } + + pm = ExportedProgramPassManager(passes=[DoubleConstantsPass()]) + result = pm(exported_program) + + self.assertIsInstance(result, ExportedProgramPassResult) + self.assertTrue(result.modified) + + # Verify constants were doubled + for key, original_const in original_values.items(): + new_const = result.exported_program.constants[key] + self.assertTrue( + torch.allclose(new_const, original_const * 2), + f"Constant {key} was not doubled correctly", + ) + + def test_adds_constant_to_exported_program(self) -> None: + """ + Tests that ExportedProgramPassManager can add a new constant + to the ExportedProgram, including updating the graph and input specs. 
+ """ + + class AddConstantPass(ExportedProgramPassBase): + """Pass that adds a new constant tensor to the ExportedProgram.""" + + def call(self, ep: ExportedProgram) -> ExportedProgramPassResult: + graph = ep.graph_module.graph + sig = ep.graph_signature + + # Find the first user input to insert before it + placeholders = graph.find_nodes(op="placeholder") + assert len(placeholders) == 1 + user_input_node = placeholders[0] + + # Create a new constant tensor + new_constant_name = "_test_added_constant" + new_constant_tensor = torch.tensor([1.0, 2.0, 3.0]) + + # Add placeholder node for the new constant + with graph.inserting_before(user_input_node): + new_placeholder = graph.placeholder(new_constant_name) + # Set up meta for the new placeholder + new_placeholder.meta["val"] = new_constant_tensor + + # Add the constant to the constants dict + ep.constants[new_constant_name] = new_constant_tensor + + # Update input specs to include the new constant + new_input_spec = InputSpec( + kind=InputKind.CONSTANT_TENSOR, + arg=TensorArgument(name=new_placeholder.name), + target=new_constant_name, + persistent=False, + ) + sig.input_specs = (new_input_spec, sig.input_specs[0]) + + return ExportedProgramPassResult(ep, modified=True) + + class IdentityModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return x + + exported_program = to_edge( + torch.export.export(IdentityModule(), (torch.randn(3),)) + ).exported_program() + assert len(exported_program.constants) == 0 + assert len(exported_program.graph_signature.input_specs) == 1 + + pm = ExportedProgramPassManager(passes=[AddConstantPass()]) + result = pm(exported_program) + + self.assertIsInstance(result, ExportedProgramPassResult) + self.assertTrue(result.modified) + + # Verify the new constant was added to constants dict + self.assertEqual(len(result.exported_program.constants), 1) + self.assertIn("_test_added_constant", 
result.exported_program.constants) + self.assertTrue( + torch.allclose( + result.exported_program.constants["_test_added_constant"], + torch.tensor([1.0, 2.0, 3.0]), + ) + ) + + # Verify input_specs was updated + self.assertEqual( + len(result.exported_program.graph_signature.input_specs), + 2, + ) + + # Verify the new placeholder exists in the graph + placeholder_names = [ + node.target + for node in result.exported_program.graph_module.graph.find_nodes( + op="placeholder" + ) + ] + self.assertTrue(len(placeholder_names) == 2) + + # Verify the new input spec has the correct kind + new_spec = None + for spec in result.exported_program.graph_signature.input_specs: + if spec.target == "_test_added_constant": + new_spec = spec + break + self.assertIsNotNone(new_spec) + self.assertEqual(new_spec.kind, InputKind.CONSTANT_TENSOR) + + def test_invalid_pass_creates_call_method(self) -> None: + """ + Tests that ExportedProgramPassManager detects invalid passes + that introduce call_method nodes. + """ + + def introduce_call_method(gm: torch.fx.GraphModule) -> PassResult: + node = list(gm.graph.nodes)[-2] + with gm.graph.inserting_after(node): + gm.graph.call_method("torch.ops.relu", (torch.randn(2),)) + return PassResult(gm, True) + + def f(x: torch.Tensor) -> torch.Tensor: + y = torch.add(x, x) + return y + + exported_program = ( + exir.capture(f, (torch.randn(10),), exir.CaptureConfig()) + .to_edge() + .exported_program + ) + + pm = ExportedProgramPassManager( + passes=[introduce_call_method], run_checks_after_each_pass=True + ) + + with self.assertRaisesRegex(Exception, "call_method"): + pm(exported_program) From 2c9c9dda6eaf3ad764b2dc260a503efc01526eef Mon Sep 17 00:00:00 2001 From: Usamah Date: Wed, 27 May 2026 10:43:09 +0100 Subject: [PATCH 033/103] Arm backend: Enable Swin2SR TOSA ref tests (#19771) Summary: - Enable Swin2SR FP and INT TOSA pipelines to run through the reference model. 
- Keep quantized VGF runtime execution Linux-only until Darwin VKML validation is available. - Record current Swin2SR partition boundaries and track delegation gaps in MLETORCH-2163. Test Plan: - lintrunner on test_swin2sr_arm.py - backends/arm/scripts/pre-push cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Usamah Zaheer --- backends/arm/test/models/test_swin2sr_arm.py | 41 +++++++++++++------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/backends/arm/test/models/test_swin2sr_arm.py b/backends/arm/test/models/test_swin2sr_arm.py index 6bf9b2a18d5..e4fc6f07950 100644 --- a/backends/arm/test/models/test_swin2sr_arm.py +++ b/backends/arm/test/models/test_swin2sr_arm.py @@ -3,6 +3,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import sys from typing import Tuple import torch @@ -17,7 +18,7 @@ input_t = Tuple[torch.Tensor] -exir_ops = [ +ops_expected_absent_after_lowering = [ "executorch_exir_dialects_edge__ops_aten_add_Tensor", "executorch_exir_dialects_edge__ops_aten_convolution_default", "executorch_exir_dialects_edge__ops_aten_layer_norm_default", @@ -27,6 +28,21 @@ "executorch_exir_dialects_edge__ops_aten_softmax_int", ] +# TODO/MLETORCH-2163: Investigate Swin2SR delegation gaps around index/view +# in FP and Q/DQ, clamp, and expand_copy in INT. 
+swin2sr_fp_lowered_outer_graph_ops = { + "torch.ops.higher_order.executorch_call_delegate": 2, + "executorch_exir_dialects_edge__ops_aten_index_Tensor": 2, + "executorch_exir_dialects_edge__ops_aten_view_copy_default": 2, +} +swin2sr_int_lowered_outer_graph_ops = { + "torch.ops.higher_order.executorch_call_delegate": 3, + "executorch_exir_dialects_edge__ops_aten_clamp_default": 4, + "executorch_exir_dialects_edge__ops_aten_expand_copy_default": 4, + "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 5, + "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 6, +} + class TinySwin2SR(torch.nn.Module): def __init__(self): @@ -62,12 +78,10 @@ def test_swin2sr_tosa_FP(): model, model_inputs, aten_op=[], - exir_op=exir_ops, + exir_op=ops_expected_absent_after_lowering, use_to_edge_transform_and_lower=True, ) - pipeline.pop_stage("check_count.exir") - # TODO: MLETORCH-2134 re-enable once Swin2SR runs on the TOSA ref model. - pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.change_args("check_count.exir", swin2sr_fp_lowered_outer_graph_ops) pipeline.run() @@ -77,12 +91,10 @@ def test_swin2sr_tosa_INT(): model, model_inputs, aten_op=[], - exir_op=exir_ops, + exir_op=ops_expected_absent_after_lowering, use_to_edge_transform_and_lower=True, ) - pipeline.pop_stage("check_count.exir") - # TODO: MLETORCH-2134 re-enable once Swin2SR runs on the TOSA ref model. - pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.change_args("check_count.exir", swin2sr_int_lowered_outer_graph_ops) pipeline.run() @@ -93,13 +105,12 @@ def test_swin2sr_vgf_quant(): model, model_inputs, aten_op=[], - exir_op=exir_ops, + exir_op=ops_expected_absent_after_lowering, use_to_edge_transform_and_lower=True, quantize=True, + run_on_vulkan_runtime=sys.platform == "linux", ) - pipeline.pop_stage("check_count.exir") - # TODO: MLETORCH-2134 re-enable once Swin2SR runs on the TOSA ref model. 
- pipeline.pop_stage("run_method_and_compare_outputs") + pipeline.change_args("check_count.exir", swin2sr_int_lowered_outer_graph_ops) pipeline.run() @@ -110,9 +121,9 @@ def test_swin2sr_vgf_no_quant(): model, model_inputs, aten_op=[], - exir_op=exir_ops, + exir_op=ops_expected_absent_after_lowering, use_to_edge_transform_and_lower=True, quantize=False, ) - pipeline.pop_stage("check_count.exir") + pipeline.change_args("check_count.exir", swin2sr_fp_lowered_outer_graph_ops) pipeline.run() From dd00d42d7d0a751ddbf99d72efee802c427c654b Mon Sep 17 00:00:00 2001 From: SaoirseARM <44364573+SaoirseARM@users.noreply.github.com> Date: Wed, 27 May 2026 10:56:01 +0100 Subject: [PATCH 034/103] Arm backend: Fix nested control-flow partition checks (#19697) - Updates so that the outer cond graph is picked up. - Updates to nested quantization. - Removes need for increased threshold. Signed-off-by: Saoirse Stewart --- backends/arm/_passes/arm_pass_utils.py | 49 +------- .../arm/_passes/control_flow_const_inline.py | 8 +- backends/arm/_passes/insert_rescales_pass.py | 8 +- .../arm/_passes/scalars_to_attribute_pass.py | 8 +- .../operator_support/control_flow_support.py | 26 +++-- backends/arm/operators/op_cond_if.py | 19 +++- backends/arm/operators/op_while.py | 19 +++- backends/arm/quantizer/arm_quantizer.py | 105 ++++++++++++------ backends/arm/test/ops/test_cond.py | 2 - backends/arm/tosa/backend.py | 61 +++++++++- backends/arm/tosa/mapping.py | 1 + backends/arm/tosa/partitioner.py | 8 +- 12 files changed, 193 insertions(+), 121 deletions(-) diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py index 000f92135eb..f66b17b9da2 100644 --- a/backends/arm/_passes/arm_pass_utils.py +++ b/backends/arm/_passes/arm_pass_utils.py @@ -9,7 +9,7 @@ import operator import traceback from inspect import isclass -from typing import cast, List, Optional, Sequence, Tuple +from typing import cast, Optional, Sequence import torch import torch.fx @@ -19,10 +19,6 
@@ from executorch.exir import ExportedProgram from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload -from executorch.exir.graph_module import ( - _get_control_flow_submodules, - get_control_flow_submodules, -) from executorch.exir.pass_base import NodeMetadata from torch._export.utils import ( @@ -36,7 +32,6 @@ from torch._ops import OpOverload from torch._subclasses.fake_tensor import FakeTensor from torch.export.graph_signature import InputKind -from torch.fx import GraphModule, Node def is_submodule_node(node: torch.fx.Node): @@ -364,48 +359,6 @@ def set_node_arg(node: torch.fx.Node, i: int | str, value): raise RuntimeError("Invalid type") -def is_nested_control_flow_graph(graph_module: GraphModule) -> bool: - """Returns True if graph_module is a nested control-flow graph.""" - - # Find all top-level control-flow submodules - top_cf = get_control_flow_submodules(graph_module) - # For each submodule, see if it itself has control-flow inside - for _, submod, _ in top_cf: - if get_control_flow_submodules(submod): - return True - return False - - -def get_cond_while_submodules_nested( - graph_module: GraphModule, - apply_quantization: bool = False, -) -> List[Tuple[str, GraphModule, Node]]: - """Recursively find cond/while_loop submodules in an GraphModule. - - In nested control flow graphs, FX records the submodule functions - (true/false or cond/body) in reverse order compared to top-level graphs. We - must swap the indices when nested so that cond (first) and body/true_fn - (second) are consistently identified across all nesting levels. 
- - """ - - # Determine arg indices based on nesting and whether only cond branch is needed - nested = is_nested_control_flow_graph(graph_module) - # cond: [true_fn, false_fn] or swapped if nested - cond_indices = [2, 1] if nested else [1, 2] - # while_loop: [cond_fn, body_fn] or swapped if nested - while_indices = [1, 0] if nested else [0, 1] - if apply_quantization: - # only keep the cond_fn for while_loop (first index) when quantizing. - while_indices = [while_indices[0]] - mapping = { - torch.ops.higher_order.cond: cond_indices, - torch.ops.higher_order.while_loop: while_indices, - } - # collect cond/while submodules (using mapping indices) - return _get_control_flow_submodules(graph_module, mapping) - - def to_2tuple(value): """Normalizes scalars, and 1-element sequences to a tuple of length 2.""" if isinstance(value, int): diff --git a/backends/arm/_passes/control_flow_const_inline.py b/backends/arm/_passes/control_flow_const_inline.py index cc76e5d9957..177ad30754e 100644 --- a/backends/arm/_passes/control_flow_const_inline.py +++ b/backends/arm/_passes/control_flow_const_inline.py @@ -7,12 +7,10 @@ import torch from executorch.backends.arm._passes.arm_pass import ArmPass -from executorch.backends.arm._passes.arm_pass_utils import ( - get_cond_while_submodules_nested, - is_submodule_node, -) +from executorch.backends.arm._passes.arm_pass_utils import is_submodule_node from executorch.backends.transforms.utils import is_get_attr_node from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_cond_while_submodules from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule @@ -37,7 +35,7 @@ class ControlFlowConstInlinePass(ArmPass): def _convert_getattr(self, graph_module): modified = False - for _, submodule, _ in get_cond_while_submodules_nested(graph_module): + for _, submodule, _ in get_cond_while_submodules(graph_module): for submodule_node in submodule.graph.nodes: if 
submodule_node.target in self._targeted_ops: self._convert_getattr(submodule) diff --git a/backends/arm/_passes/insert_rescales_pass.py b/backends/arm/_passes/insert_rescales_pass.py index 06c27005440..45374c12c3b 100644 --- a/backends/arm/_passes/insert_rescales_pass.py +++ b/backends/arm/_passes/insert_rescales_pass.py @@ -509,7 +509,13 @@ def _rescale_submodule_inputs( input_node = input_nodes[qargs_index] if len(input_node.users) == 0: continue - if len(out_qparams_map := input_node.meta.get("output_qparams", {})) != 1: + out_qparams_map = input_node.meta.get("output_qparams", {}) + if len(out_qparams_map) == 0: + # Nested control-flow submodules may also expose frozen captured + # values as placeholders. Those are not control-flow boundary + # inputs, so there is no qparam pair to bridge with a RESCALE. + continue + if len(out_qparams_map) != 1: raise ValueError( f"Expected submodule input {input_node} to have exactly one output qparam, got {out_qparams_map}" ) diff --git a/backends/arm/_passes/scalars_to_attribute_pass.py b/backends/arm/_passes/scalars_to_attribute_pass.py index 0473caf91e7..63a38b8cb2f 100644 --- a/backends/arm/_passes/scalars_to_attribute_pass.py +++ b/backends/arm/_passes/scalars_to_attribute_pass.py @@ -8,11 +8,9 @@ import torch from executorch.backends.arm._passes import ArmPass -from executorch.backends.arm._passes.arm_pass_utils import ( - get_cond_while_submodules_nested, - get_first_fake_tensor, -) +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass +from executorch.exir.graph_module import get_cond_while_submodules from executorch.exir.pass_base import ExportPass, PassResult from torch.fx import GraphModule, Node from torchao.quantization.pt2e.utils import get_new_attr_name_with_prefix @@ -98,7 +96,7 @@ def handle_control_nodes(self, graph_module: GraphModule) -> None: """Apply scalar argument conversion on subgraphs of 
control-flow nodes. """ - for _, submodule, _ in get_cond_while_submodules_nested(graph_module): + for _, submodule, _ in get_cond_while_submodules(graph_module): for submodule_node in submodule.graph.nodes: self._convert_scalar_args(submodule, submodule_node) diff --git a/backends/arm/operator_support/control_flow_support.py b/backends/arm/operator_support/control_flow_support.py index b34ebeaece0..f5251357cd3 100644 --- a/backends/arm/operator_support/control_flow_support.py +++ b/backends/arm/operator_support/control_flow_support.py @@ -19,6 +19,13 @@ from torch.fx.passes.operator_support import OperatorSupportBase +def _owning_graph_module(node: fx.Node) -> fx.GraphModule: + graph_module = getattr(node.graph, "owning_module", None) + if not isinstance(graph_module, fx.GraphModule): + raise RuntimeError(f"Could not resolve owning GraphModule for node {node}") + return graph_module + + def _fully_partitioned(submodule: fx.GraphModule) -> bool: """Check that all nested control-flow ops within this submodule are also fully partitioned. @@ -27,8 +34,8 @@ def _fully_partitioned(submodule: fx.GraphModule) -> bool: for submodule_node in submodule.graph.nodes: if submodule_node.target in ControlFlowOpSupported._targeted_ops: - if _submodules_fully_partitioned(submodule_node, submodule): - return True + if not _submodules_fully_partitioned(submodule_node, submodule): + return False if submodule_node.op != "call_function": continue @@ -56,13 +63,18 @@ def _fully_partitioned(submodule: fx.GraphModule) -> bool: return True -def _submodules_fully_partitioned(node: fx.Node, graph_module: fx.GraphModule) -> bool: +def _submodules_fully_partitioned( + node: fx.Node, graph_module: fx.GraphModule | None = None +) -> bool: """Returns whether the submodule arguments to a cond node were fully partitioned. Updates "val" meta of the submodules if they are. 
""" + if graph_module is None: + graph_module = _owning_graph_module(node) + match node.target: case torch.ops.higher_order.cond: submodule_args = node.args[1:3] @@ -129,9 +141,7 @@ def is_node_supported( node, f"Submodule had unsupported user {user}" ) return False - if not _submodules_fully_partitioned( - user, self.exported_program.graph_module - ): + if not _submodules_fully_partitioned(user): self.reporter.report_reject( node, "One submodule was not fully partitioned" ) @@ -174,9 +184,7 @@ def is_node_supported( ) return False - if not _submodules_fully_partitioned( - node, self.exported_program.graph_module - ): + if not _submodules_fully_partitioned(node): self.reporter.report_reject( node, "Submodule was not fully partitioned." ) diff --git a/backends/arm/operators/op_cond_if.py b/backends/arm/operators/op_cond_if.py index 05d38e2a1f0..513100c2b15 100644 --- a/backends/arm/operators/op_cond_if.py +++ b/backends/arm/operators/op_cond_if.py @@ -17,7 +17,11 @@ validate_num_inputs, validate_valid_dtype, ) -from executorch.backends.arm.tosa.mapping import TosaArg # type: ignore +from executorch.backends.arm.tosa.mapping import ( # type: ignore + TOSA_CONTROL_FLOW_REGION_NAME_META, + TOSA_TENSOR_NAME_META, + TosaArg, +) from torch.fx import Node @@ -38,7 +42,12 @@ def define_node( validate_cf_extension(self.target, self.tosa_spec) attr = ts.TosaSerializerAttribute() - if_graph, else_graph = (cast(Node, arg).target for arg in node.args[1:3]) + if_graph, else_graph = ( + cast(Node, arg).meta.get( + TOSA_CONTROL_FLOW_REGION_NAME_META, str(cast(Node, arg).target) + ) + for arg in node.args[1:3] + ) attr.CondIfAttribute(if_graph, else_graph) self._serialize_operator( @@ -47,7 +56,11 @@ def define_node( ts.Op.COND_IF, [ inputs[0].name, - *(subgraph_input.name for subgraph_input in inputs[-1].special), + *( + subgraph_input.name + + subgraph_input.meta.get(TOSA_TENSOR_NAME_META, "") + for subgraph_input in inputs[-1].special + ), ], output.multiple_output_names, attr, 
diff --git a/backends/arm/operators/op_while.py b/backends/arm/operators/op_while.py index 2b6314d3454..58501dd3ba0 100644 --- a/backends/arm/operators/op_while.py +++ b/backends/arm/operators/op_while.py @@ -15,8 +15,14 @@ validate_cf_extension, validate_num_inputs, ) -from executorch.backends.arm.tosa.mapping import map_dtype, TosaArg +from executorch.backends.arm.tosa.mapping import ( + map_dtype, + TOSA_CONTROL_FLOW_REGION_NAME_META, + TOSA_TENSOR_NAME_META, + TosaArg, +) from executorch.backends.arm.tosa.utils import normalize_symint + from torch.fx import Node @@ -46,7 +52,12 @@ def define_node( ) attr = ts.TosaSerializerAttribute() - cond_graph, body_graph = (str(cast(Node, arg).target) for arg in node.args[:2]) + cond_graph, body_graph = ( + cast(Node, arg).meta.get( + TOSA_CONTROL_FLOW_REGION_NAME_META, str(cast(Node, arg).target) + ) + for arg in node.args[:2] + ) attr.WhileLoopAttribute(cond_graph, body_graph) input_names: list[str] = [] @@ -55,7 +66,9 @@ def define_node( raise ValueError( f"{self.target}: Unsupported carried input type {type(loop_input)}." 
) - input_names.append(loop_input.name) + input_names.append( + loop_input.name + loop_input.meta.get(TOSA_TENSOR_NAME_META, "") + ) num_inputs = len(input_names) num_outputs = len(output.multiple_output_names) diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py index f1dfb5f1323..3508410509c 100644 --- a/backends/arm/quantizer/arm_quantizer.py +++ b/backends/arm/quantizer/arm_quantizer.py @@ -40,6 +40,10 @@ from executorch.backends.cortex_m.quantizer.pattern_matcher import PatternMatcher from executorch.backends.cortex_m.quantizer_reporter import QuantizerReporter +from executorch.exir.graph_module import ( + _get_control_flow_submodules, + get_cond_while_submodules, +) from torch._ops import OpOverload @@ -52,10 +56,6 @@ from executorch.backends.arm.common.arm_compile_spec import ( ArmCompileSpec, ) # isort: skip -from executorch.backends.arm._passes.arm_pass_utils import ( - get_cond_while_submodules_nested, - is_submodule_node, -) from executorch.backends.arm.quantizer.arm_quantizer_utils import ( _get_int32_bias_qspec, @@ -107,6 +107,29 @@ logger = logging.getLogger(__name__) +def get_cond_while_submodules_ao( + graph_module: GraphModule, + apply_quantization: bool = False, +) -> list[tuple[str, GraphModule, Node]]: + """Return cond/while submodules for the current graph module. + + Quantization handles ``while_loop`` body functions natively in torchao, so + only the ``while_loop`` cond function is processed explicitly there. 
+ + """ + + if not apply_quantization: + return get_cond_while_submodules(graph_module) + + return _get_control_flow_submodules( + graph_module, + { + torch.ops.higher_order.cond: [1, 2], + torch.ops.higher_order.while_loop: [0], + }, + ) + + @functools.lru_cache def get_symmetric_quantization_config( is_per_channel: bool = True, @@ -810,42 +833,56 @@ def _quantize_with_submodules( prepare_fn = prepare_qat_pt2e if is_qat else prepare_pt2e prepared = prepare_fn(model, self) - # Prepare conditional submodules (e.g., if/while bodies) - # prepare only cond branches and while_loop cond_fn - for name, submodule, _ in get_cond_while_submodules_nested( - prepared, apply_quantization=True - ): - prepared.set_submodule(name, prepare_fn(submodule, self), strict=True) - for submodule_node in submodule.graph.nodes: - if is_submodule_node(submodule_node): - for nested_name, nested_sub, _ in get_cond_while_submodules_nested( - submodule, apply_quantization=True - ): - prepared.set_submodule( - nested_name, prepare_fn(nested_sub, self), strict=True - ) + + def _prepare_control_flow_submodules( + source_graph_module: GraphModule, prefix: str = "" + ) -> None: + for name, submodule, _ in get_cond_while_submodules_ao( + source_graph_module, apply_quantization=True + ): + qualified_name = f"{prefix}.{name}" if prefix else name + prepared.set_submodule( + qualified_name, prepare_fn(submodule, self), strict=True + ) + _prepare_control_flow_submodules(submodule, qualified_name) + + _prepare_control_flow_submodules(prepared) for inp in calibration_samples: prepared(*inp) - # Prepare conditional submodules (e.g., if/while bodies) - # convert only cond branches and while_loop cond_fn - for _, submodule, _ in get_cond_while_submodules_nested( - prepared, apply_quantization=True + def _convert_control_flow_submodule( + graph_module: GraphModule, + ) -> GraphModule: + converted_submodules: list[tuple[str, GraphModule]] = [] + for name, submodule, _ in get_cond_while_submodules_ao( + 
graph_module, apply_quantization=True + ): + converted_submodules.append( + (name, _convert_control_flow_submodule(submodule)) + ) + converted_graph_module = convert_pt2e( + graph_module, fold_quantize=fold_quantize + ) + for name, converted_submodule in converted_submodules: + converted_graph_module.set_submodule( + name, converted_submodule, strict=True + ) + return converted_graph_module + + converted_top_level_submodules: list[tuple[str, GraphModule]] = [] + for name, submodule, _ in list( + get_cond_while_submodules_ao(prepared, apply_quantization=True) ): - converted = convert_pt2e(submodule, fold_quantize=fold_quantize) - for submodule_node in submodule.graph.nodes: - if is_submodule_node(submodule_node): - for nested_name, nested_sub, _ in get_cond_while_submodules_nested( - submodule, apply_quantization=True - ): - converted.set_submodule( - nested_name, - convert_pt2e(nested_sub, fold_quantize=fold_quantize), - strict=True, - ) + converted_top_level_submodules.append( + (name, _convert_control_flow_submodule(submodule)) + ) + + converted = convert_pt2e(prepared, fold_quantize=fold_quantize) + for name, converted_submodule in converted_top_level_submodules: + converted.set_submodule(name, converted_submodule, strict=True) - return convert_pt2e(prepared, fold_quantize=fold_quantize) + return converted class _TOSAQuantizerV1(Quantizer): diff --git a/backends/arm/test/ops/test_cond.py b/backends/arm/test/ops/test_cond.py index 8c6d9ef329c..6f489f0ab01 100644 --- a/backends/arm/test/ops/test_cond.py +++ b/backends/arm/test/ops/test_cond.py @@ -250,8 +250,6 @@ def test_cond_tosa_INT(case: Callable[[], tuple[torch.nn.Module, tuple]]): example_inputs, aten_op, tosa_extensions=["cf"], - frobenius_threshold=0.8, - cosine_threshold=0.8, # MLETORCH-1808 ) _set_branch_calibration_samples(pipeline, module, example_inputs) # Make sure no cond ops are left after partitioning. 
diff --git a/backends/arm/tosa/backend.py b/backends/arm/tosa/backend.py index 6b864e284b1..b0cae15022d 100644 --- a/backends/arm/tosa/backend.py +++ b/backends/arm/tosa/backend.py @@ -23,9 +23,6 @@ import tosa_serializer as ts -from executorch.backends.arm._passes.arm_pass_utils import ( - get_cond_while_submodules_nested, -) from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec from executorch.backends.arm.common.debug import debug_fail, debug_tosa_dump from executorch.backends.arm.debug.schema import DebugHook @@ -35,9 +32,13 @@ process_placeholder, ) from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec -from executorch.backends.arm.tosa.mapping import TOSA_TENSOR_NAME_META +from executorch.backends.arm.tosa.mapping import ( + TOSA_CONTROL_FLOW_REGION_NAME_META, + TOSA_TENSOR_NAME_META, +) from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult from executorch.exir.backend.compile_spec_schema import CompileSpec +from executorch.exir.graph_module import get_cond_while_submodules from torch.export.exported_program import ExportedProgram from torch.fx import Graph, GraphModule, Node @@ -45,6 +46,15 @@ logger = logging.getLogger(__name__) +def _qualify_control_flow_region_name( + parent_region_name: str | None, child_region_name: str +) -> str: + """Return a globally unique TOSA region name for nested control flow.""" + if parent_region_name is None: + return child_region_name + return f"{parent_region_name}__{child_region_name}" + + def _annotate_external_ids(ep_graph: Graph) -> Dict[str, int]: """Assign deterministic output IDs to leaf outputs. @@ -325,6 +335,43 @@ def _preprocess_module( # noqa: C901 RuntimeError: If an FX node with an unsupported op kind is found. 
""" + + def _annotate_control_flow_region_names( + graph_module: GraphModule, parent_region_name: str | None + ) -> None: + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + + match node.target: + case torch.ops.higher_order.cond: + arg_indices = [1, 2] + case torch.ops.higher_order.while_loop: + arg_indices = [0, 1] + case _: + continue + + for arg_index in arg_indices: + submodule_node = node.args[arg_index] + if not isinstance(submodule_node, Node): + raise RuntimeError( + f"Expected control flow submodule arg {arg_index} to be a Node." + ) + if submodule_node.op != "get_attr": + raise RuntimeError( + f"Expected control flow submodule arg {arg_index} to be a get_attr node." + ) + if not isinstance(submodule_node.target, str): + raise RuntimeError( + "Expected control flow submodule target to be a string." + ) + + submodule_node.meta[TOSA_CONTROL_FLOW_REGION_NAME_META] = ( + _qualify_control_flow_region_name( + parent_region_name, submodule_node.target + ) + ) + tosa_spec = compile_spec.tosa_spec node_to_id_map = _annotate_external_ids(graph_module.graph) artifact_path = compile_spec._get_intermediate_path() @@ -348,6 +395,8 @@ def _preprocess_module( # noqa: C901 else: logger.debug("No re-sorting outputs (workaround) during TOSA lowering.") + _annotate_control_flow_region_names(graph_module, submodule_name) + if submodule_name is not None: tosa_graph.startRegion(submodule_name) tosa_graph.currRegion.addBasicBlock(submodule_name) @@ -396,7 +445,7 @@ def _preprocess_module( # noqa: C901 raise # Recursively preprocess controlflow submodules. 
- for name, submodule, control_flow_node in get_cond_while_submodules_nested( + for name, submodule, control_flow_node in get_cond_while_submodules( graph_module ): TOSABackend._regularize_submodule(submodule, control_flow_node) @@ -406,7 +455,7 @@ def _preprocess_module( # noqa: C901 compile_spec, tosa_graph, debug_hook, - submodule_name=name, + submodule_name=_qualify_control_flow_region_name(submodule_name, name), containing_graph_module=graph_module, ) diff --git a/backends/arm/tosa/mapping.py b/backends/arm/tosa/mapping.py index b37c41a070b..0e91120c3b8 100644 --- a/backends/arm/tosa/mapping.py +++ b/backends/arm/tosa/mapping.py @@ -17,6 +17,7 @@ import tosa_serializer as ts from executorch.backends.arm.tosa.specification import TosaSpecification +TOSA_CONTROL_FLOW_REGION_NAME_META = "tosa_control_flow_region_name" TOSA_TENSOR_NAME_META = "tosa_tensor_name" UNSUPPORTED_DTYPES = ( diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index bd900f4cc81..d93e212c314 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -21,10 +21,7 @@ from typing import Callable, cast, List, Optional, Sequence, Tuple import torch -from executorch.backends.arm._passes.arm_pass_utils import ( - get_cond_while_submodules_nested, - get_first_fake_tensor, -) +from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( calculate_multiples, ) @@ -43,6 +40,7 @@ ) from executorch.exir.backend.utils import tag_constant_data, WhyNoPartitionReporter from executorch.exir.dialects._ops import ops as exir_ops +from executorch.exir.graph_module import get_cond_while_submodules from torch.export.exported_program import ExportedProgram from torch.fx import GraphModule from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner, Partition @@ -400,7 +398,7 @@ def _tag_module( # noqa tags: set[str] = set() if tag_iterator is None: 
tag_iterator = count(0) - for _, submodule, _ in get_cond_while_submodules_nested(module): + for _, submodule, _ in get_cond_while_submodules(module): submodule_tags = self._tag_module( submodule, containing_program, reporter, tag_iterator ) From d83aa08ad3ea82902addd9736a6bbf311fa7fd26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?= <33344797+martinlsm@users.noreply.github.com> Date: Wed, 27 May 2026 13:07:30 +0200 Subject: [PATCH 035/103] Arm backend: Reuse identical CONST_SHAPE nodes (#19770) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Cache CONST_SHAPE nodes created by InsertConstShapesPass and reuse them when a later view/repeat needs the same shape. This removes duplicate shape constants. This improvement is model dependent. Models with few repeated literal shapes will not see any meaningful change, but some models can benefit from it notably. The table below shows the results of a local test lowering DeiT Tiny to TOSA-FP. The lowering time reduced in this run, likely because passes following InsertConstShapesPass had fewer nodes to iterate over. 
| Metric | Baseline | Optimized | Delta | | -------------- | -------- | --------- | ---------------- | | Total ops | 2106 | 1736 | -370 (-17.6%) | | CONST_SHAPE | 466 | 96 | -370 (-79.4%) | | TOSA size | 23.82 MB | 23.75 MB | -71.6 KB (-0.3%) | | Execution time | 118.7 s | 78.4 s | -40.3 s (-34.0%) | Signed-off-by: Martin Lindström --- backends/arm/_passes/insert_const_shapes.py | 22 ++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/backends/arm/_passes/insert_const_shapes.py b/backends/arm/_passes/insert_const_shapes.py index b03394379d9..059731857b4 100644 --- a/backends/arm/_passes/insert_const_shapes.py +++ b/backends/arm/_passes/insert_const_shapes.py @@ -26,6 +26,10 @@ class InsertConstShapesPass(ArmPass): exir_ops.edge.aten.repeat.default, } + def __init__(self) -> None: + super().__init__() + self._const_shape_cache: dict[tuple[int, ...], Any] = {} + @staticmethod def _is_shape_arg(arg: Any) -> bool: """Return True when `arg` looks like a literal shape list/tuple.""" @@ -46,13 +50,17 @@ def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False) # Insert a const node for the shape argument if op == exir_ops.edge.aten.view_copy.default: arg = meta.data["val"].shape - const_node = super().call_shape_operator( - exir_ops.backend.tosa.CONST_SHAPE.default, - (arg,), - {}, - meta, - True, - ) + shape = tuple(arg) + const_node = self._const_shape_cache.get(shape) + if const_node is None: + const_node = super().call_shape_operator( + exir_ops.backend.tosa.CONST_SHAPE.default, + (arg,), + {}, + meta, + True, + ) + self._const_shape_cache[shape] = const_node new_args.append(const_node) updated = True else: From 85dfa447a06990757de19b640a76e72d695ceb6a Mon Sep 17 00:00:00 2001 From: Martin Pavella Date: Wed, 27 May 2026 14:58:48 +0200 Subject: [PATCH 036/103] NXP backend: Add `mean.dim` support with new Neutron flow. (#19740) ### Summary Add `mean.dim` support with new Neutron flow. 
### Test plan Unit tests provided. cc @robert-kalmar @JakeStevens @digantdesai @rascani --- backends/nxp/backend/edge_helper.py | 2 +- .../max_pool2d_with_indices_converter.py | 4 +- .../ops_converters/mean_dim_converter.py | 113 ++++++--- .../node_converter/test_mean_dim_converter.py | 217 +++++++++++++++++- backends/nxp/tests/ops_aliases.py | 1 + 5 files changed, 297 insertions(+), 40 deletions(-) diff --git a/backends/nxp/backend/edge_helper.py b/backends/nxp/backend/edge_helper.py index 957b673bb6a..1ea86f589ac 100644 --- a/backends/nxp/backend/edge_helper.py +++ b/backends/nxp/backend/edge_helper.py @@ -318,7 +318,7 @@ def is_no_op_on_neutron(node: Node, parameters_mapping: dict[str, Parameter]) -> input_data = torch.rand(val.shape, dtype=val.dtype) * 10 - 5 args_with_random_data.append(input_data) - case list(): + case list() if any(isinstance(a, Node) for a in arg): # Lists of input nodes are not supported to keep the code simple. It is not crucial to support this # case as the affected operators are either not supported on Neutron, or are extremely unlikely to # be no-ops (e.g. GRU). One exception is `aten.cat`, which is explicitly supported above. diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/max_pool2d_with_indices_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/max_pool2d_with_indices_converter.py index 975aaf57625..b7e761c45e6 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/max_pool2d_with_indices_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/max_pool2d_with_indices_converter.py @@ -152,9 +152,7 @@ def _get_node_args( :return: Tuple of (kernel_size, stride, padding, dilation, ceil_mode). """ kernel_size = node.args[1] - stride = node.args[ - 2 - ] # The default value is equal to the kernel_size, so it is never empty here. 
+ stride = try_get_arg(node, 2) or kernel_size padding = try_get_arg(node, 3) or (0, 0) dilation = try_get_arg(node, 4) or (1, 1) ceil_mode = try_get_arg(node, 5) or False diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py index c4b828df39f..4ba56a6b755 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/mean_dim_converter.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import torch + from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT from executorch.backends.nxp.backend.ir.converter.conversion.translator import ( @@ -11,6 +12,7 @@ ) from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.converter.node_converters.shared.reduce_utils import ( @@ -21,10 +23,40 @@ ) from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter class MeanDimConverter(NodeConverter): + + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + ) -> bool: + if custom_delegation_options.use_new_flow_neutron_c: + dim, keepdim = MeanDimConverter._get_attrs(node) + input_shape = node.args[0].meta["val"].shape + + is_alone_in_partition = cls.is_node_alone_in_partition( + node, partition_list, filter_fn=is_not_qdq_node + ) + + if ( + is_alone_in_partition + and keepdim + and all(input_shape[d] == 1 for d in dim) + ): + # The operator is a no-op, so the 
Neutron Converter will skip it. If it's the only node in the + # partition, the graph would end up empty. + return False + + return True + @staticmethod def _is_supported_on_target( node: Node, @@ -32,34 +64,49 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - keepdim = node.args[2] if len(node.args) >= 3 else False - rank = len(node.args[0].meta["val"].shape) - dim = [MeanDimConverter._to_pos_dim(d, rank) for d in node.args[1]] + if custom_delegation_options.use_new_flow_neutron_c: + # Requirements specified by the new Neutron flow documentation. + + if not NodeConverter.uses_quantization_type_for_io( + node, + supported_types=[torch.int8, torch.uint8], + input_indices=[0], + output_indices=[0], + ): + return False - if rank != 4 or not keepdim: - # neutron-converter/src/OperatorC/GlobalAvgPoolPlugin.cpp#74-77 - return False + return True - # The `mean.dim` gets converted to AveragePool by the NeutronConverter, so the channels must be a - # multiple of `num_macs`. - # neutron-converter/src/OperatorC/GlobalAvgPoolPlugin.cpp#59-85 - num_macs = neutron_target_spec.get_num_macs() - channels_dim = 1 if node.meta[NXP_NODE_FORMAT].is_channels_first() else -1 - if (node.meta["val"].shape[channels_dim] % num_macs) != 0: - return False + else: + # Requirements of the old Neutron flow. + rank = len(node.args[0].meta["val"].shape) + dim, keepdim = MeanDimConverter._get_attrs(node) + dim = [MeanDimConverter._to_pos_dim(d, rank) for d in dim] - # Neutron only supports reduction over the spatial dimensions H, W. - if node.meta[NXP_NODE_FORMAT].is_channels_first(): - # The input is NCHW. H and W are at indices 2 and 3. - if dim not in [[2, 3], [3, 2]]: + if rank != 4 or not keepdim: + # neutron-converter/src/OperatorC/GlobalAvgPoolPlugin.cpp#74-77 return False - else: - # The input is formatless. It can be considered as NHWC, as this is the way Neutron will look at - # the dimensions. 
So H and W are the middle dimensions. - if dim not in [[1, 2], [2, 1]]: + + # The `mean.dim` gets converted to AveragePool by the NeutronConverter, so the channels must be a + # multiple of `num_macs`. + # neutron-converter/src/OperatorC/GlobalAvgPoolPlugin.cpp#59-85 + num_macs = neutron_target_spec.get_num_macs() + channels_dim = 1 if node.meta[NXP_NODE_FORMAT].is_channels_first() else -1 + if (node.meta["val"].shape[channels_dim] % num_macs) != 0: return False - return True + # Neutron only supports reduction over the spatial dimensions H, W. + if node.meta[NXP_NODE_FORMAT].is_channels_first(): + # The input is NCHW. H and W are at indices 2 and 3. + if dim not in [[2, 3], [3, 2]]: + return False + else: + # The input is formatless. It can be considered as NHWC, as this is the way Neutron will look at + # the dimensions. So H and W are the middle dimensions. + if dim not in [[1, 2], [2, 1]]: + return False + + return True @staticmethod def _is_supported_in_IR( @@ -91,15 +138,29 @@ def _normalize_and_to_channel_last_dim(dim: list[int], rank: int) -> list[int]: perm = create_channels_last_to_channels_first_permutation(rank, True) dim = [perm[d] for d in dim] + # noinspection PyTypeChecker return dim - # Mean Dim Node format: (Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) + @staticmethod + def _get_attrs(node: Node) -> tuple[list[int], bool]: + dim = node.args[1] + keepdim = node.args[2] if len(node.args) >= 3 else False + return dim, keepdim + def convert(self, node: Node): - """Convert 'mean.dim' operator to TFLite 'Mean'.""" + """Convert the 'mean.dim' operator to NeutronIR 'Mean'. + The ExecuTorch schema is: + mean.dim( + Tensor self, + int[1]? dim, + bool keepdim=False, + *, + ScalarType? 
dtype=None + ) -> Tensor + """ self.assert_convertible(node) - dim = node.args[1] - keepdim = node.args[2] if len(node.args) >= 3 else False + dim, keepdim = self._get_attrs(node) t_op = self._create_tflite_op_with_io_tensors(node) t_op.builtin_options = mean_options.Mean(keepdim) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py index 7c0a5e8ffcf..a265ca557c9 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_mean_dim_converter.py @@ -1,15 +1,18 @@ -# Copyright 2025 NXP +# Copyright 2025-2026 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. import numpy as np + +# noinspection PyUnusedImports import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -17,10 +20,21 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + AllCloseOutputComparator, +) from executorch.backends.nxp.tests.models import MeanDimConvModule, MeanDimLinearModule -from executorch.backends.nxp.tests.use_qat import * # noqa F403 -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + AddTensor, + ExecutorchDelegateCall, + GetItem, + MaxPool2DWithIndices, + MeanDim, +) from torch.export import ExportedProgram +from 
executorch.backends.nxp.tests.use_qat import * # noqa F403 @pytest.fixture(autouse=True) @@ -39,6 +53,12 @@ def forward(self, x): return torch.mean(x, dim=self.dim, keepdim=self.keepdim) +class MeanDimAddModule(MeanDimModule): + def forward(self, x): + x = super().forward(x) + return x + x + + @pytest.mark.parametrize( "input_shape, dim", [ @@ -60,7 +80,7 @@ def test_mean_dim_conv_quant_conversion( model, input_shape, use_qat=use_qat, use_neutron_for_format_conversion=False ).exported_program() # Make sure the `mean.dim` was delegated. - assert not graph_contains_any_of_ops(ep.graph, [exir_ops.edge.aten.mean.dim]) + assert not graph_contains_any_of_ops(ep.graph, [MeanDim]) assert any("lowered_module" in n.name for n in ep.graph.nodes) # Capture generated model @@ -109,7 +129,7 @@ def test_mean_dim_linear_unsupported_quant_conversion( nodes = list(edge_program.graph.nodes) # Last 2 dimensions are not used or keepdim is False, cannot be converted to MeanDim, node is not delegated - assert nodes[6].target.__name__ == "aten.mean.dim" + assert nodes[6].target == MeanDim # Capture generated model tflite_flatbuffers_model, io_formats = converter_spy.spy_return @@ -157,7 +177,7 @@ def test_mean_dim_conv_unsupported_quant_conversion( nodes = list(edge_program.graph.nodes) # Last 2 dimensions are not used or keepdim is False, cannot be converted to MeanDim, node is not delegated - assert nodes[6].target.__name__ == "aten.mean.dim" + assert nodes[6].target == MeanDim # Capture generated model tflite_flatbuffers_model, io_formats = converter_spy.spy_return @@ -197,7 +217,7 @@ def test_mean_dim__formatless__supported( ).exported_program() # Make sure the `mean.dim` was delegated. 
- assert not graph_contains_any_of_ops(ep.graph, [exir_ops.edge.aten.mean.dim]) + assert not graph_contains_any_of_ops(ep.graph, [MeanDim]) assert any("lowered_module" in n.name for n in ep.graph.nodes) # Capture generated model @@ -230,7 +250,7 @@ def test_mean_dim__formatless__unsupported(input_shape, dim, use_qat, keepdim=Tr ).exported_program() # Make sure the `mean.dim` was NOT delegated. - assert graph_contains_any_of_ops(ep.graph, [exir_ops.edge.aten.mean.dim]) + assert graph_contains_any_of_ops(ep.graph, [MeanDim]) assert not any("lowered_module" in n.name for n in ep.graph.nodes) @@ -252,7 +272,7 @@ def test_mean_dim__formatless__unsupported_channels( ).exported_program() # Make sure the `mean.dim` was NOT delegated. - assert graph_contains_any_of_ops(ep.graph, [exir_ops.edge.aten.mean.dim]) + assert graph_contains_any_of_ops(ep.graph, [MeanDim]) assert not any("lowered_module" in n.name for n in ep.graph.nodes) @@ -277,4 +297,181 @@ def test_mean_dim__channels_first__unsupported_channels( ).exported_program() # Make sure the `mean.dim` was NOT delegated. - assert graph_contains_any_of_ops(ep.graph, [exir_ops.edge.aten.mean.dim]) + assert graph_contains_any_of_ops(ep.graph, [MeanDim]) + + +class MaxPoolMeanDimModule(torch.nn.Module): + def __init__(self, dim, keepdim): + super().__init__() + self.dim, self.keepdim = dim, keepdim + + def forward(self, x): + x = torch.max_pool2d( + x, kernel_size=1 + ) # NoOp, but it enforces the channels first format. 
+ return torch.mean(x, dim=self.dim, keepdim=self.keepdim) + + +class TestMeanDimNewNeutronFlow: + + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, + model, + input_shape, + mocker, + use_qat=False, + atol=None, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {MeanDim: 1} + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + kwargs = {"atol": atol} if atol is not None else {} + output_comparator = AllCloseOutputComparator(**kwargs) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + output_comparator, + use_qat=use_qat, + use_new_flow_neutron_c=True, # Use the new flow. + ) + + # noinspection PyMethodMayBeStatic + def assert_not_delegated(self, model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + # Make sure the `mean` was NOT delegated. 
+ assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [MeanDim]) + + @pytest.fixture(params=[True, False], ids=lambda keep_dim: f"keep_dim = {keep_dim}") + def keep_dim(self, request): + return request.param + + def test__basic_nsys_inference__qat(self, mocker, use_qat, keep_dim): + input_shape = (23,) + model = MeanDimModule(0, keep_dim) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) + + @pytest.mark.parametrize( + "input_shape, dim", + [ + pytest.param((5,), 0, id="1D, dim = 0."), + pytest.param((4, 2), 0, id="2D, dim = 0."), + pytest.param((4, 2), -1, id="2D, dim = -1."), + pytest.param((3, 1, 4), 2, id="3D, dim = 2."), + pytest.param((1, 3, 3, 7), 3, id="4D, dim = 3."), + pytest.param((3, 1, 4, 1, 5), -1, id="5D, dim = -1."), + pytest.param((3, 1, 4, 1, 5), 0, id="5D, dim = 0."), + ], + ) + def test__single_dims(self, mocker, input_shape, dim, keep_dim): + model = MeanDimModule(dim, keep_dim) + # Relatively large error, but it is actually equal to the output scale, so it is a single bit error. + # TODO Replace with quantized dataset testing and `atol = 1`. + atol = 0.014 + self.assert_delegated(model, input_shape, mocker, atol=atol) + + @pytest.mark.parametrize( + "input_shape, dim", + [ + pytest.param((4, 2), (-2,), id="2D, dim = (-2,)."), + pytest.param((2, 3, 4), (0, 2), id="3D, dim = (0, 2,)."), + pytest.param((1, 3, 3, 7), (2, -3), id="4D, dim = (2, -3)."), + pytest.param((3, 1, 4, 1, 5), (3, -5, -4), id="5D, dim = (3, -5 ,-4)."), + ], + ) + def test__tuple_dims(self, mocker, input_shape, dim, keep_dim): + model = MeanDimModule(dim, keep_dim) + # Relatively large error, but it is actually equal to the output scale, so it is a single bit error. + # TODO Replace with quantized dataset testing and `atol = 1`. 
+ atol = 0.015 + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__compute_error(self, mocker, keep_dim): + input_shape, dim = (1, 3, 3, 7), -2 + model = MeanDimModule(dim, keep_dim) + + # Neutron produces an incorrect result in this case (maximum absolute error ~= 0.0607 (more than 2 * scale)). + # This test detects the failure to alert us once the bug is fixed. It should be fixed in Neutron 3.1.2. + with pytest.raises(AssertionError): + self.assert_delegated(model, input_shape, mocker, atol=0.06) + + @pytest.mark.parametrize( + "input_shape, dim", + [ + pytest.param((3, 1, 4), 1, id="3D, dim = 1."), + pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."), + ], + ) + def test__noop__only_node__not_delegated(self, input_shape, dim): + keep_dim = True # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op. + model = MeanDimModule(dim, keep_dim) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, dim", + [ + pytest.param((3, 1, 4), 1, id="3D, dim = 1."), + pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."), + ], + ) + def test__noop__not_only_node__delegated(self, mocker, input_shape, dim): + keep_dim = True # Reduction over a dimension of size `1` with `keep_dim=True` is a no-op. + model = MeanDimAddModule(dim, keep_dim) + self.assert_delegated( + model, + input_shape, + mocker, + expected_delegated_ops={MeanDim: 1, AddTensor: 1}, + ) + + @pytest.mark.parametrize( + "input_shape, dim", + [ + pytest.param((3, 1, 4), 1, id="3D, dim = 1."), + pytest.param((3, 1, 4, 1, 5), -2, id="5D, dim = -2."), + ], + ) + def test__no_reduction__keepdim_false__delegated(self, mocker, input_shape, dim): + # These cases reduce over a dimension of size 1. + # When `keep_dim=True` the node is a noop, and it's not delegated (see `test__noop__only_node__not_delegated`), + # but with `keep_dim=False` it changes the shape so it's not a noop and is therefore delegated successfully. 
+ keep_dim = False + model = MeanDimModule(dim, keep_dim) + self.assert_delegated(model, input_shape, mocker) + + @pytest.mark.parametrize( + "input_shape, dim", + [((1, 7, 3, 3), 1)], + ids=lambda val: f"shape={val}" if isinstance(val, tuple) else f"dim={val}", + ) + def test__channels_first(self, mocker, input_shape, dim, keep_dim): + # Just 1 test case to verify correct handling of the `dim`. + # Most cases fall into the single bit error case, and since this test uses 2 operators, the error accumulates + # and the final error is larger. We cannot with 100% certainty say that the error is only caused by the single + # bit errors and not related to the format. That's why only this 1 case with no errors is used. + model = MaxPoolMeanDimModule(dim, keep_dim) + self.assert_delegated( + model, + input_shape, + mocker, + expected_delegated_ops={MaxPool2DWithIndices: 1, GetItem: 1, MeanDim: 1}, + ) diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py index 7f855dd63af..06eb9c84bd0 100644 --- a/backends/nxp/tests/ops_aliases.py +++ b/backends/nxp/tests/ops_aliases.py @@ -26,6 +26,7 @@ HardTanh_ = exir_ops.edge.aten.hardtanh_.default LeakyRelu = exir_ops.edge.aten.leaky_relu.default MaxPool2DWithIndices = exir_ops.edge.aten.max_pool2d_with_indices.default +MeanDim = exir_ops.edge.aten.mean.dim MulTensor = exir_ops.edge.aten.mul.Tensor QuantizePerChannel = exir_ops.edge.quantized_decomposed.quantize_per_channel.default QuantizePerTensor = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default From 4741f3ae35aaaa16a8ac750726ccf24f4850aa96 Mon Sep 17 00:00:00 2001 From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com> Date: Wed, 27 May 2026 15:18:59 +0200 Subject: [PATCH 037/103] Arm backend: Relocate not-equal decomposition after rank matching (#19769) Move DecomposeNotEqualPass to the post scalar-removal node transformation block. 
This removes its special placement between ReplaceScalarWithTensorByProfilePass and MatchArgRanksPass. Also match ranks for ne.Tensor before decomposition so scalar not-equal does not produce mismatched TOSA EQUAL operands. Signed-off-by: Sebastian Larsson --- backends/arm/_passes/arm_pass_manager.py | 4 +--- backends/arm/_passes/match_arg_ranks_pass.py | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py index 5a135696463..8a02f7393de 100644 --- a/backends/arm/_passes/arm_pass_manager.py +++ b/backends/arm/_passes/arm_pass_manager.py @@ -481,9 +481,6 @@ def _tosa_pipeline( ConvertFullLikeToFullPass(), MatchArgDtypePass(), UnsqueezeScalarPlaceholdersPass(exported_program), - # TODO: Move DecomposeNotEqualPass to before or after this block of - # passes. Ticket: MLETORCH-1540 - DecomposeNotEqualPass(), MatchArgRanksPass(exported_program), ] ) @@ -491,6 +488,7 @@ def _tosa_pipeline( # Node transformation passes (post scalar-removal) self.add_passes( [ + DecomposeNotEqualPass(), NormalizeIndexPutNoneIndicesPass(), NormalizeIndexPutBoolIndexTensorPass(), RewriteIndexPutPass(), diff --git a/backends/arm/_passes/match_arg_ranks_pass.py b/backends/arm/_passes/match_arg_ranks_pass.py index 905286e39b0..199eafe0cfb 100644 --- a/backends/arm/_passes/match_arg_ranks_pass.py +++ b/backends/arm/_passes/match_arg_ranks_pass.py @@ -57,6 +57,7 @@ def __init__(self, exported_program: ExportedProgram, *args, **kwargs) -> None: exir_ops.edge.aten.ge.Tensor, exir_ops.edge.aten.lt.Tensor, exir_ops.edge.aten.le.Tensor, + exir_ops.edge.aten.ne.Tensor, exir_ops.edge.aten.pow.Tensor_Tensor, exir_ops.edge.aten.remainder.Tensor, exir_ops.edge.aten.where.self, From 628246784dd2efb71ebdbae4157d87da442c39f4 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Wed, 27 May 2026 13:50:37 -0400 Subject: [PATCH 038/103] [executorch][qualcomm] Add op_fallback.py to model_sharding_py BUCK target 
Differential Revision: D106429294 Pull Request resolved: https://github.com/pytorch/executorch/pull/19809 --- extension/llm/custom_ops/targets.bzl | 1 + 1 file changed, 1 insertion(+) diff --git a/extension/llm/custom_ops/targets.bzl b/extension/llm/custom_ops/targets.bzl index 6746d7ab877..1d1feeda0c1 100644 --- a/extension/llm/custom_ops/targets.bzl +++ b/extension/llm/custom_ops/targets.bzl @@ -141,6 +141,7 @@ def define_common_targets(): name = "model_sharding_py", srcs = [ "model_sharding.py", + "op_fallback.py", ], visibility = ["PUBLIC"], deps = [ From 2f229597f743105a432b91e086ad219d0f29a728 Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Wed, 27 May 2026 11:05:20 -0700 Subject: [PATCH 039/103] Remove debug exit(0) blocking test_llama_stories_110m (#19814) Summary: Remove debug `print` and `exit(0)` statements accidentally left in `TestExampleLLMScript.test_llama_stories_110m` that cause the test to exit before executing any assertions. These lines were introduced in commit 508cbf07be38 (PR #19146) and prevent the `test-static-llama-qnn-linux (stories_110m)` CI job from running actual model validation, blocking viable/strict progression. 
Differential Revision: D106533426 --- backends/qualcomm/tests/test_qnn_delegate.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py index ee6678fa499..08f5c1f67de 100644 --- a/backends/qualcomm/tests/test_qnn_delegate.py +++ b/backends/qualcomm/tests/test_qnn_delegate.py @@ -7733,8 +7733,6 @@ def test_llama_stories_110m(self): if self.use_fp16: cmds.append("--use_fp16") self.add_default_cmds(cmds) - print(" ".join(cmds)) - exit(0) golden_start_with = "Once upon a time," p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL) with Listener((self.ip, self.port)) as listener: From 52892b2ecda1446e21c585d297c4a653376df080 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 May 2026 12:25:07 -0700 Subject: [PATCH 040/103] Convert ExecuTorchRuntime, ExecutorchRuntimeException, EValue from Java to Kotlin (#19788) Differential Revision: D106413930 Pull Request resolved: https://github.com/pytorch/executorch/pull/19788 --- extension/android/BUCK | 6 +- .../executorch/ModuleInstrumentationTest.kt | 2 +- .../java/org/pytorch/executorch/EValue.java | 253 ------------------ .../java/org/pytorch/executorch/EValue.kt | 209 +++++++++++++++ .../pytorch/executorch/ExecuTorchRuntime.java | 68 ----- .../pytorch/executorch/ExecuTorchRuntime.kt | 62 +++++ .../ExecutorchRuntimeException.java | 198 -------------- .../executorch/ExecutorchRuntimeException.kt | 133 +++++++++ 8 files changed, 408 insertions(+), 523 deletions(-) delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.kt delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.java create mode 100644 
extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.kt delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.kt diff --git a/extension/android/BUCK b/extension/android/BUCK index bae5579b2a8..1f1b611ff01 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -9,9 +9,9 @@ non_fbcode_target(_kind = fb_android_library, required_for_source_only_abi = True, srcs = [ "executorch_android/src/main/java/org/pytorch/executorch/DType.kt", - "executorch_android/src/main/java/org/pytorch/executorch/EValue.java", - "executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.java", - "executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.java", + "executorch_android/src/main/java/org/pytorch/executorch/EValue.kt", + "executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.kt", + "executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.kt", "executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt", "executorch_android/src/main/java/org/pytorch/executorch/Module.java", "executorch_android/src/main/java/org/pytorch/executorch/Tensor.java", diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt index b2f10537c2f..1888466ffa6 100644 --- a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/ModuleInstrumentationTest.kt @@ -94,7 +94,7 @@ class ModuleInstrumentationTest { } Assert.assertEquals( 
ExecutorchRuntimeException.INVALID_ARGUMENT, - exception.getErrorCode(), + exception.errorCode, ) } finally { module.destroy() diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java deleted file mode 100644 index e85efb291e7..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.java +++ /dev/null @@ -1,253 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch; - -import com.facebook.jni.annotations.DoNotStrip; -import java.nio.ByteBuffer; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; -import java.util.Locale; -import org.pytorch.executorch.annotations.Experimental; - -/** - * Java representation of an ExecuTorch value, which is implemented as tagged union that can be one - * of the supported types: https://pytorch.org/docs/stable/jit.html#types . - * - *

Calling {@code toX} methods for inappropriate types will throw {@link IllegalStateException}. - * - *

{@code EValue} objects are constructed with {@code EValue.from(value)}, {@code - * EValue.tupleFrom(value1, value2, ...)}, {@code EValue.listFrom(value1, value2, ...)}, or one of - * the {@code dict} methods, depending on the key type. - * - *

Data is retrieved from {@code EValue} objects with the {@code toX()} methods. Note that {@code - * str}-type EValues must be extracted with {@link #toStr()}, rather than {@link #toString()}. - * - *

{@code EValue} objects may retain references to objects passed into their constructors, and - * may return references to their internal state from {@code toX()}. - * - *

Warning: These APIs are experimental and subject to change without notice - */ -@Experimental -@DoNotStrip -public class EValue { - private static final int TYPE_CODE_NONE = 0; - - private static final int TYPE_CODE_TENSOR = 1; - private static final int TYPE_CODE_STRING = 2; - private static final int TYPE_CODE_DOUBLE = 3; - private static final int TYPE_CODE_INT = 4; - private static final int TYPE_CODE_BOOL = 5; - - private String[] TYPE_NAMES = { - "None", "Tensor", "String", "Double", "Int", "Bool", - }; - - @DoNotStrip private final int mTypeCode; - @DoNotStrip private Object mData; - - @DoNotStrip - private EValue(int typeCode) { - this.mTypeCode = typeCode; - } - - @DoNotStrip - public boolean isNone() { - return TYPE_CODE_NONE == this.mTypeCode; - } - - @DoNotStrip - public boolean isTensor() { - return TYPE_CODE_TENSOR == this.mTypeCode; - } - - @DoNotStrip - public boolean isBool() { - return TYPE_CODE_BOOL == this.mTypeCode; - } - - @DoNotStrip - public boolean isInt() { - return TYPE_CODE_INT == this.mTypeCode; - } - - @DoNotStrip - public boolean isDouble() { - return TYPE_CODE_DOUBLE == this.mTypeCode; - } - - @DoNotStrip - public boolean isString() { - return TYPE_CODE_STRING == this.mTypeCode; - } - - /** Creates a new {@code EValue} of type {@code Optional} that contains no value. */ - @DoNotStrip - public static EValue optionalNone() { - return new EValue(TYPE_CODE_NONE); - } - - /** Creates a new {@code EValue} of type {@code Tensor}. */ - @DoNotStrip - public static EValue from(Tensor tensor) { - final EValue iv = new EValue(TYPE_CODE_TENSOR); - iv.mData = tensor; - return iv; - } - - /** Creates a new {@code EValue} of type {@code bool}. */ - @DoNotStrip - public static EValue from(boolean value) { - final EValue iv = new EValue(TYPE_CODE_BOOL); - iv.mData = value; - return iv; - } - - /** Creates a new {@code EValue} of type {@code int}. 
*/ - @DoNotStrip - public static EValue from(long value) { - final EValue iv = new EValue(TYPE_CODE_INT); - iv.mData = value; - return iv; - } - - /** Creates a new {@code EValue} of type {@code double}. */ - @DoNotStrip - public static EValue from(double value) { - final EValue iv = new EValue(TYPE_CODE_DOUBLE); - iv.mData = value; - return iv; - } - - /** Creates a new {@code EValue} of type {@code str}. */ - @DoNotStrip - public static EValue from(String value) { - final EValue iv = new EValue(TYPE_CODE_STRING); - iv.mData = value; - return iv; - } - - @DoNotStrip - public Tensor toTensor() { - preconditionType(TYPE_CODE_TENSOR, mTypeCode); - return (Tensor) mData; - } - - @DoNotStrip - public boolean toBool() { - preconditionType(TYPE_CODE_BOOL, mTypeCode); - return (boolean) mData; - } - - @DoNotStrip - public long toInt() { - preconditionType(TYPE_CODE_INT, mTypeCode); - return (long) mData; - } - - @DoNotStrip - public double toDouble() { - preconditionType(TYPE_CODE_DOUBLE, mTypeCode); - return (double) mData; - } - - @DoNotStrip - public String toStr() { - preconditionType(TYPE_CODE_STRING, mTypeCode); - return (String) mData; - } - - private void preconditionType(int typeCodeExpected, int typeCode) { - if (typeCode != typeCodeExpected) { - throw new IllegalStateException( - String.format( - Locale.US, - "Expected EValue type %s, actual type %s", - getTypeName(typeCodeExpected), - getTypeName(typeCode))); - } - } - - private String getTypeName(int typeCode) { - return typeCode >= 0 && typeCode < TYPE_NAMES.length ? TYPE_NAMES[typeCode] : "Unknown"; - } - - /** - * Serializes an {@code EValue} into a byte array. Note: This method is experimental and subject - * to change without notice. - * - * @return The serialized byte array. 
- */ - public byte[] toByteArray() { - if (isNone()) { - return ByteBuffer.allocate(1).put((byte) TYPE_CODE_NONE).array(); - } else if (isTensor()) { - Tensor t = toTensor(); - byte[] tByteArray = t.toByteArray(); - return ByteBuffer.allocate(1 + tByteArray.length) - .put((byte) TYPE_CODE_TENSOR) - .put(tByteArray) - .array(); - } else if (isBool()) { - return ByteBuffer.allocate(2) - .put((byte) TYPE_CODE_BOOL) - .put((byte) (toBool() ? 1 : 0)) - .array(); - } else if (isInt()) { - return ByteBuffer.allocate(9).put((byte) TYPE_CODE_INT).putLong(toInt()).array(); - } else if (isDouble()) { - return ByteBuffer.allocate(9).put((byte) TYPE_CODE_DOUBLE).putDouble(toDouble()).array(); - } else if (isString()) { - byte[] strBytes = toStr().getBytes(StandardCharsets.UTF_8); - return ByteBuffer.allocate(1 + 4 + strBytes.length) - .put((byte) TYPE_CODE_STRING) - .putInt(strBytes.length) - .put(strBytes) - .array(); - } else { - throw new IllegalArgumentException("Unknown EValue type code: " + mTypeCode); - } - } - - /** - * Deserializes an {@code EValue} from a byte[]. Note: This method is experimental and subject to - * change without notice. - * - * @param bytes The byte array to deserialize from. - * @return The deserialized {@code EValue}. 
- */ - public static EValue fromByteArray(byte[] bytes) { - ByteBuffer buffer = ByteBuffer.wrap(bytes); - if (buffer == null) { - throw new IllegalArgumentException("buffer cannot be null"); - } - if (!buffer.hasRemaining()) { - throw new IllegalArgumentException("invalid buffer"); - } - int typeCode = buffer.get(); - switch (typeCode) { - case TYPE_CODE_NONE: - return new EValue(TYPE_CODE_NONE); - case TYPE_CODE_TENSOR: - byte[] bufferArray = buffer.array(); - return from(Tensor.fromByteArray(Arrays.copyOfRange(bufferArray, 1, bufferArray.length))); - case TYPE_CODE_STRING: - int strLen = buffer.getInt(); - byte[] strBytes = new byte[strLen]; - buffer.get(strBytes); - return from(new String(strBytes, StandardCharsets.UTF_8)); - case TYPE_CODE_DOUBLE: - return from(buffer.getDouble()); - case TYPE_CODE_INT: - return from(buffer.getLong()); - case TYPE_CODE_BOOL: - return from(buffer.get() != 0); - } - throw new IllegalArgumentException("invalid type code: " + typeCode); - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.kt new file mode 100644 index 00000000000..08c02d5c84a --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/EValue.kt @@ -0,0 +1,209 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +package org.pytorch.executorch + +import com.facebook.jni.annotations.DoNotStrip +import java.nio.ByteBuffer +import java.nio.charset.StandardCharsets +import java.util.Arrays +import java.util.Locale +import org.pytorch.executorch.annotations.Experimental + +/** + * Java representation of an ExecuTorch value, which is implemented as tagged union that can be one + * of the supported types: https://pytorch.org/docs/stable/jit.html#types . + * + * Calling `toX` methods for inappropriate types will throw [IllegalStateException]. + * + * `EValue` objects are constructed with `EValue.from(value)`, depending on the value type. + * + * Data is retrieved from `EValue` objects with the `toX()` methods. Note that `str`-type EValues + * must be extracted with [toStr], rather than [toString]. + * + * `EValue` objects may retain references to objects passed into their constructors, and may return + * references to their internal state from `toX()`. + * + * Warning: These APIs are experimental and subject to change without notice + */ +@Experimental +@DoNotStrip +class EValue +@DoNotStrip +private constructor( + // JNI reads this field by name via GetFieldID("mTypeCode") + @JvmField @DoNotStrip val mTypeCode: Int +) { + + // JNI accesses this field by name via GetFieldID("mData"), requires @JvmField for direct field + // access + @JvmField @DoNotStrip var mData: Any? 
= null + + private val typeNames = arrayOf("None", "Tensor", "String", "Double", "Int", "Bool") + + val isNone: Boolean + @DoNotStrip get() = TYPE_CODE_NONE == mTypeCode + + val isTensor: Boolean + @DoNotStrip get() = TYPE_CODE_TENSOR == mTypeCode + + val isBool: Boolean + @DoNotStrip get() = TYPE_CODE_BOOL == mTypeCode + + val isInt: Boolean + @DoNotStrip get() = TYPE_CODE_INT == mTypeCode + + val isDouble: Boolean + @DoNotStrip get() = TYPE_CODE_DOUBLE == mTypeCode + + val isString: Boolean + @DoNotStrip get() = TYPE_CODE_STRING == mTypeCode + + @DoNotStrip + fun toTensor(): Tensor { + preconditionType(TYPE_CODE_TENSOR, mTypeCode) + return mData as? Tensor ?: throw IllegalStateException("EValue data is null or not a Tensor") + } + + @DoNotStrip + fun toBool(): Boolean { + preconditionType(TYPE_CODE_BOOL, mTypeCode) + return mData as? Boolean ?: throw IllegalStateException("EValue data is null or not a Boolean") + } + + @DoNotStrip + fun toInt(): Long { + preconditionType(TYPE_CODE_INT, mTypeCode) + return mData as? Long ?: throw IllegalStateException("EValue data is null or not a Long") + } + + @DoNotStrip + fun toDouble(): Double { + preconditionType(TYPE_CODE_DOUBLE, mTypeCode) + return mData as? Double ?: throw IllegalStateException("EValue data is null or not a Double") + } + + @DoNotStrip + fun toStr(): String { + preconditionType(TYPE_CODE_STRING, mTypeCode) + return mData as? String ?: throw IllegalStateException("EValue data is null or not a String") + } + + private fun preconditionType(typeCodeExpected: Int, typeCode: Int) { + if (typeCode != typeCodeExpected) { + throw IllegalStateException( + String.format( + Locale.US, + "Expected EValue type %s, actual type %s", + getTypeName(typeCodeExpected), + getTypeName(typeCode), + ) + ) + } + } + + private fun getTypeName(typeCode: Int): String = + if (typeCode in typeNames.indices) typeNames[typeCode] else "Unknown" + + /** + * Serializes an `EValue` into a byte array. 
Note: This method is experimental and subject to + * change without notice. + */ + fun toByteArray(): ByteArray = + when { + isNone -> ByteBuffer.allocate(1).put(TYPE_CODE_NONE.toByte()).array() + isTensor -> { + val tByteArray = toTensor().toByteArray() + ByteBuffer.allocate(1 + tByteArray.size) + .put(TYPE_CODE_TENSOR.toByte()) + .put(tByteArray) + .array() + } + isBool -> + ByteBuffer.allocate(2) + .put(TYPE_CODE_BOOL.toByte()) + .put(if (toBool()) 1.toByte() else 0.toByte()) + .array() + isInt -> ByteBuffer.allocate(9).put(TYPE_CODE_INT.toByte()).putLong(toInt()).array() + isDouble -> + ByteBuffer.allocate(9).put(TYPE_CODE_DOUBLE.toByte()).putDouble(toDouble()).array() + isString -> { + val strBytes = toStr().toByteArray(StandardCharsets.UTF_8) + ByteBuffer.allocate(1 + 4 + strBytes.size) + .put(TYPE_CODE_STRING.toByte()) + .putInt(strBytes.size) + .put(strBytes) + .array() + } + else -> throw IllegalArgumentException("Unknown EValue type code: $mTypeCode") + } + + companion object { + private const val TYPE_CODE_NONE = 0 + private const val TYPE_CODE_TENSOR = 1 + private const val TYPE_CODE_STRING = 2 + private const val TYPE_CODE_DOUBLE = 3 + private const val TYPE_CODE_INT = 4 + private const val TYPE_CODE_BOOL = 5 + + /** Creates a new `EValue` of type `Optional` that contains no value. */ + @DoNotStrip @JvmStatic fun optionalNone(): EValue = EValue(TYPE_CODE_NONE) + + /** Creates a new `EValue` of type `Tensor`. */ + @DoNotStrip + @JvmStatic + fun from(tensor: Tensor): EValue = EValue(TYPE_CODE_TENSOR).also { it.mData = tensor } + + /** Creates a new `EValue` of type `bool`. */ + @DoNotStrip + @JvmStatic + fun from(value: Boolean): EValue = EValue(TYPE_CODE_BOOL).also { it.mData = value } + + /** Creates a new `EValue` of type `int`. */ + @DoNotStrip + @JvmStatic + fun from(value: Long): EValue = EValue(TYPE_CODE_INT).also { it.mData = value } + + /** Creates a new `EValue` of type `double`. 
*/ + @DoNotStrip + @JvmStatic + fun from(value: Double): EValue = EValue(TYPE_CODE_DOUBLE).also { it.mData = value } + + /** Creates a new `EValue` of type `str`. */ + @DoNotStrip + @JvmStatic + fun from(value: String): EValue = EValue(TYPE_CODE_STRING).also { it.mData = value } + + /** + * Deserializes an `EValue` from a byte[]. Note: This method is experimental and subject to + * change without notice. + */ + @JvmStatic + fun fromByteArray(bytes: ByteArray): EValue { + val buffer = ByteBuffer.wrap(bytes) + require(buffer.hasRemaining()) { "invalid buffer" } + return when (val typeCode = buffer.get().toInt()) { + TYPE_CODE_NONE -> EValue(TYPE_CODE_NONE) + TYPE_CODE_TENSOR -> { + val bufferArray = buffer.array() + from(Tensor.fromByteArray(Arrays.copyOfRange(bufferArray, 1, bufferArray.size))) + } + TYPE_CODE_STRING -> { + val strLen = buffer.getInt() + val strBytes = ByteArray(strLen) + buffer.get(strBytes) + from(String(strBytes, StandardCharsets.UTF_8)) + } + TYPE_CODE_DOUBLE -> from(buffer.getDouble()) + TYPE_CODE_INT -> from(buffer.getLong()) + TYPE_CODE_BOOL -> from(buffer.get().toInt() != 0) + else -> throw IllegalArgumentException("invalid type code: $typeCode") + } + } + } +} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.java deleted file mode 100644 index 6372da9a397..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.java +++ /dev/null @@ -1,68 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. 
- */ - -package org.pytorch.executorch; - -import com.facebook.jni.annotations.DoNotStrip; -import com.facebook.soloader.nativeloader.NativeLoader; -import com.facebook.soloader.nativeloader.SystemDelegate; -import java.io.File; - -/** Class for entire ExecuTorch Runtime related functions. */ -public class ExecuTorchRuntime { - - static { - if (!NativeLoader.isInitialized()) { - NativeLoader.init(new SystemDelegate()); - } - // Loads libexecutorch.so from jniLibs - NativeLoader.loadLibrary("executorch"); - } - - private static final ExecuTorchRuntime sInstance = new ExecuTorchRuntime(); - - private ExecuTorchRuntime() {} - - /** Get the runtime instance. */ - public static ExecuTorchRuntime getRuntime() { - return sInstance; - } - - /** - * Validates that the given path points to a readable file. - * - * @throws IllegalArgumentException if the path is null, does not exist, is not a file, or is not - * readable. - */ - public static void validateFilePath(String path, String description) { - if (path == null) { - throw new IllegalArgumentException("Cannot load " + description + ": path is null"); - } - File file = new File(path); - if (!file.exists()) { - throw new IllegalArgumentException( - "Cannot load " + description + ": path does not exist: " + path); - } - if (!file.isFile()) { - throw new IllegalArgumentException( - "Cannot load " + description + ": path is not a file: " + path); - } - if (!file.canRead()) { - throw new IllegalArgumentException( - "Cannot load " + description + ": path is not readable: " + path); - } - } - - /** Get all registered ops. */ - @DoNotStrip - public static native String[] getRegisteredOps(); - - /** Get all registered backends. 
*/ - @DoNotStrip - public static native String[] getRegisteredBackends(); -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.kt new file mode 100644 index 00000000000..52d846c5647 --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.kt @@ -0,0 +1,62 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch + +import com.facebook.jni.annotations.DoNotStrip +import com.facebook.soloader.nativeloader.NativeLoader +import com.facebook.soloader.nativeloader.SystemDelegate +import java.io.File + +/** Class for entire ExecuTorch Runtime related functions. */ +class ExecuTorchRuntime private constructor() { + + companion object { + init { + if (!NativeLoader.isInitialized()) { + NativeLoader.init(SystemDelegate()) + } + // Loads libexecutorch.so from jniLibs + NativeLoader.loadLibrary("executorch") + } + + private val sInstance = ExecuTorchRuntime() + + /** Get the runtime instance. */ + @JvmStatic fun getRuntime(): ExecuTorchRuntime = sInstance + + /** + * Validates that the given path points to a readable file. + * + * @throws IllegalArgumentException if the path is null, does not exist, is not a file, or is + * not readable. 
+ */ + @JvmStatic + fun validateFilePath(path: String?, description: String) { + if (path == null) { + throw IllegalArgumentException("Cannot load $description: path is null") + } + val file = File(path) + if (!file.exists()) { + throw IllegalArgumentException("Cannot load $description: path does not exist: $path") + } + if (!file.isFile) { + throw IllegalArgumentException("Cannot load $description: path is not a file: $path") + } + if (!file.canRead()) { + throw IllegalArgumentException("Cannot load $description: path is not readable: $path") + } + } + + /** Get all registered ops. */ + @DoNotStrip @JvmStatic external fun getRegisteredOps(): Array + + /** Get all registered backends. */ + @DoNotStrip @JvmStatic external fun getRegisteredBackends(): Array + } +} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.java deleted file mode 100644 index 6f9d654be66..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.java +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch; - -import com.facebook.jni.annotations.DoNotStrip; -import java.util.Collections; -import java.util.HashMap; -import java.util.Map; - -/** - * Base exception for all ExecuTorch runtime errors. Each instance carries an integer error code - * corresponding to the native {@code runtime/core/error.h} values, accessible via {@link - * #getErrorCode()}. 
- */ -public class ExecutorchRuntimeException extends RuntimeException { - // Error code constants - keep in sync with runtime/core/error.h - - // System errors - - /** Operation completed successfully. */ - public static final int OK = 0x00; - - /** An unexpected internal error occurred in the runtime. */ - public static final int INTERNAL = 0x01; - - /** The runtime or method is in an invalid state for the requested operation. */ - public static final int INVALID_STATE = 0x02; - - /** The method has finished execution and has no more work to do. */ - public static final int END_OF_METHOD = 0x03; - - /** A required resource has already been loaded. */ - public static final int ALREADY_LOADED = 0x04; - - // Logical errors - - /** The requested operation is not supported by this build or backend. */ - public static final int NOT_SUPPORTED = 0x10; - - /** The requested operation has not been implemented. */ - public static final int NOT_IMPLEMENTED = 0x11; - - /** One or more arguments passed to the operation are invalid. */ - public static final int INVALID_ARGUMENT = 0x12; - - /** A value or tensor has an unexpected type. */ - public static final int INVALID_TYPE = 0x13; - - /** A required operator kernel is not registered. */ - public static final int OPERATOR_MISSING = 0x14; - - /** The maximum number of registered kernels has been exceeded. */ - public static final int REGISTRATION_EXCEEDING_MAX_KERNELS = 0x15; - - /** A kernel with the same name is already registered. */ - public static final int REGISTRATION_ALREADY_REGISTERED = 0x16; - - // Resource errors - - /** A required resource (file, tensor, program) was not found. */ - public static final int NOT_FOUND = 0x20; - - /** A memory allocation failed. */ - public static final int MEMORY_ALLOCATION_FAILED = 0x21; - - /** Access to a resource was denied or failed. */ - public static final int ACCESS_FAILED = 0x22; - - /** The loaded program is malformed or incompatible. 
*/ - public static final int INVALID_PROGRAM = 0x23; - - /** External data referenced by the program is invalid or missing. */ - public static final int INVALID_EXTERNAL_DATA = 0x24; - - /** The system has run out of a required resource. */ - public static final int OUT_OF_RESOURCES = 0x25; - - // Delegate errors - - /** A delegate reported an incompatible model or configuration. */ - public static final int DELEGATE_INVALID_COMPATIBILITY = 0x30; - - /** A delegate failed to allocate required memory. */ - public static final int DELEGATE_MEMORY_ALLOCATION_FAILED = 0x31; - - /** A delegate received an invalid or stale handle. */ - public static final int DELEGATE_INVALID_HANDLE = 0x32; - - private static final Map ERROR_CODE_MESSAGES; - - static { - Map map = new HashMap<>(); - - // System errors - map.put(OK, "Operation successful"); - map.put(INTERNAL, "Internal error"); - map.put(INVALID_STATE, "Invalid state"); - map.put(END_OF_METHOD, "End of method reached"); - map.put(ALREADY_LOADED, "Already loaded"); - // Logical errors - map.put(NOT_SUPPORTED, "Operation not supported"); - map.put(NOT_IMPLEMENTED, "Operation not implemented"); - map.put(INVALID_ARGUMENT, "Invalid argument"); - map.put(INVALID_TYPE, "Invalid type"); - map.put(OPERATOR_MISSING, "Operator missing"); - map.put(REGISTRATION_EXCEEDING_MAX_KERNELS, "Exceeded max kernels"); - map.put(REGISTRATION_ALREADY_REGISTERED, "Kernel already registered"); - // Resource errors - map.put(NOT_FOUND, "Resource not found"); - map.put(MEMORY_ALLOCATION_FAILED, "Memory allocation failed"); - map.put(ACCESS_FAILED, "Access failed"); - map.put(INVALID_PROGRAM, "Invalid program"); - map.put(INVALID_EXTERNAL_DATA, "Invalid external data"); - map.put(OUT_OF_RESOURCES, "Out of resources"); - // Delegate errors - map.put(DELEGATE_INVALID_COMPATIBILITY, "Delegate invalid compatibility"); - map.put(DELEGATE_MEMORY_ALLOCATION_FAILED, "Delegate memory allocation failed"); - map.put(DELEGATE_INVALID_HANDLE, "Delegate invalid 
handle"); - ERROR_CODE_MESSAGES = Collections.unmodifiableMap(map); - } - - static class ErrorHelper { - static String formatMessage(int errorCode, String details) { - String baseMessage = ERROR_CODE_MESSAGES.get(errorCode); - if (baseMessage == null) { - baseMessage = "Unknown error code 0x" + Integer.toHexString(errorCode); - } - - String safeDetails = details != null ? details : "No details provided"; - return String.format( - "[ExecuTorch Error 0x%s] %s: %s", - Integer.toHexString(errorCode), baseMessage, safeDetails); - } - - static String getDetailedErrorLogs() { - StringBuilder sb = new StringBuilder(); - try { - String[] logEntries = Module.readLogBufferStatic(); // JNI call - if (logEntries != null && logEntries.length > 0) { - sb.append("\nDetailed logs:\n"); - for (String entry : logEntries) { - sb.append(entry).append("\n"); - } - } - } catch (Exception e) { - sb.append("Failed to retrieve detailed logs: ").append(e.getMessage()); - } - return sb.toString(); - } - } - - private final int errorCode; - - @DoNotStrip - public ExecutorchRuntimeException(int errorCode, String details) { - super(ErrorHelper.formatMessage(errorCode, details)); - this.errorCode = errorCode; - } - - public ExecutorchRuntimeException(int errorCode, String details, Throwable cause) { - super(ErrorHelper.formatMessage(errorCode, details), cause); - this.errorCode = errorCode; - } - - /** Returns the numeric error code from {@code runtime/core/error.h}. */ - public int getErrorCode() { - return errorCode; - } - - /** Returns detailed log output captured from the native runtime, if available. 
*/ - public String getDetailedError() { - return ErrorHelper.getDetailedErrorLogs(); - } - - @DoNotStrip - public static class ExecutorchInvalidArgumentException extends ExecutorchRuntimeException { - @DoNotStrip - public ExecutorchInvalidArgumentException(String details) { - super(INVALID_ARGUMENT, details); - } - } - - @DoNotStrip - public static RuntimeException makeExecutorchException(int errorCode, String details) { - switch (errorCode) { - case INVALID_ARGUMENT: - return new ExecutorchInvalidArgumentException(details); - default: - return new ExecutorchRuntimeException(errorCode, details); - } - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.kt new file mode 100644 index 00000000000..5ec3dd255d8 --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.kt @@ -0,0 +1,133 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch + +import com.facebook.jni.annotations.DoNotStrip + +/** + * Base exception for all ExecuTorch runtime errors. Each instance carries an integer error code + * corresponding to the native `runtime/core/error.h` values, accessible via [getErrorCode]. + */ +open class ExecutorchRuntimeException +@DoNotStrip +constructor( + val errorCode: Int, + details: String?, +) : RuntimeException(ErrorHelper.formatMessage(errorCode, details)) { + + constructor( + errorCode: Int, + details: String?, + cause: Throwable?, + ) : this(errorCode, details) { + if (cause != null) initCause(cause) + } + + /** Returns detailed log output captured from the native runtime, if available. 
*/ + fun getDetailedError(): String = ErrorHelper.getDetailedErrorLogs() + + @DoNotStrip + class ExecutorchInvalidArgumentException @DoNotStrip constructor(details: String?) : + ExecutorchRuntimeException(INVALID_ARGUMENT, details) + + private object ErrorHelper { + private val ERROR_CODE_MESSAGES: Map = + mapOf( + // System errors + OK to "Operation successful", + INTERNAL to "Internal error", + INVALID_STATE to "Invalid state", + END_OF_METHOD to "End of method reached", + ALREADY_LOADED to "Already loaded", + // Logical errors + NOT_SUPPORTED to "Operation not supported", + NOT_IMPLEMENTED to "Operation not implemented", + INVALID_ARGUMENT to "Invalid argument", + INVALID_TYPE to "Invalid type", + OPERATOR_MISSING to "Operator missing", + REGISTRATION_EXCEEDING_MAX_KERNELS to "Exceeded max kernels", + REGISTRATION_ALREADY_REGISTERED to "Kernel already registered", + // Resource errors + NOT_FOUND to "Resource not found", + MEMORY_ALLOCATION_FAILED to "Memory allocation failed", + ACCESS_FAILED to "Access failed", + INVALID_PROGRAM to "Invalid program", + INVALID_EXTERNAL_DATA to "Invalid external data", + OUT_OF_RESOURCES to "Out of resources", + // Delegate errors + DELEGATE_INVALID_COMPATIBILITY to "Delegate invalid compatibility", + DELEGATE_MEMORY_ALLOCATION_FAILED to "Delegate memory allocation failed", + DELEGATE_INVALID_HANDLE to "Delegate invalid handle", + ) + + fun formatMessage(errorCode: Int, details: String?): String { + val baseMessage = + ERROR_CODE_MESSAGES[errorCode] ?: "Unknown error code 0x${Integer.toHexString(errorCode)}" + val safeDetails = details ?: "No details provided" + return "[ExecuTorch Error 0x${Integer.toHexString(errorCode)}] $baseMessage: $safeDetails" + } + + fun getDetailedErrorLogs(): String { + val sb = StringBuilder() + try { + val logEntries = Module.readLogBufferStatic() // JNI call + if (logEntries != null && logEntries.isNotEmpty()) { + sb.append("\nDetailed logs:\n") + for (entry in logEntries) { + 
sb.append(entry).append("\n") + } + } + } catch (e: Exception) { + sb.append("Failed to retrieve detailed logs: ").append(e.message) + } + return sb.toString() + } + } + + companion object { + // Error code constants - keep in sync with runtime/core/error.h + + // System errors + const val OK = 0x00 + const val INTERNAL = 0x01 + const val INVALID_STATE = 0x02 + const val END_OF_METHOD = 0x03 + const val ALREADY_LOADED = 0x04 + + // Logical errors + const val NOT_SUPPORTED = 0x10 + const val NOT_IMPLEMENTED = 0x11 + const val INVALID_ARGUMENT = 0x12 + const val INVALID_TYPE = 0x13 + const val OPERATOR_MISSING = 0x14 + const val REGISTRATION_EXCEEDING_MAX_KERNELS = 0x15 + const val REGISTRATION_ALREADY_REGISTERED = 0x16 + + // Resource errors + const val NOT_FOUND = 0x20 + const val MEMORY_ALLOCATION_FAILED = 0x21 + const val ACCESS_FAILED = 0x22 + const val INVALID_PROGRAM = 0x23 + const val INVALID_EXTERNAL_DATA = 0x24 + const val OUT_OF_RESOURCES = 0x25 + + // Delegate errors + const val DELEGATE_INVALID_COMPATIBILITY = 0x30 + const val DELEGATE_MEMORY_ALLOCATION_FAILED = 0x31 + const val DELEGATE_INVALID_HANDLE = 0x32 + + @DoNotStrip + @JvmStatic + fun makeExecutorchException(errorCode: Int, details: String?): RuntimeException = + when (errorCode) { + INVALID_ARGUMENT -> ExecutorchInvalidArgumentException(details) + else -> ExecutorchRuntimeException(errorCode, details) + } + } +} From 8be91e0b3c80b6e1338c36711124d065d667900e Mon Sep 17 00:00:00 2001 From: Digant Desai Date: Wed, 27 May 2026 12:27:41 -0700 Subject: [PATCH 041/103] WebGPU: add memory aliasing for intermediate tensor buffers (#19305) USE ETVK's mem_obj_id for the WebGPU runtime to implement memory aliasing --- backends/webgpu/runtime/WebGPUGraph.cpp | 315 ++++++++++++++++---- backends/webgpu/runtime/WebGPUGraph.h | 46 +++ backends/webgpu/test/ops/add/test_add.py | 15 + backends/webgpu/test/test_build_webgpu.sh | 7 +- backends/webgpu/test/test_webgpu_native.cpp | 65 ++++ 5 files changed, 384 
insertions(+), 64 deletions(-) diff --git a/backends/webgpu/runtime/WebGPUGraph.cpp b/backends/webgpu/runtime/WebGPUGraph.cpp index f0e4c7959c0..91404fb164f 100644 --- a/backends/webgpu/runtime/WebGPUGraph.cpp +++ b/backends/webgpu/runtime/WebGPUGraph.cpp @@ -50,9 +50,15 @@ size_t vk_datatype_size(vkgraph::VkDataType dtype) { WebGPUGraph::WebGPUGraph() = default; WebGPUGraph::~WebGPUGraph() { - for (auto& t : tensors_) { - if (t.buffer) { - wgpuBufferRelease(t.buffer); + for (size_t i = 0; i < tensors_.size(); i++) { + if (tensors_[i].buffer && + (i >= tensor_mem_obj_ids_.size() || tensor_mem_obj_ids_[i] < 0)) { + wgpuBufferRelease(tensors_[i].buffer); + } + } + for (auto& buf : shared_buffers_) { + if (buf) { + wgpuBufferRelease(buf); } } for (auto& buf : output_staging_buffers_) { @@ -68,6 +74,21 @@ WebGPUGraph::~WebGPUGraph() { wgpuBindGroupRelease(d.bind_group); } } + for (auto& [_, shader] : shader_cache_) { + if (shader) { + wgpuShaderModuleRelease(shader); + } + } + for (auto& [_, pipeline] : pipeline_cache_) { + if (pipeline) { + wgpuComputePipelineRelease(pipeline); + } + } + for (auto& [_, bgl] : bgl_cache_) { + if (bgl) { + wgpuBindGroupLayoutRelease(bgl); + } + } } void WebGPUGraph::build( @@ -94,6 +115,7 @@ void WebGPUGraph::build( const int num_vals = values ? values->size() : 0; value_types_.resize(num_vals, ValueType::Null); tensors_.resize(num_vals); + tensor_mem_obj_ids_.resize(num_vals, -1); ints_.resize(num_vals, 0); doubles_.resize(num_vals, 0.0); bools_.resize(num_vals, false); @@ -121,27 +143,40 @@ void WebGPUGraph::build( } tensor.nbytes = numel * vk_datatype_size(vk_tensor->datatype()); - // Create GPU buffer - WGPUBufferDescriptor buf_desc = {}; - buf_desc.size = tensor.nbytes > 0 ? 
tensor.nbytes : 4; - buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | - WGPUBufferUsage_CopySrc; - buf_desc.mappedAtCreation = false; - tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); - - // Upload constant data if this tensor has a constant_id int constant_id = vk_tensor->constant_id(); - if (constant_id >= 0 && constant_data) { - const auto* constants = graph->constants(); - if (constants && constant_id < static_cast(constants->size())) { - const auto* vk_bytes = constants->Get(constant_id); - // Only upload from embedded bytes (not named data map) - if (vk_bytes->offset() != UINT64_MAX) { - const uint8_t* src = constant_data + vk_bytes->offset(); - wgpuQueueWriteBuffer( - queue_, tensor.buffer, 0, src, tensor.nbytes); + int mem_obj_id = vk_tensor->mem_obj_id(); + + // Constants always get dedicated buffers regardless of mem_obj_id + if (constant_id >= 0 || mem_obj_id < 0) { + tensor_mem_obj_ids_[i] = -1; + WGPUBufferDescriptor buf_desc = {}; + buf_desc.size = std::max(tensor.nbytes, size_t(4)); + buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + tensor.buffer = wgpuDeviceCreateBuffer(device_, &buf_desc); + + if (constant_id >= 0 && constant_data && tensor.nbytes > 0) { + const auto* constants = graph->constants(); + if (constants && + constant_id < static_cast(constants->size())) { + const auto* vk_bytes = constants->Get(constant_id); + if (vk_bytes->offset() != UINT64_MAX) { + const uint8_t* src = constant_data + vk_bytes->offset(); + wgpuQueueWriteBuffer( + queue_, tensor.buffer, 0, src, tensor.nbytes); + } } } + } else { + // Shared buffer: track required size, defer allocation to pass 2 + tensor_mem_obj_ids_[i] = mem_obj_id; + size_t id = static_cast(mem_obj_id); + if (id >= shared_buffer_sizes_.size()) { + shared_buffer_sizes_.resize(id + 1, 0); + } + shared_buffer_sizes_[id] = + std::max(shared_buffer_sizes_[id], tensor.nbytes); } break; } 
@@ -166,6 +201,23 @@ void WebGPUGraph::build( } } + // Allocate shared buffers and assign to tensors + shared_buffers_.resize(shared_buffer_sizes_.size(), nullptr); + for (size_t id = 0; id < shared_buffer_sizes_.size(); id++) { + WGPUBufferDescriptor buf_desc = {}; + buf_desc.size = std::max(shared_buffer_sizes_[id], size_t(4)); + buf_desc.usage = WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc; + buf_desc.mappedAtCreation = false; + shared_buffers_[id] = wgpuDeviceCreateBuffer(device_, &buf_desc); + } + for (int i = 0; i < num_vals; i++) { + int mid = tensor_mem_obj_ids_[i]; + if (mid >= 0) { + tensors_[i].buffer = shared_buffers_[mid]; + } + } + // Phase 2: Record input and output IDs const auto* fb_input_ids = graph->input_ids(); if (fb_input_ids) { @@ -181,7 +233,7 @@ void WebGPUGraph::build( // Create staging buffer for output readback WGPUBufferDescriptor staging_desc = {}; - staging_desc.size = tensors_[oid].nbytes > 0 ? tensors_[oid].nbytes : 4; + staging_desc.size = std::max(tensors_[oid].nbytes, size_t(4)); staging_desc.usage = WGPUBufferUsage_MapRead | WGPUBufferUsage_CopyDst; staging_desc.mappedAtCreation = false; output_staging_buffers_.push_back( @@ -189,6 +241,14 @@ void WebGPUGraph::build( } } + for (size_t i = 0; i < output_ids_.size(); i++) { + int oid = output_ids_[i]; + output_copies_.push_back( + {tensors_[oid].buffer, + output_staging_buffers_[i], + tensors_[oid].nbytes}); + } + // Phase 3: Build operator dispatch chain const auto* chain = graph->chain(); if (chain) { @@ -213,9 +273,70 @@ void WebGPUGraph::build( } } +WGPUShaderModule WebGPUGraph::get_or_create_shader( + const std::string& key, + const char* wgsl_source) { + auto it = shader_cache_.find(key); + if (it != shader_cache_.end()) { + return it->second; + } + + WGPUShaderSourceWGSL wgsl_desc = {}; + wgsl_desc.chain.sType = WGPUSType_ShaderSourceWGSL; + wgsl_desc.code = {wgsl_source, WGPU_STRLEN}; + + WGPUShaderModuleDescriptor shader_desc = {}; + 
shader_desc.nextInChain = &wgsl_desc.chain; + WGPUShaderModule shader = wgpuDeviceCreateShaderModule(device_, &shader_desc); + + shader_cache_[key] = shader; + return shader; +} + +WGPUComputePipeline WebGPUGraph::get_or_create_pipeline( + const std::string& key, + WGPUShaderModule shader, + WGPUPipelineLayout layout) { + auto it = pipeline_cache_.find(key); + if (it != pipeline_cache_.end()) { + return it->second; + } + + WGPUComputePipelineDescriptor pipeline_desc = {}; + pipeline_desc.layout = layout; + pipeline_desc.compute.module = shader; + pipeline_desc.compute.entryPoint = {"main", WGPU_STRLEN}; + WGPUComputePipeline pipeline = + wgpuDeviceCreateComputePipeline(device_, &pipeline_desc); + + pipeline_cache_[key] = pipeline; + return pipeline; +} + +WGPUBindGroupLayout WebGPUGraph::get_or_create_bgl( + const std::string& key, + const WGPUBindGroupLayoutEntry* entries, + uint32_t count) { + auto it = bgl_cache_.find(key); + if (it != bgl_cache_.end()) { + return it->second; + } + + WGPUBindGroupLayoutDescriptor bgl_desc = {}; + bgl_desc.entryCount = count; + bgl_desc.entries = entries; + WGPUBindGroupLayout bgl = wgpuDeviceCreateBindGroupLayout(device_, &bgl_desc); + + bgl_cache_[key] = bgl; + return bgl; +} + void WebGPUGraph::copy_inputs( const std::vector>& inputs) { for (size_t i = 0; i < inputs.size() && i < input_ids_.size(); i++) { + if (inputs[i].second == 0) { + continue; + } int tid = input_ids_[i]; const auto& tensor = tensors_[tid]; wgpuQueueWriteBuffer( @@ -224,43 +345,89 @@ void WebGPUGraph::copy_inputs( } void WebGPUGraph::execute() { - WGPUCommandEncoderDescriptor enc_desc = {}; - WGPUCommandEncoder encoder = - wgpuDeviceCreateCommandEncoder(device_, &enc_desc); - - WGPUComputePassDescriptor pass_desc = {}; - WGPUComputePassEncoder pass = - wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); - - for (const auto& dispatch : dispatches_) { - wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); - wgpuComputePassEncoderSetBindGroup( - 
pass, 0, dispatch.bind_group, 0, nullptr); - wgpuComputePassEncoderDispatchWorkgroups( - pass, dispatch.workgroup_count_x, 1, 1); - } + const size_t n = dispatches_.size(); + const size_t chunk = execute_config_.chunk_size; + + if (chunk == 0 || n <= chunk) { + WGPUCommandEncoderDescriptor enc_desc = {}; + WGPUCommandEncoder encoder = + wgpuDeviceCreateCommandEncoder(device_, &enc_desc); + + WGPUComputePassDescriptor pass_desc = {}; + WGPUComputePassEncoder pass = + wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); + + for (const auto& dispatch : dispatches_) { + wgpuComputePassEncoderSetPipeline(pass, dispatch.pipeline); + wgpuComputePassEncoderSetBindGroup( + pass, 0, dispatch.bind_group, 0, nullptr); + wgpuComputePassEncoderDispatchWorkgroups( + pass, dispatch.workgroup_count_x, 1, 1); + } - wgpuComputePassEncoderEnd(pass); - wgpuComputePassEncoderRelease(pass); + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); - // Copy outputs to staging buffers - for (size_t i = 0; i < output_ids_.size(); i++) { - int oid = output_ids_[i]; - wgpuCommandEncoderCopyBufferToBuffer( - encoder, - tensors_[oid].buffer, - 0, - output_staging_buffers_[i], - 0, - tensors_[oid].nbytes); + for (const auto& copy : output_copies_) { + wgpuCommandEncoderCopyBufferToBuffer( + encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes); + } + + WGPUCommandBufferDescriptor cmd_desc = {}; + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); + wgpuQueueSubmit(queue_, 1, &cmd); + + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(encoder); + return; } - WGPUCommandBufferDescriptor cmd_desc = {}; - WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); - wgpuQueueSubmit(queue_, 1, &cmd); + const size_t first_chunk = execute_config_.initial_chunk_size > 0 + ? 
execute_config_.initial_chunk_size + : chunk; + + size_t start = 0; + size_t current_chunk = first_chunk; - wgpuCommandBufferRelease(cmd); - wgpuCommandEncoderRelease(encoder); + while (start < n) { + size_t end = std::min(start + current_chunk, n); + + WGPUCommandEncoderDescriptor enc_desc = {}; + WGPUCommandEncoder encoder = + wgpuDeviceCreateCommandEncoder(device_, &enc_desc); + + WGPUComputePassDescriptor pass_desc = {}; + WGPUComputePassEncoder pass = + wgpuCommandEncoderBeginComputePass(encoder, &pass_desc); + + for (size_t i = start; i < end; i++) { + wgpuComputePassEncoderSetPipeline(pass, dispatches_[i].pipeline); + wgpuComputePassEncoderSetBindGroup( + pass, 0, dispatches_[i].bind_group, 0, nullptr); + wgpuComputePassEncoderDispatchWorkgroups( + pass, dispatches_[i].workgroup_count_x, 1, 1); + } + + wgpuComputePassEncoderEnd(pass); + wgpuComputePassEncoderRelease(pass); + + if (end == n) { + for (const auto& copy : output_copies_) { + wgpuCommandEncoderCopyBufferToBuffer( + encoder, copy.src_buffer, 0, copy.staging_buffer, 0, copy.nbytes); + } + } + + WGPUCommandBufferDescriptor cmd_desc = {}; + WGPUCommandBuffer cmd = wgpuCommandEncoderFinish(encoder, &cmd_desc); + wgpuQueueSubmit(queue_, 1, &cmd); + + wgpuCommandBufferRelease(cmd); + wgpuCommandEncoderRelease(encoder); + + start = end; + current_chunk = chunk; + } } namespace { @@ -283,24 +450,35 @@ void buffer_map_callback( } // namespace void WebGPUGraph::copy_outputs(std::vector>& outputs) { - for (size_t i = 0; i < outputs.size() && i < output_staging_buffers_.size(); - i++) { - MapCallbackData cb_data; + const size_t count = std::min(outputs.size(), output_staging_buffers_.size()); + + std::vector cb_data(count); + + for (size_t i = 0; i < count; i++) { + if (outputs[i].second == 0) { + cb_data[i].done = true; + cb_data[i].status = WGPUMapAsyncStatus_Success; + continue; + } WGPUBufferMapCallbackInfo cb_info = {}; cb_info.mode = WGPUCallbackMode_AllowSpontaneous; cb_info.callback = 
buffer_map_callback; - cb_info.userdata1 = &cb_data; + cb_info.userdata1 = &cb_data[i]; wgpuBufferMapAsync( output_staging_buffers_[i], WGPUMapMode_Read, 0, outputs[i].second, cb_info); + } - // Poll until the map callback fires. - wgpuDevicePoll(device_, true, nullptr); + wgpuDevicePoll(device_, true, nullptr); - if (cb_data.status == WGPUMapAsyncStatus_Success) { + for (size_t i = 0; i < count; i++) { + if (outputs[i].second == 0) { + continue; + } + if (cb_data[i].status == WGPUMapAsyncStatus_Success) { const void* mapped = wgpuBufferGetConstMappedRange( output_staging_buffers_[i], 0, outputs[i].second); std::memcpy(outputs[i].first, mapped, outputs[i].second); @@ -315,15 +493,28 @@ WebGPUMemoryStats WebGPUGraph::memory_stats() const { WebGPUMemoryStats stats; for (size_t i = 0; i < value_types_.size(); i++) { if (value_types_[i] == ValueType::Tensor && tensors_[i].nbytes > 0) { - stats.tensor_buffer_bytes += tensors_[i].nbytes; stats.num_tensors++; + // Shared tensors are tracked via shared_buffer_sizes_ + bool is_shared = + i < tensor_mem_obj_ids_.size() && tensor_mem_obj_ids_[i] >= 0; + if (!is_shared) { + stats.unshared_tensor_buffer_bytes += tensors_[i].nbytes; + } } } + for (size_t s : shared_buffer_sizes_) { + stats.shared_buffer_bytes += s; + } + stats.num_shared_objects = static_cast(shared_buffers_.size()); + stats.tensor_buffer_bytes = + stats.shared_buffer_bytes + stats.unshared_tensor_buffer_bytes; for (size_t i = 0; i < output_ids_.size(); i++) { stats.staging_buffer_bytes += tensors_[output_ids_[i]].nbytes; } stats.uniform_buffer_bytes = uniform_buffer_bytes_; stats.num_dispatches = static_cast(dispatches_.size()); + stats.num_cached_pipelines = static_cast(pipeline_cache_.size()); + stats.num_cached_shaders = static_cast(shader_cache_.size()); return stats; } diff --git a/backends/webgpu/runtime/WebGPUGraph.h b/backends/webgpu/runtime/WebGPUGraph.h index 2d6996e9219..3aa96917a4e 100644 --- a/backends/webgpu/runtime/WebGPUGraph.h +++ 
b/backends/webgpu/runtime/WebGPUGraph.h @@ -12,6 +12,7 @@ #include #include +#include #include namespace executorch { @@ -30,12 +31,28 @@ struct WebGPUDispatch { uint32_t workgroup_count_x = 1; }; +struct OutputCopy { + WGPUBuffer src_buffer = nullptr; + WGPUBuffer staging_buffer = nullptr; + size_t nbytes = 0; +}; + +struct ExecuteConfig { + size_t chunk_size = 0; + size_t initial_chunk_size = 0; +}; + struct WebGPUMemoryStats { size_t tensor_buffer_bytes = 0; + size_t shared_buffer_bytes = 0; + int num_shared_objects = 0; + size_t unshared_tensor_buffer_bytes = 0; size_t staging_buffer_bytes = 0; size_t uniform_buffer_bytes = 0; int num_tensors = 0; int num_dispatches = 0; + int num_cached_pipelines = 0; + int num_cached_shaders = 0; size_t total_bytes() const { return tensor_buffer_bytes + staging_buffer_bytes + uniform_buffer_bytes; @@ -99,6 +116,20 @@ class WebGPUGraph { uniform_buffer_bytes_ += bytes; } + WGPUShaderModule get_or_create_shader( + const std::string& key, + const char* wgsl_source); + + WGPUComputePipeline get_or_create_pipeline( + const std::string& key, + WGPUShaderModule shader, + WGPUPipelineLayout layout); + + WGPUBindGroupLayout get_or_create_bgl( + const std::string& key, + const WGPUBindGroupLayoutEntry* entries, + uint32_t count); + void set_instance(WGPUInstance instance) { instance_ = instance; } @@ -134,11 +165,26 @@ class WebGPUGraph { std::vector input_ids_; std::vector output_ids_; + // Memory aliasing: tensors with the same mem_obj_id share a WGPUBuffer. + std::vector tensor_mem_obj_ids_; + std::vector shared_buffers_; + std::vector shared_buffer_sizes_; + // Staging buffers for reading back outputs (MapRead | CopyDst). std::vector output_staging_buffers_; + // Pre-computed output copy descriptors for execute(). + std::vector output_copies_; + std::vector dispatches_; + ExecuteConfig execute_config_; + + // Caches for reusing GPU objects across dispatches. 
+ std::unordered_map shader_cache_; + std::unordered_map pipeline_cache_; + std::unordered_map bgl_cache_; + size_t uniform_buffer_bytes_ = 0; }; diff --git a/backends/webgpu/test/ops/add/test_add.py b/backends/webgpu/test/ops/add/test_add.py index f4b33ced76d..e8da644a1f9 100644 --- a/backends/webgpu/test/ops/add/test_add.py +++ b/backends/webgpu/test/ops/add/test_add.py @@ -31,6 +31,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor: z = x + y z = z + x z = z + y + z = z + x + z = z + y return z @@ -97,5 +99,18 @@ def export_add_model(output_path: str) -> None: print(f"Exported {output_path}") +def export_chained_add_model(output_path: str) -> None: + """Export a chained add model (z=x+y; z=z+x; z=z+y; z=z+x; z=z+y) to .pte for memory aliasing testing.""" + model = AddChainedModule() + example_inputs = (torch.randn(1024, 1024), torch.randn(1024, 1024)) + ep = torch.export.export(model, example_inputs) + et_program = to_edge_transform_and_lower( + ep, partitioner=[VulkanPartitioner()] + ).to_executorch() + with open(output_path, "wb") as f: + f.write(et_program.buffer) + print(f"Exported {output_path}") + + if __name__ == "__main__": unittest.main() diff --git a/backends/webgpu/test/test_build_webgpu.sh b/backends/webgpu/test/test_build_webgpu.sh index 684926cb181..a42b2304ee7 100755 --- a/backends/webgpu/test/test_build_webgpu.sh +++ b/backends/webgpu/test/test_build_webgpu.sh @@ -22,12 +22,14 @@ $PYTHON_EXECUTABLE -m pytest "${SCRIPT_DIR}/ops/add/test_add.py" -v # ── Step 2: Export .pte model ───────────────────────────────────────────────── -echo "=== Step 2: Export test model ===" +echo "=== Step 2: Export test models ===" PTE_MODEL="/tmp/webgpu_add_test.pte" +PTE_CHAINED_MODEL="/tmp/webgpu_chained_add_test.pte" cd "${EXECUTORCH_ROOT}" $PYTHON_EXECUTABLE -c " -from executorch.backends.webgpu.test.ops.add.test_add import export_add_model +from executorch.backends.webgpu.test.ops.add.test_add import export_add_model, 
export_chained_add_model export_add_model('${PTE_MODEL}') +export_chained_add_model('${PTE_CHAINED_MODEL}') " # ── Step 3: Native build + test (wgpu-native) ──────────────────────────────── @@ -60,6 +62,7 @@ cmake --build "${NATIVE_BUILD_DIR}" --target webgpu_native_test -j${NPROC} echo "=== Step 4: Run native test ===" WEBGPU_TEST_MODEL="${PTE_MODEL}" \ +WEBGPU_TEST_CHAINED_MODEL="${PTE_CHAINED_MODEL}" \ "${NATIVE_BUILD_DIR}/backends/webgpu/webgpu_native_test" echo "=== Done ===" diff --git a/backends/webgpu/test/test_webgpu_native.cpp b/backends/webgpu/test/test_webgpu_native.cpp index c60695e11c9..d3005debf37 100644 --- a/backends/webgpu/test/test_webgpu_native.cpp +++ b/backends/webgpu/test/test_webgpu_native.cpp @@ -75,6 +75,62 @@ static bool test_single_add(const std::string& model_path) { return true; } +static bool test_chained_add(const std::string& model_path) { + printf("\n--- Test: chained add (1024x1024, 5 ops) ---\n"); + + Module module(model_path); + auto err = module.load_forward(); + if (err != Error::Ok) { + printf("FAIL: could not load forward method (error %d)\n", (int)err); + return false; + } + printf("Model loaded: %s\n", model_path.c_str()); + + constexpr int dim = 1024; + constexpr int size = dim * dim; + + std::vector x_data(size); + std::vector y_data(size); + for (int i = 0; i < size; i++) { + x_data[i] = static_cast(i % 100) * 0.01f; + y_data[i] = static_cast(i % 50) * 0.02f; + } + + auto x = make_tensor_ptr({dim, dim}, std::vector(x_data)); + auto y = make_tensor_ptr({dim, dim}, std::vector(y_data)); + + auto result = module.forward({EValue(x), EValue(y)}); + if (!result.ok()) { + printf("FAIL: forward failed (error %d)\n", (int)result.error()); + return false; + } + + const auto& outputs = result.get(); + if (outputs.empty() || !outputs[0].isTensor()) { + printf("FAIL: no tensor output\n"); + return false; + } + + // z=x+y; z=z+x=2x+y; z=z+y=2x+2y; z=z+x=3x+2y; z=z+y=3x+3y + const auto& out_tensor = outputs[0].toTensor(); + const 
float* out_data = out_tensor.const_data_ptr(); + + float max_error = 0.0f; + for (int i = 0; i < size; i++) { + float expected = 3.0f * x_data[i] + 3.0f * y_data[i]; + float error = std::abs(out_data[i] - expected); + max_error = std::max(max_error, error); + } + + printf("Max error: %e (checked %d elements)\n", max_error, size); + if (max_error > 1e-3f) { + printf("FAIL: max error exceeds tolerance 1e-3\n"); + return false; + } + printf("PASS: chained add test\n"); + return true; +} + int main(int argc, char** argv) { std::string model_path = "webgpu_add_test.pte"; if (argc > 1) { @@ -84,6 +140,11 @@ int main(int argc, char** argv) { model_path = env; } + std::string chained_model_path; + if (const char* env = std::getenv("WEBGPU_TEST_CHAINED_MODEL")) { + chained_model_path = env; + } + WebGPUContext ctx; try { ctx = create_webgpu_context(); @@ -97,6 +158,10 @@ int main(int argc, char** argv) { bool ok = test_single_add(model_path); + if (!chained_model_path.empty()) { + ok = test_chained_add(chained_model_path) && ok; + } + set_default_webgpu_context(nullptr); destroy_webgpu_context(ctx); From 1e8dc3095a39a709f862034b7b76caedc3de1d2b Mon Sep 17 00:00:00 2001 From: Chizkiyahu Raful <37312901+chizkiyahu@users.noreply.github.com> Date: Wed, 27 May 2026 23:17:56 +0300 Subject: [PATCH 042/103] Serialize/flatbuffer to program (#18129) exir: add flatbuffer-to-program reader This continues the work from https://github.com/pytorch/executorch/pull/17333. 
cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani --------- Signed-off-by: Chizkiyahu Raful --- exir/_serialize/_flatbuffer.py | 67 --------- exir/_serialize/_flatbuffer_program.py | 141 +++++++++++++++++- exir/_serialize/_program.py | 24 +-- exir/_serialize/test/test_flatbuffer.py | 65 +------- .../test/test_flatbuffer_program.py | 51 +------ exir/_serialize/test/test_program.py | 88 ++++++++++- 6 files changed, 228 insertions(+), 208 deletions(-) diff --git a/exir/_serialize/_flatbuffer.py b/exir/_serialize/_flatbuffer.py index 219e4517aea..43e203d1ff9 100644 --- a/exir/_serialize/_flatbuffer.py +++ b/exir/_serialize/_flatbuffer.py @@ -12,7 +12,6 @@ import importlib.resources import os import re -import shutil import stat import subprocess import tempfile @@ -384,72 +383,6 @@ def _flatc_decompile( ) -def _program_json_to_flatbuffer( - program_json: str, - *, - constant_tensor_alignment: Optional[int] = None, - delegate_alignment: Optional[int] = None, -) -> _FlatbufferResult: - """Converts Program-compatible JSON into binary flatbuffer data. - - Args: - program_json: The JSON to convert. Must be compatible with the root - table type of //executorch/schema/program.fbs. - constant_tensor_alignment: If provided, the alignment to use for tensor - data embedded in the output flatbuffer data. If not provided, uses - the alignment in the schema. - delegate_alignment: If provided, the alignment to use for delegate - data embedded in the output flatbuffer data. If not provided, uses - the alignment in the schema. - - Returns: The flatbuffer data and associated metadata. 
- """ - with tempfile.TemporaryDirectory() as temp_dir: - schema_info = _prepare_schema( - out_dir=temp_dir, - constant_tensor_alignment=constant_tensor_alignment, - delegate_alignment=delegate_alignment, - ) - file_stem = "data" - json_path = os.path.join(temp_dir, file_stem + ".json") - output_path = os.path.join(temp_dir, file_stem + ".pte") - - with open(json_path, "wb") as json_file: - json_file.write(program_json.encode("ascii")) - - try: - _flatc_compile(temp_dir, schema_info.root_path, json_path) - except Exception as err: - # It's helpful to save the breaking files for debugging. Optionally - # move them out of the auto-deleting temporary directory. Don't do - # this by default because some input files can be many GB in size, - # and these copies won't be auto-deleted. - should_save = os.getenv(_SAVE_FLATC_ENV, "").strip() not in {"", "0"} - extra_message = "" - if should_save: - try: - saved_dir = tempfile.mkdtemp(prefix="exir-saved-flatc-") - for f in os.listdir(temp_dir): - shutil.move(src=os.path.join(temp_dir, f), dst=saved_dir) - extra_message += f" Moved input files to '{saved_dir}'." - except Exception as err2: - extra_message += ( - f" (Failed to save input files for debugging: {err2})" - ) - else: - extra_message += ( - f" Set {_SAVE_FLATC_ENV}=1 to save input files on failure." - ) - - raise RuntimeError( - f"Failed to compile {json_path} to {output_path}." + extra_message - ) from err - with open(output_path, "rb") as output_file: - return _FlatbufferResult( - data=output_file.read(), max_alignment=schema_info.max_alignment - ) - - def _replace_infinity_in_json_file(content: bytes) -> bytes: """Replace -inf and inf with "inf" and "-inf" in the JSON file. program.fbs is used to convert from flatbuffer to JSON. 
+-inf float values are not diff --git a/exir/_serialize/_flatbuffer_program.py b/exir/_serialize/_flatbuffer_program.py index 4c1c315347a..cd742c8361d 100644 --- a/exir/_serialize/_flatbuffer_program.py +++ b/exir/_serialize/_flatbuffer_program.py @@ -8,12 +8,14 @@ import enum import functools import importlib +import pkgutil import tempfile from contextvars import ContextVar from dataclasses import fields, is_dataclass from functools import lru_cache -from typing import Any, Dict, Optional +from types import ModuleType +from typing import Any, Dict, get_args, get_origin, get_type_hints, Optional, Union import flatbuffers # pyre-ignore[21] from executorch.exir._serialize._flatbuffer import ( @@ -22,6 +24,7 @@ _prepare_schema, _SchemaInfo, ) +from executorch.exir._serialize.generated import executorch_flatbuffer as _generated_fb from executorch.exir._serialize.generated.executorch_flatbuffer import ( BackendDelegateInlineData as _BackendDelegateInlineData, Buffer as _Buffer, @@ -33,6 +36,7 @@ _T_CLASS_CACHE: Dict[type, type] = {} _FIELD_NAME_CACHE: Dict[type, tuple[tuple[str, str], ...]] = {} +_TYPE_HINTS_CACHE: Dict[type, Dict[str, Any]] = {} _BUFFER_ALIGNMENT: ContextVar[int] = ContextVar("_BUFFER_ALIGNMENT", default=1) _DELEGATE_ALIGNMENT: ContextVar[int] = ContextVar("_DELEGATE_ALIGNMENT", default=1) @@ -64,6 +68,15 @@ def _dataclass_field_map(dataclass_type: type) -> tuple[tuple[str, str], ...]: return mapping +def _dataclass_type_hints(dataclass_type: type) -> Dict[str, Any]: + cached = _TYPE_HINTS_CACHE.get(dataclass_type) + if cached is not None: + return cached + type_hints = get_type_hints(dataclass_type) + _TYPE_HINTS_CACHE[dataclass_type] = type_hints + return type_hints + + def _create_aligned_byte_vector(builder: Any, data: bytes, alignment: int) -> int: if not _is_valid_alignment(alignment): raise ValueError(f"Bad alignment {alignment}") @@ -194,6 +207,126 @@ def convert_program(val: Program) -> ProgramT: return _convert_dataclass(val) +# The 
generated FlatBuffer Python modules import child tables/unions as modules +# (for example, Program.ExecutionPlan becomes the ExecutionPlan module), but the +# unpacking helpers later expect those globals to be the corresponding classes. +# Rebind module globals like ExecutionPlan -> ExecutionPlan.ExecutionPlan so the +# generated InitFromObj()/InitFromPackedBuf() code can instantiate nested types. +def _patch_generated_module_aliases(module: ModuleType) -> None: + for name, maybe_module in vars(module).items(): + if not isinstance(maybe_module, ModuleType): + continue + maybe_class = getattr(maybe_module, name, None) + if isinstance(maybe_class, type): + setattr(module, name, maybe_class) + + +@lru_cache(maxsize=1) +def _patch_generated_flatbuffer_aliases() -> None: + package_name = _generated_fb.__name__ + for module_info in pkgutil.iter_modules(_generated_fb.__path__): + module = importlib.import_module(f"{package_name}.{module_info.name}") + _patch_generated_module_aliases(module) + + +def _flatbuffer_dataclass_names(val: Any) -> tuple[str, Optional[str]]: + val_type_name = type(val).__name__ + if val_type_name.endswith("T"): + return val_type_name, val_type_name[:-1] + return val_type_name, None + + +def _matches_dataclass_union_type( + union_type: Any, val_type_name: str, val_dataclass_name: Optional[str] +) -> bool: + if not is_dataclass(union_type): + return False + union_name = union_type.__name__ + return union_name == val_type_name or ( + val_dataclass_name is not None and union_name == val_dataclass_name + ) + + +def _matches_non_dataclass_union_type(union_type: Any, val: Any) -> bool: + if union_type is Any: + return True + if union_type is str and isinstance(val, (bytes, bytearray, memoryview)): + return True + union_origin = get_origin(union_type) + if union_origin is list and hasattr(val, "__iter__"): + return True + return isinstance(union_type, type) and isinstance(val, union_type) + + +def _union_choice_from_value(union_types: tuple[Any, ...], 
val: Any) -> Any: + if val is None: + for union_type in union_types: + if union_type is type(None): + return union_type + return None + + val_type_name, val_dataclass_name = _flatbuffer_dataclass_names(val) + + for union_type in union_types: + if union_type is type(None): + continue + if _matches_dataclass_union_type(union_type, val_type_name, val_dataclass_name): + return union_type + if _matches_non_dataclass_union_type(union_type, val): + return union_type + return None + + +def _convert_from_flatbuffer_value(val: Any, expected_type: Any) -> Any: + if val is None: + return None + + origin = get_origin(expected_type) + if origin is list: + item_type = get_args(expected_type)[0] + return [_convert_from_flatbuffer_value(item, item_type) for item in val] + + if origin is Union: + union_type = _union_choice_from_value(get_args(expected_type), val) + if union_type is None: + raise TypeError( + f"Could not match value type {type(val)} to {expected_type}" + ) + if union_type is type(None): + return None + return _convert_from_flatbuffer_value(val, union_type) + + if expected_type is bytes: + return _coerce_bytes(val) + if expected_type is str and isinstance(val, (bytes, bytearray, memoryview)): + return _coerce_bytes(val).decode("utf-8") + if is_dataclass(expected_type): + return _convert_from_flatbuffer_dataclass(val, expected_type) + if isinstance(expected_type, type) and issubclass(expected_type, enum.Enum): + if isinstance(val, expected_type): + return val + return expected_type(val) + if isinstance(expected_type, type): + return expected_type(val) + return val + + +def _convert_from_flatbuffer_dataclass(val: Any, dataclass_type: type) -> Any: + result = {} + type_hints = _dataclass_type_hints(dataclass_type) + for src_name, dst_name in _dataclass_field_map(dataclass_type): + result[src_name] = _convert_from_flatbuffer_value( + getattr(val, dst_name), type_hints[src_name] + ) + return dataclass_type(**result) + + +def _flatbuffer_to_program(program_data: bytes) -> 
Program: + _patch_generated_flatbuffer_aliases() + program_t = ProgramT.InitFromPackedBuf(program_data) + return _convert_from_flatbuffer_dataclass(program_t, Program) + + @lru_cache(maxsize=1) def _get_schema_info( constant_tensor_alignment: Optional[int], delegate_alignment: Optional[int] @@ -213,11 +346,7 @@ def _program_to_flatbuffer( constant_tensor_alignment: Optional[int] = None, delegate_alignment: Optional[int] = None, ) -> _FlatbufferResult: - """Converts a Program dataclass into binary flatbuffer data. - - Unlike _program_json_to_flatbuffer(), this does not use JSON or invoke - flatc to build the binary. - """ + """Converts a Program dataclass into binary flatbuffer data.""" schema_info = _get_schema_info(constant_tensor_alignment, delegate_alignment) _set_pack_alignments(schema_info.tensor_alignment, schema_info.delegate_alignment) _install_fast_packers() diff --git a/exir/_serialize/_program.py b/exir/_serialize/_program.py index 4ab2a3572b4..230b50bf558 100644 --- a/exir/_serialize/_program.py +++ b/exir/_serialize/_program.py @@ -16,12 +16,12 @@ from typing import ClassVar, Dict, List, Literal, Optional, Sequence, Tuple from executorch.exir._serialize._cord import Cord -from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass -from executorch.exir._serialize._flatbuffer import ( - _FlatbufferResult, - _program_flatbuffer_to_json, +from executorch.exir._serialize._dataclass import _DataclassEncoder +from executorch.exir._serialize._flatbuffer import _FlatbufferResult +from executorch.exir._serialize._flatbuffer_program import ( + _flatbuffer_to_program, + _program_to_flatbuffer, ) -from executorch.exir._serialize._flatbuffer_program import _program_to_flatbuffer from executorch.exir._serialize._named_data_store import ( NamedDataStore, NamedDataStoreOutput, @@ -86,12 +86,6 @@ def _program_to_json(program: Program) -> str: return json.dumps(program, cls=_DataclassEncoder) -def _json_to_program(program_json: bytes) -> 
Program: - """Returns a Program deserialized from the given JSON string.""" - # construct program class recursively from dict - return _json_to_dataclass(json.loads(program_json), cls=Program) - - def _insert_flatbuffer_header( flatbuffer_data: bytes, magic_regex: str, header_data: bytes ) -> bytes: @@ -757,9 +751,7 @@ def deserialize_pte_binary(program_data: bytes) -> PTEFile: segment_base_offset = eh.segment_base_offset # Parse the flatbuffer data. - program: Program = _json_to_program( - _program_flatbuffer_to_json(program_data[:program_size]) - ) + program: Program = _flatbuffer_to_program(program_data[:program_size]) if segment_base_offset != 0: # Move segment data back into the Program. @@ -799,9 +791,7 @@ def _extract_delegate_payload( program_size = len(pte_data) # Parse the program flatbuffer - program: Program = _json_to_program( - _program_flatbuffer_to_json(pte_data[:program_size]) - ) + program: Program = _flatbuffer_to_program(pte_data[:program_size]) # Search for the matching delegate match_count = 0 diff --git a/exir/_serialize/test/test_flatbuffer.py b/exir/_serialize/test/test_flatbuffer.py index 801ddca112d..e623da55cd2 100644 --- a/exir/_serialize/test/test_flatbuffer.py +++ b/exir/_serialize/test/test_flatbuffer.py @@ -7,19 +7,13 @@ # LICENSE file in the root directory of this source tree. 
import os -import re -import shutil import tempfile import unittest from typing import Dict, Optional, Sequence from unittest.mock import patch from executorch.exir._serialize import _flatbuffer -from executorch.exir._serialize._flatbuffer import ( - _program_json_to_flatbuffer, - _ResourceFiles, - _SchemaInfo, -) +from executorch.exir._serialize._flatbuffer import _ResourceFiles, _SchemaInfo def read_file(dir: str, filename: str) -> bytes: @@ -277,60 +271,3 @@ def test_bad_delegate_alignment_fails(self) -> None: out_dir, delegate_alignment=bad_alignment, ) - - -class TestProgramJsonToFlatbuffer(unittest.TestCase): - @patch.dict(os.environ, {_flatbuffer._SAVE_FLATC_ENV: "1"}) - def test_save_json_on_failure(self) -> None: - err_msg: Optional[str] = None - try: - _program_json_to_flatbuffer("} some bad json {") - self.fail("Should have raised an exception") - except RuntimeError as err: - err_msg = err.args[0] - - self.assertIsNotNone(err_msg) - match = re.search(r"Moved input files to '(.*?)'", err_msg) - self.assertTrue(match, msg=f"Unexpected error message: {err_msg}") - path = match.group(1) - - files = frozenset(os.listdir(path)) - # Delete the files otherwise they'll accumulate every time the - # test is run. - shutil.rmtree(path) - # Check for a couple of the files that should be there. 
- self.assertIn("data.json", files) - self.assertIn("program.fbs", files) - - @patch.dict(os.environ, {_flatbuffer._SAVE_FLATC_ENV: "1"}) - def test_unable_to_save_json_on_failure(self) -> None: - err_msg: Optional[str] = None - try: - with patch.object( - _flatbuffer.shutil, - "move", - side_effect=Exception("shutil.move mock failure"), - ): - _program_json_to_flatbuffer("} some bad json {") - self.fail("Should have raised an exception") - except RuntimeError as err: - err_msg = err.args[0] - - self.assertIsNotNone(err_msg) - self.assertIn("Failed to save input files", err_msg) - - @patch.dict(os.environ, {_flatbuffer._SAVE_FLATC_ENV: ""}) - def test_no_save_json_on_failure(self) -> None: - err_msg: Optional[str] = None - try: - _program_json_to_flatbuffer("} some bad json {") - self.fail("Should have raised an exception") - except RuntimeError as err: - err_msg = err.args[0] - - self.assertIsNotNone(err_msg) - self.assertIn( - f"Set {_flatbuffer._SAVE_FLATC_ENV}=1 to save input files", err_msg - ) - self.assertNotIn("Moved input files", err_msg) - self.assertNotIn("Failed to save input files", err_msg) diff --git a/exir/_serialize/test/test_flatbuffer_program.py b/exir/_serialize/test/test_flatbuffer_program.py index 05e05d4e610..4910f9b431f 100644 --- a/exir/_serialize/test/test_flatbuffer_program.py +++ b/exir/_serialize/test/test_flatbuffer_program.py @@ -4,15 +4,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import json import unittest -from executorch.exir._serialize._flatbuffer import ( - _program_flatbuffer_to_json, - _program_json_to_flatbuffer, +from executorch.exir._serialize._flatbuffer_program import ( + _flatbuffer_to_program, + _program_to_flatbuffer, ) -from executorch.exir._serialize._flatbuffer_program import _program_to_flatbuffer -from executorch.exir._serialize._program import _json_to_program, _program_to_json from executorch.exir.backend.compile_spec_schema import CompileSpec from executorch.exir.schema import ( AllocationDetails, @@ -157,50 +154,12 @@ def _make_program(self) -> Program: named_data=[], ) - def _flatbuffer_to_dict(self, flatbuffer_data: bytes) -> dict: - return json.loads(_program_flatbuffer_to_json(flatbuffer_data)) - - def test_roundtrip_via_json(self) -> None: + def test_roundtrip_via_direct_python(self) -> None: program = self._make_program() result = _program_to_flatbuffer( program, constant_tensor_alignment=32, delegate_alignment=64 ) - self.assertGreater(len(result.data), 8) - self.assertEqual(result.data[4:6], b"ET") - self.assertGreaterEqual(result.max_alignment, 64) - - program2 = _json_to_program(_program_flatbuffer_to_json(result.data)) - self.assertEqual(program2, program) - - def test_flatbuffer_paths_match(self) -> None: - program = self._make_program() - cases = [ - (None, None), - (32, 64), - ] - for constant_tensor_alignment, delegate_alignment in cases: - with self.subTest( - constant_tensor_alignment=constant_tensor_alignment, - delegate_alignment=delegate_alignment, - ): - result = _program_to_flatbuffer( - program, - constant_tensor_alignment=constant_tensor_alignment, - delegate_alignment=delegate_alignment, - ) - result2 = _program_json_to_flatbuffer( - _program_to_json(program), - constant_tensor_alignment=constant_tensor_alignment, - delegate_alignment=delegate_alignment, - ) - direct_dict = self._flatbuffer_to_dict(result.data) - json_path_dict = self._flatbuffer_to_dict(result2.data) - self.assertEqual( - 
direct_dict, - json_path_dict, - "Flatbuffer JSON differs between direct and JSON paths", - ) - self.assertEqual(result.max_alignment, result2.max_alignment) + self.assertEqual(_flatbuffer_to_program(result.data), program) def test_bad_alignment_fails(self) -> None: program = Program( diff --git a/exir/_serialize/test/test_program.py b/exir/_serialize/test/test_program.py index 579934e9d38..0d0d833c952 100644 --- a/exir/_serialize/test/test_program.py +++ b/exir/_serialize/test/test_program.py @@ -1,6 +1,7 @@ #!/usr/bin/env fbpython # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -16,12 +17,11 @@ from typing import Dict, List, Sequence -from executorch.exir._serialize._flatbuffer import _program_flatbuffer_to_json +from executorch.exir._serialize._flatbuffer_program import _flatbuffer_to_program from executorch.exir._serialize._named_data_store import NamedDataStoreOutput from executorch.exir._serialize._program import ( _ExtendedHeader, _get_extended_header, - _json_to_program, _program_to_json, deserialize_pte_binary, PTEFile, @@ -30,6 +30,8 @@ from executorch.exir._serialize.data_serializer import DataEntry from executorch.exir._serialize.padding import aligned_size +from executorch.exir.backend.compile_spec_schema import CompileSpec + from executorch.exir.schema import ( BackendDelegate, BackendDelegateDataReference, @@ -39,7 +41,15 @@ DataLocation, DataSegment, DeviceType, + Double, + EValue, ExecutionPlan, + Frame, + FrameList, + FreeCall, + Instruction, + JumpFalseCall, + MoveCall, NonConstBufferDevice, Program, SubsegmentOffsets, @@ -197,7 +207,7 @@ def constant_segment_with_tensor_alignment( self.assertGreater(eh.segment_data_size, 0) # Peek inside the actual flatbuffer data to see the segments. 
- program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) + program_with_segments = _flatbuffer_to_program(pte_data) # The constant tensor data should appear as the only segment. self.assertEqual(len(program_with_segments.segments), 1) @@ -467,6 +477,68 @@ def test_round_trip_no_header_no_segments(self) -> None: self.assertEqual(deserialized.mutable_data, None) self.assertEqual(deserialized.named_data, None) + def test_deserialize_pte_binary_with_rich_flatbuffer_types(self) -> None: + program = get_test_program() + plan = program.execution_plan[0] + plan.values.append(EValue(Double(float("inf")))) + plan.delegates.append( + BackendDelegate( + id="delegate0", + processed=BackendDelegateDataReference( + location=DataLocation.INLINE, + index=0, + ), + compile_specs=[CompileSpec(key="k", value=b"v")], + ) + ) + plan.chains[0].instructions.extend( + [ + Instruction(MoveCall(move_from=0, move_to=1)), + Instruction( + JumpFalseCall(cond_value_index=1, destination_instruction=0) + ), + Instruction(FreeCall(value_index=0)), + ] + ) + plan.chains[0].stacktrace = [ + FrameList( + items=[ + Frame( + filename="file.py", + lineno=idx + 1, + name="fn", + context="ctx", + ) + ] + ) + for idx, _ in enumerate(plan.chains[0].instructions) + ] + program.constant_buffer.append(Buffer(storage=b"abcd")) + program.backend_delegate_data.append( + BackendDelegateInlineData(data=b"delegate-data") + ) + + deserialized = deserialize_pte_binary( + bytes(serialize_pte_binary(PTEFile(program=program))) + ) + + self.assert_programs_equal(program, deserialized.program) + self.assertEqual(deserialized.mutable_data, None) + self.assertEqual(deserialized.named_data, None) + self.assertIsInstance(plan.values[-1].val, Double) + self.assertIsInstance( + deserialized.program.execution_plan[0].values[-1].val, + Double, + ) + self.assertEqual( + deserialized.program.execution_plan[0].values[-1].val.double_val, + "inf", + ) + self.assertEqual( + 
deserialized.program.execution_plan[0].delegates[0].compile_specs[0].value, + b"v", + ) + def test_round_trip_large_buffer_sizes(self) -> None: """Tests that when the non_const_buffer_sizes contains integers overflowing a signed/unsigned 32 bit integer, we can still serialize the @@ -531,7 +603,7 @@ def test_round_trip_no_segments_and_no_header(self) -> None: self.assertIsNone(eh) # Peek inside the flatbuffer data to confirm that there are no segments. - program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) + program_with_segments = _flatbuffer_to_program(pte_data) self.assertEqual(program_with_segments.segments, []) # Convert back. @@ -597,7 +669,7 @@ def test_round_trip_with_segments(self) -> None: # this also implicity tests the case where we try parsing the entire # file with segment data following it, demonstrating that the extra data # doesn't upset the flatbuffer parsing path. - program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) + program_with_segments = _flatbuffer_to_program(pte_data) # The delegate blobs we added to the program should appear as segments. # The one empty blob should have been ignored, hence the `- 1`. @@ -694,7 +766,7 @@ def test_no_constants(self) -> None: self.assertEqual(program.segments, []) # Peek inside the actual flatbuffer data to see the segments. - flatbuffer_program = _json_to_program(_program_flatbuffer_to_json(pte_data)) + flatbuffer_program = _flatbuffer_to_program(pte_data) # Constant buffer should be empty. self.assertEqual(len(flatbuffer_program.constant_buffer), 0) @@ -814,7 +886,7 @@ def test_constant_delegate_and_named_data_segments(self) -> None: self.assertGreater(eh.segment_data_size, 0) # Peek inside the actual flatbuffer data to see the segments. 
- program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) + program_with_segments = _flatbuffer_to_program(pte_data) # Segment table should contain a constant segment, the delegate blobs # and a named data segment. @@ -1017,7 +1089,7 @@ def test_named_data_segments(self) -> None: self.assertGreater(eh.segment_data_size, 0) # Peek inside the actual flatbuffer data to see the named data segments. - program_with_segments = _json_to_program(_program_flatbuffer_to_json(pte_data)) + program_with_segments = _flatbuffer_to_program(pte_data) # pyre-ignore Incompatible parameter type [6] self.assertEqual(len(program_with_segments.named_data), len(pte_named_data)) From daa7ad2d28e60a51a59b1d082c9eaf2ddaf877cb Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 May 2026 13:29:16 -0700 Subject: [PATCH 043/103] Update golden artifact path for android_test_setup.sh (#19819) --- extension/android/executorch_android/android_test_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/extension/android/executorch_android/android_test_setup.sh b/extension/android/executorch_android/android_test_setup.sh index 350c60b2e25..9ed1ae63da2 100644 --- a/extension/android/executorch_android/android_test_setup.sh +++ b/extension/android/executorch_android/android_test_setup.sh @@ -29,7 +29,7 @@ prepare_tinyllama() { } prepare_golden() { - local url="https://gha-artifacts.s3.amazonaws.com/pytorch/executorch/test-backend-artifacts/golden-artifacts-xnnpack/golden_artifacts_26022500.zip" + local url="https://gha-artifacts.s3.amazonaws.com/pytorch/executorch/test-backend-artifacts/golden-artifacts-xnnpack/golden_artifacts_26052718.zip" curl -sL -o /tmp/golden.zip "$url" unzip -o /tmp/golden.zip -d /tmp/golden/ for model in mobilenet_v2 vit_b_16; do From b1446cc87162b6803a0b3d1ec0e1f93af5065224 Mon Sep 17 00:00:00 2001 From: Per Held Date: Thu, 21 May 2026 16:12:42 +0200 Subject: [PATCH 044/103] Arm 
backend: Simplify fake RESIZE validation Avoid revalidating RESIZE output shape against dimensions computed by the same formula. Validate parameters once, compute the fake output shape, and directly validate the computed output dimensions. Signed-off-by: Per Held Change-Id: I97bb91f9fc440c980782955692056196038d5de0 --- .../misc/tosa_dialect/test_tosa_resize.py | 24 +++++++++++++++++++ backends/arm/tosa/dialect/ops/resize.py | 5 +++- backends/arm/tosa/resize_utils.py | 19 +++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) diff --git a/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py b/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py index 0a90de5c0c0..eddb69a8caf 100644 --- a/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py +++ b/backends/arm/test/misc/tosa_dialect/test_tosa_resize.py @@ -72,6 +72,30 @@ def test_resize_rejects_scale_numerator_over_tosa_limit(): ) +@pytest.mark.parametrize( + "offset,border", + ( + ([1, 0], [-1, 0]), + ([0, 1], [0, -1]), + ), +) +def test_resize_rejects_non_positive_output_dimensions(offset, border): + with TosaLoweringContext( + TosaSpecification.create_from_string("TOSA-1.0+INT") + ), FakeTensorMode() as mode: + with pytest.raises( + TosaValueError, + match="RESIZE output dimensions must be positive", + ): + exir_ops.backend.tosa.RESIZE.default( + mode.from_tensor(torch.randint(0, 10, (1, 1, 1, 1), dtype=torch.int8)), + [1, 1, 1, 1], + offset, + border, + resize_mode="nearest", + ) + + def test_resize_accepts_symbolic_scale_and_border_values(): shape_env = ShapeEnv() scale_y_n = _make_symint(shape_env, "scale_y_n", hint=2, min=1, max=8) diff --git a/backends/arm/tosa/dialect/ops/resize.py b/backends/arm/tosa/dialect/ops/resize.py index 8a2d4c5e60a..0d06253ccd8 100644 --- a/backends/arm/tosa/dialect/ops/resize.py +++ b/backends/arm/tosa/dialect/ops/resize.py @@ -10,6 +10,7 @@ from executorch.backends.arm.tosa.dialect.ops_registration import register_fake_tosa_op from 
executorch.backends.arm.tosa.resize_utils import ( calculate_tosa_resize_output_hw, + get_tosa_resize_output_hw_validation_error, get_tosa_resize_validation_error, ) @@ -92,7 +93,9 @@ def RESIZE( H, W = input_shape[1], input_shape[2] _validate_resize_parameters((H, W), None, scale, offset, border, tosa_spec) output_hw = calculate_tosa_resize_output_hw((H, W), scale, offset, border) - _validate_resize_parameters((H, W), output_hw, scale, offset, border, tosa_spec) + validation_error = get_tosa_resize_output_hw_validation_error(output_hw) + if validation_error is not None: + raise TosaValueError(validation_error, op="RESIZE") if output_hw is None: scale_y_n, scale_y_d, scale_x_n, scale_x_d = scale offset_y, offset_x = offset diff --git a/backends/arm/tosa/resize_utils.py b/backends/arm/tosa/resize_utils.py index 6c716bfa59c..23be6ff42fc 100644 --- a/backends/arm/tosa/resize_utils.py +++ b/backends/arm/tosa/resize_utils.py @@ -67,6 +67,25 @@ def _validate_dimensions( return None +def get_tosa_resize_output_hw_validation_error( + output_hw: Sequence[int | torch.SymInt] | None, +) -> str | None: + if output_hw is None: + return None + + output_hw_ints = _as_concrete_ints(output_hw) + if output_hw_ints is None: + return None + + invalid_dimension = next( + (dimension for dimension in output_hw_ints if dimension <= 0), None + ) + if invalid_dimension is not None: + return f"RESIZE output dimensions must be positive; got {invalid_dimension}" + + return _validate_dimensions((), output_hw) + + def _validate_scale( scale: Sequence[int | torch.SymInt], tosa_spec: TosaSpecification, From 9d1853129d7988570dd62585e65f27efebad8b68 Mon Sep 17 00:00:00 2001 From: Christoffer Johansson Lundqvist <119742508+Christoffer-JL@users.noreply.github.com> Date: Wed, 27 May 2026 23:23:54 +0200 Subject: [PATCH 045/103] Arm backend: Fix bmm quantization bug (#19798) bmm nodes are now forwarded to ArmPass in stead of ExportPass. 
This fixes an issue where _call_quantized_bmm_without_fake_kernel() does not get called, leading to dtype mismatch error cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Christoffer J.L --- backends/arm/_passes/replace_scalar_with_tensor_pass.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/arm/_passes/replace_scalar_with_tensor_pass.py b/backends/arm/_passes/replace_scalar_with_tensor_pass.py index edd5fc97213..53f0e517a7f 100644 --- a/backends/arm/_passes/replace_scalar_with_tensor_pass.py +++ b/backends/arm/_passes/replace_scalar_with_tensor_pass.py @@ -126,4 +126,4 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) else: # Do not handle; forward unchanged. - return ExportPass.call_operator(self, op, args, kwargs, meta) + return ArmPass.call_operator(self, op, args, kwargs, meta) From 5393742be88b6e8cf863c5e98cf31543c3d512ac Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 27 May 2026 09:25:39 -0700 Subject: [PATCH 046/103] [executorch][runtime] Fix -Werror failures under Apple toolchain Two `-Werror` failures surfaced when building `xplat/executorch/runtime` under the iOS toolchain (`-Werror -Wshadow -Wswitch-default`): 1. `EXECUTORCH_SCOPE_PROF` in `runtime/platform/profiler.h` hardcodes the local variable name `profiler`. When the macro is invoked at function scope and again inside a nested block in the same function (for example `Program::load` invokes it at the top of the function and then again inside `check_header` / `verify_internal_consistency` blocks), `-Wshadow` fires and the build fails. Fixed by token-pasting `__LINE__` so each invocation gets a unique identifier. No caller changes required. 2. `to_string(Error)` in `runtime/core/error.h` is a switch statement covering every enum value with a trailing `return "Error::Unknown"` fallback after the switch. 
Apple's toolchain promotes `-Wswitch-default` to an error and rejects switches that lack an explicit `default:` arm. Folded the trailing fallback into a `default:` arm inside the switch. Both issues only surfaced under the Apple toolchain; fbcode toolchain does not promote these warnings to errors, so devserver / Linux builds continued to pass. Differential Revision: [D106523959](https://our.internmc.facebook.com/intern/diff/D106523959/) ghstack-source-id: 386608989 Pull-Request: https://github.com/pytorch/executorch/pull/19811 --- runtime/core/error.h | 3 ++- runtime/platform/profiler.h | 8 ++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/runtime/core/error.h b/runtime/core/error.h index 80c2ef645d4..b923604ca89 100644 --- a/runtime/core/error.h +++ b/runtime/core/error.h @@ -151,8 +151,9 @@ constexpr const char* to_string(const Error error) { return "Error::RegistrationExceedingMaxKernels"; case Error::RegistrationAlreadyRegistered: return "Error::RegistrationAlreadyRegistered"; + default: + return "Error::Unknown"; } - return "Error::Unknown"; } } // namespace runtime diff --git a/runtime/platform/profiler.h b/runtime/platform/profiler.h index d6362781394..cb011bd0ef9 100644 --- a/runtime/platform/profiler.h +++ b/runtime/platform/profiler.h @@ -227,8 +227,12 @@ using ::executorch::runtime::track_allocator; #define EXECUTORCH_END_PROF(token_id) \ ::executorch::runtime::end_profiling(token_id); -#define EXECUTORCH_SCOPE_PROF(name) \ - ::executorch::runtime::ExecutorchProfiler profiler(name); +#define EXECUTORCH_SCOPE_PROF_CONCAT_IMPL(a, b) a##b +#define EXECUTORCH_SCOPE_PROF_CONCAT(a, b) \ + EXECUTORCH_SCOPE_PROF_CONCAT_IMPL(a, b) +#define EXECUTORCH_SCOPE_PROF(name) \ + ::executorch::runtime::ExecutorchProfiler EXECUTORCH_SCOPE_PROF_CONCAT( \ + et_profiler_, __LINE__)(name); #define EXECUTORCH_PROFILE_INSTRUCTION_SCOPE(chain_idx, instruction_idx) \ ::executorch::runtime::ExecutorchProfilerInstructionScope \ From 
5c0aa4f8cf6b3a338ce8499015dd533be205ab0b Mon Sep 17 00:00:00 2001 From: ssjia Date: Wed, 27 May 2026 09:25:40 -0700 Subject: [PATCH 047/103] [executorch][coreml] Fix CoreML SDK proto header includes Pull Request resolved: https://github.com/pytorch/executorch/pull/19789 CoreML SDK builds include generated CoreMLTools proto headers through short `format/*.pb.h` imports. iOS Buck compilation could not resolve those generated headers because they were not exposed under a flat include namespace. This makes the generated proto headers available at the include paths used by the SDK sources. ghstack-source-id: 386608986 @exported-using-ghexport Differential Revision: [D106430265](https://our.internmc.facebook.com/intern/diff/D106430265/) --- backends/apple/coreml/BUCK | 1 + 1 file changed, 1 insertion(+) diff --git a/backends/apple/coreml/BUCK b/backends/apple/coreml/BUCK index 792adcf4d70..688ca64b990 100644 --- a/backends/apple/coreml/BUCK +++ b/backends/apple/coreml/BUCK @@ -171,6 +171,7 @@ runtime.cxx_library( "format/{}.pb.h".format(name): "fbsource//third-party/pypi/coremltools:exported-cpp-protoc[{}.pb.h]".format(name) for name in _PROTOS }, + header_namespace = "", compiler_flags = [ "-Wno-global-constructors", ], From 0ed8dcf8733592a428877cd3b31b3532d266f361 Mon Sep 17 00:00:00 2001 From: Sicheng Stephen Jia Date: Wed, 27 May 2026 18:12:56 -0400 Subject: [PATCH 048/103] Fix etsize workflow build failures under -fno-exceptions Differential Revision: D106539321 Pull Request resolved: https://github.com/pytorch/executorch/pull/19815 --- kernels/portable/targets.bzl | 22 +++++++++++++--------- test/targets.bzl | 4 +++- 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/kernels/portable/targets.bzl b/kernels/portable/targets.bzl index 2c6e0b5c35f..b80ce347768 100644 --- a/kernels/portable/targets.bzl +++ b/kernels/portable/targets.bzl @@ -66,15 +66,19 @@ def define_common_targets(): "visibility": ["PUBLIC"], } - executorch_generated_lib( - name = 
"generated_lib", - deps = [ - ":executorch_aten_ops", - ":executorch_custom_ops", - ], - kernel_deps = ["//executorch/kernels/portable:operators"], - **generated_lib_common_args - ) + for support_exceptions in [True, False]: + exception_suffix = "_no_exceptions" if not support_exceptions else "" + + executorch_generated_lib( + name = "generated_lib" + exception_suffix, + deps = [ + ":executorch_aten_ops", + ":executorch_custom_ops", + ], + kernel_deps = ["//executorch/kernels/portable:operators"], + support_exceptions = support_exceptions, + **generated_lib_common_args + ) if True in get_aten_mode_options(): executorch_generated_lib( diff --git a/test/targets.bzl b/test/targets.bzl index 023a1d48960..0047d5563fc 100644 --- a/test/targets.bzl +++ b/test/targets.bzl @@ -36,7 +36,9 @@ def define_common_targets(): name = "size_test_all_ops", srcs = SIZE_TEST_SOURCES, deps = SIZE_TEST_DEPS + [ - "//executorch/kernels/portable:generated_lib", + # size_test_all_ops is built with -fno-exceptions in the size CI; + # use the _no_exceptions variant whose codegen omits try/catch. 
+ "//executorch/kernels/portable:generated_lib_no_exceptions", "//executorch/runtime/executor/test:test_backend_compiler_lib", ], define_static_target = True, From d366f43906057614f4d88003cf5c3a8ea1b3dd3c Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 May 2026 15:22:39 -0700 Subject: [PATCH 049/103] Convert SGD and TrainingModule from Java to Kotlin (#19822) Differential Revision: D106549057 Pull Request resolved: https://github.com/pytorch/executorch/pull/19822 --- extension/android/BUCK | 6 +- .../org/pytorch/executorch/training/SGD.java | 103 ------------- .../org/pytorch/executorch/training/SGD.kt | 100 ++++++++++++ .../executorch/training/TrainingModule.java | 140 ----------------- .../executorch/training/TrainingModule.kt | 144 ++++++++++++++++++ 5 files changed, 247 insertions(+), 246 deletions(-) delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.kt delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt diff --git a/extension/android/BUCK b/extension/android/BUCK index 1f1b611ff01..170c826f40f 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -33,11 +33,11 @@ non_fbcode_target(_kind = fb_android_library, name = "executorch_training", warnings_as_errors = False, srcs = [ - "executorch_android/src/main/java/org/pytorch/executorch/training/SGD.java", - "executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.java", + "executorch_android/src/main/java/org/pytorch/executorch/training/SGD.kt", + "executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt", ], autoglob = False, - language = 
"JAVA", + language = "KOTLIN", deps = [ ":executorch", "//fbandroid/java/com/facebook/jni:jni", diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.java deleted file mode 100644 index 58c7704b83e..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch.training; - -import com.facebook.jni.HybridData; -import com.facebook.jni.annotations.DoNotStrip; -import com.facebook.soloader.nativeloader.NativeLoader; -import com.facebook.soloader.nativeloader.SystemDelegate; -import java.util.Map; -import org.pytorch.executorch.Tensor; -import org.pytorch.executorch.annotations.Experimental; - -/** - * Java wrapper for ExecuTorch SGD Optimizer. - * - *

Warning: These APIs are experimental and subject to change without notice - */ -@Experimental -public class SGD { - - static { - if (!NativeLoader.isInitialized()) { - NativeLoader.init(new SystemDelegate()); - } - // Loads libexecutorch.so from jniLibs - NativeLoader.loadLibrary("executorch"); - } - - private final HybridData mHybridData; - - @DoNotStrip - private static native HybridData initHybrid( - Map namedParameters, - double learningRate, - double momentum, - double dampening, - double weightDecay, - boolean nesterov); - - private SGD( - Map namedParameters, - double learningRate, - double momentum, - double dampening, - double weightDecay, - boolean nesterov) { - mHybridData = - initHybrid(namedParameters, learningRate, momentum, dampening, weightDecay, nesterov); - } - - /** - * Creates a new SGD optimizer with the specified parameters and options. - * - * @param namedParameters Map of parameter names to tensors to be optimized - * @param learningRate The learning rate for the optimizer - * @param momentum The momentum value - * @param dampening The dampening value - * @param weightDecay The weight decay value - * @param nesterov Whether to use Nesterov momentum - * @return new {@link SGD} object - */ - public static SGD create( - Map namedParameters, - double learningRate, - double momentum, - double dampening, - double weightDecay, - boolean nesterov) { - return new SGD(namedParameters, learningRate, momentum, dampening, weightDecay, nesterov); - } - - /** - * Creates a new SGD optimizer with default options. - * - * @param namedParameters Map of parameter names to tensors to be optimized - * @param learningRate The learning rate for the optimizer - * @return new {@link SGD} object - */ - public static SGD create(Map namedParameters, double learningRate) { - return create(namedParameters, learningRate, 0.0, 0.0, 0.0, false); - } - - /** - * Performs a single optimization step using the provided gradients. 
- * - * @param namedGradients Map of parameter names to gradient tensors - */ - public void step(Map namedGradients) { - if (!mHybridData.isValid()) { - throw new IllegalStateException("SGD optimizer has been destroyed"); - } - stepNative(namedGradients); - } - - @DoNotStrip - private native void stepNative(Map namedGradients); -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.kt new file mode 100644 index 00000000000..e4aa5373498 --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/SGD.kt @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch.training + +import com.facebook.jni.HybridData +import com.facebook.jni.annotations.DoNotStrip +import com.facebook.soloader.nativeloader.NativeLoader +import com.facebook.soloader.nativeloader.SystemDelegate +import org.pytorch.executorch.Tensor +import org.pytorch.executorch.annotations.Experimental + +/** + * Kotlin wrapper for ExecuTorch SGD Optimizer. + * + * Warning: These APIs are experimental and subject to change without notice + */ +@Experimental +class SGD +private constructor( + namedParameters: Map, + learningRate: Double, + momentum: Double, + dampening: Double, + weightDecay: Double, + nesterov: Boolean, +) { + + private val mHybridData: HybridData = + initHybrid(namedParameters, learningRate, momentum, dampening, weightDecay, nesterov) + + /** + * Performs a single optimization step using the provided gradients. 
+ * + * @param namedGradients Map of parameter names to gradient tensors + */ + fun step(namedGradients: Map) { + check(mHybridData.isValid) { "SGD optimizer has been destroyed" } + stepNative(namedGradients) + } + + @DoNotStrip private external fun stepNative(namedGradients: Map) + + companion object { + init { + if (!NativeLoader.isInitialized()) { + NativeLoader.init(SystemDelegate()) + } + NativeLoader.loadLibrary("executorch") + } + + @DoNotStrip + @JvmStatic + private external fun initHybrid( + namedParameters: Map, + learningRate: Double, + momentum: Double, + dampening: Double, + weightDecay: Double, + nesterov: Boolean, + ): HybridData + + /** + * Creates a new SGD optimizer with the specified parameters and options. + * + * @param namedParameters Map of parameter names to tensors to be optimized + * @param learningRate The learning rate for the optimizer + * @param momentum The momentum value + * @param dampening The dampening value + * @param weightDecay The weight decay value + * @param nesterov Whether to use Nesterov momentum + * @return new [SGD] object + */ + @JvmStatic + fun create( + namedParameters: Map, + learningRate: Double, + momentum: Double, + dampening: Double, + weightDecay: Double, + nesterov: Boolean, + ): SGD = SGD(namedParameters, learningRate, momentum, dampening, weightDecay, nesterov) + + /** + * Creates a new SGD optimizer with default options. 
+ * + * @param namedParameters Map of parameter names to tensors to be optimized + * @param learningRate The learning rate for the optimizer + * @return new [SGD] object + */ + @JvmStatic + fun create(namedParameters: Map, learningRate: Double): SGD = + create(namedParameters, learningRate, 0.0, 0.0, 0.0, false) + } +} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.java deleted file mode 100644 index dd2d5a37de2..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.java +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (c) Meta Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch.training; - -import com.facebook.jni.HybridData; -import com.facebook.jni.annotations.DoNotStrip; -import com.facebook.soloader.nativeloader.NativeLoader; -import com.facebook.soloader.nativeloader.SystemDelegate; -import java.io.Closeable; -import java.util.Map; -import java.util.concurrent.locks.ReentrantLock; -import org.pytorch.executorch.EValue; -import org.pytorch.executorch.ExecuTorchRuntime; -import org.pytorch.executorch.Tensor; -import org.pytorch.executorch.annotations.Experimental; - -/** - * Java wrapper for ExecuTorch TrainingModule. - * - *

Warning: These APIs are experimental and subject to change without notice - */ -@Experimental -public class TrainingModule implements Closeable { - - static { - if (!NativeLoader.isInitialized()) { - NativeLoader.init(new SystemDelegate()); - } - // Loads libexecutorch.so from jniLibs - NativeLoader.loadLibrary("executorch"); - } - - private final HybridData mHybridData; - private final ReentrantLock mLock = new ReentrantLock(); - private volatile boolean mDestroyed = false; - - @DoNotStrip - private static native HybridData initHybrid(String moduleAbsolutePath, String dataAbsolutePath); - - private TrainingModule(String moduleAbsolutePath, String dataAbsolutePath) { - mHybridData = initHybrid(moduleAbsolutePath, dataAbsolutePath); - } - - private void checkNotDestroyed() { - if (mDestroyed) throw new IllegalStateException("TrainingModule has been destroyed"); - } - - /** - * Loads a serialized ExecuTorch Training Module from the specified path on the disk. - * - * @param modelPath path to file that contains the serialized ExecuTorch module. - * @param dataPath path to file that contains the ExecuTorch module external weights. - * @return new {@link TrainingModule} object which owns the model module. - */ - public static TrainingModule load(final String modelPath, final String dataPath) { - ExecuTorchRuntime.validateFilePath(modelPath, "model path"); - ExecuTorchRuntime.validateFilePath(dataPath, "data path"); - return new TrainingModule(modelPath, dataPath); - } - - /** - * Loads a serialized ExecuTorch training module from the specified path on the disk. - * - * @param modelPath path to file that contains the serialized ExecuTorch module. This PTE does not - * rely on external weights. - * @return new {@link TrainingModule} object which owns the model module. 
- */ - public static TrainingModule load(final String modelPath) { - ExecuTorchRuntime.validateFilePath(modelPath, "model path"); - return new TrainingModule(modelPath, ""); - } - - /** - * Runs the specified joint-graph method of this module with the specified arguments. - * - * @param methodName name of the ExecuTorch method to run. - * @param inputs arguments that will be passed to ExecuTorch method. - * @return return value(s) from the method. - */ - public EValue[] executeForwardBackward(String methodName, EValue... inputs) { - mLock.lock(); - try { - checkNotDestroyed(); - return executeForwardBackwardNative(methodName, inputs); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native EValue[] executeForwardBackwardNative(String methodName, EValue... inputs); - - public Map namedParameters(String methodName) { - mLock.lock(); - try { - checkNotDestroyed(); - return namedParametersNative(methodName); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native Map namedParametersNative(String methodName); - - public Map namedGradients(String methodName) { - mLock.lock(); - try { - checkNotDestroyed(); - return namedGradientsNative(methodName); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native Map namedGradientsNative(String methodName); - - @Override - public void close() { - if (mLock.tryLock()) { - try { - if (!mDestroyed) { - mDestroyed = true; - mHybridData.resetNative(); - } - } finally { - mLock.unlock(); - } - } else { - throw new IllegalStateException("Cannot close module while method is executing"); - } - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt new file mode 100644 index 00000000000..4caa4635fdd --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt @@ -0,0 
+1,144 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch.training + +import com.facebook.jni.HybridData +import com.facebook.jni.annotations.DoNotStrip +import com.facebook.soloader.nativeloader.NativeLoader +import com.facebook.soloader.nativeloader.SystemDelegate +import java.io.Closeable +import java.util.concurrent.locks.ReentrantLock +import org.pytorch.executorch.EValue +import org.pytorch.executorch.ExecuTorchRuntime +import org.pytorch.executorch.Tensor +import org.pytorch.executorch.annotations.Experimental + +/** + * Kotlin wrapper for ExecuTorch TrainingModule. + * + * Warning: These APIs are experimental and subject to change without notice + */ +@Experimental +class TrainingModule +private constructor(moduleAbsolutePath: String, dataAbsolutePath: String) : Closeable { + + private val mHybridData: HybridData = initHybrid(moduleAbsolutePath, dataAbsolutePath) + private val mLock = ReentrantLock() + + @Volatile private var mDestroyed = false + + private fun checkNotDestroyed() { + check(!mDestroyed) { "TrainingModule has been destroyed" } + } + + /** + * Runs the specified joint-graph method of this module with the specified arguments. + * + * @param methodName name of the ExecuTorch method to run. + * @param inputs arguments that will be passed to ExecuTorch method. + * @return return value(s) from the method. 
+ */ + fun executeForwardBackward(methodName: String, vararg inputs: EValue): Array { + mLock.lock() + try { + checkNotDestroyed() + return executeForwardBackwardNative(methodName, *inputs) + } finally { + mLock.unlock() + } + } + + @DoNotStrip + private external fun executeForwardBackwardNative( + methodName: String, + vararg inputs: EValue, + ): Array + + fun namedParameters(methodName: String): Map { + mLock.lock() + try { + checkNotDestroyed() + return namedParametersNative(methodName) + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun namedParametersNative(methodName: String): Map + + fun namedGradients(methodName: String): Map { + mLock.lock() + try { + checkNotDestroyed() + return namedGradientsNative(methodName) + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun namedGradientsNative(methodName: String): Map + + override fun close() { + if (mLock.tryLock()) { + try { + if (!mDestroyed) { + mDestroyed = true + mHybridData.resetNative() + } + } finally { + mLock.unlock() + } + } else { + throw IllegalStateException("Cannot close module while method is executing") + } + } + + companion object { + init { + if (!NativeLoader.isInitialized()) { + NativeLoader.init(SystemDelegate()) + } + NativeLoader.loadLibrary("executorch") + } + + @DoNotStrip + @JvmStatic + private external fun initHybrid( + moduleAbsolutePath: String, + dataAbsolutePath: String, + ): HybridData + + /** + * Loads a serialized ExecuTorch Training Module from the specified path on the disk. + * + * @param modelPath path to file that contains the serialized ExecuTorch module. + * @param dataPath path to file that contains the ExecuTorch module external weights. + * @return new [TrainingModule] object which owns the model module. 
+ */ + @JvmStatic + fun load(modelPath: String, dataPath: String): TrainingModule { + ExecuTorchRuntime.validateFilePath(modelPath, "model path") + ExecuTorchRuntime.validateFilePath(dataPath, "data path") + return TrainingModule(modelPath, dataPath) + } + + /** + * Loads a serialized ExecuTorch training module from the specified path on the disk. + * + * @param modelPath path to file that contains the serialized ExecuTorch module. This PTE does + * not rely on external weights. + * @return new [TrainingModule] object which owns the model module. + */ + @JvmStatic + fun load(modelPath: String): TrainingModule { + ExecuTorchRuntime.validateFilePath(modelPath, "model path") + return TrainingModule(modelPath, "") + } + } +} From 53fa4dd54b437b3e2e9f46926280df1d55509b33 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 May 2026 16:47:49 -0700 Subject: [PATCH 050/103] Fix `TrainingModule` class declaration formatting Differential Revision: D106574405 Pull Request resolved: https://github.com/pytorch/executorch/pull/19830 --- .../java/org/pytorch/executorch/training/TrainingModule.kt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt index 4caa4635fdd..5556b0c16c4 100644 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/training/TrainingModule.kt @@ -25,8 +25,8 @@ import org.pytorch.executorch.annotations.Experimental * Warning: These APIs are experimental and subject to change without notice */ @Experimental -class TrainingModule -private constructor(moduleAbsolutePath: String, dataAbsolutePath: String) : Closeable { +class TrainingModule private 
constructor(moduleAbsolutePath: String, dataAbsolutePath: String) : + Closeable { private val mHybridData: HybridData = initHybrid(moduleAbsolutePath, dataAbsolutePath) private val mLock = ReentrantLock() From d8d706abf3a6397f61885ef74ae5c06bdd0cca7a Mon Sep 17 00:00:00 2001 From: YIWENX14 <164585414+YIWENX14@users.noreply.github.com> Date: Wed, 27 May 2026 18:35:38 -0700 Subject: [PATCH 051/103] Preserve model dtype when swapping weightless RMSNorm to RMSNormCoreML (#19786) Differential Revision: D106400668 Pull Request resolved: https://github.com/pytorch/executorch/pull/19786 --- examples/models/llama/norm.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/examples/models/llama/norm.py b/examples/models/llama/norm.py index ec92b353eb4..0b6ed7f5b01 100644 --- a/examples/models/llama/norm.py +++ b/examples/models/llama/norm.py @@ -154,6 +154,14 @@ def replace_rms_norm_for_coreml_(model: torch.nn.Module) -> torch.nn.Module: # Preserve trained scale (no-op for ScalelessRMSNorm). if getattr(mod, "weight", None) is not None: new.weight = mod.weight + else: + # Source was weightless (e.g. ScalelessRMSNorm). The freshly-allocated + # `nn.Parameter(torch.ones(dim))` inside RMSNormCoreML defaults to fp32, + # which causes an fp32 leak in fp16 export. Match the model's existing + # parameter dtype/device. + ref = next((p for p in model.parameters() if p.is_floating_point()), None) + if ref is not None: + new.to(dtype=ref.dtype, device=ref.device) # Locate parent module via the dotted name and rebind the attribute. if "." 
in name: parent_name, attr = name.rsplit(".", 1) From 7fd21f2b5877e0e14c73283827472b37a8f5148e Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 May 2026 21:03:13 -0700 Subject: [PATCH 052/103] Convert Module from Java to Kotlin (#19821) Differential Revision: D106415170 Pull Request resolved: https://github.com/pytorch/executorch/pull/19821 --- extension/android/BUCK | 2 +- .../java/org/pytorch/executorch/Module.java | 315 ------------------ .../java/org/pytorch/executorch/Module.kt | 267 +++++++++++++++ 3 files changed, 268 insertions(+), 316 deletions(-) delete mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java create mode 100644 extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.kt diff --git a/extension/android/BUCK b/extension/android/BUCK index 170c826f40f..92cb7c8c040 100644 --- a/extension/android/BUCK +++ b/extension/android/BUCK @@ -13,7 +13,7 @@ non_fbcode_target(_kind = fb_android_library, "executorch_android/src/main/java/org/pytorch/executorch/ExecuTorchRuntime.kt", "executorch_android/src/main/java/org/pytorch/executorch/ExecutorchRuntimeException.kt", "executorch_android/src/main/java/org/pytorch/executorch/MethodMetadata.kt", - "executorch_android/src/main/java/org/pytorch/executorch/Module.java", + "executorch_android/src/main/java/org/pytorch/executorch/Module.kt", "executorch_android/src/main/java/org/pytorch/executorch/Tensor.java", "executorch_android/src/main/java/org/pytorch/executorch/annotations/Experimental.kt", ], diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java deleted file mode 100644 index 94a3ed8d160..00000000000 --- a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.java +++ /dev/null @@ -1,315 +0,0 @@ -/* - * Copyright (c) Meta 
Platforms, Inc. and affiliates. - * All rights reserved. - * - * This source code is licensed under the BSD-style license found in the - * LICENSE file in the root directory of this source tree. - */ - -package org.pytorch.executorch; - -import com.facebook.jni.HybridData; -import com.facebook.jni.annotations.DoNotStrip; -import com.facebook.soloader.nativeloader.NativeLoader; -import com.facebook.soloader.nativeloader.SystemDelegate; -import java.io.Closeable; -import java.util.HashMap; -import java.util.Map; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReentrantLock; -import org.pytorch.executorch.annotations.Experimental; - -/** - * Java wrapper for ExecuTorch Module. - * - *

Warning: These APIs are experimental and subject to change without notice - */ -@Experimental -public class Module implements Closeable { - - static { - if (!NativeLoader.isInitialized()) { - NativeLoader.init(new SystemDelegate()); - } - // Loads libexecutorch.so from jniLibs - NativeLoader.loadLibrary("executorch"); - } - - /** Load mode for the module. Load the whole file as a buffer. */ - public static final int LOAD_MODE_FILE = 0; - - /** Load mode for the module. Use mmap to load pages into memory. */ - public static final int LOAD_MODE_MMAP = 1; - - /** Load mode for the module. Use memory locking and handle errors. */ - public static final int LOAD_MODE_MMAP_USE_MLOCK = 2; - - /** Load mode for the module. Use memory locking and ignore errors. */ - public static final int LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3; - - private final HybridData mHybridData; - - private final Map mMethodMetadata; - - @DoNotStrip - private static native HybridData initHybrid( - String moduleAbsolutePath, int loadMode, int numThreads); - - private Module(String moduleAbsolutePath, int loadMode, int numThreads) { - ExecuTorchRuntime runtime = ExecuTorchRuntime.getRuntime(); - - mHybridData = initHybrid(moduleAbsolutePath, loadMode, numThreads); - - mMethodMetadata = populateMethodMeta(); - } - - private Map populateMethodMeta() { - String[] methods = getMethods(); - Map metadata = new HashMap(); - for (String name : methods) { - metadata.put(name, new MethodMetadata(name, getUsedBackends(name))); - } - return metadata; - } - - /** Lock protecting the non-thread safe methods in mHybridData. */ - private Lock mLock = new ReentrantLock(); - - /** - * Loads a serialized ExecuTorch module from the specified path on the disk. - * - * @param modelPath path to file that contains the serialized ExecuTorch module. - * @param loadMode load mode for the module. See constants in {@link Module}. - * @return new {@link org.pytorch.executorch.Module} object which owns the model module. 
- */ - public static Module load(final String modelPath, int loadMode) { - return load(modelPath, loadMode, 0); - } - - /** - * Loads a serialized ExecuTorch module from the specified path on the disk. - * - * @param modelPath path to file that contains the serialized ExecuTorch module. - * @param loadMode load mode for the module. See constants in {@link Module}. - * @param numThreads the number of threads to use for inference. A value of 0 defaults to a - * hardware-specific default. - * @return new {@link org.pytorch.executorch.Module} object which owns the model module. - */ - public static Module load(final String modelPath, int loadMode, int numThreads) { - ExecuTorchRuntime.validateFilePath(modelPath, "model path"); - return new Module(modelPath, loadMode, numThreads); - } - - /** - * Loads a serialized ExecuTorch module from the specified path on the disk to run on CPU. - * - * @param modelPath path to file that contains the serialized ExecuTorch module. - * @return new {@link org.pytorch.executorch.Module} object which owns the model module. - */ - public static Module load(final String modelPath) { - return load(modelPath, LOAD_MODE_FILE); - } - - /** - * Runs the 'forward' method of this module with the specified arguments. - * - * @param inputs arguments for the ExecuTorch module's 'forward' method. Note: if method 'forward' - * requires inputs but no inputs are given, the function will not error out, but run 'forward' - * with sample inputs. - * @return return value from the 'forward' method. - */ - public EValue[] forward(EValue... inputs) { - return execute("forward", inputs); - } - - /** - * Runs the specified method of this module with the specified arguments. - * - * @param methodName name of the ExecuTorch method to run. - * @param inputs arguments that will be passed to ExecuTorch method. - * @return return value from the method. - */ - public EValue[] execute(String methodName, EValue... 
inputs) { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - return executeNative(methodName, inputs); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native EValue[] executeNative(String methodName, EValue... inputs); - - /** - * Load a method on this module. This might help with the first time inference performance, - * because otherwise the method is loaded lazily when it's execute. Note: this function is - * synchronous, and will block until the method is loaded. Therefore, it is recommended to call - * this on a background thread. However, users need to make sure that they don't execute before - * this function returns. - */ - public void loadMethod(String methodName) { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - int errorCode = loadMethodNative(methodName); - if (errorCode != 0) { - throw new ExecutorchRuntimeException(errorCode, "Failed to load method: " + methodName); - } - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native int loadMethodNative(String methodName); - - /** - * Returns the names of the backends in a certain method. - * - * @param methodName method name to query - * @return an array of backend name - */ - @DoNotStrip - private native String[] getUsedBackends(String methodName); - - /** - * Returns the names of methods. 
- * - * @return name of methods in this Module - */ - public String[] getMethods() { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - return getMethodsNative(); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native String[] getMethodsNative(); - - /** - * Get the corresponding @MethodMetadata for a method - * - * @param name method name - * @return @MethodMetadata for this method - */ - public MethodMetadata getMethodMetadata(String name) { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - MethodMetadata methodMetadata = mMethodMetadata.get(name); - if (methodMetadata == null) { - throw new IllegalArgumentException("method " + name + " does not exist for this module"); - } - return methodMetadata; - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private static native String[] readLogBufferStaticNative(); - - public static String[] readLogBufferStatic() { - return readLogBufferStaticNative(); - } - - /** Retrieve the in-memory log buffer, containing the most recent ExecuTorch log entries. */ - public String[] readLogBuffer() { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - return readLogBufferNative(); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native String[] readLogBufferNative(); - - /** - * Dump the ExecuTorch ETRecord file to /data/local/tmp/result.etdump. - * - *

Currently for internal (minibench) use only. - * - * @return true if the etdump was successfully written, false otherwise. - */ - @Experimental - public boolean etdump() { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - return etdumpNative(); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native boolean etdumpNative(); - - /** - * Dump the ExecuTorch ETDump file to {@code outputPath}. - * - * @param outputPath absolute path to write the etdump file to. - * @return true if the etdump was successfully written, false otherwise. - */ - @Experimental - public boolean etdump(String outputPath) { - mLock.lock(); - try { - if (!mHybridData.isValid()) { - throw new IllegalStateException("Module has been destroyed"); - } - return etdumpToNative(outputPath); - } finally { - mLock.unlock(); - } - } - - @DoNotStrip - private native boolean etdumpToNative(String outputPath); - - /** - * Explicitly destroys the native Module object. Calling this method is not required, as the - * native object will be destroyed when this object is garbage-collected. However, the timing of - * garbage collection is not guaranteed, so proactively calling {@code destroy} can free memory - * more quickly. See {@link com.facebook.jni.HybridData#resetNative}. 
- */ - public void destroy() { - if (mLock.tryLock()) { - try { - if (mHybridData.isValid()) { - mHybridData.resetNative(); - } - } finally { - mLock.unlock(); - } - } else { - throw new IllegalStateException("Cannot destroy module while method is executing"); - } - } - - @Override - public void close() { - destroy(); - } -} diff --git a/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.kt b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.kt new file mode 100644 index 00000000000..15f8dbbc992 --- /dev/null +++ b/extension/android/executorch_android/src/main/java/org/pytorch/executorch/Module.kt @@ -0,0 +1,267 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +package org.pytorch.executorch + +import com.facebook.jni.HybridData +import com.facebook.jni.annotations.DoNotStrip +import com.facebook.soloader.nativeloader.NativeLoader +import com.facebook.soloader.nativeloader.SystemDelegate +import java.io.Closeable +import java.util.concurrent.locks.ReentrantLock +import org.pytorch.executorch.annotations.Experimental + +/** + * Java wrapper for ExecuTorch Module. + * + * Warning: These APIs are experimental and subject to change without notice + */ +@Experimental +open class Module private constructor(moduleAbsolutePath: String, loadMode: Int, numThreads: Int) : + Closeable { + + private val mHybridData: HybridData + private val mMethodMetadata: Map + + /** Lock protecting the non-thread safe methods in mHybridData. 
*/ + private val mLock = ReentrantLock() + + init { + ExecuTorchRuntime.getRuntime() + mHybridData = initHybrid(moduleAbsolutePath, loadMode, numThreads) + mMethodMetadata = populateMethodMeta() + } + + private fun populateMethodMeta(): Map { + val methods = getMethodsNative() + val metadata = HashMap() + for (name in methods) { + metadata[name] = MethodMetadata(name, getUsedBackends(name)) + } + return metadata + } + + /** + * Runs the 'forward' method of this module with the specified arguments. + * + * @param inputs arguments for the ExecuTorch module's 'forward' method. Note: if method 'forward' + * requires inputs but no inputs are given, the function will not error out, but run 'forward' + * with sample inputs. + * @return return value from the 'forward' method. + */ + open fun forward(vararg inputs: EValue): Array = execute("forward", *inputs) + + /** + * Runs the specified method of this module with the specified arguments. + * + * @param methodName name of the ExecuTorch method to run. + * @param inputs arguments that will be passed to ExecuTorch method. + * @return return value from the method. + */ + open fun execute(methodName: String, vararg inputs: EValue): Array { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + return executeNative(methodName, *inputs) + } finally { + mLock.unlock() + } + } + + @DoNotStrip + private external fun executeNative(methodName: String, vararg inputs: EValue): Array + + /** + * Load a method on this module. This might help with the first time inference performance, + * because otherwise the method is loaded lazily when it's execute. Note: this function is + * synchronous, and will block until the method is loaded. Therefore, it is recommended to call + * this on a background thread. However, users need to make sure that they don't execute before + * this function returns. 
+ */ + open fun loadMethod(methodName: String) { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + val errorCode = loadMethodNative(methodName) + if (errorCode != 0) { + throw ExecutorchRuntimeException(errorCode, "Failed to load method: $methodName") + } + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun loadMethodNative(methodName: String): Int + + /** + * Returns the names of the backends in a certain method. + * + * @param methodName method name to query + * @return an array of backend name + */ + @DoNotStrip private external fun getUsedBackends(methodName: String): Array + + /** + * Returns the names of methods. + * + * @return name of methods in this Module + */ + open fun getMethods(): Array { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + return getMethodsNative() + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun getMethodsNative(): Array + + /** + * Get the corresponding [MethodMetadata] for a method + * + * @param name method name + * @return [MethodMetadata] for this method + */ + open fun getMethodMetadata(name: String): MethodMetadata { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + return mMethodMetadata[name] + ?: throw IllegalArgumentException("method $name does not exist for this module") + } finally { + mLock.unlock() + } + } + + /** Retrieve the in-memory log buffer, containing the most recent ExecuTorch log entries. */ + open fun readLogBuffer(): Array? { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + return readLogBufferNative() + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun readLogBufferNative(): Array? + + /** + * Dump the ExecuTorch ETRecord file to /data/local/tmp/result.etdump. + * + * Currently for internal (minibench) use only. + * + * @return true if the etdump was successfully written, false otherwise. 
+ */ + @Experimental + open fun etdump(): Boolean { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + return etdumpNative() + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun etdumpNative(): Boolean + + /** + * Dump the ExecuTorch ETDump file to [outputPath]. + * + * @param outputPath absolute path to write the etdump file to. + * @return true if the etdump was successfully written, false otherwise. + */ + @Experimental + open fun etdump(outputPath: String): Boolean { + mLock.lock() + try { + check(mHybridData.isValid) { "Module has been destroyed" } + return etdumpToNative(outputPath) + } finally { + mLock.unlock() + } + } + + @DoNotStrip private external fun etdumpToNative(outputPath: String): Boolean + + /** + * Explicitly destroys the native Module object. Calling this method is not required, as the + * native object will be destroyed when this object is garbage-collected. However, the timing of + * garbage collection is not guaranteed, so proactively calling `destroy` can free memory more + * quickly. See [com.facebook.jni.HybridData.resetNative]. + */ + open fun destroy() { + if (mLock.tryLock()) { + try { + if (mHybridData.isValid) { + mHybridData.resetNative() + } + } finally { + mLock.unlock() + } + } else { + throw IllegalStateException("Cannot destroy module while method is executing") + } + } + + override fun close() { + destroy() + } + + companion object { + init { + if (!NativeLoader.isInitialized()) { + NativeLoader.init(SystemDelegate()) + } + NativeLoader.loadLibrary("executorch") + } + + /** Load mode for the module. Load the whole file as a buffer. */ + const val LOAD_MODE_FILE = 0 + + /** Load mode for the module. Use mmap to load pages into memory. */ + const val LOAD_MODE_MMAP = 1 + + /** Load mode for the module. Use memory locking and handle errors. */ + const val LOAD_MODE_MMAP_USE_MLOCK = 2 + + /** Load mode for the module. Use memory locking and ignore errors. 
*/ + const val LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS = 3 + + /** + * Loads a serialized ExecuTorch module from the specified path on the disk. + * + * @param modelPath path to file that contains the serialized ExecuTorch module. + * @param loadMode load mode for the module. See constants in [Module]. + * @param numThreads the number of threads to use for inference. A value of 0 defaults to a + * hardware-specific default. + * @return new [Module] object which owns the model module. + */ + @JvmStatic + @JvmOverloads + fun load(modelPath: String?, loadMode: Int = LOAD_MODE_FILE, numThreads: Int = 0): Module { + ExecuTorchRuntime.validateFilePath(modelPath, "model path") + return Module(modelPath!!, loadMode, numThreads) + } + + @DoNotStrip + @JvmStatic + private external fun initHybrid( + moduleAbsolutePath: String, + loadMode: Int, + numThreads: Int, + ): HybridData + + @DoNotStrip @JvmStatic fun readLogBufferStatic(): Array? = readLogBufferStaticNative() + + @DoNotStrip @JvmStatic private external fun readLogBufferStaticNative(): Array? + } +} From 7c0f60a8c3e7f4c1fcc46667e669ac9eb0dffa5f Mon Sep 17 00:00:00 2001 From: Martin Pavella Date: Thu, 28 May 2026 08:10:55 +0200 Subject: [PATCH 053/103] NXP backend: Add `tanh` support with new Neutron flow. (#19753) ### Summary Add `tanh` support with new Neutron flow. ### Test plan Unit tests provided. 
cc @robert-kalmar @JakeStevens @digantdesai @rascani --- .../ops_converters/tanh_converter.py | 32 ++++++- .../node_converter/test_tanh_converter.py | 95 +++++++++++++++++-- backends/nxp/tests/models.py | 9 +- backends/nxp/tests/ops_aliases.py | 2 + 4 files changed, 129 insertions(+), 9 deletions(-) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/tanh_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/tanh_converter.py index 427865f8ee7..54192628e24 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/tanh_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/tanh_converter.py @@ -1,8 +1,10 @@ -# Copyright 2025 NXP +# Copyright 2025-2026 NXP # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. +import torch + from executorch.backends.nxp.backend.custom_delegation_options import ( CustomDelegationOptions, ) @@ -10,6 +12,8 @@ from executorch.backends.nxp.backend.ir.lib.tflite.BuiltinOperator import ( BuiltinOperator, ) + +from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node from torch.nn import Parameter @@ -24,7 +28,33 @@ def _is_supported_in_IR( ) -> bool: return True + @staticmethod + def _is_supported_on_target( + node: Node, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + custom_delegation_options: CustomDelegationOptions, + ) -> bool: + if custom_delegation_options.use_new_flow_neutron_c: + # Requirements specified by the new Neutron flow documentation. + + if not NodeConverter.uses_quantization_type_for_io( + node, + supported_types=[torch.int8, torch.uint8], + input_indices=[0], + output_indices=[0], + ): + return False + + return True + def convert(self, node: Node): + """Convert the `aten.tanh` operator to NeutronIR `Tanh`. 
+ The ExecuTorch schema is: + tanh( + Tensor self + ) -> Tensor + """ self.assert_convertible(node) t_op = self._create_tflite_op_with_io_tensors(node) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py index 10892d28e38..ba2f5bf07d1 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_tanh_converter.py @@ -1,4 +1,4 @@ -# Copyright 2025 NXP +# Copyright 2025-2026 NXP # All rights reserved. # # This source code is licensed under the BSD-style license found in the @@ -8,9 +8,13 @@ import kgb import numpy as np + +# noinspection PyUnusedImports +import pytest import torch from executorch.backends.nxp.nxp_backend import EdgeProgramToIRConverter +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -18,10 +22,13 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier from executorch.backends.nxp.tests.models import Conv2dWithActivation -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import Convolution, Tanh, Tanh_ from parameterized import parameterized from torch.export import ExportedProgram +from executorch.backends.nxp.tests.use_qat import * # noqa F403 class TestTanhConverter(unittest.TestCase): @@ -73,10 +80,7 @@ def test_conv_tanh( lowered_module_graph = ( quantized_program.graph_module.lowered_module_0.original_module.graph ) - tanh_ops = [ - exir_ops.edge.aten.tanh.default, - exir_ops.edge.aten.tanh_.default, - ] + tanh_ops = [Tanh, Tanh_] assert 
graph_contains_any_of_ops(graph=lowered_module_graph, ops=tanh_ops) input_data = (np.random.random(input_shape) * 50).astype(np.int8) @@ -88,3 +92,82 @@ def test_conv_tanh( input_data=input_data, atol=2.0, ) + + +class TanhModule(torch.nn.Module): + def __init__(self, inplace: bool = False): + super().__init__() + self.inplace = inplace + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.inplace: + return torch.tanh_(x) + else: + return torch.tanh(x) + + +class TestTanhNewNeutronFlow: + + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, + model, + input_shape, + mocker, + use_qat=False, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {Tanh: 1} + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + use_qat=use_qat, + use_new_flow_neutron_c=True, # Use the new flow. 
+ ) + + @pytest.fixture(params=[True, False], ids=lambda inplace: f"inplace = {inplace}") + def inplace(self, request): + return request.param + + def test__qat__inplace(self, mocker, use_qat, inplace): + shape = (23,) + model = TanhModule(inplace) + self.assert_delegated(model, shape, mocker, use_qat=use_qat) + + @pytest.mark.parametrize( + "shape", + [ + (16,), + (3, 5), + (2, 3, 4), + (2, 3, 4, 5), + (2, 3, 2, 3, 2), + ], + ids=lambda shape: f"{len(shape)}D", + ) + def test__shapes(self, mocker, shape): + model = TanhModule() + self.assert_delegated(model, shape, mocker) + + def test__with_convolution(self, mocker): + input_shape = (1, 3, 12, 16) + channels = input_shape[1] + model = Conv2dWithActivation( + activation=torch.tanh, in_channels=channels, out_channels=channels + ) + self.assert_delegated( + model, input_shape, mocker, expected_delegated_ops={Tanh: 1, Convolution: 1} + ) diff --git a/backends/nxp/tests/models.py b/backends/nxp/tests/models.py index 1292c4cf17d..0383734b4dd 100644 --- a/backends/nxp/tests/models.py +++ b/backends/nxp/tests/models.py @@ -456,11 +456,16 @@ def forward(self, x): class Conv2dWithActivation(torch.nn.Module): - def __init__(self, activation: torch.nn.Module | Callable, in_channels: int = 3): + def __init__( + self, + activation: torch.nn.Module | Callable, + in_channels: int = 3, + out_channels: int = 64, + ): super().__init__() self.conv = torch.nn.Conv2d( - in_channels=in_channels, out_channels=64, kernel_size=(3, 3) + in_channels=in_channels, out_channels=out_channels, kernel_size=(3, 3) ) self.activation = activation diff --git a/backends/nxp/tests/ops_aliases.py b/backends/nxp/tests/ops_aliases.py index 06eb9c84bd0..78a2ac10f55 100644 --- a/backends/nxp/tests/ops_aliases.py +++ b/backends/nxp/tests/ops_aliases.py @@ -39,6 +39,8 @@ SqueezeDim = exir_ops.edge.aten.squeeze.dim SqueezeDims = exir_ops.edge.aten.squeeze.dims SubTensor = exir_ops.edge.aten.sub.Tensor +Tanh = exir_ops.edge.aten.tanh.default +Tanh_ = 
exir_ops.edge.aten.tanh_.default Unsqueeze = exir_ops.edge.aten.unsqueeze.default UpsampleBilinear2D = exir_ops.edge.aten.upsample_bilinear2d.vec UpsampleNearest2D = exir_ops.edge.aten.upsample_nearest2d.vec From f59ac9d1e9ccea7a7e4ecb974c5d72051034f9b0 Mon Sep 17 00:00:00 2001 From: Martin Pavella Date: Thu, 28 May 2026 08:18:00 +0200 Subject: [PATCH 054/103] NXP backend: Enable `aten.div.Tensor` with new Neutron flow. (#19802) ### Summary Enable `aten.div.Tensor` with new Neutron flow. ### Test plan Unit tests provided. cc @robert-kalmar @JakeStevens @digantdesai @rascani --- .../generic_tests/test_convert_div_to_mul.py | 62 ++++++++++++++++++- 1 file changed, 61 insertions(+), 1 deletion(-) diff --git a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py index ee89d5d5619..9201f32349f 100644 --- a/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py +++ b/backends/nxp/tests/generic_tests/test_convert_div_to_mul.py @@ -6,6 +6,7 @@ import numpy as np import pytest import torch + from executorch.backends.nxp.aten_passes.neutron_aten_pass_manager import ( ConvertDivToMulPass, NeutronAtenPassManager, @@ -13,6 +14,7 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import ( neutron_target_spec, to_quantized_edge_program, @@ -21,11 +23,13 @@ convert_run_compare, graph_contains_any_of_ops, ) - +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier from executorch.backends.nxp.tests.models import ( NonstaticDivLinearModel, StaticDivLinearModel, ) +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import MulTensor from executorch.exir.dialects._ops import ops as exir_ops from torch.export import ExportedProgram @@ -248,3 
+252,59 @@ def test_convert_div_to_mul_full_pipeline(mocker, input_shape, is_scalar): input_data=example_input, tfl_model=neutron_ir_model, ) + + +class StaticDivModel(torch.nn.Module): + def __init__(self, divisor): + super().__init__() + self.divisor = divisor + + def forward(self, x): + return x / self.divisor + + +class TestConvertDivToMulNewNeutronFlow: + + @pytest.mark.parametrize( + "input_shape", + [ + (23,), + (3, 7), + (2, 3, 4), + (1, 2, 3, 4), + (1, 2, 3, 2, 1), + ], + ids=lambda shape: f"{len(shape)}D", + ) + @pytest.mark.parametrize( + "is_scalar", + [False, True], + ids=lambda is_scalar: "scalar" if is_scalar else "tensor", + ) + def test__static__full_pipeline( + self, mocker, input_shape: tuple[int, ...], is_scalar: bool + ): + if is_scalar: + divisor = np.random.uniform(0.01, 15) + model = StaticDivModel(divisor) + else: + divisor = torch.rand(input_shape) + 0.01 + model = StaticDivModel(divisor) + + graph_verifier = DetailedGraphVerifier( + mocker, + # By the time `DetailedGraphVerifier` checks for operators, the `div` has already been replaced by `mul`. + expected_delegated_ops={MulTensor: 1}, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + use_new_flow_neutron_c=True, # Use the new flow. + ) From b48a457a783f490dcc012167ff3b9d6f93c22ed5 Mon Sep 17 00:00:00 2001 From: Sebastian Larsson <38941629+Sebastian-Larsson@users.noreply.github.com> Date: Thu, 28 May 2026 08:33:47 +0200 Subject: [PATCH 055/103] Arm backend: Remove Ethos-U core driver submodule (#19664) Use the Ethos-U scratch checkout as the source for core driver headers. Keep baremetal builds on the same driver copy as the Corstone platform flow, and remove the stale Arm third-party README entry. 
Signed-off-by: Sebastian Larsson --- .gitmodules | 3 --- backends/arm/CMakeLists.txt | 24 ++++++++++++++++---- backends/arm/README.md | 2 -- backends/arm/scripts/corstone_utils.cmake | 10 +++++--- backends/arm/third-party/ethos-u-core-driver | 1 - 5 files changed, 26 insertions(+), 14 deletions(-) delete mode 160000 backends/arm/third-party/ethos-u-core-driver diff --git a/.gitmodules b/.gitmodules index 917e755da27..0f4d09aa998 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,6 +1,3 @@ -[submodule "backends/arm/third-party/ethos-u-core-driver"] - path = backends/arm/third-party/ethos-u-core-driver - url = https://git.gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-core-driver.git [submodule "backends/vulkan/third-party/Vulkan-Headers"] path = backends/vulkan/third-party/Vulkan-Headers url = https://github.com/KhronosGroup/Vulkan-Headers diff --git a/backends/arm/CMakeLists.txt b/backends/arm/CMakeLists.txt index d8a6c1afce7..726fcfcd0d3 100644 --- a/backends/arm/CMakeLists.txt +++ b/backends/arm/CMakeLists.txt @@ -39,6 +39,11 @@ set(ETHOSU_LINUX_DRIVER_SOURCE_DIR PATH "Optional local path to an existing ethos-u-linux-driver stack checkout" ) +set(ETHOS_SDK_PATH + "${EXECUTORCH_ROOT}/examples/arm/arm-scratch/ethos-u" + CACHE PATH "Path to Ethos-U bare metal driver/env" +) +option(FETCH_ETHOS_U_CONTENT "Fetch ethos_u dependencies" ON) if(EXECUTORCH_BUILD_ARM_BAREMETAL AND EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) message( @@ -52,8 +57,6 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) add_compile_options("-Wall" "-Werror") - set(THIRD_PARTY_ROOT "${CMAKE_CURRENT_SOURCE_DIR}/third-party") - set(_arm_backend_sources backends/arm/runtime/EthosUBackend.cpp backends/arm/runtime/EthosUBackend_IoMemcpy.cpp @@ -72,11 +75,22 @@ if(EXECUTORCH_BUILD_ARM_BAREMETAL OR EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) executorch_delegate_ethos_u PRIVATE ${EXECUTORCH_ROOT}/backends/arm/runtime/EthosUBackend_Cortex_M.cpp ) - set(_ethosu_core_driver_include - 
"${THIRD_PARTY_ROOT}/ethos-u-core-driver/include" + include(${EXECUTORCH_ROOT}/backends/arm/scripts/corstone_utils.cmake) + if(FETCH_ETHOS_U_CONTENT) + fetch_ethos_u_content(${ETHOS_SDK_PATH} ${EXECUTORCH_ROOT}) + endif() + set(DRIVER_ETHOSU_INCLUDE_DIR + "${ETHOS_SDK_PATH}/core_software/core_driver/include" ) + if(NOT EXISTS "${DRIVER_ETHOSU_INCLUDE_DIR}/ethosu_driver.h") + message( + FATAL_ERROR + "Ethos-U core driver headers were not found in ${DRIVER_ETHOSU_INCLUDE_DIR}." + " Run examples/arm/setup.sh or enable FETCH_ETHOS_U_CONTENT." + ) + endif() target_include_directories( - executorch_delegate_ethos_u PRIVATE ${_ethosu_core_driver_include} + executorch_delegate_ethos_u PRIVATE ${DRIVER_ETHOSU_INCLUDE_DIR} ) target_link_libraries(executorch_delegate_ethos_u PUBLIC ethosu_core_driver) elseif(EXECUTORCH_BUILD_ARM_ETHOSU_LINUX) diff --git a/backends/arm/README.md b/backends/arm/README.md index f822077e170..237f2433cb5 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -61,8 +61,6 @@ backends/arm/ │ ├── models/ # Model level unit tests │ └── tester/ # Testing harnesses and utilities │ -├── third-party/ # External dependencies -│ ├── tosa/ # Shared TOSA backend implementation and dialect │ └── vgf/ # Implementations of VgfPartitioner and VgfBackend diff --git a/backends/arm/scripts/corstone_utils.cmake b/backends/arm/scripts/corstone_utils.cmake index 34f04ba1225..0ed1e4aea0f 100644 --- a/backends/arm/scripts/corstone_utils.cmake +++ b/backends/arm/scripts/corstone_utils.cmake @@ -8,6 +8,7 @@ function(fetch_ethos_u_content ETHOS_SDK_PATH ET_DIR_PATH) file(MAKE_DIRECTORY ${ETHOS_SDK_PATH}/../ethos_u) include(FetchContent) + find_package(Python3 REQUIRED COMPONENTS Interpreter) set(ethos_u_base_tag "26.02") FetchContent_Declare( ethos_u @@ -33,10 +34,13 @@ function(fetch_ethos_u_content ETHOS_SDK_PATH ET_DIR_PATH) "source backends/arm/scripts/utils.sh && patch_repo ${ETHOS_SDK_PATH} ${ethos_u_base_rev} ${patch_dir}" WORKING_DIRECTORY ${ET_DIR_PATH} 
) - # Get ethos_u externals only if core_platform folder does not already exist. - if(NOT EXISTS "${ETHOS_SDK_PATH}/core_platform") + + # Get ethos_u externals only if core driver headers do not already exist. + if(NOT EXISTS + "${ETHOS_SDK_PATH}/core_software/core_driver/include/ethosu_driver.h" + ) execute_process( - COMMAND ${PYTHON_EXECUTABLE} fetch_externals.py -c + COMMAND ${Python3_EXECUTABLE} fetch_externals.py -c ${ethos_u_base_tag}.json fetch WORKING_DIRECTORY ${ETHOS_SDK_PATH} ) diff --git a/backends/arm/third-party/ethos-u-core-driver b/backends/arm/third-party/ethos-u-core-driver deleted file mode 160000 index 03567073fe2..00000000000 --- a/backends/arm/third-party/ethos-u-core-driver +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 03567073fe2b9802c0bd73f9534da6f8a03924d1 From 9981ba7e224265197639cabb3687d479424aeda6 Mon Sep 17 00:00:00 2001 From: Yufeng Shi Date: Thu, 28 May 2026 10:23:51 +0100 Subject: [PATCH 056/103] Arm backend: Add FP8 support for primitive lowering ops (#19805) Change-Id: I3bec5e29ea3d2daf81a46dca50e7ae0c9c11e787 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Yufeng Shi --- .../arm/operator_support/gather_support.py | 31 ++++++++++-- .../operator_support/slice_copy_support.py | 26 +++++++++- backends/arm/operators/op_cat.py | 4 ++ backends/arm/operators/op_permute.py | 4 ++ backends/arm/operators/op_repeat.py | 4 ++ backends/arm/operators/op_tosa_gather.py | 10 ++++ backends/arm/operators/op_tosa_pad.py | 5 +- backends/arm/operators/op_tosa_scatter.py | 18 ++++++- backends/arm/operators/op_tosa_slice.py | 4 ++ backends/arm/operators/op_view.py | 4 ++ .../test/misc/test_tosa_dialect_scatter.py | 38 +++++++++++++++ backends/arm/test/ops/test_cat.py | 31 ++++++++++++ backends/arm/test/ops/test_constant_pad_nd.py | 29 ++++++++++++ backends/arm/test/ops/test_gather.py | 47 +++++++++++++++++++ backends/arm/test/ops/test_repeat.py | 25 ++++++++++ 
backends/arm/test/ops/test_slice.py | 26 ++++++++++ backends/arm/test/ops/test_view.py | 42 +++++++++++++++++ backends/arm/tosa/dialect/ops/gather.py | 12 +++++ backends/arm/tosa/dialect/ops/pad.py | 4 ++ backends/arm/tosa/dialect/ops/slice.py | 4 ++ 20 files changed, 360 insertions(+), 8 deletions(-) create mode 100644 backends/arm/test/misc/test_tosa_dialect_scatter.py diff --git a/backends/arm/operator_support/gather_support.py b/backends/arm/operator_support/gather_support.py index 651727cd8b6..6d923c0441c 100644 --- a/backends/arm/operator_support/gather_support.py +++ b/backends/arm/operator_support/gather_support.py @@ -49,7 +49,7 @@ class GatherSupported(SupportedTOSAOperatorCheck): targets = [exir_ops.edge.aten.gather.default] - def is_node_tosa_supported( + def is_node_tosa_supported( # noqa: C901 self, node: fx.Node, tosa_spec: TosaSpecification ) -> bool: # type: ignore[override, misc] if len(node.args) != 3: @@ -115,8 +115,14 @@ def is_node_tosa_supported( f"{node.target}: dtype {values_dtype} requires INT profile.", ) return False - # fp16/fp32/bf16: either FP profile, or INT profile (via quantization) - elif values_dtype in (torch.float16, torch.float32, torch.bfloat16): + # fp16/fp32/bf16/fp8: either FP profile, or INT profile (via quantization) + elif values_dtype in ( + torch.float16, + torch.float32, + torch.bfloat16, + torch.float8_e4m3fn, + torch.float8_e5m2, + ): if values_dtype == torch.bfloat16 and not tosa_spec.support_extension( "bf16" ): @@ -125,6 +131,22 @@ def is_node_tosa_supported( f"{node.target}: dtype {values_dtype} requires bf16 extension.", ) return False + if values_dtype == torch.float8_e4m3fn and not tosa_spec.support_extension( + "fp8e4m3" + ): + self.reporter.report_reject( + node, + f"{node.target}: dtype {values_dtype} requires fp8e4m3 extension.", + ) + return False + if values_dtype == torch.float8_e5m2 and not tosa_spec.support_extension( + "fp8e5m2" + ): + self.reporter.report_reject( + node, + f"{node.target}: dtype 
{values_dtype} requires fp8e5m2 extension.", + ) + return False if not (tosa_spec.support_float() or tosa_spec.support_integer()): self.reporter.report_reject( node, @@ -136,7 +158,8 @@ def is_node_tosa_supported( self.reporter.report_reject( node, f"{node.target}: unsupported values dtype {values_dtype}; " - "expected bool/int8/int16/int32/float16/bfloat16/float32.", + "expected bool/int8/int16/int32/float16/bfloat16/float32/" + "float8_e4m3fn/float8_e5m2.", ) return False diff --git a/backends/arm/operator_support/slice_copy_support.py b/backends/arm/operator_support/slice_copy_support.py index bcc3ddfbbbb..c9ef4a85bdf 100644 --- a/backends/arm/operator_support/slice_copy_support.py +++ b/backends/arm/operator_support/slice_copy_support.py @@ -53,7 +53,13 @@ def is_node_tosa_supported( values_dtype = node.args[0].meta["val"].dtype # type: ignore[union-attr] SUPPORTED_INT_DTYPES = (torch.int8, torch.int16, torch.int32) - SUPPORTED_FLOAT_DTYPES = (torch.float16, torch.float32, torch.bfloat16) + SUPPORTED_FLOAT_DTYPES = ( + torch.float16, + torch.float32, + torch.bfloat16, + torch.float8_e4m3fn, + torch.float8_e5m2, + ) SUPPORTED_DTYPES = (torch.bool,) + SUPPORTED_INT_DTYPES + SUPPORTED_FLOAT_DTYPES # bool is supported in both INT and FP profiles @@ -68,7 +74,7 @@ def is_node_tosa_supported( ) return False - # fp16/fp32/bf16: either FP profile, or INT profile (via quantization) + # fp16/fp32/bf16/fp8: either FP profile, or INT profile (via quantization) elif values_dtype in SUPPORTED_FLOAT_DTYPES: if values_dtype == torch.bfloat16 and not tosa_spec.support_extension( "bf16" @@ -78,6 +84,22 @@ def is_node_tosa_supported( f"{node.target}: dtype {values_dtype} requires bf16 extension.", ) return False + if values_dtype == torch.float8_e4m3fn and not tosa_spec.support_extension( + "fp8e4m3" + ): + self.reporter.report_reject( + node, + f"{node.target}: dtype {values_dtype} requires fp8e4m3 extension.", + ) + return False + if values_dtype == torch.float8_e5m2 and not 
tosa_spec.support_extension( + "fp8e5m2" + ): + self.reporter.report_reject( + node, + f"{node.target}: dtype {values_dtype} requires fp8e5m2 extension.", + ) + return False if not (tosa_spec.support_float() or tosa_spec.support_integer()): self.reporter.report_reject( node, diff --git a/backends/arm/operators/op_cat.py b/backends/arm/operators/op_cat.py index 544beefadf9..97ea651cb12 100644 --- a/backends/arm/operators/op_cat.py +++ b/backends/arm/operators/op_cat.py @@ -44,6 +44,10 @@ def define_node( supported_dtypes.extend([ts.DType.FP16, ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): supported_dtypes.append(ts.DType.BF16) + if self.tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.append(ts.DType.FP8E4M3) + if self.tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.append(ts.DType.FP8E5M2) validate_num_inputs(self.target, inputs, [1, 2]) input_tosa_args = [TosaArg(arg, self.tosa_spec) for arg in inputs[0].special] validate_same_dtype(self.target, [*input_tosa_args, output], ts) diff --git a/backends/arm/operators/op_permute.py b/backends/arm/operators/op_permute.py index e200478d7b3..2418131af3e 100644 --- a/backends/arm/operators/op_permute.py +++ b/backends/arm/operators/op_permute.py @@ -43,6 +43,10 @@ def define_node( supported_dtypes.extend([ts.DType.FP16, ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): supported_dtypes.append(ts.DType.BF16) + if self.tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.append(ts.DType.FP8E4M3) + if self.tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.append(ts.DType.FP8E5M2) validate_num_inputs(self.target, inputs, 2) validate_same_dtype(self.target, [inputs[0], output], ts) diff --git a/backends/arm/operators/op_repeat.py b/backends/arm/operators/op_repeat.py index 9b95c902847..f990dbef64b 100644 --- a/backends/arm/operators/op_repeat.py +++ b/backends/arm/operators/op_repeat.py @@ -42,6 +42,10 @@ def define_node( supported_dtypes.extend([ts.DType.FP16, 
ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): supported_dtypes.append(ts.DType.BF16) + if self.tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.append(ts.DType.FP8E4M3) + if self.tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.append(ts.DType.FP8E5M2) validate_num_inputs(self.target, inputs, 2) validate_same_dtype(self.target, [inputs[0], output], ts) diff --git a/backends/arm/operators/op_tosa_gather.py b/backends/arm/operators/op_tosa_gather.py index c242d351c06..913e2cc02b3 100644 --- a/backends/arm/operators/op_tosa_gather.py +++ b/backends/arm/operators/op_tosa_gather.py @@ -63,6 +63,16 @@ def define_node( ts.DType.FP16, ts.DType.FP32, ts.DType.BF16, + *( + [ts.DType.FP8E4M3] + if self.tosa_spec.support_extension("fp8e4m3") + else [] + ), + *( + [ts.DType.FP8E5M2] + if self.tosa_spec.support_extension("fp8e5m2") + else [] + ), ], self.tosa_spec, ) diff --git a/backends/arm/operators/op_tosa_pad.py b/backends/arm/operators/op_tosa_pad.py index 6f1cd488469..6e93adde55b 100644 --- a/backends/arm/operators/op_tosa_pad.py +++ b/backends/arm/operators/op_tosa_pad.py @@ -41,6 +41,10 @@ def define_node( supported_dtypes.extend([ts.DType.FP16, ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): supported_dtypes.append(ts.DType.BF16) + if self.tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.append(ts.DType.FP8E4M3) + if self.tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.append(ts.DType.FP8E5M2) validate_num_inputs(self.target, inputs, 2) validate_same_dtype(self.target, [inputs[0], output], ts) @@ -50,7 +54,6 @@ def define_node( supported_dtypes, self.tosa_spec, ) - pad_const = tosa_graph.addConst( [1], output.dtype, diff --git a/backends/arm/operators/op_tosa_scatter.py b/backends/arm/operators/op_tosa_scatter.py index b87a2598993..63c44f91fac 100644 --- a/backends/arm/operators/op_tosa_scatter.py +++ b/backends/arm/operators/op_tosa_scatter.py @@ -36,7 +36,13 @@ def define_node( 
validate_same_dtype(self.target, [inputs[0], inputs[2], output], ts) validate_valid_dtype( self.target, - [inputs[0], inputs[1], inputs[2], output], + [inputs[1]], + [ts.DType.INT32], + self.tosa_spec, + ) + validate_valid_dtype( + self.target, + [inputs[0], inputs[2], output], [ ts.DType.INT8, ts.DType.INT16, @@ -44,6 +50,16 @@ def define_node( ts.DType.FP32, ts.DType.FP16, ts.DType.BF16, + *( + [ts.DType.FP8E4M3] + if self.tosa_spec.support_extension("fp8e4m3") + else [] + ), + *( + [ts.DType.FP8E5M2] + if self.tosa_spec.support_extension("fp8e5m2") + else [] + ), ], self.tosa_spec, ) diff --git a/backends/arm/operators/op_tosa_slice.py b/backends/arm/operators/op_tosa_slice.py index 11ce95df466..818657642a8 100644 --- a/backends/arm/operators/op_tosa_slice.py +++ b/backends/arm/operators/op_tosa_slice.py @@ -42,6 +42,10 @@ def define_node( supported_dtypes.extend([ts.DType.FP16, ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): supported_dtypes.append(ts.DType.BF16) + if self.tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.append(ts.DType.FP8E4M3) + if self.tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.append(ts.DType.FP8E5M2) validate_num_inputs(self.target, inputs, 3) validate_same_dtype(self.target, [inputs[0], output], ts) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index 94ed23e2446..ba98f746476 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -42,6 +42,10 @@ def define_node( supported_dtypes.extend([ts.DType.FP16, ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): supported_dtypes.append(ts.DType.BF16) + if self.tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.append(ts.DType.FP8E4M3) + if self.tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.append(ts.DType.FP8E5M2) validate_num_inputs(self.target, inputs, 2) validate_same_dtype(self.target, [inputs[0], output], ts) diff --git 
a/backends/arm/test/misc/test_tosa_dialect_scatter.py b/backends/arm/test/misc/test_tosa_dialect_scatter.py new file mode 100644 index 00000000000..dc75df60df9 --- /dev/null +++ b/backends/arm/test/misc/test_tosa_dialect_scatter.py @@ -0,0 +1,38 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import executorch.backends.arm.tosa.dialect # noqa: F401 +import pytest +import torch +from executorch.backends.arm.tosa.specification import ( + TosaLoweringContext, + TosaSpecification, +) +from executorch.exir.dialects._ops import ops as exir_ops +from torch._subclasses.fake_tensor import FakeTensorMode + + +@pytest.mark.parametrize( + "dtype, extension", + [ + (torch.float8_e4m3fn, "fp8e4m3"), + (torch.float8_e5m2, "fp8e5m2"), + ], +) +def test_scatter_tosa_FP_fp8(dtype: torch.dtype, extension: str): + with TosaLoweringContext( + TosaSpecification.create_from_string(f"TOSA-1.0+FP+{extension}") + ), FakeTensorMode() as mode: + values_in = mode.from_tensor( + torch.rand((1, 5, 3), dtype=torch.float32).to(dtype) + ) + indices = mode.from_tensor(torch.tensor([[1, 3]], dtype=torch.int32)) + input_tensor = mode.from_tensor( + torch.rand((1, 2, 3), dtype=torch.float32).to(dtype) + ) + output = exir_ops.backend.tosa.SCATTER.default(values_in, indices, input_tensor) + + assert output.dtype == dtype + assert tuple(output.shape) == (1, 5, 3) diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index 1e145ef5485..29738ddbe32 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -98,6 +98,24 @@ class Cat(torch.nn.Module): 0, ), } + test_parameters_fp8 = { + "cat_rand_two_tensors_fp8e4m3": lambda: ( + ( + torch.randn(1, 2, 4, 4, dtype=torch.float32).to(torch.float8_e4m3fn), + torch.randn(1, 2, 4, 1, dtype=torch.float32).to(torch.float8_e4m3fn), + ), + 3, + "fp8e4m3", + ), + 
"cat_rand_dim0_fp8e5m2": lambda: ( + ( + torch.randn(1, 2, 4, 4, dtype=torch.float32).to(torch.float8_e5m2), + torch.randn(1, 2, 4, 4, dtype=torch.float32).to(torch.float8_e5m2), + ), + 0, + "fp8e5m2", + ), + } def __init__(self): super().__init__() @@ -135,6 +153,19 @@ def test_cat_tosa_FP_4d(): pipeline.run() +@common.parametrize("test_data", Cat.test_parameters_fp8) +def test_cat_tosa_FP_fp8(test_data: Tuple): + tensors, dim, tosa_extension = test_data() + pipeline = TosaPipelineFP[input_t1]( + Cat(), + (tensors, dim), + aten_op, + exir_op, + tosa_extensions=[tosa_extension], + ) + pipeline.run() + + @common.parametrize("test_data", Cat.test_parameters) def test_cat_tosa_INT(test_data: Tuple): pipeline = TosaPipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_constant_pad_nd.py b/backends/arm/test/ops/test_constant_pad_nd.py index 3742f710494..96d829851ed 100644 --- a/backends/arm/test/ops/test_constant_pad_nd.py +++ b/backends/arm/test/ops/test_constant_pad_nd.py @@ -128,6 +128,22 @@ "constant", ), } +test_data_suite_fp8 = { + "4dim_last1dim_fp8e4m3": lambda: ( + torch.rand(1, 1, 8, 8, dtype=torch.float32).to(torch.float8_e4m3fn), + (1, 1, 0, 0, 0, 0, 0, 0), + 1.0, + "constant", + "fp8e4m3", + ), + "3dim_last1dim_fp8e5m2": lambda: ( + torch.rand(1, 1, 8, dtype=torch.float32).to(torch.float8_e5m2), + (1, 0, 1, 0, 0, 0), + -0.5, + "constant", + "fp8e5m2", + ), +} class ConstantPadND(torch.nn.Module): @@ -289,6 +305,19 @@ def test_constant_pad_nd_tosa_FP(test_data: Tuple): pipeline.run() +@common.parametrize("test_data", test_data_suite_fp8) +def test_constant_pad_nd_tosa_FP_fp8(test_data: Tuple): + test_data, padding, value, mode, tosa_extension = test_data() + pipeline = TosaPipelineFP[input_t1]( + ConstantPadND(padding, value, mode), + (test_data,), + aten_op, + exir_op, + tosa_extensions=[tosa_extension], + ) + pipeline.run() + + @common.parametrize("test_data", test_data_suite) def test_constant_pad_nd_tosa_INT(test_data: Tuple): test_data, 
padding, value, mode = test_data() diff --git a/backends/arm/test/ops/test_gather.py b/backends/arm/test/ops/test_gather.py index 1439210373d..66cb9508c73 100644 --- a/backends/arm/test/ops/test_gather.py +++ b/backends/arm/test/ops/test_gather.py @@ -87,6 +87,36 @@ def forward(self, input_: torch.Tensor, dim_, index_: torch.Tensor): ), # Shape: [N=2, W=2, C=2] ), } +test_data_fp_fp8: dict[str, tuple[input_params, str]] = { + "test_fp8e4m3_2d": ( + ( + torch.tensor( + [[0.5, 1.25, 2.5], [3.5, 4.25, 5.75]], + dtype=torch.float8_e4m3fn, + ), + 1, + torch.tensor( + [[1, 0], [2, 1]], + dtype=torch.int64, + ), + ), + "fp8e4m3", + ), + "test_fp8e5m2_3d": ( + ( + torch.tensor( + [[[0.5, 1.5], [2.5, 3.5]], [[4.5, 5.5], [6.5, 7.5]]], + dtype=torch.float8_e5m2, + ), + 1, + torch.tensor( + [[[0, 1], [1, 0]], [[1, 0], [0, 1]]], + dtype=torch.int64, + ), + ), + "fp8e5m2", + ), +} # INT profile: integer inputs + bool (bool is supported via casts in @@ -145,6 +175,23 @@ def test_gather_tosa_FP(test_data: input_params): pipeline.run() +@common.parametrize("test_data", test_data_fp_fp8) +def test_gather_tosa_FP_fp8(test_data: tuple[input_params, str]): + input_data, tosa_extension = test_data + pipeline = TosaPipelineFP[input_params]( + Gather(), + input_data, + aten_op=Gather.aten_op, + exir_op=Gather.exir_op, + transform_passes=[ + InsertInt32CastsAfterInt64PlaceholdersPass(), + ], # int64 index are not currently supported and need to be cast to int32 + run_on_tosa_ref_model=False, # torch.gather() has no eager CPU FP8 implementation here, so eager reference execution fails. 
+ tosa_extensions=[tosa_extension], + ) + pipeline.run() + + @common.parametrize("test_data", test_data_int | test_data_fp) def test_gather_tosa_INT(test_data: input_params): pipeline = TosaPipelineINT[input_params]( diff --git a/backends/arm/test/ops/test_repeat.py b/backends/arm/test/ops/test_repeat.py index 1a2f71183bb..3368864564d 100644 --- a/backends/arm/test/ops/test_repeat.py +++ b/backends/arm/test/ops/test_repeat.py @@ -85,6 +85,18 @@ def forward(self, x: torch.Tensor): (torch.randn(1, 1, 2, 2, dtype=torch.float16),), ), } +test_data_suite_fp8 = { + "2_x_2_fp8e4m3": lambda: ( + Repeat((2, 1)), + (torch.randn(3, 4, dtype=torch.float32).to(torch.float8_e4m3fn),), + "fp8e4m3", + ), + "4_x_4_fp8e5m2": lambda: ( + Repeat((1, 2, 3, 2)), + (torch.randn(1, 1, 2, 2, dtype=torch.float32).to(torch.float8_e5m2),), + "fp8e5m2", + ), +} @common.parametrize( @@ -102,6 +114,19 @@ def test_repeat_tosa_FP(test_data: Tuple): pipeline.run() +@common.parametrize("test_data", test_data_suite_fp8) +def test_repeat_tosa_FP_fp8(test_data: Tuple): + module, test_data, tosa_extension = test_data() + pipeline = TosaPipelineFP[input_t1]( + module, + test_data, + module.aten_op, + exir_op=[], + tosa_extensions=[tosa_extension], + ) + pipeline.run() + + @common.parametrize("test_data", test_data_suite) def test_repeat_tosa_INT(test_data: Tuple): module, test_data = test_data() diff --git a/backends/arm/test/ops/test_slice.py b/backends/arm/test/ops/test_slice.py index 090d8abb56a..28c9731a6aa 100644 --- a/backends/arm/test/ops/test_slice.py +++ b/backends/arm/test/ops/test_slice.py @@ -50,6 +50,18 @@ [(0, 1), (0, 5), (3, 5), (4, 10)], ), } +test_data_suite_fp8 = { + "ones_slice_4_fp8e4m3": lambda: ( + torch.ones((1, 12, 10, 10), dtype=torch.float32).to(torch.float8_e4m3fn), + [(0, 1), (0, 5), (3, 5), (4, 10)], + "fp8e4m3", + ), + "ones_slice_4_fp8e5m2": lambda: ( + torch.ones((1, 12, 10, 10), dtype=torch.float32).to(torch.float8_e5m2), + [(0, 1), (0, 5), (3, 5), (4, 10)], + "fp8e5m2", 
+ ), +} class Slice(torch.nn.Module): @@ -72,6 +84,20 @@ def test_slice_tensor_tosa_FP_bf16(test_data: torch.Tensor): pipeline.run() +@common.parametrize("test_data", test_data_suite_fp8) +def test_slice_tensor_tosa_FP_fp8(test_data): + input_data, slices, tosa_extension = test_data() + pipeline = TosaPipelineFP[input_t1]( + Slice(), + (input_data, slices), + aten_op, + exir_op, + tosa_extensions=[tosa_extension], + ) + pipeline.count_tosa_ops({"SLICE": 3}) + pipeline.run() + + @common.parametrize("test_data", test_data_suite) def test_slice_tensor_tosa_INT_nchw(test_data: torch.Tensor): pipeline = TosaPipelineINT[input_t1]( diff --git a/backends/arm/test/ops/test_view.py b/backends/arm/test/ops/test_view.py index b1e62c3efef..ce5bf13f2b8 100644 --- a/backends/arm/test/ops/test_view.py +++ b/backends/arm/test/ops/test_view.py @@ -86,6 +86,48 @@ def test_view_tosa_FP(test_data: Tuple): pipeline.run() +class ViewPermuteFP8(torch.nn.Module): + def __init__(self, new_shape: tuple[int, ...], dims: tuple[int, ...]): + super().__init__() + self.new_shape = new_shape + self.dims = dims + + def forward(self, x: torch.Tensor): + # Use permute to keep the graph lowerable for FP8 tests, + # since the mul used in View is not supported with FP8. 
+ return x.view(self.new_shape).permute(self.dims) + + +@common.parametrize( + "test_data", + { + "view_permute_fp8e4m3": lambda: ( + torch.rand((2, 3, 4), dtype=torch.float32).to(torch.float8_e4m3fn), + (2, 4, 3), + (0, 2, 1), + "fp8e4m3", + ), + "view_permute_fp8e5m2": lambda: ( + torch.rand((2, 3, 4), dtype=torch.float32).to(torch.float8_e5m2), + (2, 4, 3), + (0, 2, 1), + "fp8e5m2", + ), + }, +) +def test_view_tosa_FP_fp8_permute(test_data: Tuple): + test_tensor, new_shape, dims, tosa_extension = test_data() + pipeline = TosaPipelineFP[input_t1]( + ViewPermuteFP8(new_shape, dims), + (test_tensor,), + ["torch.ops.aten.view.default", "torch.ops.aten.permute.default"], + exir_op=[], + tosa_extensions=[tosa_extension], + ) + pipeline.count_tosa_ops({"RESHAPE": 1, "TRANSPOSE": 1}) + pipeline.run() + + @common.parametrize("test_data", View.test_suite) def test_view_tosa_INT(test_data: Tuple): test_tensor, new_shape = test_data() diff --git a/backends/arm/tosa/dialect/ops/gather.py b/backends/arm/tosa/dialect/ops/gather.py index 1e1982adae3..49374142cd6 100644 --- a/backends/arm/tosa/dialect/ops/gather.py +++ b/backends/arm/tosa/dialect/ops/gather.py @@ -42,6 +42,8 @@ def GATHER(values: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: torch.float16, torch.float32, torch.bfloat16, + torch.float8_e4m3fn, + torch.float8_e5m2, ) if values.dtype not in allowed_values_dtypes: raise TosaValueError( @@ -57,6 +59,16 @@ def GATHER(values: torch.Tensor, indices: torch.Tensor) -> torch.Tensor: op="GATHER", ) else: + required_extension = { + torch.bfloat16: "bf16", + torch.float8_e4m3fn: "fp8e4m3", + torch.float8_e5m2: "fp8e5m2", + }.get(values.dtype) + if required_extension and not tosa_spec.support_extension(required_extension): + raise TosaValueError( + f"dtype {values.dtype} requires {required_extension} extension.", + op="GATHER", + ) # Support in FP profile, or INT profile via quantization if not (tosa_spec.support_float() or tosa_spec.support_integer()): raise 
TosaValueError( diff --git a/backends/arm/tosa/dialect/ops/pad.py b/backends/arm/tosa/dialect/ops/pad.py index db2cab6fcfc..3b5628b0ede 100644 --- a/backends/arm/tosa/dialect/ops/pad.py +++ b/backends/arm/tosa/dialect/ops/pad.py @@ -33,6 +33,10 @@ def PAD(a: torch.Tensor, padding: List[int | torch.SymInt], *, value): supported_dtypes.update({torch.float16, torch.float32}) if tosa_spec.support_extension("bf16"): supported_dtypes.add(torch.bfloat16) + if tosa_spec.support_extension("fp8e4m3"): + supported_dtypes.add(torch.float8_e4m3fn) + if tosa_spec.support_extension("fp8e5m2"): + supported_dtypes.add(torch.float8_e5m2) if a.dtype not in supported_dtypes: raise TosaValueError( f"Input tensor dtype {a.dtype} is not supported by the target TOSA specification." diff --git a/backends/arm/tosa/dialect/ops/slice.py b/backends/arm/tosa/dialect/ops/slice.py index 553c8dd489e..3406ccf911b 100644 --- a/backends/arm/tosa/dialect/ops/slice.py +++ b/backends/arm/tosa/dialect/ops/slice.py @@ -52,6 +52,10 @@ def SLICE(a, start, size): supported_dtypes += [torch.float16, torch.float32] if tosa_spec.support_extension("bf16"): supported_dtypes += [torch.bfloat16] + if tosa_spec.support_extension("fp8e4m3"): + supported_dtypes += [torch.float8_e4m3fn] + if tosa_spec.support_extension("fp8e5m2"): + supported_dtypes += [torch.float8_e5m2] if a.dtype not in supported_dtypes: raise TosaValueError( From 990d9d198ac3aaab4403ed340d14e593ddf10dac Mon Sep 17 00:00:00 2001 From: Adrian Lundell <36153706+AdrianLundell@users.noreply.github.com> Date: Thu, 28 May 2026 11:52:24 +0200 Subject: [PATCH 057/103] Arm backend: Add cmsis_nn fallback example (#19768) Describes how the Ethos-U and Cortex-M backend can be used together to accelerate e.g. op configurations not supported on Ethos-U55, and common pitfalls to consider in doing this. 
Signed-off-by: Adrian Lundell Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- .../ethos_u_cmsis_nn_fallback_example.ipynb | 262 ++++++++++++++++++ 1 file changed, 262 insertions(+) create mode 100644 examples/arm/ethos_u_cmsis_nn_fallback_example.ipynb diff --git a/examples/arm/ethos_u_cmsis_nn_fallback_example.ipynb b/examples/arm/ethos_u_cmsis_nn_fallback_example.ipynb new file mode 100644 index 00000000000..0dd8f7045fb --- /dev/null +++ b/examples/arm/ethos_u_cmsis_nn_fallback_example.ipynb @@ -0,0 +1,262 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Copyright 2026 Arm Limited and/or its affiliates.\n", + "#\n", + "# This source code is licensed under the BSD-style license found in the\n", + "# LICENSE file in the root directory of this source tree." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ethos-U55 with CMSIS-NN fallback example\n", + "\n", + "This guide demonstrates the current full flow for handling operators which do not lower\n", + "to the Ethos-U55 using the Cortex-M backend to make sure they use accelerated CMSIS-NN implementations. \n", + "The basic idea is that the Ethos-U backend will reject any nodes which are not supported,\n", + "leaving them to be handled by the Cortex-M backend.\n", + "\n", + "Before you begin: Make sure you have completed the `ethos_u_minimal_example` for a\n", + "basic understanding of the Ethos-U backend and have your environment set up. 
\n", + "\n", + "\n", + "*Some scripts in this notebook produce long output logs: Configuring the 'Customizing Notebook Layout' settings to enable 'Output:scrolling' and setting 'Output:Text Line Limit' makes this more manageable*" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup\n", + "\n", + "The first step is creating a simple model which does not fully lower to the Ethos-U55.\n", + "Importantly it is exported with channels_last data, since the Cortex-M backend currently\n", + "only supports lowering operators in that data-format. \n", + "\n", + "Constraints for the basic operations performed by the Ethos-U55 can be found in the\n", + "[Ethos-U Vela repository](https://gitlab.arm.com/artificial-intelligence/ethos-u/ethos-u-vela/-/blob/main/SUPPORTED_OPS.md?ref_type=heads#ethos-u55-and-ethos-u65-tosa-conv2d-constraints). Note that the listed operators do not map exactly to PyTorch operators, but rather to a subset found in\n", + "the graph after decompositions in the Ethos-U backend." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import torch\n", + "from executorch.backends.arm.ethosu import EthosUCompileSpec, EthosUPartitioner\n", + "from executorch.backends.arm.quantizer import (\n", + " EthosUQuantizer,\n", + " get_symmetric_quantization_config,\n", + ")\n", + "from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager\n", + "from executorch.exir import (\n", + " EdgeCompileConfig,\n", + " ExecutorchBackendConfig,\n", + " to_edge_transform_and_lower,\n", + ")\n", + "from executorch.extension.export_util.utils import save_pte_program\n", + "from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e\n", + "\n", + "target = \"ethos-u55-128\"\n", + "output_path = \"ethos_u_cmsis_nn_fallback_example.pte\"\n", + "\n", + "class ToyMixedModule(torch.nn.Module):\n", + " def __init__(self):\n", + " super().__init__()\n", + " self.conv1 = torch.nn.Conv2d(\n", + " in_channels=3,\n", + " out_channels=4,\n", + " kernel_size=3,\n", + " stride=1,\n", + " padding=1,\n", + " bias=False,\n", + " )\n", + " self.conv2 = torch.nn.Conv2d(\n", + " in_channels=4,\n", + " out_channels=1,\n", + " kernel_size=3,\n", + " stride=4,\n", + " padding=1,\n", + " bias=False,\n", + " ) # Stride=4 not supported on Ethos-U55\n", + "\n", + " def forward(self, x: torch.Tensor) -> torch.Tensor:\n", + " x = self.conv1(x)\n", + " x = torch.relu(x)\n", + " return self.conv2(x)\n", + "\n", + "model = ToyMixedModule().eval().to(memory_format=torch.channels_last)\n", + "example_inputs = (\n", + " torch.randn(1, 3, 8, 8, dtype=torch.float32).to(memory_format=torch.channels_last),\n", + ")\n", + "exported_program = torch.export.export(model, example_inputs)\n", + "exported_program.module().graph.print_tabular()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Ethos-U lowering\n", + "\n", + "The Ethos-U lowering of the model is identical to the 
minimal example, and as expected\n", + "the printed graph leaves the regular `torch.nn.Conv2d` with `stride=4` and some quantization/dequantization nodes\n", + "outside of the Ethos_u call_delegate operator. \n", + "\n", + "One important part in this step is that this `torch.nn.Conv2d` with `stride=4` has been quantized to\n", + "a format supported by the Cortex-M backend by the Ethos-U quantizer even if it was not\n", + "delegated, since the Cortex-M backend will only lower correctly quantized operators. Should there be\n", + "a discrepancy, see the [quantizer tutorial](https://github.com/pytorch/executorch/blob/main/examples/arm/quantizer_tutorial.ipynb) for\n", + "how to configure more precise quantization." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compile_spec = EthosUCompileSpec(target=target)\n", + "quantizer = EthosUQuantizer(compile_spec)\n", + "quantizer.set_global(get_symmetric_quantization_config(is_per_channel=True))\n", + "\n", + "prepared = prepare_pt2e(exported_program.module(), quantizer)\n", + "prepared(*example_inputs)\n", + "quantized_model = convert_pt2e(prepared)\n", + "quantized_exported_program = torch.export.export(quantized_model, example_inputs)\n", + "\n", + "edge_program_manager = to_edge_transform_and_lower(\n", + " quantized_exported_program,\n", + " partitioner=[EthosUPartitioner(compile_spec)],\n", + " compile_config=EdgeCompileConfig(_check_ir_validity=False),\n", + ")\n", + "\n", + "edge_program_manager.exported_program().graph_module.graph.print_tabular()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cortex-M lowering\n", + "\n", + "Finally the Cortex-M backend is applied, and the graph is now fully accelerated. The\n", + "`cortex_m_kernels` can be spotted in the printed graph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "edge_program_manager._edge_programs[\"forward\"] = CortexMPassManager(\n", + " edge_program_manager.exported_program()\n", + ").transform()\n", + "\n", + "executorch_program = edge_program_manager.to_executorch(\n", + " config=ExecutorchBackendConfig(extract_delegate_segments=False)\n", + ")\n", + "save_pte_program(executorch_program, output_path)\n", + "\n", + "edge_program_manager.exported_program().graph_module.graph.print_tabular()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Build\n", + "\n", + "The executor runner is built as usual, making sure to link the Cortex-M dependencies. In the available\n", + "example executor_runner CMakeFile this is already done, with the Cortex-M kernel and kernel registration libraries\n", + "`cortex_m_kernels` and `cortex_m_ops_lib` corresponding to `portable_kernels` and `arm_portable_ops_lib` for the\n", + "unaccelerated portable kernels. For more information about kernel registration, see the\n", + "[documentation](https://docs.pytorch.org/executorch/stable/kernel-library-custom-aten-kernel.html).\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash \n", + "source arm-scratch/setup_path.sh\n", + "# Ensure CMake resolves the ExecuTorch checkout root regardless of caller env\n", + "export EXECUTORCH_ROOT=$(cd ../.. 
&& pwd)\n", + "\n", + "# Build example executor runner application to examples/arm/ethos_u_cmsis_nn_fallback_example\n", + "cmake -DCMAKE_TOOLCHAIN_FILE=$(pwd)/ethos-u-setup/arm-none-eabi-gcc.cmake \\\n", + " -DCMAKE_BUILD_TYPE=Release \\\n", + " -DET_PTE_FILE_PATH=ethos_u_cmsis_nn_fallback_example.pte \\\n", + " -DTARGET_CPU=cortex-m55 \\\n", + " -DETHOSU_TARGET_NPU_CONFIG=ethos-u55-128 \\\n", + " -DMEMORY_MODE=Shared_Sram \\\n", + " -DSYSTEM_CONFIG=Ethos_U55_High_End_Embedded \\\n", + " -Bethos_u_cmsis_nn_fallback_example \\\n", + " -S executor_runner/standalone\n", + "cmake --build ethos_u_cmsis_nn_fallback_example -j$(nproc) -- arm_executor_runner" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Sanity check output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import subprocess\n", + "import re\n", + "\n", + "# Use quantized model in eager mode as reference. By default the executor runner will use 1:s as input.\n", + "test_inputs = (torch.ones_like(example_inputs[0]),)\n", + "reference_result = quantized_exported_program.module()(*test_inputs).flatten().tolist()\n", + "\n", + "# Run the lowered .pte file on FVP using helper script and extract the output numbers using regex\n", + "fvp_output = subprocess.run(\"../../backends/arm/scripts/run_fvp.sh --elf=ethos_u_cmsis_nn_fallback_example/arm_executor_runner --target=ethos-u55-128\", shell=True, capture_output=True)\n", + "lowered_result = [float(x) for x in re.findall(\"-?\\d\\.\\d{6}\" , str(fvp_output.stdout))]\n", + "\n", + "print(reference_result)\n", + "print(lowered_result)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv (3.10.15)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From c505aa534448371146e881b6305349d8143138a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5ns=20Nilsson?= Date: Thu, 28 May 2026 12:07:30 +0200 Subject: [PATCH 058/103] Xnnpack: Support clone.default with skip_dim_order=True (#19797) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With the default XNNPACK test config, skip_dim_order=False rewrites aten.clone.default to dim_order_ops._clone_dim_order.default. That path is already supported through CloneDimOrderConfig. Some XNNPACK export flows use skip_dim_order=True, where aten.clone.default stays as aten.clone.default and is not selected by the partitioner. Adds CloneConfig for dim-order-preserving aten.clone.default nodes so this path is partitioned directly. This reduces delegate splits in the EdgeTAM mask decoder, where profiling exports use skip_dim_order=True. cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @Sebastian-Larsson @robell @rascani Signed-off-by: Måns Nilsson --- backends/xnnpack/operators/op_clone.py | 19 +++++++++--- backends/xnnpack/partition/config/__init__.py | 3 ++ .../partition/config/generic_node_configs.py | 21 +++++++++++++ backends/xnnpack/test/ops/test_clone.py | 30 ++++++++++++++++++- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/backends/xnnpack/operators/op_clone.py b/backends/xnnpack/operators/op_clone.py index e4ddf187ecc..c36d750148c 100644 --- a/backends/xnnpack/operators/op_clone.py +++ b/backends/xnnpack/operators/op_clone.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -13,6 +14,7 @@ NodeVisitor, register_node_visitor, ) +from executorch.backends.xnnpack.operators.quant_params import QuantParams from executorch.backends.xnnpack.serialization.xnnpack_graph_schema import ( XNNCopy, XNNGraph, @@ -25,9 +27,6 @@ class CloneVisitor(NodeVisitor): target = "aten.clone.default" - def __init__(self, *args) -> None: - super().__init__(*args) - def define_node( self, node: torch.fx.Node, @@ -35,7 +34,19 @@ def define_node( vals_to_ids: Dict[torch.fx.Node, int], debug_handle: int, ) -> None: - self.define_nodes_tensor_inputs_outputs(node, xnn_graph, vals_to_ids) + self.define_tensor( + node, + xnn_graph, + vals_to_ids, + quant_params=QuantParams.from_outputs(node), + ) + input_node = get_input_node(node, 0) + self.define_tensor( + input_node, + xnn_graph, + vals_to_ids, + quant_params=QuantParams.from_inputs(input_node, self._exported_program), + ) # Sanity check that the input and output dim order are the same. We don't # handle dim order conversions yet. diff --git a/backends/xnnpack/partition/config/__init__.py b/backends/xnnpack/partition/config/__init__.py index d0a3e94bbc9..c6c54f083d6 100644 --- a/backends/xnnpack/partition/config/__init__.py +++ b/backends/xnnpack/partition/config/__init__.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -23,6 +24,7 @@ CatConfig, CeilConfig, ClampConfig, + CloneConfig, CloneDimOrderConfig, ConstantPadConfig, CosConfig, @@ -82,6 +84,7 @@ BMMConfig, CatConfig, CeilConfig, + CloneConfig, CloneDimOrderConfig, ConstantPadConfig, ConvolutionConfig, diff --git a/backends/xnnpack/partition/config/generic_node_configs.py b/backends/xnnpack/partition/config/generic_node_configs.py index f58c8eefdbe..2f45a8bba04 100644 --- a/backends/xnnpack/partition/config/generic_node_configs.py +++ b/backends/xnnpack/partition/config/generic_node_configs.py @@ -239,6 +239,27 @@ def supported_precision_types(self) -> List[ConfigPrecisionType]: return [ConfigPrecisionType.FP32] +class CloneConfig(GenericNodePartitionerConfig): + target_name = "clone.default" + + def supported_precision_types(self) -> List[ConfigPrecisionType]: + return [ConfigPrecisionType.FP32] + + def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool: + if not self.check_common_constraints(node, ep): + return False + + input_meta = node.args[0].meta["val"] + output_meta = node.meta["val"] + input_dim_order = list(input_meta.dim_order()) + output_dim_order = list(output_meta.dim_order()) + if input_dim_order != output_dim_order: + why(node, reason="Only dim-order preserving clones are supported.") + return False + + return True + + class ClampConfig(GenericNodePartitionerConfig): target_name = "clamp.default" diff --git a/backends/xnnpack/test/ops/test_clone.py b/backends/xnnpack/test/ops/test_clone.py index 0396b9b2bea..bb995a6cf1e 100644 --- a/backends/xnnpack/test/ops/test_clone.py +++ b/backends/xnnpack/test/ops/test_clone.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -9,7 +10,8 @@ import unittest import torch -from executorch.backends.xnnpack.test.tester import Tester +from executorch.backends.xnnpack.test.tester import Tester, ToEdgeTransformAndLower +from executorch.backends.xnnpack.utils.configs import get_xnnpack_edge_compile_config class TestClone(unittest.TestCase): @@ -62,6 +64,32 @@ def test_fp32_clone(self): inputs = (torch.randn(2, 3, 4, 5),) self._test_clone_partitioned(inputs) + def test_fp32_clone_default_partitions_with_skip_dim_order(self): + """Test plain aten.clone.default partitioning without dim-order rewrite.""" + inputs = (torch.randn(2, 3, 4, 5),) + ( + Tester(self.Clone(), inputs) + .export() + .check_count({"torch.ops.aten.clone.default": 1}) + .to_edge_transform_and_lower( + ToEdgeTransformAndLower( + edge_compile_config=get_xnnpack_edge_compile_config( + skip_dim_order=True + ) + ) + ) + .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) + .check_not( + [ + "executorch_exir_dialects_edge__ops_aten_clone_default", + "executorch_exir_dialects_edge__ops_dim_order_ops__clone_dim_order_default", + ] + ) + .to_executorch() + .serialize() + .run_method_and_compare_outputs() + ) + def test_fp32_clone_2d(self): """Test FP32 clone with 2D tensor - should be partitioned""" inputs = (torch.randn(10, 20),) From 94f971911d3ced56f701887d5c0fe3b501baeac4 Mon Sep 17 00:00:00 2001 From: Oscar Andersson <87121123+oscarandersson8218@users.noreply.github.com> Date: Thu, 28 May 2026 13:32:39 +0200 Subject: [PATCH 059/103] [exir] Materialize alloc shapes in ToOutVarPass (#19806) Fix a dynamic-shape lowering bug in exir. ConstraintBasedSymShapeEvalPass concretizes TensorSpec metadata, but ToOutVarPass was still building memory.alloc nodes from symbolic FakeTensor/tensor_meta shapes. That let symbolic dims leak into the generated ExecuTorch GraphModule and caused runtime failures when the lowered module was executed in Python. Build memory.alloc specs from concrete upper-bounded integer shapes instead. 
If an alloc shape is still not concretely bounded, raise a clear error. Add an EXIR regression test that exports a dynamic-shape model, runs ConstraintBasedSymShapeEvalPass + ToOutVarPass, and verifies that memory.alloc shapes are concrete integers. cc @digantdesai @freddan80 @per @zingo @mansnils @Sebastian-Larsson @robell @rascani --------- Signed-off-by: Oscar Andersson --- .../arm/test/models/test_torch_functions.py | 4 -- exir/passes/__init__.py | 28 +++++++---- exir/tests/test_passes.py | 49 +++++++++++++++++++ 3 files changed, 67 insertions(+), 14 deletions(-) diff --git a/backends/arm/test/models/test_torch_functions.py b/backends/arm/test/models/test_torch_functions.py index 0ca8d3ac091..c6a4c5580dc 100644 --- a/backends/arm/test/models/test_torch_functions.py +++ b/backends/arm/test/models/test_torch_functions.py @@ -97,8 +97,6 @@ def forward(self, *args): "test_data", test_parameters, xfails={ - "nonzero": "torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u4, 0). " - "Requires dynamic output shape.", "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", }, @@ -124,8 +122,6 @@ def test_torch_functions_tosa_FP(test_data): "test_data", test_parameters, xfails={ - "nonzero": "torch.fx.experimental.symbolic_shapes.GuardOnDataDependentSymNode: Could not guard on data-dependent expression Eq(u4, 0). 
" - "Requires dynamic output shape.", "topk": "NotImplementedError: No registered serialization name for found", "sort": "NotImplementedError: No registered serialization name for found", }, diff --git a/exir/passes/__init__.py b/exir/passes/__init__.py index 9b1b8efe682..ede866549b2 100644 --- a/exir/passes/__init__.py +++ b/exir/passes/__init__.py @@ -62,6 +62,7 @@ from executorch.exir.passes.to_device_pass import ToDevicePass from executorch.exir.passes.weights_to_outputs_pass import weights_to_outputs_pass +from executorch.exir.sym_util import eval_shape_upper_bound from torch import fx from torch._subclasses import FakeTensor from torch.fx.passes.infra.pass_base import PassBase, PassResult @@ -281,31 +282,38 @@ def make_alloc_node( Note: tensor_metadata is only used in the case of a Tensor subclass, since fakifying a tensor subclass is not supported right now """ + + def materialize_alloc_spec( + shape: Union[torch.Size, Tuple[int, ...], List[int]], + dtype: torch.dtype, + ) -> memory.AllocSpec: + concrete_shape = eval_shape_upper_bound(shape) + if any(not isinstance(dim, int) for dim in concrete_shape): + raise RuntimeError( + "Memory allocator node requires concrete upper-bounded dimensions. " + f"Got shape {shape} and evaluated upper bounds {concrete_shape}." 
+ ) + return (tuple(concrete_shape), dtype) + if val is None: if tensor_meta is not None: assert isinstance(tensor_meta, TensorMetadata) - alloc_spec = (tensor_meta.shape, tensor_meta.dtype) + alloc_spec = materialize_alloc_spec(tensor_meta.shape, tensor_meta.dtype) else: raise InternalError( "Memory allocator node needs FakeTensor val or TensorMetadata to proceed" ) elif isinstance(val, FakeTensor): - alloc_spec = (val.shape, val.dtype) + alloc_spec = materialize_alloc_spec(val.shape, val.dtype) else: assert isinstance(val, list) or isinstance(val, tuple) assert isinstance(tensor_meta, list) or isinstance(tensor_meta, tuple) alloc_spec: List[memory.AllocSpec] = [] for v, t in zip(val, tensor_meta): if v is not None: - # pyre-fixme[6]: For 1st argument expected - # `Union[List[Tuple[List[int], dtype]], Tuple[List[int], dtype]]` but - # got `Tuple[Size, dtype]`. - alloc_spec.append((v.shape, v.dtype)) + alloc_spec.append(materialize_alloc_spec(v.shape, v.dtype)) elif t is not None: - # pyre-fixme[6]: For 1st argument expected - # `Union[List[Tuple[List[int], dtype]], Tuple[List[int], dtype]]` but - # got `Tuple[Size, dtype]`. - alloc_spec.append((t.shape, t.dtype)) + alloc_spec.append(materialize_alloc_spec(t.shape, t.dtype)) else: raise InternalError( "Memory allocator node needs FakeTensor val or TensorMetadata to proceed" diff --git a/exir/tests/test_passes.py b/exir/tests/test_passes.py index 8a084ba491a..1316dffb828 100644 --- a/exir/tests/test_passes.py +++ b/exir/tests/test_passes.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -74,6 +75,7 @@ ) from executorch.exir.passes.scalar_to_tensor_pass import ScalarToTensorPass from executorch.exir.passes.spec_prop_pass import SpecPropPass +from executorch.exir.passes.sym_shape_eval_pass import ConstraintBasedSymShapeEvalPass from executorch.exir.passes.sym_to_tensor_pass import SymToTensorPass from executorch.exir.program._program import lift_constant_tensor_pass from executorch.exir.schema import TensorShapeDynamism @@ -1036,6 +1038,53 @@ def test_alloc_node_spec(self) -> None: for node in alloc_nodes: self.assertTrue(isinstance(node.meta.get("spec", None), TensorSpec)) + def test_to_out_var_dynamic_alloc_uses_concrete_upper_bounds(self) -> None: + class DynamicRelu(nn.Module): + def forward(self, x): + return torch.relu(x) + + eager_model = DynamicRelu() + inputs = (torch.randn(2, 4, 8, 3),) + dynamic_shapes = { + "x": { + 0: torch.export.Dim("batch", min=0, max=2), + 2: torch.export.Dim("height", min=0, max=8), + 3: torch.export.Dim("width", min=0, max=8), + } + } + prog = to_edge( + export( + eager_model, + inputs, + dynamic_shapes=dynamic_shapes, + strict=True, + ), + compile_config=exir.EdgeCompileConfig(_check_ir_validity=False), + ) + new_prog = prog.transform( + [ + SpecPropPass(), + ConstraintBasedSymShapeEvalPass(), + ] + ) + + new_gm_res = ToOutVarPass()(new_prog.exported_program().graph_module) + self.assertIsNotNone(new_gm_res) + new_gm = new_gm_res.graph_module + + alloc_nodes = [] + for node in new_gm.graph.nodes: + if node.target == memory.alloc: + alloc_nodes.append(node) + + self.assertTrue(len(alloc_nodes) > 0) + for node in alloc_nodes: + alloc_spec = node.args[0] + self.assertIsInstance(alloc_spec, tuple) + shape, _dtype = alloc_spec + for dim in shape: + self.assertIsInstance(dim, int) + def test_debug_pass_file_log(self) -> None: eager_model = Mul() inputs = eager_model.get_random_inputs() From 5ca3207e1c10d8a8841a80a12fdb65fe89a86294 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?= 
<33344797+martinlsm@users.noreply.github.com> Date: Thu, 28 May 2026 13:41:23 +0200 Subject: [PATCH 060/103] Arm backend: Update examples/arm/README.md (#19756) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make the README concise for setup, run.sh usage, example notebooks, applications, and helper scripts. Move broader backend documentation links to the backend README. Signed-off-by: Martin Lindström --- backends/arm/README.md | 6 +- examples/arm/README.md | 206 +++++++++++++---------------------------- 2 files changed, 67 insertions(+), 145 deletions(-) diff --git a/backends/arm/README.md b/backends/arm/README.md index 237f2433cb5..8edd3665d44 100644 --- a/backends/arm/README.md +++ b/backends/arm/README.md @@ -136,8 +136,10 @@ The delegated Python API flow is: For complete examples of that flow, including quantization and target-specific compile specs, see: -- `docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md` -- `docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md` +- [Arm Ethos-U tutorial](../../docs/source/backends/arm-ethos-u/tutorials/ethos-u-getting-started.md) +- [Arm VGF tutorial](../../docs/source/backends/arm-vgf/tutorials/vgf-getting-started.md) +- [Arm Cortex-M backend overview](../../docs/source/backends/arm-cortex-m/arm-cortex-m-overview.md) +- [Ethos-U porting guide](../../examples/arm/ethos-u-porting-guide.md) Additional examples are available in `examples/arm`. diff --git a/examples/arm/README.md b/examples/arm/README.md index c5f5bb24862..07aecec51e2 100644 --- a/examples/arm/README.md +++ b/examples/arm/README.md @@ -5,175 +5,95 @@ This source code is licensed under the BSD-style license found in the LICENSE file in the root directory of this source tree. 
--> -## ExecuTorch for Arm backends Ethos-U, VGF and Cortex-M +# Examples for Arm backends Ethos-U, VGF and Cortex-M -This project contains scripts to help you setup and run a PyTorch -model on a Arm backend via ExecuTorch. This backend supports Ethos-U and VGF as -targets (using TOSA) but you can also use the Ethos-U example runner as an example -on Cortex-M if you do not delegate the model. +This directory contains documentation and scripts to +help you set up and run a PyTorch model on the Arm backend +via ExecuTorch. -The main scripts are `setup.sh`, `run.sh` and -`backends/arm/scripts/aot_arm_compiler.py`. +## setup.sh -`setup.sh` will install the needed tools and with --root-dir -you can change the path to a scratch folder where it will download and generate build -artifacts. If supplied, you must also supply the same folder to run.sh with ---scratch-dir= If not supplied both scripts will use examples/arm/arm-scratch. +`setup.sh` downloads the Arm cross-compilation toolchain and Corstone FVP +simulators, installs the Python dependencies for TOSA, Ethos-U Vela, and +Cortex-M/CMSIS-NN, and generates `setup_path.sh` scripts for adding those tools +to your environment. Optional flags also install VGF/MLSDK and Vulkan +dependencies. -`run.sh` can be used to build, run and test a model in an easy way and it will call cmake for you -and in cases you want to run a simulator it will start it also. The script will call `aot_arm_compiler.py` -to convert a model and include it in the build/run. - -For bare-metal Ethos-U builds `run.sh` configures the standalone -`examples/arm/executor_runner/standalone` CMake entry point automatically. If -`--build-dir` is omitted, the script creates and owns a build tree under -`arm_test/_`. Supplying `--build-dir` reuses an existing tree -(for example a VGF host build or out-of-tree configuration) and `run.sh` -verifies it exposes the runner options it needs before compiling.
- -Build and test artifacts are by default placed under the folder arm_test folder -this can be changed with --et_build_root= - -`aot_arm_compiler.py` is used to convert a Python model or a saved .pt model to a PTE file and is used by `run.sh` -and other test script but can also be used directly. - - -## Create a PTE file for Arm backends - -There is an easy to use example flow to compile your PyTorch model to a PTE file for the Arm backend called `aot_arm_compiler.py` -that you can use to generate PTE files, it can generate PTE files for the supported targets `-t` or even non delegated (Cortex-M) -using different memory modes and can both use a python file as input or just use the models from examples/models with `--model_name`. -It also supports generating Devtools artifacts like BundleIO BPTE files, and ETRecords. Run it with `--help` to check its capabilities. - -You point out the model to convert with `--model_name=` It supports running a model from examples/models or models -from a python file if you just specify `ModelUnderTest` and `ModelInputs` in it. - -``` -$ python3 -m backends.arm.scripts.aot_arm_compiler --help -``` - -This is how you generate a BundleIO BPTE of a simple add example +Example to install the default Arm backend dependencies and add them to your current shell: +```bash +./examples/arm/setup.sh --i-agree-to-the-contained-eula +source examples/arm/arm-scratch/setup_path.sh ``` -$ python3 -m backends.arm.scripts.aot_arm_compiler --model_name=examples/arm/example_modules/add.py --target=ethos-u55-128 --bundleio -``` - -The example model used has added two extra variables that is picked up to make this work. - -`ModelUnderTest` should be a `torch.nn.module` instance. - -`ModelInputs` should be a tuple of inputs to the forward function. - - -You can also use the models from example/models directly by just using the short name e.g. 
- -``` -$ python3 -m backends.arm.scripts.aot_arm_compiler --model_name=mv2 --target=ethos-u55-64 -``` - - -`aot_arm_compiler.py` is called from the scripts below so you don't need to, but it can be useful to do by hand in some cases. -## Host VGF example applications +## run.sh -The Arm examples directory also contains host-side VGF reference flows for -specific tasks: +`run.sh` is an end-to-end helper for building and executing an Arm backend +example. It sources the `setup_path.sh` script generated by `setup.sh`, runs +`aot_arm_compiler.py` to convert the selected model to a `.pte` or `.bpte`, +builds the matching runner with CMake, and starts the simulator or runtime for +the selected target when `--build_only` is not set. -- `examples/arm/image_classification_example_vgf` for DEiT image - classification. -- `examples/arm/super_resolution_example_vgf` for Swin2SR image - super-resolution. - - -## ExecuTorch on Arm Ethos-U55/U65 and U85 - -This example code will help you get going with the Corstone™-300/320 platforms and -run on the FVP and can be used a starting guide in your porting to your board/HW - -We will start from a PyTorch model in python, export it, convert it to a `.pte` -file - A binary format adopted by ExecuTorch. Then we will take the `.pte` -model file and embed that with a baremetal application executor_runner. We will -then take the executor_runner file, which contains not only the `.pte` binary but -also necessary software components to run standalone on a baremetal system. -The build flow will pick up the non delegated ops from the generated PTE file and -add CPU implementation of them. -Lastly, we will run the executor_runner binary on a Corstone™-300/320 FVP Simulator platform. - - -### Example workflow - -Below is example workflow to build an application for Ethos-U55/85. 
The script below requires an internet connection: - -``` -# Step [1] - setup necessary tools -$ cd -$ ./examples/arm/setup.sh --i-agree-to-the-contained-eula - -# Step [2] - Setup path to tools, The `setup.sh` script has generated a script that you need to source every time you restart you shell. -$ source examples/arm/arm-scratch/setup_path.sh +Build and test artifacts are written to `arm_test` by default. Use +`--et_build_root=` to choose another build root. -# Step [3] - build and run ExecuTorch and executor_runner baremetal example application -# on a Corstone(TM)-320 FVP to run a simple PyTorch model from a file. -$ ./examples/arm/run.sh --model_name=examples/arm/example_modules/add.py --target=ethos-u85-128 -``` - -The argument `--model_name=` is passed to `aot_arm_compiler.py` so you can use it in the same way -e.g. you can also use the models from example/models directly in the same way as above. +For example, after running `setup.sh` and sourcing the generated +`setup_path.sh`, build and run a model on an Ethos-U85 target with: -``` -$ ./examples/arm/run.sh --model_name=mv2 --target=ethos-u55-64 +```bash +./examples/arm/run.sh --model_name=examples/arm/example_modules/add.py --target=ethos-u85-128 ``` -The runner will by default set all inputs to "1" and you are supposed to add/change the code -handling the input for your hardware target to give the model proper input, maybe from your camera -or mic hardware. +For bundled input/output and ETDump testing: -While testing you can use the --bundleio flag to use the input from the python model file and -generate a .bpte instead of a .pte file. This will embed the input example data and reference output -in the bpte file/data, which is used to verify the model's output. You can also use --etdump to generate -an ETRecord and a ETDump trace files from your target (they are printed as base64 strings in the serial log). 
- -Just keep in mind that CPU cycles are NOT accurate on the FVP simulator and it can not be used for -performance measurements, so you need to run on FPGA or actual ASIC to get good results from --etdump. -As a note the printed NPU cycle numbers are still usable and closer to real values if the timing -adaptor is setup correctly. - -``` -# Build + run with BundleIO and ETDump -$ ./examples/arm/run.sh --model_name=lstm --target=ethos-u85-128 --bundleio --etdump +```bash +./examples/arm/run.sh --model_name=lstm --target=ethos-u85-128 --bundleio --etdump ``` +For Cortex-M testing, use a Cortex-M target and bundled I/O: -### Ethos-U minimal example - -See the jupyter notebook `ethos_u_minimal_example.ipynb` for an explained minimal example of the full flow for running a -PyTorch module on the EthosUDelegate. The notebook runs directly in some IDE:s s.a. VS Code, otherwise it can be run in -your browser using -``` -pip install jupyter -jupyter notebook ethos_u_minimal_example.ipynb +```bash +./examples/arm/run.sh --model_name=mv2 --target=cortex-m55 --bundleio ``` -## ExecuTorch on ARM Cortex-M +## Example Contents -For Cortex-M you run the script without delegating e.g `--no_delegate` as the build flow already supports picking up -the non delegated ops from the generated PTE file and add CPU implementation of them this will work out of the box in -most cases. +### Notebook examples -To run mobilenet_v2 on the Cortex-M55 only, without using the Ethos-U try this: +- [ethos_u_minimal_example.ipynb](ethos_u_minimal_example.ipynb) - Minimal + Ethos-U AOT, runtime build, and FVP execution flow. +- [vgf_minimal_example.ipynb](vgf_minimal_example.ipynb) - Minimal VGF + lowering and host execution flow. +- [cortex_m_mv2_example.ipynb](cortex_m_mv2_example.ipynb) - Cortex-M + MobileNetV2 export, quantization, runtime build, and FVP execution flow. +- [pruning_minimal_example.ipynb](pruning_minimal_example.ipynb) - Model + conditioning and pruning flow for Ethos-U85. 
+- [quantizer_tutorial.ipynb](quantizer_tutorial.ipynb) - Quantizer tutorial + for TOSA, Ethos-U, and VGF quantizers. -``` -$ ./examples/arm/run.sh --model_name=mv2 --target=ethos-u55-128 --no_delegate -``` +### Application examples +- [image_classification_example_ethos_u](image_classification_example_ethos_u/) + - End-to-end DEiT-Tiny image classification flow for Ethos-U, including + model fine-tuning, export, bare-metal runtime build, and Corstone-320 FVP + execution. +- [image_classification_example_vgf](image_classification_example_vgf/) - + DEiT-Tiny image classification flow for VGF host execution. +- [super_resolution_example_vgf](super_resolution_example_vgf) - Swin2SR image + super-resolution. +- [example_modules/add.py](example_modules/add.py) - Small external model file + usable with `run.sh --model_name=examples/arm/example_modules/add.py`. -### Online Tutorial +### Utility examples and guides -We also have a [tutorial](https://pytorch.org/executorch/stable/backends-arm-ethos-u) explaining the steps performed in these -scripts, expected results, possible problems and more. It is a step-by-step guide -you can follow to better understand this delegate. +- [ethos-u-porting-guide.md](ethos-u-porting-guide.md) - Notes for adapting + the example Ethos-U runtime integration to another target. +- [export_standalone_tosa_graph.py](export_standalone_tosa_graph.py) - + Example of exporting a standalone TOSA graph with multiple outputs. +- [visualize.py](visualize.py) - Helper used by `run.sh --model_explorer` to + visualize TOSA or PTE graphs. 
-### Project Templates +## Project Templates These project templates provide alternative starting points with different toolchains and build systems: From 96b19af7744debd62f8cac2579a03de18069e36d Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Thu, 28 May 2026 14:20:00 +0200 Subject: [PATCH 061/103] Arm backend: Guard empty cmake arg array in build_executorch (#19840) Avoid expanding extra_cmake_args when the array is empty. Older Bash versions on macOS treat an empty array expansion under set -u as an unbound variable. Append the extra CMake arguments only when the array is non-empty so the script behaves the same on Linux and macOS. Signed-off-by: Erik Lundell --- backends/arm/scripts/build_executorch.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index 5ac2674f964..5ebc0eb46b4 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -96,9 +96,12 @@ cmake_args=( -DEXECUTORCH_BUILD_DEVTOOLS=${build_devtools} -DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump} -DEXECUTORCH_BAREMETAL_SKIP_INSTALL=OFF - "${extra_cmake_args[@]}" ) +if [[ ${#extra_cmake_args[@]} -gt 0 ]]; then + cmake_args+=("${extra_cmake_args[@]}") +fi + if [[ -n "${target_cpu}" ]]; then cmake_args+=(-DTARGET_CPU=${target_cpu}) fi From b903c30c046676c8f38df3caef8e4da44ed2b170 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Thu, 28 May 2026 14:21:37 +0200 Subject: [PATCH 062/103] Arm backend: Fix vgf_quant swin test op-count and test vgf models in trunk job. 
(#19841) --- .github/workflows/trunk.yml | 1 + backends/arm/test/models/test_swin2sr_arm.py | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5a6720cdfad..cca1fe5fe45 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -278,6 +278,7 @@ jobs: matrix: include: - test_arm_backend: test_pytest_ops_vkml + - test_arm_backend: test_pytest_models_vkml - test_arm_backend: test_ootb_tests_vgf fail-fast: false with: diff --git a/backends/arm/test/models/test_swin2sr_arm.py b/backends/arm/test/models/test_swin2sr_arm.py index e4fc6f07950..5fd29943b94 100644 --- a/backends/arm/test/models/test_swin2sr_arm.py +++ b/backends/arm/test/models/test_swin2sr_arm.py @@ -42,6 +42,9 @@ "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 5, "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 6, } +swin2sr_vgf_quant_lowered_outer_graph_ops = { + "torch.ops.higher_order.executorch_call_delegate": 1, +} class TinySwin2SR(torch.nn.Module): @@ -110,7 +113,7 @@ def test_swin2sr_vgf_quant(): quantize=True, run_on_vulkan_runtime=sys.platform == "linux", ) - pipeline.change_args("check_count.exir", swin2sr_int_lowered_outer_graph_ops) + pipeline.change_args("check_count.exir", swin2sr_vgf_quant_lowered_outer_graph_ops) pipeline.run() From acce7cd6f1558132e40edd9a25b12febaf7beb79 Mon Sep 17 00:00:00 2001 From: robert-kalmar Date: Thu, 28 May 2026 17:00:49 +0200 Subject: [PATCH 063/103] NXP Backend: Force backend (NeutronBackend) destructor call before neutronDeInit() (#19795) ### Summary The `NeutronBackend::destroy` function shall be called before the Neutron driver's `neutronDeInit()` function to avoid double free. At this moment the ExecuTorch does not provide means to destroy the backend or the method outside of the method's desctructor. 
### Test plan With the upcoming eIQ Neutron SDK 3.1.2 the nxp-executor-runner crashes, so the existing unit tests cover this problem. cc @JakeStevens @digantdesai @rascani --- .../executor_runner/nxp_executor_runner.cpp | 183 +++++++++--------- 1 file changed, 93 insertions(+), 90 deletions(-) diff --git a/examples/nxp/executor_runner/nxp_executor_runner.cpp b/examples/nxp/executor_runner/nxp_executor_runner.cpp index 65f5831e5c5..52d7c778227 100644 --- a/examples/nxp/executor_runner/nxp_executor_runner.cpp +++ b/examples/nxp/executor_runner/nxp_executor_runner.cpp @@ -384,71 +384,30 @@ int main(int argc, char* argv[]) { torch::executor::MemoryManager memory_manager( &method_allocator, &planned_memory, &tmp_allocator); - Result method = - program->load_method(method_name, &memory_manager); - if (!method.ok()) { - fprintf( - stderr, - "Loading of method (%s) failed with status %" PRIu32 "...\n", - method_name, - (unsigned int)method.error()); - exit(-1); - } - printf("Method loaded...\n"); - - Error status = Error::Ok; - if (!FLAGS_dataset.empty()) { - // Go through entire dataset for this model. - FLAGS_dataset += "/"; - while (dataset = readdir(datasetDir)) { - if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) - continue; - - std::vector inputsData; - inputsData.push_back(FLAGS_dataset + dataset->d_name); - // Set input and call inferrence. - setInputs(method.get(), inputsData); - - status = method->execute(); - if (status != Error::Ok) { - fprintf( - stderr, - "Execution of method %s failed with status %" PRIu32 "...\n", - method_name, - (unsigned int)status); - exit(-1); - } else { - printf("Method executed successfully...\n"); - } - - // Save outputs in binary files. - saveOutputs(method.get(), FLAGS_output, dataset->d_name); - // Print result with highest confidence.
- printOutput(method.get(), FLAGS_output, dataset->d_name); + { + Result method = + program->load_method(method_name, &memory_manager); + if (!method.ok()) { + fprintf( + stderr, + "Loading of method (%s) failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)method.error()); + exit(-1); } - closedir(datasetDir); - } else if (!FLAGS_inputs.empty()) { - std::vector inputPaths; - - // Validate and process inputs and separate into two lists. - processInputs(inputPaths, FLAGS_inputs); - - if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) { - // Inputs are in directories - use files in each directory as the inputs. - std::vector inputsData; - for (std::string& inputDir : inputPaths) { - datasetDir = opendir(inputDir.c_str()); - while (dataset = readdir(datasetDir)) { - if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) - continue; - - inputsData.push_back(inputDir + "/" + dataset->d_name); - } - closedir(datasetDir); - - // Sort inputsData to ensure correct input ordering - std::sort(inputsData.begin(), inputsData.end()); - + printf("Method loaded...\n"); + + Error status = Error::Ok; + if (!FLAGS_dataset.empty()) { + // Go through entire dataset for this model. + FLAGS_dataset += "/"; + while (dataset = readdir(datasetDir)) { + if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) + continue; + + std::vector inputsData; + inputsData.push_back(FLAGS_dataset + dataset->d_name); + // Set input and call inferrence. setInputs(method.get(), inputsData); status = method->execute(); @@ -463,37 +422,81 @@ int main(int argc, char* argv[]) { printf("Method executed successfully...\n"); } - if (inputDir.back() == '/') - inputDir.pop_back(); - - auto pos = inputDir.find_last_of('/'); - if (pos != std::string::npos) - inputDir = inputDir.substr(pos + 1); - // Save outputs in binary files. 
- saveOutputs(method.get(), FLAGS_output, inputDir.c_str()); - inputsData.clear(); + saveOutputs(method.get(), FLAGS_output, dataset->d_name); + // Print result with highest confidence. + printOutput(method.get(), FLAGS_output, dataset->d_name); } - } else { - // Inputs are files. - setInputs(method.get(), inputPaths); - - status = method->execute(); - if (status != Error::Ok) { - fprintf( - stderr, - "Execution of method %s failed with status %" PRIu32 "...\n", - method_name, - (unsigned int)status); - exit(-1); + closedir(datasetDir); + } else if (!FLAGS_inputs.empty()) { + std::vector inputPaths; + + // Validate and process inputs and separate into two lists. + processInputs(inputPaths, FLAGS_inputs); + + if (std::all_of(inputPaths.begin(), inputPaths.end(), isDirectory)) { + // Inputs are in directories - use files in each directory as the + // inputs. + std::vector inputsData; + for (std::string& inputDir : inputPaths) { + datasetDir = opendir(inputDir.c_str()); + while (dataset = readdir(datasetDir)) { + if (!strcmp(dataset->d_name, ".") || !strcmp(dataset->d_name, "..")) + continue; + + inputsData.push_back(inputDir + "/" + dataset->d_name); + } + closedir(datasetDir); + + // Sort inputsData to ensure correct input ordering + std::sort(inputsData.begin(), inputsData.end()); + + setInputs(method.get(), inputsData); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } + + if (inputDir.back() == '/') + inputDir.pop_back(); + + auto pos = inputDir.find_last_of('/'); + if (pos != std::string::npos) + inputDir = inputDir.substr(pos + 1); + + // Save outputs in binary files. + saveOutputs(method.get(), FLAGS_output, inputDir.c_str()); + inputsData.clear(); + } } else { - printf("Method executed successfully...\n"); - } + // Inputs are files. 
+ setInputs(method.get(), inputPaths); + + status = method->execute(); + if (status != Error::Ok) { + fprintf( + stderr, + "Execution of method %s failed with status %" PRIu32 "...\n", + method_name, + (unsigned int)status); + exit(-1); + } else { + printf("Method executed successfully...\n"); + } - // Save outputs in binary files. - saveOutputs(method.get(), FLAGS_output); + // Save outputs in binary files. + saveOutputs(method.get(), FLAGS_output); + } } - } + } // Destruct the method object before destroying the Neutron Device. printf("Finished...\n"); From 463fbe4407eee8f5f3c70fed1a50f9d8afb206c8 Mon Sep 17 00:00:00 2001 From: Adrian Lundell <36153706+AdrianLundell@users.noreply.github.com> Date: Thu, 28 May 2026 18:41:05 +0200 Subject: [PATCH 064/103] Add general Aten lowering pass (#19837) Adds a simple pass for replacing single Aten ops with corresponding dialect ops to be reused across multiple backends. Signed-off-by: Adrian Lundell --- backends/transforms/aten_to_dialect_pass.py | 138 ++++++++++ backends/transforms/targets.bzl | 25 ++ .../test/test_aten_to_dialect_pass.py | 239 ++++++++++++++++++ 3 files changed, 402 insertions(+) create mode 100644 backends/transforms/aten_to_dialect_pass.py create mode 100644 backends/transforms/test/test_aten_to_dialect_pass.py diff --git a/backends/transforms/aten_to_dialect_pass.py b/backends/transforms/aten_to_dialect_pass.py new file mode 100644 index 00000000000..f31df73bc58 --- /dev/null +++ b/backends/transforms/aten_to_dialect_pass.py @@ -0,0 +1,138 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ + +import traceback +from collections.abc import Callable +from dataclasses import dataclass +from typing import ClassVar, TypeAlias + +import torch + +from executorch.backends.xnnpack._passes.xnnpack_pass import ExportPass + +from executorch.exir import ExportedProgram +from torch.fx.node import Target +from torch.fx.passes.infra.pass_manager import PassResult + + +# Expected type to be returned by substitution functions. +@dataclass +class DialectNodeSpec: + op: Target + args: tuple + kwargs: dict = None + + +# Expected type to be used for substitution functions +SubstitutionFn: TypeAlias = Callable[ + [torch.fx.Node, torch.export.ExportedProgram], DialectNodeSpec | None +] + + +class AtenToDialectPass(ExportPass): + """ + General pass to convert ops 1-1 from ATen to a specific dialect. + + Usage: + 1. Subclass the pass for a specific dialect + 2. For each ATen target to be substituted, implement a function returning a DialectNodeSpec defining the + corresponding dialect op, or None if the substitution does not apply. + 3. Register each substitution function for the subclass using the decorator register_dialect_substitution + + Only one substitution function can be registered for a given target. + + The pass must be initialized with an exported_program to allow substitution functions to modify placeholders, + e.g. if the dialect ops require additional scratch buffers. + """ + + _DIALECT_SUBSTITUTIONS: ClassVar[dict[Target, SubstitutionFn]] = {} + + def __init__(self, exported_program: ExportedProgram): + super().__init__() + self.exported_program: ExportedProgram = exported_program + + # Ensure each subclass has its own substitution registry. 
+ def __init_subclass__(cls, **kwargs): + super().__init_subclass__(**kwargs) + cls._DIALECT_SUBSTITUTIONS = {} + + @classmethod + def register_dialect_substitution( + cls, target: Target + ) -> Callable[[SubstitutionFn], SubstitutionFn]: + + def decorator(func: SubstitutionFn) -> SubstitutionFn: + if target in cls._DIALECT_SUBSTITUTIONS: + raise RuntimeError( + f"Multiple substitutions registered for the same target in {cls.__name__} are not allowed." + ) + else: + cls._DIALECT_SUBSTITUTIONS[target] = func + return func + + return decorator + + def call(self, graph_module: torch.fx.GraphModule) -> PassResult: + modified = False + + for node in graph_module.graph.nodes: + if node.op != "call_function": + continue + + substitution_func = self._DIALECT_SUBSTITUTIONS.get(node.target, None) + if substitution_func is None: + continue + + dialect_node_spec = substitution_func(node, self.exported_program) + if dialect_node_spec is None: + continue + + modified = True + with graph_module.graph.inserting_before(node): + dialect_node = graph_module.graph.create_node( + "call_function", + target=dialect_node_spec.op, + args=dialect_node_spec.args, + kwargs=dialect_node_spec.kwargs or {}, + ) + + node.replace_all_uses_with(dialect_node) + + # Keep same meta dict for new node and append new trace + dialect_node.meta = node.meta + old_stack_trace = dialect_node.meta.get("stack_trace", "") + dialect_node.meta["stack_trace"] = ( + f"{old_stack_trace}\n{traceback.format_stack()[-2]}" + ) + + graph_module.graph.erase_node(node) + + if modified: + graph_module.graph.eliminate_dead_code() + graph_module.recompile() + graph_module = super().call(graph_module).graph_module + + return PassResult(graph_module, modified) + + def requires(self, graph_module): + self.ops_before = sum( + 1 for node in graph_module.graph.nodes if node.op == "call_function" + ) + return super().requires(graph_module) + + def ensures(self, graph_module: torch.fx.GraphModule) -> bool: + """Ensure that there has 
only been 1-1 substitution of call_function nodes, i.e. that the number of call_function nodes is preserved after the pass.""" + + self.ops_after = sum( + 1 for node in graph_module.graph.nodes if node.op == "call_function" + ) + if self.ops_after != self.ops_before: + raise RuntimeError( + f"{self.__class__.__name__} did not preserve the number of call_function nodes: " + f"before={self.ops_before}, after={self.ops_after}" + ) + + return super().ensures(graph_module) diff --git a/backends/transforms/targets.bzl b/backends/transforms/targets.bzl index 8c3603e293d..36466ec4aa0 100644 --- a/backends/transforms/targets.bzl +++ b/backends/transforms/targets.bzl @@ -176,6 +176,21 @@ def define_common_targets(): ], ) + runtime.python_library( + name = "aten_to_dialect_pass", + srcs = [ + "aten_to_dialect_pass.py", + ], + visibility = [ + "//executorch/backends/...", + ], + deps = [ + "//caffe2:torch", + "//executorch/backends/xnnpack/_passes:xnnpack_passes", + "//executorch/exir:lib", + ], + ) + runtime.python_library( name = "rank_0_to_rank_1", srcs = [ @@ -243,6 +258,16 @@ def define_common_targets(): ], ) + runtime.python_test( + name = "test_aten_to_dialect_pass", + srcs = [ + "test/test_aten_to_dialect_pass.py", + ], + deps = [ + "//caffe2:torch", + ":aten_to_dialect_pass", + ], + ) runtime.python_test( name = "test_rank_0_to_rank_1", diff --git a/backends/transforms/test/test_aten_to_dialect_pass.py b/backends/transforms/test/test_aten_to_dialect_pass.py new file mode 100644 index 00000000000..80dbf210d72 --- /dev/null +++ b/backends/transforms/test/test_aten_to_dialect_pass.py @@ -0,0 +1,239 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import pytest +import torch +from executorch.backends.transforms.aten_to_dialect_pass import ( + AtenToDialectPass, + DialectNodeSpec, +) +from executorch.backends.transforms.utils import create_constant_placeholder +from torch.export import ExportedProgram +from torch.export.graph_signature import InputKind +from torch.fx import Node + + +class AddModel(torch.nn.Module): + def forward(self, x, y): + return torch.ops.aten.add.Tensor(x, y) + + +class AddAlphaModel(torch.nn.Module): + def forward(self, x, y): + return torch.ops.aten.add.Tensor(x, y, alpha=2) + + +def _count_target(graph_module: torch.fx.GraphModule, target) -> int: + return sum( + 1 + for node in graph_module.graph.nodes + if node.op == "call_function" and node.target == target + ) + + +def _get_target_node(graph_module: torch.fx.GraphModule, target) -> Node: + nodes = [ + node + for node in graph_module.graph.nodes + if node.op == "call_function" and node.target == target + ] + assert len(nodes) == 1 + return nodes[0] + + +def _export_add_model() -> ExportedProgram: + return torch.export.export( + AddModel().eval(), (torch.randn(2, 3), torch.randn(2, 3)), strict=True + ) + + +def _export_add_alpha_model() -> ExportedProgram: + return torch.export.export( + AddAlphaModel().eval(), (torch.randn(2, 3), torch.randn(2, 3)), strict=True + ) + + +def test_rewrites_node_when_substitution_matches() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_with_sub( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args) + + exported_program = _export_add_model() + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert result.modified + assert _count_target(result.graph_module, torch.ops.aten.add.Tensor) == 0 + assert 
_count_target(result.graph_module, torch.ops.aten.sub.Tensor) == 1 + + +def test_substitution_can_add_state_dict_placeholder() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_rhs_with_constant( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + first_placeholder = next( + graph_node + for graph_node in node.graph.nodes + if graph_node.op == "placeholder" + ) + with node.graph.inserting_before(first_placeholder): + const_node = create_constant_placeholder( + exp_program=exported_program, + graph=node.graph, + name="test_constant", + kind=InputKind.PARAMETER, + data=torch.ones(2, 3), + ) + return DialectNodeSpec(torch.ops.aten.add.Tensor, (node.args[0], const_node)) + + exported_program = _export_add_model() + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert result.modified + assert "test_constant" in exported_program.state_dict + assert torch.equal(exported_program.state_dict["test_constant"], torch.ones(2, 3)) + assert ( + exported_program.graph_signature.inputs_to_parameters["test_constant"] + == "test_constant" + ) + add_node = _get_target_node(result.graph_module, torch.ops.aten.add.Tensor) + assert add_node.args[1].name == "test_constant" + + x = torch.full((2, 3), 2.0) + y = torch.full((2, 3), 5.0) + torch.testing.assert_close(exported_program.module()(x, y), x + torch.ones_like(x)) + + +def test_substitution_can_change_kwargs() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_alpha( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.add.Tensor, node.args, {"alpha": 3}) + + exported_program = _export_add_alpha_model() + result = 
_TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert result.modified + add_node = _get_target_node(result.graph_module, torch.ops.aten.add.Tensor) + assert add_node.kwargs["alpha"] == 3 + + x = torch.full((2, 3), 2.0) + y = torch.full((2, 3), 5.0) + torch.testing.assert_close(exported_program.module()(x, y), x + 3 * y) + + +def test_preserves_meta_when_substitution_matches() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def replace_add_with_sub( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args) + + exported_program = _export_add_model() + add_node = _get_target_node( + exported_program.graph_module, torch.ops.aten.add.Tensor + ) + add_node.meta["test_sentinel"] = "kept" + add_node.meta["stack_trace"] = "original stack" + + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + sub_node = _get_target_node(result.graph_module, torch.ops.aten.sub.Tensor) + assert sub_node.meta["test_sentinel"] == "kept" + assert sub_node.meta["stack_trace"].startswith("original stack\n") + assert sub_node.meta["stack_trace"] != "original stack" + + +def test_keeps_node_when_substitution_returns_none() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def do_not_replace( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del node, exported_program + return None + + exported_program = _export_add_model() + result = _TestAtenToDialectPass(exported_program=exported_program).call( + exported_program.graph_module + ) + + assert not result.modified + assert _count_target(result.graph_module, torch.ops.aten.add.Tensor) == 1 + 
assert _count_target(result.graph_module, torch.ops.aten.sub.Tensor) == 0 + + +def test_raises_when_duplicate_substitution_is_registered() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def first_replace( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.sub.Tensor, node.args) + + with pytest.raises(RuntimeError, match="Multiple substitutions registered"): + + @_TestAtenToDialectPass.register_dialect_substitution(torch.ops.aten.add.Tensor) + def second_replace( + node: Node, exported_program: ExportedProgram + ) -> DialectNodeSpec | None: + del exported_program + return DialectNodeSpec(torch.ops.aten.mul.Tensor, node.args) + + +def test_ensures_raises_when_call_function_count_changes() -> None: + class _TestAtenToDialectPass(AtenToDialectPass): + pass + + exported_program = _export_add_model() + graph_module = exported_program.graph_module + test_pass = _TestAtenToDialectPass(exported_program=exported_program) + test_pass.requires(graph_module) + + placeholders = [ + node for node in graph_module.graph.nodes if node.op == "placeholder" + ] + output_node = next(node for node in graph_module.graph.nodes if node.op == "output") + with graph_module.graph.inserting_before(output_node): + graph_module.graph.create_node( + "call_function", + target=torch.ops.aten.sub.Tensor, + args=tuple(placeholders), + kwargs={}, + ) + + with pytest.raises(RuntimeError, match="did not preserve"): + test_pass.ensures(graph_module) From c8c04e4b6e3aa7b11574374484fb18c404daefc6 Mon Sep 17 00:00:00 2001 From: Hansong Zhang <107070759+kirklandsign@users.noreply.github.com> Date: Thu, 28 May 2026 09:59:29 -0700 Subject: [PATCH 065/103] Remove `google-java-format` from CI lint infrastructure Differential Revision: D106575515 Pull Request resolved: https://github.com/pytorch/executorch/pull/19831 
--- .ci/docker/common/install_linter.sh | 4 --- .github/workflows/lint.yml | 46 ----------------------------- 2 files changed, 50 deletions(-) diff --git a/.ci/docker/common/install_linter.sh b/.ci/docker/common/install_linter.sh index 52d2d262685..4a796a72d54 100755 --- a/.ci/docker/common/install_linter.sh +++ b/.ci/docker/common/install_linter.sh @@ -13,7 +13,3 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" # NB: Install all linter dependencies, the caching of lintrunner init could be # done after Executorch becomes public pip_install -r requirements-lintrunner.txt - -# Install google-java-format -curl -L --retry 3 --retry-all-errors https://github.com/google/google-java-format/releases/download/v1.23.0/google-java-format_linux-x86-64 > /opt/google-java-format -chmod +x /opt/google-java-format diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b26247d2333..b21cc527b8d 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -125,49 +125,3 @@ jobs: uses: ./.github/workflows/_link_check.yml with: ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - - android-java-format: - runs-on: ubuntu-latest - permissions: - contents: read - steps: - - uses: actions/checkout@v4 - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - - - uses: actions/setup-java@v4 - with: - distribution: 'temurin' - java-version: '17' - - - name: Check Java formatting - run: | - GOOGLE_JAVA_FORMAT_VERSION="1.24.0" - curl -sSfL "https://github.com/google/google-java-format/releases/download/v${GOOGLE_JAVA_FORMAT_VERSION}/google-java-format-${GOOGLE_JAVA_FORMAT_VERSION}-all-deps.jar" \ - -o /tmp/google-java-format.jar - - FILES_NEEDS_FORMAT=$(find extension/android/executorch_android/src/main/java/org/pytorch/executorch \ - extension/android/executorch_android/src/main/java/org/pytorch/executorch/extension/llm \ - 
extension/android/executorch_android/src/main/java/org/pytorch/executorch/annotations \ - extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch \ - extension/benchmark/android/benchmark/app/src/main/java/org/pytorch/minibench \ - extension/benchmark/android/benchmark/app/src/androidTest/java/org/pytorch/minibench \ - -type f -name "*.java" 2>/dev/null | \ - xargs -r java -jar /tmp/google-java-format.jar -n) - - if [ -n "$FILES_NEEDS_FORMAT" ]; then - echo "Warning: The following files need formatting:" - echo "$FILES_NEEDS_FORMAT" - echo "" - echo "Please use google-java-format from https://github.com/google/google-java-format/releases/" - echo "" - echo "To fix, run one of these commands:" - echo " # Using xargs (recommended):" - echo " find -type f -name '*.java' | xargs google-java-format -i" - echo "" - echo " # Or format specific files:" - echo "$FILES_NEEDS_FORMAT" | while IFS= read -r file; do - echo " google-java-format -i \"$file\"" - done - exit 1 - fi From 000d81029005954628a59cf86c292fefe7d04e85 Mon Sep 17 00:00:00 2001 From: Gasoonjia Date: Thu, 28 May 2026 14:04:39 -0700 Subject: [PATCH 066/103] [ET Device Support] Define et_copy runtime h2d and d2h copy ops (#19858) clone https://github.com/pytorch/executorch/pull/18729 due to bot crash --- backends/cuda/runtime/shims/tests/targets.bzl | 24 ++ .../shims/tests/test_op__device_copy.cpp | 195 ++++++++++++ kernels/portable/cpu/op__device_copy.cpp | 154 +++++++++ kernels/portable/functions.yaml | 10 + kernels/test/op__device_copy_test.cpp | 297 ++++++++++++++++++ kernels/test/targets.bzl | 14 +- shim_et/xplat/executorch/codegen/codegen.bzl | 1 + .../kernels/portable/op_registration_util.bzl | 6 + 8 files changed, 698 insertions(+), 3 deletions(-) create mode 100644 backends/cuda/runtime/shims/tests/test_op__device_copy.cpp create mode 100644 kernels/portable/cpu/op__device_copy.cpp create mode 100644 kernels/test/op__device_copy_test.cpp diff --git 
a/backends/cuda/runtime/shims/tests/targets.bzl b/backends/cuda/runtime/shims/tests/targets.bzl index b68043f7feb..a54c47e979d 100644 --- a/backends/cuda/runtime/shims/tests/targets.bzl +++ b/backends/cuda/runtime/shims/tests/targets.bzl @@ -42,3 +42,27 @@ def define_common_targets(): cuda_shim_cpp_unittest("aoti_torch_new_tensor_handle") cuda_shim_cpp_unittest("aoti_torch_item_bool") cuda_shim_cpp_unittest("aoti_torch_assign_tensors_out") + + cpp_unittest( + name = "test_op__device_copy", + srcs = ["test_op__device_copy.cpp"], + deps = [ + "//executorch/backends/cuda/runtime:cuda_backend", + "//executorch/kernels/portable:generated_lib", + "//executorch/kernels/portable:generated_lib_headers", + "//executorch/kernels/portable/cpu:op__device_copy", + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/core/exec_aten:lib", + "//executorch/runtime/core/portable_type:portable_type", + "//executorch/runtime/kernel:kernel_runtime_context", + "//executorch/runtime/platform:platform", + ], + external_deps = [ + ("cuda", None, "cuda-lazy"), + ], + preprocessor_flags = ["-DCUDA_AVAILABLE=1"], + keep_gpu_sections = True, + remote_execution = re_test_utils.remote_execution( + platform = "gpu-remote-execution", + ), + ) diff --git a/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp new file mode 100644 index 00000000000..4e5c5a099b7 --- /dev/null +++ b/backends/cuda/runtime/shims/tests/test_op__device_copy.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(__has_feature) && __has_feature(address_sanitizer)) || \ + defined(__SANITIZE_ADDRESS__) +#include +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 1 +#else +#define EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE 0 +#endif + +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::TensorShapeDynamism; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +namespace { + +struct CudaDeleter { + void operator()(void* ptr) const { + if (ptr != nullptr) { + cudaFree(ptr); + } + } +}; + +using CudaPtr = std::unique_ptr; + +CudaPtr allocate_cuda(size_t nbytes) { + void* ptr = nullptr; + const cudaError_t err = cudaMalloc(&ptr, nbytes); + EXPECT_EQ(err, cudaSuccess) << "cudaMalloc failed"; + return CudaPtr(ptr); +} + +bool is_cuda_available() { +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_disable(); +#endif + int device_count = 0; + const cudaError_t err = cudaGetDeviceCount(&device_count); +#if EXECUTORCH_CUDA_DEVICE_COPY_HAS_LSAN_INTERFACE + __lsan_enable(); +#endif + return err == cudaSuccess && device_count > 0; +} + +std::vector copy_cuda_to_host(const void* device_ptr, size_t numel) { + std::vector host(numel); + const cudaError_t err = cudaMemcpy( + host.data(), device_ptr, numel * sizeof(float), cudaMemcpyDeviceToHost); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy D2H failed"; + return host; +} + +void copy_host_to_cuda(const std::vector& host, void* device_ptr) { + const cudaError_t err = cudaMemcpy( + device_ptr, + host.data(), + host.size() * sizeof(float), + cudaMemcpyHostToDevice); + EXPECT_EQ(err, cudaSuccess) << "cudaMemcpy H2D failed"; +} + +class 
CudaDeviceCopyOpTest : public ::testing::Test { + protected: + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + ASSERT_NE(get_device_allocator(DeviceType::CUDA), nullptr) + << "Linking cuda_backend should auto-register the CUDA allocator"; + } + + void SetUp() override { + if (!is_cuda_available()) { + GTEST_SKIP() << "CUDA not available, skipping CUDA device copy op tests"; + } + } + + Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_h2d_copy_outf(context_, self, out); + } + + Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_d2h_copy_outf(context_, self, out); + } + + KernelRuntimeContext context_; +}; + +} // namespace + +TEST_F(CudaDeviceCopyOpTest, H2dCopyUsesRegisteredCudaAllocator) { + std::vector src_data = {1.0f, 2.0f, 3.0f, 4.0f}; + auto device_data = allocate_cuda(src_data.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + + int32_t sizes[] = {static_cast(src_data.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_h2d_copy_out(src, dst); + + EXPECT_EQ(context_.failure_state(), Error::Ok); + EXPECT_EQ(&result, &dst); + EXPECT_EQ(copy_cuda_to_host(device_data.get(), src_data.size()), src_data); +} + +TEST_F(CudaDeviceCopyOpTest, D2hCopyUsesRegisteredCudaAllocator) { + const std::vector expected = {5.0f, 6.0f, 7.0f, 8.0f}; + auto device_data = allocate_cuda(expected.size() * sizeof(float)); + ASSERT_NE(device_data.get(), nullptr); + copy_host_to_cuda(expected, device_data.get()); + + std::vector dst_data(expected.size(), 
0.0f); + int32_t sizes[] = {static_cast(expected.size())}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + device_data.get(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data.data(), + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_d2h_copy_out(src, dst); + + EXPECT_EQ(context_.failure_state(), Error::Ok); + EXPECT_EQ(&result, &dst); + EXPECT_EQ(dst_data, expected); +} diff --git a/kernels/portable/cpu/op__device_copy.cpp b/kernels/portable/cpu/op__device_copy.cpp new file mode 100644 index 00000000000..5e1a51a83be --- /dev/null +++ b/kernels/portable/cpu/op__device_copy.cpp @@ -0,0 +1,154 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Runtime kernels for et_copy._h2d_copy and et_copy._d2h_copy ops. + * + * These ops transfer tensor data between CPU and device memory using + * the DeviceAllocator interface. The device type is inferred from the + * tensor metadata (out.device_type() for H2D, self.device_type() for D2H), + * which was set during AOT serialization by PropagateDevicePass. + */ + +#include +#include +#include + +namespace torch { +namespace executor { +namespace native { + +using Tensor = executorch::aten::Tensor; +using DeviceAllocator = executorch::runtime::DeviceAllocator; +using Error = executorch::runtime::Error; + +/** + * Copies tensor data from host (CPU) memory to device memory. + * + * self: source tensor on CPU + * out: destination tensor on device (memory-planned by runtime) + * + * The device type and index are inferred from out's TensorImpl metadata. 
+ */ +Tensor& +_h2d_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = out.unsafeGetTensorImpl()->device_type(); + auto device_index = out.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + self.unsafeGetTensorImpl()->device_type() == + executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: source tensor must be on CPU, got device_type=%d", + static_cast(self.unsafeGetTensorImpl()->device_type())); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_h2d_copy: destination tensor must be on a non-CPU device"); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_h2d_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = + executorch::runtime::get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_h2d_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_host_to_device( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_h2d_copy: copy_host_to_device failed"); + + return out; +} + +/** + * Copies tensor data from device memory to host (CPU) memory. + * + * self: source tensor on device + * out: destination tensor on CPU (memory-planned by runtime) + * + * The device type and index are inferred from self's TensorImpl metadata. 
+ */ +Tensor& +_d2h_copy_out(KernelRuntimeContext& ctx, const Tensor& self, Tensor& out) { + auto device_type = self.unsafeGetTensorImpl()->device_type(); + auto device_index = self.unsafeGetTensorImpl()->device_index(); + + ET_KERNEL_CHECK_MSG( + ctx, + device_type != executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: source tensor must be on a non-CPU device"); + + ET_KERNEL_CHECK_MSG( + ctx, + out.unsafeGetTensorImpl()->device_type() == + executorch::runtime::etensor::DeviceType::CPU, + InvalidArgument, + out, + "_d2h_copy: destination tensor must be on CPU, got device_type=%d", + static_cast(out.unsafeGetTensorImpl()->device_type())); + + auto nbytes = self.nbytes(); + ET_KERNEL_CHECK_MSG( + ctx, + nbytes == out.nbytes(), + InvalidArgument, + out, + "_d2h_copy: size mismatch: self.nbytes()=%zu, out.nbytes()=%zu", + nbytes, + out.nbytes()); + + DeviceAllocator* allocator = + executorch::runtime::get_device_allocator(device_type); + ET_KERNEL_CHECK_MSG( + ctx, + allocator != nullptr, + NotFound, + out, + "_d2h_copy: no device allocator registered for device_type=%d", + static_cast(device_type)); + + Error err = allocator->copy_device_to_host( + out.mutable_data_ptr(), self.const_data_ptr(), nbytes, device_index); + ET_KERNEL_CHECK_MSG( + ctx, + err == Error::Ok, + Internal, + out, + "_d2h_copy: copy_device_to_host failed"); + + return out; +} + +} // namespace native +} // namespace executor +} // namespace torch diff --git a/kernels/portable/functions.yaml b/kernels/portable/functions.yaml index 620d97d050f..ecf62ee3606 100644 --- a/kernels/portable/functions.yaml +++ b/kernels/portable/functions.yaml @@ -1045,6 +1045,16 @@ - arg_meta: null kernel_name: torch::executor::zeros_out +- func: et_copy::_h2d_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: torch::executor::_h2d_copy_out + +- func: et_copy::_d2h_copy.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
+ kernels: + - arg_meta: null + kernel_name: torch::executor::_d2h_copy_out + - func: dim_order_ops::_empty_dim_order.out(int[] size, *, int[]? dim_order=None, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/kernels/test/op__device_copy_test.cpp b/kernels/test/op__device_copy_test.cpp new file mode 100644 index 00000000000..d345642bd37 --- /dev/null +++ b/kernels/test/op__device_copy_test.cpp @@ -0,0 +1,297 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/** + * Tests for et_copy._h2d_copy.out and et_copy._d2h_copy.out runtime kernels. + * + * Uses a MockDeviceAllocator to verify that the kernels correctly call + * copy_host_to_device / copy_device_to_host via the DeviceAllocator interface, + * and that device type is inferred from tensor metadata. + */ + +#include + +#include // Declares the operator +#include +#include +#include +#include +#include + +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::aten::TensorImpl; +using executorch::runtime::DeviceAllocator; +using executorch::runtime::Error; +using executorch::runtime::get_device_allocator; +using executorch::runtime::register_device_allocator; +using executorch::runtime::Result; +using executorch::runtime::etensor::DeviceIndex; +using executorch::runtime::etensor::DeviceType; + +using TensorShapeDynamism = executorch::runtime::TensorShapeDynamism; + +namespace { + +class MockDeviceAllocator : public DeviceAllocator { + public: + Result allocate( + size_t nbytes, + DeviceIndex index, + size_t alignment = kDefaultAlignment) override { + return Error::NotSupported; + } + + void deallocate(void* ptr, DeviceIndex index) override {} + + Error copy_host_to_device( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + h2d_call_count_++; + 
last_h2d_nbytes_ = nbytes; + last_h2d_device_index_ = index; + // Actually copy so we can verify data + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + Error copy_device_to_host( + void* dst, + const void* src, + size_t nbytes, + DeviceIndex index) override { + d2h_call_count_++; + last_d2h_nbytes_ = nbytes; + last_d2h_device_index_ = index; + std::memcpy(dst, src, nbytes); + return Error::Ok; + } + + DeviceType device_type() const override { + return DeviceType::CUDA; + } + + int h2d_call_count_ = 0; + int d2h_call_count_ = 0; + size_t last_h2d_nbytes_ = 0; + size_t last_d2h_nbytes_ = 0; + DeviceIndex last_h2d_device_index_ = -1; + DeviceIndex last_d2h_device_index_ = -1; +}; + +} // namespace + +static MockDeviceAllocator g_mock_cuda; + +class OpDeviceCopyTest : public OperatorTest { + protected: + Tensor& op_h2d_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_h2d_copy_outf(context_, self, out); + } + + Tensor& op_d2h_copy_out(const Tensor& self, Tensor& out) { + return torch::executor::et_copy::_d2h_copy_outf(context_, self, out); + } + + static void SetUpTestSuite() { + executorch::runtime::runtime_init(); + if (get_device_allocator(DeviceType::CUDA) == nullptr) { + register_device_allocator(&g_mock_cuda); + } + } + + void SetUp() override { + OperatorTest::SetUp(); + g_mock_cuda.h2d_call_count_ = 0; + g_mock_cuda.d2h_call_count_ = 0; + g_mock_cuda.last_h2d_nbytes_ = 0; + g_mock_cuda.last_d2h_nbytes_ = 0; + g_mock_cuda.last_h2d_device_index_ = -1; + g_mock_cuda.last_d2h_device_index_ = -1; + } +}; + +TEST_F(OpDeviceCopyTest, H2dCopyCopiesDataAndCallsAllocator) { + // Set up a CPU source tensor with known data. 
+ float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Set up a CUDA destination tensor (simulated with host memory). + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_h2d_copy_out(src, dst); + + // Verify the allocator was called correctly. + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 0); + + // Verify data was copied (mock does a real memcpy). + EXPECT_EQ(dst_data[0], 1.0f); + EXPECT_EQ(dst_data[1], 2.0f); + EXPECT_EQ(dst_data[2], 3.0f); + EXPECT_EQ(dst_data[3], 4.0f); + + // Verify return value is the out tensor. + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, D2hCopyCopiesDataAndCallsAllocator) { + // Set up a CUDA source tensor with known data. + float src_data[] = {5.0f, 6.0f, 7.0f, 8.0f}; + int32_t sizes[] = {4}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor src(&src_impl); + + // Set up a CPU destination tensor. + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f}; + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor dst(&dst_impl); + + Tensor& result = op_d2h_copy_out(src, dst); + + // Verify the allocator was called correctly. 
+ EXPECT_EQ(g_mock_cuda.d2h_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_d2h_nbytes_, 4 * sizeof(float)); + EXPECT_EQ(g_mock_cuda.last_d2h_device_index_, 0); + + // Verify data was copied. + EXPECT_EQ(dst_data[0], 5.0f); + EXPECT_EQ(dst_data[1], 6.0f); + EXPECT_EQ(dst_data[2], 7.0f); + EXPECT_EQ(dst_data[3], 8.0f); + + EXPECT_EQ(&result, &dst); +} + +TEST_F(OpDeviceCopyTest, H2dCopyWithDeviceIndex1) { + // Verify device_index is correctly forwarded to the allocator. + float src_data[] = {1.0f}; + float dst_data[] = {0.0f}; + int32_t sizes[] = {1}; + uint8_t dim_order[] = {0}; + int32_t strides[] = {1}; + + TensorImpl src_impl( + ScalarType::Float, + 1, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + // Device index = 1 (e.g., cuda:1) + TensorImpl dst_impl( + ScalarType::Float, + 1, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 1); + Tensor dst(&dst_impl); + + op_h2d_copy_out(src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_device_index_, 1); +} + +TEST_F(OpDeviceCopyTest, H2dCopyMultidimensionalTensor) { + // Test with a 2D tensor [2, 3]. 
+ float src_data[] = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; + float dst_data[] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}; + int32_t sizes[] = {2, 3}; + uint8_t dim_order[] = {0, 1}; + int32_t strides[] = {3, 1}; + + TensorImpl src_impl( + ScalarType::Float, + 2, + sizes, + src_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CPU, + 0); + Tensor src(&src_impl); + + TensorImpl dst_impl( + ScalarType::Float, + 2, + sizes, + dst_data, + dim_order, + strides, + TensorShapeDynamism::STATIC, + DeviceType::CUDA, + 0); + Tensor dst(&dst_impl); + + op_h2d_copy_out(src, dst); + + EXPECT_EQ(g_mock_cuda.h2d_call_count_, 1); + EXPECT_EQ(g_mock_cuda.last_h2d_nbytes_, 6 * sizeof(float)); + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(dst_data[i], src_data[i]); + } +} diff --git a/kernels/test/targets.bzl b/kernels/test/targets.bzl index bc51e336cb8..5212d691c5b 100644 --- a/kernels/test/targets.bzl +++ b/kernels/test/targets.bzl @@ -1,14 +1,14 @@ load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") load("@fbsource//xplat/executorch/kernels/test:util.bzl", "codegen_function_header_wrapper", "op_test") -def _common_op_test(name, kernels): +def _common_op_test(name, kernels, deps = []): """ Defines test targets in format of _op__test For ATen kernel testing, let's use portable functions.yaml for tested ops. """ for kernel in kernels: - deps = [":function_header_wrapper_{}".format(kernel)] - op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = deps) + op_deps = [":function_header_wrapper_{}".format(kernel)] + deps + op_test(name, kernel_name = kernel, use_kernel_prefix = True, deps = op_deps) def define_common_targets(): """Defines targets that should be shared between fbcode and xplat. 
@@ -177,6 +177,14 @@ def define_common_targets(): _common_op_test("op__clone_dim_order_test", ["aten", "portable"]) _common_op_test("op__conj_physical_test", ["aten", "portable"]) _common_op_test("op__adaptive_avg_pool2d_test", ["aten", "portable"]) + _common_op_test( + "op__device_copy_test", + ["portable"], + deps = [ + "//executorch/runtime/core:device_allocator", + "//executorch/runtime/platform:platform", + ], + ) _common_op_test("op_abs_test", ["aten", "portable"]) _common_op_test("op_acos_test", ["aten", "portable"]) _common_op_test("op_acosh_test", ["aten", "portable"]) diff --git a/shim_et/xplat/executorch/codegen/codegen.bzl b/shim_et/xplat/executorch/codegen/codegen.bzl index 5ffa7b65a36..318996784a1 100644 --- a/shim_et/xplat/executorch/codegen/codegen.bzl +++ b/shim_et/xplat/executorch/codegen/codegen.bzl @@ -535,6 +535,7 @@ def get_portable_lib_deps(): "//executorch/kernels/portable/cpu:vec_ops", "//executorch/kernels/portable/cpu/pattern:all_deps", "//executorch/kernels/portable/cpu/util:all_deps", + "//executorch/runtime/core:device_allocator", ] def get_optimized_lib_deps(): diff --git a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl index cc2a0f78c75..479f3913f8f 100644 --- a/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl +++ b/shim_et/xplat/executorch/kernels/portable/op_registration_util.bzl @@ -1405,6 +1405,12 @@ ATEN_OPS = ( "//executorch/kernels/portable/cpu/util:copy_ops_util", ], ), + op_target( + name = "op__device_copy", + deps = [ + "//executorch/runtime/core:device_allocator", + ], + ), ) # Operators that are not listed in `functions.yaml` (i.e., operators listed in From 42581f1b09167b8dbed119eabd240354bf8f6108 Mon Sep 17 00:00:00 2001 From: Mergen Nachin Date: Thu, 28 May 2026 17:44:19 -0400 Subject: [PATCH 067/103] =?UTF-8?q?Add=20GGUF=20=E2=86=92=20MLX=20export?= =?UTF-8?q?=20support=20for=20Gemma=204=2031B=20(#19829)?= 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enable loading GGUF files (e.g. Q4_K_M) and exporting to the MLX backend. Three areas of change: GGUF loader (gguf_loader.py): - Add MLX backend support alongside CUDA - Keep embedding quantized for MLX (QuantizedEmbeddingHandler supports quantized gather natively, unlike CUDA's Int4Tensor) - Fix stale docstring references to Int4TilePackedTo4dTensor/tinygemm MLX backend (op_helpers.py, patterns.py): - Accept group_size=16 in parse_dequant_node for GGUF Q6_K tensors - For group_size < 32, emit DequantizeNode + TransposeNode + AddmmNode instead of QuantizedMatmulNode, since MLX Metal kernels are only instantiated for group_size >= 32. Weights stay packed as int8 in the .pte file and are dequantized on-device at runtime — same strategy CUDA/Inductor uses (separate Triton dequant + cuBLAS mm). Packer (pack_mlx.py): - Add 16 to supported group sizes so Q6_K IntxUnpackedToInt8Tensor passes through to export unchanged Tests (test_ops.py): - Add group_size=16 configs for int8, int4, and no-bias variants Test Plan: Export and run this model https://huggingface.co/unsloth/gemma-4-31B-it-GGUF/blob/main/gemma-4-31B-it-Q4_K_M.gguf On M1 32GB machine (exported on Linux A100) ``` (executorch_dev) mnachin@mnachin-mbp executorch % ./cmake-out/examples/models/gemma4_31b/gemma4_31b_runner \ --model_path /Users/mnachin/repos/models/gemma-4-31B-it-GGUF/model.pte \ --tokenizer_path /Users/mnachin/repos/models/gemma-4-31B-it-HQQ-INT4/tokenizer.json \ --prompt "Tell me a joke about RAM usage" \ --max_new_tokens 128 \ --temperature 0.8 I tokenizers:regex.cpp:27] Registering override fallback regex WARNING: All log messages before absl::InitializeLog() is called are written to STDERR E0000 00:00:1779926968.603672 54889180 re2.cc:237] Error parsing '((\|ool\|\>1\x00\x00\ �\|\|\<\|tool_response\>|\<\|think\|\>|\x0...': invalid UTF-8 I tokenizers:re2_regex.cpp:27] Re2 failed to compile regex: 
((\|ool\|\>1\x00\x00\ �\|\|\<\|tool_response\>|\<\|think\|\>|\x00\x00\\\<|\|\|\<\|\"\|\>|all\|\>j\x00\x00\\|\|\<\|turn\>|\|\<\|image\>|\<\|$ I tokenizers:regex_lookahead.cpp:27] Creating PCRE2 regex I tokenizers:pcre2_regex.cpp:48] PCRE2 UTF-8 validation failed at offset 27: UTF-8 error: byte 2 top bits not 0x80. Retrying without UTF flags. Loading model... Prompt tokens: 23 Why did the computer go to therapy? Because it had too many **unresolved dependencies** and it just couldn't stop **dwelling on the past**... but it forgot everything the moment it took a nap. PyTorchObserver {"prefill_token_per_sec":2.49539,"decode_token_per_sec":0.0880671,"prompt_tokens":23,"generated_tokens":44,"model_load_start_ms":1779926968052,"model_load_end_ms":1779926982494,"inference_start_ms":1779926982497,"inference_end_ms":1779927491333,"prompt_eval_end_ms":1779926991714,"first_token_ms":1779926991714,"aggregate_sampling_time_ms":0,"SCALING_FACTOR_UNITS_PER_SECOND":1000} ``` For reference, here's the this model: https://huggingface.co/SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4 ``` (executorch_dev) mnachin@mnachin-mbp executorch % ./cmake-out/examples/models/gemma4_31b/gemma4_31b_runner \ --model_path /Users/mnachin/repos/models/gemma-4-31B-it-HQQ-INT4/model.pte \ --tokenizer_path /Users/mnachin/repos/models/gemma-4-31B-it-HQQ-INT4/tokenizer.json \ --prompt "Tell me a joke about RAM usage" \ --max_new_tokens 128 \ --temperature 0.8 I tokenizers:regex.cpp:27] Registering override fallback regex WARNING: All log messages before absl::InitializeLog() is called are written to STDERR E0000 00:00:1779927592.109382 54914733 re2.cc:237] Error parsing '((\|ool\|\>1\x00\x00\ �\|\|\<\|tool_response\>|\<\|think\|\>|\x0...': invalid UTF-8 I tokenizers:re2_regex.cpp:27] Re2 failed to compile regex: ((\|ool\|\>1\x00\x00\ �\|\|\<\|tool_response\>|\<\|think\|\>|\x00\x00\\\<|\|\|\<\|\"\|\>|all\|\>j\x00\x00\\|\|\<\|turn\>|\|\<\|image\>|\<\|$ I tokenizers:regex_lookahead.cpp:27] Creating PCRE2 regex I 
tokenizers:pcre2_regex.cpp:48] PCRE2 UTF-8 validation failed at offset 27: UTF-8 error: byte 2 top bits not 0x80. Retrying without UTF flags. Loading model... Prompt tokens: 23 Why did the computer go to therapy? Because it had too many **unresolved dependencies** and couldn't stop **dwelling on the past**, but it still couldn't remember why it was there. *** Alternatively, a shorter one: **Why was the RAM so stressed?** Because it had too much on its mind, but it knew that as soon as it slept, it would forget everything. PyTorchObserver {"prefill_token_per_sec":9.11975,"decode_token_per_sec":5.24998,"prompt_tokens":23,"generated_tokens":86,"model_load_start_ms":1779927591719,"model_load_end_ms":1779927603575,"inference_start_ms":1779927603579,"inference_end_ms":1779927622482,"prompt_eval_end_ms":1779927606101,"first_token_ms":1779927606101,"aggregate_sampling_time_ms":0,"SCALING_FACTOR_UNITS_PER_SECOND":1000} ``` There's definitely performance degradation when running GGUF --- .github/workflows/mlx.yml | 4 + backends/mlx/builder/op_helpers.py | 2 +- backends/mlx/patterns.py | 79 ++++++++++++++++--- backends/mlx/test/test_ops.py | 14 ++++ examples/models/gemma4_31b/README.md | 1 + examples/models/gemma4_31b/export.py | 7 +- examples/models/gemma4_31b/gguf_loader.py | 19 +++-- examples/models/gemma4_31b/quant/README.md | 2 - examples/models/gemma4_31b/quant/pack_mlx.py | 6 +- .../gemma4_31b/quant/tests/test_pack_mlx.py | 46 ++++++++++- .../gemma4_31b/tests/test_mlx_pipeline.py | 79 +++++++++++++++++++ 11 files changed, 233 insertions(+), 26 deletions(-) diff --git a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index c4be146f862..027101ba7f0 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -47,6 +47,10 @@ jobs: ${CONDA_RUN} pip list + echo "::group::Install Python test requirements" + ${CONDA_RUN} pip install gguf + echo "::endgroup::" + echo "::group::Build test runners" ${CONDA_RUN} cmake --build cmake-out --target op_test_runner 
multi_thread_test_runner -j$(( $(sysctl -n hw.ncpu) - 1 )) echo "::endgroup::" diff --git a/backends/mlx/builder/op_helpers.py b/backends/mlx/builder/op_helpers.py index 40e71e0bdab..7740546cc2c 100644 --- a/backends/mlx/builder/op_helpers.py +++ b/backends/mlx/builder/op_helpers.py @@ -334,7 +334,7 @@ def parse_dequant_node( if len(non_one) != 1: return None quantized_dim, group_size = non_one[0] - if group_size not in [32, 64, 128]: + if group_size not in [16, 32, 64, 128]: return None # TODO: MLX supports 3, 5, and 7, but we need to figure out the diff --git a/backends/mlx/patterns.py b/backends/mlx/patterns.py index 29e5e326c69..5f74cbea643 100644 --- a/backends/mlx/patterns.py +++ b/backends/mlx/patterns.py @@ -15,6 +15,7 @@ from __future__ import annotations +import os from typing import Any, List, Optional, Tuple import torch @@ -37,6 +38,7 @@ ) from executorch.backends.mlx.serialization.mlx_graph_schema import ( AddIntNode, + AddmmNode, AddNode, AsTypeNode, DequantizeNode, @@ -52,6 +54,7 @@ SubtractIntNode, SymSizeNode, TakeNode, + TransposeNode, ) from torch.export.exported_program import ExportedProgram from torch.fx.node import Node @@ -883,6 +886,18 @@ def maybe_create( out_dtype=out_dtype, ) + # MLX's quantized_matmul Metal kernels are only instantiated for + # group_size in {32, 64, 128}. For smaller group sizes (e.g. GGUF + # Q6_K with group_size=16), emit DequantizeNode + matmul instead. + # Weights stay packed in the .pte file; dequantized on-device. + # This non-fused path is significantly slower and must be opted in + # via ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1. 
+ _MIN_FUSED_GROUP_SIZE = 32 + + @staticmethod + def _allow_non_fused() -> bool: + return os.environ.get("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", "0") == "1" + def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot: assert n == self.head @@ -908,19 +923,59 @@ def __call__(self, P: MLXProgramBuilder, n: Node) -> Slot: x_dtype = x_node.meta["val"].dtype needs_cast = self.out_dtype != x_dtype - P.emit( - QuantizedMatmulNode( - x=P.slot_to_tid(x_slot), - w=P.slot_to_tid(w), - scales=P.slot_to_tid(scale_slot), - out=P.slot_to_tid(out), - biases=P.slot_to_tid(biases), - group_size=self.group_size, - bits=self.bits, - mode="affine", - transpose=True, + if self.group_size >= self._MIN_FUSED_GROUP_SIZE: + P.emit( + QuantizedMatmulNode( + x=P.slot_to_tid(x_slot), + w=P.slot_to_tid(w), + scales=P.slot_to_tid(scale_slot), + out=P.slot_to_tid(out), + biases=P.slot_to_tid(biases), + group_size=self.group_size, + bits=self.bits, + mode="affine", + transpose=True, + ) ) - ) + else: + if not self._allow_non_fused(): + raise ValueError( + f"Quantized linear with group_size={self.group_size} requires " + f"the non-fused dequantize+matmul path, which is significantly " + f"slower than the fused QuantizedMatmulNode (group_size >= 32). " + f"Set ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1 to allow this." + ) + out_scalar_type = torch_dtype_to_scalar_type(self.out_dtype) + _, w_deq = P.make_tmp_slot() + P.emit( + DequantizeNode( + w=P.slot_to_tid(w), + scales=P.slot_to_tid(scale_slot), + out=P.slot_to_tid(w_deq), + biases=P.slot_to_tid(biases), + group_size=self.group_size, + bits=self.bits, + mode="affine", + dtype=out_scalar_type, + ) + ) + _, w_t = P.make_tmp_slot() + P.emit( + TransposeNode( + x=P.slot_to_tid(w_deq), + out=P.slot_to_tid(w_t), + perm=[1, 0], + ) + ) + P.emit( + AddmmNode( + mat1=P.slot_to_tid(x_slot), + mat2=P.slot_to_tid(w_t), + out=P.slot_to_tid(out), + ) + ) + # DequantizeNode already produces the correct dtype. 
+ needs_cast = False if has_bias: P.emit( diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 4471610519e..45ea024f0e8 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -24,6 +24,7 @@ See README.md in this directory for full documentation. """ +import os from typing import Callable, Dict, List, Optional, Tuple import torch @@ -5621,8 +5622,21 @@ def get_test_configs(cls) -> List["QuantizedLinearTest"]: cls(group_size=128), cls(qdtype=torch.int2), cls(qdtype=torch.int8), + # group_size=16: exercises the non-fused dequantize+matmul path + # (requires ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS=1). + cls(qdtype=torch.int8, group_size=16), + cls(qdtype=torch.int4, group_size=16), + cls(qdtype=torch.int8, group_size=16, bias=False), ] + def generate_test_files(self, verbose=False): + if self.group_size < 32: + os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1" + try: + return super().generate_test_files(verbose=verbose) + finally: + os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None) + def create_model(self) -> nn.Module: model = LinearModel(self.in_features, self.out_features, bias=self.bias) model = model.to(self.dtype) diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md index da4aa893079..c6ac10748d8 100644 --- a/examples/models/gemma4_31b/README.md +++ b/examples/models/gemma4_31b/README.md @@ -15,6 +15,7 @@ both export and eager inference: |---|---|---| | `quantize_and_save.py` | bf16 HF checkpoint → quantized checkpoint (one-time) | ~30 GB CPU | | `export.py --prequantized

` | quantized checkpoint → `model.pte` + `model.ptd` | ~24 GB CPU + CUDA for packing | +| `export.py --gguf [--backend mlx]` | GGUF file (Q4_K_M, etc.) → `model.pte` + `model.ptd` | ~24 GB CPU | | `inference.py --prequantized ` | quantized checkpoint → eager generation under `torch.compile` | ~24 GB GPU | | `inference.py --gguf ` | GGUF file (Q4_K_M, etc.) → eager generation | ~24 GB GPU | | `export.py --model-dir ` | one-shot bf16 → quantize → export (no intermediate file) | ~30 GB CPU + CUDA for packing | diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index 046e365947b..bd648f534b5 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -443,7 +443,12 @@ def main() -> None: backend=args.backend, ) - export_and_lower(model, config, args.output_dir, backend=args.backend) + if args.gguf and args.backend == "mlx": + os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1" + try: + export_and_lower(model, config, args.output_dir, backend=args.backend) + finally: + os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None) if __name__ == "__main__": diff --git a/examples/models/gemma4_31b/gguf_loader.py b/examples/models/gemma4_31b/gguf_loader.py index 3e50991e553..35dddb5a0dc 100644 --- a/examples/models/gemma4_31b/gguf_loader.py +++ b/examples/models/gemma4_31b/gguf_loader.py @@ -12,6 +12,7 @@ Usage: model, config = load_gguf_model("model.gguf", backend="cuda") + model, config = load_gguf_model("model.gguf", backend="mlx") """ from typing import Optional @@ -104,10 +105,11 @@ def load_gguf_model( Streams tensors one at a time for low peak memory. GGUF ties ``embed_tokens`` and ``lm_head`` into a single Q4_K tensor. - We untie them: the embedding is dequantized to bf16 (``nn.Embedding`` - needs gather, which ``Int4TilePackedTo4dTensor`` does not support), - while ``lm_head`` keeps the original Q4_K quantization (``nn.Linear`` - matmul via tinygemm). 
+ We untie them so ``lm_head`` keeps the original Q4_K quantization. + On CUDA, the embedding is dequantized to bf16 because ``Int4Tensor`` + does not support the gather op that ``nn.Embedding`` requires. On + MLX, the embedding stays quantized — ``QuantizedEmbeddingHandler`` + handles quantized gather natively. Returns ``(model, config)``. """ @@ -120,8 +122,12 @@ def load_gguf_model( from executorch.examples.models.gemma4_31b.quant import DEFAULT_CUDA_PACKERS packers = DEFAULT_CUDA_PACKERS + elif backend == "mlx": + from executorch.examples.models.gemma4_31b.quant import DEFAULT_MLX_PACKERS + + packers = DEFAULT_MLX_PACKERS else: - raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda'.") + raise ValueError(f"Unsupported backend: {backend!r}. Supported: 'cuda', 'mlx'.") config = Gemma4_31BConfig(max_seq_len=max_seq_len) @@ -143,7 +149,8 @@ def load_gguf_model( if model_key == "embed_tokens.weight" and isinstance(result, Int4Tensor): embed_quant = result - result = dequantize_weight(result, torch.bfloat16) + if backend == "cuda": + result = dequantize_weight(result, torch.bfloat16) pack_one(model, model_key, result, packers) diff --git a/examples/models/gemma4_31b/quant/README.md b/examples/models/gemma4_31b/quant/README.md index 2eacced4387..92ddbf97243 100644 --- a/examples/models/gemma4_31b/quant/README.md +++ b/examples/models/gemma4_31b/quant/README.md @@ -50,5 +50,3 @@ The format is compatible with torchao's `save_pretrained` / `load_pretrained`. - `pack_metal.py` — Metal backend packer. - `gguf.py` — extend with Q5_K, Q8_0 GGUF quant types. -- Upstream `Int4TilePackedTo4dTensor.from_int4_tensor()` to torchao - to replace the manual conversion in `pack_int4_for_cuda`. 
diff --git a/examples/models/gemma4_31b/quant/pack_mlx.py b/examples/models/gemma4_31b/quant/pack_mlx.py index 63aeca426a8..d627c9c437c 100644 --- a/examples/models/gemma4_31b/quant/pack_mlx.py +++ b/examples/models/gemma4_31b/quant/pack_mlx.py @@ -22,7 +22,7 @@ from .pack import ModulePackerFn, pack_model # noqa: F401 -_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32) +_MLX_SUPPORTED_GROUP_SIZES = (128, 64, 32, 16) # --------------------------------------------------------------------------- @@ -126,7 +126,9 @@ def pack_for_mlx(module: nn.Module, weights: dict[str, torch.Tensor]) -> None: default dispatch produces the ``dequantize_affine → linear`` pattern MLX expects. Regroups to a compatible group_size when needed (e.g. per-axis group_size=5376 → group_size=128) since MLX's - ``parse_dequant_node`` only accepts group_size in {32, 64, 128}. + ``parse_dequant_node`` only accepts group_size in {16, 32, 64, 128}. + Group sizes ≥ 32 use the fused ``QuantizedMatmulNode``; group_size=16 + (e.g. GGUF Q6_K) falls back to ``DequantizeNode`` + matmul at export. 
""" from torchao.quantization import IntxUnpackedToInt8Tensor from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor diff --git a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py index ffb2e0e2dd3..2e6310b9c10 100644 --- a/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py +++ b/examples/models/gemma4_31b/quant/tests/test_pack_mlx.py @@ -146,7 +146,7 @@ def test_regroup_preserves_dequant(self): class TestMlxGroupSize(unittest.TestCase): def test_passthrough(self): - for gs in (32, 64, 128): + for gs in (16, 32, 64, 128): self.assertEqual(_mlx_group_size(gs, 256), gs) def test_regroup_5376(self): @@ -157,7 +157,49 @@ def test_regroup_256(self): def test_rejects_indivisible(self): with self.assertRaises(ValueError): - _mlx_group_size(48, 48) + _mlx_group_size(7, 7) + + +class TestPackLinearGroupSize16(unittest.TestCase): + """Packing group_size=16 weights (GGUF Q6_K) preserves semantics.""" + + def _make_gs16_tensor(self, N=64, K=128): + from torchao.quantization import IntxUnpackedToInt8Tensor + + return IntxUnpackedToInt8Tensor( + qdata=torch.randint(-32, 31, (N, K), dtype=torch.int8), + scale=torch.randn(N, K // 16, dtype=torch.bfloat16), + zero_point=torch.zeros(N, K // 16, dtype=torch.int8), + target_dtype=torch.int8, + block_size=(1, 16), + dtype=torch.bfloat16, + activation_quantization=None, + ) + + def test_dequant_preserves_values(self): + """Packing preserves the dequantized weight values.""" + w = self._make_gs16_tensor(64, 128) + before = dequantize_weight(w, torch.float32) + + module = nn.Linear(128, 64, bias=False) + pack_for_mlx(module, {"weight": w}) + after = dequantize_weight(module.weight.data, torch.float32) + + self.assertTrue( + torch.allclose(before, after, atol=1e-5), + f"max diff: {(before - after).abs().max():.6g}", + ) + + def test_forward_produces_valid_output(self): + """Packed gs=16 weight produces finite output in a linear forward.""" + w = 
self._make_gs16_tensor(64, 128) + module = nn.Linear(128, 64, bias=False) + pack_for_mlx(module, {"weight": w}) + + x = torch.randn(1, 128, dtype=torch.bfloat16) + out = torch.nn.functional.linear(x, module.weight.data.dequantize()) + self.assertEqual(out.shape, torch.Size([1, 64])) + self.assertFalse(torch.isnan(out).any()) class TestPackEmbeddingForMlx(unittest.TestCase): diff --git a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py index 0e62ab88e4b..37f61fddb0f 100644 --- a/examples/models/gemma4_31b/tests/test_mlx_pipeline.py +++ b/examples/models/gemma4_31b/tests/test_mlx_pipeline.py @@ -244,5 +244,84 @@ def test_export_to_pte(self): self.assertTrue(os.path.exists(os.path.join(out_dir, "model.pte"))) +class TestGgufMlxPipeline(unittest.TestCase): + """Test GGUF → MLX loading path with synthetic Q6_K-like tensors.""" + + def test_load_gguf_model_mlx_backend(self): + """gguf_loader.load_gguf_model accepts backend='mlx'.""" + try: + import gguf # noqa: F401 + except ModuleNotFoundError: + self.skipTest("gguf package not installed") + + from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model + + # Will fail on missing file, but NOT on "Unsupported backend". 
+ with self.assertRaisesRegex((FileNotFoundError, OSError, RuntimeError), ".*"): + load_gguf_model("/nonexistent.gguf", backend="mlx") + + def test_mlx_backend_rejects_unknown(self): + from executorch.examples.models.gemma4_31b.gguf_loader import load_gguf_model + + with self.assertRaisesRegex(ValueError, "Unsupported backend"): + load_gguf_model("/nonexistent.gguf", backend="tpu") + + def test_gs16_packing_preserves_values(self): + """Q6_K-like weight (gs=16) preserves dequantized values after packing.""" + from executorch.examples.models.gemma4_31b.quant.pack_mlx import pack_for_mlx + from executorch.examples.models.gemma4_31b.quant.quantize import ( + dequantize_weight, + ) + from torchao.quantization import IntxUnpackedToInt8Tensor + + w = IntxUnpackedToInt8Tensor( + qdata=torch.randint(-32, 31, (64, 128), dtype=torch.int8), + scale=torch.randn(64, 8, dtype=torch.bfloat16), + zero_point=torch.zeros(64, 8, dtype=torch.int8), + target_dtype=torch.int8, + block_size=(1, 16), + dtype=torch.bfloat16, + activation_quantization=None, + ) + before = dequantize_weight(w, torch.float32) + + module = nn.Linear(128, 64, bias=False) + pack_for_mlx(module, {"weight": w}) + after = dequantize_weight(module.weight.data, torch.float32) + + self.assertTrue( + torch.allclose(before, after, atol=1e-5), + f"max diff: {(before - after).abs().max():.6g}", + ) + + def test_embedding_packing_preserves_values(self): + """MLX embedding packing preserves dequantized weight values.""" + from executorch.examples.models.gemma4_31b.quant.pack_mlx import pack_for_mlx + from executorch.examples.models.gemma4_31b.quant.quantize import ( + dequantize_weight, + ) + from torchao.quantization import IntxUnpackedToInt8Tensor + + w = IntxUnpackedToInt8Tensor( + qdata=torch.randint(-8, 7, (256, 128), dtype=torch.int8), + scale=torch.randn(256, 4, dtype=torch.bfloat16), + zero_point=torch.zeros(256, 4, dtype=torch.bfloat16), + target_dtype=torch.int4, + block_size=(1, 32), + dtype=torch.bfloat16, + 
activation_quantization=None, + ) + before = dequantize_weight(w, torch.float32) + + module = nn.Embedding(256, 128) + pack_for_mlx(module, {"weight": w}) + after = dequantize_weight(module.weight.data, torch.float32) + + self.assertTrue( + torch.allclose(before, after, atol=1e-5), + f"max diff: {(before - after).abs().max():.6g}", + ) + + if __name__ == "__main__": unittest.main() From 9596866371dbabf763de063a5ab2fa00c5c3fe2e Mon Sep 17 00:00:00 2001 From: Siddartha Pothapragada Date: Thu, 28 May 2026 17:38:40 -0700 Subject: [PATCH 068/103] Add ASR module and LoRA/dataFiles instrumentation tests (#19859) Adds two new Android instrumentation test suites covering previously untested API surfaces, completing feature testing coverage for OKR 3.2. AsrModuleInstrumentationTest (18 tests): constructor validation, lifecycle (close idempotency, use-after-close), transcribe validation, and AsrTranscribeConfig builder/validation. LlmLoraInstrumentationTest (13 tests): dataFiles constructor variants, LlmModuleConfig with dataPath, invalid data file error handling, baseline equivalence, and config builder validation. 
## Test plan - [x] `./gradlew :executorch_android:connectedAndroidTest -Pandroid.testInstrumentationRunnerArguments.class=org.pytorch.executorch.AsrModuleInstrumentationTest` - [x] `./gradlew :executorch_android:connectedAndroidTest -Pandroid.testInstrumentationRunnerArguments.class=org.pytorch.executorch.LlmLoraInstrumentationTest` - [x] Verify all 31 new tests pass on emulator (API 34 x86_64) - [x] Verify existing tests are unaffected --- .../AsrModuleInstrumentationTest.kt | 260 ++++++++++++++++ .../executorch/LlmLoraInstrumentationTest.kt | 291 ++++++++++++++++++ 2 files changed, 551 insertions(+) create mode 100644 extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt create mode 100644 extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt new file mode 100644 index 00000000000..fe8a168e406 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/AsrModuleInstrumentationTest.kt @@ -0,0 +1,260 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +package org.pytorch.executorch + +import androidx.test.ext.junit.runners.AndroidJUnit4 +import java.io.File +import java.io.IOException +import org.apache.commons.io.FileUtils +import org.junit.Assert.assertEquals +import org.junit.Assert.assertFalse +import org.junit.Assert.assertTrue +import org.junit.Assert.fail +import org.junit.Assume.assumeNotNull +import org.junit.Test +import org.junit.runner.RunWith +import org.pytorch.executorch.TestFileUtils.getTestFilePath +import org.pytorch.executorch.extension.asr.AsrCallback +import org.pytorch.executorch.extension.asr.AsrModule +import org.pytorch.executorch.extension.asr.AsrTranscribeConfig + +/** + * Instrumentation tests for [AsrModule], [AsrTranscribeConfig], and [AsrCallback]. + * + * Tests cover: + * - Constructor validation (invalid model/tokenizer/preprocessor paths) + * - AsrTranscribeConfig builder and validation + * - Lifecycle (close idempotency, use-after-close) + * - Transcribe validation (invalid WAV path) + * + * The test fixture is the TinyStories-110M LLM model, NOT an ASR model, so functional transcription + * tests are not possible. Tests that require a valid AsrModule instance handle the case where + * nativeCreate fails (stories.pte lacks encoder/text_decoder methods). 
+ */ +@RunWith(AndroidJUnit4::class) +class AsrModuleInstrumentationTest { + + // ─── Constructor validation ───────────────────────────────────────────────── + + @Test(timeout = 30_000) + fun testInvalidModelPathThrows() { + try { + AsrModule("/nonexistent/model.pte", "/nonexistent/tokenizer") + fail("Should throw for invalid model path") + } catch (_: IllegalArgumentException) { + // Expected: require(modelFile.canRead() && modelFile.isFile) + } + } + + @Test(timeout = 30_000) + fun testInvalidTokenizerPathThrows() { + val modelFile = provisionModelFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + try { + AsrModule(modelFile!!.absolutePath, "/nonexistent/tokenizer") + fail("Should throw for invalid tokenizer path") + } catch (_: IllegalArgumentException) { + // Expected: require(tokenizerFile.exists()) + } + } + + @Test(timeout = 30_000) + fun testInvalidPreprocessorPathThrows() { + val modelFile = provisionModelFile() + val tokenizerFile = provisionTokenizerFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile) + try { + AsrModule( + modelFile!!.absolutePath, + tokenizerFile!!.absolutePath, + preprocessorPath = "/nonexistent/preprocessor.pte", + ) + fail("Should throw for invalid preprocessor path") + } catch (_: IllegalArgumentException) { + // Expected: require(preprocessorFile.canRead() && preprocessorFile.isFile) + } + } + + @Test(timeout = 30_000) + fun testNonAsrModelFailsGracefully() { + val modelFile = provisionModelFile() + val tokenizerFile = provisionTokenizerFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile) + try { + val module = AsrModule(modelFile!!.absolutePath, tokenizerFile!!.absolutePath) + // If construction succeeds (model was accepted), verify basic state + assertTrue("Module should be 
valid after construction", module.isValid) + module.close() + } catch (_: ExecutorchRuntimeException) { + // Expected: nativeCreate returns 0 for non-ASR model + } catch (_: RuntimeException) { + // Also acceptable: native layer rejects the model + } + } + + // ─── Lifecycle ────────────────────────────────────────────────────────────── + + @Test(timeout = 30_000) + fun testCloseIsIdempotent() { + val module = tryCreateAsrModule() ?: return + module.close() + module.close() + module.close() + assertFalse("isValid must be false after close", module.isValid) + } + + @Test(timeout = 30_000) + fun testLoadAfterCloseThrows() { + val module = tryCreateAsrModule() ?: return + module.close() + try { + module.load() + fail("load() after close() must throw IllegalStateException") + } catch (_: IllegalStateException) { + // Expected + } + } + + @Test(timeout = 30_000) + fun testTranscribeAfterCloseThrows() { + val module = tryCreateAsrModule() ?: return + module.close() + try { + module.transcribe("/some/audio.wav") + fail("transcribe() after close() must throw IllegalStateException") + } catch (_: IllegalStateException) { + // Expected + } + } + + @Test(timeout = 30_000) + fun testIsValidAndIsLoadedState() { + val module = tryCreateAsrModule() ?: return + assertTrue("Module should be valid after construction", module.isValid) + module.close() + assertFalse("Module should not be valid after close", module.isValid) + assertFalse("Module should not be loaded after close", module.isLoaded) + } + + // ─── Transcribe validation ────────────────────────────────────────────────── + + @Test(timeout = 30_000) + fun testTranscribeInvalidWavPathThrows() { + val module = tryCreateAsrModule() ?: return + try { + module.transcribe("/nonexistent/audio.wav") + fail("transcribe() with invalid WAV path must throw") + } catch (_: IllegalArgumentException) { + // Expected: require(wavFile.canRead() && wavFile.isFile) + } finally { + module.close() + } + } + + // ─── AsrTranscribeConfig 
──────────────────────────────────────────────────── + + @Test + fun testConfigDefaults() { + val config = AsrTranscribeConfig() + assertEquals(128L, config.maxNewTokens) + assertEquals(0.0f, config.temperature, 0.0f) + assertEquals(0L, config.decoderStartTokenId) + } + + @Test + fun testConfigBuilder() { + val config = + AsrTranscribeConfig.Builder() + .setMaxNewTokens(256) + .setTemperature(0.7f) + .setDecoderStartTokenId(50258) + .build() + assertEquals(256L, config.maxNewTokens) + assertEquals(0.7f, config.temperature, 0.001f) + assertEquals(50258L, config.decoderStartTokenId) + } + + @Test + fun testConfigCustomValues() { + val config = AsrTranscribeConfig(maxNewTokens = 64, temperature = 0.5f, decoderStartTokenId = 1) + assertEquals(64L, config.maxNewTokens) + assertEquals(0.5f, config.temperature, 0.001f) + assertEquals(1L, config.decoderStartTokenId) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigZeroMaxNewTokensThrows() { + AsrTranscribeConfig(maxNewTokens = 0) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigNegativeMaxNewTokensThrows() { + AsrTranscribeConfig(maxNewTokens = -1) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigNegativeTemperatureThrows() { + AsrTranscribeConfig(temperature = -0.1f) + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderZeroMaxNewTokensThrows() { + AsrTranscribeConfig.Builder().setMaxNewTokens(0).build() + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderNegativeTemperatureThrows() { + AsrTranscribeConfig.Builder().setTemperature(-1.0f).build() + } + + @Test + fun testConfigDataClassEquality() { + val a = AsrTranscribeConfig(maxNewTokens = 100, temperature = 0.5f, decoderStartTokenId = 42) + val b = AsrTranscribeConfig(maxNewTokens = 100, temperature = 0.5f, decoderStartTokenId = 42) + assertEquals(a, b) + assertEquals(a.hashCode(), b.hashCode()) + } + + // ─── Helpers 
──────────────────────────────────────────────────────────────── + + @Throws(IOException::class) + private fun provisionModelFile(): File? { + val pteFile = File(getTestFilePath(MODEL_FILE_NAME)) + val stream = javaClass.getResourceAsStream(MODEL_FILE_NAME) ?: return null + stream.use { FileUtils.copyInputStreamToFile(it, pteFile) } + return pteFile + } + + @Throws(IOException::class) + private fun provisionTokenizerFile(): File? { + val tokenizerFile = File(getTestFilePath(TOKENIZER_FILE_NAME)) + val stream = javaClass.getResourceAsStream(TOKENIZER_FILE_NAME) ?: return null + stream.use { FileUtils.copyInputStreamToFile(it, tokenizerFile) } + return tokenizerFile + } + + private fun tryCreateAsrModule(): AsrModule? { + val modelFile = provisionModelFile() + val tokenizerFile = provisionTokenizerFile() + assumeNotNull("Test resource $MODEL_FILE_NAME not available", modelFile) + assumeNotNull("Test resource $TOKENIZER_FILE_NAME not available", tokenizerFile) + return try { + AsrModule(modelFile!!.absolutePath, tokenizerFile!!.absolutePath) + } catch (_: RuntimeException) { + // nativeCreate may reject non-ASR models — skip lifecycle tests in that case + null + } + } + + companion object { + private const val MODEL_FILE_NAME = "/stories.pte" + private const val TOKENIZER_FILE_NAME = "/tokenizer.bin" + } +} diff --git a/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt new file mode 100644 index 00000000000..a8d35b09de2 --- /dev/null +++ b/extension/android/executorch_android/src/androidTest/java/org/pytorch/executorch/LlmLoraInstrumentationTest.kt @@ -0,0 +1,291 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ +package org.pytorch.executorch + +import androidx.test.ext.junit.runners.AndroidJUnit4 +import java.io.File +import java.io.IOException +import org.apache.commons.io.FileUtils +import org.junit.After +import org.junit.Assert.assertTrue +import org.junit.Assert.fail +import org.junit.Before +import org.junit.Test +import org.junit.runner.RunWith +import org.pytorch.executorch.TestFileUtils.getTestFilePath +import org.pytorch.executorch.extension.llm.LlmCallback +import org.pytorch.executorch.extension.llm.LlmModule +import org.pytorch.executorch.extension.llm.LlmModuleConfig + +/** + * Instrumentation tests for LlmModule's LoRA / dataFiles constructor paths. + * + * LoRA adapters are loaded at construction time via the `dataFiles` parameter or + * `LlmModuleConfig.dataPath`. These tests verify that: + * 1. The dataFiles constructor variants produce a functional module + * 2. LlmModuleConfig with dataPath integrates correctly + * 3. Invalid data file paths are handled gracefully + * 4. Empty vs null dataFiles behave identically to no-data constructors + * + * Uses TinyStories-110M; no LoRA adapter fixture is available so functional LoRA tests + * (output-changes-with-adapter) are not possible. + */ +@RunWith(AndroidJUnit4::class) +class LlmLoraInstrumentationTest { + + private var llmModule: LlmModule? = null + + @Before + @Throws(IOException::class) + fun setUp() { + val pteFile = File(getTestFilePath(MODEL_FILE_NAME)) + requireNotNull(javaClass.getResourceAsStream(MODEL_FILE_NAME)) { + "Test resource $MODEL_FILE_NAME not found; did android_test_setup.sh run?" + } + .use { FileUtils.copyInputStreamToFile(it, pteFile) } + + val tokenizerFile = File(getTestFilePath(TOKENIZER_FILE_NAME)) + requireNotNull(javaClass.getResourceAsStream(TOKENIZER_FILE_NAME)) { + "Test resource $TOKENIZER_FILE_NAME not found; did android_test_setup.sh run?" 
+ } + .use { FileUtils.copyInputStreamToFile(it, tokenizerFile) } + } + + @After + fun tearDown() { + llmModule?.close() + llmModule = null + } + + // ─── dataFiles constructor variants ───────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testConstructorWithEmptyDataFilesList() { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + emptyList(), + ) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module with empty dataFiles should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testConstructorWithNullDataPath() { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + null as String?, + ) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module with null dataPath should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testConstructorWithDataFilesAndBosEos() { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + emptyList(), + 0, + 0, + ) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module with dataFiles+BOS/EOS should generate tokens", tokens.isNotEmpty()) + } + + // ─── LlmModuleConfig with dataPath ────────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testLlmModuleConfigNoDataPath() { + val config = + LlmModuleConfig.create() + .modulePath(getTestFilePath(MODEL_FILE_NAME)) + .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME)) + .temperature(0.0f) + .build() + llmModule = LlmModule(config) + val tokens = generateAndCollect(llmModule!!) 
+ assertTrue("Module via config with no dataPath should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testLlmModuleConfigWithNullDataPath() { + val config = + LlmModuleConfig.create() + .modulePath(getTestFilePath(MODEL_FILE_NAME)) + .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME)) + .temperature(0.0f) + .dataPath(null) + .build() + llmModule = LlmModule(config) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module via config with null dataPath should generate tokens", tokens.isNotEmpty()) + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testLlmModuleConfigWithLoadMode() { + val config = + LlmModuleConfig.create() + .modulePath(getTestFilePath(MODEL_FILE_NAME)) + .tokenizerPath(getTestFilePath(TOKENIZER_FILE_NAME)) + .temperature(0.0f) + .loadMode(LlmModuleConfig.LOAD_MODE_FILE) + .build() + llmModule = LlmModule(config) + val tokens = generateAndCollect(llmModule!!) + assertTrue("Module via config with LOAD_MODE_FILE should generate tokens", tokens.isNotEmpty()) + } + + // ─── Invalid data file paths ──────────────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testInvalidDataFilePathThrowsOnConstruction() { + try { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + listOf("/nonexistent/lora_weights.bin"), + ) + // dataFiles are passed to native initHybrid — invalid paths should cause + // construction to fail. If we reach here, the native layer didn't validate. 
+ llmModule!!.close() + fail("Construction should have thrown for invalid data file path") + } catch (e: RuntimeException) { + assertTrue( + "Exception message should be non-empty", + e.message != null && e.message!!.isNotEmpty(), + ) + } + } + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testMultipleInvalidDataFilePathsThrowOnConstruction() { + try { + llmModule = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + listOf("/nonexistent/a.bin", "/nonexistent/b.bin"), + ) + llmModule!!.close() + fail("Construction should have thrown for invalid data file paths") + } catch (e: RuntimeException) { + assertTrue( + "Exception message should be non-empty", + e.message != null && e.message!!.isNotEmpty(), + ) + } + } + + // ─── Baseline equivalence ─────────────────────────────────────────────────── + + @Test(timeout = MAX_TEST_TIMEOUT_MS) + fun testEmptyDataFilesMatchesNoDataConstructor() { + val moduleNoData = + LlmModule(getTestFilePath(MODEL_FILE_NAME), getTestFilePath(TOKENIZER_FILE_NAME), 0.0f) + val moduleEmptyList = + LlmModule( + LlmModule.MODEL_TYPE_TEXT, + getTestFilePath(MODEL_FILE_NAME), + getTestFilePath(TOKENIZER_FILE_NAME), + 0.0f, + emptyList(), + ) + + try { + val tokensNoData = generateAndCollect(moduleNoData) + val tokensEmptyList = generateAndCollect(moduleEmptyList) + + assertTrue("Both constructors should produce tokens", tokensNoData.isNotEmpty()) + assertTrue("Both constructors should produce tokens", tokensEmptyList.isNotEmpty()) + } finally { + moduleNoData.close() + moduleEmptyList.close() + } + } + + // ─── LlmModuleConfig builder validation ───────────────────────────────────── + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderMissingModulePathThrows() { + LlmModuleConfig.create().tokenizerPath("/some/tokenizer.bin").build() + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderMissingTokenizerPathThrows() { + 
LlmModuleConfig.create().modulePath("/some/model.pte").build() + } + + @Test(expected = IllegalArgumentException::class) + fun testConfigBuilderInvalidLoadModeThrows() { + LlmModuleConfig.create() + .modulePath("/some/model.pte") + .tokenizerPath("/some/tokenizer.bin") + .loadMode(99) + .build() + } + + @Test + fun testConfigBuilderAllLoadModes() { + val modes = + listOf( + LlmModuleConfig.LOAD_MODE_FILE, + LlmModuleConfig.LOAD_MODE_MMAP, + LlmModuleConfig.LOAD_MODE_MMAP_USE_MLOCK, + LlmModuleConfig.LOAD_MODE_MMAP_USE_MLOCK_IGNORE_ERRORS, + ) + for (mode in modes) { + val config = + LlmModuleConfig.create() + .modulePath("/some/model.pte") + .tokenizerPath("/some/tokenizer.bin") + .loadMode(mode) + .build() + assertTrue("Config should accept load mode $mode", config.loadMode == mode) + } + } + + // ─── Helpers ──────────────────────────────────────────────────────────────── + + private fun generateAndCollect(module: LlmModule): List<String> { + val collector = mutableListOf<String>() + module.generate( + TEST_PROMPT, + SEQ_LEN, + object : LlmCallback { + override fun onResult(result: String) { + collector.add(result) + } + }, + ) + return collector + } + + companion object { + private const val MODEL_FILE_NAME = "/stories.pte" + private const val TOKENIZER_FILE_NAME = "/tokenizer.bin" + private const val TEST_PROMPT = "Once" + private const val SEQ_LEN = 16 + private const val MAX_TEST_TIMEOUT_MS = 120_000L + } +} From 4de16d0ad24339f52f784c8e35297e702fb7675e Mon Sep 17 00:00:00 2001 From: Ethan Ng Date: Thu, 28 May 2026 19:43:41 -0700 Subject: [PATCH 069/103] Add shared fusion infrastructure and QuantFusionPass (#19724) Differential Revision: D105728137 Pull Request resolved: https://github.com/pytorch/executorch/pull/19724 --- backends/cadence/aot/compiler_funcs.py | 30 +++ backends/cadence/aot/pass_utils.py | 17 ++ backends/cadence/aot/quantizer/BUCK | 15 ++ .../cadence/aot/quantizer/pattern_utils.py | 207 ++++++++++++++++++ 
backends/cadence/aot/quantizer/patterns.py | 18 +- 
backends/cadence/aot/quantizer/utils.py | 4 +- 6 files changed, 289 insertions(+), 2 deletions(-) create mode 100644 backends/cadence/aot/quantizer/pattern_utils.py diff --git a/backends/cadence/aot/compiler_funcs.py b/backends/cadence/aot/compiler_funcs.py index 02dcde7fd39..cec3cb7d016 100644 --- a/backends/cadence/aot/compiler_funcs.py +++ b/backends/cadence/aot/compiler_funcs.py @@ -14,6 +14,7 @@ import torch from torch._inductor.decomposition import remove_decompositions from torch.fx import GraphModule +from torch.fx.passes.infra.pass_base import PassBase, PassResult from torchao.quantization.pt2e.quantize_pt2e import prepare_pt2e, prepare_qat_pt2e from torchao.quantization.pt2e.quantizer import Quantizer @@ -607,3 +608,32 @@ def sink_input_dequant_through_transparent_ops( graph_module.recompile() return modified + + +class QuantFusionPass(PassBase): + """ + Iterates patterns, finds anchor ops in the converted graph, and calls + pattern.fuse() to replace dq-op-q subgraphs with fused ops. 
+ """ + + def __init__(self, patterns: Sequence[object]) -> None: + super().__init__() + self.patterns = patterns + + def call(self, graph_module: GraphModule) -> Optional[PassResult]: + changed = False + for pattern in self.patterns: + pattern_changed = False + for target in pattern.anchor_ops(): # pyre-ignore[16] + for node in graph_module.graph.find_nodes( + op="call_function", target=target + ): + result = pattern.fuse(graph_module, node) # pyre-ignore[16] + if result is not None: + changed = True + pattern_changed = True + if pattern_changed: + graph_module.graph.eliminate_dead_code() + if changed: + graph_module.recompile() + return PassResult(graph_module, changed) diff --git a/backends/cadence/aot/pass_utils.py b/backends/cadence/aot/pass_utils.py index ab42ef43d56..091605e94ec 100644 --- a/backends/cadence/aot/pass_utils.py +++ b/backends/cadence/aot/pass_utils.py @@ -212,3 +212,20 @@ def nodes_not_adjacent_in_gm( def none_throws(x: Optional[PassResult]) -> PassResult: assert x is not None return x + + +def replace_with_op( + gm: torch.fx.GraphModule, + insert_after: torch.fx.Node, + replacement_op: torch._ops.OpOverload, + args: tuple, # pyre-ignore[2] + kwargs: dict, # pyre-ignore[2] + node_to_replace: torch.fx.Node, +) -> torch.fx.Node: + """Insert ``replacement_op`` after ``insert_after`` and replace all uses of + ``node_to_replace`` with the new node.""" + with gm.graph.inserting_after(insert_after): + new_node = gm.graph.call_function(replacement_op, args, kwargs) + new_node.meta = node_to_replace.meta + node_to_replace.replace_all_uses_with(new_node) + return new_node diff --git a/backends/cadence/aot/quantizer/BUCK b/backends/cadence/aot/quantizer/BUCK index 34fec2556f8..c2ec3e3a1f6 100644 --- a/backends/cadence/aot/quantizer/BUCK +++ b/backends/cadence/aot/quantizer/BUCK @@ -14,6 +14,21 @@ fbcode_target(_kind = runtime.python_library, ], ) +fbcode_target(_kind = runtime.python_library, + name = "pattern_utils", + srcs = [ + "pattern_utils.py", + 
], + typing = True, + deps = [ + ":utils", + "//caffe2:torch", + "//executorch/backends/cadence/aot:compiler_utils", + "//executorch/backends/cadence/aot:pass_utils", + "//executorch/backends/cadence/aot:utils", + ], +) + fbcode_target(_kind = runtime.python_library, name = "patterns", srcs = [ diff --git a/backends/cadence/aot/quantizer/pattern_utils.py b/backends/cadence/aot/quantizer/pattern_utils.py new file mode 100644 index 00000000000..25ff363ecc9 --- /dev/null +++ b/backends/cadence/aot/quantizer/pattern_utils.py @@ -0,0 +1,207 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-strict + +import operator +from typing import Any + +import torch +from executorch.backends.cadence.aot.pass_utils import get_arg, replace_with_op +from executorch.backends.cadence.aot.quantizer.utils import ( + copy_node_metadata, + create_zero_bias_int32, + quantize_tensor_multiplier, +) +from executorch.backends.cadence.aot.utils import is_depthwise_conv +from torch import fx +from torch._ops import OpOverload + +DQ_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.dequantize_per_tensor.default +Q_PER_TENSOR: OpOverload = torch.ops.quantized_decomposed.quantize_per_tensor.default + + +def insert_node_with_meta( + gm: fx.GraphModule, + op: OpOverload, + args: tuple[Any, ...], + kwargs: dict[str, Any] | None, + insert_before: fx.Node, + like_node: fx.Node, +) -> fx.Node: + """Create a new node and populate its FakeTensor metadata. + + Inserts ``op(*args, **kwargs)`` before ``insert_before``, runs the op + under ``like_node``'s fake_mode to compute ``meta["val"]``, and copies + remaining metadata from ``like_node``. 
+ """ + with gm.graph.inserting_before(insert_before): + node = gm.graph.call_function(op, args, kwargs or {}) + assert "val" in like_node.meta + fake_mode = like_node.meta["val"].fake_mode + assert fake_mode is not None + + def _resolve(x: Any) -> Any: + return x.meta["val"] if isinstance(x, fx.Node) else x + + fake_args = tuple(_resolve(a) for a in args) + fake_kwargs = {k: _resolve(v) for k, v in (kwargs or {}).items()} + with fake_mode: + node.meta["val"] = op(*fake_args, **fake_kwargs) + copy_node_metadata(node, like_node) + return node + + +def find_quant_user(node: fx.Node) -> fx.Node | None: + """Find the first quantize_per_tensor user of ``node``, traversing through getitem.""" + users = list(node.users) + if not users: + return None + user = users[0] + if user.target is operator.getitem: + if user.args[1] == 0: + users = list(user.users) + if not users: + return None + user = users[0] + else: + return None + if user.target == Q_PER_TENSOR: + return user + return None + + +def fuse_conv( + pattern: object, + gm: fx.GraphModule, + conv_node: fx.Node, + dq_input: fx.Node, + dq_weight: fx.Node, + quant_node: fx.Node, +) -> fx.Node: + """Fuse a dq->conv->q chain into a single quantized conv op.""" + dq_bias = None + if len(conv_node.args) > 2 and conv_node.args[2] is not None: + bias_arg = conv_node.args[2] + assert isinstance(bias_arg, fx.Node) + dq_bias = bias_arg if bias_arg.target == DQ_PER_TENSOR else None + weight_scale = get_arg(dq_weight, "scale", float) + input_scale = get_arg(dq_input, "scale", float) + bias_scale = input_scale * weight_scale + if dq_bias is not None: + bias_q = get_arg(dq_bias, "input", fx.Node) + else: + # Cadence quantized conv ops require a non-optional bias argument. 
+ weight_node = get_arg(dq_weight, "input", fx.Node) + with gm.graph.inserting_before(conv_node): + bias_q = create_zero_bias_int32(gm, weight_node, bias_scale) + requantize_scale = bias_scale / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + args = ( + get_arg(dq_input, "input", fx.Node), + get_arg(dq_weight, "input", fx.Node), + bias_q, + ) + groups = get_arg(conv_node, "groups", int) + kwargs = { + "stride": get_arg(conv_node, "stride", list[int]), + "padding": get_arg(conv_node, "padding", list[int]), + "dilation": get_arg(conv_node, "dilation", list[int]), + "groups": groups, + "input_zero_point": get_arg(dq_input, "zero_point", int), + "weight_zero_point": get_arg(dq_weight, "zero_point", int), + "bias_scale": bias_scale, + "out_scale": get_arg(quant_node, "scale", float), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + } + replacement_op = pattern.replacement_op() # pyre-ignore[16] + if replacement_op == torch.ops.cadence.quantized_conv1d_ncl.per_tensor: + input_node = get_arg(dq_input, "input", fx.Node) + assert len(input_node.meta["val"].shape) >= 2 + in_channels = input_node.meta["val"].shape[1] + if is_depthwise_conv(groups, in_channels): + replacement_op = torch.ops.cadence.quantized_depthwise_conv1d_ncl.per_tensor + return replace_with_op(gm, conv_node, replacement_op, args, kwargs, quant_node) + + +def fuse_linear( + gm: fx.GraphModule, + dq_input: fx.Node, + dq_weight: fx.Node, + dq_bias: fx.Node | None, + quant_node: fx.Node, + op_node: fx.Node, + replacement_op: OpOverload, + weight_q: fx.Node | None = None, +) -> fx.Node: + """Fuse a dq->linear->q chain into a single quantized linear op.""" + assert op_node.target in ( + torch.ops.aten.linear.default, + torch.ops.aten.addmm.default, + ), f"Expected linear/addmm, got 
{op_node.target}" + weight_scale = get_arg(dq_weight, "scale", float) + input_scale = get_arg(dq_input, "scale", float) + bias_scale = input_scale * weight_scale + requantize_scale = bias_scale / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + if dq_bias is not None: + bias_q = get_arg(dq_bias, "input", fx.Node) + else: + # Cadence quantized linear ops require a non-optional bias argument. + weight_node = get_arg(dq_weight, "input", fx.Node) + with gm.graph.inserting_before(op_node): + bias_q = create_zero_bias_int32(gm, weight_node, bias_scale) + final_weight = ( + weight_q if weight_q is not None else get_arg(dq_weight, "input", fx.Node) + ) + args = (get_arg(dq_input, "input", fx.Node), final_weight, bias_q) + kwargs = { + "src_zero_point": get_arg(dq_input, "zero_point", int), + "weight_zero_point": get_arg(dq_weight, "zero_point", int), + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "offset": None, + } + return replace_with_op(gm, op_node, replacement_op, args, kwargs, quant_node) + + +def fuse_matmul( + gm: fx.GraphModule, + anchor_node: fx.Node, + dq0: fx.Node, + dq1: fx.Node, + quant_node: fx.Node, + replacement_op: OpOverload, +) -> fx.Node: + """Fuse a dq->matmul->q chain into a single quantized matmul op.""" + assert anchor_node.target in ( + torch.ops.aten.bmm.default, + torch.ops.aten.matmul.default, + ), f"Expected bmm/matmul, got {anchor_node.target}" + scale0 = get_arg(dq0, "scale", float) + scale1 = get_arg(dq1, "scale", float) + requantize_scale = (scale0 * scale1) / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + args = ( + get_arg(dq0, "input", fx.Node), + get_arg(dq0, "zero_point", int), + get_arg(dq1, 
"input", fx.Node), + get_arg(dq1, "zero_point", int), + None, + ) + kwargs = { + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "transposed": False, + } + return replace_with_op(gm, anchor_node, replacement_op, args, kwargs, quant_node) diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 54c01227d07..e1f44b8ce5c 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -9,7 +9,7 @@ import operator from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Tuple, Union +from typing import List, Optional, Tuple, Union import torch from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams @@ -79,6 +79,22 @@ def replacement_op(self) -> OpOverload: """ pass + def anchor_ops(self) -> tuple[OpOverload, ...]: + return tuple(self.partition_types()) + + def fuse( + self, + gm: fx.GraphModule, + anchor_node: fx.Node, + ) -> Optional[fx.Node]: + """Replace the dq→op→q subgraph around ``anchor_node`` with a fused op. + + Called by ``QuantFusionPass`` for each node matching ``anchor_ops()``. + Returns the new fused node on success, or ``None`` to skip this match. + Subclasses override to implement pattern-specific fusion logic. + """ + return None + class AddmmPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: diff --git a/backends/cadence/aot/quantizer/utils.py b/backends/cadence/aot/quantizer/utils.py index 51182a4ce92..f5773938f0a 100644 --- a/backends/cadence/aot/quantizer/utils.py +++ b/backends/cadence/aot/quantizer/utils.py @@ -118,7 +118,9 @@ def create_zero_bias_int32( bias_scale: float, ) -> fx.Node: """ - Creates a zero bias tensor with the shape of weight[0] + Creates a zero bias tensor with the shape of weight[0]. + Caller is responsible for setting the graph insertion point + (e.g. 
``with gm.graph.inserting_before(node):``). """ try: attr_node = getattr(graph_module, weight_node.target) From 007570a970b0d3d1188b887fae2fd276970499f5 Mon Sep 17 00:00:00 2001 From: Martin Pavella Date: Fri, 29 May 2026 08:58:13 +0200 Subject: [PATCH 070/103] NXP backend: Enable `aten.upsample_bilinear2d` with new Neutron flow. (#19793) ### Summary Enable `aten.upsample_bilinear2d` with new Neutron flow. ### Test plan Unit tests provided. cc @robert-kalmar @JakeStevens @digantdesai @rascani --- .../upsample_bilinear2d_converter.py | 102 +++++-- .../test_convert_upsample_bilinear2d.py | 283 +++++++++++++++++- 2 files changed, 353 insertions(+), 32 deletions(-) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py index 33d97dff642..1183ef494b5 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_bilinear2d_converter.py @@ -4,11 +4,13 @@ # LICENSE file in the root directory of this source tree. 
import numpy as np +import torch from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT from executorch.backends.nxp.backend.edge_helper import node_has_well_defined_shape from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.resize_bilinear_options import ( @@ -16,12 +18,35 @@ ) from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter # noinspection SpellCheckingInspection class UpsampleBilinear2DConverter(NodeConverter): + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + ) -> bool: + input_shape = node.all_input_nodes[0].meta["val"].shape + output_shape = node.meta["val"].shape + is_alone_in_partition = cls.is_node_alone_in_partition( + node, partition_list, filter_fn=is_not_qdq_node + ) + + if is_alone_in_partition and input_shape == output_shape: + # The operator is a no-op, so the Neutron Converter will skip it. If it's the only node in the + # partition, the graph would end up empty. + return False + + return True + @staticmethod def _is_supported_in_IR( node: Node, @@ -36,6 +61,14 @@ def _is_supported_in_IR( " format. Please report this." ) + # The conversion requires the output shape to be known and static. + if not node_has_well_defined_shape(node): + return False + + if len(node.meta["val"].shape) != 4: + # Unexpected case. The input should always be 4D. 
+ return False + return True @staticmethod @@ -45,38 +78,58 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - # Neutron requires static shapes. - # neutron-converter/src/OperatorC/UpsamplePlugin.cpp?at=NEUTRON_SOFTWARE_2.2.3#74 - if not node_has_well_defined_shape(node): - return False - - if len(node.meta["val"].shape) != 4: - # Unexpected case. The input should always be 4D. - return False - - # The tensors here use the channels first format (NCHW). + # The tensors are always 4D and use the channels first format (NCHW). _, in_c, in_h, in_w = node.all_input_nodes[0].meta["val"].shape _, _, out_h, out_w = node.meta["val"].shape - # Neutron supports only the doubling and quadrupleing of both height and width at the same time. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 - supported_scales = [2, 4] - if not any( - in_h * scale == out_h and in_w * scale == out_w - for scale in supported_scales - ): - return False - - # Neutron requires the input channels to be a multiple of `num_macs`. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#777 - if in_c % neutron_target_spec.get_num_macs() != 0: - return False + if custom_delegation_options.use_new_flow_neutron_c: + # Requirements specified by the new Neutron flow documentation. + + if not NodeConverter.uses_quantization_type_for_io( + node, + supported_types=[torch.int8, torch.uint8], + input_indices=[0], + output_indices=[0], + ): + return False + + supported_scales = [1, 2, 4, 8] + align_corners = node.args[2] + if align_corners: + if in_h == 1 or in_w == 1: + return False # Avoid division by 0. + h_scale = (out_h - 1) / (in_h - 1) + w_scale = (out_w - 1) / (in_w - 1) + else: + h_scale = out_h / in_h + w_scale = out_w / in_w + + # The H and W scales don't need to be equal, but both must be supported. 
+ if (h_scale not in supported_scales) or (w_scale not in supported_scales): + return False + + else: + # Requirements of the old Neutron flow. + + # Neutron supports only the doubling and quadrupleing of both height and width at the same time. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 + supported_scales = [2, 4] + if not any( + in_h * scale == out_h and in_w * scale == out_w + for scale in supported_scales + ): + return False + + # Neutron requires the input channels to be a multiple of `num_macs`. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#777 + if in_c % neutron_target_spec.get_num_macs() != 0: + return False return True def convert(self, node: Node): """Convert the `aten.upsample_bilinear2d.vec` operator to Neutron IR `ResizeBilinear`. - The schema is: + The ExecuTorch schema is: aten::upsample_bilinear2d.vec( Tensor input, SymInt[]? output_size, @@ -109,6 +162,7 @@ def convert(self, node: Node): # and the second one is what NeutronIR uses when `align_corners == False and half_pixel_centers == True`. # https://github.com/tensorflow/tensorflow/blob/v2.20.0/tensorflow/lite/kernels/internal/reference/resize_bilinear.h#L82-L88 # https://github.com/tensorflow/tensorflow/blob/v2.20.0/tensorflow/lite/kernels/internal/reference/resize_bilinear.h#L172-L180 + # Also, the new Neutron flow requires that `align_corners` and `half_pixel_centers` are not True simultainiously. 
align_corners = node.args[2] half_pixel_centers = not align_corners t_op.builtin_options = ResizeBilinear(align_corners, half_pixel_centers) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py index 5663eea9cc3..2d2f9845fa3 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_bilinear2d.py @@ -4,12 +4,15 @@ # LICENSE file in the root directory of this source tree. import numpy as np + +# noinspection PyUnusedImports import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -17,7 +20,17 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.model_output_comparator import ( + AllCloseOutputComparator, +) +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + AddTensor, + ExecutorchDelegateCall, + UpsampleBilinear2D, +) +from executorch.backends.nxp.tests.use_qat import * # noqa F403 @pytest.fixture(autouse=True) @@ -26,23 +39,25 @@ def reseed_model_per_test_run(): np.random.seed(23) -# noinspection PyProtectedMember -ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate -UpsampleBilinear2D = exir_ops.edge.aten.upsample_bilinear2d.vec - - class UpsampleBilinearModule(torch.nn.Module): - def __init__(self, size=None, scale=None): + def __init__(self, 
size=None, scale=None, **kwargs): super().__init__() self.upsample = torch.nn.Upsample( - size=size, scale_factor=scale, mode="bilinear" + size=size, scale_factor=scale, mode="bilinear", **kwargs ) def forward(self, x): return self.upsample(x) +class UpsampleBilinearAddModule(UpsampleBilinearModule): + + def forward(self, x): + x = super().forward(x) + return x + x + + @pytest.mark.parametrize( "input_shape, size", [ @@ -185,3 +200,255 @@ def test_convert_upsample_bilinear2d__no_delegation__unsupported_size( # Make sure the `upsample` was NOT delegated (size != double of input). assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D]) + + +class TestUpsampleBilinear2DNewNeutronFlow: + # TODO Use quantized dataset and `atol=1` in the tests. + + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, + model, + input_shape, + mocker, + use_qat=False, + atol=None, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {UpsampleBilinear2D: 1} + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + kwargs = {"atol": atol} if atol is not None else {} + output_comparator = AllCloseOutputComparator(**kwargs) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + output_comparator, + use_qat=use_qat, + use_new_flow_neutron_c=True, # Use the new flow. 
+ ) + + # noinspection PyMethodMayBeStatic + def assert_not_delegated(self, model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleBilinear2D]) + + def test__qat__align_corners(self, mocker, use_qat): + align_corners = True + input_shape = (1, 2, 3, 4) + output_size = (5, 7) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.015 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol) + + def test__qat__not_align_corners(self, mocker, use_qat): + align_corners = False + input_shape = (1, 2, 3, 4) + output_size = (6, 8) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.015 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat, atol=atol) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param((1, 2, 3, 4), (6, 8), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (3, 3, 3, 5), + (6, 5), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (3, 16), id="batch=2, scale_h=1, scale_w=4"), + pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"), + ], + ) + def test__not_align_corners__output_size(self, mocker, input_shape, output_size): + align_corners = False + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. 
+ self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__not_align_corners__output_size__unsupported(self): + align_corners = False + input_shape = (1, 2, 3, 4) + output_size = (9, 12) # scale = (3, 3) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + pytest.param((1, 2, 3, 4), (2, 2), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (3, 3, 3, 5), + (2, 1), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (4, 1), id="batch=2, scale_h=4, scale_w=1"), + pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"), + ], + ) + def test__not_align_corners__scales(self, mocker, input_shape, scale): + align_corners = False + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__not_align_corners__scales__unsupported(self): + align_corners = False + input_shape = (1, 2, 3, 4) + scale = (3, 3) + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param((1, 2, 4, 5), (7, 9), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (1, 3, 3, 5), + (5, 5), + id="batch=1, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 4, 5), (4, 17), id="batch=2, scale_h=1, scale_w=4"), + pytest.param((1, 2, 4, 5), (25, 9), id="batch=1, scale_h=8, scale_w=2"), + ], + ) + def test__align_corners__output_size(self, mocker, input_shape, output_size): + align_corners = True + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. 
+ self.assert_delegated(model, input_shape, mocker, atol=atol) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param( + (2, 2, 4, 5), (25, 9), id="batch=2, scale_h=8, scale_w=2" + ), # Error ~= 0.47 + pytest.param( + (3, 3, 3, 5), + (5, 5), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), # Error ~= 3.7 + ], + ) + def test__align_corners__output_size__incorrect_output( + self, mocker, input_shape, output_size + ): + align_corners = True + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + atol = 0.45 # Huge tolerance (still not enough to pass). + with pytest.raises(AssertionError): + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__align_corners__output_size__unsupported(self): + align_corners = True + input_shape = (1, 2, 3, 4) + output_size = (6, 8) # Neutron scale = (5/2, 7/3) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + def test__align_corners__output_size__input_size_equal_to_one(self): + align_corners = True + input_shape = (1, 2, 1, 1) # Neutron scale computation would divide by zero. + output_size = (2, 2) + model = UpsampleBilinearModule(size=output_size, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + # The PyTorch scales are "weird" because the "Neutron scales" are computed differently. + # The fractions correspond to "nice" Neutron scales (1, 2, 4, or 8). 
+ pytest.param( + (1, 2, 4, 5), + (7 / 4, 9 / 5), + id="batch=1, scale_h=7/4, scale_w=9/5 (Neutron scales = (2, 2)", + ), + pytest.param( + (1, 3, 3, 5), + (5 / 3, 1), + id="batch=1, scale_h=5/3, scale_w=1 (Neutron scales = (2, 1))", + ), + pytest.param( + (2, 2, 4, 5), + (1, 17 / 5), + id="batch=2, scale_h=1, scale_w=17/5 (Neutron scales = (1, 4))", + ), + pytest.param( + (1, 2, 4, 5), + (25 / 4, 9 / 5), + id="batch=1, scale_h=25/4, scale_w=9/5 (Neutron scales = (8, 2))", + ), + ], + ) + def test__align_corners__scales(self, mocker, input_shape, scale): + align_corners = True + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + atol = 0.016 # ~= output scale -> single bit error. + self.assert_delegated(model, input_shape, mocker, atol=atol) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + pytest.param( + (2, 2, 4, 5), + (25 / 4, 9 / 5), + id="batch=3, scale_h=25/4, scale_w=9/5 (Neutron scales = (8, 2))", + ), # Error ~= 0.47 + pytest.param( + (3, 3, 3, 5), + (5 / 3, 1), + id="batch=3, scale_h=5/3, scale_w=1 (Neutron scales = (2, 1))", + ), # Error ~= 3.7 + ], + ) + def test__align_corners__scales__incorrect_output(self, mocker, input_shape, scale): + align_corners = True + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + atol = 0.45 # Huge tolerance (still not enough to pass). 
+ with pytest.raises(AssertionError): + self.assert_delegated(model, input_shape, mocker, atol=atol) + + def test__align_corners__scales__unsupported(self): + align_corners = True + input_shape = (1, 2, 3, 4) + scale = (2, 2) # Neutron scale = (5/2, 7/3) + model = UpsampleBilinearModule(scale=scale, align_corners=align_corners) + self.assert_not_delegated(model, input_shape) + + def test__noop__alone_in_partition__not_delegated(self): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleBilinearModule(scale=scale) + self.assert_not_delegated(model, input_shape) + + def test__noop__not_alone_in_partition__delegated(self, mocker): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleBilinearAddModule(scale=scale) + self.assert_delegated( + model, + input_shape, + mocker, + expected_delegated_ops={UpsampleBilinear2D: 1, AddTensor: 1}, + ) From c72bc872a652c2197e954287bb62f0ebd0a69d75 Mon Sep 17 00:00:00 2001 From: Martin Pavella Date: Fri, 29 May 2026 09:00:32 +0200 Subject: [PATCH 071/103] NXP backend: Enable `aten.upsample_nearest2d` with new Neutron flow. (#19796) ### Summary NXP backend: Enable `aten.upsample_nearest2d` with new Neutron flow. ### Test plan Unit tests provided. cc @robert-kalmar @JakeStevens @digantdesai @rascani --- .../upsample_nearest2d_converter.py | 110 ++++++++++---- .../test_convert_upsample_nearest2d.py | 141 +++++++++++++++++- 2 files changed, 220 insertions(+), 31 deletions(-) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py index 1ddc71425ef..6e18a7bfe67 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/upsample_nearest2d_converter.py @@ -4,11 +4,13 @@ # LICENSE file in the root directory of this source tree. 
import numpy as np +import torch from executorch.backends.nxp.backend.data_format import DataFormat, NXP_NODE_FORMAT from executorch.backends.nxp.backend.edge_helper import node_has_well_defined_shape from executorch.backends.nxp.backend.ir.converter.node_converter import ( CustomDelegationOptions, + is_not_qdq_node, NodeConverter, ) from executorch.backends.nxp.backend.ir.tflite_generator.builtin_options.resize_nearest_neighbor_options import ( @@ -16,12 +18,37 @@ ) from executorch.backends.nxp.backend.neutron_target_spec import NeutronTargetSpec from torch.fx import Node +from torch.fx.passes.infra.partitioner import Partition from torch.nn import Parameter +HeightScale = float +WidthScale = float + # noinspection SpellCheckingInspection class UpsampleNearest2DConverter(NodeConverter): + @classmethod + def supports_partitioning_result( + cls, + node: Node, + partition_list: list[Partition], + custom_delegation_options: CustomDelegationOptions, + neutron_target_spec: NeutronTargetSpec, + parameters_mapping: dict[str, Parameter], + ) -> bool: + h_scale, w_scale = cls._get_effective_scales(node) + is_alone_in_partition = cls.is_node_alone_in_partition( + node, partition_list, filter_fn=is_not_qdq_node + ) + + if is_alone_in_partition and h_scale == w_scale == 1: + # The operator is a no-op, so the Neutron Converter will skip it. If it's the only node in the + # partition, the graph would end up empty. + return False + + return True + @staticmethod def _is_supported_in_IR( node: Node, @@ -36,6 +63,14 @@ def _is_supported_in_IR( " format. Please report this." ) + # The conversion requires the output shape to be known and static. + if not node_has_well_defined_shape(node): + return False + + if len(node.meta["val"].shape) != 4: + # Unexpected case. The input should always be 4D. 
+ return False + return True @staticmethod @@ -45,39 +80,62 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: - # Neutron requires static shapes. - # neutron-converter/src/OperatorC/UpsamplePlugin.cpp?at=NEUTRON_SOFTWARE_2.2.3#74 - if not node_has_well_defined_shape(node): - return False - - if len(node.meta["val"].shape) != 4: - # Unexpected case. The input should always be 4D. - return False - - # The tensors here use the channels first format (NCHW). + # The tensors are always 4D and use the channels first format (NCHW). _, in_c, in_h, in_w = node.all_input_nodes[0].meta["val"].shape _, _, out_h, out_w = node.meta["val"].shape - # Neutron supports only the doubling and quadrupleing of both height and width at the same time. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#768 - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 - supported_scales = [2, 4] - if not any( - in_h * scale == out_h and in_w * scale == out_w - for scale in supported_scales - ): - return False - - # Neutron requires the input channels to be a multiple of `num_macs`. - # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#767 - if in_c % neutron_target_spec.get_num_macs() != 0: - return False + if custom_delegation_options.use_new_flow_neutron_c: + # Requirements specified by the new Neutron flow documentation. + + if not NodeConverter.uses_quantization_type_for_io( + node, + supported_types=[torch.int8, torch.uint8], + input_indices=[0], + output_indices=[0], + ): + return False + + supported_scales = [1, 2, 4, 8] + h_scale, w_scale = UpsampleNearest2DConverter._get_effective_scales(node) + # The H and W scales don't need to be equal but both must be supported. 
+ if (h_scale not in supported_scales) or (w_scale not in supported_scales): + return False + + else: + # Requirements of the old Neutron flow. + + # Neutron supports only the doubling and quadrupleing of both height and width at the same time. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#768 + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#778 + supported_scales = [2, 4] + if not any( + in_h * scale == out_h and in_w * scale == out_w + for scale in supported_scales + ): + return False + + # Neutron requires the input channels to be a multiple of `num_macs`. + # neutron-library/src/utils/NeutronLibraryInterrogation.cpp?at=refs%2Ftags%2FNEUTRON_SOFTWARE_2.2.3#767 + if in_c % neutron_target_spec.get_num_macs() != 0: + return False return True + @staticmethod + def _get_effective_scales(node: Node) -> tuple[HeightScale, WidthScale]: + # Neutron supports variants where `align_corners=False` and `align_corners=True`. ExecuTorch doesn't have this + # parameter. Its behavior is equivalent to `align_corners=False`. Hence, the scale calculation corresponds to + # the `align_corners=False` case in the Neutron documentation. + _, _, in_h, in_w = node.all_input_nodes[0].meta["val"].shape + _, _, out_h, out_w = node.meta["val"].shape + h_scale = out_h / in_h + w_scale = out_w / in_w + + return h_scale, w_scale + def convert(self, node: Node): """Convert the `aten.upsample_nearest2d.vec` operator to Neutron IR `ResizeNearestNeighbor`. - The schema is: + The ExecuTorch schema is: aten::upsample_nearest2d.vec( Tensor input, SymInt[]? output_size, @@ -90,6 +148,8 @@ def convert(self, node: Node): x = t_op.tmp_inputs[0] y = t_op.tmp_outputs[0] + # Neutron supports variants where `align_corners=False` and `align_corners=True`. ExecuTorch doesn't have this + # parameter. Its behavior is equivalent to `align_corners=False` and `half_pixel_centers=False`. 
t_op.builtin_options = ResizeNearestNeighbor(False, False) # The `aten.upsample_nearest2d` can use either the `size` attribute or the `scale_factor` to define the output diff --git a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py index 3d9ec84dec9..27d1ac718a0 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_convert_upsample_nearest2d.py @@ -4,12 +4,15 @@ # LICENSE file in the root directory of this source tree. import numpy as np + +# noinspection PyUnusedImports import pytest import torch from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -17,7 +20,14 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + AddTensor, + ExecutorchDelegateCall, + UpsampleNearest2D, +) +from executorch.backends.nxp.tests.use_qat import * # noqa F403 @pytest.fixture(autouse=True) @@ -26,11 +36,6 @@ def reseed_model_per_test_run(): np.random.seed(23) -# noinspection PyProtectedMember -ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate -UpsampleNearest2D = exir_ops.edge.aten.upsample_nearest2d.vec - - class UpsampleNearestModule(torch.nn.Module): def __init__(self, size=None, scale=None): @@ -41,6 +46,13 @@ def forward(self, x): return self.upsample(x) +class 
UpsampleNearestAddModule(UpsampleNearestModule): + + def forward(self, x): + x = super().forward(x) + return x + x + + @pytest.mark.parametrize( "input_shape, size", [ @@ -181,3 +193,120 @@ def test_convert_upsample_nearest2d__no_delegation__unsupported_size(input_shape # Make sure the `upsample` was NOT delegated (size != double of input). assert not graph_contains_any_of_ops(delegated_ep.graph, [ExecutorchDelegateCall]) assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D]) + + +class TestUpsampleNearest2DNewNeutronFlow: + + # noinspection PyMethodMayBeStatic + def assert_delegated( + self, + model, + input_shape, + mocker, + use_qat=False, + expected_delegated_ops=None, + ): + if expected_delegated_ops is None: + expected_delegated_ops = {UpsampleNearest2D: 1} + + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops=expected_delegated_ops, + expected_non_delegated_ops={}, + ) + + # Cover also negative values to thoroughly test the operator. + dataset_creator = RandomDatasetCreator(low=-2, high=2) + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset_creator, + use_qat=use_qat, + use_new_flow_neutron_c=True, # Use the new flow. 
+ ) + + # noinspection PyMethodMayBeStatic + def assert_not_delegated(self, model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [UpsampleNearest2D]) + + def test__qat(self, mocker, use_qat): + input_shape = (1, 2, 3, 4) + output_size = (6, 8) + model = UpsampleNearestModule(size=output_size) + self.assert_delegated(model, input_shape, mocker, use_qat=use_qat) + + @pytest.mark.parametrize( + "input_shape, output_size", + [ + pytest.param((1, 2, 3, 4), (6, 8), id="batch=1, scale_h=scale_w=2"), + pytest.param((1, 2, 3, 3), 6, id="batch=1, scale_h=scale_w=2, scalar size"), + pytest.param( + (3, 3, 3, 5), + (6, 5), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (3, 16), id="batch=2, scale_h=1, scale_w=4"), + pytest.param((2, 2, 3, 4), (24, 8), id="batch=2, scale_h=8, scale_w=2"), + ], + ) + def test__output_size(self, mocker, input_shape, output_size): + model = UpsampleNearestModule(size=output_size) + self.assert_delegated(model, input_shape, mocker) + + def test__output_size__unsupported(self): + input_shape = (1, 2, 3, 4) + output_size = (9, 12) # scale = (3, 3) + model = UpsampleNearestModule(size=output_size) + self.assert_not_delegated(model, input_shape) + + @pytest.mark.parametrize( + "input_shape, scale", + [ + pytest.param((1, 2, 3, 4), (2, 2), id="batch=1, scale_h=scale_w=2"), + pytest.param( + (1, 2, 3, 4), 4, id="batch=1, scale_h=scale_w=4, scalar scale" + ), + pytest.param( + (3, 3, 3, 5), + (2, 1), + id="batch=3, scale_h=2, scale_w=1 (no num_macs multiples)", + ), + pytest.param((2, 2, 3, 4), (4, 1), id="batch=2, scale_h=4, scale_w=1"), + pytest.param((2, 2, 3, 4), (2, 8), id="batch=2, scale_h=2, scale_w=8"), + ], + ) + def test__scales(self, mocker, input_shape, scale): + 
model = UpsampleNearestModule(scale=scale) + self.assert_delegated(model, input_shape, mocker) + + def test__scales__unsupported(self): + input_shape = (1, 2, 3, 4) + scale = (3, 3) + model = UpsampleNearestModule(scale=scale) + self.assert_not_delegated(model, input_shape) + + def test__noop__alone_in_partition__not_delegated(self): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleNearestModule(scale=scale) + self.assert_not_delegated(model, input_shape) + + def test__noop__not_alone_in_partition__delegated(self, mocker): + input_shape = (1, 2, 3, 4) + scale = 1 + model = UpsampleNearestAddModule(scale=scale) + self.assert_delegated( + model, + input_shape, + mocker, + expected_delegated_ops={UpsampleNearest2D: 1, AddTensor: 1}, + ) From 501d6415437eae895531d3783bf622f6ccb56f40 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Fri, 29 May 2026 09:38:52 +0200 Subject: [PATCH 072/103] Arm backend: Fix bug causing empty partition reports (#19842) logger.level was used to determine whether to add the partition_report.txt FileHandler to the logger. This value is not set by logging.basicConfig, and defaults to 0.
This caused empty reports to be output when intermediate path was set and logging was > info Instead, use .getEffectiveLevel() cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Erik Lundell --- backends/arm/tosa/partitioner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/backends/arm/tosa/partitioner.py b/backends/arm/tosa/partitioner.py index d93e212c314..37b9cd7cc2a 100644 --- a/backends/arm/tosa/partitioner.py +++ b/backends/arm/tosa/partitioner.py @@ -550,7 +550,10 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult: partition_tags = {tag: self.delegation_spec for tag in tags} tag_constant_data(exported_program) - if self.intermediate_path is not None and logger.level <= logging.INFO: + if ( + self.intermediate_path is not None + and logger.getEffectiveLevel() <= logging.INFO + ): intermediate_path = Path(self.intermediate_path) intermediate_path.mkdir(parents=True, exist_ok=True) file_handler = logging.FileHandler( From ea37954cd7eeec168608010f8faaaa6c9ccfa6bc Mon Sep 17 00:00:00 2001 From: Tom Allsop <72802373+tom-arm@users.noreply.github.com> Date: Fri, 29 May 2026 09:58:02 +0100 Subject: [PATCH 073/103] Arm backend: Add BF16 layer tests for Qwen (#19767) * Add layers that run in BF16 in the HF model Change-Id: If75434db138059f3a433a70abda3f3e26f6dd3b6 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani --------- Signed-off-by: Tom Allsop --- .../models/Qwen3_VL/test_qwen3_vl_layers.py | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_layers.py b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_layers.py index 77b2739167a..f1ffe35b14e 100644 --- a/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_layers.py +++ b/backends/arm/test/models/Qwen3_VL/test_qwen3_vl_layers.py @@ -33,7 +33,7 @@ 
Qwen3VLVisionRotaryEmbedding, ) -input_t = Tuple[torch.Tensor, ...] +input_t = Tuple[torch.Tensor | int, ...] def _make_qwen3_vl_2b_instruct_layer_config(): @@ -99,6 +99,19 @@ def prepare_model_and_inputs(cls): raise NotImplementedError +def _to_bfloat16( + model: torch.nn.Module, inputs: input_t +) -> tuple[torch.nn.Module, input_t]: + return model.to(torch.bfloat16), tuple( + ( + x.to(torch.bfloat16) + if isinstance(x, torch.Tensor) and x.is_floating_point() + else x + ) + for x in inputs + ) + + class Qwen3VLVisionMLPModel(Qwen3VLTestModule): def __init__(self, config) -> None: super().__init__() @@ -442,6 +455,18 @@ class Qwen3VLTestCase: VGF_NO_QUANT_TEST_CASES: dict[str, Qwen3VLTestCase] = TOSA_FP_TEST_CASES +TOSA_BF16_TEST_CASES: dict[str, Qwen3VLTestCase] = { + "vision_mlp": TOSA_FP_TEST_CASES["vision_mlp"], + "vision_patch_embed": TOSA_FP_TEST_CASES["vision_patch_embed"], + "vision_rotary_embedding": TOSA_FP_TEST_CASES["vision_rotary_embedding"], + "vision_rotary_apply": TOSA_FP_TEST_CASES["vision_rotary_apply"], + "vision_attention": TOSA_FP_TEST_CASES["vision_attention"], + "vision_block": TOSA_FP_TEST_CASES["vision_block"], + "vision_patch_merger": TOSA_FP_TEST_CASES["vision_patch_merger"], + "text_rms_norm": TOSA_FP_TEST_CASES["text_rms_norm"], + "qk_norm": TOSA_FP_TEST_CASES["qk_norm"], +} + @common.parametrize( "test_case", @@ -460,6 +485,27 @@ def test_qwen3_vl_tosa_FP(test_case: Qwen3VLTestCase): pipeline.run() +@common.parametrize( + "test_case", + TOSA_BF16_TEST_CASES, +) +def test_qwen3_vl_tosa_FP_bf16(test_case: Qwen3VLTestCase): + model, inputs = test_case.model_cls.prepare_model_and_inputs() + model, inputs = _to_bfloat16(model, inputs) + with torch.no_grad(): + pipeline = TosaPipelineFP[input_t]( + model, + inputs, + aten_op=[], + exir_op=[], + transform_passes=list(test_case.transform_passes), + tosa_extensions=["bf16"], + atol=1e-2, + rtol=1e-2, + ) + pipeline.run() + + @common.SkipIfNoModelConverter @common.parametrize( "test_case", From 
f6be9851aa90b373a212d4eab24614d561c44c43 Mon Sep 17 00:00:00 2001 From: Xingguo Li <100689130+xingguo01@users.noreply.github.com> Date: Fri, 29 May 2026 10:01:03 +0100 Subject: [PATCH 074/103] LLM support: improve VGF export and calibration pipeline (#19157) This is stacked on top of https://github.com/pytorch/executorch/pull/19029 - make non-KV-cache example inputs match the static export window - fix PT2E calibration flow for padded prefixes and optional LM-Eval tasks - update SmolLM2 export settings used by the VGF PT2E workflow - Fix rope_theta in 135M_config.json to align with Hugging face model config cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Xingguo Li Co-authored-by: Zingo Andersen --- examples/models/llama/eval_llama_lib.py | 94 +++++++++---- examples/models/llama/evaluate/eager_eval.py | 8 +- examples/models/llama/model.py | 23 +++- extension/llm/export/builder.py | 131 +++++++++++++------ 4 files changed, 183 insertions(+), 73 deletions(-) diff --git a/examples/models/llama/eval_llama_lib.py b/examples/models/llama/eval_llama_lib.py index 23d00ff8c15..b562a2b3c70 100644 --- a/examples/models/llama/eval_llama_lib.py +++ b/examples/models/llama/eval_llama_lib.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -46,9 +47,13 @@ def __init__( use_kv_cache: bool = False, generate_full_logits: bool = False, enable_dynamic_shape: bool = True, + device: Optional[str] = None, ): super().__init__( - model=model, tokenizer=tokenizer, max_seq_length=max_seq_length + model=model, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + device=device, ) self._model = model.to(self.device) self._use_kv_cache = use_kv_cache @@ -57,30 +62,70 @@ def __init__( def _model_call(self, inps): if self._use_kv_cache: - if not self._enable_dynamic_shape: - # graph module exported without dynamic shape won't work with a different shape. - # And we have to do single token prefill here. - result_logits = [] - for pos in range(inps.shape[-1]): - pos_tensor = torch.tensor([pos], dtype=torch.int64) - logits = self._model( - inps[:, pos : pos + 1], {"input_pos": pos_tensor} - ) - result_logits.append(logits) - if self._generate_full_logits: - return torch.cat(result_logits, dim=1) - else: - return torch.stack(result_logits, dim=1) - else: - pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) - # Batch process the whole sequence. - logits = self._model( - inps[:, : self._max_seq_length], {"input_pos": pos_tensor} - ) - return logits + return self._model_call_kv_cache(inps) + return self._model_call_no_kv_cache(inps) - else: - return self._model(inps) + def _model_call_kv_cache(self, inps): + if self._enable_dynamic_shape: + pos_tensor = torch.tensor([0], dtype=torch.int64, device=self.device) + return self._model( + inps[:, : self._max_seq_length], {"input_pos": pos_tensor} + ) + + # graph module exported without dynamic shape won't work with a different shape. + # And we have to do single token prefill here. 
+ result_logits = [] + for pos in range(inps.shape[-1]): + pos_tensor = torch.tensor([pos], dtype=torch.int64) + logits = self._model(inps[:, pos : pos + 1], {"input_pos": pos_tensor}) + result_logits.append(logits) + if self._generate_full_logits: + return torch.cat(result_logits, dim=1) + return torch.stack(result_logits, dim=1) + + def _model_call_no_kv_cache(self, inps): + # lm-eval expects logits shaped [batch, seq, vocab]. In the non-KV path, + # some exported graphs (when generate_full_logits=False) return only + # last-position logits [batch, vocab], so reconstruct per-position + # logits by running prefix calls. + if not self._enable_dynamic_shape and not self._generate_full_logits: + raise ValueError( + "Static non-KV lm-eval requires generate_full_logits=True " + "so logits can be read from the last non-pad token." + ) + + if self._generate_full_logits: + return self._model(self._pad_to_max_len(inps)) + + result_logits = [] + seq_len = inps.shape[-1] + for pos in range(min(seq_len, self._max_seq_length)): + prefix = self._pad_to_max_len(inps[:, : pos + 1]) + logits = self._model(prefix) + if logits.dim() == 3: + logits = logits[:, -1, :] + result_logits.append(logits) + + return torch.stack(result_logits, dim=1) + + def _pad_to_max_len(self, tokens: torch.Tensor) -> torch.Tensor: + if self._enable_dynamic_shape: + return tokens + token_len = tokens.shape[-1] + if token_len > self._max_seq_length: + return tokens[:, : self._max_seq_length] + if token_len == self._max_seq_length: + return tokens + + pad_len = self._max_seq_length - token_len + pad_token = getattr(self._tokenizer, "pad_id", self._tokenizer.eos_id) + pad = torch.full( + (tokens.shape[0], pad_len), + pad_token, + dtype=tokens.dtype, + device=tokens.device, + ) + return torch.cat((tokens, pad), dim=-1) def _model_generate(self, context, max_length, eos_token_id): raise Exception("unimplemented") @@ -219,6 +264,7 @@ def gen_eval_wrapper( tokenizer=tokenizer, 
max_seq_length=llm_config.export.max_seq_length, use_kv_cache=llm_config.model.use_kv_cache, + generate_full_logits=llm_config.debug.generate_full_logits, enable_dynamic_shape=llm_config.model.enable_dynamic_shape, ) else: diff --git a/examples/models/llama/evaluate/eager_eval.py b/examples/models/llama/evaluate/eager_eval.py index 9d5d7ad447b..5c129e1c250 100644 --- a/examples/models/llama/evaluate/eager_eval.py +++ b/examples/models/llama/evaluate/eager_eval.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. @@ -28,12 +29,13 @@ def __init__( tokenizer: Union[SentencePieceTokenizer, Tiktoken, HuggingFaceTokenizer], max_seq_length: Optional[int] = None, use_kv_cache: bool = False, + device: Optional[str] = None, ): - device = "cuda" if torch.cuda.is_available() else "cpu" - super().__init__(device=device, pretrained="gpt2") + resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu") + super().__init__(device=resolved_device, pretrained="gpt2") self._model = model self._tokenizer = tokenizer - self._device = torch.device(device) + self._device = torch.device(resolved_device) self._max_seq_length = 2048 if max_seq_length is None else max_seq_length self._use_kv_cache = use_kv_cache diff --git a/examples/models/llama/model.py b/examples/models/llama/model.py index f02621b66b2..8ae146dda0f 100644 --- a/examples/models/llama/model.py +++ b/examples/models/llama/model.py @@ -1,5 +1,6 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -285,11 +286,25 @@ def get_example_inputs(self): if self.use_kv_cache: return self.get_example_inputs_kvcache_sdpa() else: - return ( - torch.tensor( - [[1, 2, 3]], dtype=torch.long - ), # tokens, with kv cache our input token length is always just 1 token. + max_seq_len = getattr(self.llm_config.export, "max_seq_length", 3) + # Preserve the historical three-token example input as the minimum. + max_seq_len = max(3, int(max_seq_len)) + max_len = max_seq_len - 1 if self.enable_dynamic_shape else max_seq_len + backend = self.llm_config.backend + token_dtype = ( + torch.int32 + if ( + backend.ethosu.enabled + or backend.tosa.enabled + or backend.vgf.enabled + ) + else torch.long ) + example_tokens = torch.arange(max_len, dtype=token_dtype).unsqueeze(0) + vocab_size = int(getattr(self.model_.params, "vocab_size", 0)) + if vocab_size > 1: + example_tokens = example_tokens % (vocab_size - 1) + 1 + return (example_tokens,) # assumption is the custom op doesnt support dynamic shape right now. It might but its untested so lets first get static shape working def get_example_inputs_kvcache_sdpa(self): diff --git a/extension/llm/export/builder.py b/extension/llm/export/builder.py index c25c1190990..5928e40dc4d 100644 --- a/extension/llm/export/builder.py +++ b/extension/llm/export/builder.py @@ -256,6 +256,35 @@ def run_canonical_optimizations(self): assert res.graph_module is not None, "Pass returned None" self.pre_autograd_graph_module = res.graph_module + def _check_calibration_prefix_options(self) -> None: + if ( + not self.use_kv_cache + and not self.enable_dynamic_shape + and not self.generate_full_logits + ): + raise ValueError( + "Static non-KV calibration with padded prefixes requires " + "generate_full_logits so calibration can sample the last " + "non-pad token position." 
+ ) + + def _prepare_calibration_prefix( + self, token_list: List[int], pos: int, max_len: int, pad_token: int + ) -> Tuple[torch.Tensor, int]: + prefix_tokens = list(token_list[: pos + 1]) + logits_token_pos = min(len(prefix_tokens), max_len) - 1 + + if self.enable_dynamic_shape: + prefix_tokens = prefix_tokens[:max_len] + elif len(prefix_tokens) < max_len: + prefix_tokens.extend([pad_token] * (max_len - len(prefix_tokens))) + else: + prefix_tokens = prefix_tokens[:max_len] + + input_dtype = self.example_inputs[0].dtype + prefix = torch.tensor(prefix_tokens, dtype=input_dtype).unsqueeze(0) + return prefix, logits_token_pos + def pt2e_calibrate( self, prepared_module, @@ -266,39 +295,41 @@ def pt2e_calibrate( tokenizer_path, ): logging.info("Run calibration...") - try: - from executorch.examples.models.llama.eval_llama_lib import ( - GraphModuleEvalWrapper, - ) - from lm_eval.evaluator import simple_evaluate - except ImportError: - raise ImportError( - "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" - ) - + self._check_calibration_prefix_options() tokenizer = get_tokenizer(tokenizer_path) def calibrate_template( module: torch.fx.GraphModule, tokenizer, prompts: str, max_len: int ): # TODO: change criteria & support batch inputs if necessary - pos = torch.tensor(0, dtype=torch.int64) + pos = 0 token_list = tokenizer.encode(prompts, bos=True, eos=False) + pad_token = getattr(tokenizer, "pad_id", tokenizer.eos_id) + with torch.no_grad(): while token_list[-1] != tokenizer.eos_id and pos < max_len: - logits = module( - torch.full((1, 1), token_list[pos]), - {"input_pos": torch.tensor((pos,))}, - ) + logits_token_pos = -1 + if self.use_kv_cache: + logits = module( + torch.full((1, 1), token_list[pos]), + {"input_pos": torch.tensor((pos,))}, + ) + else: + prefix, logits_token_pos = self._prepare_calibration_prefix( + token_list, pos, max_len, pad_token + ) + logits = module(prefix) + pos += 1 if pos >= len(token_list): if 
self.generate_full_logits: - token_list.append( - torch.argmax(logits[:, -1], dim=-1).item() - ) + next_token = torch.argmax( + logits[:, logits_token_pos], dim=-1 + ).item() else: - token_list.append(torch.argmax(logits[:], dim=-1).item()) + next_token = torch.argmax(logits[:], dim=-1).item() + token_list.append(next_token) calibrate_template( module=prepared_module, @@ -307,26 +338,41 @@ def calibrate_template( max_len=calibration_seq_length, ) - eval_wrapper = GraphModuleEvalWrapper( - model=prepared_module, - tokenizer=tokenizer, - max_seq_length=calibration_seq_length, - use_kv_cache=self.use_kv_cache, - generate_full_logits=self.generate_full_logits, - enable_dynamic_shape=self.enable_dynamic_shape, - ) + if calibration_tasks: + try: + from executorch.examples.models.llama.eval_llama_lib import ( + GraphModuleEvalWrapper, + ) + from lm_eval.evaluator import simple_evaluate + except ImportError: + raise ImportError( + "Please install the llm eval dependency via examples/models/llama/install_requirements.sh" + ) - # Evaluate the model - with torch.no_grad(): - eval_results = simple_evaluate( - model=eval_wrapper, - tasks=calibration_tasks, - limit=calibration_limit, + eval_wrapper = GraphModuleEvalWrapper( + model=prepared_module, + tokenizer=tokenizer, + max_seq_length=calibration_seq_length, + use_kv_cache=self.use_kv_cache, + generate_full_logits=self.generate_full_logits, + enable_dynamic_shape=self.enable_dynamic_shape, + # The exported graph can contain ops like aten.full.default + # without explicit device, which default to CPU and can + # trigger device-mismatch errors when lm_eval runs on CUDA. + # Calibrate on CPU for stability. 
+ device="cpu", ) - for task, res in eval_results["results"].items(): - print(f"{task}: {res}") - logging.info("Calibration finish...") + with torch.no_grad(): + eval_results = simple_evaluate( + model=eval_wrapper, + tasks=calibration_tasks, + limit=calibration_limit, + ) + + for task, res in eval_results["results"].items(): + print(f"{task}: {res}") + logging.info("Calibration finish...") def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManager": """ @@ -351,18 +397,19 @@ def pt2e_quantize(self, quantizers: Optional[List[Quantizer]]) -> "LLMEdgeManage assert ( self.pre_autograd_graph_module is not None ), "Please run export() first" + if self.calibration_tasks and self.calibration_limit is None: + logging.warning( + "calibration_tasks provided without calibration_limit; " + "lm-eval will run the full task dataset during " + "calibration." + ) m = prepare_pt2e( self.pre_autograd_graph_module, # pyre-ignore[6] composed_quantizer, ) - logging.info( - f"Calibrating with tasks: {self.calibration_tasks}, limit: {self.calibration_limit}, calibration_data: {self.calibration_data}, tokenizer_path: {self.tokenizer_path}, seq_length: {self.calibration_seq_length}" - ) # Calibrate if ( - self.calibration_tasks is not None - and self.calibration_limit is not None - and self.calibration_seq_length is not None + self.calibration_seq_length is not None and self.calibration_data is not None and self.tokenizer_path is not None ): From 1494535ba2d391c274a225dd03b2d81c429944c8 Mon Sep 17 00:00:00 2001 From: Michiel Olieslagers <44864547+Michiel-Olieslagers@users.noreply.github.com> Date: Fri, 29 May 2026 10:03:49 +0100 Subject: [PATCH 075/103] Arm backend: Fix VKML install bug for macOS. 
(#19612) Change-Id: Id97fcb787369b62aecd4a0be27132ff4a0785fcf cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Michiel Olieslagers --- backends/arm/scripts/vulkan_utils.sh | 31 +++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/backends/arm/scripts/vulkan_utils.sh b/backends/arm/scripts/vulkan_utils.sh index c8b169c0c3d..520c244c6fb 100644 --- a/backends/arm/scripts/vulkan_utils.sh +++ b/backends/arm/scripts/vulkan_utils.sh @@ -71,6 +71,9 @@ function install_vulkan_sdk_macos() { fi log_step "vulkan" "Extracting Vulkan SDK installer" + rm -rf \ + "vulkansdk-macOS-${vulkan_sdk_version}.app" \ + "vulkansdk-macos-${vulkan_sdk_version}.app" unzip -q -o "${vulkan_sdk_zip_file}" local vulkan_sdk_app_path="" @@ -91,15 +94,33 @@ function install_vulkan_sdk_macos() { local install_root="$(cd "${root_dir}" && pwd)/${vulkan_sdk_base_dir}/${vulkan_sdk_version}" mkdir -p "${install_root}" - local vulkan_sdk_root="${root_dir}/${vulkan_sdk_base_dir}" log_step "vulkan" "Installing Vulkan SDK (${vulkan_sdk_version}) to ${install_root}" - ${vulkan_sdk_installer} --root "${install_root}" --accept-licenses --default-answer --confirm-command install + "${vulkan_sdk_installer}" --root "${install_root}" --accept-licenses --default-answer --confirm-command install +} + +function validate_vulkan_sdk_installation() { + if [[ ! -d "${root_dir}/${vulkan_sdk_bin_dir}" ]]; then + return 1 + fi + + vulkan_sdk_bin_path="$(cd "${root_dir}/${vulkan_sdk_bin_dir}" && pwd)" + if [[ ! 
-x "${vulkan_sdk_bin_path}/glslc" ]]; then + return 1 + fi + + "${vulkan_sdk_bin_path}/glslc" --version > /dev/null 2>&1 } function setup_vulkan_sdk() { cd "${root_dir}" + if validate_vulkan_sdk_installation; then + log_step "vulkan" "Reusing Vulkan SDK at ${root_dir}/${vulkan_sdk_base_dir}/${vulkan_sdk_version}" + log_step "vulkan" "Vulkan SDK validation (glslc) succeeded" + return + fi + if [[ "${os_name}" == "Darwin" ]]; then install_vulkan_sdk_macos else @@ -117,11 +138,11 @@ function setup_vulkan_sdk() { exit 1 fi - if ${vulkan_sdk_bin_path}/glslc --version > /dev/null 2>&1; then + if "${vulkan_sdk_bin_path}/glslc" --version > /dev/null 2>&1; then log_step "vulkan" "Vulkan SDK validation (glslc) succeeded" else log_step "vulkan" "Error: Vulkan SDK validation failed" - ${vulkan_sdk_bin_path}/glslc --version + "${vulkan_sdk_bin_path}/glslc" --version exit 1 fi } @@ -143,7 +164,7 @@ function setup_path_vulkan() { vulkan_sdk_arch_root="$(cd "${vulkan_sdk_arch_root}" && pwd)" vulkan_sdk_bin_path="$(cd "${vulkan_sdk_bin_dir}" && pwd)" - append_env_in_setup_path PATH ${vulkan_sdk_bin_path} + append_env_in_setup_path PATH "${vulkan_sdk_bin_path}" if [[ "${OS:-}" == "Darwin" ]]; then prepend_env_in_setup_path DYLD_LIBRARY_PATH "${vulkan_sdk_arch_root}/lib" local moltenvk_icd_path="${vulkan_sdk_arch_root}/share/vulkan/icd.d/MoltenVK_icd.json" From 513a4eaef4411325ae537beb44fe33eaf75205c3 Mon Sep 17 00:00:00 2001 From: Yufeng Shi Date: Fri, 29 May 2026 10:05:33 +0100 Subject: [PATCH 076/103] Arm backend: Avoid running passes with no matching target ops (#19839) Add ArmPass.should_run_pass() as a reusable early-exit hook before call() starts the normal ExportPass retracing path. The default hook returns true, preserving existing behavior for ArmPass subclasses. Introduce ArmOpTargetedPass for passes that only transform a known set of operator targets. It implements should_run_pass() by scanning the current graph and nested GraphModules for matching target operators. 
If no matching target operator is found, the pass returns an unmodified PassResult. For passes that already gate transformations with allowed_to_transform(), allow the target pre-scan to apply the same check before deciding whether the pass needs to run. This avoids running TFA passes when all matching target nodes are marked as disallowed. The should_run_pass() hook and ArmOpTargetedPass pre-scan avoid rebuilding graphs for decomposition and rewrite passes that cannot affect the current graph. The speedup is most visible on large models. Single-run paired benchmarks on Arm backend model tests across FP32, INT, VGF no-quant, and VGF quant variants: | Model | E2E avg | Pass-manager avg | |-------------|--------:|-----------------:| | T5-small | +30.5% | +47.5% | | DeepLabV3 | +12.9% | +49.8% | | Wav2Letter | +16.9% | +51.2% | | InceptionV3 | +22.2% | +46.5% | | MobileNetV2 | +22.2% | +52.5% | | MobileNetV3 | +29.9% | +54.6% | Model rows are unweighted averages over successful variants. Unweighted average across 23 successful model/target variants: E2E speedup: +22.4% Pass-manager speedup: +50.5% Change-Id: Iaa09638473a1d6d1e2ce98f5a0e3fc3a14378143 cc @digantdesai @freddan80 @per @zingo @oscarandersson8218 @mansnils @Sebastian-Larsson @robell @rascani Signed-off-by: Yufeng Shi Co-authored-by: Erik Lundell --- backends/arm/_passes/__init__.py | 2 +- .../arm/_passes/accumulate_index_put_pass.py | 8 +- backends/arm/_passes/arm_pass.py | 99 +++++++++++- .../arm/_passes/canonicalize_gather_pass.py | 8 +- backends/arm/_passes/conv1d_unsqueeze_pass.py | 7 +- .../_passes/convert_expand_copy_to_repeat.py | 7 +- .../_passes/convert_full_like_to_full_pass.py | 9 +- .../convert_permute_singleton_to_view_pass.py | 7 +- .../arm/_passes/convert_squeezes_to_view.py | 13 +- backends/arm/_passes/convert_to_clamp_pass.py | 10 +- backends/arm/_passes/decompose_acosh_pass.py | 7 +- .../decompose_adaptive_avg_pool2d_pass.py | 8 +- .../_passes/decompose_add_sub_alpha_pass.py | 7 +- 
backends/arm/_passes/decompose_addmm_pass.py | 7 +- .../_passes/decompose_as_strided_copy_pass.py | 7 +- .../_passes/decompose_asin_and_acos_pass.py | 7 +- backends/arm/_passes/decompose_asinh_pass.py | 7 +- backends/arm/_passes/decompose_atan_pass.py | 7 +- backends/arm/_passes/decompose_atanh_pass.py | 7 +- .../arm/_passes/decompose_avg_pool2d_pass.py | 10 +- backends/arm/_passes/decompose_cosh_pass.py | 7 +- .../decompose_cosine_similarity_pass.py | 8 +- backends/arm/_passes/decompose_div_pass.py | 9 +- .../arm/_passes/decompose_div_tensor_mode.py | 10 +- backends/arm/_passes/decompose_elu_pass.py | 13 +- backends/arm/_passes/decompose_erfinv_pass.py | 7 +- backends/arm/_passes/decompose_expm1_pass.py | 7 +- .../_passes/decompose_floor_divide_pass.py | 7 +- backends/arm/_passes/decompose_gelu_pass.py | 7 +- backends/arm/_passes/decompose_glu_pass.py | 7 +- .../_passes/decompose_grouped_conv_pass.py | 9 +- .../decompose_index_select_to_gather_pass.py | 8 +- .../decompose_index_tensor_to_gather_pass.py | 8 +- .../arm/_passes/decompose_int_pow_pass.py | 7 +- .../arm/_passes/decompose_leaky_relu_pass.py | 8 +- .../decompose_linalg_vector_norm_pass.py | 10 +- backends/arm/_passes/decompose_log1p_pass.py | 7 +- backends/arm/_passes/decompose_logit_pass.py | 10 +- .../arm/_passes/decompose_masked_fill_pass.py | 7 +- .../decompose_maxpool2d_with_dilation_pass.py | 7 +- .../arm/_passes/decompose_meandim_pass.py | 18 ++- backends/arm/_passes/decompose_ne_pass.py | 7 +- .../_passes/decompose_permute_for_u55_pass.py | 7 +- .../arm/_passes/decompose_remainder_pass.py | 13 +- backends/arm/_passes/decompose_round_pass.py | 10 +- .../_passes/decompose_select_scatter_pass.py | 7 +- backends/arm/_passes/decompose_sign_pass.py | 7 +- backends/arm/_passes/decompose_sinh_pass.py | 7 +- .../_passes/decompose_slice_scatter_pass.py | 7 +- .../arm/_passes/decompose_softmax_pass.py | 9 +- backends/arm/_passes/decompose_sqrt_pass.py | 9 +- .../decompose_strided_slice_copy_pass.py | 8 +- 
backends/arm/_passes/decompose_sum_pass.py | 13 +- backends/arm/_passes/decompose_tan_pass.py | 7 +- .../decompose_tosa_unsupported_clamp_pass.py | 7 +- backends/arm/_passes/decompose_tril_pass.py | 9 +- .../decompose_unfold_to_gather_pass.py | 10 +- backends/arm/_passes/decompose_var_pass.py | 16 +- .../decompose_where_scalar_other_pass.py | 12 +- .../decorate_fp32_to_int32_casting_pass.py | 7 +- .../_passes/fuse_consecutive_concat_shapes.py | 7 +- backends/arm/_passes/insert_const_shapes.py | 8 +- .../_passes/insert_data_layout_casts_pass.py | 8 +- .../arm/_passes/insert_dynamic_padding.py | 13 +- ...malize_index_put_bool_index_tensor_pass.py | 7 +- .../normalize_index_put_none_indices_pass.py | 7 +- .../arm/_passes/promote_bool_operands_pass.py | 8 +- backends/arm/_passes/remove_noop_pass.py | 19 +-- .../arm/_passes/rewrite_avg_pool2d_pass.py | 8 +- .../rewrite_bool_bitwise_to_logical_pass.py | 7 +- ...ewrite_high_rank_singleton_permute_pass.py | 7 +- .../arm/_passes/rewrite_index_put_pass.py | 7 +- .../rewrite_inplace_arithmetic_pass.py | 6 +- .../_passes/rewrite_le_lt_to_ge_gt_pass.py | 6 +- .../arm/_passes/rewrite_max_pool2d_pass.py | 7 +- backends/arm/_passes/rewrite_pad.py | 8 +- backends/arm/_passes/rewrite_slice.py | 7 +- .../test/passes/test_arm_op_targeted_pass.py | 150 ++++++++++++++++++ 78 files changed, 593 insertions(+), 294 deletions(-) create mode 100644 backends/arm/test/passes/test_arm_op_targeted_pass.py diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py index 20bddf17793..3e881fdb9ef 100644 --- a/backends/arm/_passes/__init__.py +++ b/backends/arm/_passes/__init__.py @@ -5,7 +5,7 @@ from . 
import arm_pass_utils # noqa -from .arm_pass import ArmPass # noqa # usort: skip +from .arm_pass import ArmOpTargetedPass, ArmPass # noqa # usort: skip from .accumulate_index_put_pass import AccumulateIndexPutPass # noqa from .broadcast_args_pass import BroadcastArgsPass # noqa from .canonicalize_gather_pass import CanonicalizeGatherPass # noqa diff --git a/backends/arm/_passes/accumulate_index_put_pass.py b/backends/arm/_passes/accumulate_index_put_pass.py index 1194e08e2d8..9aa0457b0c7 100644 --- a/backends/arm/_passes/accumulate_index_put_pass.py +++ b/backends/arm/_passes/accumulate_index_put_pass.py @@ -6,7 +6,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_index_tensor_to_gather_pass import ( DecomposeIndexTensorToGatherPass, ) @@ -32,7 +32,7 @@ def get_ops(op): raise RuntimeError(f"Can't get index_put decomposition for op {op}") -class AccumulateIndexPutPass(ArmPass): +class AccumulateIndexPutPass(ArmOpTargetedPass): """This pass adjusts the values arg when the accumulate arg is set to true for the index_put op. 
""" @@ -41,9 +41,11 @@ class AccumulateIndexPutPass(ArmPass): DecomposeIndexTensorToGatherPass, RewriteIndexPutPass, } + target_ops = aten_ops + edge_ops + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in (aten_ops + edge_ops) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) source, indices, values = args[:3] diff --git a/backends/arm/_passes/arm_pass.py b/backends/arm/_passes/arm_pass.py index add0f3aeb20..1b4fc677d18 100644 --- a/backends/arm/_passes/arm_pass.py +++ b/backends/arm/_passes/arm_pass.py @@ -7,6 +7,7 @@ import copy import traceback from abc import abstractmethod +from collections.abc import Collection from typing import Any, List, Optional, Set, Type import torch @@ -14,7 +15,7 @@ from executorch.backends.arm.tosa.mapping import TosaSpecialDtype from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue -from torch.fx import GraphModule +from torch.fx import GraphModule, Node from torch.fx.passes.infra.pass_base import PassResult from torch.utils import _pytree as pytree @@ -191,3 +192,99 @@ def call_scalar(self, value: int | float, meta: NodeMetadata | dict[str, Any]): meta=meta, updated=True, ) + + def should_run_pass(self, graph_module: GraphModule) -> bool: + """Return whether this pass should run on the graph module. + + Subclasses can override this to cheaply skip the pass before + ``call()`` starts the normal ``ExportPass`` retracing path. + + Args: + graph_module (GraphModule): The graph module to inspect. + + Returns: + bool: True when the pass should run. 
+ + """ + return True + + def __call__(self, graph_module: GraphModule) -> PassResult | None: + self.requires(graph_module) + if not self.should_run_pass(graph_module): + self.ensures(graph_module) + return PassResult(graph_module, False) + res = self.call(graph_module) + self.ensures(graph_module) + return res + + +class ArmOpTargetedPass(ArmPass): + """Base class for passes that only transform selected operators. + + Subclasses set ``target_ops`` to the call_function targets they can + transform. If the current graph and nested control-flow subgraphs do not + contain any target, the pass returns immediately without paying the default + ExportPass retracing cost. + + Set ``check_allowed_to_transform`` to ``True`` when the target pre-scan + should also apply ``allowed_to_transform()`` to matching target nodes. This + is useful for TFA passes whose ``call_operator()`` leaves disallowed target + nodes unchanged. If all matching targets are disallowed, the pass can + return before entering the normal ``ExportPass`` path. + + """ + + target_ops: Collection[Any] = () + check_allowed_to_transform = False + + def has_target_node(self, graph_module: GraphModule) -> bool: + """Return whether the graph module tree contains a target node. + + Args: + graph_module (GraphModule): The graph module tree to inspect. + + Returns: + bool: True if a matching call_function node is present. 
+ + """ + visited_graph_modules = set() + + def target_node_can_trigger_pass(node: Node) -> bool: + if not self.check_allowed_to_transform: + return True + if self.allowed_to_transform(node.meta): + return True + return False + + def graph_has_target(module: GraphModule) -> bool: + if id(module) in visited_graph_modules: + return False + visited_graph_modules.add(id(module)) + + for target in self.target_ops: + for node in module.graph.find_nodes( + op="call_function", + target=target, + sort=False, + ): + if target_node_can_trigger_pass(node): + return True + + return any( + isinstance(child, GraphModule) and graph_has_target(child) + for child in module.children() + ) + + return graph_has_target(graph_module) + + def should_run_pass(self, graph_module: GraphModule) -> bool: + """Return whether this pass has a target node to transform. + + Args: + graph_module (GraphModule): The graph module tree to inspect. + + Returns: + bool: True when a matching target node is present. + + """ + return self.has_target_node(graph_module) diff --git a/backends/arm/_passes/canonicalize_gather_pass.py b/backends/arm/_passes/canonicalize_gather_pass.py index 23886111b18..aaa77ce4002 100644 --- a/backends/arm/_passes/canonicalize_gather_pass.py +++ b/backends/arm/_passes/canonicalize_gather_pass.py @@ -6,12 +6,12 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class CanonicalizeGatherPass(ArmPass): +class CanonicalizeGatherPass(ArmOpTargetedPass): """Canonicalize gather so it can be lowered to TOSA.GATHER via the backend dialect. 
@@ -40,10 +40,10 @@ class CanonicalizeGatherPass(ArmPass): _passes_required_after: Set[Type[ExportPass]] = set() - _TARGET_OPS = {exir_ops.edge.aten.gather.default} + target_ops = {exir_ops.edge.aten.gather.default} def call_operator(self, op, args, kwargs, meta): - if op not in self._TARGET_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) # edge.aten.gather.default: (x, dim, index) with kw-only sparse_grad diff --git a/backends/arm/_passes/conv1d_unsqueeze_pass.py b/backends/arm/_passes/conv1d_unsqueeze_pass.py index cf1e884e05b..f81ef33e2d1 100644 --- a/backends/arm/_passes/conv1d_unsqueeze_pass.py +++ b/backends/arm/_passes/conv1d_unsqueeze_pass.py @@ -8,7 +8,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.rewrite_conv_pass import RewriteConvPass from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass @@ -17,7 +17,7 @@ from executorch.exir.pass_base import ExportPass -class Conv1dUnsqueezePass(ArmPass): +class Conv1dUnsqueezePass(ArmOpTargetedPass): """This pass is used to change conv1d ops into conv2d since TOSA only supports 2d and 3d convolution. 
@@ -34,9 +34,10 @@ class Conv1dUnsqueezePass(ArmPass): RewriteConvPass, SizeAdjustInputPass, } + target_ops = (exir_ops.edge.aten.convolution.default,) def call_operator(self, op, args, kwargs, meta): - if op != exir_ops.edge.aten.convolution.default: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) stride = list(args[3]) if len(stride) != 1: diff --git a/backends/arm/_passes/convert_expand_copy_to_repeat.py b/backends/arm/_passes/convert_expand_copy_to_repeat.py index 69056cb47f4..430dc70bd0c 100644 --- a/backends/arm/_passes/convert_expand_copy_to_repeat.py +++ b/backends/arm/_passes/convert_expand_copy_to_repeat.py @@ -9,7 +9,7 @@ import torch -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.backends.arm._passes.unsqueeze_before_repeat_pass import ( UnsqueezeBeforeRepeatPass, ) @@ -51,7 +51,7 @@ def calculate_multiples(args): return multiples, expanded_rank != len(input_shape) -class ConvertExpandCopyToRepeatPass(ArmPass): +class ConvertExpandCopyToRepeatPass(ArmOpTargetedPass): """Replace expand copy with repeat since it is a repeat that can only repeat singleton dimensions. 
""" @@ -60,9 +60,10 @@ class ConvertExpandCopyToRepeatPass(ArmPass): expand_copy = exir_ops.edge.aten.expand_copy.default repeat = exir_ops.edge.aten.repeat.default + target_ops = (expand_copy,) def call_operator(self, op, args, kwargs, meta): - if op != self.expand_copy: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) multiples, changes_rank = calculate_multiples(args) diff --git a/backends/arm/_passes/convert_full_like_to_full_pass.py b/backends/arm/_passes/convert_full_like_to_full_pass.py index 1e26f24250a..f7a94424228 100644 --- a/backends/arm/_passes/convert_full_like_to_full_pass.py +++ b/backends/arm/_passes/convert_full_like_to_full_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.backends.arm._passes.fuse_constant_ops_pass import ( ComputeConstantOpsAOTPass, ) @@ -14,7 +14,7 @@ from executorch.exir.pass_base import ExportPass -class ConvertFullLikeToFullPass(ArmPass): +class ConvertFullLikeToFullPass(ArmOpTargetedPass): """Convert edge aten full_like to full. 
As per the full_like PyTorch documentation, `torch.full_like(input, @@ -35,11 +35,10 @@ class ConvertFullLikeToFullPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOTPass} + target_ops = (exir_ops.edge.aten.full_like.default,) def call_operator(self, op, args, kwargs, meta): - if op not in [ - exir_ops.edge.aten.full_like.default, - ]: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) tensor = args[0].data diff --git a/backends/arm/_passes/convert_permute_singleton_to_view_pass.py b/backends/arm/_passes/convert_permute_singleton_to_view_pass.py index 7447cf037bc..0ed5f92f91d 100644 --- a/backends/arm/_passes/convert_permute_singleton_to_view_pass.py +++ b/backends/arm/_passes/convert_permute_singleton_to_view_pass.py @@ -6,7 +6,7 @@ from typing import Sequence, Set, Tuple, Type -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -20,7 +20,7 @@ ) -class ConvertPermuteSingletonToViewPass(ArmPass): +class ConvertPermuteSingletonToViewPass(ArmOpTargetedPass): """Replace permutations that only move singleton axes with a reshape. 
Examples: @@ -34,9 +34,10 @@ class ConvertPermuteSingletonToViewPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = _PERMUTE_TARGETS def call_operator(self, op, args, kwargs, meta): - if op not in _PERMUTE_TARGETS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) input_tensor = args[0].data diff --git a/backends/arm/_passes/convert_squeezes_to_view.py b/backends/arm/_passes/convert_squeezes_to_view.py index 2058c3407e3..b79e38cdf10 100644 --- a/backends/arm/_passes/convert_squeezes_to_view.py +++ b/backends/arm/_passes/convert_squeezes_to_view.py @@ -6,7 +6,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.fuse_view_copy_transform_pass import ( FuseViewCopyTransformPass, ) @@ -14,7 +14,7 @@ from executorch.exir.pass_base import ExportPass -class ConvertSqueezesToViewPass(ArmPass): +class ConvertSqueezesToViewPass(ArmOpTargetedPass): """Replaces squeeze/unsqueeze operators with view. These are simply special cases of the view op, so removing them gives us @@ -23,12 +23,13 @@ class ConvertSqueezesToViewPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {FuseViewCopyTransformPass} + target_ops = ( + exir_ops.edge.aten.squeeze_copy.dims, + exir_ops.edge.aten.unsqueeze_copy.default, + ) def call_operator(self, op, args, kwargs, meta): - if op not in [ - exir_ops.edge.aten.squeeze_copy.dims, - exir_ops.edge.aten.unsqueeze_copy.default, - ]: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) x = args[0] diff --git a/backends/arm/_passes/convert_to_clamp_pass.py b/backends/arm/_passes/convert_to_clamp_pass.py index effb46f25c4..6273759aa55 100644 --- a/backends/arm/_passes/convert_to_clamp_pass.py +++ b/backends/arm/_passes/convert_to_clamp_pass.py @@ -1,11 +1,11 @@ -# Copyright 2025 Arm Limited and/or its affiliates. 
+# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. from typing import Set, Tuple, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import ( QuantizeClampArgumentsPass, @@ -29,11 +29,13 @@ def get_clamp_params(op, args) -> Tuple[float | None, float | None]: raise ValueError(f"Getting clamp parameters for op {op} is not implemented.") -class ConvertToClampPass(ArmPass): +class ConvertToClampPass(ArmOpTargetedPass): _passes_required_after: Set[Type[ExportPass]] = {QuantizeClampArgumentsPass} + target_ops = edge_operators + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in edge_operators or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) return super().call_operator( diff --git a/backends/arm/_passes/decompose_acosh_pass.py b/backends/arm/_passes/decompose_acosh_pass.py index 3ce6d73abc3..3c2cac45e75 100644 --- a/backends/arm/_passes/decompose_acosh_pass.py +++ b/backends/arm/_passes/decompose_acosh_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass # noqa from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass @@ -21,7 +21,7 @@ edge_acosh_op = exir_ops.edge.aten.acosh.default -class DecomposeAcoshPass(ArmPass): +class DecomposeAcoshPass(ArmOpTargetedPass): """Decomposes acosh to supported TOSA-operations. 
This decomposition is based on the mathematical identity: @@ -36,10 +36,11 @@ class DecomposeAcoshPass(ArmPass): ReplaceScalarWithTensorByProfilePass, MatchArgDtypePass, } + target_ops = (edge_acosh_op,) def call_operator(self, op, args, kwargs, meta, updated=False): - if op is not edge_acosh_op: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py b/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py index eda9dd28bf9..58fcf69cd8f 100644 --- a/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py +++ b/backends/arm/_passes/decompose_adaptive_avg_pool2d_pass.py @@ -8,7 +8,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_avg_pool2d_pass import ( DecomposeAvgPool2dPass, ) @@ -36,7 +36,7 @@ def _get_decomposition(op) -> tuple: raise RuntimeError(f"Unable to get decomposition for op {op}") -class DecomposeAdaptiveAvgPool2dPass(ArmPass): +class DecomposeAdaptiveAvgPool2dPass(ArmOpTargetedPass): """Decomposes AdaptiveAvgPool2d into AvgPool2d operations. 
An input tensor of shape (N, C, H, W) is transformed into an output tensor @@ -47,9 +47,11 @@ class DecomposeAdaptiveAvgPool2dPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {DecomposeAvgPool2dPass} + target_ops = edge_ops + aten_ops + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta, updated=False): - if op not in (edge_ops + aten_ops) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta, updated) avg_pool2d_op, slice_op, cat_op = _get_decomposition(op) diff --git a/backends/arm/_passes/decompose_add_sub_alpha_pass.py b/backends/arm/_passes/decompose_add_sub_alpha_pass.py index d7db9c5bcf9..30903fbd3d8 100644 --- a/backends/arm/_passes/decompose_add_sub_alpha_pass.py +++ b/backends/arm/_passes/decompose_add_sub_alpha_pass.py @@ -9,7 +9,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -55,13 +55,14 @@ def _should_decompose(alpha) -> bool: return False -class DecomposeAddSubAlphaPass(ArmPass): +class DecomposeAddSubAlphaPass(ArmOpTargetedPass): """Rewrite add/sub with alpha into a mul followed by add/sub.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = _ADD_OPS + _SUB_OPS def call_operator(self, op, args, kwargs, meta, updated: bool | None = False): - if op not in _ADD_OPS + _SUB_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) alpha = kwargs.get("alpha", 1) diff --git a/backends/arm/_passes/decompose_addmm_pass.py b/backends/arm/_passes/decompose_addmm_pass.py index d1368602d5d..d198e1a3b64 100644 --- a/backends/arm/_passes/decompose_addmm_pass.py +++ b/backends/arm/_passes/decompose_addmm_pass.py @@ -7,7 +7,7 @@ 
import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass # noqa @@ -41,7 +41,7 @@ def get_ops(op): raise ValueError(f"Unsupported operator: {op}") -class DecomposeAddmmPass(ArmPass): +class DecomposeAddmmPass(ArmOpTargetedPass): """Decomposes the addmm operator into tensor multiplication and addition.""" _passes_required_after: Set[Type[ExportPass]] = { @@ -49,9 +49,10 @@ class DecomposeAddmmPass(ArmPass): MatchArgRanksPass, MatchArgDtypePass, } + target_ops = (edge_addmm, aten_addmm) def call_operator(self, op, args, kwargs, meta): - if op not in [edge_addmm, aten_addmm] or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) input, mat1, mat2 = args diff --git a/backends/arm/_passes/decompose_as_strided_copy_pass.py b/backends/arm/_passes/decompose_as_strided_copy_pass.py index a60d1b19fd9..c8c2a200bd8 100644 --- a/backends/arm/_passes/decompose_as_strided_copy_pass.py +++ b/backends/arm/_passes/decompose_as_strided_copy_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm.common.as_strided_utils import ( contiguous_strides, maybe_static_sequence, @@ -18,7 +18,7 @@ from executorch.exir.pass_base import ExportPass -class DecomposeAsStridedCopyPass(ArmPass): +class DecomposeAsStridedCopyPass(ArmOpTargetedPass): """Replace contiguous `aten.as_strided_copy` with `aten.view_copy`. 
The TOSA backend only supports the contiguous-as-strided case where the stride matches @@ -31,6 +31,7 @@ class DecomposeAsStridedCopyPass(ArmPass): _EDGE_OPS = (exir_ops.edge.aten.as_strided_copy.default,) _ATEN_OPS = (torch.ops.aten.as_strided_copy.default,) + target_ops = _EDGE_OPS + _ATEN_OPS def _extract_args( self, args: Tuple[object, ...], kwargs: dict @@ -76,7 +77,7 @@ def _extract_args( return size_tuple, stride_tuple, storage_offset def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False): - if op not in (*self._EDGE_OPS, *self._ATEN_OPS): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) extracted = self._extract_args(args, kwargs) diff --git a/backends/arm/_passes/decompose_asin_and_acos_pass.py b/backends/arm/_passes/decompose_asin_and_acos_pass.py index 707e6ec070d..5e0cfd66c32 100644 --- a/backends/arm/_passes/decompose_asin_and_acos_pass.py +++ b/backends/arm/_passes/decompose_asin_and_acos_pass.py @@ -10,7 +10,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( ConvertFullLikeToFullPass, ) @@ -48,7 +48,7 @@ def get_decomposition(op) -> tuple: raise RuntimeError(f"Can't get decomposition for op {op}") -class DecomposeAsinAndAcosPass(ArmPass): +class DecomposeAsinAndAcosPass(ArmOpTargetedPass): """This pass decomposes asin and acos into a rational approximation for small values and a transformed rational approximation for large values. 
@@ -71,6 +71,7 @@ class DecomposeAsinAndAcosPass(ArmPass): MatchArgDtypePass, ReplaceScalarWithTensorByProfilePass, } + target_ops = edge_asin_op + edge_acos_op def _build_polynomial( self, coefficients: list[float], variable: torch.Tensor, meta: dict[str, str] @@ -116,7 +117,7 @@ def _combine_branches( ) def call_operator(self, op, args, kwargs, meta): - if op not in (edge_asin_op + edge_acos_op): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_asinh_pass.py b/backends/arm/_passes/decompose_asinh_pass.py index 822b793d203..5f31c5efedc 100644 --- a/backends/arm/_passes/decompose_asinh_pass.py +++ b/backends/arm/_passes/decompose_asinh_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass @@ -21,7 +21,7 @@ edge_asinh_op = (exir_ops.edge.aten.asinh.default,) -class DecomposeAsinhPass(ArmPass): +class DecomposeAsinhPass(ArmOpTargetedPass): """Decomposes asinh to supported TOSA-operations. 
This decomposition is based on the mathematical identity: @@ -36,9 +36,10 @@ class DecomposeAsinhPass(ArmPass): ReplaceScalarWithTensorByProfilePass, MatchArgDtypePass, } + target_ops = edge_asinh_op def call_operator(self, op, args, kwargs, meta): - if op not in edge_asinh_op: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_atan_pass.py b/backends/arm/_passes/decompose_atan_pass.py index a7ca90e7b43..cd33504c972 100644 --- a/backends/arm/_passes/decompose_atan_pass.py +++ b/backends/arm/_passes/decompose_atan_pass.py @@ -7,7 +7,7 @@ from math import pi from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass @@ -40,7 +40,7 @@ def _get_atan_ops(op): ) -class DecomposeAtanPass(ArmPass): +class DecomposeAtanPass(ArmOpTargetedPass): """Decomposes the atan operator into a rational (Padé) approximation.""" _passes_required_after: Set[Type[ExportPass]] = { @@ -49,6 +49,7 @@ class DecomposeAtanPass(ArmPass): MatchArgDtypePass, ReplaceScalarWithTensorByProfilePass, } + target_ops = (edge_atan,) def _rational_approximation(self, z, ops, meta): """Creates a (2,1) Padé approximation for atan(x) on [-1, 1].""" @@ -77,7 +78,7 @@ def _rational_approximation(self, z, ops, meta): return super().call_operator(op_mul, (z, prod), {}, meta, updated=True) def call_operator(self, op, args, kwargs, meta): - if op is not edge_atan: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_atanh_pass.py 
b/backends/arm/_passes/decompose_atanh_pass.py index 014da39d7bd..c542b94f30d 100644 --- a/backends/arm/_passes/decompose_atanh_pass.py +++ b/backends/arm/_passes/decompose_atanh_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass @@ -33,7 +33,7 @@ def _get_atanh_ops(op): ) -class DecomposeAtanhPass(ArmPass): +class DecomposeAtanhPass(ArmOpTargetedPass): """Decomposes the atanh operator into primitive ops. atanh(x) = 0.5 * log((1 + x) / (1 - x)) @@ -46,9 +46,10 @@ class DecomposeAtanhPass(ArmPass): MatchArgDtypePass, ReplaceScalarWithTensorByProfilePass, } + target_ops = (edge_atanh,) def call_operator(self, op, args, kwargs, meta): - if op is not edge_atanh: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_avg_pool2d_pass.py b/backends/arm/_passes/decompose_avg_pool2d_pass.py index 8fcbcd35b5e..eb30a7600d8 100644 --- a/backends/arm/_passes/decompose_avg_pool2d_pass.py +++ b/backends/arm/_passes/decompose_avg_pool2d_pass.py @@ -7,7 +7,7 @@ from typing import Any, Set, Type import torch -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.backends.arm._passes.fuse_constant_ops_pass import ( ComputeConstantOpsAOTPass, ) @@ -96,13 +96,13 @@ def _get_avgpool_post_pad( return [pad_w, post_w, pad_h, post_h], [0, 0] -class DecomposeAvgPool2dPass(ArmPass): +class DecomposeAvgPool2dPass(ArmOpTargetedPass): _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOTPass} + target_ops = 
edge_avg_pool2d + aten_avg_pool2d + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in ( - edge_avg_pool2d + aten_avg_pool2d - ) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) pad_op, avgpool_op, mul_op = get_decomposition(op) diff --git a/backends/arm/_passes/decompose_cosh_pass.py b/backends/arm/_passes/decompose_cosh_pass.py index 70d4247d9e0..96c73b6cdf2 100644 --- a/backends/arm/_passes/decompose_cosh_pass.py +++ b/backends/arm/_passes/decompose_cosh_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass @@ -19,7 +19,7 @@ edge_cosh = exir_ops.edge.aten.cosh.default -class DecomposeCoshPass(ArmPass): +class DecomposeCoshPass(ArmOpTargetedPass): """ This pass replaces the cosh operator with a sequence of TOSA-equivalent operations that compute the hyperbolic cosine using the formula: @@ -34,9 +34,10 @@ class DecomposeCoshPass(ArmPass): ReplaceScalarWithTensorByProfilePass, MatchArgDtypePass, } + target_ops = (edge_cosh,) def call_operator(self, op, args, kwargs, meta, updated=False): - if op is not edge_cosh: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_cosine_similarity_pass.py b/backends/arm/_passes/decompose_cosine_similarity_pass.py index 6ceb50fdf55..b9e11a68174 100644 --- a/backends/arm/_passes/decompose_cosine_similarity_pass.py +++ b/backends/arm/_passes/decompose_cosine_similarity_pass.py @@ -6,7 +6,7 @@ from 
typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( ConvertFullLikeToFullPass, ) @@ -19,7 +19,7 @@ torch_cosine_similarity = (torch.ops.aten.cosine_similarity.default,) -class DecomposeCosineSimilarityPass(ArmPass): +class DecomposeCosineSimilarityPass(ArmOpTargetedPass): """Decomposition of aten.cosine_similarity. Example: @@ -42,9 +42,11 @@ class DecomposeCosineSimilarityPass(ArmPass): ConvertFullLikeToFullPass, InsertTableOpsPass, } + target_ops = torch_cosine_similarity + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in torch_cosine_similarity or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) x1, x2 = args[0], args[1] diff --git a/backends/arm/_passes/decompose_div_pass.py b/backends/arm/_passes/decompose_div_pass.py index 651e58a563c..be4d91cd30c 100644 --- a/backends/arm/_passes/decompose_div_pass.py +++ b/backends/arm/_passes/decompose_div_pass.py @@ -8,7 +8,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -28,7 +28,7 @@ def get_div_decomposition(op) -> tuple: raise RuntimeError(f"Can't get div decomposition for op {op}") -class DecomposeDivPass(ArmPass): +class DecomposeDivPass(ArmOpTargetedPass): """This pass decomposes div into a mul and a reciprocal node. 
Example: @@ -40,11 +40,10 @@ class DecomposeDivPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + target_ops = edge_div_ops + aten_div_ops def call_operator(self, op, args, kwargs, meta): - if op not in (edge_div_ops + aten_div_ops) or not self.allowed_to_transform( - meta - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) reciprocal_op, mul_op = get_div_decomposition(op) diff --git a/backends/arm/_passes/decompose_div_tensor_mode.py b/backends/arm/_passes/decompose_div_tensor_mode.py index 774557b816f..cc5440b4e5b 100644 --- a/backends/arm/_passes/decompose_div_tensor_mode.py +++ b/backends/arm/_passes/decompose_div_tensor_mode.py @@ -7,7 +7,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -42,7 +42,7 @@ def _get_opset(op): raise RuntimeError(f"div.Tensor_mode not supported for op {op}") -class DecomposeDivTensorModePass(ArmPass): +class DecomposeDivTensorModePass(ArmOpTargetedPass): """Rewrites aten.div.Tensor_mode into. 
Example: @@ -57,11 +57,11 @@ class DecomposeDivTensorModePass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivPass} + target_ops = edge_div_mode_ops + aten_div_mode_ops + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in ( - edge_div_mode_ops + aten_div_mode_ops - ) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) opset = _get_opset(op) diff --git a/backends/arm/_passes/decompose_elu_pass.py b/backends/arm/_passes/decompose_elu_pass.py index 548a508d914..5f94968ad79 100644 --- a/backends/arm/_passes/decompose_elu_pass.py +++ b/backends/arm/_passes/decompose_elu_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -71,13 +71,15 @@ def _get_elu_parameters(op, args, kwargs): return alpha, scale, input_scale -class ConvertEluFamilyToEluPass(ArmPass): +class ConvertEluFamilyToEluPass(ArmOpTargetedPass): """Convert SELU/CELU ops to equivalent parameterized ELU ops.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = selu_ops + celu_ops + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in selu_ops + celu_ops or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta, updated=False) input_ = args[0] @@ -96,7 +98,7 @@ def call_operator(self, op, args, kwargs, meta): ) -class DecomposeEluPass(ArmPass): +class DecomposeEluPass(ArmOpTargetedPass): """A transformation pass that decomposes unsupported 'aten.elu' operations into a combination of supported TOSA-equivalent operations. 
@@ -119,9 +121,10 @@ class DecomposeEluPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = edge_elu_family_ops def call_operator(self, op, args, kwargs, meta): - if op not in edge_elu_family_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_erfinv_pass.py b/backends/arm/_passes/decompose_erfinv_pass.py index 747209d943e..07f874f9d97 100644 --- a/backends/arm/_passes/decompose_erfinv_pass.py +++ b/backends/arm/_passes/decompose_erfinv_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( ConvertFullLikeToFullPass, ) @@ -48,7 +48,7 @@ def get_erfinv_decomposition(op) -> tuple: raise RuntimeError(f"Can't get erfinv decomposition for op {op}") -class DecomposeErfinvPass(ArmPass): +class DecomposeErfinvPass(ArmOpTargetedPass): """Decomposes `aten.erfinv` using the same *initial-guess* approximation as the PyTorch CPU scalar `calc_erfinv`, with a guarded Newton refinement step to improve numerical accuracy (especially for fp16). 
@@ -127,9 +127,10 @@ class DecomposeErfinvPass(ArmPass): MatchArgDtypePass, ReplaceScalarWithTensorByProfilePass, } + target_ops = edge_erfinv_ops def call_operator(self, op, args, kwargs, meta): - if op not in edge_erfinv_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_expm1_pass.py b/backends/arm/_passes/decompose_expm1_pass.py index c1cb0b83166..6898b9fafb2 100644 --- a/backends/arm/_passes/decompose_expm1_pass.py +++ b/backends/arm/_passes/decompose_expm1_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass from executorch.backends.arm._passes.decompose_int_pow_pass import DecomposeIntPowPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass @@ -55,7 +55,7 @@ def _get_expm1_decomposition(op) -> tuple: raise RuntimeError(f"Can't get expm1 decomposition for op {op}") -class DecomposeExpm1Pass(ArmPass): +class DecomposeExpm1Pass(ArmOpTargetedPass): """A transformation pass that decomposes unsupported 'aten.expm1' operations into a combination of supported TOSA-equivalent operations. 
@@ -87,9 +87,10 @@ class DecomposeExpm1Pass(ArmPass): MatchArgDtypePass, MatchArgRanksPass, } + target_ops = edge_expm1_ops def call_operator(self, op, args, kwargs, meta): - if op not in edge_expm1_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_floor_divide_pass.py b/backends/arm/_passes/decompose_floor_divide_pass.py index 20e63f48023..d8f451f8af6 100644 --- a/backends/arm/_passes/decompose_floor_divide_pass.py +++ b/backends/arm/_passes/decompose_floor_divide_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_div_tensor_mode import ( DecomposeDivTensorModePass, ) @@ -47,15 +47,16 @@ def get_floor_divide_decomposition(op) -> tuple: raise RuntimeError(f"Can't get floor_div decomposition for op {op}") -class DecomposeFloorDividePass(ArmPass): +class DecomposeFloorDividePass(ArmOpTargetedPass): """Decomposes aten.floor_divide into aten.div.Tensor_mode with rounding_mode="floor". 
""" _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivTensorModePass} + target_ops = edge_floor_divide_ops + aten_floor_divide_ops def call_operator(self, op, args, kwargs, meta): - if op not in (edge_floor_divide_ops + aten_floor_divide_ops): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) (div_op, full_op) = get_floor_divide_decomposition(op) diff --git a/backends/arm/_passes/decompose_gelu_pass.py b/backends/arm/_passes/decompose_gelu_pass.py index 7815b5fa44f..85f0b77df21 100644 --- a/backends/arm/_passes/decompose_gelu_pass.py +++ b/backends/arm/_passes/decompose_gelu_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.backends.arm._passes.fuse_constant_ops_pass import ( ComputeConstantOpsAOTPass, @@ -42,7 +42,7 @@ def _get_gelu_ops(op) -> tuple: raise RuntimeError(f"Can't get GeLU decomposition ops for op {op}") -class DecomposeGeluPass(ArmPass): +class DecomposeGeluPass(ArmOpTargetedPass): """This pass decomposes the GELU operator into primitive ops. Aiming to adhere closely to the reference implementations built into ExecuTorch. Including using the same pre-calculated constants. 
@@ -88,9 +88,10 @@ class DecomposeGeluPass(ArmPass): MatchArgDtypePass, MatchArgRanksPass, } + target_ops = torch_gelu + edge_gelu def call_operator(self, op, args, kwargs, meta): - if op not in torch_gelu + edge_gelu: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if self._is_quantized_meta(meta): # If quantized, node should be replace by table op diff --git a/backends/arm/_passes/decompose_glu_pass.py b/backends/arm/_passes/decompose_glu_pass.py index 68efaedd784..5927174a776 100644 --- a/backends/arm/_passes/decompose_glu_pass.py +++ b/backends/arm/_passes/decompose_glu_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -39,13 +39,14 @@ def get_ops(op): raise ValueError(f"Unsupported operator: {op}") -class DecomposeGluPass(ArmPass): +class DecomposeGluPass(ArmOpTargetedPass): """Decomposes the GLU operator into hadamard product and sigmoid.""" _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + target_ops = (edge_glu, aten_glu) def call_operator(self, op, args, kwargs, meta): - if op not in [edge_glu, aten_glu] or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) hadamard_prod, sigmoid, slice_op = get_ops(op) diff --git a/backends/arm/_passes/decompose_grouped_conv_pass.py b/backends/arm/_passes/decompose_grouped_conv_pass.py index ed0adbe83d7..3fb68bc5aef 100644 --- a/backends/arm/_passes/decompose_grouped_conv_pass.py +++ b/backends/arm/_passes/decompose_grouped_conv_pass.py @@ -7,7 +7,7 @@ from typing import Literal, Protocol, Set, Type, TypeGuard import torch -from 
executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass from executorch.backends.arm._passes.quant_args import QuantArgs from executorch.exir.dialects._ops import ops as exir_ops @@ -24,7 +24,7 @@ class _PerChannelQuantArgs(Protocol): per_channel: Literal[True] -class DecomposeGroupedConvPass(ArmPass): +class DecomposeGroupedConvPass(ArmOpTargetedPass): """Splits a grouped convolution which is not supported by TOSA into multiple convolutions using slice->conv->cat. @@ -47,6 +47,11 @@ class DecomposeGroupedConvPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {Conv1dUnsqueezePass} + target_ops = ( + exir_ops.edge.aten.convolution.default, + torch.ops.aten.conv_transpose2d.input, + torch.ops.aten.conv2d.default, + ) @staticmethod def _get_decomposition(op): diff --git a/backends/arm/_passes/decompose_index_select_to_gather_pass.py b/backends/arm/_passes/decompose_index_select_to_gather_pass.py index 5947e8c5499..be0d4dbb07c 100644 --- a/backends/arm/_passes/decompose_index_select_to_gather_pass.py +++ b/backends/arm/_passes/decompose_index_select_to_gather_pass.py @@ -8,7 +8,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( ConvertExpandCopyToRepeatPass, ) @@ -38,7 +38,7 @@ def _get_index_select_decomposition(op): raise RuntimeError(f"Can't get index_select decomposition for op {op}") -class DecomposeIndexSelectToGatherPass(ArmPass): +class DecomposeIndexSelectToGatherPass(ArmOpTargetedPass): """Decompose edge index_select into a single backend TOSA gather. 
index_select(x, dim, index) semantics: @@ -67,12 +67,12 @@ class DecomposeIndexSelectToGatherPass(ArmPass): ConvertSqueezesToViewPass, } - _TARGET_OPS = { + target_ops = { exir_ops.edge.aten.index_select.default, } def call_operator(self, op, args, kwargs, meta): - if op not in self._TARGET_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) x, dim, index = args diff --git a/backends/arm/_passes/decompose_index_tensor_to_gather_pass.py b/backends/arm/_passes/decompose_index_tensor_to_gather_pass.py index 037c9977fa6..93db9f9d434 100644 --- a/backends/arm/_passes/decompose_index_tensor_to_gather_pass.py +++ b/backends/arm/_passes/decompose_index_tensor_to_gather_pass.py @@ -9,7 +9,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import meta_without_qparams from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( ConvertExpandCopyToRepeatPass, @@ -75,7 +75,7 @@ def _broadcast_shape( return out -class DecomposeIndexTensorToGatherPass(ArmPass): +class DecomposeIndexTensorToGatherPass(ArmOpTargetedPass): """Decompose edge.aten.index.Tensor into backend TOSA gather (+ basic arith). 
@@ -165,7 +165,7 @@ class DecomposeIndexTensorToGatherPass(ArmPass): ReplaceScalarWithTensorByProfilePass, } - _TARGET_OPS = { + target_ops = { exir_ops.edge.aten.index.Tensor, } @@ -246,7 +246,7 @@ def _compute_index_tensor_params(self, x, m, index_shapes): return x_data, S, W, K, C, trailing, lin_scales def call_operator(self, op, args, kwargs, meta): - if op not in self._TARGET_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) assert ( diff --git a/backends/arm/_passes/decompose_int_pow_pass.py b/backends/arm/_passes/decompose_int_pow_pass.py index a31a9415e23..5147d23b68c 100644 --- a/backends/arm/_passes/decompose_int_pow_pass.py +++ b/backends/arm/_passes/decompose_int_pow_pass.py @@ -6,12 +6,12 @@ from typing import Optional, Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class DecomposeIntPowPass(ArmPass): +class DecomposeIntPowPass(ArmOpTargetedPass): """Replaces pow with integer exponent with a series of multiplications. Only handles pow.Tensor_Scalar and not pow.Tensor_Tensor. 
Needs to be run @@ -20,6 +20,7 @@ class DecomposeIntPowPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = (exir_ops.edge.aten.pow.Tensor_Scalar,) @staticmethod def _get_decomposable_integer_exponent(exp) -> Optional[int]: @@ -34,7 +35,7 @@ def _get_decomposable_integer_exponent(exp) -> Optional[int]: return None def call_operator(self, op, args, kwargs, meta): - if op != exir_ops.edge.aten.pow.Tensor_Scalar: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_leaky_relu_pass.py b/backends/arm/_passes/decompose_leaky_relu_pass.py index eb8b5bda61a..e2f9852d7f9 100644 --- a/backends/arm/_passes/decompose_leaky_relu_pass.py +++ b/backends/arm/_passes/decompose_leaky_relu_pass.py @@ -8,7 +8,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -33,7 +33,7 @@ def _get_leaky_relu_ops(op) -> tuple: raise RuntimeError(f"Can't get decomposition ops for op {op}") -class DecomposeLeakyReLUPass(ArmPass): +class DecomposeLeakyReLUPass(ArmOpTargetedPass): """This pass decomposes Leaky ReLU into primitive operations. 
LeakyReLU(x,slope) = max(0,x) + slope * min(0,x) @@ -47,9 +47,11 @@ class DecomposeLeakyReLUPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = edge_ops + torch_ops + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in (edge_ops + torch_ops) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) x = args[0] diff --git a/backends/arm/_passes/decompose_linalg_vector_norm_pass.py b/backends/arm/_passes/decompose_linalg_vector_norm_pass.py index 8b165658c37..1604d861030 100644 --- a/backends/arm/_passes/decompose_linalg_vector_norm_pass.py +++ b/backends/arm/_passes/decompose_linalg_vector_norm_pass.py @@ -6,13 +6,13 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_sqrt_pass import DecomposeSqrtPass from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass from executorch.exir.pass_base import ExportPass -class DecomposeLinalgVectorNormPass(ArmPass): +class DecomposeLinalgVectorNormPass(ArmOpTargetedPass): """This pass decomposes aten.linalg_vector_norm.default into more primitive ops. We need to add this pass before quantization for graph annotation. 
By default, aten.linalg_vector_norm op is decomposed during legalization to @@ -40,11 +40,11 @@ class DecomposeLinalgVectorNormPass(ArmPass): } torch_linalg_vector_norm = (torch.ops.aten.linalg_vector_norm.default,) + target_ops = torch_linalg_vector_norm + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in self.torch_linalg_vector_norm or not self.allowed_to_transform( - meta - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) # Extract inputs and optional arguments. diff --git a/backends/arm/_passes/decompose_log1p_pass.py b/backends/arm/_passes/decompose_log1p_pass.py index b5cb8659140..7cc5f8cec9c 100644 --- a/backends/arm/_passes/decompose_log1p_pass.py +++ b/backends/arm/_passes/decompose_log1p_pass.py @@ -6,7 +6,7 @@ import logging from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass @@ -17,7 +17,7 @@ from executorch.exir.pass_base import ExportPass -class DecomposeLog1pPass(ArmPass): +class DecomposeLog1pPass(ArmOpTargetedPass): """Decompose log1p into a small polynomial with a log fallback for larger inputs. 
""" @@ -32,6 +32,7 @@ class DecomposeLog1pPass(ArmPass): _supported_ops = { exir_ops.edge.aten.log1p.default, } + target_ops = _supported_ops def _poly(self, x, meta): # 6-term Taylor: x - x^2/2 + x^3/3 - x^4/4 + x^5/5 - x^6/6 @@ -63,7 +64,7 @@ def _poly(self, x, meta): return acc def call_operator(self, op, args, kwargs, meta): - if op not in self._supported_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_logit_pass.py b/backends/arm/_passes/decompose_logit_pass.py index fa82ff4f579..9f9f4744fd0 100644 --- a/backends/arm/_passes/decompose_logit_pass.py +++ b/backends/arm/_passes/decompose_logit_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass @@ -50,7 +50,7 @@ def get_ops(op): raise ValueError(f"Unsupported operator: {op}") -class DecomposeLogitPass(ArmPass): +class DecomposeLogitPass(ArmOpTargetedPass): """Decomposes the `logit` operator into a sequence of primitive operations. 
If `eps` is provided, the input tensor `x` is first clamped to the range @@ -78,15 +78,13 @@ class DecomposeLogitPass(ArmPass): ReplaceScalarWithTensorByProfilePass, } - _TARGET_OPS = { + target_ops = { edge_logit, aten_logit, } def call_operator(self, op, args, kwargs, meta): - if op not in DecomposeLogitPass._TARGET_OPS or not self.allowed_to_transform( - meta - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) X = args[0] diff --git a/backends/arm/_passes/decompose_masked_fill_pass.py b/backends/arm/_passes/decompose_masked_fill_pass.py index 748aee3fc49..dfb85da7742 100644 --- a/backends/arm/_passes/decompose_masked_fill_pass.py +++ b/backends/arm/_passes/decompose_masked_fill_pass.py @@ -8,7 +8,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_full_like_to_full_pass import ( ConvertFullLikeToFullPass, ) @@ -34,7 +34,7 @@ def _get_decomposition(op) -> tuple: raise RuntimeError(f"Unable to get decomposition for op {op}") -class DecomposeMaskedFillPass(ArmPass): +class DecomposeMaskedFillPass(ArmOpTargetedPass): """Masked fill takes in a boolean mask, a tensor and a scalar value. Fills the tensor with the scalar value according to the boolean mask. 
@@ -43,9 +43,10 @@ class DecomposeMaskedFillPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {ConvertFullLikeToFullPass} + target_ops = aten_ops + edge_ops def call_operator(self, op, args, kwargs, meta, updated=False): - if op not in (*aten_ops, *edge_ops): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) x, mask, scalar = args diff --git a/backends/arm/_passes/decompose_maxpool2d_with_dilation_pass.py b/backends/arm/_passes/decompose_maxpool2d_with_dilation_pass.py index 72fe53d57b9..7729b755113 100644 --- a/backends/arm/_passes/decompose_maxpool2d_with_dilation_pass.py +++ b/backends/arm/_passes/decompose_maxpool2d_with_dilation_pass.py @@ -9,7 +9,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.size_adjust_input_pass import SizeAdjustInputPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -47,7 +47,7 @@ def _pack_dimension( return packed_dim_size, padding + extra_padding, output_size -class DecomposeMaxPool2dPass(ArmPass): +class DecomposeMaxPool2dPass(ArmOpTargetedPass): """Decompose dilated max_pool2d (EXIR edge ops) into space-to-batch -> maxpool -> batch-to-space. 
""" @@ -55,10 +55,11 @@ class DecomposeMaxPool2dPass(ArmPass): _passes_required_after: Set[Type[ExportPass]] = { SizeAdjustInputPass, } + target_ops = EDGE_MAXPOOL2D def call_operator(self, op, args, kwargs, meta): # Only intercept EXIR edge max_pool2d ops - if op not in EDGE_MAXPOOL2D: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) # detect whether indices variant diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py index c7d3bc0a04d..e1175d5ba1b 100644 --- a/backends/arm/_passes/decompose_meandim_pass.py +++ b/backends/arm/_passes/decompose_meandim_pass.py @@ -8,7 +8,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass from executorch.backends.arm._passes.fuse_constant_ops_pass import ( @@ -69,7 +69,7 @@ def get_quantization(op): return None -class DecomposeMeanDimPass(ArmPass): +class DecomposeMeanDimPass(ArmOpTargetedPass): """Decomposes a meandim into sum + mul (1/N). 
Each reduction dimension is handled via REDUCE_SUM followed by @@ -94,6 +94,13 @@ class DecomposeMeanDimPass(ArmPass): DecomposeSumPass, SizeAdjustInputPass, } + target_ops = ( + exir_ops.edge.aten.mean.dim, + torch.ops.aten.mean.dim, + exir_ops.edge.aten.mean.default, + torch.ops.aten.mean.default, + ) + check_allowed_to_transform = True def __init__(self, graph_module, tosa_spec, *args, **kwargs): super().__init__(*args, **kwargs) @@ -101,12 +108,7 @@ def __init__(self, graph_module, tosa_spec, *args, **kwargs): self._tosa_spec = tosa_spec def call_operator(self, op, args, kwargs, meta, updated=False): - if op not in ( - exir_ops.edge.aten.mean.dim, - torch.ops.aten.mean.dim, - exir_ops.edge.aten.mean.default, - torch.ops.aten.mean.default, - ) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta, updated) x = get_node_arg(args, 0) diff --git a/backends/arm/_passes/decompose_ne_pass.py b/backends/arm/_passes/decompose_ne_pass.py index 95dfc0e1179..4dfcf6ad934 100644 --- a/backends/arm/_passes/decompose_ne_pass.py +++ b/backends/arm/_passes/decompose_ne_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -38,7 +38,7 @@ def get_ne_decomposition(op) -> tuple: raise RuntimeError(f"Can't get ne decomposition for op {op}") -class DecomposeNotEqualPass(ArmPass): +class DecomposeNotEqualPass(ArmOpTargetedPass): """A transformation pass that decomposes unsupported `aten.ne` operations into a combination of supported TOSA-equivalent operations. 
@@ -57,9 +57,10 @@ class DecomposeNotEqualPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = edge_ne_ops + aten_ne_ops def call_operator(self, op, args, kwargs, meta): - if op not in (edge_ne_ops + aten_ne_ops) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) lhs, rhs = args diff --git a/backends/arm/_passes/decompose_permute_for_u55_pass.py b/backends/arm/_passes/decompose_permute_for_u55_pass.py index ceed25f97ec..a9e8beef1cd 100644 --- a/backends/arm/_passes/decompose_permute_for_u55_pass.py +++ b/backends/arm/_passes/decompose_permute_for_u55_pass.py @@ -11,7 +11,7 @@ import torch import tosa_serializer as ts -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.backends.arm._passes.rewrite_slice import RewriteSlicePass from executorch.backends.arm.arm_vela import vela_compile from executorch.backends.arm.tosa.mapping import map_dtype @@ -20,7 +20,7 @@ from executorch.exir.pass_base import ExportPass -class DecomposePermuteForU55Pass(ArmPass): +class DecomposePermuteForU55Pass(ArmOpTargetedPass): """Decompose U55 permutes into shape-safe permutes for large tensor shapes. 
Ethos-U55 has transpose shape constraints based on rank-dependent @@ -36,6 +36,7 @@ class DecomposePermuteForU55Pass(ArmPass): exir_ops.edge.aten.permute.default, exir_ops.edge.aten.permute_copy.default, ) + target_ops = _PERMUTE_OPS _SLICE_OP = exir_ops.edge.aten.slice_copy.Tensor _CAT_OP = exir_ops.edge.aten.cat.default _MAX_PRODUCT = 2**16 @@ -323,7 +324,7 @@ def recurse(current, depth: int): return recurse(input_node, 0) def call_operator(self, op, args, kwargs, meta): - if op not in self._PERMUTE_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) spec = get_context_spec() diff --git a/backends/arm/_passes/decompose_remainder_pass.py b/backends/arm/_passes/decompose_remainder_pass.py index 38185b85149..af22cad1624 100644 --- a/backends/arm/_passes/decompose_remainder_pass.py +++ b/backends/arm/_passes/decompose_remainder_pass.py @@ -6,7 +6,7 @@ from typing import Dict, Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.decompose_div_tensor_mode import ( DecomposeDivTensorModePass, ) @@ -41,7 +41,7 @@ } -class DecomposeRemainderPass(ArmPass): +class DecomposeRemainderPass(ArmOpTargetedPass): """ Decompose the remainder operation into primitive arithmetic: remainder(x, y) -> x - floor_div(x, y) * y @@ -49,15 +49,10 @@ class DecomposeRemainderPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivTensorModePass} + target_ops = tuple(_decomposition_ops) def call_operator(self, op, args, kwargs, meta, updated=False): - supported_ops = ( - exir_ops.edge.aten.remainder.Scalar, - exir_ops.edge.aten.remainder.Tensor, - torch.ops.aten.remainder.Scalar, - torch.ops.aten.remainder.Tensor, - ) - if op not in supported_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) # Keep scalar remainder opaque during transform-for-annotation so the # 
quantizer can wrap the original op directly. In the backend pipeline, diff --git a/backends/arm/_passes/decompose_round_pass.py b/backends/arm/_passes/decompose_round_pass.py index 9319394d986..476f75d6b56 100644 --- a/backends/arm/_passes/decompose_round_pass.py +++ b/backends/arm/_passes/decompose_round_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.dialects.edge._ops import EdgeOpOverload from executorch.exir.pass_base import ExportPass @@ -46,7 +46,7 @@ def _get_round_decomposition_ops(op) -> tuple[Op, Op, Op, Op, Op, Op, Op]: raise RuntimeError(f"Can't get round decomposition ops for op {op}") -class DecomposeRoundPass(ArmPass): +class DecomposeRoundPass(ArmOpTargetedPass): """ For inputs >= 0, round(x) is equivalent to floor(x + 0.5), and for inputs < 0, round(x) is equivalent to ceil(x - 0.5). 
This pass decomposes the round operation into @@ -63,15 +63,13 @@ class DecomposeRoundPass(ArmPass): _passes_required_after: Set[Type[ExportPass]] = set() - _TARGET_OPS = { + target_ops = { exir_ops.edge.aten.round.default, torch.ops.aten.round.default, } def call_operator(self, op, args, kwargs, meta, updated=False): - if op not in DecomposeRoundPass._TARGET_OPS or not self.allowed_to_transform( - meta - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta, updated) x = args[0] input_dtype = x.node.meta["val"].dtype diff --git a/backends/arm/_passes/decompose_select_scatter_pass.py b/backends/arm/_passes/decompose_select_scatter_pass.py index 4b4db8d208c..129e9f05961 100644 --- a/backends/arm/_passes/decompose_select_scatter_pass.py +++ b/backends/arm/_passes/decompose_select_scatter_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_int64_const_ops_to_int32 import ( ConvertInt64ConstOpsToInt32Pass, ) @@ -44,7 +44,7 @@ def get_select_scatter_decomposition(op) -> tuple: raise RuntimeError(f"Can't get select_scatter decomposition for op {op}") -class DecomposeSelectScatterPass(ArmPass): +class DecomposeSelectScatterPass(ArmOpTargetedPass): """select_scatter is decomposed into other ops during export, however this is only suppported for the fp profile and for the int profile we need to decompose it here. 
@@ -65,9 +65,10 @@ class DecomposeSelectScatterPass(ArmPass): ReplaceScalarWithTensorByProfilePass, ConvertInt64ConstOpsToInt32Pass, } + target_ops = edge_scatter_ops + aten_scatter_ops def call_operator(self, op, args, kwargs, meta): - if op not in (edge_scatter_ops + aten_scatter_ops): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated=False) ( diff --git a/backends/arm/_passes/decompose_sign_pass.py b/backends/arm/_passes/decompose_sign_pass.py index 111d1ca5ee3..8f7fda8729b 100644 --- a/backends/arm/_passes/decompose_sign_pass.py +++ b/backends/arm/_passes/decompose_sign_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -44,15 +44,16 @@ def get_ops(op): raise ValueError(f"Unsupported operator: {op}") -class DecomposeSignPass(ArmPass): +class DecomposeSignPass(ArmOpTargetedPass): """Decomposes the sign operator into a sequence of operations that are supported by the Arm backend. 
""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = (edge_sign, aten_sign) def call_operator(self, op, args, kwargs, meta): - if op not in (edge_sign, aten_sign) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) gt_op, lt_op, where_op, neg_op, mul_op, add_op = get_ops(op) diff --git a/backends/arm/_passes/decompose_sinh_pass.py b/backends/arm/_passes/decompose_sinh_pass.py index 71ac0a34f08..053b378af83 100644 --- a/backends/arm/_passes/decompose_sinh_pass.py +++ b/backends/arm/_passes/decompose_sinh_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.backends.arm._passes.match_arg_dtype_pass import MatchArgDtypePass from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass @@ -21,7 +21,7 @@ edge_sinh = exir_ops.edge.aten.sinh.default -class DecomposeSinhPass(ArmPass): +class DecomposeSinhPass(ArmOpTargetedPass): """A decomposition pass that decomposes Sinh operations into a combination of supported TOSA-equivalent operations (MI). 
@@ -39,9 +39,10 @@ class DecomposeSinhPass(ArmPass): ReplaceScalarWithTensorByProfilePass, MatchArgDtypePass, } + target_ops = (edge_sinh,) def call_operator(self, op, args, kwargs, meta): - if op is not edge_sinh: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_slice_scatter_pass.py b/backends/arm/_passes/decompose_slice_scatter_pass.py index 24cdfeb96a5..edf030f9701 100644 --- a/backends/arm/_passes/decompose_slice_scatter_pass.py +++ b/backends/arm/_passes/decompose_slice_scatter_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.accumulate_index_put_pass import ( AccumulateIndexPutPass, ) @@ -53,7 +53,7 @@ def _fixup_end(end, dim_size: int) -> int: return max(0, min(e, dim_size)) -class DecomposeSliceScatterPass(ArmPass): +class DecomposeSliceScatterPass(ArmOpTargetedPass): """ Decompose slice_scatter into: - Fast path (step == 1): slice_copy + cat (contiguous update), or @@ -71,9 +71,10 @@ class DecomposeSliceScatterPass(ArmPass): AccumulateIndexPutPass, RewriteIndexPutPass, } + target_ops = edge_slice_scatter_ops + aten_slice_scatter_ops def call_operator(self, op, args, kwargs, meta): - if op not in (edge_slice_scatter_ops + aten_slice_scatter_ops): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) ( diff --git a/backends/arm/_passes/decompose_softmax_pass.py b/backends/arm/_passes/decompose_softmax_pass.py index cb05b7c4b0c..d30137c0460 100644 --- a/backends/arm/_passes/decompose_softmax_pass.py +++ b/backends/arm/_passes/decompose_softmax_pass.py @@ -7,7 +7,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from 
executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops @@ -56,7 +56,7 @@ def _get_logsoftmax_ops(op) -> tuple: raise RuntimeError(f"Can't get logsoftmax decomposition ops for op {op}") -class DecomposeSoftmaxPass(ArmPass): +class DecomposeSoftmaxPass(ArmOpTargetedPass): """This pass decomposes log_softmax or softmax into more primitive ops. Example: @@ -77,6 +77,7 @@ class DecomposeSoftmaxPass(ArmPass): DecomposeSumPass, InsertTableOpsPass, } + target_ops = torch_softmax + edge_softmax def __init__(self, skip_safe_softmax: bool = False, **kwargs): super().__init__(**kwargs) @@ -84,9 +85,7 @@ def __init__(self, skip_safe_softmax: bool = False, **kwargs): self._warned_safe_softmax = False def call_operator(self, op, args, kwargs, meta): - if op not in torch_softmax + edge_softmax or not self.allowed_to_transform( - meta - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) if self._skip_safe_softmax and op == torch.ops.aten._safe_softmax.default: diff --git a/backends/arm/_passes/decompose_sqrt_pass.py b/backends/arm/_passes/decompose_sqrt_pass.py index 86e5d6681bd..ce5a5b6d2a4 100644 --- a/backends/arm/_passes/decompose_sqrt_pass.py +++ b/backends/arm/_passes/decompose_sqrt_pass.py @@ -6,7 +6,7 @@ from typing import Set, Tuple, Type, Union import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -27,15 +27,14 @@ def get_sqrt_decomposition(op) -> Union[Tuple, torch._ops.OpOverload]: raise RuntimeError(f"Can't get sqrt decomposition for op {op}") -class DecomposeSqrtPass(ArmPass): 
+class DecomposeSqrtPass(ArmOpTargetedPass): _passes_required_after: Set[Type[ExportPass]] = {InsertTableOpsPass} + target_ops = edge_sqrt_ops + aten_sqrt_ops def call_operator(self, op, args, kwargs, meta): """Decomposes `sqrt(x)` into `pow(x, 0.5)` for backend support.""" - if op not in (edge_sqrt_ops + aten_sqrt_ops) or not self.allowed_to_transform( - meta - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta) if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_strided_slice_copy_pass.py b/backends/arm/_passes/decompose_strided_slice_copy_pass.py index 71cc618ed9c..91606dd0bd6 100644 --- a/backends/arm/_passes/decompose_strided_slice_copy_pass.py +++ b/backends/arm/_passes/decompose_strided_slice_copy_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -42,7 +42,7 @@ def _fixup_end(end, dim_size): return max(0, min(e, dim_size)) -class DecomposeStridedSliceCopyPass(ArmPass): +class DecomposeStridedSliceCopyPass(ArmOpTargetedPass): """Decompose edge.aten.slice_copy.Tensor with non-unit step into supported ops. @@ -61,10 +61,10 @@ class DecomposeStridedSliceCopyPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() - _TARGET_OPS = {exir_ops.edge.aten.slice_copy.Tensor} + target_ops = {exir_ops.edge.aten.slice_copy.Tensor} def call_operator(self, op, args, kwargs, meta): - if op not in self._TARGET_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) # Only handle the non-unit-step case; leave unit-step to existing lowering. 
diff --git a/backends/arm/_passes/decompose_sum_pass.py b/backends/arm/_passes/decompose_sum_pass.py index 3076510533e..e134ea6abc7 100644 --- a/backends/arm/_passes/decompose_sum_pass.py +++ b/backends/arm/_passes/decompose_sum_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -24,7 +24,7 @@ def _get_sum_decomp(op): raise RuntimeError("Unvalid op in DecomposeSumPass") -class DecomposeSumPass(ArmPass): +class DecomposeSumPass(ArmOpTargetedPass): """In Pytorch, the default behaviour of for example Tensor.sum is to squeeze the dimension that is summed (keep_dim = False). However, in TOSA, REDUCE_SUM always preserves the rank of the input (keep_dim = True). To get @@ -44,12 +44,13 @@ class DecomposeSumPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = ( + exir_ops.edge.aten.sum.dim_IntList, + torch.ops.aten.sum.dim_IntList, + ) def call_operator(self, op, args, kwargs, meta): - if op not in [ - exir_ops.edge.aten.sum.dim_IntList, - torch.ops.aten.sum.dim_IntList, - ]: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) match len(args): diff --git a/backends/arm/_passes/decompose_tan_pass.py b/backends/arm/_passes/decompose_tan_pass.py index 87b347dbbad..2d655a9937d 100644 --- a/backends/arm/_passes/decompose_tan_pass.py +++ b/backends/arm/_passes/decompose_tan_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass, DecomposeDivPass +from executorch.backends.arm._passes import ArmOpTargetedPass, DecomposeDivPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -13,13 +13,14 @@ edge_tan_op = exir_ops.edge.aten.tan.default -class 
DecomposeTanPass(ArmPass): +class DecomposeTanPass(ArmOpTargetedPass): """Decomposes tan to sin/cos.""" _passes_required_after: Set[Type[ExportPass]] = {DecomposeDivPass} + target_ops = (edge_tan_op,) def call_operator(self, op, args, kwargs, meta, updated=False): - if op != edge_tan_op: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) # Skip quantized tan - it is decomposed as one single table op if self._is_quantized_meta(meta): diff --git a/backends/arm/_passes/decompose_tosa_unsupported_clamp_pass.py b/backends/arm/_passes/decompose_tosa_unsupported_clamp_pass.py index 2410ce503a7..12dcd06388c 100644 --- a/backends/arm/_passes/decompose_tosa_unsupported_clamp_pass.py +++ b/backends/arm/_passes/decompose_tosa_unsupported_clamp_pass.py @@ -6,12 +6,12 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class DecomposeTOSAUnsupportedClampPass(ArmPass): +class DecomposeTOSAUnsupportedClampPass(ArmOpTargetedPass): """Rewrite TOSA unsupported clamp into min/max chain since TOSA lacks int32 clamp support and only supports scalar min/max values. 
""" @@ -23,6 +23,7 @@ class DecomposeTOSAUnsupportedClampPass(ArmPass): torch.ops.aten.clamp.default, torch.ops.aten.clamp.Tensor, } + target_ops = _supported_ops def _ensure_tensor( self, @@ -54,7 +55,7 @@ def call_operator(self, op, args, kwargs, meta): torch.ops.aten.clamp.Tensor, } - if op not in self._supported_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) # Only rewrite scalar clamp for int32 diff --git a/backends/arm/_passes/decompose_tril_pass.py b/backends/arm/_passes/decompose_tril_pass.py index 3101b24e95b..9108208e73d 100644 --- a/backends/arm/_passes/decompose_tril_pass.py +++ b/backends/arm/_passes/decompose_tril_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.backends.arm._passes.fuse_constant_ops_pass import ( ComputeConstantOpsAOTPass, @@ -44,7 +44,7 @@ def _get_ops(op): raise RuntimeError(f"Unable to get decomposition ops for {op}") -class DecomposeTrilPass(ArmPass): +class DecomposeTrilPass(ArmOpTargetedPass): """Tril decomposition. 
Decomposition: @@ -54,11 +54,10 @@ class DecomposeTrilPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {ComputeConstantOpsAOTPass} + target_ops = (torch.ops.aten.tril.default,) def call_operator(self, op, args, kwargs, meta): - handled_ops = [torch.ops.aten.tril.default] - - if op not in handled_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) x = args[0] diff --git a/backends/arm/_passes/decompose_unfold_to_gather_pass.py b/backends/arm/_passes/decompose_unfold_to_gather_pass.py index d0e3897080a..950290b3b83 100644 --- a/backends/arm/_passes/decompose_unfold_to_gather_pass.py +++ b/backends/arm/_passes/decompose_unfold_to_gather_pass.py @@ -9,7 +9,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.replace_scalar_with_tensor_pass import ( ReplaceScalarWithTensorByProfilePass, ) @@ -29,7 +29,7 @@ def _get_unfold_copy_decomposition(op) -> tuple: """ - if op in DecomposeUnfoldToGatherPass._TARGET_OPS: + if op in DecomposeUnfoldToGatherPass.target_ops: return ( exir_ops.edge.dim_order_ops._to_dim_order_copy.default, exir_ops.edge.aten.view_copy.default, @@ -45,7 +45,7 @@ def _get_unfold_copy_decomposition(op) -> tuple: raise RuntimeError(f"Can't get unfold_copy decomposition for op {op}") -class DecomposeUnfoldToGatherPass(ArmPass): +class DecomposeUnfoldToGatherPass(ArmOpTargetedPass): """Decompose unfold_copy with backend tosa.GATHER as the core op, plus other TOSA-supported ops to build indices and materialize the output layout. 
@@ -93,7 +93,7 @@ class DecomposeUnfoldToGatherPass(ArmPass): ReplaceScalarWithTensorByProfilePass, } - _TARGET_OPS = { + target_ops = { exir_ops.edge.aten.unfold_copy.default, } @@ -147,7 +147,7 @@ def _compute_unfold_copy_params( return (x_val, C, S, K, U, UC, pre, post, P, Q, needs_bool_cast) def call_operator(self, op, args, kwargs, meta): - if op not in self._TARGET_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) x, dim, size, step = args diff --git a/backends/arm/_passes/decompose_var_pass.py b/backends/arm/_passes/decompose_var_pass.py index fcf61cf5129..90ea80b6b47 100644 --- a/backends/arm/_passes/decompose_var_pass.py +++ b/backends/arm/_passes/decompose_var_pass.py @@ -8,7 +8,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass from executorch.backends.arm._passes.decompose_sum_pass import DecomposeSumPass @@ -37,7 +37,7 @@ def get_var_decomposition(op) -> tuple: raise RuntimeError(f"Can't get var decomposition for op {op}") -class DecomposeVarPass(ArmPass): +class DecomposeVarPass(ArmOpTargetedPass): """ This pass decomposes var.correction and var.dim into smaller ops (see https://pytorch.org/docs/stable/generated/torch.var.html) @@ -56,13 +56,15 @@ class DecomposeVarPass(ArmPass): DecomposeMeanDimPass, DecomposeSumPass, } + target_ops = ( + exir_ops.edge.aten.var.correction, + torch.ops.aten.var.correction, + torch.ops.aten.var.dim, + ) + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): - if op not in ( - exir_ops.edge.aten.var.correction, - torch.ops.aten.var.correction, - torch.ops.aten.var.dim, - ) or not self.allowed_to_transform(meta): + if op not in self.target_ops or not self.allowed_to_transform(meta): 
return super().call_operator(op, args, kwargs, meta) x = args[0] diff --git a/backends/arm/_passes/decompose_where_scalar_other_pass.py b/backends/arm/_passes/decompose_where_scalar_other_pass.py index a125a6355cb..8b4b27c8ce2 100644 --- a/backends/arm/_passes/decompose_where_scalar_other_pass.py +++ b/backends/arm/_passes/decompose_where_scalar_other_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -27,20 +27,18 @@ def _get_where_scalar_other_decomposition(op): raise RuntimeError(f"Can't get where.ScalarOther decomposition for op {op}") -class DecomposeWhereScalarOtherPass(ArmPass): +class DecomposeWhereScalarOtherPass(ArmOpTargetedPass): """Decompose where.ScalarOther into where.self with a tensorized scalar.""" _passes_required_after: Set[Type[ExportPass]] = set() - _TARGET_OPS = { + target_ops = { exir_ops.edge.aten.where.ScalarOther, } + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta, updated=False): - if ( - op not in DecomposeWhereScalarOtherPass._TARGET_OPS - or not self.allowed_to_transform(meta) - ): + if op not in self.target_ops or not self.allowed_to_transform(meta): return super().call_operator(op, args, kwargs, meta, updated) condition, self_tensor, other_scalar = args diff --git a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py index b856df8e060..3ddd1358035 100644 --- a/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py +++ b/backends/arm/_passes/decorate_fp32_to_int32_casting_pass.py @@ -7,7 +7,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils 
import get_node_arg from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -26,7 +26,7 @@ def _get_decorated_ops(op): raise RuntimeError(f"Can't get decorated ops for op {op}") -class DecorateFp32toInt32CastingPass(ArmPass): +class DecorateFp32toInt32CastingPass(ArmOpTargetedPass): """To lower pytorch fp32 -> int32 casting to TOSA, we need to transform the value with Ceil, Floor, and Where. @@ -47,9 +47,10 @@ class DecorateFp32toInt32CastingPass(ArmPass): targets = [ exir_ops.edge.dim_order_ops._to_dim_order_copy.default, ] + target_ops = targets def call_operator(self, op, args, kwargs, meta): - if op not in self.targets: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) input = get_node_arg(args, 0) diff --git a/backends/arm/_passes/fuse_consecutive_concat_shapes.py b/backends/arm/_passes/fuse_consecutive_concat_shapes.py index 8a02697d57c..fc2d46d3c12 100644 --- a/backends/arm/_passes/fuse_consecutive_concat_shapes.py +++ b/backends/arm/_passes/fuse_consecutive_concat_shapes.py @@ -6,12 +6,12 @@ from typing import Any import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import NodeMetadata, ProxyValue -class FuseConsecutiveConcatShapesPass(ArmPass): +class FuseConsecutiveConcatShapesPass(ArmOpTargetedPass): """This pass fuses consecutive tosa.CONCAT_SHAPE operations into a single tosa.CONCAT_SHAPE operation with a flattened list of input shapes. E.g. 
tosa.CONCAT_SHAPE([shape1, tosa.CONCAT_SHAPE([shape2, shape3]), shape4]) @@ -24,6 +24,7 @@ class FuseConsecutiveConcatShapesPass(ArmPass): """ _passes_required_after = set() + target_ops = (exir_ops.backend.tosa.CONCAT_SHAPE.default,) def _to_proxy_value( self, arg: ProxyValue | torch.fx.Node | Any @@ -42,7 +43,7 @@ def call_operator( meta: NodeMetadata, updated: bool | None = False, ) -> ProxyValue: - if op != exir_ops.backend.tosa.CONCAT_SHAPE.default: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) arg_list = args[0] new_arg_list: list[Any] = [] diff --git a/backends/arm/_passes/insert_const_shapes.py b/backends/arm/_passes/insert_const_shapes.py index 059731857b4..c916438eb09 100644 --- a/backends/arm/_passes/insert_const_shapes.py +++ b/backends/arm/_passes/insert_const_shapes.py @@ -5,12 +5,12 @@ from typing import Any, Optional -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm.tosa.dialect.shape import meta_has_shape_mark from executorch.exir.dialects._ops import ops as exir_ops -class InsertConstShapesPass(ArmPass): +class InsertConstShapesPass(ArmOpTargetedPass): """Materialize literal shape arguments as CONST_SHAPE nodes. 
This pass targets ops such as `aten.view_copy` and `aten.repeat` whose shape @@ -21,7 +21,7 @@ class InsertConstShapesPass(ArmPass): """ _passes_required_after = set() - targeted_ops = { + target_ops = { exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.repeat.default, } @@ -41,7 +41,7 @@ def _is_shape_arg(arg: Any) -> bool: ) def call_operator(self, op, args, kwargs, meta, updated: Optional[bool] = False): - if op not in self.targeted_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) if any(InsertConstShapesPass._is_shape_arg(arg) for arg in args): new_args = [] diff --git a/backends/arm/_passes/insert_data_layout_casts_pass.py b/backends/arm/_passes/insert_data_layout_casts_pass.py index b760baef6e8..07a2d186895 100644 --- a/backends/arm/_passes/insert_data_layout_casts_pass.py +++ b/backends/arm/_passes/insert_data_layout_casts_pass.py @@ -6,13 +6,13 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm.tosa.specification import get_context_spec from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, NodeMetadata -class InsertDataLayoutCastsPass(ArmPass): +class InsertDataLayoutCastsPass(ArmOpTargetedPass): """Insert casts around data layout operators when their dtype is not supported by the active TOSA specification. 
@@ -45,7 +45,7 @@ class InsertDataLayoutCastsPass(ArmPass): exir_ops.edge.aten.slice_copy.Tensor, exir_ops.edge.aten.flip.default, } - targeted_ops = _concat_ops | _single_input_ops + target_ops = _concat_ops | _single_input_ops _fp_to_int_map = { torch.float16: torch.int16, @@ -60,7 +60,7 @@ class InsertDataLayoutCastsPass(ArmPass): } def call_operator(self, op, args, kwargs, meta): - if op not in self.targeted_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if op in self._concat_ops: diff --git a/backends/arm/_passes/insert_dynamic_padding.py b/backends/arm/_passes/insert_dynamic_padding.py index ea03e231ae8..61a5ebd09ca 100644 --- a/backends/arm/_passes/insert_dynamic_padding.py +++ b/backends/arm/_passes/insert_dynamic_padding.py @@ -7,14 +7,14 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm.tosa.dialect.shape import is_shape_op_node from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, ProxyValue -class InsertDynamicPaddingPass(ArmPass): +class InsertDynamicPaddingPass(ArmOpTargetedPass): """This pass rewrites conv operations with padding to use an explicit pad operator before the conv2d operation and setting the padding to zero in the conv2d operator. E.g. conv2d(x, weight, bias, stride, padding, dilation) @@ -27,6 +27,10 @@ class InsertDynamicPaddingPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = ( + exir_ops.backend.tosa.CONV2D.default, + exir_ops.backend.tosa.DEPTHWISE_CONV2D.default, + ) def _is_dynamic_padding( self, padding: ProxyValue | list[int] | tuple[int, ...] 
@@ -39,10 +43,7 @@ def _is_dynamic_padding( ) def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue: - if op not in ( - exir_ops.backend.tosa.CONV2D.default, - exir_ops.backend.tosa.DEPTHWISE_CONV2D.default, - ): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) padding = args[4] if not self._is_dynamic_padding(padding): diff --git a/backends/arm/_passes/normalize_index_put_bool_index_tensor_pass.py b/backends/arm/_passes/normalize_index_put_bool_index_tensor_pass.py index 9377eaec2fe..badc58b06fb 100644 --- a/backends/arm/_passes/normalize_index_put_bool_index_tensor_pass.py +++ b/backends/arm/_passes/normalize_index_put_bool_index_tensor_pass.py @@ -6,13 +6,13 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.rewrite_index_put_pass import RewriteIndexPutPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class NormalizeIndexPutBoolIndexTensorPass(ArmPass): +class NormalizeIndexPutBoolIndexTensorPass(ArmOpTargetedPass): """Normalize single boolean mask index_put scalar to where. In the general case, boolean masks are complex and data dependent. 
The simple case x[mask] = scalar @@ -30,6 +30,7 @@ class NormalizeIndexPutBoolIndexTensorPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {RewriteIndexPutPass} + target_ops = (exir_ops.edge.aten.index_put.default,) def __init__(self): super().__init__() @@ -57,7 +58,7 @@ def _is_valid_bool_mask( return True def call_operator(self, op, args, kwargs, meta, updated: bool | None = False): - if op not in (exir_ops.edge.aten.index_put.default,): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) destination, indices_tensor_list, data = args[:3] diff --git a/backends/arm/_passes/normalize_index_put_none_indices_pass.py b/backends/arm/_passes/normalize_index_put_none_indices_pass.py index 7aaace641b0..3afc9732b02 100644 --- a/backends/arm/_passes/normalize_index_put_none_indices_pass.py +++ b/backends/arm/_passes/normalize_index_put_none_indices_pass.py @@ -4,13 +4,13 @@ # LICENSE file in the root directory of this source tree. from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.rewrite_index_put_pass import RewriteIndexPutPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class NormalizeIndexPutNoneIndicesPass(ArmPass): +class NormalizeIndexPutNoneIndicesPass(ArmOpTargetedPass): """Normalize index_put with None:s in the indices_tensor list by moving None-indexed dims to the channel dimensions (*C_j in RewriteIndexPutPass teminology) by permutating the destination and data tensors. 
A None-index @@ -41,6 +41,7 @@ class NormalizeIndexPutNoneIndicesPass(ArmPass): """ _passes_required_after: Set[Type[ExportPass]] = {RewriteIndexPutPass} + target_ops = (exir_ops.edge.aten.index_put.default,) def __init__(self): super().__init__() @@ -67,7 +68,7 @@ def _get_data_dim_order( return destination_dim_order def call_operator(self, op, args, kwargs, meta, updated: bool | None = False): - if op not in (exir_ops.edge.aten.index_put.default,): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) destination, indices_tensor_list, data = args[:3] diff --git a/backends/arm/_passes/promote_bool_operands_pass.py b/backends/arm/_passes/promote_bool_operands_pass.py index 4d02646e30a..8e162ded1bd 100644 --- a/backends/arm/_passes/promote_bool_operands_pass.py +++ b/backends/arm/_passes/promote_bool_operands_pass.py @@ -11,19 +11,19 @@ import torch -from executorch.backends.arm._passes.arm_pass import ArmPass +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class PromoteBoolOperandsPass(ArmPass): +class PromoteBoolOperandsPass(ArmOpTargetedPass): """Promote boolean operands to the appropriate integer dtype for unsupported ops. 
""" _passes_required_after: Set[Type[ExportPass]] = set() - targeted_ops = { + target_ops = { exir_ops.edge.aten.bitwise_and.Tensor, exir_ops.edge.aten.bitwise_or.Tensor, exir_ops.edge.aten.bitwise_xor.Tensor, @@ -31,7 +31,7 @@ class PromoteBoolOperandsPass(ArmPass): } def call_operator(self, op, args, kwargs, meta): - if op not in self.targeted_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) original_dtypes = [arg.data.dtype for arg in args] diff --git a/backends/arm/_passes/remove_noop_pass.py b/backends/arm/_passes/remove_noop_pass.py index c7fe469c8b8..5fafc848003 100644 --- a/backends/arm/_passes/remove_noop_pass.py +++ b/backends/arm/_passes/remove_noop_pass.py @@ -8,7 +8,7 @@ import logging from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -16,19 +16,20 @@ logger = logging.getLogger(__name__) -class RemoveNoopPass(ArmPass): +class RemoveNoopPass(ArmOpTargetedPass): """Remove no-ops from graph_module.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = ( + exir_ops.edge.dim_order_ops._clone_dim_order.default, + exir_ops.edge.dim_order_ops._to_dim_order_copy.default, + exir_ops.edge.aten.alias_copy.default, + exir_ops.edge.aten.copy.default, + exir_ops.edge.aten.detach_copy.default, + ) def call_operator(self, op, args, kwargs, meta): - if op not in ( - exir_ops.edge.dim_order_ops._clone_dim_order.default, - exir_ops.edge.dim_order_ops._to_dim_order_copy.default, - exir_ops.edge.aten.alias_copy.default, - exir_ops.edge.aten.copy.default, - exir_ops.edge.aten.detach_copy.default, - ): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) input_dtype = args[0].data.dtype diff --git a/backends/arm/_passes/rewrite_avg_pool2d_pass.py 
b/backends/arm/_passes/rewrite_avg_pool2d_pass.py index bf81505d923..6427b571218 100644 --- a/backends/arm/_passes/rewrite_avg_pool2d_pass.py +++ b/backends/arm/_passes/rewrite_avg_pool2d_pass.py @@ -6,7 +6,7 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import to_2tuple from executorch.backends.arm.constants import NHWC_INVERSE_ORDER, NHWC_ORDER from executorch.backends.arm.operators.operator_validation_utils import ( @@ -18,11 +18,11 @@ from .fuse_constant_ops_pass import ComputeConstantOpsAOTPass -class RewriteAvgPool2dPass(ArmPass): +class RewriteAvgPool2dPass(ArmOpTargetedPass): """Rewrite aten.avg_pool2d calls to TOSA AVG_POOL2D op.""" # Target the original avg_pool2d operator - targeted_ops = {exir_ops.edge.aten.avg_pool2d.default} + target_ops = {exir_ops.edge.aten.avg_pool2d.default} _passes_required_after: Set[Type[ExportPass]] = { ComputeConstantOpsAOTPass, } @@ -30,7 +30,7 @@ class RewriteAvgPool2dPass(ArmPass): def call_operator(self, op, args, kwargs, meta, updated=False): # Only rewrite avg_pool2d - if op not in self.targeted_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) x = args[0] diff --git a/backends/arm/_passes/rewrite_bool_bitwise_to_logical_pass.py b/backends/arm/_passes/rewrite_bool_bitwise_to_logical_pass.py index 8c6bf6f39ec..962bdbbaf6e 100644 --- a/backends/arm/_passes/rewrite_bool_bitwise_to_logical_pass.py +++ b/backends/arm/_passes/rewrite_bool_bitwise_to_logical_pass.py @@ -7,12 +7,12 @@ from typing import Set, Type import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class RewriteBoolBitwiseToLogicalPass(ArmPass): +class 
RewriteBoolBitwiseToLogicalPass(ArmOpTargetedPass): """Rewrites ``aten.bitwise_*`` on boolean tensors to ``aten.logical_*``. TOSA ``bitwise_*`` does not support boolean inputs. On boolean tensors, @@ -32,9 +32,10 @@ class RewriteBoolBitwiseToLogicalPass(ArmPass): exir_ops.edge.aten.bitwise_xor.Tensor: exir_ops.edge.aten.logical_xor.default, exir_ops.edge.aten.bitwise_xor.Scalar: exir_ops.edge.aten.logical_xor.default, } + target_ops = tuple(_TARGET_TO_LOGICAL) def call_operator(self, op, args, kwargs, meta): - if op not in self._TARGET_TO_LOGICAL: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if meta["val"].dtype == torch.bool: diff --git a/backends/arm/_passes/rewrite_high_rank_singleton_permute_pass.py b/backends/arm/_passes/rewrite_high_rank_singleton_permute_pass.py index 1c0bac0ba9c..40a7935f050 100644 --- a/backends/arm/_passes/rewrite_high_rank_singleton_permute_pass.py +++ b/backends/arm/_passes/rewrite_high_rank_singleton_permute_pass.py @@ -5,12 +5,12 @@ from typing import Sequence, Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class RewriteHighRankSingletonPermutePass(ArmPass): +class RewriteHighRankSingletonPermutePass(ArmOpTargetedPass): """Rewrite high-rank permute via a lower-rank permute when singleton dims allow it. @@ -30,6 +30,7 @@ class RewriteHighRankSingletonPermutePass(ArmPass): exir_ops.edge.aten.permute.default, exir_ops.edge.aten.permute_copy.default, ) + target_ops = _PERMUTE_OPS @staticmethod def _extract_permutation(permutation_arg: object) -> tuple[int, ...] 
| None: @@ -46,7 +47,7 @@ def _normalize_permutation( return tuple(dim % rank for dim in permutation) def call_operator(self, op, args, kwargs, meta): - if op not in self._PERMUTE_OPS: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if len(args) < 2: return super().call_operator(op, args, kwargs, meta) diff --git a/backends/arm/_passes/rewrite_index_put_pass.py b/backends/arm/_passes/rewrite_index_put_pass.py index c0898673fd7..8f2ab4bb830 100644 --- a/backends/arm/_passes/rewrite_index_put_pass.py +++ b/backends/arm/_passes/rewrite_index_put_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.convert_expand_copy_to_repeat import ( ConvertExpandCopyToRepeatPass, ) @@ -31,7 +31,7 @@ def calculate_data_stride(destination_shape: list[int]) -> list[int]: return data_strides -class RewriteIndexPutPass(ArmPass): +class RewriteIndexPutPass(ArmOpTargetedPass): """ This pass transforms index_put with arguments - destination, of shape (*K_i, *C_j) @@ -69,6 +69,7 @@ def __init__(self): FuseViewCopyTransformPass, ConvertExpandCopyToRepeatPass, } + target_ops = (exir_ops.edge.aten.index_put.default,) def _calculate_flat_indices( self, @@ -121,7 +122,7 @@ def _calculate_flat_indices( ) def call_operator(self, op, args, kwargs, meta, updated: bool | None = False): - if op not in (exir_ops.edge.aten.index_put.default,): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) destination, indices_tensor_list, data = args[:3] diff --git a/backends/arm/_passes/rewrite_inplace_arithmetic_pass.py b/backends/arm/_passes/rewrite_inplace_arithmetic_pass.py index f5a484343c5..72683b353ce 100644 --- a/backends/arm/_passes/rewrite_inplace_arithmetic_pass.py +++ b/backends/arm/_passes/rewrite_inplace_arithmetic_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import 
ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -23,10 +23,12 @@ } -class RewriteInplaceArithmeticPass(ArmPass): +class RewriteInplaceArithmeticPass(ArmOpTargetedPass): """Rewrite inplace arithmetic ops into functional equivalents.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = tuple(OP_MAP) + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): if not self.allowed_to_transform(meta): diff --git a/backends/arm/_passes/rewrite_le_lt_to_ge_gt_pass.py b/backends/arm/_passes/rewrite_le_lt_to_ge_gt_pass.py index 9119567b7aa..c73279e65d0 100644 --- a/backends/arm/_passes/rewrite_le_lt_to_ge_gt_pass.py +++ b/backends/arm/_passes/rewrite_le_lt_to_ge_gt_pass.py @@ -7,7 +7,7 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -19,10 +19,12 @@ } -class RewriteLeLtToGeGtPass(ArmPass): +class RewriteLeLtToGeGtPass(ArmOpTargetedPass): """Rewrite le/lt into ge/gt with swapped inputs.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = tuple(OP_MAP) + check_allowed_to_transform = True def call_operator(self, op, args, kwargs, meta): if not self.allowed_to_transform(meta): diff --git a/backends/arm/_passes/rewrite_max_pool2d_pass.py b/backends/arm/_passes/rewrite_max_pool2d_pass.py index 8a59f2bd4ac..8debb322a6d 100644 --- a/backends/arm/_passes/rewrite_max_pool2d_pass.py +++ b/backends/arm/_passes/rewrite_max_pool2d_pass.py @@ -5,7 +5,7 @@ from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.backends.arm._passes.arm_pass_utils import to_2tuple from executorch.backends.arm.constants 
import NHWC_INVERSE_ORDER, NHWC_ORDER from executorch.backends.arm.operators.operator_validation_utils import ( @@ -17,13 +17,14 @@ edge_max_pool2d_ops = (exir_ops.edge.aten.max_pool2d.default,) -class RewriteMaxPool2dPass(ArmPass): +class RewriteMaxPool2dPass(ArmOpTargetedPass): """Rewrite max_pool2d ops to TOSA MAX_POOL2D.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = edge_max_pool2d_ops def call_operator(self, op, args, kwargs, meta): - if op not in edge_max_pool2d_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) x = args[0] diff --git a/backends/arm/_passes/rewrite_pad.py b/backends/arm/_passes/rewrite_pad.py index 40523fb559a..250fccab38b 100644 --- a/backends/arm/_passes/rewrite_pad.py +++ b/backends/arm/_passes/rewrite_pad.py @@ -8,18 +8,18 @@ import torch -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass -class RewritePadPass(ArmPass): +class RewritePadPass(ArmOpTargetedPass): """Rewrite constant_pad_nd operator to TOSA Pad operator with constant mode. """ _passes_required_after: Set[Type[ExportPass]] = set() - targeted_ops = { + target_ops = { exir_ops.edge.aten.constant_pad_nd.default, exir_ops.edge.aten.pad.default, } @@ -145,7 +145,7 @@ def _rewrite_non_constant_pad( return output def call_operator(self, op, args, kwargs, meta, updated=False): - if op not in self.targeted_ops: + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta) if op == exir_ops.edge.aten.constant_pad_nd.default: diff --git a/backends/arm/_passes/rewrite_slice.py b/backends/arm/_passes/rewrite_slice.py index c0f6e1b6573..2aab2e16539 100644 --- a/backends/arm/_passes/rewrite_slice.py +++ b/backends/arm/_passes/rewrite_slice.py @@ -4,7 +4,7 @@ # LICENSE file in the root directory of this source tree. 
from typing import Set, Type -from executorch.backends.arm._passes import ArmPass +from executorch.backends.arm._passes import ArmOpTargetedPass from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, ProxyValue @@ -12,10 +12,11 @@ from torch import SymInt -class RewriteSlicePass(ArmPass): +class RewriteSlicePass(ArmOpTargetedPass): """Rewrite slice operations with step of 1 to TOSA slice operators.""" _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = (exir_ops.edge.aten.slice_copy.Tensor,) def _fixup_start(self, start, input_shape, dim) -> int: """Convert negative and out-of-bounds start indices to valid positive @@ -29,7 +30,7 @@ def _fixup_start(self, start, input_shape, dim) -> int: return idx def call_operator(self, op, args, kwargs, meta, updated=False) -> ProxyValue: - if op not in (exir_ops.edge.aten.slice_copy.Tensor,): + if op not in self.target_ops: return super().call_operator(op, args, kwargs, meta, updated) if len(args) == 5 and args[4] != 1: diff --git a/backends/arm/test/passes/test_arm_op_targeted_pass.py b/backends/arm/test/passes/test_arm_op_targeted_pass.py new file mode 100644 index 00000000000..5c213d4c4b9 --- /dev/null +++ b/backends/arm/test/passes/test_arm_op_targeted_pass.py @@ -0,0 +1,150 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import operator +from typing import Set, Type + +import torch +from executorch.backends.arm._passes.arm_pass import ArmOpTargetedPass +from executorch.backends.arm._passes.arm_pass_manager import ArmPassManager +from executorch.backends.arm.constants import DISALLOW_TFA_META_KEY +from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec +from executorch.backends.arm.tosa.specification import TosaSpecification +from executorch.exir.pass_base import ExportPass +from torch.fx import Graph, GraphModule +from torch.fx.passes.infra.pass_base import PassResult + + +TARGET_OP = torch.ops.aten.add.Tensor +OTHER_OP = operator.add + + +def create_graph_module(target=OTHER_OP, disallow_tfa: bool = False) -> GraphModule: + graph = Graph() + lhs = graph.placeholder("lhs") + rhs = graph.placeholder("rhs") + lhs.meta["val"] = torch.randn(2, 3) + rhs.meta["val"] = torch.randn(2, 3) + node = graph.call_function(target, (lhs, rhs)) + node.meta["val"] = torch.randn(2, 3) + if disallow_tfa: + node.meta[DISALLOW_TFA_META_KEY] = True + graph.output(node) + return GraphModule(torch.nn.Module(), graph) + + +def create_test_pass_manager() -> ArmPassManager: + compile_spec = TosaCompileSpec( + TosaSpecification.create_from_string("TOSA-1.00+INT") + ) + return ArmPassManager(compile_spec) + + +def run_single_pass(graph_module: GraphModule, test_pass: ExportPass) -> PassResult: + pass_manager = create_test_pass_manager() + pass_manager.add_pass(test_pass) + return pass_manager(graph_module) + + +class DummyTargetedPass(ArmOpTargetedPass): + _passes_required_after: Set[Type[ExportPass]] = set() + target_ops = (TARGET_OP,) + check_allowed_to_transform = True + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.call_operator_count = 0 + + def call_operator(self, op, args, kwargs, meta): + self.call_operator_count += 1 + return super().call_operator(op, args, kwargs, meta) + + +class InsertTargetPass(ExportPass): + def call(self, graph_module: 
GraphModule) -> PassResult: + graph = graph_module.graph + placeholders = [node for node in graph.nodes if node.op == "placeholder"] + output = next(node for node in graph.nodes if node.op == "output") + + with graph.inserting_before(output): + target_node = graph.call_function( + TARGET_OP, + (placeholders[0], placeholders[1]), + ) + target_node.meta["val"] = torch.randn(2, 3) + output.args = (target_node,) + graph.lint() + graph_module.recompile() + return PassResult(graph_module, True) + + +class CondModule(torch.nn.Module): + def forward(self, x: torch.Tensor) -> torch.Tensor: + def true_branch(arg: torch.Tensor) -> torch.Tensor: + return arg + 1 + + def false_branch(arg: torch.Tensor) -> torch.Tensor: + return arg - 1 + + return torch.cond(x.sum() > 0, true_branch, false_branch, [x]) + + +def test_skips_when_target_is_absent() -> None: + graph_module = create_graph_module() + targeted_pass = DummyTargetedPass() + + result = run_single_pass(graph_module, targeted_pass) + + assert result is not None + assert result.graph_module is graph_module + assert not result.modified + assert targeted_pass.call_operator_count == 0 + + +def test_runs_when_target_is_present() -> None: + graph_module = create_graph_module(TARGET_OP) + targeted_pass = DummyTargetedPass() + + result = run_single_pass(graph_module, targeted_pass) + + assert result is not None + assert result.modified + assert targeted_pass.call_operator_count == 1 + + +def test_skips_tfa_disallowed_target() -> None: + graph_module = create_graph_module(TARGET_OP, disallow_tfa=True) + targeted_pass = DummyTargetedPass(tfa_pass=True) + + result = run_single_pass(graph_module, targeted_pass) + + assert result is not None + assert result.graph_module is graph_module + assert not result.modified + assert targeted_pass.call_operator_count == 0 + + +def test_runs_when_previous_pass_creates_target() -> None: + graph_module = create_graph_module() + pass_manager = create_test_pass_manager() + targeted_pass = 
DummyTargetedPass() + pass_manager.add_pass(InsertTargetPass()) + pass_manager.add_pass(targeted_pass) + result = pass_manager(graph_module) + + assert result.modified + assert targeted_pass.call_operator_count == 1 + + +def test_runs_when_target_is_present_in_nested_submodule() -> None: + exported_program = torch.export.export(CondModule(), (torch.randn(2, 3),)) + graph_module = exported_program.graph_module + targeted_pass = DummyTargetedPass() + + result = run_single_pass(graph_module, targeted_pass) + + assert result is not None + assert result.modified + assert targeted_pass.call_operator_count > 0 From ad4d19057d0184ba7aa72d3355a2365dd8a8cc09 Mon Sep 17 00:00:00 2001 From: George Gekov Date: Mon, 11 May 2026 17:17:20 +0100 Subject: [PATCH 077/103] Arm backend: Fix Smollm2 model test - Export & lower the smollm2 via extensions/llm/export_llm - Build the arm_executor_runner application - Fix the propagation of select_ops_list in the CMakeLists.txt - Test the application runs on FVP in fast mode Signed-off-by: George Gekov Change-Id: I8acd87c2f5c3e6b5b189bb987ceccfe4877e2254 --- backends/arm/scripts/build_executorch.sh | 3 ++ backends/arm/test/test_arm_backend.sh | 38 ++++++++++++++++++--- examples/arm/executor_runner/CMakeLists.txt | 1 - examples/arm/run.sh | 2 +- 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index 5ebc0eb46b4..362fc4d40bf 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -97,6 +97,9 @@ cmake_args=( -DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump} -DEXECUTORCH_BAREMETAL_SKIP_INSTALL=OFF ) +if ((${#extra_cmake_args[@]})); then + cmake_args+=("${extra_cmake_args[@]}") +fi if [[ ${#extra_cmake_args[@]} -gt 0 ]]; then cmake_args+=("${extra_cmake_args[@]}") diff --git a/backends/arm/test/test_arm_backend.sh b/backends/arm/test/test_arm_backend.sh index be48d7ad234..26f30974a9c 100755 --- 
a/backends/arm/test/test_arm_backend.sh +++ b/backends/arm/test/test_arm_backend.sh @@ -302,11 +302,41 @@ test_deit_e2e_ethos_u() { test_model_smollm2_135M() { echo "${TEST_SUITE_NAME}: Test SmolLM2-135M on Ethos-U85" - # Build common libs once - python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --build_libs - - python3 backends/arm/test/test_model.py --test_output=arm_test/test_model --target=ethos-u85-128 --model=smollm2 --extra_flags="-DEXECUTORCH_SELECT_OPS_LIST=dim_order_ops::_to_dim_order_copy.out" --specify_ethosu_scratch + backends/arm/scripts/build_executorch.sh + # Build pte for smollm2 + python -m extension.llm.export.export_llm \ + base.model_class=smollm2 \ + base.params=examples/models/smollm2/135M_config.json \ + debug.verbose=True model.enable_dynamic_shape=False quantization.pt2e_quantize="ethosu_8a8w" \ + backend.ethosu.enabled=True backend.ethosu.target="ethos-u85-256" backend.ethosu.memory_mode=Dedicated_Sram_384KB + + # Build the arm_executor_runner application, pre-loading the pte in the DDR for faster linking + local pte_addr="0x76000000" + backends/arm/scripts/build_executor_runner.sh \ + --et_build_root="${et_root_dir}/arm_test" \ + --pte="${pte_addr}" \ + --build_type=Release \ + --target=ethos-u85-256 \ + --system_config=Ethos_U85_SYS_DRAM_Mid \ + --memory_mode=Dedicated_Sram_384KB \ + --ethosu_tools_dir="${scratch_dir}" \ + --toolchain=arm-none-eabi-gcc \ + --extra_build_flags="-DET_ARM_BAREMETAL_SCRATCH_TEMP_ALLOCATOR_POOL_SIZE=0x20000" \ + --select_ops_list="dim_order_ops::_to_dim_order_copy.out" + + + # Deploy the application on the FVP in fast mode + FVP_Corstone_SSE-320 -C mps4_board.subsystem.ethosu.num_macs=256 \ + -C mps4_board.visualisation.disable-visualisation=1 \ + -C vis_hdlcd.disable_visualisation=1 \ + -C mps4_board.telnetterminal0.start_telnet=0 \ + -C mps4_board.uart0.out_file='-' \ + -C mps4_board.uart0.shutdown_on_eot=1 \ + -a 
"${et_root_dir}"/arm_test/ethos-u85-256_${pte_addr}/cmake-out/arm_executor_runner \ + -C mps4_board.subsystem.ethosu.extra_args="--fast" \ + --data smollm2.pte@"${pte_addr}" + echo "${TEST_SUITE_NAME}: PASS" } diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index d84947a75ad..88050a2ae77 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -349,7 +349,6 @@ elseif(FOUND_OPS_IN_FILE) "gen_oplist: EXECUTORCH_SELECT_OPS_MODEL=${ET_PTE_FILE_PATH} is used to auto generate ops from" ) else() - set(EXECUTORCH_SELECT_OPS_LIST "") set(EXECUTORCH_SELECT_OPS_MODEL "") message( "gen_oplist: No non delagated ops was found in ${ET_PTE_FILE_PATH} no ops added to build" diff --git a/examples/arm/run.sh b/examples/arm/run.sh index cfbcae2dbad..3ef4b0b829b 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -659,7 +659,7 @@ configure_ethosu_scratch_if_requested() { return fi local scratch_size - scratch_size=$(get_ethosu_scratch_size "$pte_path" || true) + scratch_size=$(get_ethosu_scratch_size "$pte_path" | tail -n 1) if [[ -z "${scratch_size}" ]]; then echo "WARNING: Failed to derive Ethos-U scratch size from ${pte_path}" >&2 return From b0441b50be603a6312c6857d359e47b049fd67c7 Mon Sep 17 00:00:00 2001 From: George Gekov Date: Fri, 29 May 2026 11:15:47 +0100 Subject: [PATCH 078/103] Change python to python3 in shell script Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- backends/arm/test/test_arm_backend.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/arm/test/test_arm_backend.sh b/backends/arm/test/test_arm_backend.sh index 26f30974a9c..1cb9e135d00 100755 --- a/backends/arm/test/test_arm_backend.sh +++ b/backends/arm/test/test_arm_backend.sh @@ -305,7 +305,7 @@ test_model_smollm2_135M() { backends/arm/scripts/build_executorch.sh # Build pte for smollm2 - python -m 
extension.llm.export.export_llm \ + python3 -m extension.llm.export.export_llm \ base.model_class=smollm2 \ base.params=examples/models/smollm2/135M_config.json \ debug.verbose=True model.enable_dynamic_shape=False quantization.pt2e_quantize="ethosu_8a8w" \ From cf6daa9b1cb354de33528cb3eff1ccbe443ad2df Mon Sep 17 00:00:00 2001 From: Jacob Stevens Date: Fri, 29 May 2026 09:46:24 -0400 Subject: [PATCH 079/103] Add short function support (#19846) Summary: Currently, __builtin_FUNCTION is used opportunistically if it exists. However, for heavily templated code, this results in extremely long strings which add .rodata, which can be wasteful on embedded targets. This commit adds an override which uses the shorter __FUNCTION__ even if __builtin_FUNCTION exists and exposes it as a BUCK constraint. Integration into CMake intentionally left out for now. Differential Revision: D106668077 --- runtime/executor/targets.bzl | 10 ++++++++-- runtime/platform/compiler.h | 17 +++++++++++++--- runtime/platform/targets.bzl | 4 ++++ tools/buck/constraints/BUCK | 38 ++++++++++++++++++++++++++++++++++++ 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/runtime/executor/targets.bzl b/runtime/executor/targets.bzl index 90f8d0221e9..81d0a58667f 100644 --- a/runtime/executor/targets.bzl +++ b/runtime/executor/targets.bzl @@ -16,8 +16,14 @@ def _program_preprocessor_flags(): if enable_verification == "false": return ["-DET_ENABLE_PROGRAM_VERIFICATION=0"] elif enable_verification == "true": - # Enabled by default.
- return [] + # Enabled by default; allow opt-out via constraint + if not runtime.is_oss: + return select({ + "DEFAULT": [], + "fbsource//xplat/executorch/tools/buck/constraints:executorch-program-verification-disabled": ["-DET_ENABLE_PROGRAM_VERIFICATION=0"], + }) + else: + return [] else: fail("executorch.enable_program_verification must be one of 'true' or 'false'; saw '" + enable_verification + "'") diff --git a/runtime/platform/compiler.h b/runtime/platform/compiler.h index edd340d1fb0..692d590f44c 100644 --- a/runtime/platform/compiler.h +++ b/runtime/platform/compiler.h @@ -138,8 +138,14 @@ #define __has_builtin(x) (0) #endif -#if __has_builtin(__builtin_strrchr) +#if defined(__FILE_NAME__) +/// __FILE_NAME__ provides just the filename at +/// compile time, avoiding embedding full paths in the binary +#define ET_SHORT_FILENAME __FILE_NAME__ +#elif __has_builtin(__builtin_strrchr) /// Name of the source file without a directory string. +/// Note: This approach embeds the full path in .rodata even though only the +/// basename is used at runtime. __FILE_NAME__ is preferred when available. #define ET_SHORT_FILENAME (__builtin_strrchr("/" __FILE__, '/') + 1) #else #define ET_SHORT_FILENAME __FILE__ @@ -152,12 +158,17 @@ #define ET_LINE __LINE__ #endif // __has_builtin(__builtin_LINE) -#if __has_builtin(__builtin_FUNCTION) +#if defined(ET_USE_BUILTIN_FUNCTION_NAME) && ET_USE_BUILTIN_FUNCTION_NAME == 0 +/// __FUNCTION__ provides a short undecorated name, saving .rodata space +/// compared to __builtin_FUNCTION() which includes the full signature +/// (namespace, parameters, return type). +#define ET_FUNCTION __FUNCTION__ +#elif __has_builtin(__builtin_FUNCTION) /// Name of the current function as a const char[]. 
#define ET_FUNCTION __builtin_FUNCTION() #else #define ET_FUNCTION __FUNCTION__ -#endif // __has_builtin(__builtin_FUNCTION) +#endif // As of G3 RJ-2024.3 toolchain, zu format specifier is not supported for Xtensa #if defined(__XTENSA__) diff --git a/runtime/platform/targets.bzl b/runtime/platform/targets.bzl index 65d92b134d6..63b8cb553ef 100644 --- a/runtime/platform/targets.bzl +++ b/runtime/platform/targets.bzl @@ -116,5 +116,9 @@ def define_common_targets(): exported_headers = [ "compiler.h", ], + exported_preprocessor_flags = select({ + "DEFAULT": [], + "fbsource//xplat/executorch/tools/buck/constraints:executorch-builtin-function-name-disabled": ["-DET_USE_BUILTIN_FUNCTION_NAME=0"], + }) if not runtime.is_oss else [], visibility = ["PUBLIC"], ) diff --git a/tools/buck/constraints/BUCK b/tools/buck/constraints/BUCK index b558bb9e4a4..49fbaabe06f 100644 --- a/tools/buck/constraints/BUCK +++ b/tools/buck/constraints/BUCK @@ -61,3 +61,41 @@ fb_native.constraint_value( constraint_setting = ":executorch-event-tracer", visibility = ["PUBLIC"], ) + +fb_native.config_setting( + name = "executorch-program-verification-disabled", + constraint_values = [ + ":program-verification-disabled", + ], + visibility = ["PUBLIC"], +) + +fb_native.constraint_setting( + name = "executorch-program-verification", + visibility = ["PUBLIC"], +) + +fb_native.constraint_value( + name = "program-verification-disabled", + constraint_setting = ":executorch-program-verification", + visibility = ["PUBLIC"], +) + +fb_native.config_setting( + name = "executorch-builtin-function-name-disabled", + constraint_values = [ + ":builtin-function-name-disabled", + ], + visibility = ["PUBLIC"], +) + +fb_native.constraint_setting( + name = "executorch-builtin-function-name", + visibility = ["PUBLIC"], +) + +fb_native.constraint_value( + name = "builtin-function-name-disabled", + constraint_setting = ":executorch-builtin-function-name", + visibility = ["PUBLIC"], +) From 
88faab264734e7c6b4640d30485ebafa717189a1 Mon Sep 17 00:00:00 2001 From: Jacob Stevens Date: Fri, 29 May 2026 09:46:37 -0400 Subject: [PATCH 080/103] Opportunistically use __FILE_NAME__ to get filename (#19834) (#19834) Summary: The current approach uses __FILE__ and opportunistically trims it if the utility is available. However, the long name is still stored in .rodata. This can contribute some memory on embedded platforms. Instead, first try __FILE_NAME__. Differential Revision: D106587633 From 84c0484d15c9bc96e05384a93e9ee174e81351fe Mon Sep 17 00:00:00 2001 From: SS-JIA Date: Fri, 29 May 2026 13:30:30 -0400 Subject: [PATCH 081/103] Fix ghstack merge bot failing to parse PR stack header Summary: ghstack 0.15.0 changed the header URL in PR bodies from `Stack from [ghstack](https://github.com/ezyang/ghstack)` to `Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.15.0)`. The exact string match in `propose_ghstack_orig_pr.py` no longer matched, causing every ghstack_land workflow run to fail since May 14. Use `startswith("Stack from [ghstack]")` instead to be resilient to URL changes. Test Plan: Verified the new pattern matches both the old format (`https://github.com/ezyang/ghstack`) and the new format (`https://github.com/ezyang/ghstack/tree/0.15.0`). This PR was authored with the help of Claude.
Reviewers: --- .github/scripts/propose_ghstack_orig_pr.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/scripts/propose_ghstack_orig_pr.py b/.github/scripts/propose_ghstack_orig_pr.py index 3abcc6cdcf9..f41e03f18ff 100644 --- a/.github/scripts/propose_ghstack_orig_pr.py +++ b/.github/scripts/propose_ghstack_orig_pr.py @@ -52,12 +52,9 @@ def extract_stack_from_body(pr_body: str) -> List[int]: """ prs = [] - ghstack_begin = ( - "Stack from [ghstack](https://github.com/ezyang/ghstack) (oldest at bottom):" - ) ghstack_begin_seen = False for line in pr_body.splitlines(): - if ghstack_begin in line: + if line.startswith("Stack from [ghstack]"): ghstack_begin_seen = True if not ghstack_begin_seen: continue From d1c80af479dba2040444959e6b9e7264abbcf377 Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 29 May 2026 07:29:56 -0700 Subject: [PATCH 082/103] [ET-VK][tests][1/N] Report disabled delegate tests as executed Pull Request resolved: https://github.com/pytorch/executorch/pull/19867 Some environments preserve stale failure state when tests are reported through unittest skip results. This switches currently disabled Vulkan delegate coverage to a local decorator so those tests stay discoverable, log their disabled reason, and produce an executed result. 
ghstack-source-id: 387629544 @exported-using-ghexport Differential Revision: [D106732141](https://our.internmc.facebook.com/intern/diff/D106732141/) --- backends/vulkan/test/test_vulkan_delegate.py | 41 ++++++++++++++------ 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/backends/vulkan/test/test_vulkan_delegate.py b/backends/vulkan/test/test_vulkan_delegate.py index 7c9f31b720c..ff709618259 100644 --- a/backends/vulkan/test/test_vulkan_delegate.py +++ b/backends/vulkan/test/test_vulkan_delegate.py @@ -7,6 +7,7 @@ # pyre-unsafe import ctypes +import functools import unittest from typing import Tuple @@ -42,6 +43,24 @@ pass +def disable_test(reason): + """Disable a test while still reporting it as executed. + + Some test runners do not handle skipped results consistently, so this keeps + disabled tests visible in logs without using unittest.skip. + """ + + def decorator(fn): + @functools.wraps(fn) + def wrapper(*args, **kwargs): + print(f"DISABLED_TEST: {fn.__qualname__}: {reason}") + return None + + return wrapper + + return decorator + + def lower_module( model: torch.nn.Module, sample_inputs: Tuple[torch.Tensor], dynamic_shapes=None ) -> EdgeProgramManager: @@ -743,7 +762,7 @@ def forward(self, x): self.lower_module_and_test_output(model, sample_inputs) - @unittest.skip( + @disable_test( "Currently this test is failing due to weird partitioning because the eq scalar" "operator is not supported yet. Re-enable when the operator is supported." 
) @@ -810,7 +829,7 @@ def forward(self, x): self.lower_module_and_test_output(module, sample_inputs) - @unittest.skip( + @disable_test( "Reduce shader does not support multiple reduction axes at the moment" ) def test_vulkan_backend_sum_dim_list(self): @@ -831,7 +850,7 @@ def forward(self, x): sample_inputs, ) - @unittest.skip( + @disable_test( "Reduce shader does not support multiple reduction axes at the moment" ) def test_vulkan_backend_sum(self): @@ -1028,7 +1047,7 @@ def forward(self, x): sample_inputs, ) - @unittest.skip("layer norm compute shader not working with swiftshader") + @disable_test("layer norm compute shader not working with swiftshader") def test_vulkan_backend_native_layer_norm(self): class NativeLayerNormModule(torch.nn.Module): def __init__(self): @@ -1459,7 +1478,7 @@ def forward(self, x): sample_inputs, ) - @unittest.skip( + @disable_test( "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" ) def test_vulkan_backend_softmax(self): @@ -1480,7 +1499,7 @@ def forward(self, x): sample_inputs, ) - @unittest.skip( + @disable_test( "Softmax shader with shared memory does not work with swiftshader due to potential swiftshader bug" ) def test_vulkan_backend_logsoftmax(self): @@ -1512,7 +1531,7 @@ def forward(self, x): self.lower_unary_module_and_test_output(GeluModule()) - @unittest.skip( + @disable_test( "Reduce shader does not support multiple reduction axes at the moment" ) def test_vulkan_backend_mean(self): @@ -2364,7 +2383,7 @@ def apply_quantization(self): quantized_linear_module_gemm, sample_inputs_gemm, atol=1e-2, rtol=1e-2 ) - @unittest.skip("Cannot run on swiftshader due to no integer dot product support") + @disable_test("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_linear_sequence(self): """ Test a sequence of linear layers quantized with XNNPACK quantization config. 
@@ -2439,7 +2458,7 @@ def forward(self, x): rtol=1e-1, ) - @unittest.skip("Cannot run on swiftshader due to no integer dot product support") + @disable_test("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence(self): """ Test a sequence of convolution layers quantized with PT2E quantization. @@ -2530,7 +2549,7 @@ def forward(self, x): rtol=1e-1, ) - @unittest.skip("Cannot run on swiftshader due to no integer dot product support") + @disable_test("Cannot run on swiftshader due to no integer dot product support") def test_vulkan_backend_xnnpack_pt2e_quantized_conv_sequence_all_reduced(self): """ Test a sequence of convolution layers quantized with PT2E quantization. @@ -2610,7 +2629,7 @@ def forward(self, x): rtol=1e-1, ) - @unittest.skip("Cannot run on swiftshader due to no 8-bit int support") + @disable_test("Cannot run on swiftshader due to no 8-bit int support") def test_vulkan_backend_torchao_8da4w_quantized_linear(self): """ Test TorchAO 8da4w quantization (int8 dynamic activation + int4 weight) with Vulkan backend. From 915a82d4235c92930b7670c19d4f006852ba6e00 Mon Sep 17 00:00:00 2001 From: ssjia Date: Fri, 29 May 2026 07:30:02 -0700 Subject: [PATCH 083/103] [devtools][tests][4/N] Report disabled inspector tests as executed Applies the same disabled-test treatment as the prior diffs in this stack to the devtools inspector tests. Some test runners preserve stale failure state when tests report through unittest skip results, so this replaces the conditionally disabled coverage with a local decorator that keeps the tests discoverable, logs their disabled reason, and produces an executed result. Adds a disable_if decorator that mirrors unittest.skipIf (evaluating the condition at decoration time) and converts the three Windows-gated test cases to use it. 
Differential Revision: [D106736354](https://our.internmc.facebook.com/intern/diff/D106736354/) ghstack-source-id: 387629542 Pull-Request: https://github.com/pytorch/executorch/pull/19874 --- devtools/inspector/tests/inspector_test.py | 29 +++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/devtools/inspector/tests/inspector_test.py b/devtools/inspector/tests/inspector_test.py index b33c5b37164..4c59190650c 100644 --- a/devtools/inspector/tests/inspector_test.py +++ b/devtools/inspector/tests/inspector_test.py @@ -7,6 +7,7 @@ # pyre-unsafe import copy +import functools import os import random import statistics @@ -90,6 +91,28 @@ def forward(self, indices: torch.Tensor, values: torch.Tensor) -> torch.Tensor: ETRECORD_PATH = "unittest_etrecord_path" +def disable_if(condition, reason): + """Disable a test when condition is true, still reporting it as executed. + + Conditional analogue of unittest.skipIf that keeps disabled tests visible in + logs instead of producing a skipped result, which some test runners handle + inconsistently. 
+ """ + + def decorator(fn): + if not condition: + return fn + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + print(f"DISABLED_TEST: {fn.__qualname__}: {reason}") + return None + + return wrapper + + return decorator + + # TODO: write an E2E test: create an inspector instance, mock just the file reads, and then verify the external correctness class TestInspector(unittest.TestCase): def test_perf_data(self) -> None: @@ -1504,7 +1527,7 @@ def test_calculate_numeric_gap_with_edge_dialect_exported_program_name(self): self.assertIsInstance(df, pd.DataFrame) self.assertEqual(len(df), 1) - @unittest.skipIf(sys.platform.startswith("win"), "Skipping on Windows") + @disable_if(sys.platform.startswith("win"), "Skipping on Windows") def test_transformer_block_xnnpack_numeric_gap_within_tolerance(self): """ Test that the numeric gap between AOT and runtime intermediate outputs @@ -1693,7 +1716,7 @@ def forward( f"Stack trace for {op_name} doesn't contain file info", ) - @unittest.skipIf(sys.platform.startswith("win"), "Skipping on Windows") + @disable_if(sys.platform.startswith("win"), "Skipping on Windows") def test_intermediate_tensor_comparison_with_torch_export(self): """Test intermediate tensor comparison using torch.export.export and to_edge_transform_and_lower. 
@@ -1840,7 +1863,7 @@ def _gen_random_runtime_output( ) -> List[Union[None, List[torch.Tensor], bool, float, int, str, torch.Tensor]]: return [torch.randn(RAW_DATA_SIZE)] - @unittest.skipIf(sys.platform.startswith("win"), "Skipping on Windows") + @disable_if(sys.platform.startswith("win"), "Skipping on Windows") def test_disable_debug_handle_validation_with_symbolic_shapes(self): """ Test that demonstrates the issue with symbolic shape related nodes losing from_node info From 10e2eecfb63a14781554aa1e3dae83c19929e46b Mon Sep 17 00:00:00 2001 From: SS-JIA Date: Fri, 29 May 2026 15:29:54 -0400 Subject: [PATCH 084/103] Skip AOTI tests on macOS CI and bump job timeout to 120 min Summary: AOTI tests (llama3_2_vision and select extension/llm tests) hang indefinitely on macOS CI runners after the PyTorch 2.12 pin update. The hang is in native C/C++ code (inductor compilation / dlopen), which prevents faulthandler from producing a traceback. Diagnosis is ongoing in #19886. Skip the affected tests and bump the macOS job timeout from the default 90 to 120 minutes to add margin (observed completion at ~79 min with skips applied). Co-Authored-By: Claude --- .ci/scripts/unittest-macos-cmake.sh | 15 +++++++++++++-- .github/workflows/_unittest.yml | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/.ci/scripts/unittest-macos-cmake.sh b/.ci/scripts/unittest-macos-cmake.sh index 43eb1f21c3c..48f072a0cc1 100755 --- a/.ci/scripts/unittest-macos-cmake.sh +++ b/.ci/scripts/unittest-macos-cmake.sh @@ -12,8 +12,19 @@ set -eux export TORCHINDUCTOR_CACHE_DIR="$(mktemp -d "${RUNNER_TEMP:-/tmp}/torchinductor_cache_XXXXXX")" trap 'rm -rf "${TORCHINDUCTOR_CACHE_DIR}"' EXIT -# Run pytest with coverage -${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml +# TODO(SS-JIA): AOTI tests hang on macOS CI runners — the thread blocks in +# native C/C++ code (dlopen / inductor compilation) so faulthandler cannot +# even produce a traceback. Diagnosis ongoing in #19886. 
+AOTI_SKIPS=( + --ignore=examples/models/llama3_2_vision/preprocess/test_preprocess.py + --ignore=examples/models/llama3_2_vision/vision_encoder/test/test_vision_encoder.py + --ignore=examples/models/llama3_2_vision/text_decoder/test/test_text_decoder.py + --deselect=extension/llm/modules/test/test_position_embeddings.py::TilePositionalEmbeddingTest::test_tile_positional_embedding_aoti + --deselect=extension/llm/modules/test/test_position_embeddings.py::TiledTokenPositionalEmbeddingTest::test_tiled_token_positional_embedding_aoti + --deselect=extension/llm/modules/test/test_attention.py::AttentionTest::test_attention_aoti +) + +${CONDA_RUN} pytest -n auto --cov=./ --cov-report=xml "${AOTI_SKIPS[@]}" # Run gtest LLVM_PROFDATA="xcrun llvm-profdata" LLVM_COV="xcrun llvm-cov" \ ${CONDA_RUN} test/run_oss_cpp_tests.sh diff --git a/.github/workflows/_unittest.yml b/.github/workflows/_unittest.yml index 15c87bd79e4..a253857d2c0 100644 --- a/.github/workflows/_unittest.yml +++ b/.github/workflows/_unittest.yml @@ -49,6 +49,7 @@ jobs: python-version: '3.11' submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + timeout: 120 script: | set -eux # This is needed to get the prebuilt PyTorch wheel from S3 From 29c18def8be12f6915b5c6b0fab435105c4fb6d2 Mon Sep 17 00:00:00 2001 From: Jacob Szwejbka Date: Fri, 29 May 2026 15:20:29 -0700 Subject: [PATCH 085/103] Use uint64_t for FlatTensor segment end Differential Revision: D106710218 Pull Request resolved: https://github.com/pytorch/executorch/pull/19860 --- .../flat_tensor/flat_tensor_data_map.cpp | 41 ++++++++++++++++--- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/extension/flat_tensor/flat_tensor_data_map.cpp b/extension/flat_tensor/flat_tensor_data_map.cpp index 48684da1239..845778f45c2 100644 --- a/extension/flat_tensor/flat_tensor_data_map.cpp +++ b/extension/flat_tensor/flat_tensor_data_map.cpp @@ -21,6 +21,8 @@ #include #include +#include 
+ using executorch::runtime::Error; using executorch::runtime::FreeableBuffer; using executorch::runtime::Result; @@ -52,7 +54,7 @@ Result get_named_data( flatbuffers::Offset>* named_data, const flatbuffers::Vector< flatbuffers::Offset>* segments, - size_t segment_end_offset) { + uint64_t segment_end_offset) { // Linear search by name. if (named_data == nullptr) { return Error::NotFound; @@ -81,19 +83,34 @@ Result get_named_data( static_cast(segments->Get(segment_index)->offset()), static_cast(segments->Get(segment_index)->size()), &seg_end) && - seg_end <= static_cast(segment_end_offset), + seg_end <= segment_end_offset, InvalidExternalData, "Invalid segment offset %" PRIu64 " is larger than the segment_base_offset + segment_data_size %" PRIu64 "; malformed PTD file.", segments->Get(segment_index)->offset(), - static_cast(segment_end_offset)); + segment_end_offset); return found; } } return Error::NotFound; } +Result get_segment_end_offset(const FlatTensorHeader& header) { + uint64_t segment_end_offset = 0; + ET_CHECK_OR_RETURN_ERROR( + !c10::add_overflows( + header.segment_base_offset, + header.segment_data_size, + &segment_end_offset), + InvalidExternalData, + "segment_base_offset %" PRIu64 " + segment_data_size %" PRIu64 + " overflows uint64_t; malformed PTD file.", + header.segment_base_offset, + header.segment_data_size); + return segment_end_offset; +} + Result create_tensor_layout( const flat_tensor_flatbuffer::TensorLayout* tensor_layout) { ScalarType scalar_type = @@ -111,11 +128,15 @@ Result create_tensor_layout( ET_NODISCARD Result FlatTensorDataMap::get_tensor_layout( executorch::aten::string_view key) const { + Result segment_end_offset = get_segment_end_offset(header_); + if (!segment_end_offset.ok()) { + return segment_end_offset.error(); + } Result named_data = get_named_data( key, flat_tensor_->named_data(), flat_tensor_->segments(), - header_.segment_base_offset + header_.segment_data_size); + segment_end_offset.get()); if (!named_data.ok()) { 
return named_data.error(); } @@ -124,11 +145,15 @@ ET_NODISCARD Result FlatTensorDataMap::get_tensor_layout( ET_NODISCARD Result FlatTensorDataMap::get_data( executorch::aten::string_view key) const { + Result segment_end_offset = get_segment_end_offset(header_); + if (!segment_end_offset.ok()) { + return segment_end_offset.error(); + } Result named_data = get_named_data( key, flat_tensor_->named_data(), flat_tensor_->segments(), - header_.segment_base_offset + header_.segment_data_size); + segment_end_offset.get()); if (!named_data.ok()) { return named_data.error(); } @@ -148,11 +173,15 @@ ET_NODISCARD Error FlatTensorDataMap::load_data_into( ET_UNUSED executorch::aten::string_view key, ET_UNUSED void* buffer, ET_UNUSED size_t size) const { + Result segment_end_offset = get_segment_end_offset(header_); + if (!segment_end_offset.ok()) { + return segment_end_offset.error(); + } Result named_data = get_named_data( key, flat_tensor_->named_data(), flat_tensor_->segments(), - header_.segment_base_offset + header_.segment_data_size); + segment_end_offset.get()); if (!named_data.ok()) { return named_data.error(); } From 0e6b67ed9620e435fe387e90c12aa284be2e7a71 Mon Sep 17 00:00:00 2001 From: Ethan Ng Date: Fri, 29 May 2026 15:27:59 -0700 Subject: [PATCH 086/103] Add fuse() to QuantizationPatterns (#19726) Differential Revision: D105728156 Pull Request resolved: https://github.com/pytorch/executorch/pull/19726 --- backends/cadence/aot/quantizer/BUCK | 2 + backends/cadence/aot/quantizer/patterns.py | 264 ++++++++++++++++++++- 2 files changed, 264 insertions(+), 2 deletions(-) diff --git a/backends/cadence/aot/quantizer/BUCK b/backends/cadence/aot/quantizer/BUCK index c2ec3e3a1f6..956bf700bd7 100644 --- a/backends/cadence/aot/quantizer/BUCK +++ b/backends/cadence/aot/quantizer/BUCK @@ -36,8 +36,10 @@ fbcode_target(_kind = runtime.python_library, ], typing = True, deps = [ + ":pattern_utils", ":utils", "//caffe2:torch", + "//executorch/backends/cadence/aot:pass_utils", ], ) 
diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index e1f44b8ce5c..bf7ca3ef567 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -12,8 +12,19 @@ from typing import List, Optional, Tuple, Union import torch -from executorch.backends.cadence.aot.quantizer.utils import get_bias_qparams - +from executorch.backends.cadence.aot.pass_utils import get_arg, replace_with_op +from executorch.backends.cadence.aot.quantizer.pattern_utils import ( + DQ_PER_TENSOR, + find_quant_user, + fuse_conv, + fuse_linear, + fuse_matmul, + insert_node_with_meta, +) +from executorch.backends.cadence.aot.quantizer.utils import ( + check_out_zero_point_is_min_range, + get_bias_qparams, +) from torch import fx from torch._ops import OpOverload from torchao.quantization.pt2e.quantizer import ( @@ -131,6 +142,41 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_linear.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + assert anchor_node.target == torch.ops.aten.addmm.default + # addmm(bias, input, weight) + bias_node = anchor_node.args[0] + assert isinstance(bias_node, fx.Node) + dq_input = get_arg(anchor_node, "mat1", fx.Node) + if dq_input.target != DQ_PER_TENSOR: + return None + dq_weight = get_arg(anchor_node, "mat2", fx.Node) + if dq_weight.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + dq_bias = bias_node if bias_node.target == DQ_PER_TENSOR else None + weight_q = get_arg(dq_weight, "input", fx.Node) + transposed = insert_node_with_meta( + gm, + torch.ops.aten.transpose.int, + (weight_q, 0, 1), + None, + anchor_node, + weight_q, + ) + return fuse_linear( + gm, + dq_input, + dq_weight, + dq_bias, + quant_node, + anchor_node, + self.replacement_op(), + weight_q=transposed, + ) + class AddPattern(QuantizationPattern): def 
partition_types(self) -> List[OpOverload]: @@ -169,6 +215,33 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_add.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + # Skip if alpha kwarg is present — changes add semantics. + if anchor_node.kwargs: + return None + dq0 = anchor_node.args[0] + if not isinstance(dq0, fx.Node) or dq0.target != DQ_PER_TENSOR: + return None + dq1 = anchor_node.args[1] + if not isinstance(dq1, fx.Node) or dq1.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + args = ( + get_arg(dq0, "input", fx.Node), + get_arg(dq0, "scale", float), + get_arg(dq0, "zero_point", int), + get_arg(dq1, "input", fx.Node), + get_arg(dq1, "scale", float), + get_arg(dq1, "zero_point", int), + get_arg(quant_node, "scale", float), + get_arg(quant_node, "zero_point", int), + ) + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, {}, quant_node + ) + # This is a base class for Add+ReLU fusion, since it can be used with two different relu aten ops class AddReluBasePattern(QuantizationPattern): @@ -212,6 +285,46 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_add.per_tensor + def anchor_ops(self) -> tuple[OpOverload, ...]: + return (torch.ops.aten.add.Tensor,) + + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + add_users = list(anchor_node.users) + if len(add_users) != 1: + return None + relu_node = add_users[0] + if relu_node.target != self.partition_types()[1]: + return None + if len(anchor_node.kwargs) > 0: + return None + dq0 = anchor_node.args[0] + if not isinstance(dq0, fx.Node) or dq0.target != DQ_PER_TENSOR: + return None + dq1 = anchor_node.args[1] + if not isinstance(dq1, fx.Node) or dq1.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(relu_node) + if quant_node is None: + return None + 
if not check_out_zero_point_is_min_range( + get_arg(quant_node, "zero_point", int), + get_arg(quant_node, "dtype", torch.dtype), + ): + return None + args = ( + get_arg(dq0, "input", fx.Node), + get_arg(dq0, "scale", float), + get_arg(dq0, "zero_point", int), + get_arg(dq1, "input", fx.Node), + get_arg(dq1, "scale", float), + get_arg(dq1, "zero_point", int), + get_arg(quant_node, "scale", float), + get_arg(quant_node, "zero_point", int), + ) + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, {}, quant_node + ) + # Add + regular relu op fusion class AddReluPattern0(AddReluBasePattern): @@ -250,6 +363,18 @@ def replacement_op(self) -> OpOverload: # we just need to change the name of the op return torch.ops.cadence.quantized_matmul.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq0 = anchor_node.args[0] + if not isinstance(dq0, fx.Node) or dq0.target != DQ_PER_TENSOR: + return None + dq1 = anchor_node.args[1] + if not isinstance(dq1, fx.Node) or dq1.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + return fuse_matmul(gm, anchor_node, dq0, dq1, quant_node, self.replacement_op()) + class CatPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -299,6 +424,25 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.aten.cat.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + cat_inputs = anchor_node.args[0] + if not isinstance(cat_inputs, (list, tuple)) or not cat_inputs: + return None + inputs_q = [] + for inp in cat_inputs: + if not isinstance(inp, fx.Node) or inp.target != DQ_PER_TENSOR: + return None + inputs_q.append(get_arg(inp, "input", fx.Node)) + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + dim = get_arg(anchor_node, "dim", int) + args = (inputs_q,) + kwargs = {"dim": dim} + return replace_with_op( + gm, 
anchor_node, self.replacement_op(), args, kwargs, quant_node + ) + class Conv1dPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -341,6 +485,18 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_conv1d_ncl.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + dq_weight = anchor_node.args[1] + if not isinstance(dq_weight, fx.Node) or dq_weight.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + return fuse_conv(self, gm, anchor_node, dq_input, dq_weight, quant_node) + class Conv2dPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -383,6 +539,18 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_conv2d_nchw.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + dq_weight = anchor_node.args[1] + if not isinstance(dq_weight, fx.Node) or dq_weight.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + return fuse_conv(self, gm, anchor_node, dq_input, dq_weight, quant_node) + class LayerNormPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -421,6 +589,61 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_layer_norm.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return 
None + scale = get_arg(dq_input, "scale", float) + zero_point = get_arg(dq_input, "zero_point", int) + normalized_shape = anchor_node.args[1] + assert isinstance(normalized_shape, list) + weight = ( + anchor_node.args[2] + if len(anchor_node.args) > 2 and anchor_node.args[2] + else None + ) + bias = ( + anchor_node.args[3] + if len(anchor_node.args) > 3 and anchor_node.args[3] + else None + ) + input_q = get_arg(dq_input, "input", fx.Node) + # Default weight=1 and bias=0 must be float32 — cadence::quantized_layer_norm + # expects float affine parameters, not quantized values. + if not weight: + weight = insert_node_with_meta( + gm, + torch.ops.aten.full.default, + (normalized_shape, 1), + {"dtype": torch.float32}, + anchor_node, + input_q, + ) + if not bias: + bias = insert_node_with_meta( + gm, + torch.ops.aten.full.default, + (normalized_shape, 0), + {"dtype": torch.float32}, + anchor_node, + input_q, + ) + args = (input_q, scale, zero_point) + kwargs = { + "normalized_shape": normalized_shape, + "weight": weight, + "bias": bias, + "eps": get_arg(anchor_node, "eps", float), + "output_scale": get_arg(quant_node, "scale", float), + "output_zero_point": get_arg(quant_node, "zero_point", int), + } + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, kwargs, quant_node + ) + class LinearPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -463,6 +686,31 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_linear.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + dq_weight = anchor_node.args[1] + if not isinstance(dq_weight, fx.Node) or dq_weight.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + dq_bias: fx.Node | None = None + if 
len(anchor_node.args) > 2: + bias_arg = anchor_node.args[2] + if isinstance(bias_arg, fx.Node) and bias_arg.target == DQ_PER_TENSOR: + dq_bias = bias_arg + return fuse_linear( + gm, + dq_input, + dq_weight, + dq_bias, + quant_node, + anchor_node, + self.replacement_op(), + ) + class MatmulPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -488,6 +736,18 @@ def replacement_op(self) -> OpOverload: # TODO: T240804887 This is actually a per-tensor variant, we just need to change the name of the op return torch.ops.cadence.quantized_matmul.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq0 = anchor_node.args[0] + if not isinstance(dq0, fx.Node) or dq0.target != DQ_PER_TENSOR: + return None + dq1 = anchor_node.args[1] + if not isinstance(dq1, fx.Node) or dq1.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + return fuse_matmul(gm, anchor_node, dq0, dq1, quant_node, self.replacement_op()) + class MaxPool2dPattern(QuantizationPattern): """ From 5395f2084ee1ef1243ad30309cc7c74b93e9f683 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Fri, 29 May 2026 16:56:01 -0700 Subject: [PATCH 087/103] [MLX][Gemma4] Add turbo quant support (#19866) Add TurboQuant TQ4 KV cache to the MLX backend, exposed on gemma4_31b via --turboquant. Compresses full-attention KV cache from bf16 to a 4-bit codebook + per-vector norms, letting Gemma 4 31B-IT scale to very long contexts. Sliding-window layers are unchanged. What's in the PR New cache subclass: - backends/mlx/llm/turboquant_cache.py: MLXTurboQuantKVCache, a drop-in subclass of TurboQuantKVCache. Three custom ops + Metal kernels: - mlx::tq4_compress (model_ops/tq4_compress.py): bucketize + cast(uint8) + nibble-pack in one kernel. - mlx::tq_norm (model_ops/tq_norm.py): L2 norm with simd_sum cross-lane reduction in fp32 registers; bf16 in / bf16 out. 
- mlx::tq_dequant (model_ops/tq_dequant.py): unpack + centroid gather + multiply-by-norm in one kernel. Per-op tests: - test_tq4_compress.py, test_tq_norm.py, test_tq_dequant.py Wiring: - examples/models/gemma4_31b/mlx_source_transformations.py: - examples/models/gemma4_31b/export.py: --turboquant CLI flag - examples/models/gemma4_31b/README.md: TurboQuant subsection. Perf on M4 Max 64GB Ram: ``` 2K prompt: bf16 cache: prefill 189.7 tok/s, decode 17.4 tok/s TurboQuant cache: prefill 187.7 tok/s, decode 16.9 tok/s 8K prompt: bf16 cache: prefill 170.0 tok/s, decode 17.1 tok/s TurboQuant cache: prefill 166.0 tok/s, decode 11.9 tok/s ``` For TQ, max context length is set to 64K. On bf16 cache, max context length is 10K. TODO: why does decode slow more for TQ than bf16? --- .github/workflows/mlx.yml | 12 + backends/mlx/builder/op_helpers.py | 112 +++++ backends/mlx/llm/turboquant_cache.py | 243 +++++++++++ backends/mlx/model_ops/test_tq4_compress.py | 183 ++++++++ backends/mlx/model_ops/test_tq_dequant.py | 166 ++++++++ backends/mlx/model_ops/test_tq_norm.py | 150 +++++++ backends/mlx/model_ops/tq4_compress.py | 189 +++++++++ backends/mlx/model_ops/tq_dequant.py | 216 ++++++++++ backends/mlx/model_ops/tq_norm.py | 170 ++++++++ backends/mlx/test/op_test_runner.cpp | 12 + backends/mlx/test/test_ops.py | 396 ++++++++++++++++++ backends/mlx/test/test_utils.py | 5 + examples/models/gemma4_31b/README.md | 18 + examples/models/gemma4_31b/export.py | 44 +- .../gemma4_31b/mlx_source_transformations.py | 73 +++- 15 files changed, 1961 insertions(+), 28 deletions(-) create mode 100644 backends/mlx/llm/turboquant_cache.py create mode 100644 backends/mlx/model_ops/test_tq4_compress.py create mode 100644 backends/mlx/model_ops/test_tq_dequant.py create mode 100644 backends/mlx/model_ops/test_tq_norm.py create mode 100644 backends/mlx/model_ops/tq4_compress.py create mode 100644 backends/mlx/model_ops/tq_dequant.py create mode 100644 backends/mlx/model_ops/tq_norm.py diff --git 
a/.github/workflows/mlx.yml b/.github/workflows/mlx.yml index 027101ba7f0..c51f126dbe6 100644 --- a/.github/workflows/mlx.yml +++ b/.github/workflows/mlx.yml @@ -80,6 +80,18 @@ jobs: ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_gated_delta_rule run -v echo "::endgroup::" + echo "::group::Run tq_norm op tests" + ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_norm run -v + echo "::endgroup::" + + echo "::group::Run tq4_compress op tests" + ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq4_compress run -v + echo "::endgroup::" + + echo "::group::Run tq_dequant op tests" + ${CONDA_RUN} python -m executorch.backends.mlx.model_ops.test_tq_dequant run -v + echo "::endgroup::" + test-mlx-qwen35-moe: uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: diff --git a/backends/mlx/builder/op_helpers.py b/backends/mlx/builder/op_helpers.py index 7740546cc2c..be199f75340 100644 --- a/backends/mlx/builder/op_helpers.py +++ b/backends/mlx/builder/op_helpers.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: from executorch.backends.mlx.builder.program_builder import MLXProgramBuilder + from executorch.backends.mlx.serialization.mlx_graph_schema import IntOrVid # When True, always serialize the biases tensor for quantized ops. # When False, use init-time computation when zero_point is all zeros, @@ -173,6 +174,117 @@ def emit_lifted_constant(P: "MLXProgramBuilder", value, dtype: torch.dtype) -> S return slot +def emit_shape( + P: "MLXProgramBuilder", + node: Node, + slot: Slot, + *, + end_dim: "Optional[int]" = None, +) -> "list[IntOrVid]": + """Return the shape of ``node`` as a list of ``IntOrVid``. + + Each static dim becomes a literal ``IntOrVid``; each dynamic dim + emits a ``SymSizeNode`` against ``slot`` and is wrapped via + ``P.to_int_or_vid``. + + Args: + P: program builder. + node: FX node whose shape to walk (must have ``meta['val']``). 
+ slot: slot corresponding to ``node`` (used as the + ``SymSize`` source for any dynamic dim). + end_dim: stop index (exclusive). ``None`` means the full ndim. + Negative values index from the end (e.g. ``-1`` is "all + leading dims, drop the last"). + + Returns: + ``list[IntOrVid]`` of length ``end_dim`` (after normalization). + """ + from executorch.backends.mlx.serialization.mlx_graph_schema import ( + IntOrVid, + SymSizeNode, + ) + + shape = node.meta["val"].shape + ndim = len(shape) + if end_dim is None: + end_dim = ndim + elif end_dim < 0: + end_dim += ndim + + out: "list[IntOrVid]" = [] + for dim_idx in range(end_dim): + s = shape[dim_idx] + if isinstance(s, int): + out.append(IntOrVid.from_literal(int(s))) + else: + _, d_val = P.make_tmp_value_slot() + P.emit( + SymSizeNode( + a=P.slot_to_tid(slot), + dim=dim_idx, + out=P.slot_to_vid(d_val), + ) + ) + out.append(P.to_int_or_vid(d_val)) + return out + + +def emit_product( + P: "MLXProgramBuilder", + dims: "list[IntOrVid]", +) -> "IntOrVid": + """Multiplicative reduction over a list of ``IntOrVid`` values. + + Folds all literal entries AOT into a single static product, then + emits ``MultiplyIntNode`` only for the dynamic entries (and one + final node combining the static product with the dynamic accumulator + when both contribute). + + Args: + P: program builder. + dims: list of ``IntOrVid``. May be empty (returns + ``IntOrVid.from_literal(1)``), all literals, or a mix. + + Returns: + An ``IntOrVid`` representing the product. Always literal when + every entry is literal (or ``dims`` is empty). 
+ """ + from executorch.backends.mlx.serialization.mlx_graph_schema import ( + IntOrVid, + MultiplyIntNode, + ) + + static_product = 1 + dynamic_dims: "list[IntOrVid]" = [] + for d in dims: + if d.is_vid: + dynamic_dims.append(d) + else: + static_product *= d.literal + + if not dynamic_dims: + return IntOrVid.from_literal(static_product) + + acc = dynamic_dims[0] + for d in dynamic_dims[1:]: + _, acc_val = P.make_tmp_value_slot() + P.emit(MultiplyIntNode(a=acc, b=d, out=P.slot_to_vid(acc_val))) + acc = P.to_int_or_vid(acc_val) + + if static_product == 1: + return acc + + _, final_val = P.make_tmp_value_slot() + P.emit( + MultiplyIntNode( + a=IntOrVid.from_literal(static_product), + b=acc, + out=P.slot_to_vid(final_val), + ) + ) + return P.to_int_or_vid(final_val) + + def emit_quantized_biases( P: "MLXProgramBuilder", zero_point_key: str, diff --git a/backends/mlx/llm/turboquant_cache.py b/backends/mlx/llm/turboquant_cache.py new file mode 100644 index 00000000000..7f2109ba074 --- /dev/null +++ b/backends/mlx/llm/turboquant_cache.py @@ -0,0 +1,243 @@ +#!/usr/bin/env python3 +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +TurboQuant TQ4 KV cache for the MLX backend. + +Subclass of the backend-agnostic +``extension/llm/modules/turboquant/kv_cache.py::TurboQuantKVCache``. + +The cache stores K and V in **rotated space** (post-multiplied by R^T) +as nibble-packed uint8 codebook indices plus per-vector bf16 norms. +SDPA runs in rotated space and undoes the rotation on the output side +(both Q and output rotations are ``T_q × D²``, much smaller than +applying the inverse rotation to K/V which would be ``T_kv × D²``). + +Reference: + TurboQuant: Online Vector Quantization with Near-optimal + Distortion Rate. arXiv:2504.19874 (ICLR 2026). 
+""" + +from typing import Optional, Tuple + +# Register the MLX custom ops used by this cache. +import executorch.backends.mlx.custom_ops # noqa: F401 mlx::custom_sdpa, mlx::kv_cache_update +import executorch.backends.mlx.model_ops.tq4_compress # noqa: F401 mlx::tq4_compress +import executorch.backends.mlx.model_ops.tq_dequant # noqa: F401 mlx::tq_dequant +import executorch.backends.mlx.model_ops.tq_norm # noqa: F401 mlx::tq_norm + +import torch + +from executorch.extension.llm.modules.turboquant.kv_cache import ( + TurboQuantKVCache as _SharedTurboQuantKVCache, +) + + +class TurboQuantKVCache(_SharedTurboQuantKVCache): + """ + TurboQuant TQ4 KV cache, MLX-backend variant. + + Drop-in replacement for ``backends/mlx/llm/cache.py::KVCache``. + + Args: + max_batch_size: Must be 1 (TQ4 is batch=1 only). + max_context_length: Maximum sequence length. + n_heads: Number of KV heads. + head_dim: Per-head dimension. Must be even and a multiple of 64. + enable_dynamic_shape: Accepted for interface parity; ignored. + dtype: Compute dtype (bf16). Used for pre-cast buffers. + bits: Quantization bits (must be 4). + seed: RNG seed for the orthogonal rotation matrix. + """ + + def __init__( + self, + max_batch_size: int, + max_context_length: int, + n_heads: int, + head_dim: int, + enable_dynamic_shape: bool, + dtype: torch.dtype = torch.bfloat16, + bits: int = 4, + seed: int = 42, + ): + if max_batch_size != 1: + raise ValueError( + f"TurboQuantKVCache only supports max_batch_size=1, " + f"got {max_batch_size}" + ) + if bits != 4: + raise ValueError( + f"TurboQuantKVCache only supports bits=4 " + f"(16-entry codebook), got bits={bits}" + ) + # MLX-backend Metal kernels need ``head_dim % 64 == 0``: ``tq_norm`` + # uses 32 SIMD lanes (so D must be a multiple of 32), and + # ``tq_dequant`` packs 2 dims per byte across 32 lanes (so D must + # be a multiple of 64). Take the stricter constraint here. 
+ if head_dim % 64 != 0: + raise ValueError( + f"TurboQuantKVCache requires head_dim to be " + f"a multiple of 64 (Metal SIMD + 4-bit pack constraint), " + f"got {head_dim}" + ) + super().__init__( + n_heads=n_heads, + head_dim=head_dim, + max_seq_len=max_context_length, + bits=bits, + seed=seed, + ) + self.max_batch_size = max_batch_size + self.max_context_length = max_context_length + self.enable_dynamic_shape = enable_dynamic_shape + + # Replace parent's fp32 ``rotation`` and ``centroids`` buffers + # with compute-dtype versions in-place. Avoids a per-call + # ``_to_copy`` cast in the lowered graph at every use site. + # Parent's ``_decompress`` (testing-only) is the sole consumer + # of these as fp32 and is not called at runtime. + self.register_buffer( + "rotation", + self.rotation.to(dtype).contiguous(), + persistent=False, + ) + self.register_buffer( + "centroids", + self.centroids.to(dtype).contiguous(), + persistent=False, + ) + # Pre-cast eps for the divide-by-zero guard in _compress. + self.register_buffer( + "norm_eps", + torch.tensor(1e-10, dtype=dtype), + persistent=False, + ) + + def _compress(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + """Compress ``(1, H, T, D)`` → packed ``(1, H, T, D//2)`` u8 + + norms ``(1, H, T, 1)`` bf16. + + The L2-norm reduction uses ``mlx::tq_norm`` (one Metal kernel + with fp32 sum-of-squares in registers via ``simd_sum``); the + bucketize + nibble-pack tail uses ``mlx::tq4_compress`` (one + Metal kernel for both steps). 
+ """ + orig_shape = x.shape + flat = x.reshape(-1, self.head_dim) + + norms = torch.ops.mlx.tq_norm(flat) + normalized = flat / (norms + self.norm_eps) + rotated = normalized @ self.rotation_T + packed = torch.ops.mlx.tq4_compress(rotated, self.boundaries) + + return ( + packed.reshape(*orig_shape[:-1], self.half_dim), + norms.reshape(*orig_shape[:-1], 1), + ) + + def update( + self, + input_pos, + k_val: torch.Tensor, + v_val: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """Compress + write K/V at ``input_pos``, return the full + compressed cache buffers. + + Accepts ``input_pos`` as either a ``(T,)`` LongTensor of + positions or a Python int / SymInt ``start_pos``. Writes go + through ``mlx::kv_cache_update`` (matching the non-TQ + ``MLXKVCache`` path) which lowers to a tighter in-place + scatter than ``index_copy_`` would. + """ + if isinstance(input_pos, torch.Tensor): + start_pos = input_pos[0].item() + seq_len = k_val.size(2) + torch._check(seq_len == v_val.size(2)) + torch._check(start_pos >= 0) + torch._check(start_pos + seq_len <= self.max_context_length) + else: + start_pos = input_pos + + k_packed, k_norms = self._compress(k_val) + v_packed, v_norms = self._compress(v_val) + + torch.ops.mlx.kv_cache_update(self.k_packed, k_packed, start_pos) + torch.ops.mlx.kv_cache_update(self.k_norms, k_norms, start_pos) + torch.ops.mlx.kv_cache_update(self.v_packed, v_packed, start_pos) + torch.ops.mlx.kv_cache_update(self.v_norms, v_norms, start_pos) + + # Slices on the return create new graph nodes so the same node + # is not both BUFFER_MUTATION and USER_OUTPUT. + return ( + self.k_packed[:, :, :, :], + self.k_norms[:, :, :, :], + self.v_packed[:, :, :, :], + self.v_norms[:, :, :, :], + ) + + # forward() is inherited from the parent (delegates to update). + + def sdpa( + self, + query: torch.Tensor, + start_pos, + scale: Optional[float] = None, + ) -> torch.Tensor: + """SDPA over the compressed cache. 
+ + Runs attention in rotated space: + 1. Q_rot = Q @ R^T (T_q x D^2) + 2. K_rot, V_rot = tq_dequant(...) (rotated-space K/V) + 3. out_rot = custom_sdpa(Q_rot, K_rot, V_rot, ...) + 4. out = out_rot @ R (T_q x D^2) + + Since R is orthogonal, score = (Q·R^T)·(K·R^T)^T = Q·K^T, so + attention is invariant under matched rotation of Q and K. The + ``T_kv x D^2`` inverse-rotation matmul on K/V is replaced with + two ``T_q x D^2`` matmuls (Q and output). + + Args: + query: ``(B, H_q, T_q, D)`` bf16. + start_pos: int or SymInt — absolute position of the first + query token. + scale: 1/sqrt(D) if None. + + Returns: + ``(B, H_q, T_q, D)`` bf16 attention output, in original + (un-rotated) space. + """ + seq_len = query.size(2) + end_pos = start_pos + seq_len + torch._check(start_pos >= 0) + torch._check(end_pos <= self.max_context_length) + + q_rot = query @ self.rotation_T + + k_packed_live = self.k_packed[:, :, :end_pos, :] + k_norms_live = self.k_norms[:, :, :end_pos, :] + v_packed_live = self.v_packed[:, :, :end_pos, :] + v_norms_live = self.v_norms[:, :, :end_pos, :] + + # TODO: optimize with a fused dequant + SDPA + k_rot = torch.ops.mlx.tq_dequant(k_packed_live, k_norms_live, self.centroids) + v_rot = torch.ops.mlx.tq_dequant(v_packed_live, v_norms_live, self.centroids) + + out_rot = torch.ops.mlx.custom_sdpa( + q_rot, + k_rot, + v_rot, + start_pos, + None, # attn_mask + 0.0, # dropout_p + True, # is_causal + scale, + ) + + return out_rot @ self.rotation diff --git a/backends/mlx/model_ops/test_tq4_compress.py b/backends/mlx/model_ops/test_tq4_compress.py new file mode 100644 index 00000000000..c2aaa13afa7 --- /dev/null +++ b/backends/mlx/model_ops/test_tq4_compress.py @@ -0,0 +1,183 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Tests for ``mlx::tq4_compress``. 
+ +Verifies the fused Metal kernel produces byte-exact output vs the +eager Python implementation across head_dim values used by TurboQuant. + +Usage:: + + python -m executorch.backends.mlx.model_ops.test_tq4_compress run + python -m executorch.backends.mlx.model_ops.test_tq4_compress run -v + python -m executorch.backends.mlx.model_ops.test_tq4_compress run --rebuild +""" + +from typing import List, Tuple + +import executorch.backends.mlx.model_ops.tq4_compress # noqa: F401 + +import torch +import torch.nn as nn + +from executorch.backends.mlx.test.test_utils import OpTestCase + + +class TQ4CompressModel(nn.Module): + """``values → packed`` via ``mlx::tq4_compress``. + + Boundaries are stored as a buffer so the model is exportable + without feeding them as a graph input. + """ + + def __init__(self, head_dim: int, dtype: torch.dtype = torch.bfloat16): + super().__init__() + # 15 sorted thresholds (4-bit codebook). + self.register_buffer( + "boundaries", + torch.linspace(-0.2, 0.2, 15, dtype=dtype), + ) + + def forward(self, values: torch.Tensor) -> torch.Tensor: + return torch.ops.mlx.tq4_compress(values, self.boundaries) + + +class TQ4CompressTest(OpTestCase): + """Byte-exact comparison vs eager bucketize + nibble-pack.""" + + name = "tq4_compress" + rtol = 0.0 + atol = 0.0 + + def __init__( + self, + batch_size: int = 1, + n_heads: int = 8, + seq_len: int = 4, + head_dim: int = 128, + dtype: torch.dtype = torch.bfloat16, + ): + self.batch_size = batch_size + self.n_heads = n_heads + self.seq_len = seq_len + self.head_dim = head_dim + self.dtype = dtype + + parts = [ + "tq4_compress", + f"b{batch_size}", + f"h{n_heads}", + f"t{seq_len}", + f"d{head_dim}", + ] + if dtype != torch.bfloat16: + parts.append(str(dtype).split(".")[-1]) + self.name = "_".join(parts) + + @classmethod + def get_test_configs(cls) -> List["TQ4CompressTest"]: + return [ + # head_dim=128 (Qwen3.5 MoE / Gemma 4 sliding) + cls(seq_len=1, head_dim=128), + cls(seq_len=8, head_dim=128), + 
cls(seq_len=64, head_dim=128), + cls(n_heads=1, seq_len=1, head_dim=128), + # head_dim=256 (Gemma 4 sliding-attention) + cls(head_dim=256), + cls(seq_len=16, head_dim=256), + # head_dim=512 (Gemma 4 31B full-attention) + cls(n_heads=4, seq_len=4, head_dim=512), + cls(n_heads=4, seq_len=64, head_dim=512), + # Smaller D for sanity + cls(head_dim=64, n_heads=2, seq_len=4), + ] + + def create_model(self) -> nn.Module: + return TQ4CompressModel(head_dim=self.head_dim, dtype=self.dtype).to(self.dtype) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + # Activation-scale values; the kernel is byte-exact regardless + # of magnitude as long as values fall within the bucketize + # comparison range. + values = torch.randn( + self.batch_size, + self.n_heads, + self.seq_len, + self.head_dim, + dtype=self.dtype, + ) * (1.0 / (self.head_dim**0.5)) + return (values,) + + +if __name__ == "__main__": # noqa: C901 + import argparse + import sys + + from executorch.backends.mlx.test.test_utils import rebuild_op_test_runner + + parser = argparse.ArgumentParser(description="Test mlx::tq4_compress op") + parser.add_argument( + "action", + choices=["generate", "compare", "run", "list"], + help="Action: generate (export), compare (check outputs), run (full), list (show configs)", + ) + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument( + "--rebuild", action="store_true", help="Rebuild C++ runner first" + ) + parser.add_argument( + "--config", type=str, default=None, help="Run specific config by name" + ) + args = parser.parse_args() + + if args.rebuild and not rebuild_op_test_runner(verbose=args.verbose): + sys.exit(1) + + configs = TQ4CompressTest.get_test_configs() + + if args.action == "list": + for cfg in configs: + print(f" {cfg.name}") + sys.exit(0) + + if args.config: + configs = [c for c in configs if c.name == args.config] + if not configs: + print(f"No config matching '{args.config}'") + sys.exit(1) + + passed = 0 + failed = 0 + 
failed_names: List[str] = [] + + for test in configs: + if args.action == "generate": + pte_path, _, _ = test.generate_test_files(verbose=args.verbose) + print(f"Generated: {pte_path}") + elif args.action == "compare": + actual_path = test.get_test_dir() / "actual_output.bin" + ok, msg = test.compare_with_actual(actual_path) + print(f"{'✓' if ok else '✗'} {test.name}: {msg}") + if ok: + passed += 1 + else: + failed += 1 + failed_names.append(test.name) + elif args.action == "run": + ok = test.run_test(verbose=args.verbose) + if ok: + passed += 1 + else: + failed += 1 + failed_names.append(test.name) + + if args.action in ("run", "compare"): + print(f"\nPassed: {passed}, Failed: {failed}") + if failed_names: + print(f"Failed: {', '.join(failed_names)}") + sys.exit(0 if failed == 0 else 1) diff --git a/backends/mlx/model_ops/test_tq_dequant.py b/backends/mlx/model_ops/test_tq_dequant.py new file mode 100644 index 00000000000..07d9deb895a --- /dev/null +++ b/backends/mlx/model_ops/test_tq_dequant.py @@ -0,0 +1,166 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Tests for ``mlx::tq_dequant``. + +Verifies the fused unpack + gather + multiply Metal kernel matches +the eager reference at head_dim values used by TurboQuant +(D ∈ {128, 256, 512}). Output is byte-exact — no fp32 promotion in +either path. 
+ +Usage:: + + python -m executorch.backends.mlx.model_ops.test_tq_dequant run + python -m executorch.backends.mlx.model_ops.test_tq_dequant run -v + python -m executorch.backends.mlx.model_ops.test_tq_dequant run --rebuild +""" + +from typing import List, Tuple + +import executorch.backends.mlx.model_ops.tq_dequant # noqa: F401 + +import torch +import torch.nn as nn + +from executorch.backends.mlx.test.test_utils import OpTestCase + + +class TQDequantModel(nn.Module): + """``packed, norms, centroids → unrotated``.""" + + def forward( + self, + packed: torch.Tensor, + norms: torch.Tensor, + centroids: torch.Tensor, + ) -> torch.Tensor: + return torch.ops.mlx.tq_dequant(packed, norms, centroids) + + +class TQDequantTest(OpTestCase): + """Byte-exact comparison vs eager unpack + gather + multiply.""" + + name = "tq_dequant" + rtol = 0.0 + atol = 0.0 + + def __init__( + self, + batch_size: int = 1, + n_heads: int = 8, + seq_len: int = 4, + head_dim: int = 128, + ): + self.batch_size = batch_size + self.n_heads = n_heads + self.seq_len = seq_len + self.head_dim = head_dim + self.half_dim = head_dim // 2 + self.name = f"tq_dequant_b{batch_size}_h{n_heads}_t{seq_len}_d{head_dim}" + + @classmethod + def get_test_configs(cls) -> List["TQDequantTest"]: + return [ + # head_dim=128 (Qwen3.5 MoE / Gemma 4 sliding) + cls(seq_len=1, head_dim=128), + cls(seq_len=8, head_dim=128), + cls(seq_len=64, head_dim=128), + cls(n_heads=1, seq_len=1, head_dim=128), + # head_dim=256 (Gemma 4 sliding-attention) + cls(seq_len=4, head_dim=256), + cls(seq_len=16, head_dim=256), + # head_dim=512 (Gemma 4 31B full-attention) + cls(n_heads=4, seq_len=4, head_dim=512), + cls(n_heads=4, seq_len=64, head_dim=512), + ] + + def create_model(self) -> nn.Module: + return TQDequantModel() + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + # Random packed bytes exercise every codebook entry. 
+ packed = torch.randint( + 0, + 256, + (self.batch_size, self.n_heads, self.seq_len, self.half_dim), + dtype=torch.uint8, + ) + norms = ( + torch.randn( + self.batch_size, + self.n_heads, + self.seq_len, + 1, + dtype=torch.bfloat16, + ).abs() + + 0.1 + ) + # Deterministic codebook covering [-1, 1]. + centroids = torch.linspace(-1.0, 1.0, 16, dtype=torch.bfloat16) + return (packed, norms, centroids) + + +if __name__ == "__main__": # noqa: C901 + import argparse + import sys + + from executorch.backends.mlx.test.test_utils import rebuild_op_test_runner + + parser = argparse.ArgumentParser(description="Test mlx::tq_dequant op") + parser.add_argument("action", choices=["generate", "compare", "run", "list"]) + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument("--rebuild", action="store_true") + parser.add_argument("--config", type=str, default=None) + args = parser.parse_args() + + if args.rebuild and not rebuild_op_test_runner(verbose=args.verbose): + sys.exit(1) + + configs = TQDequantTest.get_test_configs() + + if args.action == "list": + for cfg in configs: + print(f" {cfg.name}") + sys.exit(0) + + if args.config: + configs = [c for c in configs if c.name == args.config] + if not configs: + print(f"No config matching '{args.config}'") + sys.exit(1) + + passed = 0 + failed = 0 + failed_names: List[str] = [] + + for test in configs: + if args.action == "generate": + pte_path, _, _ = test.generate_test_files(verbose=args.verbose) + print(f"Generated: {pte_path}") + elif args.action == "compare": + actual_path = test.get_test_dir() / "actual_output.bin" + ok, msg = test.compare_with_actual(actual_path) + print(f"{'✓' if ok else '✗'} {test.name}: {msg}") + if ok: + passed += 1 + else: + failed += 1 + failed_names.append(test.name) + elif args.action == "run": + ok = test.run_test(verbose=args.verbose) + if ok: + passed += 1 + else: + failed += 1 + failed_names.append(test.name) + + if args.action in ("run", "compare"): + print(f"\nPassed: 
{passed}, Failed: {failed}") + if failed_names: + print(f"Failed: {', '.join(failed_names)}") + sys.exit(0 if failed == 0 else 1) diff --git a/backends/mlx/model_ops/test_tq_norm.py b/backends/mlx/model_ops/test_tq_norm.py new file mode 100644 index 00000000000..35c4491d8ae --- /dev/null +++ b/backends/mlx/model_ops/test_tq_norm.py @@ -0,0 +1,150 @@ +#!/usr/bin/env python3 +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +""" +Tests for ``mlx::tq_norm``. + +Verifies the fused L2-norm Metal kernel matches eager ``vector_norm`` +at head_dim values used by TurboQuant (D ∈ {128, 256, 512}). + +Usage:: + + python -m executorch.backends.mlx.model_ops.test_tq_norm run + python -m executorch.backends.mlx.model_ops.test_tq_norm run -v + python -m executorch.backends.mlx.model_ops.test_tq_norm run --rebuild +""" + +from typing import List, Tuple + +import executorch.backends.mlx.model_ops.tq_norm # noqa: F401 + +import torch +import torch.nn as nn + +from executorch.backends.mlx.test.test_utils import OpTestCase + + +class TQNormModel(nn.Module): + """``x → ||x||₂`` over the last dim.""" + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.mlx.tq_norm(x) + + +class TQNormTest(OpTestCase): + """Compare ``mlx::tq_norm`` to eager ``vector_norm`` within bf16 ULPs.""" + + name = "tq_norm" + rtol = 1e-2 + atol = 1e-2 + + def __init__( + self, + batch_size: int = 1, + n_heads: int = 8, + seq_len: int = 4, + head_dim: int = 128, + ): + self.batch_size = batch_size + self.n_heads = n_heads + self.seq_len = seq_len + self.head_dim = head_dim + self.name = f"tq_norm_b{batch_size}_h{n_heads}_t{seq_len}_d{head_dim}" + + @classmethod + def get_test_configs(cls) -> List["TQNormTest"]: + return [ + # head_dim=128 (Qwen3.5 MoE / Gemma 4 sliding) + cls(seq_len=1, head_dim=128), + cls(seq_len=8, 
head_dim=128), + cls(seq_len=64, head_dim=128), + cls(n_heads=1, seq_len=1, head_dim=128), + # head_dim=256 (Gemma 4 sliding-attention) + cls(seq_len=4, head_dim=256), + cls(seq_len=16, head_dim=256), + # head_dim=512 (Gemma 4 31B full-attention) + cls(n_heads=4, seq_len=4, head_dim=512), + cls(n_heads=4, seq_len=64, head_dim=512), + ] + + def create_model(self) -> nn.Module: + return TQNormModel().to(torch.bfloat16) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + # Activation-scale bf16 inputs. + x = torch.randn( + self.batch_size, + self.n_heads, + self.seq_len, + self.head_dim, + dtype=torch.bfloat16, + ) * (1.0 / (self.head_dim**0.5)) + return (x,) + + +if __name__ == "__main__": # noqa: C901 + import argparse + import sys + + from executorch.backends.mlx.test.test_utils import rebuild_op_test_runner + + parser = argparse.ArgumentParser(description="Test mlx::tq_norm op") + parser.add_argument( + "action", + choices=["generate", "compare", "run", "list"], + ) + parser.add_argument("--verbose", "-v", action="store_true") + parser.add_argument("--rebuild", action="store_true") + parser.add_argument("--config", type=str, default=None) + args = parser.parse_args() + + if args.rebuild and not rebuild_op_test_runner(verbose=args.verbose): + sys.exit(1) + + configs = TQNormTest.get_test_configs() + + if args.action == "list": + for cfg in configs: + print(f" {cfg.name}") + sys.exit(0) + + if args.config: + configs = [c for c in configs if c.name == args.config] + if not configs: + print(f"No config matching '{args.config}'") + sys.exit(1) + + passed = 0 + failed = 0 + failed_names: List[str] = [] + + for test in configs: + if args.action == "generate": + pte_path, _, _ = test.generate_test_files(verbose=args.verbose) + print(f"Generated: {pte_path}") + elif args.action == "compare": + actual_path = test.get_test_dir() / "actual_output.bin" + ok, msg = test.compare_with_actual(actual_path) + print(f"{'✓' if ok else '✗'} {test.name}: {msg}") + if ok: + passed 
+= 1 + else: + failed += 1 + failed_names.append(test.name) + elif args.action == "run": + ok = test.run_test(verbose=args.verbose) + if ok: + passed += 1 + else: + failed += 1 + failed_names.append(test.name) + + if args.action in ("run", "compare"): + print(f"\nPassed: {passed}, Failed: {failed}") + if failed_names: + print(f"Failed: {', '.join(failed_names)}") + sys.exit(0 if failed == 0 else 1) diff --git a/backends/mlx/model_ops/tq4_compress.py b/backends/mlx/model_ops/tq4_compress.py new file mode 100644 index 00000000000..f08d47b9a11 --- /dev/null +++ b/backends/mlx/model_ops/tq4_compress.py @@ -0,0 +1,189 @@ +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# + +""" +``mlx::tq4_compress``: TurboQuant TQ4 quantize + nibble-pack. + +Maps ``(..., D)`` floats to ``(..., D/2)`` uint8 by: + 1. Bucketizing each value against ``boundaries`` (15 sorted thresholds). + 2. Packing pairs of 4-bit indices into one byte: high nibble holds + the even-position index, low nibble holds the odd-position index. + +Constraints: + * ``boundaries`` must be 1-D length 15 (4-bit codebook). + * Last dim of ``values`` must be even and statically known. + +Usage:: + + import executorch.backends.mlx.model_ops.tq4_compress # noqa: F401 + + packed = torch.ops.mlx.tq4_compress(rotated, boundaries) + # rotated: (..., D) float + # boundaries: (15,) same dtype as rotated + # packed: (..., D/2) uint8 +""" + +from __future__ import annotations + +import torch +from torch import Tensor +from torch.fx.node import Node + + +@torch.library.custom_op("mlx::tq4_compress", mutates_args=()) +def tq4_compress(values: Tensor, boundaries: Tensor) -> Tensor: + """TurboQuant TQ4 quantize + nibble-pack. + + Args: + values: ``(..., D)`` float, last dim must be even. + boundaries: ``(15,)`` 1-D sorted, same dtype as ``values``. 
+ + Returns: + ``(..., D/2)`` uint8. Each byte holds two 4-bit indices: high + nibble is the even-position index, low nibble is the odd. + """ + if boundaries.dim() != 1 or boundaries.shape[0] != 15: + raise ValueError( + f"mlx::tq4_compress: boundaries must be 1-D length 15; " + f"got shape {tuple(boundaries.shape)}" + ) + if values.shape[-1] % 2 != 0: + raise ValueError( + f"mlx::tq4_compress: input last dim must be even; got " + f"{values.shape[-1]}" + ) + + indices = torch.bucketize(values, boundaries).to(torch.uint8) + packed = (indices[..., 0::2] << 4) | indices[..., 1::2] + return packed + + +@torch.library.register_fake("mlx::tq4_compress") +def tq4_compress_fake(values: Tensor, boundaries: Tensor) -> Tensor: + out_shape = list(values.shape) + out_shape[-1] = out_shape[-1] // 2 + return values.new_empty(out_shape, dtype=torch.uint8) + + +# --------------------------------------------------------------------------- +# MLX handler +# --------------------------------------------------------------------------- + +from executorch.backends.mlx.builder.op_helpers import ( + emit_product, + emit_shape, + torch_dtype_to_scalar_type, +) +from executorch.backends.mlx.builder.op_registry import REGISTRY +from executorch.backends.mlx.builder.program_builder import MLXProgramBuilder +from executorch.backends.mlx.builder.slot_manager import Slot +from executorch.backends.mlx.serialization.mlx_graph_schema import ( + IntOrVid, + MetalKernelNode, +) + + +# One thread per output byte: reads ``values[2*gid]``, ``values[2*gid+1]``, +# bucketizes against the 15 boundaries (loop unrolled, ``B`` is a template +# constant), and packs the two 4-bit indices into one byte. 
+_TQ4_COMPRESS_SOURCE = """ + uint gid = thread_position_in_grid.x; + float v_hi = float(values[2 * gid]); + float v_lo = float(values[2 * gid + 1]); + uchar idx_hi = 0; + uchar idx_lo = 0; + #pragma unroll + for (uint i = 0; i < B; ++i) { + float bnd = float(boundaries[i]); + idx_hi += (uchar)(v_hi > bnd); + idx_lo += (uchar)(v_lo > bnd); + } + out[gid] = (idx_hi << 4) | idx_lo; +""" + + +@REGISTRY.register(target=[torch.ops.mlx.tq4_compress.default]) +def _tq4_compress_handler(P: MLXProgramBuilder, n: Node) -> Slot: + """Lower ``mlx::tq4_compress`` to a fused Metal kernel.""" + args = P.args(n) + if len(args) != 2: + raise ValueError( + f"mlx::tq4_compress: expected 2 args (values, boundaries), " + f"got {len(args)}" + ) + + values_slot, boundaries_slot = args + values_node = n.args[0] + boundaries_node = n.args[1] + + values_meta = values_node.meta["val"] + boundaries_meta = boundaries_node.meta["val"] + + # Validate boundaries length: must be 15 for 4-bit nibble pack. + bnd_shape = boundaries_meta.shape + if ( + len(bnd_shape) != 1 + or not isinstance(bnd_shape[0], int) + or int(bnd_shape[0]) != 15 + ): + raise ValueError( + f"mlx::tq4_compress: boundaries must be 1-D length 15; " + f"got shape {tuple(bnd_shape)}" + ) + + last_dim = values_meta.shape[-1] + if not isinstance(last_dim, int): + raise NotImplementedError( + "mlx::tq4_compress: last dim must be statically known" + ) + if int(last_dim) % 2 != 0: + raise ValueError(f"mlx::tq4_compress: last dim must be even; got {last_dim}") + half_last = int(last_dim) // 2 + + in_dtype_int = torch_dtype_to_scalar_type(values_meta.dtype) + + out = P.make_or_get_slot(n) + leading = emit_shape(P, values_node, values_slot, end_dim=-1) + half_last_iov = IntOrVid.from_literal(half_last) + out_shape_flat = leading + [half_last_iov] + + # One thread per output byte, so the grid size is the output numel + # (product of leading dims times the halved last dim). 
+ n_out_iov = emit_product(P, leading + [half_last_iov]) + + P.emit( + MetalKernelNode( + name="tq4_compress", + source=_TQ4_COMPRESS_SOURCE, + inputs=[ + P.slot_to_tid(values_slot), + P.slot_to_tid(boundaries_slot), + ], + outputs=[P.slot_to_tid(out)], + grid=[n_out_iov, IntOrVid.from_literal(1), IntOrVid.from_literal(1)], + # 32 threads per threadgroup so each TG fills one Apple-GPU SIMD group + threadgroup=[ + IntOrVid.from_literal(32), + IntOrVid.from_literal(1), + IntOrVid.from_literal(1), + ], + input_names=["values", "boundaries"], + output_names=["out"], + output_shapes_flat=out_shape_flat, + output_shape_lengths=[len(out_shape_flat)], + output_dtypes=[torch_dtype_to_scalar_type(torch.uint8)], + template_arg_names=["InT", "B"], + template_arg_kinds=[2, 0], # 2=dtype, 0=int + template_arg_values=[ + in_dtype_int, + 15, + ], + ) + ) + + return out diff --git a/backends/mlx/model_ops/tq_dequant.py b/backends/mlx/model_ops/tq_dequant.py new file mode 100644 index 00000000000..28a168e9be0 --- /dev/null +++ b/backends/mlx/model_ops/tq_dequant.py @@ -0,0 +1,216 @@ +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# + +""" +``mlx::tq_dequant``: TurboQuant TQ4 unpack + centroid gather + multiply-by-norm. + + indices = unpack 4-bit nibbles from packed bytes (..., D) + centvals = centroids[indices] (..., D) + out = centvals * norms (..., D) + +Output is in **rotated space** — the inverse rotation, if needed, is +left to the caller (typically MLX's tuned bf16 GEMM). + +Constraints: + * ``D`` (= ``packed.shape[-1] * 2``) must be a multiple of 64. + * ``centroids`` must be a 1-D tensor of length 16. + * Output dtype matches ``norms.dtype``. 
+ +Usage:: + + import executorch.backends.mlx.model_ops.tq_dequant # noqa: F401 + + out = torch.ops.mlx.tq_dequant(packed, norms, centroids) + # packed: (..., D/2) uint8 + # norms: (..., 1) bf16 + # centroids: (16,) bf16 + # out: (..., D) bf16 (in rotated space) +""" + +from __future__ import annotations + +import torch +from torch import Tensor +from torch.fx.node import Node + + +# --------------------------------------------------------------------------- +# Custom op + eager fallback +# --------------------------------------------------------------------------- + + +@torch.library.custom_op("mlx::tq_dequant", mutates_args=()) +def tq_dequant( + packed: Tensor, + norms: Tensor, + centroids: Tensor, +) -> Tensor: + """Fused unpack + centroid gather + multiply-by-norm. + + Args: + packed: ``(..., D/2)`` uint8. High nibble = even-position index, + low nibble = odd-position index. + norms: ``(..., 1)`` of compute dtype, broadcasts over D. + centroids: ``(16,)`` of compute dtype. + + Returns: + ``(..., D)`` of compute dtype, in rotated space. 
+ """ + if centroids.dim() != 1 or centroids.shape[0] != 16: + raise ValueError( + f"mlx::tq_dequant: centroids must be 1-D length 16; got " + f"shape {tuple(centroids.shape)}" + ) + high = (packed >> 4).long() + low = (packed & 0x0F).long() + indices = torch.stack([high, low], dim=-1).reshape( + *packed.shape[:-1], packed.shape[-1] * 2 + ) + return centroids[indices] * norms + + +@torch.library.register_fake("mlx::tq_dequant") +def tq_dequant_fake(packed: Tensor, norms: Tensor, centroids: Tensor) -> Tensor: + out_shape = list(packed.shape) + out_shape[-1] = out_shape[-1] * 2 + return packed.new_empty(out_shape, dtype=norms.dtype) + + +# --------------------------------------------------------------------------- +# MLX handler +# --------------------------------------------------------------------------- + +from executorch.backends.mlx.builder.op_helpers import ( + emit_product, + emit_shape, + torch_dtype_to_scalar_type, +) +from executorch.backends.mlx.builder.op_registry import REGISTRY +from executorch.backends.mlx.builder.program_builder import MLXProgramBuilder +from executorch.backends.mlx.builder.slot_manager import Slot +from executorch.backends.mlx.serialization.mlx_graph_schema import ( + IntOrVid, + MetalKernelNode, +) + + +_TQ_DEQUANT_HEADER = """ +#include +using namespace metal; +""" + + +# Per-vector decompress: +# * Grid (32, 1, M), threadgroup (32, 1, 1): one simdgroup per vector. +# * Each lane handles DIMS_PER_LANE = D/32 output values, sourced +# from BYTES_PER_LANE = DIMS_PER_LANE/2 packed bytes. +# * The 16-entry codebook is preloaded into per-lane registers. 
+_TQ_DEQUANT_SOURCE = """ + constexpr uint DIMS_PER_LANE = D / 32; + constexpr uint BYTES_PER_LANE = DIMS_PER_LANE / 2; + + uint vec_id = thread_position_in_grid.z; + uint lane_id = thread_position_in_threadgroup.x; + + InT cent[16]; + for (uint c = 0; c < 16; ++c) { + cent[c] = centroids[c]; + } + + InT norm = norms[vec_id]; + + uint packed_base = vec_id * (D / 2) + lane_id * BYTES_PER_LANE; + uint out_base = vec_id * D + lane_id * DIMS_PER_LANE; + + for (uint i = 0; i < BYTES_PER_LANE; ++i) { + uchar byte = packed[packed_base + i]; + uchar idx_hi = (byte >> 4) & 0x0F; + uchar idx_lo = byte & 0x0F; + out[out_base + 2 * i + 0] = cent[idx_hi] * norm; + out[out_base + 2 * i + 1] = cent[idx_lo] * norm; + } +""" + + +@REGISTRY.register(target=[torch.ops.mlx.tq_dequant.default]) +def _tq_dequant_handler(P: MLXProgramBuilder, n: Node) -> Slot: + """Lower ``mlx::tq_dequant`` to a single fused Metal kernel.""" + args = P.args(n) + if len(args) != 3: + raise ValueError( + f"mlx::tq_dequant: expected 3 args (packed, norms, centroids); " + f"got {len(args)}" + ) + packed_slot, norms_slot, centroids_slot = args + packed_node = n.args[0] + norms_node = n.args[1] + centroids_node = n.args[2] + + packed_meta = packed_node.meta["val"] + norms_meta = norms_node.meta["val"] + centroids_meta = centroids_node.meta["val"] + + if centroids_meta.dim() != 1 or int(centroids_meta.shape[0]) != 16: + raise ValueError( + f"mlx::tq_dequant: centroids must be 1-D length 16; got " + f"shape {tuple(centroids_meta.shape)}" + ) + + last_dim_packed = packed_meta.shape[-1] + if not isinstance(last_dim_packed, int): + raise NotImplementedError( + "mlx::tq_dequant: packed last dim must be statically known" + ) + half_D = int(last_dim_packed) + D = half_D * 2 + if D % 64 != 0: + raise NotImplementedError( + f"mlx::tq_dequant: unpacked dim must be a multiple of 64 " + f"(2 dims per packed byte, 32 SIMD lanes); got D={D}" + ) + + out_dtype_int = torch_dtype_to_scalar_type(norms_meta.dtype) + + out = 
P.make_or_get_slot(n) + leading = emit_shape(P, packed_node, packed_slot, end_dim=-1) + out_shape_flat = leading + [IntOrVid.from_literal(D)] + M_iov = emit_product(P, leading) + + P.emit( + MetalKernelNode( + name="tq_dequant", + source=_TQ_DEQUANT_SOURCE, + header=_TQ_DEQUANT_HEADER, + inputs=[ + P.slot_to_tid(packed_slot), + P.slot_to_tid(norms_slot), + P.slot_to_tid(centroids_slot), + ], + outputs=[P.slot_to_tid(out)], + grid=[ + IntOrVid.from_literal(32), + IntOrVid.from_literal(1), + M_iov, + ], + threadgroup=[ + IntOrVid.from_literal(32), + IntOrVid.from_literal(1), + IntOrVid.from_literal(1), + ], + input_names=["packed", "norms", "centroids"], + output_names=["out"], + output_shapes_flat=out_shape_flat, + output_shape_lengths=[len(out_shape_flat)], + output_dtypes=[out_dtype_int], + template_arg_names=["InT", "D"], + template_arg_kinds=[2, 0], # 2=dtype, 0=int + template_arg_values=[out_dtype_int, D], + ) + ) + + return out diff --git a/backends/mlx/model_ops/tq_norm.py b/backends/mlx/model_ops/tq_norm.py new file mode 100644 index 00000000000..7e6a4d657f3 --- /dev/null +++ b/backends/mlx/model_ops/tq_norm.py @@ -0,0 +1,170 @@ +# +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# + +""" +``mlx::tq_norm``: L2 norm along the last dim, lowered to a single Metal kernel. + + norms[..., 0] = sqrt(sum_i x[..., i]^2) + +Reads / writes ``x.dtype`` directly (no graph-level dtype casts). +Reduces in fp32 inside Metal registers via ``simd_sum`` for precision +on large ``D`` (bf16 sum-of-squares loses too much for D>=128). + +Constraints: + * Last dim ``D`` must be statically known and a multiple of 32. 
+ +Usage:: + + import executorch.backends.mlx.model_ops.tq_norm # noqa: F401 + + norms = torch.ops.mlx.tq_norm(x) + # x: (..., D) bf16 + # norms: (..., 1) bf16, equal to vector_norm(x, dim=-1, keepdim=True) +""" + +from __future__ import annotations + +import torch +from torch import Tensor +from torch.fx.node import Node + + +# --------------------------------------------------------------------------- +# Custom op + eager fallback +# --------------------------------------------------------------------------- + + +@torch.library.custom_op("mlx::tq_norm", mutates_args=()) +def tq_norm(x: Tensor) -> Tensor: + """L2 norm along last dim. + + Args: + x: ``(..., D)``. For MLX lowering, ``D`` must be a multiple of 32. + + Returns: + ``(..., 1)`` of the same dtype as ``x``. + """ + return torch.linalg.vector_norm(x, dim=-1, keepdim=True).to(x.dtype) + + +@torch.library.register_fake("mlx::tq_norm") +def tq_norm_fake(x: Tensor) -> Tensor: + out_shape = list(x.shape) + out_shape[-1] = 1 + return x.new_empty(out_shape, dtype=x.dtype) + + +# --------------------------------------------------------------------------- +# MLX handler +# --------------------------------------------------------------------------- + +from executorch.backends.mlx.builder.op_helpers import ( + emit_product, + emit_shape, + torch_dtype_to_scalar_type, +) +from executorch.backends.mlx.builder.op_registry import REGISTRY +from executorch.backends.mlx.builder.program_builder import MLXProgramBuilder +from executorch.backends.mlx.builder.slot_manager import Slot +from executorch.backends.mlx.serialization.mlx_graph_schema import ( + IntOrVid, + MetalKernelNode, +) + + +_TQ_NORM_HEADER = """ +#include +using namespace metal; +""" + + +# Per-vector reduction: +# * Grid (32, 1, M), threadgroup (32, 1, 1): one simdgroup per vector. +# * Each lane covers DIMS_PER_LANE = D/32 elements; partial sums are +# accumulated in an fp32 register. +# * ``simd_sum`` reduces across the 32 lanes; lane 0 sqrts and writes. 
+_TQ_NORM_SOURCE = """ + constexpr uint DIMS_PER_LANE = D / 32; + + uint vec_id = thread_position_in_grid.z; + uint lane_id = thread_position_in_threadgroup.x; + + uint base = vec_id * D + lane_id * DIMS_PER_LANE; + + float local_sum_sq = 0.0f; + for (uint i = 0; i < DIMS_PER_LANE; ++i) { + float v = float(x[base + i]); + local_sum_sq += v * v; + } + + float total_sum_sq = simd_sum(local_sum_sq); + + if (lane_id == 0) { + norms[vec_id] = (InT)sqrt(total_sum_sq); + } +""" + + +@REGISTRY.register(target=[torch.ops.mlx.tq_norm.default]) +def _tq_norm_handler(P: MLXProgramBuilder, n: Node) -> Slot: + """Lower ``mlx::tq_norm`` to a single fused Metal kernel.""" + args = P.args(n) + if len(args) != 1: + raise ValueError(f"mlx::tq_norm: expected 1 arg (x), got {len(args)}") + + (x_slot,) = args + x_node = n.args[0] + + x_meta = x_node.meta["val"] + + last_dim = x_meta.shape[-1] + if not isinstance(last_dim, int): + raise NotImplementedError("mlx::tq_norm: last dim must be statically known") + D = int(last_dim) + if D % 32 != 0: + raise NotImplementedError( + f"mlx::tq_norm: last dim must be a multiple of 32 (one per " + f"SIMD lane); got D={D}" + ) + + in_dtype_int = torch_dtype_to_scalar_type(x_meta.dtype) + + out = P.make_or_get_slot(n) + leading = emit_shape(P, x_node, x_slot, end_dim=-1) + out_shape_flat = leading + [IntOrVid.from_literal(1)] + M_iov = emit_product(P, leading) + + P.emit( + MetalKernelNode( + name="tq_norm", + source=_TQ_NORM_SOURCE, + header=_TQ_NORM_HEADER, + inputs=[P.slot_to_tid(x_slot)], + outputs=[P.slot_to_tid(out)], + grid=[ + IntOrVid.from_literal(32), + IntOrVid.from_literal(1), + M_iov, + ], + threadgroup=[ + IntOrVid.from_literal(32), + IntOrVid.from_literal(1), + IntOrVid.from_literal(1), + ], + input_names=["x"], + output_names=["norms"], + output_shapes_flat=out_shape_flat, + output_shape_lengths=[len(out_shape_flat)], + output_dtypes=[in_dtype_int], + template_arg_names=["InT", "D"], + template_arg_kinds=[2, 0], # 2=dtype, 0=int + 
template_arg_values=[in_dtype_int, D], + ) + ) + + return out diff --git a/backends/mlx/test/op_test_runner.cpp b/backends/mlx/test/op_test_runner.cpp index 6bed13d7a56..925ff410f42 100644 --- a/backends/mlx/test/op_test_runner.cpp +++ b/backends/mlx/test/op_test_runner.cpp @@ -58,6 +58,7 @@ enum class DType : uint32_t { Int64 = 3, BFloat16 = 4, Bool = 5, + UInt8 = 6, }; size_t dtype_size(DType dtype) { @@ -74,6 +75,8 @@ size_t dtype_size(DType dtype) { return 2; case DType::Bool: return 1; + case DType::UInt8: + return 1; default: return 4; } @@ -93,6 +96,8 @@ exec_aten::ScalarType dtype_to_scalar_type(DType dtype) { return exec_aten::ScalarType::BFloat16; case DType::Bool: return exec_aten::ScalarType::Bool; + case DType::UInt8: + return exec_aten::ScalarType::Byte; default: return exec_aten::ScalarType::Float; } @@ -112,6 +117,8 @@ DType scalar_type_to_dtype(exec_aten::ScalarType stype) { return DType::BFloat16; case exec_aten::ScalarType::Bool: return DType::Bool; + case exec_aten::ScalarType::Byte: + return DType::UInt8; default: return DType::Float32; } @@ -316,6 +323,11 @@ int main(int argc, char* argv[]) { std::memcpy(data.data(), t.data.data(), t.data.size()); tensor_ptr = make_tensor_ptr( sizes, std::move(data), {}, {}, exec_aten::ScalarType::Bool); + } else if (t.dtype == DType::UInt8) { + std::vector<uint8_t> data(t.data.size()); + std::memcpy(data.data(), t.data.data(), t.data.size()); + tensor_ptr = make_tensor_ptr( + sizes, std::move(data), {}, {}, exec_aten::ScalarType::Byte); } else { std::cerr << "Unsupported dtype: " << static_cast<int>(t.dtype) << std::endl; diff --git a/backends/mlx/test/test_ops.py b/backends/mlx/test/test_ops.py index 45ea024f0e8..ec80b1d3911 100644 --- a/backends/mlx/test/test_ops.py +++ b/backends/mlx/test/test_ops.py @@ -2236,6 +2236,402 @@ def get_dynamic_shapes(self) -> Optional[Dict[str, any]]: } +from executorch.backends.mlx.llm.turboquant_cache import TurboQuantKVCache + + +class TurboQuantKVCacheModel(nn.Module): + """ + Test 
model wrapping TurboQuantKVCache.update(). + + TurboQuantKVCache stores K/V in rotated 4-bit packed form. ``update`` + returns the four cache buffers (k_packed, k_norms, v_packed, v_norms) + rather than uncompressed K/V. + """ + + def __init__( + self, + max_batch_size: int, + max_context_length: int, + n_heads: int, + head_dim: int, + enable_dynamic_shape: bool = True, + ): + super().__init__() + self.cache = TurboQuantKVCache( + max_batch_size=max_batch_size, + max_context_length=max_context_length, + n_heads=n_heads, + head_dim=head_dim, + enable_dynamic_shape=enable_dynamic_shape, + ) + + def forward( + self, + input_pos: torch.Tensor, + k_val: torch.Tensor, + v_val: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + return self.cache.update(input_pos, k_val, v_val) + + +@register_test +class TurboQuantKVCacheTest(OpTestCase): + """ + Test case for TurboQuantKVCache with tensor input_pos. + + Verifies eager-vs-MLX consistency for the compress + write path + (``mlx::tq_norm``, ``mlx::tq4_compress``, ``mlx::kv_cache_update``). + The packed cache is uint8 (byte-exact), norms are bf16 (loose tol). + """ + + name = "turboquant_kv_cache" + # uint8 packed cache stays effectively exact under atol<1; bf16 + # norms need ~1e-1 absolute slack for the eager-vs-MLX bf16 path. + rtol = 1e-5 + atol = 1e-1 + + def __init__( + self, + n_heads: int = 4, + head_dim: int = 64, + max_context_length: int = 128, + seq_step: int = 8, + enable_dynamic_shape: bool = True, + ): + # TurboQuantKVCache requires batch=1. 
+ self.max_batch_size = 1 + self.n_heads = n_heads + self.head_dim = head_dim + self.max_context_length = max_context_length + self.seq_step = seq_step + self.enable_dynamic_shape = enable_dynamic_shape + + @classmethod + def get_test_configs(cls) -> List["TurboQuantKVCacheTest"]: + return [ + cls(), # default: head_dim=64 (smallest valid) + cls(head_dim=128), + cls(enable_dynamic_shape=False), + ] + + def create_model(self) -> nn.Module: + return TurboQuantKVCacheModel( + max_batch_size=self.max_batch_size, + max_context_length=self.max_context_length, + n_heads=self.n_heads, + head_dim=self.head_dim, + enable_dynamic_shape=self.enable_dynamic_shape, + ) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + input_pos = torch.tensor([0], dtype=torch.int64) + k_val = torch.randn( + self.max_batch_size, + self.n_heads, + self.seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + v_val = torch.randn( + self.max_batch_size, + self.n_heads, + self.seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + return (input_pos, k_val, v_val) + + def create_test_inputs(self) -> Tuple[torch.Tensor, ...]: + # With static shape, test inputs must match the exported seq length. 
+ test_seq_step = ( + self.seq_step if not self.enable_dynamic_shape else self.seq_step + 4 + ) + input_pos = torch.tensor([16], dtype=torch.int64) + k_val = torch.randn( + self.max_batch_size, + self.n_heads, + test_seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + v_val = torch.randn( + self.max_batch_size, + self.n_heads, + test_seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + return (input_pos, k_val, v_val) + + def get_dynamic_shapes(self) -> Optional[Dict[str, any]]: + if not self.enable_dynamic_shape: + return None + seq_dim = Dim("seq_step", min=1, max=self.max_context_length) + return { + "input_pos": None, + "k_val": {2: seq_dim}, + "v_val": {2: seq_dim}, + } + + +class TurboQuantKVCacheIntModel(nn.Module): + """ + Test model that passes int/SymInt (not tensor) to + ``TurboQuantKVCache.update`` — the multi-layer pattern. + """ + + def __init__( + self, + max_batch_size: int, + max_context_length: int, + n_heads: int, + head_dim: int, + enable_dynamic_shape: bool = True, + ): + super().__init__() + self.cache = TurboQuantKVCache( + max_batch_size=max_batch_size, + max_context_length=max_context_length, + n_heads=n_heads, + head_dim=head_dim, + enable_dynamic_shape=enable_dynamic_shape, + ) + + def forward( + self, + input_pos: torch.Tensor, + k_val: torch.Tensor, + v_val: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + start_pos = input_pos[0].item() + return self.cache.update(start_pos, k_val, v_val) + + +@register_test +class TurboQuantKVCacheIntTest(OpTestCase): + """Test case for TurboQuantKVCache with int/SymInt input_pos.""" + + name = "turboquant_kv_cache_int" + rtol = 1e-5 + atol = 1e-1 + + def __init__( + self, + n_heads: int = 4, + head_dim: int = 64, + max_context_length: int = 128, + seq_step: int = 8, + enable_dynamic_shape: bool = True, + ): + self.max_batch_size = 1 + self.n_heads = n_heads + self.head_dim = head_dim + self.max_context_length = max_context_length + self.seq_step = seq_step 
+ self.enable_dynamic_shape = enable_dynamic_shape + + @classmethod + def get_test_configs(cls) -> List["TurboQuantKVCacheIntTest"]: + return [ + cls(), + cls(head_dim=128), + ] + + def create_model(self) -> nn.Module: + return TurboQuantKVCacheIntModel( + max_batch_size=self.max_batch_size, + max_context_length=self.max_context_length, + n_heads=self.n_heads, + head_dim=self.head_dim, + enable_dynamic_shape=self.enable_dynamic_shape, + ) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + input_pos = torch.tensor([0], dtype=torch.int64) + k_val = torch.randn( + self.max_batch_size, + self.n_heads, + self.seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + v_val = torch.randn( + self.max_batch_size, + self.n_heads, + self.seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + return (input_pos, k_val, v_val) + + def create_test_inputs(self) -> Tuple[torch.Tensor, ...]: + test_seq_step = self.seq_step + 4 + input_pos = torch.tensor([16], dtype=torch.int64) + k_val = torch.randn( + self.max_batch_size, + self.n_heads, + test_seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + v_val = torch.randn( + self.max_batch_size, + self.n_heads, + test_seq_step, + self.head_dim, + dtype=torch.bfloat16, + ) + return (input_pos, k_val, v_val) + + def get_dynamic_shapes(self) -> Optional[Dict[str, any]]: + if not self.enable_dynamic_shape: + return None + seq_dim = Dim("seq_step", min=1, max=self.max_context_length) + return { + "input_pos": None, + "k_val": {2: seq_dim}, + "v_val": {2: seq_dim}, + } + + +class TurboQuantKVCacheSdpaModel(nn.Module): + """ + Test model wrapping ``TurboQuantKVCache.update + .sdpa`` — the full + prefill/decode flow (compress, dequant, attention in rotated space, + un-rotate output). 
+ """ + + def __init__( + self, + max_batch_size: int, + max_context_length: int, + n_heads: int, + head_dim: int, + enable_dynamic_shape: bool = True, + ): + super().__init__() + self.max_context_length = max_context_length + self.cache = TurboQuantKVCache( + max_batch_size=max_batch_size, + max_context_length=max_context_length, + n_heads=n_heads, + head_dim=head_dim, + enable_dynamic_shape=enable_dynamic_shape, + ) + + def forward( + self, + input_pos: torch.Tensor, + k_val: torch.Tensor, + v_val: torch.Tensor, + query: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + start_pos = input_pos[0].item() + seq_len = k_val.size(2) + torch._check(start_pos >= 0) + torch._check(start_pos + seq_len <= self.max_context_length) + + k_packed, k_norms, v_packed, v_norms = self.cache.update( + start_pos, k_val, v_val + ) + out = self.cache.sdpa(query, start_pos) + return out, k_packed, k_norms, v_packed, v_norms + + +@register_test +class TurboQuantKVCacheSdpaTest(OpTestCase): + """ + Test case for ``TurboQuantKVCache.update`` + ``.sdpa``. + + Exercises the full forward path: compress + write through + ``mlx::tq_norm`` / ``mlx::tq4_compress`` / ``mlx::kv_cache_update``, + then dequantize and attend via ``mlx::tq_dequant`` / + ``mlx::custom_sdpa`` with Q rotated in and output rotated back. + Looser tolerance is needed because attention runs in bf16. 
+ """ + + name = "turboquant_kv_cache_sdpa" + rtol = 1e-5 + atol = 5e-2 # bf16 SDPA output + + def __init__( + self, + n_heads: int = 4, + head_dim: int = 64, + max_context_length: int = 128, + seq_step: int = 8, + enable_dynamic_shape: bool = True, + ): + self.max_batch_size = 1 + self.n_heads = n_heads + self.head_dim = head_dim + self.max_context_length = max_context_length + self.seq_step = seq_step + self.enable_dynamic_shape = enable_dynamic_shape + + @classmethod + def get_test_configs(cls) -> List["TurboQuantKVCacheSdpaTest"]: + return [ + cls(), + cls(head_dim=128), + ] + + def create_model(self) -> nn.Module: + return TurboQuantKVCacheSdpaModel( + max_batch_size=self.max_batch_size, + max_context_length=self.max_context_length, + n_heads=self.n_heads, + head_dim=self.head_dim, + enable_dynamic_shape=self.enable_dynamic_shape, + ) + + def _make_inputs( + self, start: int, q_len: int, kv_len: int + ) -> Tuple[torch.Tensor, ...]: + input_pos = torch.tensor([start], dtype=torch.int64) + k_val = torch.randn( + self.max_batch_size, + self.n_heads, + kv_len, + self.head_dim, + dtype=torch.bfloat16, + ) + v_val = torch.randn( + self.max_batch_size, + self.n_heads, + kv_len, + self.head_dim, + dtype=torch.bfloat16, + ) + query = torch.randn( + self.max_batch_size, + self.n_heads, + q_len, + self.head_dim, + dtype=torch.bfloat16, + ) + return (input_pos, k_val, v_val, query) + + def create_inputs(self) -> Tuple[torch.Tensor, ...]: + # Prefill-style: start=0, q_len == kv_len. + return self._make_inputs(start=0, q_len=self.seq_step, kv_len=self.seq_step) + + def create_test_inputs(self) -> Tuple[torch.Tensor, ...]: + # Decode-style: write a single token into the existing cache. 
+ return self._make_inputs(start=16, q_len=1, kv_len=1) + + def get_dynamic_shapes(self) -> Optional[Dict[str, any]]: + if not self.enable_dynamic_shape: + return None + seq_dim = Dim("seq_step", min=1, max=self.max_context_length) + return { + "input_pos": None, + "k_val": {2: seq_dim}, + "v_val": {2: seq_dim}, + "query": {2: seq_dim}, + } + + class RingBufferKVCacheModel(nn.Module): """ Test model wrapping RingBufferKVCache from cache.py. diff --git a/backends/mlx/test/test_utils.py b/backends/mlx/test/test_utils.py index 660968195b7..5dbc35b824d 100644 --- a/backends/mlx/test/test_utils.py +++ b/backends/mlx/test/test_utils.py @@ -44,6 +44,7 @@ class TestTimeoutError(Exception): DTYPE_INT64 = 3 DTYPE_BFLOAT16 = 4 DTYPE_BOOL = 5 +DTYPE_UINT8 = 6 # Default tolerance presets for different data types. @@ -110,6 +111,7 @@ def torch_dtype_to_bin_dtype(dtype: torch.dtype) -> int: torch.int64: DTYPE_INT64, torch.bfloat16: DTYPE_BFLOAT16, torch.bool: DTYPE_BOOL, + torch.uint8: DTYPE_UINT8, } if dtype not in mapping: raise ValueError(f"Unsupported dtype: {dtype}") @@ -125,6 +127,7 @@ def bin_dtype_to_torch_dtype(dtype_val: int) -> torch.dtype: DTYPE_INT64: torch.int64, DTYPE_BFLOAT16: torch.bfloat16, DTYPE_BOOL: torch.bool, + DTYPE_UINT8: torch.uint8, } if dtype_val not in mapping: raise ValueError(f"Unknown dtype value: {dtype_val}") @@ -208,6 +211,7 @@ def load_tensors_from_bin(path: Union[str, Path]) -> List[torch.Tensor]: torch.int32: np.int32, torch.int64: np.int64, torch.bool: np.bool_, + torch.uint8: np.uint8, # bfloat16 needs special handling - read as uint16 } @@ -219,6 +223,7 @@ def load_tensors_from_bin(path: Union[str, Path]) -> List[torch.Tensor]: torch.int64: 8, torch.bfloat16: 2, torch.bool: 1, + torch.uint8: 1, } tensors = [] diff --git a/examples/models/gemma4_31b/README.md b/examples/models/gemma4_31b/README.md index c6ac10748d8..ae3bcb24c19 100644 --- a/examples/models/gemma4_31b/README.md +++ b/examples/models/gemma4_31b/README.md @@ -93,6 +93,24 @@ 
method with dynamic sequence length and host-side sampling. Writes `model.pte` (and optionally `model.ptd`) into `--output-dir`. +#### TurboQuant KV cache (long context, MLX only) + +For long-context inference, add `--turboquant` to swap the full-attention +layers' KV cache for a TurboQuant TQ4 cache (4-bit codebook + nibble pack). +This gives ~3.8× cache memory savings on the full-attention layers and lets +you fit context lengths that wouldn't fit in bf16. Sliding-window layers are unaffected. + +```bash +python examples/models/gemma4_31b/export.py \ + --prequantized ./gemma4_31b_int4 \ + --output-dir ./gemma4_31b_exports_mlx_tq \ + --max-seq-len 65536 \ + --backend mlx \ + --turboquant +``` + +Use TurboQuant when you need context beyond what bf16 fits; otherwise leave it off. + ## Eager inference The prompt is automatically wrapped with the Gemma 4 IT chat template. diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py index bd648f534b5..ed3dcdba9c3 100644 --- a/examples/models/gemma4_31b/export.py +++ b/examples/models/gemma4_31b/export.py @@ -141,12 +141,19 @@ def export_and_lower( config: Gemma4_31BConfig, output_dir: str, backend: str = "cuda", + use_turboquant: bool = False, ) -> None: """Export and lower the model to ExecuTorch for the given backend.""" if backend == "cuda": + if use_turboquant: + raise ValueError( + "--turboquant is only supported with --backend mlx " + "(the CUDA path here uses a different TurboQuant integration; " + "see examples/models/qwen3_5_moe/export.py)." + ) _export_cuda(model, config, output_dir) elif backend == "mlx": - _export_mlx(model, config, output_dir) + _export_mlx(model, config, output_dir, use_turboquant=use_turboquant) else: raise ValueError( f"Unsupported backend: {backend!r}. Supported: {_SUPPORTED_BACKENDS}." 
@@ -279,7 +286,12 @@ def _export_cuda(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) - print("Done.") -def _export_mlx(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -> None: +def _export_mlx( + model: Gemma4_31B, + config: Gemma4_31BConfig, + output_dir: str, + use_turboquant: bool = False, +) -> None: """Export to .pte via torch.export + MLX backend. Unlike CUDA (which exports separate decode/prefill methods with an @@ -287,6 +299,10 @@ def _export_mlx(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -> sequence length. No int4_dispatch import — IntxUnpackedToInt8Tensor's default dispatch produces the ``dequantize_affine → linear`` pattern that MLX's QuantizedLinearHandler matches. + + When ``use_turboquant=True``, full-attention layers swap to + ``MLXTurboQuantKVCache`` for ~3.8× KV cache memory savings. Sliding + layers are unaffected (already use ``RingBufferKVCache``). """ import gc @@ -304,10 +320,13 @@ def _export_mlx(model: Gemma4_31B, config: Gemma4_31BConfig, output_dir: str) -> from executorch.exir.passes import MemoryPlanningPass from torch.export import Dim, export - mlx_source_transformations(model, dtype=torch.bfloat16) + mlx_source_transformations( + model, dtype=torch.bfloat16, use_turboquant=use_turboquant + ) + materialize_runtime_buffers(model, dtype=torch.bfloat16) - max_prefill = min(config.max_seq_len - 1, config.sliding_window * 2) + max_prefill = 256 seq_dim = Dim("seq_len", min=1, max=max_prefill) print(f"Exporting (T in [1, {max_prefill}])...") @@ -418,8 +437,17 @@ def main() -> None: choices=list(_SUPPORTED_BACKENDS), help="Target backend for export.", ) + parser.add_argument( + "--turboquant", + action="store_true", + help="Use TurboQuant TQ4 KV cache compression (MLX backend only). 
" + "~3.8× cache memory savings; applies only to full-attention " + "(non-sliding) layers — sliding layers keep RingBufferKVCache.", + ) args = parser.parse_args() + if args.turboquant and args.backend != "mlx": + parser.error("--turboquant requires --backend mlx.") if args.backend == "cuda" and not torch.cuda.is_available(): parser.error("CUDA is required for the cuda backend.") @@ -446,7 +474,13 @@ def main() -> None: if args.gguf and args.backend == "mlx": os.environ["ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS"] = "1" try: - export_and_lower(model, config, args.output_dir, backend=args.backend) + export_and_lower( + model, + config, + args.output_dir, + backend=args.backend, + use_turboquant=args.turboquant, + ) finally: os.environ.pop("ET_MLX_ALLOW_NON_FUSED_QUANTIZED_OPS", None) diff --git a/examples/models/gemma4_31b/mlx_source_transformations.py b/examples/models/gemma4_31b/mlx_source_transformations.py index 3a8ae4420e3..0bbd4f7b250 100644 --- a/examples/models/gemma4_31b/mlx_source_transformations.py +++ b/examples/models/gemma4_31b/mlx_source_transformations.py @@ -24,6 +24,9 @@ KVCache as MLXKVCache, RingBufferKVCache as MLXRingKVCache, ) +from executorch.backends.mlx.llm.turboquant_cache import ( + TurboQuantKVCache as MLXTurboQuantKVCache, +) def _replace_attention_forward(attn: nn.Module) -> None: @@ -68,30 +71,34 @@ def _mlx_forward(self, x: torch.Tensor, input_pos: torch.Tensor) -> torch.Tensor q = torch.ops.mlx.rope(q, rotary_dim, start_pos, False, 0.0, 1.0, mlx_freqs) k = torch.ops.mlx.rope(k, rotary_dim, start_pos, False, 0.0, 1.0, mlx_freqs) - k_cache, v_cache = self.kv_cache.update(start_pos, k, v) - - if self.is_sliding: - sdpa_mask = self.kv_cache.create_sliding_window_mask(start_pos, T) - y = torch.ops.mlx.custom_sdpa( - q, - k_cache, - v_cache, - start_pos=self.kv_cache.buffer_size - T, - attn_mask=sdpa_mask, - dropout_p=0.0, - is_causal=False, - scale=self.scaling, - ) + if getattr(self, "is_turboquant", False): + self.kv_cache.update(start_pos, 
k, v) + y = self.kv_cache.sdpa(q, start_pos, scale=self.scaling) else: - y = torch.ops.mlx.custom_sdpa( - q, - k_cache, - v_cache, - start_pos=start_pos, - dropout_p=0.0, - is_causal=True, - scale=self.scaling, - ) + k_cache, v_cache = self.kv_cache.update(start_pos, k, v) + + if self.is_sliding: + sdpa_mask = self.kv_cache.create_sliding_window_mask(start_pos, T) + y = torch.ops.mlx.custom_sdpa( + q, + k_cache, + v_cache, + start_pos=self.kv_cache.buffer_size - T, + attn_mask=sdpa_mask, + dropout_p=0.0, + is_causal=False, + scale=self.scaling, + ) + else: + y = torch.ops.mlx.custom_sdpa( + q, + k_cache, + v_cache, + start_pos=start_pos, + dropout_p=0.0, + is_causal=True, + scale=self.scaling, + ) y = y.transpose(1, 2).contiguous().view(B, T, self.n_heads * self.head_dim) return self.o_proj(y) @@ -150,6 +157,7 @@ def _mlx_model_forward( def mlx_source_transformations( model: nn.Module, dtype: torch.dtype = torch.bfloat16, + use_turboquant: bool = False, ) -> None: """Apply MLX source transformations to a Gemma 4 31B model in-place. @@ -162,6 +170,13 @@ def mlx_source_transformations( - Rewrites layer forward to drop mask parameters (each attention builds its own mask via ``custom_sdpa``) - Rewrites model forward to drop the sampler and ``_build_masks`` + + Args: + model: Gemma4_31B model to transform in place. + dtype: dtype for KV cache buffers (bf16 by default). + use_turboquant: If True, swap full-attention layers' KV caches + for ``MLXTurboQuantKVCache`` (~3.8× cache memory savings). + Sliding-window layers are unaffected. 
""" config = model.config @@ -176,6 +191,17 @@ def mlx_source_transformations( head_dim=attn.head_dim, dtype=dtype, ) + attn.is_turboquant = False + elif use_turboquant: + attn.kv_cache = MLXTurboQuantKVCache( + max_batch_size=1, + max_context_length=config.max_seq_len, + n_heads=attn.n_kv_heads, + head_dim=attn.head_dim, + enable_dynamic_shape=True, + dtype=dtype, + ) + attn.is_turboquant = True else: attn.kv_cache = MLXKVCache( max_batch_size=1, @@ -185,6 +211,7 @@ def mlx_source_transformations( enable_dynamic_shape=True, dtype=dtype, ) + attn.is_turboquant = False _replace_attention_forward(attn) _replace_layer_forward(layer) From bd24e79e87e9093a70cc7f1d8e63366ac457bfd4 Mon Sep 17 00:00:00 2001 From: Ethan Ng Date: Fri, 29 May 2026 22:25:49 -0700 Subject: [PATCH 088/103] Add fuse() to remaining QuantizationPatterns (#19727) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: Add `fuse()` implementations to the remaining Cadence `QuantizationPattern` subclasses: - `MaxPool2dPattern`, `MaxPool2dWithoutIndicesPattern` — order-preserving pool on quantized values - `ReluBasePattern` (inherited by `ReluPattern0`/`1`) — relu with requantization - `ConvReluBasePattern` (inherited by `Conv1d`/`2dReluPattern0`/`1`) — conv+relu fusion with `anchor_ops()` override to match only the conv op - `SoftmaxPattern` — softmax with dummy mask/pos tensors and fake_mode metadata - `MixedW8A32LinearPattern` — weight-only quantized linear (no input/output quant) - `MixedW8A32ConvPattern` — weight-only quantized conv1d with NCL→NLC permutation - `MixedW8A32GruPattern` — weight-only quantized GRU with 4 dequantized params Reviewed By: DrJessop Differential Revision: D105728177 --- backends/cadence/aot/quantizer/patterns.py | 262 ++++++++++++++++++++- 1 file changed, 260 insertions(+), 2 deletions(-) diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index bf7ca3ef567..a7026cbf26c 100644 --- 
a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -12,6 +12,7 @@ from typing import List, Optional, Tuple, Union import torch +from executorch.backends.cadence.aot.compiler_utils import get_shape from executorch.backends.cadence.aot.pass_utils import get_arg, replace_with_op from executorch.backends.cadence.aot.quantizer.pattern_utils import ( DQ_PER_TENSOR, @@ -24,6 +25,7 @@ from executorch.backends.cadence.aot.quantizer.utils import ( check_out_zero_point_is_min_range, get_bias_qparams, + quantize_tensor_multiplier, ) from torch import fx from torch._ops import OpOverload @@ -806,6 +808,40 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_max_pool2d_nchw.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + return _fuse_max_pool2d(gm, anchor_node) + + +def _fuse_max_pool2d(gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + """Shared fuse logic for both MaxPool2d variants.""" + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + kernel_size = get_arg(anchor_node, "kernel_size", list[int]) + stride = get_arg(anchor_node, "stride", list[int]) + padding = get_arg(anchor_node, "padding", list[int]) + dilation = get_arg(anchor_node, "dilation", list[int]) + ceil_mode = get_arg(anchor_node, "ceil_mode", bool) + args = (get_arg(dq_input, "input", fx.Node),) + kwargs = { + "kernel_size": kernel_size, + "stride": stride, + "padding": padding, + "dilation": dilation, + "ceil_mode": ceil_mode, + } + return replace_with_op( + gm, + anchor_node, + torch.ops.cadence.quantized_max_pool2d_nchw.default, + args, + kwargs, + quant_node, + ) + class MaxPool2dWithoutIndicesPattern(QuantizationPattern): """ @@ -845,8 +881,8 @@ def get_anchors( def replacement_op(self) -> OpOverload: return 
torch.ops.cadence.quantized_max_pool2d_nchw.default - -# This is a base class for ReLU + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + return _fuse_max_pool2d(gm, anchor_node) # This is a base class for ReLU, since it can be used with two different aten ops @@ -874,6 +910,28 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_relu.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + input_scale = get_arg(dq_input, "scale", float) + requantize_scale = input_scale / get_arg(quant_node, "scale", float) + requantize_scale_t = torch.tensor([requantize_scale]) + out_multiplier, out_shift = quantize_tensor_multiplier(requantize_scale_t) + args = (get_arg(dq_input, "input", fx.Node),) + kwargs = { + "X_zero_point": get_arg(dq_input, "zero_point", int), + "out_zero_point": get_arg(quant_node, "zero_point", int), + "out_multiplier": out_multiplier[0].item(), + "out_shift": out_shift[0].item(), + } + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, kwargs, quant_node + ) + # Regular relu op class ReluPattern0(ReluBasePattern): @@ -933,6 +991,39 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_conv2d_nchw.per_tensor + def anchor_ops(self) -> tuple[OpOverload, ...]: + return (self.partition_types()[0],) + + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + conv_users = list(anchor_node.users) + if len(conv_users) != 1: + return None + relu_node = conv_users[0] + if relu_node.target != self.partition_types()[1]: + return None + _arg0 = anchor_node.args[0] + dq_input = ( + _arg0 + if isinstance(_arg0, fx.Node) and _arg0.target == DQ_PER_TENSOR + else None + ) + _arg1 = 
anchor_node.args[1] + dq_weight = ( + _arg1 + if isinstance(_arg1, fx.Node) and _arg1.target == DQ_PER_TENSOR + else None + ) + if dq_input is None or dq_weight is None: + return None + quant_node = find_quant_user(relu_node) + if quant_node is None: + return None + check_out_zero_point_is_min_range( + get_arg(quant_node, "zero_point", int), + get_arg(quant_node, "dtype", torch.dtype), + ) + return fuse_conv(self, gm, anchor_node, dq_input, dq_weight, quant_node) + # Conv1d + regular relu op fusion class Conv1dReluPattern0(ConvReluBasePattern): @@ -987,6 +1078,56 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_softmax.per_tensor + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + dq_input = anchor_node.args[0] + if not isinstance(dq_input, fx.Node) or dq_input.target != DQ_PER_TENSOR: + return None + quant_node = find_quant_user(anchor_node) + if quant_node is None: + return None + input_q = get_arg(dq_input, "input", fx.Node) + quant_input = get_arg(quant_node, "input", fx.Node) + mask_shape = get_shape(gm, quant_input) + if not mask_shape: + return None + mask_shape = list(mask_shape) + # Softmax mask is packed 16 elements per int32 word. + assert ( + mask_shape[-1] % 16 == 0 + ), f"Softmax mask dimension must be divisible by 16, got {mask_shape[-1]}" + mask_shape[-1] = mask_shape[-1] // 16 + mask_tensor = insert_node_with_meta( + gm, + torch.ops.aten.full.default, + (mask_shape, 0.0), + {"dtype": torch.int32}, + anchor_node, + input_q, + ) + # Initial position for streaming softmax (unused, set to 0). 
+ pos_tensor = insert_node_with_meta( + gm, + torch.ops.aten.full.default, + ([1], 0), + {"dtype": torch.int64}, + anchor_node, + input_q, + ) + args = ( + input_q, + mask_tensor, + get_arg(anchor_node, "dim", int), + 0, + pos_tensor, + get_arg(dq_input, "scale", float), + get_arg(dq_input, "zero_point", int), + get_arg(quant_node, "scale", float), + get_arg(quant_node, "zero_point", int), + ) + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, {}, quant_node + ) + class MixedW8A32LinearPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -1041,6 +1182,36 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_w8a32_linear.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + if len(anchor_node.args) != 3 or len(anchor_node.kwargs) > 0: + return None + _arg1 = anchor_node.args[1] + dq_weight = ( + _arg1 + if isinstance(_arg1, fx.Node) and _arg1.target == DQ_PER_TENSOR + else None + ) + _arg2 = anchor_node.args[2] + dq_bias = ( + _arg2 + if isinstance(_arg2, fx.Node) and _arg2.target == DQ_PER_TENSOR + else None + ) + if dq_weight is None or dq_bias is None: + return None + input_node = anchor_node.args[0] + assert isinstance(input_node, fx.Node) + args = ( + input_node, + get_arg(dq_weight, "input", fx.Node), + get_arg(dq_weight, "scale", float), + get_arg(dq_bias, "input", fx.Node), + get_arg(dq_bias, "scale", float), + ) + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, {}, anchor_node + ) + class MixedW8A32ConvPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -1115,6 +1286,57 @@ def get_anchors( def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_w8a32_conv.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + if len(anchor_node.args) != 3 or len(anchor_node.kwargs) > 0: + return None + _arg1 = anchor_node.args[1] + dq_weight = ( + 
_arg1 + if isinstance(_arg1, fx.Node) and _arg1.target == DQ_PER_TENSOR + else None + ) + _arg2 = anchor_node.args[2] + dq_bias = ( + _arg2 + if isinstance(_arg2, fx.Node) and _arg2.target == DQ_PER_TENSOR + else None + ) + if dq_weight is None or dq_bias is None: + return None + input_node = anchor_node.args[0] + assert isinstance(input_node, fx.Node) + assert get_arg(anchor_node, "stride", list[int]) == [1] + assert get_arg(anchor_node, "padding", list[int]) == [0] + assert get_arg(anchor_node, "dilation", list[int]) == [1] + assert get_arg(anchor_node, "groups", int) == 1 + weight_q = get_arg(dq_weight, "input", fx.Node) + transposed_inputs = insert_node_with_meta( + gm, + torch.ops.aten.permute.default, + (input_node, [0, 2, 1]), + None, + anchor_node, + input_node, + ) + transposed_weights = insert_node_with_meta( + gm, + torch.ops.aten.permute.default, + (weight_q, [2, 0, 1]), + None, + anchor_node, + weight_q, + ) + args = ( + transposed_inputs, + transposed_weights, + get_arg(dq_weight, "scale", float), + get_arg(dq_bias, "input", fx.Node), + get_arg(dq_bias, "scale", float), + ) + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, {}, anchor_node + ) + class MixedW8A32GruPattern(QuantizationPattern): def partition_types(self) -> List[OpOverload]: @@ -1187,6 +1409,42 @@ def __init__(self, args, meta): def replacement_op(self) -> OpOverload: return torch.ops.cadence.quantized_w8a32_gru.default + def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: + if len(anchor_node.kwargs) > 0: + return None + params = anchor_node.args[2] + # GRU requires 4 weight/bias params: w_ih, w_hh, b_ih, b_hh + if not isinstance(params, (list, tuple)) or len(params) < 4: + return None + dq_w_ih = params[0] + if not isinstance(dq_w_ih, fx.Node) or dq_w_ih.target != DQ_PER_TENSOR: + return None + dq_w_hh = params[1] + if not isinstance(dq_w_hh, fx.Node) or dq_w_hh.target != DQ_PER_TENSOR: + return None + dq_b_ih = params[2] + if not 
isinstance(dq_b_ih, fx.Node) or dq_b_ih.target != DQ_PER_TENSOR: + return None + dq_b_hh = params[3] + if not isinstance(dq_b_hh, fx.Node) or dq_b_hh.target != DQ_PER_TENSOR: + return None + input_node = anchor_node.args[0] + hidden_node = anchor_node.args[1] + args = ( + input_node, + hidden_node, + get_arg(dq_w_ih, "input", fx.Node), + get_arg(dq_w_ih, "scale", float), + get_arg(dq_w_hh, "input", fx.Node), + get_arg(dq_w_hh, "scale", float), + get_arg(dq_b_ih, "input", fx.Node), + get_arg(dq_b_ih, "scale", float), + get_arg(dq_b_hh, "input", fx.Node), + ) + return replace_with_op( + gm, anchor_node, self.replacement_op(), args, {}, anchor_node + ) + class RmsNormPattern(QuantizationPattern): """Pattern that preserves rms_norm from decomposition without matching anything.""" From ec317357dce55a7bda318966bf44eb2abe3f3cec Mon Sep 17 00:00:00 2001 From: Ethan Ng Date: Fri, 29 May 2026 22:32:23 -0700 Subject: [PATCH 089/103] Enable QuantFusionPass in compiler pipeline (#19728) (#19728) Summary: Both and Cadence now use the shared `QuantFusionPass` from `compiler_funcs.py`. 
- `QuantFusionPass` in `compiler_funcs.py` iterates patterns, matches `anchor_ops()`, calls `fuse()` on each match, with debug logging and dead code elimination - Cadence: `compiler.py` now uses `QuantFusionPass` instead of the old `QuantFusion` isinstance switch - Removed Cadence `compiler` target's dep on `:fusion_pass` (no longer imported) Reviewed By: DrJessop Differential Revision: D105728219 --- backends/cadence/aot/BUCK | 2 -- backends/cadence/aot/compiler.py | 8 ++++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/backends/cadence/aot/BUCK b/backends/cadence/aot/BUCK index 7d8ff3cffd2..57b8194c7f8 100644 --- a/backends/cadence/aot/BUCK +++ b/backends/cadence/aot/BUCK @@ -44,7 +44,6 @@ fbcode_target(_kind = runtime.python_library, ":compiler_funcs", ":utils", "//caffe2:torch", - "//executorch/backends/cadence/aot/quantizer:fusion_pass", "//executorch/backends/cadence/aot/quantizer/passes:fuse_ops", "//executorch/backends/cadence/aot/quantizer:quantizer", "//executorch/backends/transforms:decompose_sdpa", @@ -65,7 +64,6 @@ fbcode_target(_kind = runtime.python_library, ":replace_ops", ":utils", "//caffe2:torch", - "//executorch/backends/cadence/aot/quantizer:fusion_pass", "//executorch/backends/cadence/aot/quantizer:quantizer", "//executorch/backends/cadence/runtime:runtime", "//executorch/backends/transforms:decompose_sdpa", diff --git a/backends/cadence/aot/compiler.py b/backends/cadence/aot/compiler.py index 5c66c9eb62b..0b1b8dac361 100644 --- a/backends/cadence/aot/compiler.py +++ b/backends/cadence/aot/compiler.py @@ -14,6 +14,7 @@ import torch from executorch.backends.cadence.aot.compiler_funcs import ( prepare as prepare_fn, + QuantFusionPass, QuantizedInputWrapper, trace as trace_fn, ) @@ -21,7 +22,6 @@ CadenceMemoryPlanning, print_memory_planning_info, ) -from executorch.backends.cadence.aot.quantizer.fusion_pass import QuantFusion from executorch.backends.cadence.aot.quantizer.passes.fuse_ops import FuseQATConvBN from 
executorch.backends.cadence.aot.quantizer.quantizer import ( CadenceDefaultQuantizer, @@ -154,9 +154,9 @@ def apply_pre_edge_transform_passes( quantizer: CadenceQuantizer, ) -> ExportedProgram: """ - Apply pre-edge transform passes including QuantFusion and torch ops passes. + Apply pre-edge transform passes including QuantFusionPass and torch ops passes. This mirrors the Cadence AOT compiler flow: - 1. QuantFusion - fuses dq->op->q patterns + 1. QuantFusionPass - fuses dq->op->q patterns 2. apply_torch_ops_passes - applied just before to_edge() The quantizer must be the same as the one used to convert the model. @@ -169,7 +169,7 @@ def apply_pre_edge_transform_passes( PassManager( [ FuseQATConvBN(converted_program), - QuantFusion(patterns), + QuantFusionPass(patterns), ] )(converted_program.graph_module) From 2af5a13d1eab5414cedc364726ce3b32bc7bec3e Mon Sep 17 00:00:00 2001 From: Ethan Ng Date: Mon, 1 Jun 2026 00:17:32 -0700 Subject: [PATCH 090/103] Remove over-strict softmax mask divisibility assert Differential Revision: D106957459 Pull Request resolved: https://github.com/pytorch/executorch/pull/19903 --- backends/cadence/aot/quantizer/patterns.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index a7026cbf26c..9897d443725 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -1092,9 +1092,6 @@ def fuse(self, gm: fx.GraphModule, anchor_node: fx.Node) -> fx.Node | None: return None mask_shape = list(mask_shape) # Softmax mask is packed 16 elements per int32 word. 
- assert ( - mask_shape[-1] % 16 == 0 - ), f"Softmax mask dimension must be divisible by 16, got {mask_shape[-1]}" mask_shape[-1] = mask_shape[-1] // 16 mask_tensor = insert_node_with_meta( gm, From f244a9f62fd463036470cc2761052e90f0ab5db9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Lindstr=C3=B6m?= <33344797+martinlsm@users.noreply.github.com> Date: Mon, 1 Jun 2026 12:33:27 +0200 Subject: [PATCH 091/103] Arm backend: Add MXFP Linear source transform (#19800) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the possibility to convert torch.nn.Linear modules to MXFP format. The feature works by replacing all torch.nn.Linear submodules inside a graph by a custom implemented MXFP counterpart: `MXFPLinearOp`. A new user API called `to_mxfp` has been added to enable this feature (located in backends/arm/ao_ext/mxfp.py). The API is tagged as experimental for now. An eager CPU and fake implementation is added to the new custom op, but lowering it TOSA is handled in a later patch. 
To summarize, this patch enables the following flow: ```python m = MyModule() to_mxfp(m, MXFPOpConfig()) m.forward(x) ``` Signed-off-by: Martin Lindström Co-authored-by: Sebastian Larsson --- backends/arm/TARGETS | 27 ++- backends/arm/__init__.py | 6 + backends/arm/ao_ext/__init__.py | 12 + backends/arm/ao_ext/mxfp.py | 64 +++++ backends/arm/ao_ext/mxfp_tosa_lib.py | 11 + backends/arm/ao_ext/mxfp_transform.py | 24 ++ backends/arm/ao_ext/ops/__init__.py | 10 + backends/arm/ao_ext/ops/mxfp_linear_op.py | 179 ++++++++++++++ backends/arm/operators/op_view.py | 16 +- backends/arm/test/misc/test_mxfp_linear_ao.py | 46 ++++ backends/arm/test/ops/test_mxfp_linear.py | 226 ++++++++++++++++++ backends/arm/test/targets.bzl | 3 + .../arm/test/tester/analyze_output_utils.py | 32 ++- 13 files changed, 639 insertions(+), 17 deletions(-) create mode 100644 backends/arm/ao_ext/__init__.py create mode 100644 backends/arm/ao_ext/mxfp.py create mode 100644 backends/arm/ao_ext/mxfp_tosa_lib.py create mode 100644 backends/arm/ao_ext/mxfp_transform.py create mode 100644 backends/arm/ao_ext/ops/__init__.py create mode 100644 backends/arm/ao_ext/ops/mxfp_linear_op.py create mode 100644 backends/arm/test/misc/test_mxfp_linear_ao.py create mode 100644 backends/arm/test/ops/test_mxfp_linear.py diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index c3e2251bb11..a63237fe2c9 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -1,4 +1,4 @@ -# Copyright 2025 Arm Limited and/or its affiliates. +# Copyright 2025-2026 Arm Limited and/or its affiliates. # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -15,6 +15,31 @@ runtime.python_library( "//executorch/exir/dialects:lib", ], ) +runtime.python_library( + name = "ao_ext", + srcs = glob([ + "ao_ext/*.py", + "ao_ext/ops/*.py", + ]), + deps = [ + "//caffe2:torch", + "//executorch/exir:_warnings", + "//pytorch/ao:torchao", + ], +) + +runtime.python_library( + name = "lib", + srcs = [ + "__init__.py", + ], + deps = [ + ":ao_ext", + ":ethosu", + ":vgf", + "//executorch/backends/arm/quantizer:lib", + ], +) runtime.python_library( name = "common", srcs = glob(["common/*.py"]), diff --git a/backends/arm/__init__.py b/backends/arm/__init__.py index fcbafa717ce..7c0b61457d0 100644 --- a/backends/arm/__init__.py +++ b/backends/arm/__init__.py @@ -14,6 +14,10 @@ import importlib from typing import Any +# Register Arm-specific torch.library ops and MXFP transforms at package +# import time. +import executorch.backends.arm.ao_ext # noqa: F401 + # Public for tooling (manifest generation and API validation). LAZY_IMPORTS = { "EthosUBackend": ("executorch.backends.arm.ethosu", "EthosUBackend"), @@ -32,6 +36,8 @@ "executorch.backends.arm.quantizer", "get_symmetric_a16w8_quantization_config", ), + "MXFPOpConfig": ("executorch.backends.arm.ao_ext.mxfp", "MXFPOpConfig"), + "to_mxfp": ("executorch.backends.arm.ao_ext.mxfp", "to_mxfp"), } diff --git a/backends/arm/ao_ext/__init__.py b/backends/arm/ao_ext/__init__.py new file mode 100644 index 00000000000..fef05a9f6ae --- /dev/null +++ b/backends/arm/ao_ext/__init__.py @@ -0,0 +1,12 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Import mxfp_transform to trigger registration of the MXFP transforms. +from . 
import mxfp_transform # noqa: F401 + +from .mxfp import MXFPOpConfig, to_mxfp + + +__all__ = ["MXFPOpConfig", "to_mxfp"] diff --git a/backends/arm/ao_ext/mxfp.py b/backends/arm/ao_ext/mxfp.py new file mode 100644 index 00000000000..783da92590e --- /dev/null +++ b/backends/arm/ao_ext/mxfp.py @@ -0,0 +1,64 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from dataclasses import dataclass +from typing import Callable, Optional + +import torch +from executorch.exir._warnings import experimental +from torchao.core.config import AOBaseConfig +from torchao.prototype.mx_formats.config import ScaleCalculationMode +from torchao.quantization import quantize_ + + +def _match_supported_modules(module: torch.nn.Module, _name: str) -> bool: + """Default filter function that matches supported modules.""" + return isinstance(module, torch.nn.Linear) + + +@experimental("This API is experimental and may change without notice.") +@dataclass +class MXFPOpConfig(AOBaseConfig): + """Configuration for Arm MXFP source transforms.""" + + weight_dtype: torch.dtype = torch.float8_e4m3fn + weight_scaling_mode: ScaleCalculationMode = ScaleCalculationMode.RCEIL + + # Only block size of 32 is currently supported for now, so we hardcode it here. 
+ @property + def block_size(self) -> int: + return 32 + + def __post_init__(self) -> None: + if self.weight_dtype not in (torch.float8_e4m3fn, torch.float8_e5m2): + raise ValueError(f"Unsupported weight_dtype: {self.weight_dtype}") + if not isinstance(self.weight_scaling_mode, ScaleCalculationMode): + raise ValueError( + f"Unsupported weight_scaling_mode: {self.weight_scaling_mode}" + ) + + +@experimental("This API is experimental and may change without notice.") +def to_mxfp( + model: torch.nn.Module, + config: MXFPOpConfig, + filter_fn: Optional[Callable[[torch.nn.Module, str], bool]] = None, +) -> None: + """Convert matching modules in ``model`` to Arm MXFP modules in-place. + + Args: + model (torch.nn.Module): Module to transform. Matching submodules are + replaced in-place. + config (MXFPOpConfig): Configuration controlling the MXFP conversion + behavior. + filter_fn (Optional[Callable[[torch.nn.Module, str], bool]]): Optional + predicate that receives a module and its fully qualified name. When + omitted, all modules supported by the MXFP transform are matched. + + """ + if filter_fn is None: + filter_fn = _match_supported_modules + + quantize_(model, config, filter_fn) diff --git a/backends/arm/ao_ext/mxfp_tosa_lib.py b/backends/arm/ao_ext/mxfp_tosa_lib.py new file mode 100644 index 00000000000..4459ec59126 --- /dev/null +++ b/backends/arm/ao_ext/mxfp_tosa_lib.py @@ -0,0 +1,11 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from torch.library import Library + +# MXFP TOSA library definition for the Arm backend containing. 
+# This library will generate custom ops like the following example: +# torch.ops.tosa_mxfp.linear.default +MXFP_TOSA_LIB = Library("tosa_mxfp", "DEF") diff --git a/backends/arm/ao_ext/mxfp_transform.py b/backends/arm/ao_ext/mxfp_transform.py new file mode 100644 index 00000000000..b7823524475 --- /dev/null +++ b/backends/arm/ao_ext/mxfp_transform.py @@ -0,0 +1,24 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig +from executorch.backends.arm.ao_ext.ops.mxfp_linear_op import transform_linear_to_mxfp +from torchao.quantization.transform_module import register_quantize_module_handler + + +@register_quantize_module_handler(MXFPOpConfig) # type: ignore[misc] +def _transform_to_mxfp( + module: torch.nn.Module, + config: MXFPOpConfig, +) -> torch.nn.Module: + """Transforms a given module to use MXFP operations based on the provided + MXFPOpConfig configuration. + """ + if isinstance(module, torch.nn.Linear): + return transform_linear_to_mxfp(module, config) + else: + return module diff --git a/backends/arm/ao_ext/ops/__init__.py b/backends/arm/ao_ext/ops/__init__.py new file mode 100644 index 00000000000..a690c4b7b02 --- /dev/null +++ b/backends/arm/ao_ext/ops/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +from .mxfp_linear_op import MXFPLinearOp + +__all__ = [ + "MXFPLinearOp", +] diff --git a/backends/arm/ao_ext/ops/mxfp_linear_op.py b/backends/arm/ao_ext/ops/mxfp_linear_op.py new file mode 100644 index 00000000000..5238f85a847 --- /dev/null +++ b/backends/arm/ao_ext/ops/mxfp_linear_op.py @@ -0,0 +1,179 @@ +# Copyright 2026 Arm Limited and/or its affiliates. 
+# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +"""MXFP Linear transform for the Arm backend. + +TorchAO extension for MXFP linear. It replaces ``nn.Linear`` with a wrapper +module that stores precomputed MXFP weights and emits a backend-internal custom +op during export. + +""" + +import torch +import torch.nn.functional as F +from executorch.backends.arm.ao_ext.mxfp import MXFPOpConfig +from executorch.backends.arm.ao_ext.mxfp_tosa_lib import MXFP_TOSA_LIB +from torchao.prototype.mx_formats.config import ScaleCalculationMode +from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx + +MXFP_TOSA_LIB.define( + "linear(Tensor input, Tensor weight_qdata, Tensor weight_scale, " + "Tensor? bias=None, SymInt block_size=32) -> Tensor" +) + + +@torch.library.register_fake("tosa_mxfp::linear", lib=MXFP_TOSA_LIB) # type: ignore[misc] +def _mxfp_linear_fake( + input: torch.Tensor, + weight_qdata: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.Tensor | None = None, + block_size: int = 32, +) -> torch.Tensor: + if weight_qdata.ndim != 3: + raise ValueError( + f"Expected weight_qdata to be rank 3 for linear, got {weight_qdata.ndim}" + ) + if weight_qdata.shape[0] != 1: + raise ValueError( + f"Expected weight_qdata batch dim to be 1, got {weight_qdata.shape[0]}" + ) + if input.shape[-1] != weight_qdata.shape[-1]: + raise ValueError( + f"Input last dim {input.shape[-1]} must match linear in_features " + f"{weight_qdata.shape[-1]}" + ) + expected_scale_shape = ( + 1, + weight_qdata.shape[1], + weight_qdata.shape[-1] // block_size, + ) + if tuple(weight_scale.shape) != expected_scale_shape: + raise ValueError( + f"Expected weight_scale shape {expected_scale_shape}, got " + f"{tuple(weight_scale.shape)}" + ) + output_shape = (*input.shape[:-1], weight_qdata.shape[1]) + return input.new_empty(output_shape, dtype=torch.float32) + + +def _cast_to_block_scaled_cpu_ref( + input: 
torch.Tensor, + output_dtype: torch.dtype, + block_size: int, +) -> torch.Tensor: + """Emulate the current TOSA activation cast in eager mode.""" + input_scale, input_qdata = to_mx( + input.to(torch.float32).contiguous(), + elem_dtype=output_dtype, + block_size=block_size, + scaling_mode=ScaleCalculationMode.RCEIL, + ) + return to_dtype( + input_qdata, + input_scale, + output_dtype, + block_size, + torch.float32, + ) + + +@torch.library.impl("tosa_mxfp::linear", "cpu", lib=MXFP_TOSA_LIB) +def _mxfp_linear_cpu( + input: torch.Tensor, + weight_qdata: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.Tensor | None = None, + block_size: int = 32, +) -> torch.Tensor: + """CPU reference implementation of the MXFP linear op.""" + + if weight_qdata.ndim != 3 or weight_scale.ndim != 3: + raise ValueError("Expected rank-3 weight tensors for MXFP linear") + + # Cast the input to block-scaled format and back again to match the + # expected input format of the TOSA + dequantized_input = _cast_to_block_scaled_cpu_ref( + input, + weight_qdata.dtype, + block_size, + ) + dequantized_weight = to_dtype( + weight_qdata, + weight_scale, + weight_qdata.dtype, + block_size, + torch.float32, + ) + dequantized_weight = dequantized_weight.squeeze(0) + if bias is not None: + bias = bias.to(torch.float32) + return F.linear(dequantized_input, dequantized_weight, bias) + + +class MXFPLinearOp(torch.nn.Module): + """Linear wrapper that stores MXFP weights and emits a custom op.""" + + def __init__( + self, + weight_qdata: torch.Tensor, + weight_scale: torch.Tensor, + bias: torch.Tensor | None, + config: MXFPOpConfig, + ) -> None: + super().__init__() + self.config = config + + self.register_buffer("weight_qdata", weight_qdata, persistent=True) + self.register_buffer("weight_scale", weight_scale, persistent=True) + + self.bias: torch.nn.Parameter | None + bias_param = ( + torch.nn.Parameter(bias.detach(), requires_grad=False) + if bias is not None + else None + ) + 
self.register_parameter( + "bias", + bias_param, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return torch.ops.tosa_mxfp.linear.default( + x, + self.weight_qdata, + self.weight_scale, + self.bias, + self.config.block_size, + ) + + +def transform_linear_to_mxfp( + module: torch.nn.Module, + config: MXFPOpConfig, +) -> torch.nn.Module: + assert isinstance(module, torch.nn.Linear) + + weight = module.weight.detach().contiguous() + if weight.shape[-1] % config.block_size != 0: + raise ValueError( + f"Linear in_features={weight.shape[-1]} must be divisible by " + f"block_size={config.block_size}" + ) + + weight_scale, weight_qdata = to_mx( + weight, + elem_dtype=config.weight_dtype, + block_size=config.block_size, + scaling_mode=config.weight_scaling_mode, + ) + + # The resulting TOSA op MATMUL_T_BLOCK_SCALED only works with tensors of + # rank 3, therefore we prepend a batch dimension of 1 to the weight tensors + # here. + weight_qdata = weight_qdata.unsqueeze(0) + weight_scale = weight_scale.unsqueeze(0) + + bias = module.bias.detach().to(torch.float32) if module.bias is not None else None + return MXFPLinearOp(weight_qdata, weight_scale, bias, config) diff --git a/backends/arm/operators/op_view.py b/backends/arm/operators/op_view.py index ba98f746476..6d399b65801 100644 --- a/backends/arm/operators/op_view.py +++ b/backends/arm/operators/op_view.py @@ -35,24 +35,26 @@ def define_node( inputs: List[TosaArg], output: TosaArg, ) -> None: - supported_dtypes = [ts.DType.BOOL] + supported_dtypes = {ts.DType.BOOL} if self.tosa_spec.support_integer(): - supported_dtypes.extend([ts.DType.INT8, ts.DType.INT16, ts.DType.INT32]) + supported_dtypes.update([ts.DType.INT8, ts.DType.INT16, ts.DType.INT32]) if self.tosa_spec.support_float(): - supported_dtypes.extend([ts.DType.FP16, ts.DType.FP32]) + supported_dtypes.update([ts.DType.FP16, ts.DType.FP32]) if self.tosa_spec.support_extension("bf16"): - supported_dtypes.append(ts.DType.BF16) + 
supported_dtypes.add(ts.DType.BF16) if self.tosa_spec.support_extension("fp8e4m3"): - supported_dtypes.append(ts.DType.FP8E4M3) + supported_dtypes.add(ts.DType.FP8E4M3) if self.tosa_spec.support_extension("fp8e5m2"): - supported_dtypes.append(ts.DType.FP8E5M2) + supported_dtypes.add(ts.DType.FP8E5M2) + if self.tosa_spec.support_extension("mxfp"): + supported_dtypes.update([ts.DType.FP8E4M3, ts.DType.FP8E5M2]) validate_num_inputs(self.target, inputs, 2) validate_same_dtype(self.target, [inputs[0], output], ts) validate_valid_dtype( self.target, [inputs[0], output], - supported_dtypes, + list(supported_dtypes), self.tosa_spec, ) diff --git a/backends/arm/test/misc/test_mxfp_linear_ao.py b/backends/arm/test/misc/test_mxfp_linear_ao.py new file mode 100644 index 00000000000..0f2b6b9198c --- /dev/null +++ b/backends/arm/test/misc/test_mxfp_linear_ao.py @@ -0,0 +1,46 @@ +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch +from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp +from executorch.backends.arm.ao_ext.ops import MXFPLinearOp + +from torch.export import export + + +class LinearModule(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.linear = torch.nn.Linear(32, 8, bias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.linear(x) + + +def test_mxfp_linear_quantize_swaps_module() -> None: + model = LinearModule().eval() + + to_mxfp(model, MXFPOpConfig()) + + assert isinstance(model.linear, MXFPLinearOp) + assert model.linear.weight_qdata.dtype == torch.float8_e4m3fn + assert model.linear.weight_scale.dtype == torch.float8_e8m0fnu + assert tuple(model.linear.weight_qdata.shape) == (1, 8, 32) + assert tuple(model.linear.weight_scale.shape) == (1, 8, 1) + + +def test_mxfp_linear_export_preserves_custom_op() -> None: + model = LinearModule().eval() + to_mxfp(model, MXFPOpConfig()) + + exported = export(model, (torch.randn(4, 32),), strict=False) + + targets = [ + node.target + for node in exported.graph_module.graph.nodes + if node.op == "call_function" + ] + + assert torch.ops.tosa_mxfp.linear.default in targets diff --git a/backends/arm/test/ops/test_mxfp_linear.py b/backends/arm/test/ops/test_mxfp_linear.py new file mode 100644 index 00000000000..da1bbec3b83 --- /dev/null +++ b/backends/arm/test/ops/test_mxfp_linear.py @@ -0,0 +1,226 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# Copyright 2026 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +import copy + +import torch +from executorch.backends.arm.ao_ext import MXFPOpConfig, to_mxfp +from executorch.backends.arm.test import common +from executorch.backends.arm.test.tester.analyze_output_utils import ( + compare_rel_frobenius_and_cosine_similarity, +) + + +def _block_input_rank1() -> torch.Tensor: + """Create a rank-1 input with distinct MXFP activation block scales.""" + + return torch.cat( + ( + 1e-3 * torch.randn(32), + 100.0 * torch.randn(32), + ) + ) + + +def _block_input_rank2() -> torch.Tensor: + """Create a rank-2 input with per-row activation block scale changes.""" + + return torch.stack( + ( + _block_input_rank1(), + torch.cat( + ( + 100.0 * torch.randn(32), + 1e-3 * torch.randn(32), + ) + ), + ) + ) + + +_test_data_rank1_fp = { + "mxfp_linear_rank1_zeros": lambda: ( + torch.zeros(32 * 8), + 5, + True, + False, + ), + "mxfp_linear_rank1_rand": lambda: ( + torch.rand(32), + 16, + False, + False, + ), +} + +_test_data_rank2_fp = { + "mxfp_linear_rank2_zeros": lambda: ( + torch.zeros(4, 32), + 16, + True, + False, + ), + "mxfp_linear_rank2_rand": lambda: ( + torch.rand(4, 32 * 6), + 13, + True, + False, + ), +} + +_test_data_rank3_fp = { + "mxfp_linear_rank3_zeros": lambda: ( + torch.zeros(2, 4, 32 * 3), + 1, + True, + False, + ), + "mxfp_linear_rank3_rand": lambda: ( + torch.rand(2, 4, 32), + 20, + True, + False, + ), +} + +_test_data_rank4_fp = { + "mxfp_linear_rank4_zeros": lambda: ( + torch.zeros(2, 3, 4, 32 * 24), + 8, + True, + False, + ), + "mxfp_linear_rank4_rand": lambda: ( + torch.rand(2, 3, 4, 32 * 32), + 64, + False, + False, + ), +} + +_test_data_block_fp = { + "mxfp_linear_rank1_block_weights": lambda: ( + torch.ones(64), + 4, + False, + True, + ), + "mxfp_linear_rank1_block_weights_block_activations": lambda: ( + _block_input_rank1(), + 4, + False, + True, + ), + "mxfp_linear_rank2_block_weights_block_activations": lambda: ( + _block_input_rank2(), + 4, + False, + True, + ), +} + +test_data_fp = ( + _test_data_rank1_fp + | 
_test_data_rank2_fp + | _test_data_rank3_fp + | _test_data_rank4_fp + | _test_data_block_fp +) + + +class Linear(torch.nn.Module): + def __init__( + self, + in_features: int, + out_features: int = 8, + bias: bool = True, + ) -> None: + super().__init__() + self.fc = torch.nn.Linear( + in_features=in_features, + out_features=out_features, + bias=bias, + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.fc(x) + + def set_block_test_weights(self) -> None: + """Set weights to exercise separate MXFP weight block scales. + + The first two logical 32-wide input blocks use different magnitudes so + tests can verify block scaling does not share one scale across blocks. + + """ + if self.fc.weight.shape[1] < 64: + raise ValueError( + "Block test weights require at least 64 input features (2 blocks), got " + f"{tuple(self.fc.weight.shape)}" + ) + + with torch.no_grad(): + self.fc.weight.zero_() + for row in range(self.fc.weight.shape[0]): + # Small values in the first block. + self.fc.weight[row, 0:32] = 1e-3 + # Large values in the next block to require a different scale. 
+ self.fc.weight[row, 32:64] = 100.0 + if self.fc.bias is not None: + self.fc.bias.zero_() + + +def _is_linear(module: torch.nn.Module, _fqn: str) -> bool: + return isinstance(module, torch.nn.Linear) + + +def _test_mxfp_linear_eager_cpu( + test_data: torch.Tensor, + config: MXFPOpConfig, + frobenius_threshold: float, + cosine_threshold: float, +) -> None: + test_input, out_features, has_bias, set_block_weights = test_data() + in_features = test_input.shape[-1] + ref_model = Linear( + in_features=in_features, + out_features=out_features, + bias=has_bias, + ).eval() + if set_block_weights: + ref_model.set_block_test_weights() + test_model = copy.deepcopy(ref_model).eval() + + to_mxfp(test_model, config, filter_fn=_is_linear) + + test_output = test_model(test_input) + ref_output = ref_model(test_input) + + compare_rel_frobenius_and_cosine_similarity( + ref_output, + test_output, + quantization_parameters=None, + frobenius_threshold=frobenius_threshold, + cosine_threshold=cosine_threshold, + clean_reference=False, + ) + + +@common.parametrize("test_data", test_data_fp) +def test_mxfp_linear_eager_cpu(test_data: torch.Tensor) -> None: + """Check eager MXFP implementation. + + The Arm lowering tests compare lowered output against the eager CPU + implementation, so the eager implementation must be accurate for it to be + used as a reference in other tests. 
+ + """ + _test_mxfp_linear_eager_cpu( + test_data, + MXFPOpConfig(), + frobenius_threshold=0.06, + cosine_threshold=0.995, + ) diff --git a/backends/arm/test/targets.bzl b/backends/arm/test/targets.bzl index 0a3faa6a074..78b0c6a8533 100644 --- a/backends/arm/test/targets.bzl +++ b/backends/arm/test/targets.bzl @@ -25,6 +25,7 @@ def define_arm_tests(): "ops/test_log10.py", "ops/test_max_pool1d.py", "ops/test_mul.py", + "ops/test_mxfp_linear.py", "ops/test_permute.py", "ops/test_rsqrt.py", "ops/test_slice.py", @@ -62,6 +63,7 @@ def define_arm_tests(): "misc/test_bn_relu_folding_qat.py", "misc/test_custom_partition.py", "misc/test_debug_hook.py", + "misc/test_mxfp_linear_ao.py", "misc/test_post_quant_device_switch.py", # "misc/test_dim_order.py", (TODO - T238390249) ] @@ -104,6 +106,7 @@ def define_arm_tests(): "//executorch/backends/arm/test:arm_tester" if runtime.is_oss else "//executorch/backends/arm/test/tester/fb:arm_tester_fb", "//executorch/backends/arm/test:conftest", "//executorch/backends/arm/test/misc:dw_convs_shared_weights_module", + "//executorch/backends/arm:ao_ext", "//executorch/backends/arm:ethosu", "//executorch/backends/arm/tosa:compile_spec", "//executorch/backends/arm/tosa:partitioner", diff --git a/backends/arm/test/tester/analyze_output_utils.py b/backends/arm/test/tester/analyze_output_utils.py index 6a3bbd4d686..c68811eedad 100644 --- a/backends/arm/test/tester/analyze_output_utils.py +++ b/backends/arm/test/tester/analyze_output_utils.py @@ -337,6 +337,24 @@ def dump_error_output( logger.error(f"{atol=}, {rtol=}, {qtol=}") +def calculate_rel_frobenius_and_cosine_similarity( + reference_output: torch.Tensor, + test_output: torch.Tensor, +) -> tuple[float, float]: + reference_output = reference_output.to(torch.float32) + test_output = test_output.to(torch.float32) + + reference_frobenius_norm = torch.linalg.norm(reference_output).item() + error_frobenius_norm = torch.linalg.norm(test_output - reference_output).item() + + 
relative_frobenius_error = error_frobenius_norm / (reference_frobenius_norm + 1e-8) + cosine_similarity = torch.nn.functional.cosine_similarity( + test_output.flatten(), reference_output.flatten(), dim=0 + ).item() + + return relative_frobenius_error, cosine_similarity + + def compare_rel_frobenius_and_cosine_similarity( reference_output: torch.Tensor, test_output: torch.Tensor, @@ -394,15 +412,11 @@ def compare_rel_frobenius_and_cosine_similarity( if reference_all_zeros: return - reference_output = reference_output.to(torch.float32) - test_output = test_output.to(torch.float32) - - reference_frobenius_norm = torch.linalg.norm(reference_output).item() - error_frobenius_norm = torch.linalg.norm(test_output - reference_output).item() - - relative_frobenius_error = error_frobenius_norm / (reference_frobenius_norm + 1e-8) - cosine_similarity = torch.nn.functional.cosine_similarity( - test_output.flatten(), reference_output.flatten(), dim=0 + relative_frobenius_error, cosine_similarity = ( + calculate_rel_frobenius_and_cosine_similarity(reference_output, test_output) + ) + reference_frobenius_norm = torch.linalg.norm( + reference_output.to(torch.float32) ).item() # Relative Frobenius is unstable when the reference norm is at quantization-noise scale. From 0204e36aeecf8a780c601b933d88a02060496ff2 Mon Sep 17 00:00:00 2001 From: roman-janik-nxp Date: Mon, 1 Jun 2026 14:18:22 +0200 Subject: [PATCH 092/103] NXP backend: Enable integer inputs model testing (#19808) ### Summary Enables to test Neutron delegate with int data created by quantization of generated float data and removed input and output quantization nodes. Turns model to int variant. ### Test plan Tests provided. 
cc @robert-kalmar --- backends/nxp/tests/dataset_creator.py | 68 ++++++++ backends/nxp/tests/executorch_pipeline.py | 4 + .../test_quantized_input_data.py | 130 ++++++++++++++ backends/nxp/tests/nsys_testing.py | 164 ++++++++++++------ 4 files changed, 317 insertions(+), 49 deletions(-) create mode 100644 backends/nxp/tests/generic_tests/test_quantized_input_data.py diff --git a/backends/nxp/tests/dataset_creator.py b/backends/nxp/tests/dataset_creator.py index eaf267f4fcf..fdfd363c257 100644 --- a/backends/nxp/tests/dataset_creator.py +++ b/backends/nxp/tests/dataset_creator.py @@ -8,6 +8,7 @@ import shutil from collections import OrderedDict from copy import deepcopy +from dataclasses import dataclass from os import mkdir from random import sample, seed @@ -19,6 +20,7 @@ ) from executorch.backends.nxp.tests.calibration_dataset import CalibrationDataset from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec +from executorch.exir.scalar_type import ScalarType from torch import Tensor @@ -33,6 +35,72 @@ def _get_calibration_and_testing_dataset_directory_names( return calibration_path, test_path +@dataclass +class InputQuantizationSpec: + name: str + scale: float + zp: int + dtype: ScalarType + + +def _replace_input_binary_tensor_with_quantized_variant( + input_bin_tensor_path: str, + input_spec: ModelInputSpec, + q_params: InputQuantizationSpec, +): + tensor = np.fromfile( + input_bin_tensor_path, dtype=torch_type_to_numpy_type(input_spec.dtype) + ) + if q_params.dtype == ScalarType.CHAR: + tensor = np.add(np.round(np.divide(tensor, [q_params.scale])), [q_params.zp]) + tensor = np.clip(tensor, -128, 127).astype(np.int8) + else: + raise ValueError(f"Unknown quantization type: '{q_params.dtype}.") + tensor.tofile(input_bin_tensor_path) + + +def create_quantized_variant_of_dataset( + dataset_dir: str, + dataset_dir_quant: str, + input_quant_spec: list[InputQuantizationSpec], + input_spec: list[ModelInputSpec], +): + """ + Create quantized dataset 
from provided quantization spec. Dataset is cloned from directory 'dataset_dir'. + + :param dataset_dir: Original (float) dataset directory. + :param dataset_dir_quant: Quantized dataset directory. + :param input_quant_spec: Quantization parameters used for dataset quantization. + :param input_spec: Model inputs specification. + """ + assert len(input_quant_spec) > 0 + + shutil.copytree(dataset_dir, dataset_dir_quant, dirs_exist_ok=True) + + if len(input_quant_spec) == 1: + # Single input dataset - quantize only files in dataset's root dir with first input_quant_spec + input_spec = input_spec[0] + input_quant_spec = input_quant_spec[0] + + for file in os.listdir(dataset_dir_quant): + input_bin_tensor_path = os.path.join(dataset_dir_quant, file) + _replace_input_binary_tensor_with_quantized_variant( + input_bin_tensor_path, input_spec, input_quant_spec + ) + else: + # Iterate over samples (subfolders) + for dir_ in os.listdir(dataset_dir_quant): + # Iterate over each input in sample + sample_dir = os.path.join(dataset_dir_quant, dir_) + + for idx, input_ in enumerate(sorted(os.listdir(sample_dir))): + _replace_input_binary_tensor_with_quantized_variant( + os.path.join(sample_dir, input_), + input_spec[idx], + input_quant_spec[idx], + ) + + class DatasetCreator(abc.ABC): @abc.abstractmethod diff --git a/backends/nxp/tests/executorch_pipeline.py b/backends/nxp/tests/executorch_pipeline.py index 8f588be621d..e85a5de4d1b 100644 --- a/backends/nxp/tests/executorch_pipeline.py +++ b/backends/nxp/tests/executorch_pipeline.py @@ -276,6 +276,8 @@ def to_quantized_executorch_program( dataset_dir: str | None = None, delegate_to_npu=True, use_new_flow_neutron_c: bool = False, + operators_not_to_delegate: list[str] = None, + remove_quant_io_ops: bool = False, ) -> ExecutorchProgramManager: if dataset_dir: # Extract calibration data from a directory. 
@@ -295,6 +297,8 @@ def to_quantized_executorch_program( use_neutron_for_format_conversion=use_neutron_for_format_conversion, delegate_to_npu=delegate_to_npu, use_new_flow_neutron_c=use_new_flow_neutron_c, + operators_not_to_delegate=operators_not_to_delegate, + remove_quant_io_ops=remove_quant_io_ops, **get_calibration_inputs_fn, ) diff --git a/backends/nxp/tests/generic_tests/test_quantized_input_data.py b/backends/nxp/tests/generic_tests/test_quantized_input_data.py new file mode 100644 index 00000000000..4d2188816dc --- /dev/null +++ b/backends/nxp/tests/generic_tests/test_quantized_input_data.py @@ -0,0 +1,130 @@ +# Copyright 2026 NXP +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import executorch.backends.nxp.tests.nsys_testing as nsys_testing +import torch + +from executorch.backends.nxp.tests.executorch_pipeline import ModelInputSpec +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from executorch.backends.nxp.tests.models import AvgPool2dModule, MulTensorModule +from executorch.backends.nxp.tests.nsys_testing import ( + lower_run_compare, + OUTPUTS_DIR, + ReferenceModel, +) +from executorch.backends.nxp.tests.ops_aliases import AvgPool2D, MulTensor + + +def test__single_quantized_inputs(mocker): + input_spec = ModelInputSpec((2, 4, 6, 7)) + model = AvgPool2dModule(False, 0) + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={} + ) + output_tensor_spec_spy = mocker.spy(nsys_testing, "_get_program_output_spec") + + lower_run_compare( + model, + [input_spec], + graph_verifier, + use_new_flow_neutron_c=True, + remove_quant_io_ops=True, + ) + + assert ( + OUTPUTS_DIR / "test__single_quantized_inputs" / "dataset_quant" / "0000.bin" + ).exists() + + # Check outputs are in quantized int8 format + output_tensor_spec = output_tensor_spec_spy.spy_return + assert 
output_tensor_spec[0].dtype == torch.int8 + + +def test__single_quantized_inputs_edge_python_reference(mocker): + input_spec = ModelInputSpec((2, 4, 6, 7)) + model = AvgPool2dModule(False, 0) + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={AvgPool2D: 1}, expected_non_delegated_ops={} + ) + output_tensor_spec_spy = mocker.spy(nsys_testing, "_get_program_output_spec") + + lower_run_compare( + model, + [input_spec], + graph_verifier, + reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON, + use_new_flow_neutron_c=True, + remove_quant_io_ops=True, + ) + + assert ( + OUTPUTS_DIR + / "test__single_quantized_inputs_edge_python_reference" + / "dataset_quant" + / "0000.bin" + ).exists() + + # Check outputs are in quantized int8 format + output_tensor_spec = output_tensor_spec_spy.spy_return + assert output_tensor_spec[0].dtype == torch.int8 + + +def test__multiple_quantized_inputs(mocker): + x_input_spec = ModelInputSpec((1, 4, 8, 8)) + model = MulTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={} + ) + output_tensor_spec_spy = mocker.spy(nsys_testing, "_get_program_output_spec") + + lower_run_compare( + model, + [x_input_spec, x_input_spec], + graph_verifier, + use_new_flow_neutron_c=True, + remove_quant_io_ops=True, + ) + + assert ( + OUTPUTS_DIR + / "test__multiple_quantized_inputs" + / "dataset_quant" + / "0000" + / "00.bin" + ).exists() + + # Check outputs are in quantized int8 format + output_tensor_spec = output_tensor_spec_spy.spy_return + assert output_tensor_spec[0].dtype == torch.int8 + + +def test__multiple_quantized_inputs_edge_python_reference(mocker): + x_input_spec = ModelInputSpec((1, 4, 8, 8)) + model = MulTensorModule() + graph_verifier = DetailedGraphVerifier( + mocker, expected_delegated_ops={MulTensor: 1}, expected_non_delegated_ops={} + ) + output_tensor_spec_spy = mocker.spy(nsys_testing, "_get_program_output_spec") + + lower_run_compare( 
+ model, + [x_input_spec, x_input_spec], + graph_verifier, + reference_model=ReferenceModel.QUANTIZED_EDGE_PYTHON, + use_new_flow_neutron_c=True, + remove_quant_io_ops=True, + ) + + assert ( + OUTPUTS_DIR + / "test__multiple_quantized_inputs_edge_python_reference" + / "dataset_quant" + / "0000" + / "00.bin" + ).exists() + + # Check outputs are in quantized int8 format + output_tensor_spec = output_tensor_spec_spy.spy_return + assert output_tensor_spec[0].dtype == torch.int8 diff --git a/backends/nxp/tests/nsys_testing.py b/backends/nxp/tests/nsys_testing.py index 636e1a28a44..ab5a583ede0 100644 --- a/backends/nxp/tests/nsys_testing.py +++ b/backends/nxp/tests/nsys_testing.py @@ -23,7 +23,11 @@ ) from executorch.backends.nxp.neutron_partitioner import NeutronPartitioner from executorch.backends.nxp.tests.config_importer import test_config -from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator +from executorch.backends.nxp.tests.dataset_creator import ( + create_quantized_variant_of_dataset, + InputQuantizationSpec, + RandomDatasetCreator, +) from executorch.backends.nxp.tests.executorch_pipeline import ( get_calibration_inputs_fn_from_dataset_dir, ModelInputSpec, @@ -61,20 +65,7 @@ class ReferenceModel(Enum): FLOAT_PYTORCH_PYTHON = 4 -def _run_delegated_executorch_program( - model, - test_dir, - test_name, - calibration_dataset_dir, - testing_dataset_dir, - input_spec, - dlg_model_verifier, - npu_results_dir, - mocker, - use_qat: bool = False, - train_fn: Callable[[torch.fx.GraphModule], None] | None = None, - use_new_flow_neutron_c: bool = False, -) -> ExportedProgram: +def _get_dataset_cli_args(input_spec: list[ModelInputSpec], testing_dataset_dir): if len(input_spec) == 1: # Single input, use --dataset dataset_cli = "--dataset" @@ -90,14 +81,25 @@ def _run_delegated_executorch_program( ] ) ) + return dataset_cli, dataset_or_inputs - # Run nxp_executor_runner with program delegated to NPU - delegated_model_path = os.path.abspath( - 
os.path.join(test_dir, f"{test_name}_delegated.pte") - ) - delegated_cmd = f"{NEUTRON_TEST_PATH} --model {delegated_model_path} {dataset_cli} {dataset_or_inputs} \ - --output {npu_results_dir} --firmware {NSYS_FIRMWARE_PATH} --nsys {NSYS_PATH} --nsys_config {NSYS_CONFIG_PATH}" +def _run_delegated_executorch_program( + model, + test_dir, + test_name, + calibration_dataset_dir, + testing_dataset_dir, + input_spec, + dlg_model_verifier, + npu_results_dir, + mocker, + use_qat: bool = False, + train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + use_new_flow_neutron_c: bool = False, + operators_not_to_delegate: list[str] = None, + remove_quant_io_ops: bool = False, +) -> tuple[ExportedProgram, str]: try: if mocker: method = getattr(NeutronPartitioner, "partition") # noqa B009 @@ -123,6 +125,8 @@ def wrapper(*args, **kwargs): use_qat=use_qat, train_fn=train_fn, use_new_flow_neutron_c=use_new_flow_neutron_c, + operators_not_to_delegate=operators_not_to_delegate, + remove_quant_io_ops=remove_quant_io_ops, ) except RuntimeError as e: if "Model converted with neutron-converter has" in str(e) and hasattr( @@ -139,9 +143,30 @@ def wrapper(*args, **kwargs): dlg_model_verifier.verify_graph(exported_program.graph) save_pte_program(delegated_program, test_name + "_delegated", test_dir) + + # Preparation of quantized dataset, requires quantization parameters from converted delegated model + if remove_quant_io_ops: + dataset_dir_quant = os.path.join(test_dir, "dataset_quant") + input_quant_spec = _parse_input_quant_params(input_spec, delegated_program) + create_quantized_variant_of_dataset( + testing_dataset_dir, dataset_dir_quant, input_quant_spec, input_spec + ) + testing_dataset_dir = dataset_dir_quant + + dataset_cli, dataset_or_inputs = _get_dataset_cli_args( + input_spec, testing_dataset_dir + ) + + # Run nxp_executor_runner with program delegated to NPU + delegated_model_path = os.path.abspath( + os.path.join(test_dir, f"{test_name}_delegated.pte") + ) + + 
delegated_cmd = f"{NEUTRON_TEST_PATH} --model {delegated_model_path} {dataset_cli} {dataset_or_inputs} \ + --output {npu_results_dir} --firmware {NSYS_FIRMWARE_PATH} --nsys {NSYS_PATH} --nsys_config {NSYS_CONFIG_PATH}" execute_cmd(delegated_cmd) - return exported_program + return exported_program, testing_dataset_dir def _run_non_delegated_executorch_program( @@ -154,31 +179,12 @@ def _run_non_delegated_executorch_program( cpu_results_dir, use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, + remove_quant_io_ops: bool = False, ) -> ExportedProgram: - if len(input_spec) == 1: - # Single input, use --dataset - dataset_cli = "--dataset" - dataset_or_inputs = testing_dataset_dir - else: - # Multiple input, use --inputs with subdirectories - dataset_cli = "--inputs" - dataset_or_inputs = ",".join( - sorted( - [ - os.path.join(testing_dataset_dir, d) - for d in os.listdir(testing_dataset_dir) - ] - ) - ) - - # Run program via nxp_executor_runner on CPU - non_delegated_model_path = os.path.abspath( - os.path.join(test_dir, f"{test_name}_non_delegated.pte") + dataset_cli, dataset_or_inputs = _get_dataset_cli_args( + input_spec, testing_dataset_dir ) - non_delegated_cmd = f"{NEUTRON_TEST_PATH} --model {non_delegated_model_path} {dataset_cli} {dataset_or_inputs} \ - --output {cpu_results_dir} --firmware {NSYS_FIRMWARE_PATH} --nsys {NSYS_PATH} --nsys_config {NSYS_CONFIG_PATH}" - non_delegated_program = to_quantized_executorch_program( model, input_spec, @@ -186,6 +192,7 @@ def _run_non_delegated_executorch_program( delegate_to_npu=False, use_qat=use_qat, train_fn=train_fn, + remove_quant_io_ops=remove_quant_io_ops, ) nodes = list(non_delegated_program.exported_program().graph.nodes) @@ -194,6 +201,14 @@ def _run_non_delegated_executorch_program( ), "Delegated parts found in program executed on CPU!" 
save_pte_program(non_delegated_program, test_name + "_non_delegated", test_dir) + + # Run program via nxp_executor_runner on CPU + non_delegated_model_path = os.path.abspath( + os.path.join(test_dir, f"{test_name}_non_delegated.pte") + ) + + non_delegated_cmd = f"{NEUTRON_TEST_PATH} --model {non_delegated_model_path} {dataset_cli} {dataset_or_inputs} \ + --output {cpu_results_dir} --firmware {NSYS_FIRMWARE_PATH} --nsys {NSYS_PATH} --nsys_config {NSYS_CONFIG_PATH}" execute_cmd(non_delegated_cmd) return non_delegated_program.exported_program() @@ -229,9 +244,9 @@ def read_prepared_samples( bin_file_path = os.path.join( sample_dir, f"{str(spec_idx).zfill(2)}.bin" ) - sample_vector = np.fromfile(bin_file_path, dtype=spec.type).reshape( - spec.shape - ) + sample_vector = np.fromfile( + bin_file_path, dtype=torch_type_to_numpy_type(spec.dtype) + ).reshape(spec.shape) current_samples.append(sample_vector) all_samples.append(tuple(current_samples)) @@ -385,6 +400,8 @@ def lower_run_compare( use_qat: bool = False, train_fn: Callable[[torch.fx.GraphModule], None] | None = None, use_new_flow_neutron_c: bool = False, + operators_not_to_delegate: list[str] = None, + remove_quant_io_ops: bool = False, ): """ Run provided program twice with neutron-test and check if results correspond. At first, @@ -402,6 +419,10 @@ def lower_run_compare( :param use_qat: If True, applies quantization-aware training before conversion (without the QAT training). :param train_fn: Train/finetune function for QAT training. Is used only when `use_qat=True`. :param use_new_flow_neutron_c: Enable experimental MLIR-based flow for Neutron-C with improved INT8 operator support. + :param operators_not_to_delegate: list of operators not to delegate. + :param remove_quant_io_ops: If true, IO q-ops are removed and verification is done on quantized + version of dataset (quantized INT8 input samples). 
+ """ assert_NSYS() @@ -430,7 +451,7 @@ def lower_run_compare( cpu_results_dir = os.path.join(test_dir, "results_cpu") npu_results_dir = os.path.join(test_dir, "results_npu") - delegated_program = _run_delegated_executorch_program( + delegated_program, testing_dataset_dir = _run_delegated_executorch_program( model_to_delegate, test_dir, test_name, @@ -443,6 +464,8 @@ def lower_run_compare( use_qat=use_qat, train_fn=train_fn, use_new_flow_neutron_c=use_new_flow_neutron_c, + operators_not_to_delegate=operators_not_to_delegate, + remove_quant_io_ops=remove_quant_io_ops, ) output_spec = _get_program_output_spec(delegated_program) @@ -461,6 +484,7 @@ def lower_run_compare( cpu_results_dir, use_qat=use_qat, train_fn=train_fn, + remove_quant_io_ops=remove_quant_io_ops, ) case ReferenceModel.QUANTIZED_EDGE_PYTHON: @@ -475,10 +499,19 @@ def lower_run_compare( delegate_to_npu=False, use_qat=use_qat, train_fn=train_fn, + remove_quant_io_ops=remove_quant_io_ops, ) .exported_program() .module() ) + # Switch input spec dtype to quantized int8 if run with remove_quant_io_ops flag + # The input spec has to still have float32 dtype during edge program lowering to correctly calibrate the + # model. When running in Python, the testing data are loaded from numpy tensors according to input spec. + # There the testing data are in quantized int8 dtype. + if remove_quant_io_ops: + for spec in input_spec: + spec.dtype = torch.int8 + _run_python_program( non_delegated_edge_program, testing_dataset_dir, @@ -489,6 +522,12 @@ def lower_run_compare( ) case ReferenceModel.FLOAT_PYTORCH_PYTHON: + if remove_quant_io_ops: + raise ValueError( + "Flag remove_quant_io_ops is not applicable to FLOAT_PYTORCH_PYTHON reference model" + "as it works with float data only. Run with remove_quant_io_ops=False." + ) + # Run the PyTorch nn.Module directly in Python. 
_run_python_program( model_to_not_delegate, @@ -561,7 +600,7 @@ def lower_run_compare_ptq_qat( ptq_results_dir = os.path.join(test_dir, "results_ptq") qat_results_dir = os.path.join(test_dir, "results_qat") - delegated_program_ptq = _run_delegated_executorch_program( + delegated_program_ptq, _ = _run_delegated_executorch_program( model_ptq, test_dir, test_name, @@ -597,12 +636,39 @@ def lower_run_compare_ptq_qat( ) +def _parse_input_quant_params( + input_spec: tuple[ModelInputSpec, ...], exported_program_manager +) -> list[InputQuantizationSpec]: + """ + Parse input quantization params from provided exported program manager. + + :param input_spec: Model inputs specification. + :param exported_program_manager: Exported program manager of parsed model. + :return: List of input quantization specification. + """ + if (config_methods := exported_program_manager._config_methods) is None: + raise ValueError("Attempt to parse q-params for not fully quantized model") + + q_params = [] + + for idx in range(len(input_spec)): + input_name = f"input{idx}" + scale = config_methods[f"{input_name}_scale"] + zp = config_methods[f"{input_name}_zp"] + dtype = config_methods[f"{input_name}_dtype"] + + q_params.append(InputQuantizationSpec(input_name, scale, zp, dtype)) + + return q_params + + def _get_caller_name(): test_function_names = ["lower_run_compare", "lower_run_compare_ptq_qat"] for idx, frame in enumerate(inspect.stack()): if frame.function in test_function_names: # Look one index above to get caller return inspect.stack()[idx + 1].function + return None def execute_cmd(cmd, cwd="."): From a072513a967ef4a373a63d1b1c2e8e96b86e0673 Mon Sep 17 00:00:00 2001 From: Vaclav Novak Date: Mon, 1 Jun 2026 14:50:25 +0200 Subject: [PATCH 093/103] NXP backend: added support for `slice` using new Neutron flow (#19803) ### Summary Added support for `aten.slice` using new Neutron flow. 
### Test plan tests can be manually run using `pytest -c /dev/null backends/nxp/tests/` cc @robert-kalmar @JakeStevens @digantdesai @rascani @MartinPavella @roman-janik-nxp @jirioc @irtrukhina @StrycekSimon --- .../ops_converters/slice_tensor_converter.py | 31 ++ .../test_slice_tensor_converter.py | 370 +++++++++++++++++- 2 files changed, 394 insertions(+), 7 deletions(-) diff --git a/backends/nxp/backend/ir/converter/node_converters/ops_converters/slice_tensor_converter.py b/backends/nxp/backend/ir/converter/node_converters/ops_converters/slice_tensor_converter.py index f2002cc311c..f5df822b6ad 100644 --- a/backends/nxp/backend/ir/converter/node_converters/ops_converters/slice_tensor_converter.py +++ b/backends/nxp/backend/ir/converter/node_converters/ops_converters/slice_tensor_converter.py @@ -4,6 +4,7 @@ # LICENSE file in the root directory of this source tree. import numpy as np +import torch from executorch.backends.nxp.backend.data_format import NXP_NODE_FORMAT from executorch.backends.nxp.backend.edge_helper import input_tensor from executorch.backends.nxp.backend.ir.converter.conversion import translator @@ -31,6 +32,15 @@ def _is_supported_on_target( parameters_mapping: dict[str, Parameter], custom_delegation_options: CustomDelegationOptions, ) -> bool: + if custom_delegation_options.use_new_flow_neutron_c: + supported_types = [torch.int8, torch.uint8] + if not NodeConverter.uses_quantization_type_for_io( + node, supported_types, [0], [0] + ): + return False + + return True + input_shape = input_tensor(node, 0).shape dim = node.args[1] if node.args[0].meta[NXP_NODE_FORMAT].is_channels_first(): @@ -94,6 +104,23 @@ def _convert_to_slice(self, t_op, main_input, input_rank, dim, start, end) -> No size[dim] = max(end - start, 0) begin[dim] = start + # In the new Neutron flow, slicing can be done along any dim, so + # no additional `transpose` ops have to be added. 
+ if self.context.custom_delegation_options.use_new_flow_neutron_c: + begin_tensor = self.builder.create_tensor_for_data( + np.asarray(begin, np.int32), "begin" + ) + size_tensor = self.builder.create_tensor_for_data( + np.asarray(size, np.int32), "size" + ) + + t_op.tmp_inputs = [main_input, begin_tensor, size_tensor] + t_op.builtin_options = slice_options.Slice() + ops = OpsList(middle_op=t_op) + + self.builder.append_operators(ops.flatten()) + return None + # We can slice only the channels dimension # So we swap the sliced dimension with the channels dimension begin[-1], begin[dim] = begin[dim], begin[-1] @@ -131,6 +158,10 @@ def _get_clipped_slice_args(node: Node) -> tuple[Dim, Start, End]: _, dim, start, end = node.args sliced_tensor_rank = input_shape[dim] + # convert numbering `from the end` to `from the beginning`, ie. normalize + end = end + sliced_tensor_rank if end < 0 else end + start = start + sliced_tensor_rank if start < 0 else start + end = int(np.clip(end, 0, sliced_tensor_rank)) start = int(np.clip(start, 0, sliced_tensor_rank)) diff --git a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py index 78886558ba2..39fa900ca55 100644 --- a/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py +++ b/backends/nxp/tests/ir/converter/node_converter/test_slice_tensor_converter.py @@ -8,6 +8,7 @@ from executorch.backends.nxp.backend.edge_program_converter import ( EdgeProgramToIRConverter, ) +from executorch.backends.nxp.tests.dataset_creator import RandomDatasetCreator from executorch.backends.nxp.tests.executorch_pipeline import to_quantized_edge_program from executorch.backends.nxp.tests.executors import ( convert_run_compare, @@ -15,12 +16,22 @@ ToChannelFirstPreprocess, ToChannelLastPreprocess, ) +from executorch.backends.nxp.tests.graph_verifier import DetailedGraphVerifier +from 
executorch.backends.nxp.tests.model_output_comparator import ( + AllCloseOutputComparator, +) from executorch.backends.nxp.tests.models import ( SliceTensorConvModule, SliceTensorModule, ) -from executorch.exir.dialects._ops import ops as exir_ops +from executorch.backends.nxp.tests.nsys_testing import lower_run_compare +from executorch.backends.nxp.tests.ops_aliases import ( + Convolution, + ExecutorchDelegateCall, + Slice, + SliceCopy, +) from torch.export import ExportedProgram @@ -30,11 +41,6 @@ def reseed_model_per_test_run(): np.random.seed(23) -ExecutorchDelegateCall = torch.ops.higher_order.executorch_call_delegate -Slice = exir_ops.edge.aten.slice.Tensor -SliceCopy = exir_ops.edge.aten.slice_copy.Tensor - - passing_cases = [ pytest.param((24, 32), (0, 1), (0, 16), (24, 32), id="2D, no transpose"), pytest.param( @@ -238,7 +244,7 @@ def test_slice_tensor_w_conv_quant_conversion( (24, 32), (0, 1), (0, 32), (24, 32), id="2D, start is equal to size" ), pytest.param( - (24, 32), (0, 1), (0, 0), (24, -5), id="2D, clipped end equal to zero" + (24, 32), (0, 1), (0, 0), (24, -35), id="2D, clipped end equal to zero" ), pytest.param( (24, 32), (0, 1), (64, 0), (24, 32), id="2D, clipped start equal to size" @@ -298,3 +304,353 @@ def test_slice_not_delegated(mocker, x_input_shape, dims, starts, ends): for i in range(0, num_slice_ops): slice_idx = (i + 1) * 3 assert nodes[slice_idx].target in [Slice, SliceCopy] + + +class TestSliceTensorConverterNewNeutronFlow: + @staticmethod + def _slice_id(prefix, input_shape, dims, starts, ends): + return f"{prefix}rank={len(input_shape)}_dims={str(dims)}_starts={str(starts)}_ends={str(ends)}" + + @staticmethod + def assert_delegated_and_correct(model, input_shape, num_slices, mocker, use_qat): + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops={SliceCopy: num_slices}, + expected_non_delegated_ops={}, + ) + dataset = RandomDatasetCreator(low=-255.0, high=255.0) + comparator = AllCloseOutputComparator() + + 
lower_run_compare( + model, + input_shape, + graph_verifier, + dataset, + comparator, + use_new_flow_neutron_c=True, + use_qat=use_qat, + ) + + @staticmethod + def assert_model_without_slices(model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + # Check there are no slices and nothing is delegated + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert not graph_contains_any_of_ops(delegated_ep.graph, [Slice, SliceCopy]) + + @staticmethod + def assert_not_delegated(model, input_shape): + delegated_ep = to_quantized_edge_program( + model, input_shape, use_new_flow_neutron_c=True + ).exported_program() + + # Make sure the `slice` was NOT delegated. + assert not graph_contains_any_of_ops( + delegated_ep.graph, [ExecutorchDelegateCall] + ) + assert graph_contains_any_of_ops(delegated_ep.graph, [Slice, SliceCopy]) + + @pytest.mark.parametrize( + "input_shape, dims, starts, ends", + [ + pytest.param( + ins := (5, 2, 3, 4), + d := (0,), + s := (1,), + e := (4,), + id=_slice_id("basic, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (5, 5, 3, 4), + d := (0, 1), + s := (1, 1), + e := (4, 3), + id=_slice_id("basic, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (7, 13, 5, 15), + d := (0, 1, 2, 3), + s := (4, 3, 1, 8), + e := (5, 10, 4, 11), + id=_slice_id("basic, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (5, 13, 5, 13), + d := (0, 1, 2, 3), + s := (0, 0, 0, 0), + e := (4, 11, 4, 11), + id=_slice_id("basic, right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (7, 13, 3, 15), + d := (0, 1, 2, 3), + s := (2, 5, 1, 4), + e := ins, + id=_slice_id("basic, left trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (7, 4, 7), + d := (0, 1, 2), + s := (1, 1, 3), + e := (6, 3, 5), + id=_slice_id("basic, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins 
:= (4, 5, 9), + d := (0, 1, 2), + s := (0, 0, 0), + e := (3, 4, 7), + id=_slice_id("basic, right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (4, 7, 9), + d := (0, 1, 2), + s := (3, 2, 2), + e := ins, + id=_slice_id("basic, left trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (4, 5), + d := (0, 1), + s := (1, 1), + e := (2, 4), + id=_slice_id("basic, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (4, 5), + d := (0, 1), + s := (0, 0), + e := (2, 4), + id=_slice_id("basic, right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (4, 5), + d := (0, 1), + s := (1, 2), + e := ins, + id=_slice_id("basic, left trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (5,), + d := (0,), + s := (1,), + e := (4,), + id=_slice_id("basic, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (5,), + d := (0,), + s := (0,), + e := (4,), + id=_slice_id("basic, right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (5,), + d := (0,), + s := (1,), + e := ins, + id=_slice_id("basic, left trimmed:", ins, d, s, e), + ), + ], + ) + def test_nsys_inference__basic(self, input_shape, dims, starts, ends, mocker): + model = SliceTensorModule(dims, starts, ends) + + num_slices = len(dims) + self.assert_delegated_and_correct( + model, input_shape, num_slices, mocker, use_qat=False + ) + + @pytest.mark.parametrize( + "input_shape, dims, starts, ends", + [ + pytest.param( + ins := (4, 2, 7, 4), + d := (2,), + s := (5,), + e := (6,), + id=_slice_id("edge case, dimension reduced to 1:", ins, d, s, e), + ), + pytest.param( + ins := (11, 2, 7, 5), + d := (2,), + s := (6,), + e := (6,), + id=_slice_id("edge case, dimension reduced to 0:", ins, d, s, e), + ), + ], + ) + def test_nsys_inference__reduction(self, input_shape, dims, starts, ends, mocker): + model = SliceTensorModule(dims, starts, ends) + + slice_lengths = [e - s for s, e in zip(starts, ends)] + if all(sl == 0 for sl in slice_lengths): + # reductions to 0 are disabled 
in the backend + self.assert_not_delegated(model, input_shape) + else: + num_slices = len(dims) + self.assert_delegated_and_correct( + model, input_shape, num_slices, mocker, use_qat=False + ) + + @pytest.mark.parametrize( + "input_shape, dims, starts, ends", + [ + pytest.param( + ins := (5, 2, 3, 4), + d := (0,), + s := (-12,), + e := (2,), + id=_slice_id("edge case, `start` clipped:", ins, d, s, e), + ), + pytest.param( + ins := (5, 7, 5, 7), + d := (0,), + s := (1,), + e := (12,), + id=_slice_id("edge case, `end` clipped:", ins, d, s, e), + ), + ], + ) + def test_nsys_inference__clipped(self, input_shape, dims, starts, ends, mocker): + model = SliceTensorModule(dims, starts, ends) + + num_slices = len(dims) + self.assert_delegated_and_correct( + model, input_shape, num_slices, mocker, use_qat=False + ) + + @pytest.mark.parametrize( + "input_shape, dims, starts, ends", + [ + pytest.param( + ins := (5, 11, 13, 3), + d := (1,), + s := (-5,), + e := (10,), + id=_slice_id("edge case, `start` normalized:", ins, d, s, e), + ), + pytest.param( + ins := (7, 15, 5, 7), + d := (1,), + s := (2,), + e := (-2,), + id=_slice_id("edge case, `end` normalized:", ins, d, s, e), + ), + ], + ) + def test_nsys_inference__normalization( + self, input_shape, dims, starts, ends, mocker + ): + model = SliceTensorModule(dims, starts, ends) + + num_slices = len(dims) + self.assert_delegated_and_correct( + model, input_shape, num_slices, mocker, use_qat=False + ) + + @pytest.mark.parametrize( + "input_shape, dims, starts, ends", + [ + pytest.param( + ins := (5000, 3, 5, 3), + d := (0,), + s := (1250,), + e := (2500,), + id=_slice_id("big args, left and right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (2, 5000, 5, 3), + d := (1,), + s := (0,), + e := (4999,), + id=_slice_id("big args, right trimmed:", ins, d, s, e), + ), + pytest.param( + ins := (2, 3, 5000, 3), + d := (2,), + s := (1,), + e := (5000,), + id=_slice_id("big args, left trimmed:", ins, d, s, e), + ), + ], + ) + def 
test_nsys_inference__big(self, input_shape, dims, starts, ends, mocker): + model = SliceTensorModule(dims, starts, ends) + + num_slices = len(dims) + self.assert_delegated_and_correct( + model, input_shape, num_slices, mocker, use_qat=False + ) + + @pytest.mark.parametrize( + "input_shape, dims, starts, ends", + [ + pytest.param( + ins := (5, 2, 3, 4), + d := (2,), + s := (0,), + e := (3,), + id=_slice_id("edge case, one dimension identity:", ins, d, s, e), + ), + pytest.param( + ins := (5, 2, 3, 4), + d := (0, 1, 2, 3), + s := (0, 0, 0, 0), + e := ins, + id=_slice_id("edge case, all dimensions identity:", ins, d, s, e), + ), + ], + ) + def test_nsys_inference__identity(self, input_shape, dims, starts, ends): + model = SliceTensorModule(dims, starts, ends) + + self.assert_model_without_slices(model, input_shape) + + def test_nsys_inference__with_conv(self, mocker): + input_shape = (11, 13, 5, 7) + in_channels = input_shape[1] + out_channels = 19 + + # we test functionality on `channels` dim + dims = (1,) + starts = (2,) + ends = (out_channels - 2,) + model = SliceTensorConvModule(dims, starts, ends, in_channels, out_channels) + + num_slices = len(dims) + graph_verifier = DetailedGraphVerifier( + mocker, + expected_delegated_ops={SliceCopy: num_slices}, + expected_non_delegated_ops={Convolution: 1}, + ) + dataset = RandomDatasetCreator(low=-255.0, high=255.0) + comparator = AllCloseOutputComparator() + + lower_run_compare( + model, + input_shape, + graph_verifier, + dataset, + comparator, + use_new_flow_neutron_c=True, + use_qat=False, + ) + + def test_nsys_inference__qat(self, mocker): + input_shape = (7, 13, 7, 9) + dims = (0, 1, 2, 3) + starts = (1, 2, 3, 2) + ends = (6, 10, 5, 8) + + model = SliceTensorModule(dims, starts, ends) + + num_slices = len(dims) + self.assert_delegated_and_correct( + model, input_shape, num_slices, mocker, use_qat=True + ) From 10431b98a14876e018812c70d59eea6403101ba0 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Mon, 1 Jun 2026 
08:24:01 -0700 Subject: [PATCH 094/103] Suppress cppcheck unusedFunction false positives in headers (#19890) ### Summary cppcheck's unusedFunction is a whole-program check, but lintrunner analyzes files individually. Functions defined in headers are used by the .cpp files that include them, but cppcheck only sees the header in isolation and falsely reports them as never used. Suppress the check for .h/.hpp files while keeping it active for .cpp. Authored with assistance from Claude. --- .lintrunner.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.lintrunner.toml b/.lintrunner.toml index 02380ce1356..75608704110 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -134,6 +134,8 @@ command = [ '--extra-arg=--inconclusive', '--extra-arg=--suppress=unusedStructMember', '--extra-arg=--suppress=toomanyconfigs', + '--extra-arg=--suppress=unusedFunction:*.h', + '--extra-arg=--suppress=unusedFunction:*.hpp', '--', '@{{PATHSFILE}}' ] From 4469d84647266db3f7c6b76068d56f26020eb435 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Mon, 1 Jun 2026 17:25:52 +0200 Subject: [PATCH 095/103] Add executorch-ubuntu-26.04-gcc15 docker image (#19799) ### Summary Add a docker build image based on Ubuntu 26.04 with gcc 15. It's necessary for the baremetal on RISC-V use case since `libstdc++-riscv64-unknown-elf-picolibc` is only available starting Ubuntu 26.04. It also makes sure that `gcc-riscv64-unknown-elf` is at least gcc 14+ which has support for RVV. ### Test plan It will be used by the baremetal testing on RISC-V.
Relates to https://github.com/pytorch/executorch/issues/18991 https://github.com/pytorch/executorch/issues/19666 --- .ci/docker/build.sh | 5 +++++ .ci/docker/common/install_docs_reqs.sh | 4 ++-- .github/workflows/docker-builds.yml | 1 + 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index 123680e5275..673b5b4fd4b 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -89,6 +89,11 @@ case "${IMAGE_NAME}" in OS_VERSION=24.04 GCC_VERSION=14 ;; + executorch-ubuntu-26.04-gcc15) + LINTRUNNER="" + OS_VERSION=26.04 + GCC_VERSION=15 + ;; *) echo "Invalid image name ${IMAGE_NAME}" exit 1 diff --git a/.ci/docker/common/install_docs_reqs.sh b/.ci/docker/common/install_docs_reqs.sh index 3b6d10c5c2b..ea54d90523e 100755 --- a/.ci/docker/common/install_docs_reqs.sh +++ b/.ci/docker/common/install_docs_reqs.sh @@ -15,8 +15,8 @@ if [ -n "$BUILD_DOCS" ]; then curl --retry 3 --retry-all-errors -sL https://deb.nodesource.com/setup_16.x | sudo -E bash - sudo apt-get install -y nodejs - curl --retry 3 --retry-all-errors -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo apt-key add - - echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list + curl --retry 3 --retry-all-errors -sS https://dl.yarnpkg.com/debian/pubkey.gpg | sudo gpg --dearmor -o /usr/share/keyrings/yarn-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/yarn-archive-keyring.gpg] https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list apt-get update apt-get install -y --no-install-recommends yarn diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index b77e5497f79..d11b2e9e6d9 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -43,6 +43,7 @@ jobs: executorch-ubuntu-22.04-mediatek-sdk, executorch-ubuntu-22.04-clang12-android, executorch-ubuntu-24.04-gcc14, + executorch-ubuntu-26.04-gcc15, ] 
include: - docker-image-name: executorch-ubuntu-22.04-gcc11-aarch64 From 00d01735f729489166236c28cf316b1f14e5183d Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Sat, 23 May 2026 15:17:26 +0200 Subject: [PATCH 096/103] Add baremetal RISC-V smoke tests (rv32, rv64) Cross-compiles with riscv64-unknown-elf + picolibc, embeds the .bpte into the ELF, and runs under qemu-system-riscv{32,64} -machine virt with semihosting carrying stdout and exit status. Same bundled-IO PASS criterion as the existing linux runs. --- .ci/scripts/setup-linux.sh | 2 +- .ci/scripts/test_riscv_qemu.sh | 50 ++- .github/workflows/_test_riscv.yml | 57 ++-- .github/workflows/riscv64.yml | 42 ++- CMakePresets.json | 20 +- examples/riscv/README.md | 51 ++-- examples/riscv/aot_riscv.py | 40 ++- examples/riscv/baremetal/CMakeLists.txt | 117 +++++++ .../baremetal/executor_runner_baremetal.cpp | 286 ++++++++++++++++++ examples/riscv/baremetal/riscv_virt.ld | 85 ++++++ examples/riscv/baremetal/semihosting.h | 51 ++++ examples/riscv/baremetal/start.S | 49 +++ .../riscv/riscv32-unknown-elf-toolchain.cmake | 74 +++++ .../riscv/riscv64-unknown-elf-toolchain.cmake | 77 +++++ examples/riscv/run.sh | 246 +++++++++++---- examples/riscv/setup-baremetal.sh | 49 +++ examples/riscv/{setup.sh => setup-linux.sh} | 11 +- examples/riscv/test-matrix.sh | 250 +++++++++++++++ tools/cmake/preset/riscv_baremetal.cmake | 50 +++ ...{riscv64_linux.cmake => riscv_linux.cmake} | 0 20 files changed, 1446 insertions(+), 161 deletions(-) create mode 100644 examples/riscv/baremetal/CMakeLists.txt create mode 100644 examples/riscv/baremetal/executor_runner_baremetal.cpp create mode 100644 examples/riscv/baremetal/riscv_virt.ld create mode 100644 examples/riscv/baremetal/semihosting.h create mode 100644 examples/riscv/baremetal/start.S create mode 100644 examples/riscv/riscv32-unknown-elf-toolchain.cmake create mode 100644 examples/riscv/riscv64-unknown-elf-toolchain.cmake create mode 100755 examples/riscv/setup-baremetal.sh rename 
examples/riscv/{setup.sh => setup-linux.sh} (90%) create mode 100644 examples/riscv/test-matrix.sh create mode 100644 tools/cmake/preset/riscv_baremetal.cmake rename tools/cmake/preset/{riscv64_linux.cmake => riscv_linux.cmake} (100%) diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index feb8a128b17..275a93d797e 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -set -exu +set -eu # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" diff --git a/.ci/scripts/test_riscv_qemu.sh b/.ci/scripts/test_riscv_qemu.sh index 2842542aa3a..0e5b44d97c2 100755 --- a/.ci/scripts/test_riscv_qemu.sh +++ b/.ci/scripts/test_riscv_qemu.sh @@ -4,10 +4,9 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# CI wrapper: install RISC-V cross-compile + qemu-user tooling, then run the -# RISC-V smoke test (export, cross-compile, qemu-user execution) via -# examples/riscv/run.sh. The bundled-IO comparison and Test_result: PASS -# check are done by run.sh. +# CI wrapper: install riscv32/64 cross-compile + qemu tooling, then drive +# examples/riscv/run.sh which does the export, cross-compile, qemu run, and +# bundled-IO PASS check. 
set -eu @@ -15,29 +14,41 @@ script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")") et_root_dir=$(realpath "${script_dir}/../..") model="add" -xnnpack=false +backend="portable" quantize=false +os="linux" +arch="rv64" +qemu_cpu_ext="" verbose_xnnpack=false debug_xnnpack=false +build_dir= usage() { cat < Which model to export and run (default: add) - --xnnpack Enable the XNNPACK backend (AOT partitioner + runtime) - --quantize Produce an 8-bit quantized model - --verbose-xnnpack Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch - --debug-xnnpack Enable XNNPACK partitioner DEBUG logging and dump the lowered graph - -h, --help Show this help + --model= Which model to export and run (default: ${model}) + --quantize Produce an 8-bit quantized model + --backend= AOT backend (portable|xnnpack) (default: ${backend}) + --os= Target OS (linux|baremetal) (default: ${os}) + --arch= Target arch (rv32|rv64) (default: ${arch}) + --qemu-cpu-ext= QEMU -cpu extensions (no rv32/rv64 prefix, default: none) + --build-dir= Build/output directory for this configuration (required) + --verbose-xnnpack Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch + --debug-xnnpack Enable XNNPACK partitioner DEBUG logging and dump the lowered graph + -h, --help Show this help EOF } for arg in "$@"; do case $arg in --model=*) model="${arg#*=}" ;; - --xnnpack) xnnpack=true ;; --quantize) quantize=true ;; + --backend=*) backend="${arg#*=}" ;; + --os=*) os="${arg#*=}" ;; + --arch=*) arch="${arg#*=}" ;; + --qemu-cpu-ext=*) qemu_cpu_ext="${arg#*=}" ;; + --build-dir=*) build_dir="${arg#*=}" ;; --debug-xnnpack) debug_xnnpack=true ;; --verbose-xnnpack) verbose_xnnpack=true ;; -h|--help) usage; exit 0 ;; @@ -45,9 +56,13 @@ for arg in "$@"; do esac done +if [[ -z "${build_dir}" ]]; then + echo "[test_riscv_qemu.sh] --build-dir is required" >&2; usage; exit 1 +fi + run_extra_args=() -if ${xnnpack}; then - run_extra_args+=(--xnnpack) +if [ -n "${qemu_cpu_ext}" ]; then + 
run_extra_args+=(--qemu-cpu-ext="${qemu_cpu_ext}") fi if ${quantize}; then run_extra_args+=(--quantize) @@ -59,5 +74,8 @@ if ${verbose_xnnpack}; then run_extra_args+=(--verbose-xnnpack) fi -bash "${et_root_dir}/examples/riscv/setup.sh" -bash "${et_root_dir}/examples/riscv/run.sh" --model="${model}" "${run_extra_args[@]}" +bash "${et_root_dir}/examples/riscv/setup-${os}.sh" +bash "${et_root_dir}/examples/riscv/run.sh" \ + --model="${model}" --backend="${backend}" --os="${os}" --arch="${arch}" \ + --build-dir="${build_dir}" \ + "${run_extra_args[@]}" diff --git a/.github/workflows/_test_riscv.yml b/.github/workflows/_test_riscv.yml index 223a146e3d8..0b7d8472d8b 100644 --- a/.github/workflows/_test_riscv.yml +++ b/.github/workflows/_test_riscv.yml @@ -13,35 +13,44 @@ on: type: number default: 30 model: - description: 'Which model to run. Possible values are: add, mv2 (mobilenetv2)' + description: 'Which model to run (add, mv2, mobilebert, llama2, resnet18, yolo26)' required: false type: string default: 'add' - xnnpack: - description: 'Whether to enable XNNPACK' - required: false - type: boolean - default: false quantize: description: 'Produce an 8-bit quantized model' required: false type: boolean default: false - qemu-cpu: - description: 'Configuration(s) for the CPU to emulate with QEMU, expecting a JSON array' - required: true + backend: + description: 'AOT backend to lower to (portable|xnnpack)' + required: false type: string - docker-image: - description: 'The docker image to use for this job' + default: 'portable' + os: + description: 'Target OS for the runner (linux|baremetal)' required: false type: string + default: 'linux' + arch: + description: 'Target architecture (rv32|rv64)' + required: false + type: string + default: 'rv64' + qemu-cpu-ext: + description: >- + JSON array of QEMU -cpu *extension* strings (no rv32/rv64 prefix). + The script splices each entry with `arch` to form the final -cpu + value. Use [""] for plain base-ISA runs. 
+ required: true + type: string jobs: run: uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.2xlarge - docker-image: ci-image:executorch-ubuntu-24.04-gcc14 + docker-image: ${{ inputs.os == 'linux' && 'ci-image:executorch-ubuntu-24.04-gcc14' || 'ci-image:executorch-ubuntu-26.04-gcc15' }} submodules: 'recursive' ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} timeout: ${{ inputs.timeout }} @@ -55,20 +64,26 @@ jobs: # Allows failure in `echo | jq | while read` pipeline to bubble up and fail the workflow set -o pipefail - echo '${{ inputs.qemu-cpu }}' | jq -r '.[]' | while IFS= read -r qemu_cpu; do - export QEMU_CPU="${qemu_cpu}" - export GCC_VERSION=14 + echo '${{ inputs.qemu-cpu-ext }}' | jq -r '.[]' | while IFS= read -r qemu_cpu_ext; do + variant_slug="${qemu_cpu_ext//,/_}"; variant_slug="${variant_slug//=/_}"; variant_slug="${variant_slug:-base}" + build_dir="riscv_test/${{ inputs.model }}${{ inputs.quantize && '_q' || '' }}/${{ inputs.backend }}/${{ inputs.os }}-${{ inputs.arch }}-${variant_slug}" + bash .ci/scripts/test_riscv_qemu.sh \ --model="${{ inputs.model }}" \ - ${{ inputs.xnnpack && '--xnnpack --verbose-xnnpack' || '' }} \ + --backend="${{ inputs.backend }}" \ + --os="${{ inputs.os }}" \ + --arch="${{ inputs.arch }}" \ + --qemu-cpu-ext="${qemu_cpu_ext}" \ + --build-dir="${build_dir}" \ + ${{ inputs.backend == 'xnnpack' && '--verbose-xnnpack' || '' }} \ ${{ inputs.quantize && '--quantize' || '' }} - # We only generate riscv_test/${{ inputs.model }}_riscv.etdump.json from `--verbose-xnnpack`. - if ${{ inputs.xnnpack }}; then - # Generate markdown table from riscv_test/${{ inputs.model }}_riscv.etdump.json, sorted by sum_ms + # We only generate run.etdump.json from `--verbose-xnnpack`. 
+ if [[ "${{ inputs.backend }}" == "xnnpack" ]]; then + # Generate markdown table from ${build_dir}/run.etdump.json, sorted by sum_ms ( - etdump_json="riscv_test/${{ inputs.model }}_riscv.etdump.json" - echo "### Model=${{ inputs.model }} XNNPACK=${{ inputs.xnnpack }} Quantize=${{ inputs.quantize }} QEMU_CPU='${QEMU_CPU}'" + etdump_json="${build_dir}/run.etdump.json" + echo "### Model=${{ inputs.model }} Quantize=${{ inputs.quantize }} Backend=${{ inputs.backend }} OS=${{ inputs.os }} Arch=${{ inputs.arch }}${qemu_cpu_ext:+,${qemu_cpu_ext}}" jq -r ' def r3: (. * 1000 | round) / 1000; ["Section","Op","Count","Sum (ms)","Avg (ms)","Max (ms)","Microkernels"], diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml index a7a5273e2b0..d6109a47305 100644 --- a/.github/workflows/riscv64.yml +++ b/.github/workflows/riscv64.yml @@ -10,8 +10,9 @@ on: pull_request: paths: - .github/workflows/riscv64.yml + - .github/workflows/_test_riscv.yml - .ci/scripts/test_riscv_qemu.sh - - tools/cmake/preset/riscv64_linux.cmake + - tools/cmake/preset/riscv64_*.cmake - examples/riscv/** workflow_dispatch: schedule: @@ -35,33 +36,42 @@ jobs: - llama2 - resnet18 - yolo26 - xnnpack: [true, false] quantize: [true, false] + backend: [portable, xnnpack] + os: [linux, baremetal] + arch: [rv64, rv32] exclude: - # We only enable quantization with XNNPACK - - xnnpack: false - quantize: true - # We don't test quantization for Yolo26 - - model: yolo26 - quantize: true + # Disable quantization testing with Portable Kernels + - { backend: portable, quantize: true } + # XNNPACK needs pthreads + dynamic loading (no baremetal) + - { backend: xnnpack, os: baremetal } + # No quantization recipe for Yolo26. + - { model: yolo26, quantize: true } + # No riscv32-linux-gnu cross is packaged on Ubuntu. 
+ - { os: linux, arch: rv32 } permissions: id-token: write contents: read with: model: ${{ matrix.model }} - xnnpack: ${{ matrix.xnnpack }} quantize: ${{ matrix.quantize }} - # If XNNPACK, test with multiple RVV length, disabled otherwise - qemu-cpu: >- + backend: ${{ matrix.backend }} + os: ${{ matrix.os }} + arch: ${{ matrix.arch }} + # JSON array of QEMU -cpu *extension* strings (no rv32/rv64 prefix - that + # comes from `arch`). The script splices them as `,`. xnnpack + # benefits from RVV so it sweeps multiple vlen; everything else just uses + # the plain base ISA. + qemu-cpu-ext: >- ${{ case( - matrix.xnnpack, '[ - "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=128,elen=64,vext_spec=v1.0", - "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=256,elen=64,vext_spec=v1.0", - "rv64,zba=true,zbb=true,zbs=true,v=true,vlen=512,elen=64,vext_spec=v1.0" + matrix.backend == 'xnnpack', '[ + "v=true,vext_spec=v1.0,vlen=128", + "v=true,vext_spec=v1.0,vlen=256", + "v=true,vext_spec=v1.0,vlen=512" ]', '[ - "rv64,zba=true,zbb=true,zbs=true,v=false" + "v=false" ]' ) }} diff --git a/CMakePresets.json b/CMakePresets.json index 91848565067..15d005cbede 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -318,7 +318,7 @@ "displayName": "Build ExecuTorch for riscv64 Linux (cross-compile)", "inherits": ["common"], "cacheVariables": { - "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv64_linux.cmake", + "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv_linux.cmake", "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/riscv/riscv64-linux-gnu-toolchain.cmake" }, "condition": { @@ -327,6 +327,24 @@ "rhs": "Linux" } }, + { + "name": "riscv64-baremetal", + "displayName": "Build ExecuTorch for riscv64 baremetal (cross-compile)", + "inherits": ["common"], + "cacheVariables": { + "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv_baremetal.cmake", + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/riscv/riscv64-unknown-elf-toolchain.cmake" 
+ } + }, + { + "name": "riscv32-baremetal", + "displayName": "Build ExecuTorch for riscv32 baremetal (cross-compile)", + "inherits": ["common"], + "cacheVariables": { + "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/riscv_baremetal.cmake", + "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/examples/riscv/riscv32-unknown-elf-toolchain.cmake" + } + }, { "name": "mlx", "displayName": "Build MLX delegate", diff --git a/examples/riscv/README.md b/examples/riscv/README.md index 563ff4913fd..2c250f75cd7 100644 --- a/examples/riscv/README.md +++ b/examples/riscv/README.md @@ -1,41 +1,36 @@ # RISC-V -Cross-compile `executor_runner` for `riscv64-linux-gnu` and run it under -`qemu-user-static` against a small bundled program. The end-to-end check -mirrors the Arm Cortex-M e2e flow: a `Test_result: PASS` line in stdout from -the bundled-IO comparison path is the pass criterion. +End-to-end smoke tests that cross-compile ExecuTorch for RISC-V and run a bundled program under QEMU. A `Test_result: PASS` line emitted by the bundled-IO comparison path is the pass criterion. -This is the Phase 1 deliverable for the RISC-V Support RFC at -[pytorch/executorch#18991][rfc]. The cross-compile and runner artifacts -(toolchain file, preset, AOT script) are designed to carry over unchanged -to a hardware-runner job once one becomes available; only the invocation -step (qemu-user vs. native) would change. - -[rfc]: https://github.com/pytorch/executorch/issues/18991 +Part of the RISC-V Support RFC, [pytorch/executorch#18991](https://github.com/pytorch/executorch/issues/18991). 
## Quick start (Ubuntu / Debian) ```bash -examples/riscv/setup.sh # apt: gcc-riscv64-linux-gnu, qemu-user-static -examples/riscv/run.sh # export, cross-compile, run under qemu-user +examples/riscv/setup-linux.sh # apt: gcc cross riscv64-linux-gnu + qemu-user +examples/riscv/setup-baremetal.sh # apt: gcc cross riscv64-unknown-elf + qemu-system + picolibc +examples/riscv/run.sh # export, cross-compile, run under qemu ``` -The driver does three steps: +`run.sh` accepts: + +| Flag | Values | Default | Notes | +|---|---|---|---| +| `--model=` | `add`, `mv2`, `mobilebert`, `llama2`, `resnet18`, `yolo26` | `add` | which model to export | +| `--quantize` | flag | off | XNNPACK quantizer (requires `--backend=xnnpack`) | +| `--backend=` | `portable`, `xnnpack` | `portable` | xnnpack is linux-only | +| `--os=` | `linux`, `baremetal` | `linux` | qemu-user vs qemu-system + semihosting | +| `--arch=` | `rv64` | `rv64` | (rv32 follow-up; no `riscv32-linux-gnu` cross is packaged on Ubuntu) | +| `--qemu-cpu-ext=` | e.g. `v=true,vlen=128` | empty | extensions appended after the arch base | + +## Pipelines + +**linux**: `aot_riscv.py` → `cmake --preset riscv64-linux` → `executor_runner` under `qemu-riscv64`. Portable kernels + (optional) XNNPACK delegate. + +**baremetal**: `aot_riscv.py` → `cmake -S examples/riscv/baremetal` (standalone project; pulls executorch in via `add_subdirectory`) → `executor_runner_baremetal.elf` under `qemu-system-riscv64 -machine virt -bios none -semihosting-config target=native`. -1. `python examples/riscv/aot_riscv.py` exports a `torch.add` module to - `riscv_test/add_riscv.bpte` (a BundledProgram with reference outputs - embedded for two test cases). -2. `cmake --preset riscv64-linux` configures the cross-build using - `examples/riscv/riscv64-linux-gnu-toolchain.cmake` and - `tools/cmake/preset/riscv64_linux.cmake`. `executor_runner` is built - against portable kernels with `ET_BUNDLE_IO_ENABLED` defined. -3. 
`qemu-riscv64-static` invokes the runner with `--model_path` pointing at - the `.bpte`. The runner detects the bundle, runs every embedded test case, - and emits `Test_result: PASS` (or `FAIL`) per case. +The baremetal runner embeds the `.bpte` directly in `.rodata` via the same `examples/arm/executor_runner/pte_to_header.py` Cortex-M uses; semihosting SYS_WRITE0 / SYS_EXIT carry log output and exit status to the host. ## CI -`.github/workflows/_test_riscv_qemu.yml` is a reusable `workflow_call` -job (mirroring `_test_cortex_m_e2e.yml`) invoked from `pull.yml` to run on -every PR. It runs on the standard `linux.2xlarge` x86_64 runner using the -`executorch-ubuntu-22.04-gcc11` docker image. +`.github/workflows/riscv64.yml` is the entry point; it fans out into `_test_riscv.yml` over a `(model, backend, os, arch, quantize)` matrix and sweeps `qemu-cpu-ext` per backend. Linux jobs run on the `executorch-ubuntu-24.04-gcc14` docker image; baremetal jobs run on the `executorch-ubuntu-26.04-gcc15` docker image (needed for the `riscv64-unknown-elf` picolibc + libstdc++ packages - see [setup-baremetal.sh](setup-baremetal.sh)). diff --git a/examples/riscv/aot_riscv.py b/examples/riscv/aot_riscv.py index edc30c2653b..e01fe6f954e 100644 --- a/examples/riscv/aot_riscv.py +++ b/examples/riscv/aot_riscv.py @@ -3,11 +3,12 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -"""AOT export for the RISC-V smoke test. +"""AOT export for the RISC-V smoke tests. -Exports a small model to a BundledProgram (.bpte) that the portable -executor_runner can load on a riscv64 target and verify against the embedded -reference output, emitting ``Test_result: PASS`` on success. +Exports the model selected by ``--model`` to a BundledProgram (.bpte) that +either ``executor_runner`` (linux) or ``executor_runner_baremetal`` (qemu +virt + semihosting) consumes. The bundled-IO comparison path inside the +runner emits ``Test_result: PASS`` per testset, which is what run.sh greps. 
""" import argparse @@ -171,9 +172,19 @@ def main() -> None: help="Output .bpte path (default: _riscv.bpte)", ) parser.add_argument( - "--xnnpack", - action="store_true", - help="Lower through the XNNPACK partitioner", + "--backend", + choices=("portable", "xnnpack"), + default="portable", + help="AOT backend: 'portable' runs everything on the portable kernels, " + "'xnnpack' adds the XNNPACK partitioner (default: portable)", + ) + parser.add_argument( + "--os", + choices=("linux", "baremetal"), + default="linux", + help="Target OS for the runner that will consume this .bpte. The .bpte " + "itself is OS-independent; the flag is logged so callers can verify " + "the AOT/runtime sides agree (default: linux)", ) parser.add_argument( "--quantize", @@ -187,6 +198,13 @@ def main() -> None: ) args = parser.parse_args() + if args.debug_xnnpack and args.backend != "xnnpack": + parser.error("--debug-xnnpack requires --backend=xnnpack") + + # xnnpack pulls in pthreads + dynamic loading; baremetal runner doesn't have those. 
+ if args.os == "baremetal" and args.backend == "xnnpack": + parser.error("--backend=xnnpack is not supported on --os=baremetal") + if args.debug_xnnpack: logging.basicConfig(level=logging.DEBUG) @@ -209,7 +227,7 @@ def main() -> None: exported = export(model, example_inputs, strict=strict) partitioners = [] - if args.xnnpack: + if args.backend == "xnnpack": from executorch.backends.xnnpack.partition.xnnpack_partitioner import ( XnnpackPartitioner, ) @@ -223,7 +241,9 @@ def main() -> None: compile_config = EdgeCompileConfig(_check_ir_validity=False) edge = to_edge_transform_and_lower( - exported, partitioner=partitioners, compile_config=compile_config + exported, + partitioner=partitioners, + compile_config=compile_config, ) delegated = sum( 1 @@ -231,7 +251,7 @@ def main() -> None: if n.op == "call_function" and "call_delegate" in str(n.target) ) print( - f"[aot_riscv] model={args.model} xnnpack={args.xnnpack} " + f"[aot_riscv] model={args.model} backend={args.backend} os={args.os} " f"quantize={args.quantize} delegated_nodes={delegated}" ) diff --git a/examples/riscv/baremetal/CMakeLists.txt b/examples/riscv/baremetal/CMakeLists.txt new file mode 100644 index 00000000000..b7765c4e3a1 --- /dev/null +++ b/examples/riscv/baremetal/CMakeLists.txt @@ -0,0 +1,117 @@ +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Standalone runner project, invoked from examples/riscv/run.sh as: +# ~~~ +# cmake -S examples/riscv/baremetal -B \ +# -DEXECUTORCH_ROOT= \ +# -DRISCV_BAREMETAL_PTE=.bpte \ +# -DCMAKE_TOOLCHAIN_FILE=.../riscv{32,64}-unknown-elf-toolchain.cmake +# ~~~ +# Mirrors examples/arm/executor_runner/standalone/CMakeLists.txt so the +# top-level executorch CMake has no reference to examples/riscv/. 
+ +cmake_minimum_required(VERSION 3.20) +project(riscv_executor_runner_baremetal LANGUAGES C CXX ASM) + +get_filename_component( + _default_executorch_root "${CMAKE_CURRENT_LIST_DIR}/../../.." ABSOLUTE +) +if(NOT DEFINED EXECUTORCH_ROOT) + set(EXECUTORCH_ROOT + "${_default_executorch_root}" + CACHE PATH "Path to the ExecuTorch checkout" + ) +endif() +if(NOT EXISTS "${EXECUTORCH_ROOT}/CMakeLists.txt") + message( + FATAL_ERROR + "EXECUTORCH_ROOT (${EXECUTORCH_ROOT}) does not contain an ExecuTorch CMake project." + ) +endif() + +set(RISCV_BAREMETAL_PTE + "" + CACHE FILEPATH "Path to the .bpte to embed in the baremetal runner" +) +if(NOT RISCV_BAREMETAL_PTE) + message( + FATAL_ERROR + "RISCV_BAREMETAL_PTE not set; pass -DRISCV_BAREMETAL_PTE= from run.sh" + ) +endif() + +include("${EXECUTORCH_ROOT}/tools/cmake/common/preset.cmake") +if(NOT DEFINED EXECUTORCH_BUILD_PRESET_FILE) + set(EXECUTORCH_BUILD_PRESET_FILE + "${EXECUTORCH_ROOT}/tools/cmake/preset/riscv64_baremetal.cmake" + CACHE PATH "Preset used when configuring the standalone baremetal runner" + ) +endif() +load_build_preset() +include("${EXECUTORCH_ROOT}/tools/cmake/Utils.cmake") + +add_subdirectory( + "${EXECUTORCH_ROOT}" "${CMAKE_BINARY_DIR}/executorch" EXCLUDE_FROM_ALL +) + +find_package(Python3 REQUIRED COMPONENTS Interpreter) + +set(_pte_header "${CMAKE_CURRENT_BINARY_DIR}/model_pte.h") +add_custom_command( + OUTPUT "${_pte_header}" + COMMAND + "${Python3_EXECUTABLE}" + "${EXECUTORCH_ROOT}/examples/arm/executor_runner/pte_to_header.py" --pte + "${RISCV_BAREMETAL_PTE}" --outdir "${CMAKE_CURRENT_BINARY_DIR}" --outfile + "model_pte.h" --section ".rodata.model_pte" + DEPENDS "${RISCV_BAREMETAL_PTE}" + COMMENT "Embedding ${RISCV_BAREMETAL_PTE} into model_pte.h" + VERBATIM +) + +# pte_to_header.py emits the byte array but not its length; the glue TU +# materialises the matching `model_pte_len` and is the only place the header is +# included (avoids a double-definition at link time). 
+file( + WRITE "${CMAKE_CURRENT_BINARY_DIR}/model_pte_glue.cpp" + "#include \n#include \"model_pte.h\"\nextern \"C\" const size_t model_pte_len = sizeof(model_pte);\n" +) + +add_executable( + executor_runner_baremetal + start.S executor_runner_baremetal.cpp + "${CMAKE_CURRENT_BINARY_DIR}/model_pte_glue.cpp" "${_pte_header}" +) +set_target_properties( + executor_runner_baremetal PROPERTIES SUFFIX ".elf" LINKER_LANGUAGE CXX +) +target_include_directories( + executor_runner_baremetal PRIVATE "${CMAKE_CURRENT_BINARY_DIR}" +) +target_compile_options( + executor_runner_baremetal PRIVATE -fno-exceptions -fno-rtti -fdata-sections + -ffunction-sections +) +# --specs=picolibc.specs / -nostartfiles / -march / -mabi all come from the +# toolchain file; only the linker script (QEMU virt memory map) is target- +# specific here. +target_link_options( + executor_runner_baremetal PRIVATE + "-T${CMAKE_CURRENT_SOURCE_DIR}/riscv_virt.ld" +) + +# gen_operators_lib / executorch_target_link_options_shared_lib attach INTERFACE +# --whole-archive options to portable_ops_lib (so the static-init +# kernel-registration TU survives DCE) and to executorch itself. Listing the +# libs once each is enough; an extra --whole-archive wrapper around them would +# include the same archive twice and double-register every op. +target_link_libraries(executor_runner_baremetal PRIVATE bundled_program) +if(TARGET portable_ops_lib) + target_link_libraries(executor_runner_baremetal PRIVATE portable_ops_lib) +endif() +if(TARGET portable_kernels) + target_link_libraries(executor_runner_baremetal PRIVATE portable_kernels) +endif() diff --git a/examples/riscv/baremetal/executor_runner_baremetal.cpp b/examples/riscv/baremetal/executor_runner_baremetal.cpp new file mode 100644 index 00000000000..d0bb128bd98 --- /dev/null +++ b/examples/riscv/baremetal/executor_runner_baremetal.cpp @@ -0,0 +1,286 @@ +/* + * Copyright 2026 The ExecuTorch Authors. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Baremetal runner for qemu-system-riscv64 -machine virt + semihosting. Loads +// a .bpte embedded into the ELF and emits "TEST: BundleIO index[N] +// Test_result: PASS|FAIL" via ET_LOG so examples/riscv/run.sh's grep can +// detect success without a host filesystem. + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "semihosting.h" + +extern "C" const uint8_t model_pte[]; +extern "C" const size_t model_pte_len; + +using executorch::extension::BufferDataLoader; +using executorch::runtime::Error; +using executorch::runtime::HierarchicalAllocator; +using executorch::runtime::MemoryAllocator; +using executorch::runtime::MemoryManager; +using executorch::runtime::Method; +using executorch::runtime::MethodMeta; +using executorch::runtime::Program; +using executorch::runtime::Result; +using executorch::runtime::Span; + +namespace { + +// Pools are sized for the largest model we currently test (llama2 / yolo26) +// rather than per-model; the .bss grows but freestanding picolibc never +// allocates from it so the cost is just a bigger ELF. Bumping these requires +// matching headroom in riscv_virt.ld's RAM region and qemu's -m flag. +alignas(16) uint8_t method_allocator_pool[1u << 23]; // 8 MiB +alignas(16) uint8_t temp_allocator_pool[1u << 22]; // 4 MiB +alignas(16) uint8_t planned_memory_pool[1u << 26]; // 64 MiB + +constexpr size_t kMaxPlannedBuffers = 8; +constexpr double kRtol = 0.01; +constexpr double kAtol = 0.01; + +} // namespace + +extern "C" [[noreturn]] void baremetal_exit(int status) { + executorch::riscv::baremetal::semihost_exit(status); +} + +// picolibc's abort()/raise() resolve _exit; with our own start.S we don't +// link its crt0, so reroute it to the semihosting trap. 
+extern "C" [[noreturn]] void _exit(int status) { + executorch::riscv::baremetal::semihost_exit(status); +} + +// libstdc++'s drags std::random_device → getentropy/read. The portable +// rand kernels are never invoked at runtime for our bundled-IO tests, so a +// failing stub is enough to satisfy the link. +extern "C" int getentropy(void*, size_t) { + return -1; +} +extern "C" long read(int, void*, size_t) { + return -1; +} + +// Virtual destructors emit deleting variants that reference operator delete +// even when we never new/delete. Stubs satisfy the linker; never called. +void operator delete(void*) noexcept {} +void operator delete(void*, size_t) noexcept {} +void operator delete[](void*) noexcept {} +void operator delete[](void*, size_t) noexcept {} + +// op_rand / op_native_dropout / op_randn from portable_kernels reference +// std::random_device::_M_{init,getval,fini}, whose only definitions live in +// libstdc++.a's medlow-built random.o (won't relocate at 0x80000000). The +// bundled-IO smoke tests never invoke those ops, so satisfy the linker with +// no-op trampolines under the Itanium-mangled names. +asm(R"( + .globl _ZNSt13random_device7_M_initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE + .type _ZNSt13random_device7_M_initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE, @function +_ZNSt13random_device7_M_initERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE: + ret + + .globl _ZNSt13random_device9_M_getvalEv + .type _ZNSt13random_device9_M_getvalEv, @function +_ZNSt13random_device9_M_getvalEv: + li a0, 0 + ret + + .globl _ZNSt13random_device7_M_finiEv + .type _ZNSt13random_device7_M_finiEv, @function +_ZNSt13random_device7_M_finiEv: + ret +)"); + +// Route ET_LOG through semihosting. Messages aren't null-terminated; copy and +// append \n\0 before forwarding to SYS_WRITE0. 
+extern "C" void et_pal_emit_log_message( + et_timestamp_t, + et_pal_log_level_t, + const char*, + const char*, + size_t, + const char* message, + size_t length) { + // The bundle doesn't expose a testset count, so we probe past the end and + // rely on InvalidArgument to terminate the loop. The accompanying ET_LOG + // ("testset_idx N is out of range ...") is benign noise — suppress it so + // run.sh's PASS/FAIL grep stays clean. + static const char kOorPrefix[] = "testset_idx "; + if (length >= sizeof(kOorPrefix) - 1 && + std::memcmp(message, kOorPrefix, sizeof(kOorPrefix) - 1) == 0) { + return; + } + char buf[512]; + size_t n = length < sizeof(buf) - 2 ? length : sizeof(buf) - 2; + std::memcpy(buf, message, n); + buf[n] = '\n'; + buf[n + 1] = '\0'; + executorch::riscv::baremetal::semihost_write0(buf); +} + +extern "C" void et_pal_init(void) {} +extern "C" [[noreturn]] void et_pal_abort(void) { + executorch::riscv::baremetal::semihost_exit(1); +} +extern "C" et_timestamp_t et_pal_current_ticks(void) { + return 0; +} +extern "C" et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) { + return {1, 1}; +} +extern "C" void* et_pal_allocate(size_t) { + return nullptr; +} +extern "C" void et_pal_free(void*) {} + +int main() { + executorch::runtime::runtime_init(); + + const void* program_data = nullptr; + size_t program_size = 0; + Error status = executorch::bundled_program::get_program_data( + const_cast(model_pte), + model_pte_len, + &program_data, + &program_size); + if (status != Error::Ok) { + ET_LOG( + Error, "get_program_data failed: 0x%x", static_cast(status)); + return 1; + } + + BufferDataLoader loader(program_data, program_size); + Result program = Program::load(&loader); + if (!program.ok()) { + ET_LOG( + Error, + "Program::load failed: 0x%x", + static_cast(program.error())); + return 1; + } + + // The harness always exports a single "forward" method. 
Skipping the + // Result deref of program->get_method_name(0) sidesteps a + // codegen wedge we hit under -mcmodel=medany + picolibc. + const char* method_name = "forward"; + ET_LOG(Info, "Using method %s", method_name); + + Result method_meta = program->method_meta(method_name); + if (!method_meta.ok()) { + ET_LOG( + Error, + "method_meta failed: 0x%x", + static_cast(method_meta.error())); + return 1; + } + + MemoryAllocator method_allocator( + sizeof(method_allocator_pool), method_allocator_pool); + MemoryAllocator temp_allocator( + sizeof(temp_allocator_pool), temp_allocator_pool); + + // One span per planned buffer, bumped through a single .bss arena so we + // don't need a heap. kMaxPlannedBuffers / pool size both grow with bigger + // models; failures here are loud rather than silent. + Span planned_spans[kMaxPlannedBuffers]; + size_t num_planned = method_meta->num_memory_planned_buffers(); + if (num_planned > kMaxPlannedBuffers) { + ET_LOG( + Error, + "num_planned=%zu exceeds kMaxPlannedBuffers=%zu", + num_planned, + kMaxPlannedBuffers); + return 1; + } + size_t offset = 0; + for (size_t id = 0; id < num_planned; ++id) { + size_t sz = + static_cast(method_meta->memory_planned_buffer_size(id).get()); + sz = (sz + 15u) & ~15u; + if (offset + sz > sizeof(planned_memory_pool)) { + ET_LOG( + Error, + "planned buffer %zu (size %zu) overflows pool (%zu/%zu)", + id, + sz, + offset, + sizeof(planned_memory_pool)); + return 1; + } + planned_spans[id] = Span(planned_memory_pool + offset, sz); + offset += sz; + } + HierarchicalAllocator planned_memory( + Span>(planned_spans, num_planned)); + MemoryManager memory_manager( + &method_allocator, &planned_memory, &temp_allocator); + + Result method = program->load_method(method_name, &memory_manager); + if (!method.ok()) { + ET_LOG( + Error, + "load_method failed: 0x%x", + static_cast(method.error())); + return 1; + } + + // load_bundled_input returns InvalidArgument past the last testset; that's + // how we detect the loop 
terminator (the bundle has no public count API). + int rc = 0; + for (size_t testset_idx = 0;; ++testset_idx) { + Error load = executorch::bundled_program::load_bundled_input( + *method, const_cast(model_pte), testset_idx); + if (load != Error::Ok) { + if (testset_idx == 0) { + ET_LOG( + Error, + "load_bundled_input failed for testset 0: 0x%x", + static_cast(load)); + rc = 1; + } + break; + } + Error exec = method->execute(); + if (exec != Error::Ok) { + ET_LOG( + Error, + "execute failed for testset %zu: 0x%x", + testset_idx, + static_cast(exec)); + ET_LOG(Error, "TEST: BundleIO index[%zu] Test_result: FAIL", testset_idx); + rc = 1; + continue; + } + Error verify = executorch::bundled_program::verify_method_outputs( + *method, const_cast(model_pte), testset_idx, kRtol, kAtol); + if (verify == Error::Ok) { + ET_LOG(Info, "TEST: BundleIO index[%zu] Test_result: PASS", testset_idx); + } else { + ET_LOG( + Error, + "verify_method_outputs failed for testset %zu: 0x%x", + testset_idx, + static_cast(verify)); + ET_LOG(Error, "TEST: BundleIO index[%zu] Test_result: FAIL", testset_idx); + rc = 1; + } + } + + return rc; +} diff --git a/examples/riscv/baremetal/riscv_virt.ld b/examples/riscv/baremetal/riscv_virt.ld new file mode 100644 index 00000000000..34980116b1d --- /dev/null +++ b/examples/riscv/baremetal/riscv_virt.ld @@ -0,0 +1,85 @@ +/* + * Copyright 2026 The ExecuTorch Authors. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* qemu-system-riscv{32,64} -machine virt -bios none -kernel: the virt board's + * reset stub at 0x1000 jumps to DRAM base 0x80000000, so _start has to live + * there. RAM size matches the qemu `-m 512M` we pass from run.sh — the + * embedded .bpte in .rodata can be tens of MB for mv2 / llama2 / yolo26. 
*/ + +OUTPUT_ARCH(riscv) +ENTRY(_start) + +MEMORY +{ + RAM (rwx) : ORIGIN = 0x80000000, LENGTH = 512M +} + +SECTIONS +{ + .text 0x80000000 : + { + KEEP(*(.text.boot)) + *(.text .text.*) + } > RAM + + .rodata : ALIGN(8) + { + *(.rodata .rodata.*) + *(.srodata .srodata.*) + } > RAM + + /* C++ global ctors. start.S calls picolibc's __libc_init_array, which + * walks symbols __bothinit_array_start..__bothinit_array_end (preinit + + * init combined). The stock newlib names (__init_array_start/end) are + * defined too for portability, but it's the "both" pair picolibc reads. */ + .bothinit_array : ALIGN(8) + { + PROVIDE_HIDDEN(__bothinit_array_start = .); + PROVIDE_HIDDEN(__preinit_array_start = .); + KEEP(*(.preinit_array)) + PROVIDE_HIDDEN(__preinit_array_end = .); + PROVIDE_HIDDEN(__init_array_start = .); + KEEP(*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP(*(.init_array EXCLUDE_FILE(*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o) .ctors)) + PROVIDE_HIDDEN(__init_array_end = .); + PROVIDE_HIDDEN(__bothinit_array_end = .); + } > RAM + .fini_array : ALIGN(8) + { + PROVIDE_HIDDEN(__fini_array_start = .); + KEEP(*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP(*(.fini_array EXCLUDE_FILE(*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o) .dtors)) + PROVIDE_HIDDEN(__fini_array_end = .); + } > RAM + + .data : ALIGN(8) + { + *(.data .data.*) + *(.sdata .sdata.*) + } > RAM + + .bss : ALIGN(8) + { + _bss_start = .; + *(.bss .bss.*) + *(.sbss .sbss.*) + *(COMMON) + . = ALIGN(8); + _bss_end = .; + } > RAM + + /* 2 MiB stack at the high end of RAM; grows downward. picolibc's sbrk + * looks up __heap_start / __heap_end (double-underscore). */ + . = ALIGN(16); + PROVIDE(__heap_start = .); + . = ORIGIN(RAM) + LENGTH(RAM) - 2M; + PROVIDE(__heap_end = .); + . = . 
+ 2M; + _stack_top = .; + + /DISCARD/ : { *(.note.* .comment .eh_frame .riscv.attributes) } +} diff --git a/examples/riscv/baremetal/semihosting.h b/examples/riscv/baremetal/semihosting.h new file mode 100644 index 00000000000..7af63048d29 --- /dev/null +++ b/examples/riscv/baremetal/semihosting.h @@ -0,0 +1,51 @@ +/* + * Copyright 2026 The ExecuTorch Authors. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include + +namespace executorch { +namespace riscv { +namespace baremetal { + +// The RISC-V semihosting trigger is a fixed three-insn sequence (slli/ebreak/ +// srai of x0) so qemu can distinguish it from a normal ecall. Op number in +// a0, arg pointer in a1, return value back in a0. +inline long semihost_call(long op, const void* arg) { + register long a0 asm("a0") = op; + register long a1 asm("a1") = (long)arg; + asm volatile( + ".option push\n\t" + ".option norvc\n\t" + "slli x0, x0, 0x1f\n\t" + "ebreak\n\t" + "srai x0, x0, 0x7\n\t" + ".option pop" + : "+r"(a0) + : "r"(a1) + : "memory"); + return a0; +} + +constexpr long SYS_WRITE0 = 0x04; +constexpr long SYS_EXIT_EXTENDED = 0x20; + +inline void semihost_write0(const char* s) { + semihost_call(SYS_WRITE0, s); +} + +[[noreturn]] inline void semihost_exit(int status) { + // ADP_Stopped_ApplicationExit (0x20026) + status, per the semihosting spec. + long block[2] = {0x20026, (long)status}; + semihost_call(SYS_EXIT_EXTENDED, block); + __builtin_trap(); +} + +} // namespace baremetal +} // namespace riscv +} // namespace executorch diff --git a/examples/riscv/baremetal/start.S b/examples/riscv/baremetal/start.S new file mode 100644 index 00000000000..092eeffa4a6 --- /dev/null +++ b/examples/riscv/baremetal/start.S @@ -0,0 +1,49 @@ +/* + * Copyright 2026 The ExecuTorch Authors. 
+ * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Boot stub for the qemu virt RISC-V baremetal runner: set sp, enable FPU, +// zero .bss, run C++ static ctors via __libc_init_array, jump to main. On +// return, call baremetal_exit so qemu terminates deterministically. + +#if __riscv_xlen == 64 +#define SX sd +#define XLEN_BYTES 8 +#else +#define SX sw +#define XLEN_BYTES 4 +#endif + + .section .text.boot, "ax" + .globl _start + .type _start, @function +_start: + la sp, _stack_top + + // mstatus.FS resets to Off in M-mode, so any FP insn (libstdc++ template + // code emits fsd/fld) traps. We have no trap vector, so the CPU would + // loop on the fault. FS=Dirty (0b11 in bits 13-14) keeps the FPU live. + li t0, 0x6000 + csrs mstatus, t0 + + la a0, _bss_start + la a1, _bss_end +1: + bgeu a0, a1, 2f + SX zero, 0(a0) + addi a0, a0, XLEN_BYTES + j 1b +2: + call __libc_init_array + li a0, 0 + li a1, 0 + call main + call baremetal_exit +3: + wfi + j 3b + + .size _start, .-_start diff --git a/examples/riscv/riscv32-unknown-elf-toolchain.cmake b/examples/riscv/riscv32-unknown-elf-toolchain.cmake new file mode 100644 index 00000000000..ae968ea6fe2 --- /dev/null +++ b/examples/riscv/riscv32-unknown-elf-toolchain.cmake @@ -0,0 +1,74 @@ +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# rv32 baremetal cross-toolchain. Uses the multilib-aware riscv64-unknown-elf +# gcc (one package, both XLENs); `-march=rv32...` + `-mabi=ilp32d` selects the +# 32-bit picolibc + libstdc++ variant. ELF runs under qemu-system-riscv32 +# -machine virt with semihosting. 
+ +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_SYSTEM_PROCESSOR riscv32) + +set(CMAKE_C_COMPILER + "riscv64-unknown-elf-gcc" + CACHE FILEPATH "" +) +set(CMAKE_CXX_COMPILER + "riscv64-unknown-elf-g++" + CACHE FILEPATH "" +) +set(CMAKE_ASM_COMPILER + "riscv64-unknown-elf-gcc" + CACHE FILEPATH "" +) +set(CMAKE_AR + "riscv64-unknown-elf-ar" + CACHE FILEPATH "" +) +set(CMAKE_RANLIB + "riscv64-unknown-elf-ranlib" + CACHE FILEPATH "" +) +set(CMAKE_STRIP + "riscv64-unknown-elf-strip" + CACHE FILEPATH "" +) + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") +# try_compile() can't link without crt0/specs; archive-only sidesteps that. +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) + +# Baseline rv32imafdc / ilp32d — the rv32gc-equivalent multilib Ubuntu's +# picolibc + libstdc++ ship. (Unlike rv64, the full rv32gc multilib *is* +# packaged, so we don't have to drop M / C here.) -mcmodel=medany because medlow +# can't reach our 0x80000000 base. picolibc.specs must be on the compile line +# too so libstdc++ headers find picolibc's C headers via the spec's sysroot. +add_compile_options( + --specs=picolibc.specs + -march=rv32imafdc + -mabi=ilp32d + -mcmodel=medany + -fdata-sections + -ffunction-sections + "$<$:-fno-rtti;-fno-exceptions;-fno-unwind-tables>" +) +# -nostdlib++ drops g++'s implicit libstdc++.a (medlow-built, won't relocate). +# -nostartfiles drops picolibc's crt0 in favour of our start.S. 
+add_link_options( + --specs=picolibc.specs + -march=rv32imafdc + -mabi=ilp32d + -mcmodel=medany + -nostdlib++ + -nostartfiles + "LINKER:--gc-sections" +) diff --git a/examples/riscv/riscv64-unknown-elf-toolchain.cmake b/examples/riscv/riscv64-unknown-elf-toolchain.cmake new file mode 100644 index 00000000000..a4533675f89 --- /dev/null +++ b/examples/riscv/riscv64-unknown-elf-toolchain.cmake @@ -0,0 +1,77 @@ +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# rv64 baremetal cross-toolchain (Ubuntu 26.04+ packages: +# gcc-riscv64-unknown-elf, picolibc-riscv64-unknown-elf, +# libstdc++-riscv64-unknown-elf-picolibc). The resulting ELF runs under +# qemu-system-riscv64 -machine virt with semihosting. + +set(CMAKE_SYSTEM_NAME Generic) +set(CMAKE_SYSTEM_PROCESSOR riscv64) + +set(CMAKE_C_COMPILER + "riscv64-unknown-elf-gcc" + CACHE FILEPATH "" +) +set(CMAKE_CXX_COMPILER + "riscv64-unknown-elf-g++" + CACHE FILEPATH "" +) +set(CMAKE_ASM_COMPILER + "riscv64-unknown-elf-gcc" + CACHE FILEPATH "" +) +set(CMAKE_AR + "riscv64-unknown-elf-ar" + CACHE FILEPATH "" +) +set(CMAKE_RANLIB + "riscv64-unknown-elf-ranlib" + CACHE FILEPATH "" +) +set(CMAKE_STRIP + "riscv64-unknown-elf-strip" + CACHE FILEPATH "" +) + +set(CMAKE_EXECUTABLE_SUFFIX ".elf") +# try_compile() can't link without crt0/specs; archive-only sidesteps that. +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) +set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) +set(CMAKE_FIND_ROOT_PATH_MODE_PACKAGE ONLY) + +set(CMAKE_C_STANDARD 11) +set(CMAKE_CXX_STANDARD 17) + +# Picked baseline: rv64iafd / lp64d. Ubuntu's picolibc + libstdc++ packages +# don't ship the rv64gc (= rv64imafdc) multilib, so this drops M (integer mul) +# and C (compressed) but keeps double-float. 
-mcmodel=medany because medlow's +# signed-32-bit-around-0 reach can't address our 0x80000000 base. +# --specs=picolibc.specs has to appear at *compile* time too: libstdc++'s +# // need picolibc's C headers via the spec's +# sysroot. +add_compile_options( + --specs=picolibc.specs + -march=rv64iafd + -mabi=lp64d + -mcmodel=medany + -fdata-sections + -ffunction-sections + "$<$:-fno-rtti;-fno-exceptions;-fno-unwind-tables>" +) +# -nostdlib++ drops g++'s implicit libstdc++.a (medlow-built, won't relocate at +# 0x80000000); we only use its templates, no runtime calls. -nostartfiles drops +# picolibc's crt0 in favour of our start.S. +add_link_options( + --specs=picolibc.specs + -march=rv64iafd + -mabi=lp64d + -mcmodel=medany + -nostdlib++ + -nostartfiles + "LINKER:--gc-sections" +) diff --git a/examples/riscv/run.sh b/examples/riscv/run.sh index 2c207816bfc..e44f23add86 100755 --- a/examples/riscv/run.sh +++ b/examples/riscv/run.sh @@ -4,42 +4,52 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -# RISC-V Phase 1 smoke test driver (pytorch/executorch#18991): -# 1. Export a tiny model to a BundledProgram (.bpte) on the x86_64 host. -# 2. Cross-compile executor_runner for riscv64 Linux glibc. -# 3. Invoke the runner under qemu-user-static and grep its stdout for the -# Test_result: PASS marker emitted by the bundled-IO comparison path. +# RISC-V smoke test driver: +# 1. Export a small model to a BundledProgram (.bpte) on the host. +# 2. Cross-compile a riscv32/64 runner (linux glibc or baremetal). +# 3. Invoke under qemu and grep stdout for the Test_result: PASS marker. 
-set -eu +set -euo pipefail script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) et_root_dir=$(realpath "${script_dir}/../..") build_only=false -build_dir="${et_root_dir}/cmake-out-riscv" -output_dir="${et_root_dir}/riscv_test" -qemu="qemu-riscv64-static" -qemu_timeout="600" +build_dir= +qemu_timeout="1800" model="add" -xnnpack=false +backend="portable" +os="linux" +arch="rv64" +qemu_cpu_ext="" quantize=false debug_xnnpack=false verbose_xnnpack=false +qemu_override="" usage() { cat < Which model to export and run (default: ${model}) - --xnnpack Enable the XNNPACK backend (AOT partitioner + runtime) --quantize Produce an 8-bit quantized model - --verbose-xnnpack Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch at runtime + --backend= AOT backend (default: ${backend}): + - 'portable': portable kernels only + - 'xnnpack': XNNPACK delegate (linux only) + --os= Target OS (default: ${os}): + - 'linux': glibc, qemu-user + - 'baremetal': no OS, qemu-system + semihosting + --arch= Target arch (default: ${arch}): + - 'rv64': riscv64 + - 'rv32': riscv32 + --qemu-cpu-ext= QEMU -cpu extensions appended after the arch base + (e.g. 'v=true,vlen=128'); no rv32/rv64 prefix. 
+ --verbose-xnnpack Build XNNPACK with XNN_LOG_LEVEL=4 to log microkernel dispatch --debug-xnnpack Enable XNNPACK partitioner DEBUG logging and dump the lowered graph --build_only Only export and cross-compile; do not invoke QEMU - --build_dir= CMake build directory (default: ${build_dir}) - --output_dir= Directory for the exported .bpte (default: ${output_dir}) - --qemu= qemu-user binary (default: ${qemu}) - --timeout= Maximum QEMU runtime; matches run_fvp.sh --timelimit (default: ${qemu_timeout}) + --build-dir= Build/output directory for this configuration (required) + --qemu= Override qemu binary + --timeout= Maximum QEMU runtime (default: ${qemu_timeout}) -h, --help Show this help EOF } @@ -47,51 +57,125 @@ EOF for arg in "$@"; do case $arg in --model=*) model="${arg#*=}" ;; - --xnnpack) xnnpack=true ;; --quantize) quantize=true ;; + --backend=*) backend="${arg#*=}" ;; + --os=*) os="${arg#*=}" ;; + --arch=*) arch="${arg#*=}" ;; + --qemu-cpu-ext=*) qemu_cpu_ext="${arg#*=}" ;; --debug-xnnpack) debug_xnnpack=true ;; --verbose-xnnpack) verbose_xnnpack=true ;; --build_only) build_only=true ;; - --build_dir=*) build_dir="${arg#*=}" ;; - --output_dir=*) output_dir="${arg#*=}" ;; - --qemu=*) qemu="${arg#*=}" ;; + --build-dir=*) build_dir="${arg#*=}" ;; + --qemu=*) qemu_override="${arg#*=}" ;; --timeout=*) qemu_timeout="${arg#*=}" ;; -h|--help) usage; exit 0 ;; *) echo "Unknown option: $arg" >&2; usage; exit 1 ;; esac done -mkdir -p "${output_dir}" -bpte_path="${output_dir}/${model}_riscv.bpte" +case "${backend}" in + portable|xnnpack) ;; + *) echo "Unknown backend: ${backend}" >&2; usage; exit 1 ;; +esac +case "${os}" in + linux|baremetal) ;; + *) echo "Unknown os: ${os}" >&2; usage; exit 1 ;; +esac +case "${arch}" in + rv32|rv64) ;; + *) echo "Unknown arch: ${arch}" >&2; usage; exit 1 ;; +esac -echo "[run.sh] Step 1/3: AOT export on host" -aot_extra_args=() -if ${xnnpack}; then - aot_extra_args+=(--xnnpack) +# xnnpack needs pthreads + dynamic loading: baremetal has 
neither, and the +# Ubuntu xnnpack microkernels don't ship an rv32 build. +if [[ "${backend}" == "xnnpack" && "${os}" == "baremetal" ]]; then + echo "[run.sh] --backend=xnnpack requires --os=linux" >&2 + exit 1 +fi +if [[ "${backend}" == "xnnpack" && "${arch}" == "rv32" ]]; then + echo "[run.sh] --backend=xnnpack requires --arch=rv64" >&2 + exit 1 +fi +# Ubuntu doesn't package a riscv32-linux-gnu cross (riscv64-linux-gnu has no +# rv32 multilib either), so rv32 linux is blocked on a custom toolchain build. +if [[ "${arch}" == "rv32" && "${os}" == "linux" ]]; then + echo "[run.sh] --arch=rv32 --os=linux not supported: no riscv32-linux-gnu toolchain on Ubuntu" >&2 + exit 1 +fi + +if ${debug_xnnpack} && [[ "${backend}" != "xnnpack" ]]; then + echo "[run.sh] --debug-xnnpack requires --backend=xnnpack" >&2 + exit 1 fi +if ${verbose_xnnpack} && [[ "${backend}" != "xnnpack" ]]; then + echo "[run.sh] --verbose-xnnpack requires --backend=xnnpack" >&2 + exit 1 +fi + +if [[ -z "${build_dir}" ]]; then + echo "[run.sh] --build-dir is required" >&2; usage; exit 1 +fi +mkdir -p "${build_dir}" + +bpte_path="${build_dir}/model.bpte" + +echo "[run.sh] Step 1/3: AOT export on host (backend=${backend} os=${os} arch=${arch})" +aot_extra_args=() if ${quantize}; then aot_extra_args+=(--quantize) fi if ${debug_xnnpack}; then aot_extra_args+=(--debug-xnnpack) fi -python "${script_dir}/aot_riscv.py" --model "${model}" "${aot_extra_args[@]}" --output "${bpte_path}" +python "${script_dir}/aot_riscv.py" --model "${model}" --backend "${backend}" --os "${os}" "${aot_extra_args[@]}" --output "${bpte_path}" -echo "[run.sh] Step 2/3: cross-compile executor_runner for riscv64-linux" +echo "[run.sh] Step 2/3: cross-compile executor_runner for ${arch}-${os}" cmake_extra_args=() -if ${xnnpack}; then +if [[ "${backend}" == "xnnpack" ]]; then cmake_extra_args+=(-DEXECUTORCH_BUILD_XNNPACK=ON) fi if ${verbose_xnnpack}; then cmake_extra_args+=(-DEXECUTORCH_XNNPACK_LOG_LEVEL=4 
-DEXECUTORCH_BUILD_RISCV_ETDUMP=ON) fi -cmake -S "${et_root_dir}" -B "${build_dir}" \ - --preset riscv64-linux \ - "${cmake_extra_args[@]}" \ - -DCMAKE_BUILD_TYPE=Release -cmake --build "${build_dir}" -j"$(nproc)" --target executor_runner -runner="${build_dir}/executor_runner" +# Map our short arch (rv32/rv64) to the canonical riscv32/riscv64 prefix used +# by the cross toolchain and qemu binary names. +case "${arch}" in + rv32) arch_long="riscv32" ;; + rv64) arch_long="riscv64" ;; +esac + +if [[ "${os}" == "linux" ]]; then + build_target="executor_runner" + qemu_default="qemu-${arch_long}-static" + cmake -S "${et_root_dir}" -B "${build_dir}" --fresh \ + --preset "${arch_long}-linux" \ + "${cmake_extra_args[@]}" \ + -DCMAKE_BUILD_TYPE=Release + cmake --build "${build_dir}" -j"$(nproc)" --target "${build_target}" + runner="${build_dir}/${build_target}" + +elif [[ "${os}" == "baremetal" ]]; then + build_target="executor_runner_baremetal" + qemu_default="qemu-system-${arch_long}" + # Standalone build (mirrors examples/arm/executor_runner/standalone). + # Every path argument is quoted so a checkout path containing spaces is + # passed to cmake as a single word. + cmake -S "${et_root_dir}/examples/riscv/baremetal" -B "${build_dir}" --fresh \ + -DCMAKE_TOOLCHAIN_FILE="${et_root_dir}/examples/riscv/${arch_long}-unknown-elf-toolchain.cmake" \ + -DEXECUTORCH_BUILD_PRESET_FILE="${et_root_dir}/tools/cmake/preset/riscv_baremetal.cmake" \ + -DEXECUTORCH_ROOT="${et_root_dir}" \ + -DRISCV_BAREMETAL_PTE="${bpte_path}" \ + "${cmake_extra_args[@]}" \ + -DCMAKE_BUILD_TYPE=Release + cmake --build "${build_dir}" -j"$(nproc)" --target "${build_target}" + runner="${build_dir}/${build_target}.elf" + +else + echo "Unknown os: ${os}" >&2 + usage + exit 1 +fi + +qemu="${qemu_override:-${qemu_default}}" [[ -x "${runner}" ]] || { echo "[run.sh] runner not found at ${runner}" >&2; exit 1; } if file "${runner}" | grep -q "RISC-V"; then @@ -113,45 +197,75 @@ hash "${qemu}" 2>/dev/null || { exit 1 } -# QEMU_LD_PREFIX points qemu-user at the riscv64 sysroot so the dynamic -# linker (ld-linux-riscv64-lp64d.so.1)
referenced in the ELF resolves. -export QEMU_LD_PREFIX="${QEMU_LD_PREFIX:-/usr/riscv64-linux-gnu}" +log_file="${build_dir}/run.log" +rm -f "${log_file}" -if [[ -n "${QEMU_CPU+x}" ]]; then - echo "[run.sh] QEMU_CPU=${QEMU_CPU}" +# Compose the QEMU -cpu value once: ${arch} alone, or ${arch},${ext} when an +# extension list was supplied. qemu-user reads $QEMU_CPU; qemu-system takes +# -cpu on the command line. +qemu_cpu="${arch}" +if [[ -n "${qemu_cpu_ext}" ]]; then + qemu_cpu="${arch},${qemu_cpu_ext}" fi +echo "[run.sh] qemu -cpu = ${qemu_cpu}" -runner_extra_args=() -if ${quantize}; then - runner_extra_args+=(--bundleio_rtol=0.1 --bundleio_atol=0.25) -fi -etdump_path="" -if ${verbose_xnnpack}; then - etdump_path="${output_dir}/${model}_riscv.etdump" - rm -f "${etdump_path}" - runner_extra_args+=(--etdump_path="${etdump_path}") -fi +if [[ "${os}" == "linux" ]]; then + # QEMU_LD_PREFIX points qemu-user at the cross sysroot so the dynamic + # linker (ld-linux-riscv*) referenced in the ELF resolves. + if [[ "${arch}" == "rv64" ]]; then + export QEMU_LD_PREFIX="${QEMU_LD_PREFIX:-/usr/riscv64-linux-gnu}" + else + export QEMU_LD_PREFIX="${QEMU_LD_PREFIX:-/usr/riscv32-linux-gnu}" + fi + export QEMU_CPU="${qemu_cpu}" -# etdump_summary.py reads the XNN_LOG_LEVEL=4 registrations. 
-log_file="${output_dir}/${model}_riscv.run.log" -rm -f "${log_file}" + runner_extra_args=() + if ${quantize}; then + runner_extra_args+=(--bundleio_rtol=0.1 --bundleio_atol=0.25) + fi + etdump_path="" + if ${verbose_xnnpack}; then + etdump_path="${build_dir}/run.etdump" + rm -f "${etdump_path}" + runner_extra_args+=(--etdump_path="${etdump_path}") + fi -set +e -timeout --signal=KILL "${qemu_timeout}" "${qemu}" "${runner}" \ - --model_path="${bpte_path}" \ - "${runner_extra_args[@]}" \ - 2>&1 | tee "${log_file}" -qemu_status=${PIPESTATUS[0]} -set -e + set +e + timeout --signal=KILL "${qemu_timeout}" "${qemu}" "${runner}" \ + --model_path="${bpte_path}" \ + "${runner_extra_args[@]}" \ + |& tee "${log_file}" + qemu_status=${PIPESTATUS[0]} + set -e -echo "[run.sh] qemu exit status: ${qemu_status}" + if [[ -n "${etdump_path}" && -f "${etdump_path}" ]]; then + python "${script_dir}/etdump_summary.py" "${etdump_path}" \ + --run-log "${log_file}" \ + --json "${etdump_path}.json" || true + fi + +elif [[ "${os}" == "baremetal" ]]; then + # qemu-system -machine virt boots at 0x80000000; -bios none skips OpenSBI; + # semihosting target=native routes SYS_WRITE0/SYS_EXIT to host stdio. 
+ # For deeper debugging, add: -accel tcg,one-insn-per-tb=on -d in_asm,nochain + # -D + set +e + timeout --signal=KILL "${qemu_timeout}" "${qemu}" \ + -machine virt -cpu "${qemu_cpu}" -m 512M -nographic -bios none \ + -semihosting-config enable=on,target=native \ + -kernel "${runner}" \ + |& tee "${log_file}" + qemu_status=${PIPESTATUS[0]} + set -e -if [[ -n "${etdump_path}" && -f "${etdump_path}" ]]; then - python "${script_dir}/etdump_summary.py" "${etdump_path}" \ - --run-log "${log_file}" \ - --json "${etdump_path}.json" || true +else + echo "Unknown os: ${os}" >&2 + usage + exit 1 fi +echo "[run.sh] qemu exit status: ${qemu_status}" + if grep -q "Test_result: PASS" "${log_file}"; then echo "[run.sh] Bundled I/O check PASSED" exit 0 diff --git a/examples/riscv/setup-baremetal.sh b/examples/riscv/setup-baremetal.sh new file mode 100755 index 00000000000..f94a11388a8 --- /dev/null +++ b/examples/riscv/setup-baremetal.sh @@ -0,0 +1,49 @@ +#!/usr/bin/env bash +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Host tooling for the RISC-V smoke tests. Targets Ubuntu 26.04: that's where +# libstdc++-riscv64-unknown-elf-picolibc was first packaged, and the baremetal +# build chain needs C++ stdlib headers paired with picolibc. + +set -euo pipefail + +script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) + +if ! 
command -v apt-get >/dev/null 2>&1; then + echo "[$(basename "$0")] this setup script targets Debian/Ubuntu (apt-get not found)" >&2 + exit 1 +fi + +SUDO="" +if [[ $EUID -ne 0 ]]; then + SUDO="sudo" +fi + +${SUDO} apt-get update +${SUDO} apt-get install -y --no-install-recommends \ + build-essential \ + gcc-riscv64-linux-gnu \ + g++-riscv64-linux-gnu \ + binutils-riscv64-linux-gnu \ + libc6-riscv64-cross \ + libc6-dev-riscv64-cross \ + gcc-riscv64-unknown-elf \ + picolibc-riscv64-unknown-elf \ + libstdc++-riscv64-unknown-elf-picolibc \ + cmake \ + file \ + ca-certificates \ + qemu-user \ + qemu-system-riscv \ + libglib2.0-0t64 \ + libxcb1 \ + libgl1 + +riscv64-linux-gnu-gcc --version | head -n1 +qemu-riscv64 --version | head -n1 + +# Some python packages also need to be installed +pip install -r "${script_dir}/requirements.txt" diff --git a/examples/riscv/setup.sh b/examples/riscv/setup-linux.sh similarity index 90% rename from examples/riscv/setup.sh rename to examples/riscv/setup-linux.sh index 48d5ed27642..03206d9305c 100755 --- a/examples/riscv/setup.sh +++ b/examples/riscv/setup-linux.sh @@ -8,7 +8,7 @@ # - gcc/g++/binutils for riscv64-linux-gnu (cross-compiler + sysroot) # - qemu-user-static (qemu-riscv64 user-mode emulator) -set -eu +set -euo pipefail script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) @@ -22,6 +22,13 @@ if [[ $EUID -ne 0 ]]; then SUDO="sudo" fi +source /etc/os-release + +# Pin gcc-14 on 24.04 only. GCC_VERSION must stay *unset* otherwise: the guard +# further down tests ${GCC_VERSION+x} (set-ness, not emptiness), so assigning "" +# would trigger it on every release. A caller-provided GCC_VERSION is kept. +if [[ -z "${GCC_VERSION+x}" && "${VERSION_ID:-}" == "24.04" ]]; then + GCC_VERSION="14" +fi + ${SUDO} apt-get update ${SUDO} apt-get install -y --no-install-recommends \ build-essential \ @@ -44,7 +51,7 @@ if [[ -n "${GCC_VERSION+x}" ]]; then fi riscv64-linux-gnu-gcc --version | head -n1 -qemu-riscv64-static --version | head -n1 +qemu-riscv64 --version | head -n1 # Some python packages also need to be installed pip install -r "${script_dir}/requirements.txt" diff --git a/examples/riscv/test-matrix.sh b/examples/riscv/test-matrix.sh new file mode 100644 index
00000000000..93c09d1976d --- /dev/null +++ b/examples/riscv/test-matrix.sh @@ -0,0 +1,250 @@ +#!/usr/bin/env bash +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. +# +# Local mirror of riscv64.yml's matrix using two docker containers: +# +# - executorch-riscv-linux (ubuntu:24.04 + gcc-14). +# - executorch-riscv-baremetal (ubuntu:26.04 + gcc-15). +# 26.04 is the only release shipping libstdc++-riscv64-unknown-elf-picolibc. +# +# Usage: +# examples/riscv/test-matrix.sh # full sweep +# examples/riscv/test-matrix.sh --model=mv2 # one model, all configs +# examples/riscv/test-matrix.sh --os=baremetal # one OS +# examples/riscv/test-matrix.sh --quantize-only # skip the no-q half +# examples/riscv/test-matrix.sh --setup-only # bootstrap containers, don't run +# +# Re-runs are cheap when the per-cell build dirs survive (set --keep-build). + +set -euo pipefail + +script_dir=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) +et_root_dir=$(realpath "${script_dir}/../..") + +model_filter="" +os_filter="" +arch_filter="" +variant_filter="" +backend_filter="" +quantize_mode="both" # both | only | none +setup_only=false +keep_build=false + +usage() { + cat < Only run cells for this model + --os= + --arch= + --backend= + --variant= + --quantize-only Skip the non-quantized cells + --no-quantize Skip the quantized cells + --setup-only Make sure both containers are ready, then exit + --keep-build Reuse riscv_test/ dirs instead of starting fresh + -h, --help +EOF +} + +for arg in "$@"; do + case $arg in + --model=*) model_filter="${arg#*=}" ;; + --os=*) os_filter="${arg#*=}" ;; + --arch=*) arch_filter="${arg#*=}" ;; + --backend=*) backend_filter="${arg#*=}" ;; + --variant=*) variant_filter="${arg#*=}" ;; + --quantize-only) quantize_mode="only" ;; + --no-quantize) quantize_mode="none" ;; + --setup-only) setup_only=true ;; + --keep-build) 
keep_build=true ;; + -h|--help) usage; exit 0 ;; + *) echo "Unknown: $arg" >&2; usage; exit 1 ;; + esac +done + +# Container names + image tags match what the CI workflow consumes. +LINUX_CTR=executorch-riscv-linux +BAREMETAL_CTR=executorch-riscv-baremetal + +# `add`/`mv2`/`resnet18` are the only models with XNNPACK quantization recipes +# in MODEL_NAME_TO_OPTIONS — others raise at AOT time when --quantize is set. +# NOTE(review): `add` is named above but is not in QUANTIZED_MODELS, so it is +# never swept with --quantize — confirm whether that omission is intended. +QUANTIZED_MODELS="mv2 resnet18" +ALL_MODELS="add mv2 resnet18 mobilebert llama2 yolo26" +ALL_BACKENDS="portable xnnpack" + +# qemu-cpu-ext sweeps; keep parity with the JSON arrays in riscv64.yml. +SCALAR_EXT="zba=true,zbb=true,zbs=true,v=false" +RVV_EXT="zba=true,zbb=true,zbs=true,v=true,vlen=128,vext_spec=v1.0" + +# Check if a cell combination should be excluded (matching riscv64.yml excludes) +# Args: os arch backend variant model quantize, where quantize is the literal +# string "true" or "false". +# Returns 0 (shell success) when the cell must be skipped, 1 when it should run. +should_exclude() { + local os=$1 arch=$2 backend=$3 variant=$4 model=$5 quantize=$6 + + # Disable quantization testing with Portable Kernels + if [[ "${backend}" == "portable" && "${quantize}" == "true" ]]; then + return 0 + fi + # XNNPACK needs pthreads + dynamic loading (no baremetal) + if [[ "${backend}" == "xnnpack" && "${os}" == "baremetal" ]]; then + return 0 + fi + # XNNPACK needs RVV + if [[ "${backend}" == "xnnpack" && "${variant}" == "scalar" ]]; then + return 0 + fi + # No quantization recipe for Yolo26 + if [[ "${model}" == "yolo26" && "${quantize}" == "true" ]]; then + return 0 + fi + # No riscv32-linux-gnu cross is packaged on Ubuntu + if [[ "${os}" == "linux" && "${arch}" == "rv32" ]]; then + return 0 + fi + + return 1 +} + +# ---- container bootstrap (idempotent) ------------------------------------- + +ensure_linux() { + if !
docker ps -a --format '{{.Names}}' | grep -qx "${LINUX_CTR}"; then + echo "[matrix] starting ${LINUX_CTR} (ubuntu:24.04)" + docker run -d --name "${LINUX_CTR}" \ + -e DEBIAN_FRONTEND=noninteractive \ + -v "${et_root_dir}":/executorch -w /executorch \ + ubuntu:24.04 sleep infinity >/dev/null + fi + docker start "${LINUX_CTR}" >/dev/null + if ! docker exec "${LINUX_CTR}" test -d /executorch/.venv-docker-linux; then + echo "[matrix] bootstrapping ${LINUX_CTR} (this takes a few minutes)" + docker exec "${LINUX_CTR}" bash -eu -c ' + set -e + apt-get update -qq && apt-get install -y -qq --no-install-recommends \ + python3 python3-pip ca-certificates sudo + python3 -m pip install --break-system-packages --quiet uv + uv python install 3.10 + cd /executorch + uv venv --python 3.10 --seed .venv-docker-linux + ' + fi + docker exec "${LINUX_CTR}" bash -eu -c ' + set -e + cd /executorch + source .venv-docker-linux/bin/activate + pip install --upgrade pip + pip install executorch + bash examples/riscv/setup-linux.sh + ' +} + +ensure_baremetal() { + if ! docker ps -a --format '{{.Names}}' | grep -qx "${BAREMETAL_CTR}"; then + echo "[matrix] starting ${BAREMETAL_CTR} (ubuntu:26.04)" + docker run -d --name "${BAREMETAL_CTR}" \ + -e DEBIAN_FRONTEND=noninteractive \ + -v "${et_root_dir}":/executorch -w /executorch \ + ubuntu:26.04 sleep infinity >/dev/null + fi + docker start "${BAREMETAL_CTR}" >/dev/null + if ! 
docker exec "${BAREMETAL_CTR}" test -d /executorch/.venv-docker-baremetal; then + echo "[matrix] bootstrapping ${BAREMETAL_CTR} (this takes a few minutes)" + docker exec "${BAREMETAL_CTR}" bash -eu -c ' + set -e + apt-get update -qq && apt-get install -y -qq --no-install-recommends \ + python3 python3-pip ca-certificates sudo + python3 -m pip install --break-system-packages --quiet uv + uv python install 3.10 + cd /executorch + uv venv --python 3.10 --seed .venv-docker-baremetal + ' + fi + docker exec "${BAREMETAL_CTR}" bash -eu -c ' + set -e + cd /executorch + source .venv-docker-baremetal/bin/activate + pip install --upgrade pip + pip install executorch + bash examples/riscv/setup-baremetal.sh + ' +} + +ensure_linux +ensure_baremetal +if ${setup_only}; then exit 0; fi + +# ---- one cell -------------------------------------------------------------- + +# Args: ctr venv os arch backend variant ext model quantize_flag +run_cell() { + local ctr=$1 venv=$2 os=$3 arch=$4 backend=$5 variant=$6 ext=$7 model=$8 q=$9 + local cell="${model}${q:++q}-${backend}/${os}-${arch}" + local model_q="${model}${q:+-q}" + local variant_slug="${ext//,/_}"; variant_slug="${variant_slug//=/_}"; variant_slug="${variant_slug:-base}" + local build_dir="/executorch/riscv_test/${model_q}/${backend}/${os}-${arch}-${variant_slug}" + if ! 
${keep_build}; then + docker exec "${ctr}" rm -rf "${build_dir}" + fi + if docker exec "${ctr}" bash -lc " + cd /executorch && source ${venv}/bin/activate && + timeout 1800 bash -eu examples/riscv/run.sh \ + --model=${model} ${q} --backend=${backend} \ + --os=${os} --arch=${arch} \ + --qemu-cpu-ext='${ext}' \ + --build-dir=${build_dir} --timeout=900 + "; then + echo " PASS ${cell}" + return 0 + else + echo " FAIL ${cell}" + return 1 + fi +} + +# ---- iterate --------------------------------------------------------------- + +passed=0; total=0 +for os_arch in "linux:rv64" "baremetal:rv64" "baremetal:rv32"; do + os="${os_arch%%:*}"; arch="${os_arch##*:}" + if [[ -n "${os_filter}" && "${os}" != "${os_filter}" ]]; then continue; fi + if [[ -n "${arch_filter}" && "${arch}" != "${arch_filter}" ]]; then continue; fi + if [[ "${os}" == "linux" ]]; then ctr="${LINUX_CTR}"; venv=/executorch/.venv-docker-linux; + else ctr="${BAREMETAL_CTR}"; venv=/executorch/.venv-docker-baremetal; fi + + for variant_lbl in "scalar:${SCALAR_EXT}" "rvv:${RVV_EXT}"; do + variant="${variant_lbl%%:*}"; ext="${variant_lbl#*:}" + if [[ -n "${variant_filter}" && "${variant}" != "${variant_filter}" ]]; then continue; fi + + for backend in ${ALL_BACKENDS}; do + if [[ -n "${backend_filter}" && "${backend}" != "${backend_filter}" ]]; then continue; fi + + # non-quantized models + if [[ "${quantize_mode}" != "only" ]]; then + for m in ${ALL_MODELS}; do + if [[ -n "${model_filter}" && "${m}" != "${model_filter}" ]]; then continue; fi + if should_exclude "${os}" "${arch}" "${backend}" "${variant}" "${m}" "false"; then continue; fi + total=$((total+1)) + run_cell "${ctr}" "${venv}" "${os}" "${arch}" "${backend}" "${variant}" "${ext}" "${m}" "" \ + && passed=$((passed+1)) || exit 1 + done + fi + # quantized — only the 3 models with XNNPACK recipes + if [[ "${quantize_mode}" != "none" ]]; then + for m in ${QUANTIZED_MODELS}; do + if [[ -n "${model_filter}" && "${m}" != "${model_filter}" ]]; then continue; fi + 
if should_exclude "${os}" "${arch}" "${backend}" "${variant}" "${m}" "true"; then continue; fi + total=$((total+1)) + run_cell "${ctr}" "${venv}" "${os}" "${arch}" "${backend}" "${variant}" "${ext}" "${m}" "--quantize" \ + && passed=$((passed+1)) || exit 1 + done + fi + done + done +done + +echo "" +echo "===== ${passed}/${total} cells passed =====" +test "${passed}" -eq "${total}" diff --git a/tools/cmake/preset/riscv_baremetal.cmake b/tools/cmake/preset/riscv_baremetal.cmake new file mode 100644 index 00000000000..e70fc57ba57 --- /dev/null +++ b/tools/cmake/preset/riscv_baremetal.cmake @@ -0,0 +1,50 @@ +# Copyright 2026 The ExecuTorch Authors. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Baremetal builds consume the build tree directly; mirror arm_baremetal so +# install rules stay invokable but write back into the build dir. +define_overridable_option( + EXECUTORCH_BAREMETAL_SKIP_INSTALL + "Skip emitting install/export rules when building bare-metal artifacts" BOOL + ON +) + +if(EXECUTORCH_BAREMETAL_SKIP_INSTALL) + set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}") + unset(CMAKE_SKIP_INSTALL_RULES CACHE) + set(CMAKE_SKIP_INSTALL_RULES + OFF + CACHE + BOOL + "Retain install() rules so docs/scripts can keep calling --target install" + FORCE + ) +endif() + +set_overridable_option(EXECUTORCH_BUILD_EXECUTOR_RUNNER OFF) +set_overridable_option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER OFF) +set_overridable_option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR OFF) +set_overridable_option(EXECUTORCH_BUILD_EXTENSION_EVALUE_UTIL ON) +set_overridable_option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL ON) +set_overridable_option(EXECUTORCH_BUILD_KERNELS_QUANTIZED ON) +# BUNDLE_IO requires DEVTOOLS to provide the bundled_program lib. 
+set_overridable_option(EXECUTORCH_BUILD_DEVTOOLS ON) +set_overridable_option(EXECUTORCH_ENABLE_BUNDLE_IO ON) +set_overridable_option(EXECUTORCH_ENABLE_LOGGING ON) +# Freestanding target: no pthreadpool, no cpuinfo, no shared lib. +set_overridable_option(EXECUTORCH_BUILD_PTHREADPOOL OFF) +set_overridable_option(EXECUTORCH_BUILD_CPUINFO OFF) + +define_overridable_option( + EXECUTORCH_BUILD_RISCV_ETDUMP "Build etdump support for RISC-V" BOOL OFF +) + +if("${EXECUTORCH_BUILD_RISCV_ETDUMP}") + set(EXECUTORCH_BUILD_DEVTOOLS ON) + set(EXECUTORCH_ENABLE_EVENT_TRACER ON) + set(FLATCC_ALLOW_WERROR OFF) +else() + set(EXECUTORCH_ENABLE_EVENT_TRACER OFF) +endif() diff --git a/tools/cmake/preset/riscv64_linux.cmake b/tools/cmake/preset/riscv_linux.cmake similarity index 100% rename from tools/cmake/preset/riscv64_linux.cmake rename to tools/cmake/preset/riscv_linux.cmake From 0df077d96ae296e5e83c1a1fda82915bd639d15d Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Mon, 1 Jun 2026 21:39:05 +0200 Subject: [PATCH 097/103] Fix based on Claude's review --- .github/workflows/riscv64.yml | 2 +- examples/riscv/README.md | 4 ++-- examples/riscv/baremetal/CMakeLists.txt | 2 +- examples/riscv/run.sh | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml index d6109a47305..9331fc35508 100644 --- a/.github/workflows/riscv64.yml +++ b/.github/workflows/riscv64.yml @@ -12,7 +12,7 @@ on: - .github/workflows/riscv64.yml - .github/workflows/_test_riscv.yml - .ci/scripts/test_riscv_qemu.sh - - tools/cmake/preset/riscv64_*.cmake + - tools/cmake/preset/riscv_*.cmake - examples/riscv/** workflow_dispatch: schedule: diff --git a/examples/riscv/README.md b/examples/riscv/README.md index 2c250f75cd7..3ae8a151f24 100644 --- a/examples/riscv/README.md +++ b/examples/riscv/README.md @@ -20,7 +20,7 @@ examples/riscv/run.sh # export, cross-compile, run under qemu | `--quantize` | flag | off | XNNPACK quantizer (requires 
`--backend=xnnpack`) | | `--backend=` | `portable`, `xnnpack` | `portable` | xnnpack is linux-only | | `--os=` | `linux`, `baremetal` | `linux` | qemu-user vs qemu-system + semihosting | -| `--arch=` | `rv64` | `rv64` | (rv32 follow-up; no `riscv32-linux-gnu` cross is packaged on Ubuntu) | +| `--arch=` | `rv32`, `rv64` | `rv64` | valid - pairs are `linux-rv64`, `baremetal-rv32`, `baremetal-rv64` | | `--qemu-cpu-ext=` | e.g. `v=true,vlen=128` | empty | extensions appended after the arch base | ## Pipelines @@ -33,4 +33,4 @@ The baremetal runner embeds the `.bpte` directly in `.rodata` via the same `exam ## CI -`.github/workflows/riscv64.yml` is the entry point; it fans out into `_test_riscv.yml` over a `(model, backend, os, arch, quantize)` matrix and sweeps `qemu-cpu-ext` per backend. Runs on the `executorch-ubuntu-26.04-gcc15` docker image (needed for the `riscv64-unknown-elf` picolibc + libstdc++ packages - see [setup.sh](setup.sh)). +`.github/workflows/riscv64.yml` is the entry point; it fans out into `_test_riscv.yml` over a `(model, backend, os, arch, quantize)` matrix and sweeps `qemu-cpu-ext` per backend. Runs on the `executorch-ubuntu-26.04-gcc15` docker image (needed for the `riscv64-unknown-elf` picolibc + libstdc++ packages - see [setup-linux.sh](setup-linux.sh) or [setup-baremetal.sh](setup-baremetal.sh)). 
diff --git a/examples/riscv/baremetal/CMakeLists.txt b/examples/riscv/baremetal/CMakeLists.txt index b7765c4e3a1..b0208e41d2b 100644 --- a/examples/riscv/baremetal/CMakeLists.txt +++ b/examples/riscv/baremetal/CMakeLists.txt @@ -46,7 +46,7 @@ endif() include("${EXECUTORCH_ROOT}/tools/cmake/common/preset.cmake") if(NOT DEFINED EXECUTORCH_BUILD_PRESET_FILE) set(EXECUTORCH_BUILD_PRESET_FILE - "${EXECUTORCH_ROOT}/tools/cmake/preset/riscv64_baremetal.cmake" + "${EXECUTORCH_ROOT}/tools/cmake/preset/riscv_baremetal.cmake" CACHE PATH "Preset used when configuring the standalone baremetal runner" ) endif() diff --git a/examples/riscv/run.sh b/examples/riscv/run.sh index e44f23add86..0635bfedb4e 100755 --- a/examples/riscv/run.sh +++ b/examples/riscv/run.sh @@ -193,7 +193,7 @@ fi echo "[run.sh] Step 3/3: run under ${qemu}" hash "${qemu}" 2>/dev/null || { - echo "[run.sh] ERROR: ${qemu} not found on PATH; install with examples/riscv/setup.sh" >&2 + echo "[run.sh] ERROR: ${qemu} not found on PATH; install with examples/riscv/setup-${os}.sh" >&2 exit 1 } From cfd9b52cb319334b4dfb26f76bdbd463a50af0d5 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Mon, 1 Jun 2026 21:41:07 +0200 Subject: [PATCH 098/103] Fix qemu-riscv64-static live check --- examples/riscv/setup-linux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/riscv/setup-linux.sh b/examples/riscv/setup-linux.sh index 03206d9305c..bef4408ad56 100755 --- a/examples/riscv/setup-linux.sh +++ b/examples/riscv/setup-linux.sh @@ -51,7 +51,7 @@ if [[ -n "${GCC_VERSION+x}" ]]; then fi riscv64-linux-gnu-gcc --version | head -n1 -qemu-riscv64 --version | head -n1 +qemu-riscv64-static --version | head -n1 # Some python packages also need to be installed pip install -r "${script_dir}/requirements.txt" From 66edf4edf7134ac39ec0449662cb84e84551f24b Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 2 Jun 2026 01:10:23 +0200 Subject: [PATCH 099/103] Use GCC 14 for host compiler as well sentencepiece 
fails to compile on GCC 15 due to missing #include --- examples/riscv/setup-baremetal.sh | 20 ++++++++++++++++++-- examples/riscv/setup-linux.sh | 6 +++++- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/examples/riscv/setup-baremetal.sh b/examples/riscv/setup-baremetal.sh index f94a11388a8..f96e8c75032 100755 --- a/examples/riscv/setup-baremetal.sh +++ b/examples/riscv/setup-baremetal.sh @@ -22,11 +22,20 @@ if [[ $EUID -ne 0 ]]; then SUDO="sudo" fi +source /etc/os-release + +GCC_VERSION="" +if [[ "${VERSION_ID:-}" == "24.04" || "${VERSION_ID:-}" == "26.04" ]]; then + GCC_VERSION="14" +fi + ${SUDO} apt-get update ${SUDO} apt-get install -y --no-install-recommends \ build-essential \ - gcc-riscv64-linux-gnu \ - g++-riscv64-linux-gnu \ + gcc${GCC_VERSION:+-${GCC_VERSION}} \ + g++${GCC_VERSION:+-${GCC_VERSION}} \ + gcc${GCC_VERSION:+-${GCC_VERSION}}-riscv64-linux-gnu \ + g++${GCC_VERSION:+-${GCC_VERSION}}-riscv64-linux-gnu \ binutils-riscv64-linux-gnu \ libc6-riscv64-cross \ libc6-dev-riscv64-cross \ @@ -42,6 +51,13 @@ ${SUDO} apt-get install -y --no-install-recommends \ libxcb1 \ libgl1 +if [[ -n "${GCC_VERSION+x}" ]]; then + ${SUDO} update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc${GCC_VERSION:+-${GCC_VERSION}} 100 + ${SUDO} update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++${GCC_VERSION:+-${GCC_VERSION}} 100 + ${SUDO} update-alternatives --install /usr/bin/riscv64-linux-gnu-gcc riscv64-linux-gnu-gcc /usr/bin/riscv64-linux-gnu-gcc${GCC_VERSION:+-${GCC_VERSION}} 100 + ${SUDO} update-alternatives --install /usr/bin/riscv64-linux-gnu-g++ riscv64-linux-gnu-g++ /usr/bin/riscv64-linux-gnu-g++${GCC_VERSION:+-${GCC_VERSION}} 100 +fi + riscv64-linux-gnu-gcc --version | head -n1 qemu-riscv64 --version | head -n1 diff --git a/examples/riscv/setup-linux.sh b/examples/riscv/setup-linux.sh index bef4408ad56..912557e3bfb 100755 --- a/examples/riscv/setup-linux.sh +++ b/examples/riscv/setup-linux.sh @@ -25,13 +25,15 @@ fi source /etc/os-release 
GCC_VERSION="" -if [[ "${VERSION_ID:-}" == "24.04" ]]; then +if [[ "${VERSION_ID:-}" == "24.04" || "${VERSION_ID:-}" == "26.04" ]]; then GCC_VERSION="14" fi ${SUDO} apt-get update ${SUDO} apt-get install -y --no-install-recommends \ build-essential \ + gcc${GCC_VERSION:+-${GCC_VERSION}} \ + g++${GCC_VERSION:+-${GCC_VERSION}} \ gcc${GCC_VERSION:+-${GCC_VERSION}}-riscv64-linux-gnu \ g++${GCC_VERSION:+-${GCC_VERSION}}-riscv64-linux-gnu \ binutils-riscv64-linux-gnu \ @@ -46,6 +48,8 @@ ${SUDO} apt-get install -y --no-install-recommends \ libgl1 if [[ -n "${GCC_VERSION+x}" ]]; then + ${SUDO} update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc${GCC_VERSION:+-${GCC_VERSION}} 100 + ${SUDO} update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++${GCC_VERSION:+-${GCC_VERSION}} 100 ${SUDO} update-alternatives --install /usr/bin/riscv64-linux-gnu-gcc riscv64-linux-gnu-gcc /usr/bin/riscv64-linux-gnu-gcc${GCC_VERSION:+-${GCC_VERSION}} 100 ${SUDO} update-alternatives --install /usr/bin/riscv64-linux-gnu-g++ riscv64-linux-gnu-g++ /usr/bin/riscv64-linux-gnu-g++${GCC_VERSION:+-${GCC_VERSION}} 100 fi From ba2281ec6c65da12361a4ac8fa80a5bef091c8a5 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 2 Jun 2026 11:21:28 +0200 Subject: [PATCH 100/103] Fix unecessary change --- .ci/scripts/setup-linux.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.ci/scripts/setup-linux.sh b/.ci/scripts/setup-linux.sh index 275a93d797e..feb8a128b17 100755 --- a/.ci/scripts/setup-linux.sh +++ b/.ci/scripts/setup-linux.sh @@ -5,7 +5,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-set -eu +set -exu # shellcheck source=/dev/null source "$(dirname "${BASH_SOURCE[0]}")/utils.sh" From 89fdf663e10e3cc3b0051e4e78617712e9175139 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 2 Jun 2026 11:22:59 +0200 Subject: [PATCH 101/103] Add testing on RVV on Portable Backend --- .github/workflows/riscv64.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/riscv64.yml b/.github/workflows/riscv64.yml index 9331fc35508..f2010b86fe5 100644 --- a/.github/workflows/riscv64.yml +++ b/.github/workflows/riscv64.yml @@ -71,7 +71,10 @@ jobs: "v=true,vext_spec=v1.0,vlen=512" ]', '[ - "v=false" + "v=false", + "v=true,vext_spec=v1.0,vlen=128", + "v=true,vext_spec=v1.0,vlen=256", + "v=true,vext_spec=v1.0,vlen=512" ]' ) }} From 7dc53a1bf03d2c273db8948eb693e26fcfde1549 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 2 Jun 2026 11:39:29 +0200 Subject: [PATCH 102/103] Add rvv128, rvv256, and rvv512 testing in test-matrix.sh --- examples/riscv/test-matrix.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/examples/riscv/test-matrix.sh b/examples/riscv/test-matrix.sh index 93c09d1976d..084b2eea308 100644 --- a/examples/riscv/test-matrix.sh +++ b/examples/riscv/test-matrix.sh @@ -41,7 +41,7 @@ Options: --os= --arch= --backend= - --variant= + --variant= --quantize-only Skip the non-quantized cells --no-quantize Skip the quantized cells --setup-only Make sure both containers are ready, then exit @@ -77,8 +77,10 @@ ALL_MODELS="add mv2 resnet18 mobilebert llama2 yolo26" ALL_BACKENDS="portable xnnpack" # qemu-cpu-ext sweeps; keep parity with the JSON arrays in riscv64.yml. 
-SCALAR_EXT="zba=true,zbb=true,zbs=true,v=false" -RVV_EXT="zba=true,zbb=true,zbs=true,v=true,vlen=128,vext_spec=v1.0" +SCALAR_EXT="v=false" +RVV128_EXT="v=true,vext_spec=v1.0,vlen=128" +RVV256_EXT="v=true,vext_spec=v1.0,vlen=256" +RVV512_EXT="v=true,vext_spec=v1.0,vlen=512" # Check if a cell combination should be excluded (matching riscv64.yml excludes) should_exclude() { @@ -214,7 +216,7 @@ for os_arch in "linux:rv64" "baremetal:rv64" "baremetal:rv32"; do if [[ "${os}" == "linux" ]]; then ctr="${LINUX_CTR}"; venv=/executorch/.venv-docker-linux; else ctr="${BAREMETAL_CTR}"; venv=/executorch/.venv-docker-baremetal; fi - for variant_lbl in "scalar:${SCALAR_EXT}" "rvv:${RVV_EXT}"; do + for variant_lbl in "scalar:${SCALAR_EXT}" "rvv128:${RVV128_EXT}" "rvv256:${RVV256_EXT}" "rvv512:${RVV512_EXT}"; do variant="${variant_lbl%%:*}"; ext="${variant_lbl#*:}" if [[ -n "${variant_filter}" && "${variant}" != "${variant_filter}" ]]; then continue; fi From 4b616c0395be8583a3e681051bc4a61a55ddc043 Mon Sep 17 00:00:00 2001 From: Ludovic Henry Date: Tue, 2 Jun 2026 13:20:16 +0200 Subject: [PATCH 103/103] Run all models with quantization (except excluded) --- examples/riscv/test-matrix.sh | 85 +++++++++++++++-------------------- 1 file changed, 37 insertions(+), 48 deletions(-) diff --git a/examples/riscv/test-matrix.sh b/examples/riscv/test-matrix.sh index 084b2eea308..9ed8115de44 100644 --- a/examples/riscv/test-matrix.sh +++ b/examples/riscv/test-matrix.sh @@ -29,7 +29,7 @@ os_filter="" arch_filter="" variant_filter="" backend_filter="" -quantize_mode="both" # both | only | none +quantize_filter="" setup_only=false keep_build=false @@ -42,8 +42,7 @@ Options: --arch= --backend= --variant= - --quantize-only Skip the non-quantized cells - --no-quantize Skip the quantized cells + --quantize= --setup-only Make sure both containers are ready, then exit --keep-build Reuse riscv_test/ dirs instead of starting fresh -h, --help @@ -52,16 +51,15 @@ EOF for arg in "$@"; do case $arg in - 
--model=*) model_filter="${arg#*=}" ;; - --os=*) os_filter="${arg#*=}" ;; - --arch=*) arch_filter="${arg#*=}" ;; - --backend=*) backend_filter="${arg#*=}" ;; - --variant=*) variant_filter="${arg#*=}" ;; - --quantize-only) quantize_mode="only" ;; - --no-quantize) quantize_mode="none" ;; - --setup-only) setup_only=true ;; - --keep-build) keep_build=true ;; - -h|--help) usage; exit 0 ;; + --model=*) model_filter="${arg#*=}" ;; + --os=*) os_filter="${arg#*=}" ;; + --arch=*) arch_filter="${arg#*=}" ;; + --backend=*) backend_filter="${arg#*=}" ;; + --variant=*) variant_filter="${arg#*=}" ;; + --quantize=*) quantize_filter="${arg#*=}" ;; + --setup-only) setup_only=true ;; + --keep-build) keep_build=true ;; + -h|--help) usage; exit 0 ;; *) echo "Unknown: $arg" >&2; usage; exit 1 ;; esac done @@ -70,11 +68,8 @@ done LINUX_CTR=executorch-riscv-linux BAREMETAL_CTR=executorch-riscv-baremetal -# `add`/`mv2`/`resnet18` are the only models with XNNPACK quantization recipes -# in MODEL_NAME_TO_OPTIONS — others raise at AOT time when --quantize is set. -QUANTIZED_MODELS="mv2 resnet18" -ALL_MODELS="add mv2 resnet18 mobilebert llama2 yolo26" -ALL_BACKENDS="portable xnnpack" +MODELS="add mv2 resnet18 mobilebert llama2 yolo26" +BACKENDS="portable xnnpack" # qemu-cpu-ext sweeps; keep parity with the JSON arrays in riscv64.yml. 
SCALAR_EXT="v=false" @@ -209,42 +204,36 @@ run_cell() { # ---- iterate --------------------------------------------------------------- passed=0; total=0 +for m in ${MODELS}; do +for backend in ${BACKENDS}; do for os_arch in "linux:rv64" "baremetal:rv64" "baremetal:rv32"; do - os="${os_arch%%:*}"; arch="${os_arch##*:}" +for variant_lbl in "scalar:${SCALAR_EXT}" "rvv128:${RVV128_EXT}" "rvv256:${RVV256_EXT}" "rvv512:${RVV512_EXT}"; do + os="${os_arch%%:*}"; arch="${os_arch##*:}"; variant="${variant_lbl%%:*}"; ext="${variant_lbl#*:}" + + if [[ -n "${model_filter}" && "${m}" != "${model_filter}" ]]; then continue; fi + if [[ -n "${backend_filter}" && "${backend}" != "${backend_filter}" ]]; then continue; fi if [[ -n "${os_filter}" && "${os}" != "${os_filter}" ]]; then continue; fi if [[ -n "${arch_filter}" && "${arch}" != "${arch_filter}" ]]; then continue; fi + if [[ -n "${variant_filter}" && "${variant}" != "${variant_filter}" ]]; then continue; fi + if [[ "${os}" == "linux" ]]; then ctr="${LINUX_CTR}"; venv=/executorch/.venv-docker-linux; else ctr="${BAREMETAL_CTR}"; venv=/executorch/.venv-docker-baremetal; fi - for variant_lbl in "scalar:${SCALAR_EXT}" "rvv128:${RVV128_EXT}" "rvv256:${RVV256_EXT}" "rvv512:${RVV512_EXT}"; do - variant="${variant_lbl%%:*}"; ext="${variant_lbl#*:}" - if [[ -n "${variant_filter}" && "${variant}" != "${variant_filter}" ]]; then continue; fi - - for backend in ${ALL_BACKENDS}; do - if [[ -n "${backend_filter}" && "${backend}" != "${backend_filter}" ]]; then continue; fi - - # non-quantized models - if [[ "${quantize_mode}" != "only" ]]; then - for m in ${ALL_MODELS}; do - if [[ -n "${model_filter}" && "${m}" != "${model_filter}" ]]; then continue; fi - if should_exclude "${os}" "${arch}" "${backend}" "${variant}" "${m}" "false"; then continue; fi - total=$((total+1)) - run_cell "${ctr}" "${venv}" "${os}" "${arch}" "${backend}" "${variant}" "${ext}" "${m}" "" \ - && passed=$((passed+1)) || exit 1 - done - fi - # quantized — only the 3 
models with XNNPACK recipes - if [[ "${quantize_mode}" != "none" ]]; then - for m in ${QUANTIZED_MODELS}; do - if [[ -n "${model_filter}" && "${m}" != "${model_filter}" ]]; then continue; fi - if should_exclude "${os}" "${arch}" "${backend}" "${variant}" "${m}" "true"; then continue; fi - total=$((total+1)) - run_cell "${ctr}" "${venv}" "${os}" "${arch}" "${backend}" "${variant}" "${ext}" "${m}" "--quantize" \ - && passed=$((passed+1)) || exit 1 - done - fi - done - done + if [[ -z "${quantize_filter}" || "${quantize_filter}" = "no" ]]; then + if should_exclude "${os}" "${arch}" "${backend}" "${variant}" "${m}" "false"; then continue; fi + total=$((total+1)) + run_cell "${ctr}" "${venv}" "${os}" "${arch}" "${backend}" "${variant}" "${ext}" "${m}" "" \ + && passed=$((passed+1)) || exit 1 + fi + if [[ -z "${quantize_filter}" || "${quantize_filter}" = "yes" ]]; then + if should_exclude "${os}" "${arch}" "${backend}" "${variant}" "${m}" "true"; then continue; fi + total=$((total+1)) + run_cell "${ctr}" "${venv}" "${os}" "${arch}" "${backend}" "${variant}" "${ext}" "${m}" "--quantize" \ + && passed=$((passed+1)) || exit 1 + fi +done +done +done done echo ""