From 565b95f9913198b05ae8a0c761013e584cfd2dab Mon Sep 17 00:00:00 2001
From: RJ Ascani <rja@meta.com>
Date: Mon, 11 May 2026 09:41:58 -0700
Subject: [PATCH] Add ExportRecipe support for Arm targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces `ArmRecipeProvider` and `ArmRecipeType` so callers can use the
existing `ExportRecipe` abstraction to target Ethos-U, TOSA, and VGF
instead of going through `aot_arm_compiler.py`. Shape mirrors the
sibling XNNPACK / QNN providers; the provider auto-registers on import
of `backends/arm/recipes/`.

Eight recipes ship: Ethos-U55/U65/U85 INT8 (with `macs`,
`system_config`, `memory_mode`, `extra_flags`, `config_ini` kwargs),
TOSA FP / INT8 / A16W8, and VGF FP / INT8. Cortex-M is not yet
supported via recipes — its no-partitioner flow needs a different
pipeline shape and is left for a follow-up.

Faithfulness to the CLI: INT8 and A16W8 paths wire
`ReplaceQuantNodesPass` through `LoweringRecipe.edge_manager_transform_passes`
and override `pipeline_stages` to insert `EDGE_PROGRAM_MANAGER_TRANSFORM`
after `TO_EDGE_TRANSFORM_AND_LOWER`, matching
`aot_arm_compiler.py:200-201`. The pass is skipped for VGF and FP, also
matching the CLI gate. Ethos-U `extra_flags` are prepended with
`--verbose-operators --verbose-cycle-estimate` to mirror
`aot_arm_compiler.py:479-484`. Unknown kwargs raise `ValueError` (vs.
XNNPACK/QNN which warn) — intentional for a new provider so typos like
`mac=128` fail fast rather than silently producing a wrong-target
binary.

Enabling the post-partition hook required uncommenting the existing
TODO at `EdgeProgramManagerTransformStage.valid_predecessor_stages` to
also accept `TO_EDGE_TRANSFORM_AND_LOWER`. The stage's `run()` method
already handles a partitioned `EdgeProgramManager` correctly.

A pre-existing circular import between `tosa.backend` and
`ethosu.backend` surfaces when `executorch.backends.arm.vgf` is loaded
without `ethosu` already in `sys.modules`. The provider primes `ethosu`
before importing `vgf`, the same workaround `aot_arm_compiler.py` uses
implicitly through its module-level import order.

Tests live in `backends/arm/test/recipes/test_arm_recipes.py`:
- Registration suite runs anywhere (no Arm SDK deps).
- TOSA / VGF / Ethos-U construction suites skip cleanly if the
  corresponding SDK piece isn't installed.
- AOT round-trip suite exports `_AddModule` (TOSA FP) and
  `_ConvReluModule` (TOSA INT8) and asserts the right delegation
  shape — full delegation for FP; for INT8, ≥1 `DelegateCall` with
  every remaining kernel in the `cortex_m::` namespace (the
  `cortex_m::quantize_per_tensor` / `cortex_m::dequantize_per_tensor`
  boundary ops), which is how the test checks that
  `ReplaceQuantNodesPass` ran.

CI hookup adds a `test_pytest_recipes` matrix entry to
`unittest-arm-backend-with-no-deps` in pull.yml (Ethos-U tests skip via
the Vela guard) and to `test-arm-backend-ethos-u` in trunk.yml (full
SDK available; all tests run).

Authored with Claude Code.
---
 .github/workflows/pull.yml                    |   1 +
 .github/workflows/trunk.yml                   |   1 +
 backends/arm/recipes/__init__.py              |  15 +
 backends/arm/recipes/arm_recipe_provider.py   | 224 +++++++++++++
 backends/arm/recipes/arm_recipe_types.py      |  46 +++
 backends/arm/test/recipes/__init__.py         |   5 +
 backends/arm/test/recipes/test_arm_recipes.py | 297 ++++++++++++++++++
 backends/arm/test/test_arm_baremetal.sh       |  13 +
 export/stages.py                              |   2 +-
 9 files changed, 603 insertions(+), 1 deletion(-)
 create mode 100644 backends/arm/recipes/__init__.py
 create mode 100644 backends/arm/recipes/arm_recipe_provider.py
 create mode 100644 backends/arm/recipes/arm_recipe_types.py
 create mode 100644 backends/arm/test/recipes/__init__.py
 create mode 100644 backends/arm/test/recipes/test_arm_recipes.py

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 97633965652..891dab7f502 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -721,6 +721,7 @@ jobs:
           - test_arm_baremetal: test_pytest_ops_tosa
           - test_arm_baremetal: test_pytest_models_tosa
           - test_arm_baremetal: test_run_tosa
+          - test_arm_baremetal: test_pytest_recipes
       fail-fast: false
     with:
       runner: linux.2xlarge
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
index 670517f836b..62b944aeb48 100644
--- a/.github/workflows/trunk.yml
+++ b/.github/workflows/trunk.yml
@@ -355,6 +355,7 @@ jobs:
           - test_arm_baremetal: test_pytest_ops_ethos_u85
           - test_arm_baremetal: test_pytest_models_ethos_u85
           - test_arm_baremetal: test_run_ethos_u85
+          - test_arm_baremetal: test_pytest_recipes
           - test_arm_baremetal: test_smaller_stories_llama
           - test_arm_baremetal: test_memory_allocation
       fail-fast: false
diff --git a/backends/arm/recipes/__init__.py b/backends/arm/recipes/__init__.py
new file mode 100644
index 00000000000..2b751645d68
--- /dev/null
+++ b/backends/arm/recipes/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.export import recipe_registry
+
+from .arm_recipe_provider import ArmRecipeProvider
+from .arm_recipe_types import ArmRecipeType
+
+recipe_registry.register_backend_recipe_provider(ArmRecipeProvider())
+
+
+__all__ = ["ArmRecipeProvider", "ArmRecipeType"]
diff --git a/backends/arm/recipes/arm_recipe_provider.py b/backends/arm/recipes/arm_recipe_provider.py
new file mode 100644
index 00000000000..5e910324b54
--- /dev/null
+++ b/backends/arm/recipes/arm_recipe_provider.py
@@ -0,0 +1,224 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-strict
+
+from typing import Any, Callable, Optional, Sequence
+
+from executorch.backends.arm.common.arm_compile_spec import ArmCompileSpec
+from executorch.backends.arm.recipes.arm_recipe_types import ARM_BACKEND, ArmRecipeType
+from executorch.exir.pass_manager import PassType
+from executorch.export import (
+    BackendRecipeProvider,
+    ExportRecipe,
+    LoweringRecipe,
+    QuantizationRecipe,
+    RecipeType,
+    StageType,
+)
+
+
+_ETHOS_U_FAMILIES: dict[ArmRecipeType, tuple[str, tuple[int, ...], int]] = {
+    ArmRecipeType.ETHOS_U55_INT8: ("ethos-u55", (32, 64, 128, 256), 128),
+    ArmRecipeType.ETHOS_U65_INT8: ("ethos-u65", (256, 512), 256),
+    ArmRecipeType.ETHOS_U85_INT8: ("ethos-u85", (128, 256, 512, 1024, 2048), 256),
+}
+
+_ETHOS_U_KWARGS: frozenset[str] = frozenset(
+    {"macs", "system_config", "memory_mode", "extra_flags", "config_ini"}
+)
+
+# Matches aot_arm_compiler.py:479-484 — bit-identical Vela invocation vs. CLI.
+_VELA_DEFAULT_FLAGS: tuple[str, ...] = (
+    "--verbose-operators",
+    "--verbose-cycle-estimate",
+)
+
+# Pipeline used by INT8/A16W8 paths so ReplaceQuantNodesPass runs after the
+# partitioner (matches aot_arm_compiler.py:200-201).
+_PIPELINE_WITH_EDGE_PASSES: list[StageType] = [
+    StageType.SOURCE_TRANSFORM,
+    StageType.QUANTIZE,
+    StageType.TORCH_EXPORT,
+    StageType.TO_EDGE_TRANSFORM_AND_LOWER,
+    StageType.EDGE_PROGRAM_MANAGER_TRANSFORM,
+    StageType.TO_EXECUTORCH,
+]
+
+
+def _replace_quant_nodes_pass(_epm: Any) -> list[PassType]:
+    from executorch.backends.cortex_m.passes.replace_quant_nodes_pass import (
+        ReplaceQuantNodesPass,
+    )
+
+    return [ReplaceQuantNodesPass()]
+
+
+class ArmRecipeProvider(BackendRecipeProvider):
+    """Note: unknown kwargs raise ``ValueError`` (vs. XNNPACK/QNN, which log a
+    warning). Intentional for a new provider so typos like ``mac=128`` fail
+    fast rather than silently producing a wrong-target binary."""
+
+    @property
+    def backend_name(self) -> str:
+        return ARM_BACKEND
+
+    def get_supported_recipes(self) -> Sequence[RecipeType]:
+        return list(ArmRecipeType)
+
+    def create_recipe(
+        self, recipe_type: RecipeType, **kwargs: Any
+    ) -> Optional[ExportRecipe]:
+        if not isinstance(recipe_type, ArmRecipeType):
+            return None
+
+        if recipe_type in _ETHOS_U_FAMILIES:
+            return self._build_ethos_u_recipe(recipe_type, kwargs)
+
+        # Prime ethosu before importing vgf: a pre-existing circular dep
+        # between tosa.backend and ethosu.backend breaks if vgf is loaded
+        # first (vgf.backend → tosa.backend → _passes → ethosu.backend →
+        # tosa.backend [partial]). The Arm CLI works around it by the same
+        # ordering at module load (aot_arm_compiler.py:26-35).
+        import executorch.backends.arm.ethosu  # noqa: F401
+        from executorch.backends.arm.tosa.compile_spec import TosaCompileSpec
+        from executorch.backends.arm.vgf import VgfCompileSpec
+
+        # (compile_spec_factory, tosa_spec, quant_mode, replace_quant_nodes).
+        # replace_quant_nodes is False for VGF and FP, matching aot_arm_compiler.py:200.
+        delegated: dict[
+            ArmRecipeType,
+            tuple[Callable[[str], ArmCompileSpec], str, Optional[str], bool],
+        ] = {
+            ArmRecipeType.TOSA_FP: (TosaCompileSpec, "TOSA-1.0+FP", None, False),
+            ArmRecipeType.TOSA_INT8: (TosaCompileSpec, "TOSA-1.0+INT", "INT8", True),
+            ArmRecipeType.TOSA_A16W8: (
+                TosaCompileSpec,
+                "TOSA-1.0+INT+int16",
+                "A16W8",
+                True,
+            ),
+            ArmRecipeType.VGF_FP: (VgfCompileSpec, "TOSA-1.0+FP", None, False),
+            ArmRecipeType.VGF_INT8: (VgfCompileSpec, "TOSA-1.0+INT", "INT8", False),
+        }
+        factory, tosa_spec, quant_mode, replace_quant_nodes = delegated[recipe_type]
+        return self._build_delegated_recipe(
+            recipe_type, factory, tosa_spec, kwargs, quant_mode, replace_quant_nodes
+        )
+
+    def _build_ethos_u_recipe(
+        self, recipe_type: ArmRecipeType, kwargs: dict[str, Any]
+    ) -> ExportRecipe:
+        from executorch.backends.arm.ethosu import EthosUCompileSpec
+        from executorch.backends.arm.util._factory import create_partitioner
+
+        self._validate_kwargs(recipe_type, kwargs, _ETHOS_U_KWARGS)
+
+        family, allowed_macs, default_macs = _ETHOS_U_FAMILIES[recipe_type]
+        macs = kwargs.get("macs", default_macs)
+        if macs not in allowed_macs:
+            raise ValueError(
+                f"Recipe '{recipe_type.value}' does not support macs={macs}. "
+                f"Allowed: {list(allowed_macs)}"
+            )
+
+        user_extra_flags = kwargs.get("extra_flags") or []
+        compile_spec = EthosUCompileSpec(
+            target=f"{family}-{macs}",
+            system_config=kwargs.get("system_config"),
+            memory_mode=kwargs.get("memory_mode"),
+            extra_flags=list(_VELA_DEFAULT_FLAGS) + list(user_extra_flags),
+            config_ini=kwargs.get("config_ini", "Arm/vela.ini"),
+        )
+
+        return ExportRecipe(
+            name=recipe_type.value,
+            quantization_recipe=self._build_quantization_recipe(compile_spec, "INT8"),
+            lowering_recipe=LoweringRecipe(
+                partitioners=[create_partitioner(compile_spec)],
+                edge_manager_transform_passes=[_replace_quant_nodes_pass],
+            ),
+            pipeline_stages=_PIPELINE_WITH_EDGE_PASSES,
+        )
+
+    def _build_delegated_recipe(
+        self,
+        recipe_type: ArmRecipeType,
+        compile_spec_factory: Callable[[str], ArmCompileSpec],
+        tosa_spec: str,
+        kwargs: dict[str, Any],
+        quant_mode: Optional[str],
+        replace_quant_nodes: bool,
+    ) -> ExportRecipe:
+        from executorch.backends.arm.util._factory import create_partitioner
+
+        self._validate_kwargs(recipe_type, kwargs, frozenset())
+
+        compile_spec = compile_spec_factory(tosa_spec)
+        partitioner = create_partitioner(compile_spec)
+
+        if replace_quant_nodes:
+            lowering = LoweringRecipe(
+                partitioners=[partitioner],
+                edge_manager_transform_passes=[_replace_quant_nodes_pass],
+            )
+            pipeline = _PIPELINE_WITH_EDGE_PASSES
+        else:
+            lowering = LoweringRecipe(partitioners=[partitioner])
+            pipeline = None
+
+        return ExportRecipe(
+            name=recipe_type.value,
+            quantization_recipe=self._build_quantization_recipe(
+                compile_spec, quant_mode
+            ),
+            lowering_recipe=lowering,
+            pipeline_stages=pipeline,
+        )
+
+    @staticmethod
+    def _build_quantization_recipe(
+        compile_spec: ArmCompileSpec, quant_mode: Optional[str]
+    ) -> Optional[QuantizationRecipe]:
+        from executorch.backends.arm.quantizer import (
+            get_symmetric_a16w8_quantization_config,
+            get_symmetric_quantization_config,
+        )
+        from executorch.backends.arm.util._factory import create_quantizer
+
+        if quant_mode is None:
+            return None
+
+        quantizer = create_quantizer(compile_spec)
+        if quant_mode == "INT8":
+            operator_config = get_symmetric_quantization_config(is_per_channel=True)
+        elif quant_mode == "A16W8":
+            if not compile_spec.tosa_spec.support_extension("int16"):
+                raise ValueError(
+                    f"TOSA spec {compile_spec.tosa_spec} does not support int16 "
+                    "(required for A16W8)"
+                )
+            operator_config = get_symmetric_a16w8_quantization_config(
+                is_per_channel=True
+            )
+        else:
+            raise ValueError(f"Unsupported quant_mode: {quant_mode}")
+        quantizer.set_global(operator_config)
+        return QuantizationRecipe(quantizers=[quantizer])
+
+    @staticmethod
+    def _validate_kwargs(
+        recipe_type: ArmRecipeType,
+        kwargs: dict[str, Any],
+        expected: frozenset[str],
+    ) -> None:
+        unexpected = set(kwargs.keys()) - expected
+        if unexpected:
+            allowed = sorted(expected) if expected else "none"
+            raise ValueError(
+                f"Arm recipe '{recipe_type.value}' got unexpected parameters: "
+                f"{sorted(unexpected)}. Allowed: {allowed}"
+            )
diff --git a/backends/arm/recipes/arm_recipe_types.py b/backends/arm/recipes/arm_recipe_types.py
new file mode 100644
index 00000000000..cc51a5249a9
--- /dev/null
+++ b/backends/arm/recipes/arm_recipe_types.py
@@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.export import RecipeType
+
+
+ARM_BACKEND = "arm"
+
+
+class ArmRecipeType(RecipeType):
+    """Arm-specific recipe types.
+
+    Coverage matches ``backends/arm/scripts/aot_arm_compiler.py`` today
+    (Cortex-M is not yet supported via recipes).
+
+    Ethos-U recipes accept the following kwargs:
+        macs (int): MAC count for the family.
+            U55: 32 / 64 / 128 / 256 (default 128).
+            U65: 256 / 512 (default 256).
+            U85: 128 / 256 / 512 / 1024 / 2048 (default 256).
+        system_config (str): Vela system config name. Defaults from
+            ``EthosUCompileSpec`` apply when omitted.
+        memory_mode (str): Vela memory mode. Defaults from
+            ``EthosUCompileSpec`` apply when omitted.
+        extra_flags (list[str]): Extra Vela flags, added after built-in defaults.
+        config_ini (str): Path to a Vela .ini file (default ``Arm/vela.ini``).
+
+    """
+
+    ETHOS_U55_INT8 = "arm_ethos_u55_int8"
+    ETHOS_U65_INT8 = "arm_ethos_u65_int8"
+    ETHOS_U85_INT8 = "arm_ethos_u85_int8"
+
+    TOSA_FP = "arm_tosa_fp"
+    TOSA_INT8 = "arm_tosa_int8"
+    TOSA_A16W8 = "arm_tosa_a16w8"
+
+    VGF_FP = "arm_vgf_fp"
+    VGF_INT8 = "arm_vgf_int8"
+
+    @classmethod
+    def get_backend_name(cls) -> str:
+        return ARM_BACKEND
diff --git a/backends/arm/test/recipes/__init__.py b/backends/arm/test/recipes/__init__.py
new file mode 100644
index 00000000000..2e41cd717f6
--- /dev/null
+++ b/backends/arm/test/recipes/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
diff --git a/backends/arm/test/recipes/test_arm_recipes.py b/backends/arm/test/recipes/test_arm_recipes.py
new file mode 100644
index 00000000000..8d31e3ef289
--- /dev/null
+++ b/backends/arm/test/recipes/test_arm_recipes.py
@@ -0,0 +1,297 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from typing import Any, Optional
+
+import torch
+
+from executorch.backends.arm.recipes.arm_recipe_provider import ArmRecipeProvider
+from executorch.backends.arm.recipes.arm_recipe_types import ARM_BACKEND, ArmRecipeType
+from executorch.export import ExportRecipe, recipe_registry
+
+
+class _AddModule(torch.nn.Module):
+    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+        return x + y
+
+
+class _ConvReluModule(torch.nn.Module):
+    def __init__(self) -> None:
+        super().__init__()
+        self.conv = torch.nn.Conv2d(3, 8, kernel_size=3, padding=1)
+        self.relu = torch.nn.ReLU()
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.relu(self.conv(x))
+
+
+def _compile_spec_value(partitioner: Any, key: str) -> Optional[str]:
+    for spec in partitioner.delegation_spec.compile_specs:
+        if spec.key == key:
+            value = spec.value
+            return value.decode() if isinstance(value, (bytes, bytearray)) else value
+    return None
+
+
+def _first_partitioner(recipe: ExportRecipe) -> Any:
+    assert recipe.lowering_recipe is not None
+    assert recipe.lowering_recipe.partitioners
+    return recipe.lowering_recipe.partitioners[0]
+
+
+def _input_activation_dtype(recipe: ExportRecipe) -> Optional[torch.dtype]:
+    assert recipe.quantization_recipe is not None
+    assert recipe.quantization_recipe.quantizers is not None
+    quantizer = recipe.quantization_recipe.quantizers[0]
+    config = quantizer.global_config  # type: ignore[attr-defined]
+    if config is None or config.input_activation is None:
+        return None
+    return config.input_activation.dtype
+
+
+class TestArmRecipeRegistration(unittest.TestCase):
+    def test_backend_registered(self) -> None:
+        # Auto-registered via recipes/__init__.py.
+        self.assertIn(ARM_BACKEND, recipe_registry.list_backends())
+
+    def test_supported_recipes_match_enum(self) -> None:
+        # Guards against the provider drifting from ArmRecipeType (e.g.,
+        # adding an enum value but forgetting to wire it).
+        supported = recipe_registry.get_supported_recipes(ARM_BACKEND)
+        self.assertEqual(set(supported), set(ArmRecipeType))
+
+    def test_unknown_recipe_returns_none(self) -> None:
+        from executorch.export import RecipeType
+
+        class _StubRecipeType(RecipeType):
+            FOO = "stub_foo"
+
+            @classmethod
+            def get_backend_name(cls) -> str:
+                return "stub"
+
+        self.assertIsNone(ArmRecipeProvider().create_recipe(_StubRecipeType.FOO))
+
+
+class _ArmRecipeBaseTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls) -> None:
+        try:
+            import executorch.backends.arm.tosa.compile_spec  # noqa: F401
+            import tosa_serializer  # noqa: F401
+        except ImportError as exc:
+            raise unittest.SkipTest(
+                f"Arm Python deps not available, skipping: {exc}"
+            ) from exc
+
+
+class _EthosURecipeBaseTest(_ArmRecipeBaseTest):
+    """`EthosUCompileSpec` transitively imports `arm_vela`, which is unavailable
+    on `--disable-ethos-u-deps` runners.
+
+    Without this guard, those runs would surface as ImportError instead of a
+    clean skip.
+
+    """
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        super().setUpClass()
+        try:
+            import executorch.backends.arm.ethosu  # noqa: F401
+        except ImportError as exc:
+            raise unittest.SkipTest(
+                f"Ethos-U deps not available, skipping: {exc}"
+            ) from exc
+
+
+class TestTosaRecipes(_ArmRecipeBaseTest):
+    def test_tosa_construction(self) -> None:
+        cases = [
+            (ArmRecipeType.TOSA_FP, "arm_tosa_fp", None),
+            (ArmRecipeType.TOSA_INT8, "arm_tosa_int8", torch.int8),
+            (ArmRecipeType.TOSA_A16W8, "arm_tosa_a16w8", torch.int16),
+        ]
+        for recipe_type, expected_name, expected_act_dtype in cases:
+            with self.subTest(recipe_type=recipe_type):
+                recipe = ExportRecipe.get_recipe(recipe_type)
+                self.assertEqual(recipe.name, expected_name)
+                self.assertIsNotNone(_first_partitioner(recipe))
+                if expected_act_dtype is None:
+                    self.assertIsNone(recipe.quantization_recipe)
+                else:
+                    self.assertEqual(
+                        _input_activation_dtype(recipe), expected_act_dtype
+                    )
+
+    def test_unexpected_kwarg_raises(self) -> None:
+        with self.assertRaisesRegex(ValueError, "unexpected parameters"):
+            ExportRecipe.get_recipe(ArmRecipeType.TOSA_INT8, foo=1)
+
+
+class TestVgfRecipes(_ArmRecipeBaseTest):
+    def test_vgf_construction(self) -> None:
+        cases = [
+            (ArmRecipeType.VGF_FP, "arm_vgf_fp", None),
+            (ArmRecipeType.VGF_INT8, "arm_vgf_int8", torch.int8),
+        ]
+        for recipe_type, expected_name, expected_act_dtype in cases:
+            with self.subTest(recipe_type=recipe_type):
+                recipe = ExportRecipe.get_recipe(recipe_type)
+                self.assertEqual(recipe.name, expected_name)
+                self.assertIsNotNone(_first_partitioner(recipe))
+                if expected_act_dtype is None:
+                    self.assertIsNone(recipe.quantization_recipe)
+                else:
+                    self.assertEqual(
+                        _input_activation_dtype(recipe), expected_act_dtype
+                    )
+
+
+class TestEthosURecipes(_EthosURecipeBaseTest):
+    def test_default_macs(self) -> None:
+        cases = [
+            (ArmRecipeType.ETHOS_U55_INT8, "ethos-u55-128"),
+            (ArmRecipeType.ETHOS_U65_INT8, "ethos-u65-256"),
+            (ArmRecipeType.ETHOS_U85_INT8, "ethos-u85-256"),
+        ]
+        for recipe_type, expected_target in cases:
+            with self.subTest(recipe_type=recipe_type):
+                recipe = ExportRecipe.get_recipe(recipe_type)
+                self.assertEqual(recipe.name, recipe_type.value)
+                self.assertEqual(_input_activation_dtype(recipe), torch.int8)
+                partitioner = _first_partitioner(recipe)
+                self.assertEqual(
+                    _compile_spec_value(partitioner, "target"), expected_target
+                )
+
+    def test_custom_macs(self) -> None:
+        cases = [
+            (ArmRecipeType.ETHOS_U55_INT8, 32, "ethos-u55-32"),
+            (ArmRecipeType.ETHOS_U55_INT8, 256, "ethos-u55-256"),
+            (ArmRecipeType.ETHOS_U65_INT8, 512, "ethos-u65-512"),
+            (ArmRecipeType.ETHOS_U85_INT8, 128, "ethos-u85-128"),
+            (ArmRecipeType.ETHOS_U85_INT8, 2048, "ethos-u85-2048"),
+        ]
+        for recipe_type, macs, expected_target in cases:
+            with self.subTest(recipe_type=recipe_type, macs=macs):
+                recipe = ExportRecipe.get_recipe(recipe_type, macs=macs)
+                partitioner = _first_partitioner(recipe)
+                self.assertEqual(
+                    _compile_spec_value(partitioner, "target"), expected_target
+                )
+
+    def test_invalid_macs_raises(self) -> None:
+        cases = [
+            (ArmRecipeType.ETHOS_U55_INT8, 512),
+            (ArmRecipeType.ETHOS_U65_INT8, 128),
+            (ArmRecipeType.ETHOS_U85_INT8, 64),
+            (ArmRecipeType.ETHOS_U55_INT8, 999),
+        ]
+        for recipe_type, macs in cases:
+            with self.subTest(recipe_type=recipe_type, macs=macs):
+                with self.assertRaises(ValueError):
+                    ExportRecipe.get_recipe(recipe_type, macs=macs)
+
+    def test_pass_through_kwargs(self) -> None:
+        recipe = ExportRecipe.get_recipe(
+            ArmRecipeType.ETHOS_U55_INT8,
+            macs=128,
+            system_config="Custom_System",
+            memory_mode="Custom_Memory",
+            extra_flags=["--user-flag"],
+            config_ini="custom/vela.ini",
+        )
+        partitioner = _first_partitioner(recipe)
+        flags = _compile_spec_value(partitioner, "compile_flags") or ""
+        self.assertIn("--system-config=Custom_System", flags)
+        self.assertIn("--memory-mode=Custom_Memory", flags)
+        # Provider-added verbose flags must be present (aot_arm_compiler.py:479-484).
+        self.assertIn("--verbose-operators", flags)
+        self.assertIn("--verbose-cycle-estimate", flags)
+        self.assertIn("--user-flag", flags)
+        self.assertIn("--config=custom/vela.ini", flags)
+
+    def test_unexpected_kwarg_raises(self) -> None:
+        # Catches typos like `mac=128` (instead of `macs=128`) that would
+        # otherwise silently produce a wrong-target binary.
+        with self.assertRaisesRegex(ValueError, "unexpected parameters"):
+            ExportRecipe.get_recipe(ArmRecipeType.ETHOS_U55_INT8, mac=128)
+
+
+class TestTosaAOTRoundTrip(_ArmRecipeBaseTest):
+    """TOSA AOT round-trips run with just ``tosa_serializer`` installed.
+
+    Ethos-U / VGF round-trips need a real compiler and are deferred to an FVP-
+    bearing follow-up.
+
+    """
+
+    def _export(
+        self,
+        recipe: ExportRecipe,
+        model: torch.nn.Module,
+        example_inputs: tuple,
+    ):
+        from executorch.export import export
+
+        session = export(
+            model=model,
+            example_inputs=[example_inputs],
+            export_recipe=recipe,
+        )
+        return session.get_executorch_program()
+
+    def _instruction_kinds(self, program) -> tuple[list, list]:
+        from executorch.exir.schema import DelegateCall, KernelCall
+
+        instructions = program.execution_plan[0].chains[0].instructions
+        assert instructions is not None
+        operators = program.execution_plan[0].operators
+        delegate_calls = [
+            i for i in instructions if isinstance(i.instr_args, DelegateCall)
+        ]
+        kernel_op_names = [
+            operators[i.instr_args.op_index].name
+            for i in instructions
+            if isinstance(i.instr_args, KernelCall)
+        ]
+        return delegate_calls, kernel_op_names
+
+    def test_tosa_fp_export(self) -> None:
+        # FP path: no quant ops, expect full delegation (Add is supported by TOSA).
+        program = self._export(
+            ExportRecipe.get_recipe(ArmRecipeType.TOSA_FP),
+            _AddModule(),
+            (torch.randn(2, 3), torch.randn(2, 3)),
+        )
+        delegates, kernels = self._instruction_kinds(program)
+        self.assertEqual(len(delegates), 1, "Add should produce one TOSA delegate")
+        self.assertEqual(
+            kernels, [], f"Expected full delegation, got kernels {kernels}"
+        )
+
+    def test_tosa_int8_export(self) -> None:
+        # INT8 path: boundary quantize/dequantize remain outside the delegate
+        # and ReplaceQuantNodesPass rewrites them to cortex_m::* (matches
+        # aot_arm_compiler.py:200-201).
+        program = self._export(
+            ExportRecipe.get_recipe(ArmRecipeType.TOSA_INT8),
+            _ConvReluModule(),
+            (torch.randn(1, 3, 8, 8),),
+        )
+        delegates, kernels = self._instruction_kinds(program)
+        self.assertGreaterEqual(len(delegates), 1, "Conv+ReLU should delegate")
+        for op_name in kernels:
+            self.assertTrue(
+                op_name.startswith("cortex_m::"),
+                f"Non-delegate kernels must be cortex_m boundary ops; got {op_name}",
+            )
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backends/arm/test/test_arm_baremetal.sh b/backends/arm/test/test_arm_baremetal.sh
index ad8cd8b7d3a..f9b9790e46e 100755
--- a/backends/arm/test/test_arm_baremetal.sh
+++ b/backends/arm/test/test_arm_baremetal.sh
@@ -96,6 +96,19 @@ test_pytest_models_no_target() {
     echo "${TEST_SUITE_NAME}: PASS"
 }
 
+# ----------------------------------------------
+# -------- ExportRecipe construction tests -----
+# ----------------------------------------------
+test_pytest_recipes() {
+    echo "${TEST_SUITE_NAME}: Run pytest for Arm ExportRecipe construction"
+
+    # Recipe construction plus TOSA AOT round-trip tests. Per-target suites
+    # raise SkipTest when the corresponding SDK piece (Vela, libvgf, ...) isn't
+    # installed, so this command is safe on both no-deps and full-SDK runners.
+    pytest "${PYTEST_RETRY_ARGS[@]}" --verbose --color=yes --numprocesses=auto --durations=10 backends/arm/test/recipes
+    echo "${TEST_SUITE_NAME}: PASS"
+}
+
 # -------------------------------------
 # -------- TOSA specific tests --------
 # -------------------------------------
diff --git a/export/stages.py b/export/stages.py
index dbc4703faaa..0d874823e10 100644
--- a/export/stages.py
+++ b/export/stages.py
@@ -553,7 +553,7 @@ def stage_type(self) -> str:
     def valid_predecessor_stages(self) -> List["StageType"]:
         return [
             StageType.TO_EDGE,
-            # StageType.TO_EDGE_TRANSFORM_AND_LOWER,  # TODO
+            StageType.TO_EDGE_TRANSFORM_AND_LOWER,
         ]
 
     @property