From 8b8ce9ec4c83f5fcd6a875679533f4f77c8f1b55 Mon Sep 17 00:00:00 2001 From: RJ Ascani Date: Wed, 13 May 2026 11:12:39 -0700 Subject: [PATCH] Cortex-M backend: build for any Cortex-M variant against Corstone-300 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the Cortex-M test pipeline so the `cortex-m<N>` target strings registered in the AOT compile-config plumbing actually produce runnable, ISA-faithful binaries. The binary is built end-to-end with `-mcpu=cortex-m<N>` — runner and core libraries alike — so CMSIS-NN's compile-time `__ARM_FEATURE_DSP` / `__ARM_FEATURE_MVE` selector exercises the matching kernel implementation. The Corstone-300 M55 simulator is an ISA superset of every earlier Cortex-M, so it executes binaries compiled for older cores without modification — the CI gate becomes "did the right CMSIS-NN code path execute correctly" rather than "did per-CPU silicon behave as expected". The build pipeline learns the target CPU end-to-end: * `build_executorch.sh` accepts `--target_cpu` and passes `-DTARGET_CPU` to the toolchain CMake. * `build_test_runner.sh` derives `target_cpu` from `--target` and forwards it. The regex matches both bare `cortex-m<N>` (the canonical form after the Phase 1 AOT API drop of `+int8`) and the legacy `cortex-m<N>+int8` shape for any callers still on it. * `build_executor_runner.sh` derives the matching `target_cpu` and supplies a dummy `ETHOSU_TARGET_NPU_CONFIG=ethos-u55-128` so core_platform's `ethosu_get_architecture()` parser stays happy. A single `arm_test/cmake-out` continues to stage the core libraries — when switching `target_cpu` locally, clear `arm_test/cmake-out` first to avoid linking stale per-CPU artifacts. Without the `--target_cpu` plumbing, `build_executorch.sh` defaulted to `-mcpu=cortex-m55`, so the core libraries (libexecutorch.a, libcortex_m_kernels.a, the bundled CMSIS-NN) baked in M55+MVE code paths. 
A runner built with `-mcpu=cortex-m4` would link those libraries and execute MVE instructions on Corstone-300's M55 — passing bundled-IO checks while testing the wrong code path. One transient patch is layered into the externally-fetched `ethos-u/core_platform` repo via the existing `patch_repo` mechanism: an `#if defined(__ARM_ARCH_8M_MAIN__) || defined(__ARM_ARCH_8_1M_MAIN__)` guard around the MPU init block in `corstone-300/target.cpp`. Without it, the Armv8-M-only `ARM_MPU_RBAR` / `ARM_MPU_RLAR` API breaks the build for older cores. The FVP doesn't enforce protection regions without an explicit setup, so simulation correctness is unaffected. The patch is a bridge — see TODO at `corstone_utils.cmake:52` — pending upstream merge of the equivalent guard. Inside our own runner, the optional Armv8.1-M PMU intrinsics (`ARM_PMU_*`) in `arm_executor_runner.cpp` and `arm_perf_monitor.cpp` are guarded on `__ARM_ARCH_8_1M_MAIN__`. Earlier cores get a zero cycle count rather than a compile error; functional correctness is unaffected. `run_fvp.sh` routes all `cortex-m*` targets except `cortex-m85*` to the Corstone-300 FVP. Locally validated end-to-end on Corstone-300 with the `qadd` model: * `cortex-m55` — baseline, PASS; op_quantize_per_tensor.cpp.obj contains MVE instructions (vdup.16, vmax.s16). * `cortex-m4` — PASS; same object has no MVE — only single-precision FP (vmul.f32, vcvt.s32.f32). CMSIS-NN selects the DSP path (1275 DSP opcodes in libcmsis-nn.a). * `cortex-m7` — PASS; same shape as M4. Scalar-class variants (`cortex-m{0,0plus,3,23}`) still need a follow-up: an Armv6-M `HardFault_Handler` guard in `target.cpp` and a `core_software/cmsis.cmake` `ARMCM0plus` directory-case fix. The target_cpu plumbing here already accommodates soft-float ABI builds — the follow-up only adds those two fixes (the `__ARM_ARCH_*` guard and the directory-case fix). Authored with Claude. 
--- backends/arm/scripts/build_executor_runner.sh | 11 +++- backends/arm/scripts/build_executorch.sh | 7 +++ backends/arm/scripts/corstone_utils.cmake | 7 ++- backends/arm/scripts/run_fvp.sh | 6 +-- backends/cortex_m/test/build_test_runner.sh | 20 ++++++- ...rmv8-M-MPU-init-for-cross-CPU-builds.patch | 53 +++++++++++++++++++ .../executor_runner/arm_executor_runner.cpp | 9 +++- .../arm/executor_runner/arm_perf_monitor.cpp | 25 ++++++--- examples/arm/run.sh | 2 +- 9 files changed, 123 insertions(+), 17 deletions(-) create mode 100644 examples/arm/ethos-u-setup/core_platform/0002-Guard-Armv8-M-MPU-init-for-cross-CPU-builds.patch diff --git a/backends/arm/scripts/build_executor_runner.sh b/backends/arm/scripts/build_executor_runner.sh index f2ffd2e27a7..c436397458b 100755 --- a/backends/arm/scripts/build_executor_runner.sh +++ b/backends/arm/scripts/build_executor_runner.sh @@ -151,10 +151,17 @@ fi mkdir -p "${output_folder}" output_folder=$(realpath ${output_folder}) -if [[ ${target} == *"ethos-u55"* ]]; then +if [[ ${target} =~ ^cortex-m([0-9]+(plus|p)?)(\+|$) ]]; then + # NPU isn't used at runtime, but core_platform's ethosu_get_architecture() + # parser rejects non-ethos-u strings — pass a dummy. 
+ target_cpu="cortex-m${BASH_REMATCH[1]}" + npu_target_config="ethos-u55-128" +elif [[ ${target} == *"ethos-u55"* ]]; then target_cpu=cortex-m55 + npu_target_config="${target}" else target_cpu=cortex-m85 + npu_target_config="${target}" fi echo "--------------------------------------------------------------------------------" echo "Build Arm ${toolchain/-gcc/} executor_runner for ${target} PTE: ${pte_file} using ${system_config} ${memory_mode} ${extra_build_flags} to '${output_folder}'" @@ -178,7 +185,7 @@ cmake \ -DET_DIR_PATH:PATH=${et_root_dir} \ -DET_BUILD_DIR_PATH:PATH=${et_build_dir} \ -DETHOS_SDK_PATH:PATH=${ethos_u_root_dir} \ - -DETHOSU_TARGET_NPU_CONFIG=${target} \ + -DETHOSU_TARGET_NPU_CONFIG=${npu_target_config} \ ${pte_data} \ ${build_bundleio_flags} \ ${build_with_etdump_flags} \ diff --git a/backends/arm/scripts/build_executorch.sh b/backends/arm/scripts/build_executorch.sh index cf7e327b9ce..e83586f4327 100755 --- a/backends/arm/scripts/build_executorch.sh +++ b/backends/arm/scripts/build_executorch.sh @@ -24,6 +24,7 @@ build_type="Release" build_devtools=OFF build_with_etdump=OFF is_linux_musl=0 +target_cpu="" help() { echo "Usage: $(basename $0) [options]" @@ -33,6 +34,7 @@ help() { echo " --devtools Build Devtools libs" echo " --etdump Adds Devtools etdump support to track timing, etdump area will be base64 encoded in the log" echo " --toolchain= Toolchain can be specified (arm-none-eabi-gcc, arm-zephyr-eabi-gcc, aarch64-linux-musl-gcc). Default: ${toolchain}" + echo " --target_cpu= Override the toolchain's default TARGET_CPU (e.g. cortex-m4). Switching target_cpu reuses the same cmake-out dir, so clear ${et_build_root}/cmake-out first to avoid stale per-CPU artifacts. Default: unset (toolchain default)." 
exit 0 } @@ -44,6 +46,7 @@ for arg in "$@"; do --devtools) build_devtools=ON ;; --etdump) build_with_etdump=ON ;; --toolchain=*) toolchain="${arg#*=}";; + --target_cpu=*) target_cpu="${arg#*=}";; *) ;; esac @@ -87,6 +90,10 @@ cmake_args=( -DEXECUTORCH_BUILD_ARM_ETDUMP=${build_with_etdump} ) +if [[ -n "${target_cpu}" ]]; then + cmake_args+=(-DTARGET_CPU=${target_cpu}) +fi + if [[ ${is_linux_musl} -eq 1 ]]; then if [[ -z "${MUSL_TOOLCHAIN_ROOT:-}" ]]; then echo "Error: MUSL_TOOLCHAIN_ROOT is required for aarch64-linux-musl-gcc." diff --git a/backends/arm/scripts/corstone_utils.cmake b/backends/arm/scripts/corstone_utils.cmake index bdfc3aeb007..58ce4f9a919 100644 --- a/backends/arm/scripts/corstone_utils.cmake +++ b/backends/arm/scripts/corstone_utils.cmake @@ -49,7 +49,12 @@ function(fetch_ethos_u_content ETHOS_SDK_PATH ET_DIR_PATH) "source backends/arm/scripts/utils.sh && patch_repo ${ETHOS_SDK_PATH}/core_software ${core_software_base_rev} ${patch_dir}" WORKING_DIRECTORY ${ET_DIR_PATH} ) - # Always patch the core_platform repo since this is fast enough. + # Always patch the core_platform repo since this is fast enough. TODO: + # examples/arm/ethos-u-setup/core_platform/0002-*.patch is a transient bridge + # that guards Armv8-M-only MPU init so the source compiles for non-Armv8-M + # Cortex-M cores. Once the same guard lands upstream in ethos-u/core_platform + # and ${core_platform_base_rev} is bumped past that commit, delete the 0002 + # patch. set(core_platform_base_rev "26.02") execute_process( COMMAND diff --git a/backends/arm/scripts/run_fvp.sh b/backends/arm/scripts/run_fvp.sh index 9f0010189af..cbdb4764248 100755 --- a/backends/arm/scripts/run_fvp.sh +++ b/backends/arm/scripts/run_fvp.sh @@ -55,9 +55,9 @@ done elf_file=$(realpath ${elf_file}) -# cortex-m55 is the only Cortex-M CPU on the Corstone-300 board today; -# cortex-m85 lives on Corstone-320, so it falls through to the SSE-320 FVP. 
-if [[ ${target} == *"ethos-u55"* || ${target} == cortex-m55* ]]; then +# cortex-m85 lives on Corstone-320; all other Cortex-M variants run on +# the Corstone-300 M55 (ISA superset). +if [[ ${target} == *"ethos-u55"* || ${target} == cortex-m* && ${target} != cortex-m85* ]]; then fvp_model=FVP_Corstone_SSE-300_Ethos-U55 else fvp_model=FVP_Corstone_SSE-320 diff --git a/backends/cortex_m/test/build_test_runner.sh b/backends/cortex_m/test/build_test_runner.sh index 2505f83c9da..bdca1a21e7c 100755 --- a/backends/cortex_m/test/build_test_runner.sh +++ b/backends/cortex_m/test/build_test_runner.sh @@ -8,11 +8,27 @@ set -eu +target="cortex-m55" +for arg in "$@"; do + case $arg in + --target=*) target="${arg#*=}";; + *) ;; + esac +done + +# Forward to build_executorch.sh so the core libs share the runner's -mcpu. +if [[ ${target} =~ ^cortex-m([0-9]+(plus|p)?)(\+|$) ]]; then + target_cpu="cortex-m${BASH_REMATCH[1]}" +else + echo "Error: build_test_runner.sh only supports cortex-m targets, got: ${target}" + exit 1 +fi + # Always rebuild executorch in case the cortex-m kernels has been updated. 
script_dir=$(realpath "$(dirname "${BASH_SOURCE[0]}")") et_root_dir=$(realpath "${script_dir}/../../..") build_executorch="${et_root_dir}/backends/arm/scripts/build_executorch.sh" -${build_executorch} --devtools +${build_executorch} --devtools --target_cpu="${target_cpu}" # Build executor runner with selected aten ops and semi hosting build_dir="${et_root_dir}/arm_test" @@ -32,4 +48,4 @@ aten::unsqueeze_copy.out,\ aten::select_copy.int_out,\ aten::amax.out" -${build_executor_runner} --pte=semihosting --bundleio --target=ethos-u55-128 --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0" +${build_executor_runner} --pte=semihosting --bundleio --target="${target}" --output="${build_root_test_dir}" --select_ops_list="${select_ops_list}" --extra_build_flags="-DET_ATOL=5.0 -DET_RTOL=1.0" diff --git a/examples/arm/ethos-u-setup/core_platform/0002-Guard-Armv8-M-MPU-init-for-cross-CPU-builds.patch b/examples/arm/ethos-u-setup/core_platform/0002-Guard-Armv8-M-MPU-init-for-cross-CPU-builds.patch new file mode 100644 index 00000000000..c03aa4ec24f --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/0002-Guard-Armv8-M-MPU-init-for-cross-CPU-builds.patch @@ -0,0 +1,53 @@ +From 7a00a3cdf2f47424fdf29718e582ad7ae9af9cb5 Mon Sep 17 00:00:00 2001 +From: RJ Ascani +Date: Tue, 12 May 2026 09:25:48 -0700 +Subject: [PATCH] Guard Armv8-M MPU init for cross-CPU builds +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +The Armv8-M MPU API (ARM_MPU_RBAR, ARM_MPU_RLAR, ARM_MPU_Region_t with +its v8-M layout) is only defined for Armv8-M Mainline targets. When +building the Corstone-300 platform for an older Cortex-M variant — to +exercise the scalar / DSP CMSIS-NN code paths on the Corstone-300 M55 +simulator, which is an ISA superset — the v8-M MPU symbols are not in +scope and the build fails. 
+ +Guard the MPU configuration block with __ARM_ARCH_8M_MAIN__ / +__ARM_ARCH_8_1M_MAIN__ so the same source compiles for Cortex-M0/M0+/ +M3/M4/M7 against the Corstone-300 platform layer. The FVP doesn't +enforce MPU permissions, so skipping the configuration has no +runtime effect on simulation correctness. +--- + targets/corstone-300/target.cpp | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/targets/corstone-300/target.cpp b/targets/corstone-300/target.cpp +index 45eb98e..bda2248 100644 +--- a/targets/corstone-300/target.cpp ++++ b/targets/corstone-300/target.cpp +@@ -314,6 +314,13 @@ void targetSetup() { + #endif + + // MPU setup ++ // ++ // The Armv8-M `ARM_MPU_RBAR`/`ARM_MPU_RLAR` API is only available on ++ // Armv8-M Mainline cores (M33/M55/M85/M35P). On pre-Armv8-M targets ++ // (M0/M0+/M3/M4/M7), CMSIS doesn't define these macros — the binary ++ // is still runnable on the Corstone-300 M55 simulator (ISA superset) ++ // without MPU configuration, since the FVP doesn't enforce protection. 
++#if defined(__ARM_ARCH_8M_MAIN__) || defined(__ARM_ARCH_8_1M_MAIN__) + const std::vector mpuConfig = { + { + // ITCM (NS) +@@ -418,6 +425,7 @@ void targetSetup() { + + // Setup MPU configuration + Mpu::loadAndEnableConfig(&mpuConfig[0], mpuConfig.size()); ++#endif // __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ + + #if defined(CPU_CACHE_ENABLE) && defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U) + SCB_EnableICache(); +-- +2.53.0-Meta + diff --git a/examples/arm/executor_runner/arm_executor_runner.cpp b/examples/arm/executor_runner/arm_executor_runner.cpp index c1c49d54779..3a7289b7868 100644 --- a/examples/arm/executor_runner/arm_executor_runner.cpp +++ b/examples/arm/executor_runner/arm_executor_runner.cpp @@ -272,11 +272,14 @@ unsigned char* ethosu_fast_scratch = dedicated_sram; #endif [[maybe_unused]] void et_pal_init(void) { - // Enable ARM PMU Clock +#if defined(__ARM_ARCH_8_1M_MAIN__) + // Armv8.1-M Mainline cores (M55, M85) have the optional PMU extension. + // Pre-Armv8.1-M cores lack ARM_PMU_*; et_pal_current_ticks() returns 0. ARM_PMU_Enable(); DCB->DEMCR |= DCB_DEMCR_TRCENA_Msk; // Trace enable ARM_PMU_CYCCNT_Reset(); ARM_PMU_CNTR_Enable(PMU_CNTENSET_CCNTR_ENABLE_Msk); +#endif } /** @@ -296,7 +299,11 @@ unsigned char* ethosu_fast_scratch = dedicated_sram; } [[maybe_unused]] et_timestamp_t et_pal_current_ticks(void) { +#if defined(__ARM_ARCH_8_1M_MAIN__) return ARM_PMU_Get_CCNTR(); +#else + return 0; +#endif } [[maybe_unused]] et_tick_ratio_t et_pal_ticks_to_ns_multiplier(void) { diff --git a/examples/arm/executor_runner/arm_perf_monitor.cpp b/examples/arm/executor_runner/arm_perf_monitor.cpp index 35fd114f777..5ff22f77597 100644 --- a/examples/arm/executor_runner/arm_perf_monitor.cpp +++ b/examples/arm/executor_runner/arm_perf_monitor.cpp @@ -16,6 +16,15 @@ namespace { +// Returns the Armv8.1-M PMU cycle counter; 0 on cores without it. 
+static inline uint64_t arm_pmu_cycles() { +#if defined(__ARM_ARCH_8_1M_MAIN__) + return ARM_PMU_Get_CCNTR(); +#else + return 0; +#endif +} + #if defined(ETHOSU55) || defined(ETHOSU65) const uint32_t ethosu_pmuCountersUsed = 4; #elif defined(ETHOSU85) @@ -85,7 +94,7 @@ void ethosu_inference_begin(struct ethosu_driver* drv, void*) { // Save Cortex-M cycle clock to calculate total CPU cycles used in // ethosu_inference_end() - ethosu_ArmWhenNPURunCycleCountStart = ARM_PMU_Get_CCNTR(); + ethosu_ArmWhenNPURunCycleCountStart = arm_pmu_cycles(); } // Callback invoked at end of NPU execution @@ -99,21 +108,21 @@ void ethosu_inference_end(struct ethosu_driver* drv, void*) { ETHOSU_PMU_Disable(drv); // Add Cortex-M cycle clock used during this NPU execution ethosu_ArmWhenNPURunCycleCount += - (ARM_PMU_Get_CCNTR() - ethosu_ArmWhenNPURunCycleCountStart); + (arm_pmu_cycles() - ethosu_ArmWhenNPURunCycleCountStart); } // Callback invoked at start of ArmBackend::execute() void EthosUBackend_execute_begin() { // Save Cortex-M cycle clock to calculate total CPU cycles used in // ArmBackend_execute_end() - ethosu_ArmBackendExecuteCycleCountStart = ARM_PMU_Get_CCNTR(); + ethosu_ArmBackendExecuteCycleCountStart = arm_pmu_cycles(); } // Callback invoked at end of ArmBackend::execute() void EthosUBackend_execute_end() { // Add Cortex-M cycle clock used during this ArmBackend::execute() ethosu_ArmBackendExecuteCycleCount += - (ARM_PMU_Get_CCNTR() - ethosu_ArmBackendExecuteCycleCountStart); + (arm_pmu_cycles() - ethosu_ArmBackendExecuteCycleCountStart); } } @@ -126,14 +135,16 @@ void StartMeasurements() { for (size_t i = 0; i < ethosu_pmuCountersUsed; i++) { ethosu_pmuEventCounts[i] = 0; } - ethosu_ArmCycleCountStart = ARM_PMU_Get_CCNTR(); + ethosu_ArmCycleCountStart = arm_pmu_cycles(); } void StopMeasurements(int num_inferences) { +#if defined(__ARM_ARCH_8_1M_MAIN__) ARM_PMU_CNTR_Disable( PMU_CNTENCLR_CCNTR_ENABLE_Msk | PMU_CNTENCLR_CNT0_ENABLE_Msk | PMU_CNTENCLR_CNT1_ENABLE_Msk); - 
uint32_t cycle_count = ARM_PMU_Get_CCNTR() - ethosu_ArmCycleCountStart; +#endif + uint32_t cycle_count = arm_pmu_cycles() - ethosu_ArmCycleCountStart; // Number of comand streams handled by the NPU ET_LOG(Info, "NPU Inferences : %d", num_inferences); @@ -171,7 +182,7 @@ void StopMeasurements(int num_inferences) { Info, "NOTE: CPU cycle values and ratio calculations require FPGA and identical CPU/NPU frequency"); - // Avoid division with zero if ARM_PMU_Get_CCNTR() is not enabled properly. + // Avoid division with zero if arm_pmu_cycles() is not enabled properly. if (cycle_count == 0) { ET_LOG(Info, "Inference CPU ratio: ?.?? %%"); ET_LOG(Info, "Inference NPU ratio: ?.?? %%"); diff --git a/examples/arm/run.sh b/examples/arm/run.sh index b18115723b0..3365d1a9f21 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -366,7 +366,7 @@ for i in "${!test_model[@]}"; do exit 1 fi set -x - backends/cortex_m/test/build_test_runner.sh + backends/cortex_m/test/build_test_runner.sh --target="${target}" cortex_m_elf="${et_root_dir}/arm_test/arm_semihosting_executor_runner_corstone-300/arm_executor_runner" if [ "$build_only" = false ] ; then backends/arm/scripts/run_fvp.sh --elf="${cortex_m_elf}" --target="${target}" --bundle="${pte_file}"