
Commit 26e2ab8

psiddh and claude authored
[Pico2] Add CMSIS-NN INT8 support and latency instrumentation (#18612)
Add INT8 quantized inference via CMSIS-NN kernels for Cortex-M33 on Pico2, alongside the existing FP32 portable path. This enables a direct FP32 vs INT8 comparison for MCU deployment benchmarking.

- Add export_mlp_mnist_cmsis.py: quantized export using CortexMQuantizer
- CMakeLists.txt: USE_CMSIS_NN and USE_SELECTIVE_BUILD options for flexible linking
- build_firmware_pico.sh: --cmsis flag, fix TARGET_CPU to cortex-m33+nofp, portable nproc, auto-detect ARM toolchain, remove unused cmake flags
- main.cpp: per-digit inference timing via time_us_32()

---------

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 75c677f commit 26e2ab8

4 files changed

Lines changed: 325 additions & 21 deletions


examples/raspberry_pi/pico2/CMakeLists.txt

Lines changed: 74 additions & 12 deletions
@@ -125,21 +125,83 @@ target_compile_options(
 
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
 
-set(BAREMETAL_BUILD_DIR ${EXECUTORCH_ROOT}/executorch/cmake-out/)
+set(BAREMETAL_BUILD_DIR
+    ${EXECUTORCH_ROOT}/executorch/cmake-out/
+    CACHE STRING "ExecuTorch baremetal build dir"
+)
 
-# Link ExecuTorch and Pico libraries
-target_link_libraries(
-  executorch_pico
-  PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
-          ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
-          -Wl,--whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
-          -Wl,--no-whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
-          pico_stdlib
-          pico_stdio_usb
+# CMSIS-NN support: link quantized cortex_m kernels instead of portable ops
+option(USE_CMSIS_NN "Link CMSIS-NN INT8 kernels for Cortex-M33 acceleration"
+       OFF
+)
+option(USE_SELECTIVE_BUILD "Use selective build (only link model-required ops)"
+       OFF
 )
 
+if(USE_CMSIS_NN)
+  message(STATUS "CMSIS-NN enabled: linking cortex_m_ops_lib + cmsis-nn")
+  if(USE_SELECTIVE_BUILD)
+    # CMSIS-NN model uses only cortex_m:: ops, no portable ops needed. Skip
+    # --whole-archive on portable_ops_lib to avoid pulling unused ops.
+    message(STATUS "Selective build: CMSIS-NN only (no portable ops)")
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  else()
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  endif()
+else()
+  if(USE_SELECTIVE_BUILD)
+    message(STATUS "Selective build: using executorch_selected_kernels")
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_selected_kernels.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  else()
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  endif()
+endif()
+
 # Only add extra outputs if the target builds successfully
 if(TARGET executorch_pico)
   pico_add_extra_outputs(executorch_pico)

examples/raspberry_pi/pico2/build_firmware_pico.sh

Lines changed: 58 additions & 9 deletions
@@ -5,7 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-#!/bin/bash
 # build_firmware_pico.sh
 # Simple script to cross-compile ExecuTorch and build Pico2 firmware with optional model input
 
@@ -17,26 +16,61 @@ PICO2_DIR="${ROOT_DIR}/examples/raspberry_pi/pico2"
 BUILD_DIR="${PICO2_DIR}/build"
 EXECUTORCH_BUILD_DIR="${ROOT_DIR}/cmake-out"
 
+# Pico SDK 2.0's mbedtls requires this for CMake >= 3.30
+export CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Portable nproc: use nproc on Linux, sysctl on macOS
+if command -v nproc &>/dev/null; then
+    NPROC=$(nproc)
+else
+    NPROC=$(sysctl -n hw.ncpu)
+fi
+
+# Source ARM toolchain if available and not already on PATH
+if ! command -v arm-none-eabi-gcc &>/dev/null; then
+    SETUP_PATH="${ROOT_DIR}/examples/arm/arm-scratch/setup_path.sh"
+    if [ -f "${SETUP_PATH}" ]; then
+        source "${SETUP_PATH}"
+    else
+        # Try to find the toolchain directly
+        TOOLCHAIN_BIN=$(find "${ROOT_DIR}/examples/arm/arm-scratch" -name "arm-none-eabi-gcc" -type f 2>/dev/null | head -1)
+        if [ -n "${TOOLCHAIN_BIN:-}" ]; then
+            export PATH="$(dirname "${TOOLCHAIN_BIN}"):${PATH}"
+        else
+            echo "Error: arm-none-eabi-gcc not found. Run: ./examples/arm/setup.sh --i-agree-to-the-contained-eula"
+            exit 1
+        fi
+    fi
+fi
+
+echo "Using ARM toolchain: $(which arm-none-eabi-gcc)"
+
 # Default model
 DEFAULT_MODEL="default_model.pte"
 
 usage() {
-    echo "Usage: $0 [--clean] [--model=path/to/model.pte]"
+    echo "Usage: $0 [--clean] [--cmsis] [--model=path/to/model.pte]"
     echo "  --clean       Clean build directories"
+    echo "  --cmsis       Build with CMSIS-NN INT8 kernels (requires cortex_m backend)"
     echo "  --model=FILE  Specify model file to embed (relative to pico2/)"
     exit 1
 }
 
 # Parse args
 MODEL_INPUT=""
 CLEAN_BUILD=0
+USE_CMSIS=0
 
 for arg in "$@"; do
     case $arg in
         --clean)
             CLEAN_BUILD=1
             shift
             ;;
+        --cmsis)
+            USE_CMSIS=1
+            shift
+            ;;
         --model=*)
             MODEL_INPUT="${arg#*=}"
             shift
@@ -68,42 +102,57 @@ if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
     echo "Using selective build from model: ${MODEL_ABS_PATH}"
 fi
 
+CMSIS_FLAGS=()
+if [ $USE_CMSIS -eq 1 ]; then
+    echo "CMSIS-NN mode: building with Cortex-M backend and CMSIS-NN kernels"
+    CMSIS_FLAGS=(
+        -DEXECUTORCH_BUILD_CORTEX_M=ON
+    )
+fi
+
 cmake -B "${EXECUTORCH_BUILD_DIR}" \
     -DCMAKE_TOOLCHAIN_FILE="${ROOT_DIR}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \
-    -DTARGET_CPU=cortex-m0plus \
+    -DTARGET_CPU=cortex-m33+nofp \
     -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
    -DEXECUTORCH_PAL_DEFAULT=minimal \
-    -DEXECUTORCH_DTYPE_SELECTIVE_BUILD=ON \
    -DCMAKE_BUILD_TYPE=MinSizeRel \
    -DEXECUTORCH_ENABLE_LOGGING=OFF \
-    -DEXECUTORCH_SELECT_ALL_OPS=OFF \
    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
    -DCMAKE_INSTALL_PREFIX="${EXECUTORCH_BUILD_DIR}" \
    ${SELECT_OPS_FLAGS} \
+    ${CMSIS_FLAGS[@]+"${CMSIS_FLAGS[@]}"} \
    "${ROOT_DIR}"
 
-cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j$(nproc)
+cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j${NPROC}
 
 echo "ExecuTorch cross compile complete."
 
 # Step 2: Build firmware for Pico2 with model input
 
 cd "${PICO2_DIR}"
 
+PICO_CMAKE_FLAGS=(-DPICO_BOARD=pico2 -DCMAKE_BUILD_TYPE=Release)
+
+if [ $USE_CMSIS -eq 1 ]; then
+    PICO_CMAKE_FLAGS+=(-DUSE_CMSIS_NN=ON)
+fi
+
 if [ -n "$MODEL_INPUT" ]; then
     # Use specified model
     if [ ! -f "${MODEL_INPUT}" ]; then
         echo "Error: Specified model file '${MODEL_INPUT}' not found in pico2 directory."
         exit 1
     fi
     echo "Building firmware with model: ${MODEL_INPUT}"
-    cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${MODEL_INPUT}" -DCMAKE_BUILD_TYPE=Release
+    PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${MODEL_INPUT}")
 else
     # Use default model
     echo "Building firmware with default model: ${DEFAULT_MODEL}"
-    cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${DEFAULT_MODEL}" -DCMAKE_BUILD_TYPE=Release
+    PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${DEFAULT_MODEL}")
 fi
 
-cmake --build "${BUILD_DIR}" -j$(nproc)
+cmake -B "${BUILD_DIR}" "${PICO_CMAKE_FLAGS[@]}"
+
+cmake --build "${BUILD_DIR}" -j${NPROC}
 
 echo "Firmware build complete. Output in ${BUILD_DIR}, Binary: executorch_pico.uf2"
examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Export the TinyMLP MNIST model with INT8 quantization for CMSIS-NN acceleration.

Uses the CortexMQuantizer to produce INT8 quantized ops that map to CMSIS-NN
kernels on Cortex-M33 (RP2350/Pico2). The model I/O stays float — quantize and
dequantize nodes are inserted inside the graph.

Usage:
    python export_mlp_mnist_cmsis.py
    python export_mlp_mnist_cmsis.py --output my_model.pte
    python export_mlp_mnist_cmsis.py --num-calibration 200
"""

import argparse
import logging
import os

import torch

from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
from executorch.extension.export_util.utils import save_pte_program

from export_mlp_mnist import create_balanced_model, IMAGE_SIZE, test_comprehensive
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)


def get_calibration_data(num_samples: int = 100):
    """
    Generate calibration data for quantization.
    Mixes structured digit-like patterns and random noise so the observer
    sees a representative activation range.
    """
    calibration_data = []

    # Structured patterns that look like the digits the model will see
    for _ in range(num_samples // 2):
        x = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
        # Random vertical / horizontal strokes
        col = torch.randint(5, 23, (1,)).item()
        row = torch.randint(5, 23, (1,)).item()
        x[0, 2:26, col - 1 : col + 2] = 1.0  # vertical stroke
        x[0, row - 1 : row + 2, 5:23] = 1.0  # horizontal stroke
        calibration_data.append(x)

    # Random pixel patterns
    for _ in range(num_samples - num_samples // 2):
        x = (torch.rand(1, IMAGE_SIZE, IMAGE_SIZE) > 0.7).float()
        calibration_data.append(x)

    return calibration_data


def quantize_model(model, calibration_data):
    quantizer = CortexMQuantizer()
    example_input = calibration_data[0]

    exported = torch.export.export(model, (example_input,))
    graph_module = exported.module()

    prepared = prepare_pt2e(graph_module, quantizer)

    logger.info(f"Calibrating with {len(calibration_data)} samples...")
    with torch.no_grad():
        for i, data in enumerate(calibration_data):
            prepared(data)
            if (i + 1) % 25 == 0:
                logger.info(f"  Calibrated {i + 1}/{len(calibration_data)} samples")

    quantized = convert_pt2e(prepared)
    return quantized, example_input


def export_to_pte(quantized_model, example_input, output_path: str):
    exported_program = torch.export.export(quantized_model, (example_input,))

    edge_config = EdgeCompileConfig(
        _check_ir_validity=False,
        preserve_ops=[torch.ops.aten.linear.default],
    )
    edge_program = to_edge(exported_program, compile_config=edge_config)
    logger.info("Edge program created")

    logger.info("Applying Cortex-M optimization passes...")
    pass_manager = CortexMPassManager(edge_program.exported_program())
    transformed_ep = pass_manager.transform()

    edge_program = to_edge(transformed_ep, compile_config=edge_config)

    logger.info("Converting to ExecuTorch format...")
    exec_program = edge_program.to_executorch(
        config=ExecutorchBackendConfig(extract_delegate_segments=False)
    )

    save_pte_program(exec_program, output_path)
    file_size = os.path.getsize(output_path)
    logger.info(f"Model saved to {output_path} ({file_size / 1024:.1f} KB)")


def main():
    parser = argparse.ArgumentParser(
        description="Export TinyMLP MNIST for Cortex-M with CMSIS-NN (INT8)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="balanced_tiny_mlp_mnist_cmsis.pte",
        help="Output .pte file path",
    )
    parser.add_argument(
        "--num-calibration",
        type=int,
        default=100,
        help="Number of calibration samples for quantization",
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    logger.info("Creating balanced MLP MNIST model...")
    model = create_balanced_model()
    model.eval()

    logger.info("Testing FP32 model before quantization:")
    test_comprehensive(model)

    calibration_data = get_calibration_data(args.num_calibration)
    quantized_model, example_input = quantize_model(model, calibration_data)

    logger.info("Testing quantized model:")
    with torch.no_grad():
        test_input = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
        test_input[0, 2:26, 13:16] = 1.0  # digit-1-like pattern
        output = quantized_model(test_input)
        pred = output.argmax(dim=1).item()
        logger.info(f"  Digit-1 pattern -> predicted: {pred}")

    export_to_pte(quantized_model, example_input, args.output)
    logger.info("Export complete!")
    logger.info(f"Input shape: (1, {IMAGE_SIZE}, {IMAGE_SIZE})")
    logger.info("Input format: Float [0.0, 1.0] (same as FP32 variant)")


if __name__ == "__main__":
    main()
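
The fourth changed file, main.cpp, adds per-digit inference timing via time_us_32() but its diff is not reproduced on this page. Below is a minimal sketch of what such instrumentation can look like with the Pico SDK timer; the run_inference() helper and the ten-digit loop are hypothetical stand-ins for the example's actual ExecuTorch invocation, not the committed code.

// Hedged sketch, not the committed main.cpp: illustrates per-digit latency
// measurement with the Pico SDK's time_us_32() (microseconds since boot).
#include <cstdint>
#include <cstdio>

#include "pico/stdlib.h"  // brings in time_us_32() via pico/time.h

// Hypothetical helper: runs the embedded .pte on a pre-baked digit pattern
// and returns the predicted class.
extern int run_inference(int digit_index);

static void benchmark_all_digits() {
    for (int digit = 0; digit < 10; ++digit) {
        const uint32_t start_us = time_us_32();
        const int predicted = run_inference(digit);
        const uint32_t elapsed_us = time_us_32() - start_us;
        printf("digit %d -> predicted %d in %lu us\n",
               digit, predicted, static_cast<unsigned long>(elapsed_us));
    }
}

Because time_us_32() is a 32-bit microsecond counter it wraps after roughly 71 minutes, which leaves ample headroom for millisecond-scale per-inference latencies, and the unsigned subtraction stays correct across a single wrap.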
