
Commit 26e2ab8

psiddh and claude authored
[Pico2] Add CMSIS-NN INT8 support and latency instrumentation (#18612)
Add INT8 quantized inference via CMSIS-NN kernels for Cortex-M33 on Pico2, alongside the existing FP32 portable path. This enables a direct FP32 vs INT8 comparison for MCU deployment benchmarking.

- Add export_mlp_mnist_cmsis.py: quantized export using CortexMQuantizer
- CMakeLists.txt: USE_CMSIS_NN and USE_SELECTIVE_BUILD options for flexible linking
- build_firmware_pico.sh: --cmsis flag, fix TARGET_CPU to cortex-m33+nofp, portable nproc, auto-detect ARM toolchain, remove unused cmake flags
- main.cpp: per-digit inference timing via time_us_32()

---------

Co-authored-by: Claude <noreply@anthropic.com>
1 parent 75c677f commit 26e2ab8

4 files changed

Lines changed: 325 additions & 21 deletions


examples/raspberry_pi/pico2/CMakeLists.txt

Lines changed: 74 additions & 12 deletions
@@ -125,21 +125,83 @@ target_compile_options(
 
 set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections")
 
-set(BAREMETAL_BUILD_DIR ${EXECUTORCH_ROOT}/executorch/cmake-out/)
+set(BAREMETAL_BUILD_DIR
+    ${EXECUTORCH_ROOT}/executorch/cmake-out/
+    CACHE STRING "ExecuTorch baremetal build dir"
+)
 
-# Link ExecuTorch and Pico libraries
-target_link_libraries(
-  executorch_pico
-  PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
-          ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
-          -Wl,--whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
-          -Wl,--no-whole-archive
-          ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
-          pico_stdlib
-          pico_stdio_usb
+# CMSIS-NN support: link quantized cortex_m kernels instead of portable ops
+option(USE_CMSIS_NN "Link CMSIS-NN INT8 kernels for Cortex-M33 acceleration"
+       OFF
+)
+option(USE_SELECTIVE_BUILD "Use selective build (only link model-required ops)"
+       OFF
 )
 
+if(USE_CMSIS_NN)
+  message(STATUS "CMSIS-NN enabled: linking cortex_m_ops_lib + cmsis-nn")
+  if(USE_SELECTIVE_BUILD)
+    # CMSIS-NN model uses only cortex_m:: ops, no portable ops needed. Skip
+    # --whole-archive on portable_ops_lib to avoid pulling unused ops.
+    message(STATUS "Selective build: CMSIS-NN only (no portable ops)")
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  else()
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_ops_lib.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libcortex_m_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              ${BAREMETAL_BUILD_DIR}/lib/libcmsis-nn.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  endif()
+else()
+  if(USE_SELECTIVE_BUILD)
+    message(STATUS "Selective build: using executorch_selected_kernels")
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_selected_kernels.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  else()
+    target_link_libraries(
+      executorch_pico
+      PRIVATE ${BAREMETAL_BUILD_DIR}/lib/libexecutorch.a
+              ${BAREMETAL_BUILD_DIR}/lib/libexecutorch_core.a
+              -Wl,--whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_ops_lib.a
+              -Wl,--no-whole-archive
+              ${BAREMETAL_BUILD_DIR}/lib/libportable_kernels.a
+              pico_stdlib
+              pico_stdio_usb
+    )
+  endif()
+endif()
+
 # Only add extra outputs if the target builds successfully
 if(TARGET executorch_pico)
   pico_add_extra_outputs(executorch_pico)

examples/raspberry_pi/pico2/build_firmware_pico.sh

Lines changed: 58 additions & 9 deletions
@@ -5,7 +5,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-#!/bin/bash
 # build_firmware_pico.sh
 # Simple script to cross-compile ExecuTorch and build Pico2 firmware with optional model input
 
@@ -17,26 +16,61 @@ PICO2_DIR="${ROOT_DIR}/examples/raspberry_pi/pico2"
 BUILD_DIR="${PICO2_DIR}/build"
 EXECUTORCH_BUILD_DIR="${ROOT_DIR}/cmake-out"
 
+# Pico SDK 2.0's mbedtls requires this for CMake >= 3.30
+export CMAKE_POLICY_VERSION_MINIMUM=3.5
+
+# Portable nproc: use nproc on Linux, sysctl on macOS
+if command -v nproc &>/dev/null; then
+    NPROC=$(nproc)
+else
+    NPROC=$(sysctl -n hw.ncpu)
+fi
+
+# Source ARM toolchain if available and not already on PATH
+if ! command -v arm-none-eabi-gcc &>/dev/null; then
+    SETUP_PATH="${ROOT_DIR}/examples/arm/arm-scratch/setup_path.sh"
+    if [ -f "${SETUP_PATH}" ]; then
+        source "${SETUP_PATH}"
+    else
+        # Try to find the toolchain directly
+        TOOLCHAIN_BIN=$(find "${ROOT_DIR}/examples/arm/arm-scratch" -name "arm-none-eabi-gcc" -type f 2>/dev/null | head -1)
+        if [ -n "${TOOLCHAIN_BIN:-}" ]; then
+            export PATH="$(dirname "${TOOLCHAIN_BIN}"):${PATH}"
+        else
+            echo "Error: arm-none-eabi-gcc not found. Run: ./examples/arm/setup.sh --i-agree-to-the-contained-eula"
+            exit 1
+        fi
+    fi
+fi
+
+echo "Using ARM toolchain: $(which arm-none-eabi-gcc)"
+
 # Default model
 DEFAULT_MODEL="default_model.pte"
 
 usage() {
-    echo "Usage: $0 [--clean] [--model=path/to/model.pte]"
+    echo "Usage: $0 [--clean] [--cmsis] [--model=path/to/model.pte]"
     echo "  --clean       Clean build directories"
+    echo "  --cmsis       Build with CMSIS-NN INT8 kernels (requires cortex_m backend)"
     echo "  --model=FILE  Specify model file to embed (relative to pico2/)"
     exit 1
 }
 
 # Parse args
 MODEL_INPUT=""
 CLEAN_BUILD=0
+USE_CMSIS=0
 
 for arg in "$@"; do
     case $arg in
         --clean)
             CLEAN_BUILD=1
             shift
             ;;
+        --cmsis)
+            USE_CMSIS=1
+            shift
+            ;;
         --model=*)
             MODEL_INPUT="${arg#*=}"
             shift
@@ -68,42 +102,57 @@ if [ -n "$MODEL_INPUT" ] && [ -f "${PICO2_DIR}/${MODEL_INPUT}" ]; then
     echo "Using selective build from model: ${MODEL_ABS_PATH}"
 fi
 
+CMSIS_FLAGS=()
+if [ $USE_CMSIS -eq 1 ]; then
+    echo "CMSIS-NN mode: building with Cortex-M backend and CMSIS-NN kernels"
+    CMSIS_FLAGS=(
+        -DEXECUTORCH_BUILD_CORTEX_M=ON
+    )
+fi
+
 cmake -B "${EXECUTORCH_BUILD_DIR}" \
     -DCMAKE_TOOLCHAIN_FILE="${ROOT_DIR}/examples/arm/ethos-u-setup/arm-none-eabi-gcc.cmake" \
-    -DTARGET_CPU=cortex-m0plus \
+    -DTARGET_CPU=cortex-m33+nofp \
     -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
    -DEXECUTORCH_PAL_DEFAULT=minimal \
-    -DEXECUTORCH_DTYPE_SELECTIVE_BUILD=ON \
    -DCMAKE_BUILD_TYPE=MinSizeRel \
    -DEXECUTORCH_ENABLE_LOGGING=OFF \
-    -DEXECUTORCH_SELECT_ALL_OPS=OFF \
    -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
    -DCMAKE_INSTALL_PREFIX="${EXECUTORCH_BUILD_DIR}" \
    ${SELECT_OPS_FLAGS} \
+    ${CMSIS_FLAGS[@]+"${CMSIS_FLAGS[@]}"} \
    "${ROOT_DIR}"
 
-cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j$(nproc)
+cmake --build "${EXECUTORCH_BUILD_DIR}" --target install -j${NPROC}
 
 echo "ExecuTorch cross compile complete."
 
 # Step 2: Build firmware for Pico2 with model input
 
 cd "${PICO2_DIR}"
 
+PICO_CMAKE_FLAGS=(-DPICO_BOARD=pico2 -DCMAKE_BUILD_TYPE=Release)
+
+if [ $USE_CMSIS -eq 1 ]; then
+    PICO_CMAKE_FLAGS+=(-DUSE_CMSIS_NN=ON)
+fi
+
 if [ -n "$MODEL_INPUT" ]; then
     # Use specified model
     if [ ! -f "${MODEL_INPUT}" ]; then
         echo "Error: Specified model file '${MODEL_INPUT}' not found in pico2 directory."
         exit 1
     fi
     echo "Building firmware with model: ${MODEL_INPUT}"
-    cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${MODEL_INPUT}" -DCMAKE_BUILD_TYPE=Release
+    PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${MODEL_INPUT}")
 else
     # Use default model
     echo "Building firmware with default model: ${DEFAULT_MODEL}"
-    cmake -B "${BUILD_DIR}" -DPICO_BOARD=pico2 -DINPUT_MODEL="./${DEFAULT_MODEL}" -DCMAKE_BUILD_TYPE=Release
+    PICO_CMAKE_FLAGS+=(-DINPUT_MODEL="./${DEFAULT_MODEL}")
 fi
 
-cmake --build "${BUILD_DIR}" -j$(nproc)
+cmake -B "${BUILD_DIR}" "${PICO_CMAKE_FLAGS[@]}"
+
+cmake --build "${BUILD_DIR}" -j${NPROC}
 
 echo "Firmware build complete. Output in ${BUILD_DIR}, Binary: executorch_pico.uf2"
examples/raspberry_pi/pico2/export_mlp_mnist_cmsis.py

Lines changed: 159 additions & 0 deletions
@@ -0,0 +1,159 @@

#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""
Export the TinyMLP MNIST model with INT8 quantization for CMSIS-NN acceleration.

Uses the CortexMQuantizer to produce INT8 quantized ops that map to CMSIS-NN
kernels on Cortex-M33 (RP2350/Pico2). The model I/O stays float — quantize and
dequantize nodes are inserted inside the graph.

Usage:
    python export_mlp_mnist_cmsis.py
    python export_mlp_mnist_cmsis.py --output my_model.pte
    python export_mlp_mnist_cmsis.py --num-calibration 200
"""

import argparse
import logging
import os

import torch

from executorch.backends.cortex_m.passes.cortex_m_pass_manager import CortexMPassManager
from executorch.backends.cortex_m.quantizer.quantizer import CortexMQuantizer
from executorch.exir import EdgeCompileConfig, ExecutorchBackendConfig, to_edge
from executorch.extension.export_util.utils import save_pte_program

from export_mlp_mnist import create_balanced_model, IMAGE_SIZE, test_comprehensive
from torchao.quantization.pt2e.quantize_pt2e import convert_pt2e, prepare_pt2e

FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)


def get_calibration_data(num_samples: int = 100):
    """
    Generate calibration data for quantization.
    Mixes structured digit-like patterns and random noise so the observer
    sees a representative activation range.
    """
    calibration_data = []

    # Structured patterns that look like the digits the model will see
    for _ in range(num_samples // 2):
        x = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
        # Random vertical / horizontal strokes
        col = torch.randint(5, 23, (1,)).item()
        row = torch.randint(5, 23, (1,)).item()
        x[0, 2:26, col - 1 : col + 2] = 1.0  # vertical stroke
        x[0, row - 1 : row + 2, 5:23] = 1.0  # horizontal stroke
        calibration_data.append(x)

    # Random pixel patterns
    for _ in range(num_samples - num_samples // 2):
        x = (torch.rand(1, IMAGE_SIZE, IMAGE_SIZE) > 0.7).float()
        calibration_data.append(x)

    return calibration_data


def quantize_model(model, calibration_data):
    quantizer = CortexMQuantizer()
    example_input = calibration_data[0]

    exported = torch.export.export(model, (example_input,))
    graph_module = exported.module()

    prepared = prepare_pt2e(graph_module, quantizer)

    logger.info(f"Calibrating with {len(calibration_data)} samples...")
    with torch.no_grad():
        for i, data in enumerate(calibration_data):
            prepared(data)
            if (i + 1) % 25 == 0:
                logger.info(f"  Calibrated {i + 1}/{len(calibration_data)} samples")

    quantized = convert_pt2e(prepared)
    return quantized, example_input


def export_to_pte(quantized_model, example_input, output_path: str):
    exported_program = torch.export.export(quantized_model, (example_input,))

    edge_config = EdgeCompileConfig(
        _check_ir_validity=False,
        preserve_ops=[torch.ops.aten.linear.default],
    )
    edge_program = to_edge(exported_program, compile_config=edge_config)
    logger.info("Edge program created")

    logger.info("Applying Cortex-M optimization passes...")
    pass_manager = CortexMPassManager(edge_program.exported_program())
    transformed_ep = pass_manager.transform()

    edge_program = to_edge(transformed_ep, compile_config=edge_config)

    logger.info("Converting to ExecuTorch format...")
    exec_program = edge_program.to_executorch(
        config=ExecutorchBackendConfig(extract_delegate_segments=False)
    )

    save_pte_program(exec_program, output_path)
    file_size = os.path.getsize(output_path)
    logger.info(f"Model saved to {output_path} ({file_size / 1024:.1f} KB)")


def main():
    parser = argparse.ArgumentParser(
        description="Export TinyMLP MNIST for Cortex-M with CMSIS-NN (INT8)"
    )
    parser.add_argument(
        "--output",
        type=str,
        default="balanced_tiny_mlp_mnist_cmsis.pte",
        help="Output .pte file path",
    )
    parser.add_argument(
        "--num-calibration",
        type=int,
        default=100,
        help="Number of calibration samples for quantization",
    )
    parser.add_argument("--debug", action="store_true", help="Enable debug logging")
    args = parser.parse_args()

    if args.debug:
        logging.getLogger().setLevel(logging.DEBUG)

    logger.info("Creating balanced MLP MNIST model...")
    model = create_balanced_model()
    model.eval()

    logger.info("Testing FP32 model before quantization:")
    test_comprehensive(model)

    calibration_data = get_calibration_data(args.num_calibration)
    quantized_model, example_input = quantize_model(model, calibration_data)

    logger.info("Testing quantized model:")
    with torch.no_grad():
        test_input = torch.zeros(1, IMAGE_SIZE, IMAGE_SIZE)
        test_input[0, 2:26, 13:16] = 1.0  # digit-1-like pattern
        output = quantized_model(test_input)
        pred = output.argmax(dim=1).item()
        logger.info(f"  Digit-1 pattern -> predicted: {pred}")

    export_to_pte(quantized_model, example_input, args.output)
    logger.info("Export complete!")
    logger.info(f"Input shape: (1, {IMAGE_SIZE}, {IMAGE_SIZE})")
    logger.info("Input format: Float [0.0, 1.0] (same as FP32 variant)")


if __name__ == "__main__":
    main()
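
The fourth changed file, main.cpp, adds per-digit inference timing via time_us_32() but its diff is not reproduced on this page. Below is a minimal sketch of what such instrumentation can look like with the Pico SDK timer; the run_inference() helper and the ten-digit loop are hypothetical stand-ins for the example's actual ExecuTorch invocation, not the committed code.

// Hedged sketch, not the committed main.cpp: illustrates per-digit latency
// measurement with the Pico SDK's time_us_32() (microseconds since boot).
#include <cstdint>
#include <cstdio>

#include "pico/stdlib.h"  // brings in time_us_32() via pico/time.h

// Hypothetical helper: runs the embedded .pte on a pre-baked digit pattern
// and returns the predicted class.
extern int run_inference(int digit_index);

static void benchmark_all_digits() {
    for (int digit = 0; digit < 10; ++digit) {
        const uint32_t start_us = time_us_32();
        const int predicted = run_inference(digit);
        const uint32_t elapsed_us = time_us_32() - start_us;
        printf("digit %d -> predicted %d in %lu us\n",
               digit, predicted, static_cast<unsigned long>(elapsed_us));
    }
}

Because time_us_32() is a 32-bit microsecond counter it wraps after roughly 71 minutes, which leaves ample headroom for millisecond-scale per-inference latencies, and the unsigned subtraction stays correct across a single wrap.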
