diff --git a/backends/cortex_m/CMakeLists.txt b/backends/cortex_m/CMakeLists.txt
index cf3071512bc..41b54e3d4c3 100644
--- a/backends/cortex_m/CMakeLists.txt
+++ b/backends/cortex_m/CMakeLists.txt
@@ -67,6 +67,7 @@ set(_cortex_m_kernels__srcs
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_maximum.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_softmax.cpp
   ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_transpose.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp
 )
 
 # Generate C++ bindings to register kernels into Executorch
diff --git a/backends/cortex_m/ops/op_pad.cpp b/backends/cortex_m/ops/op_pad.cpp
new file mode 100644
index 00000000000..739c584c419
--- /dev/null
+++ b/backends/cortex_m/ops/op_pad.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "cortex_m_ops_common.h"
+
+extern "C" {
+#include "arm_nnfunctions.h"
+}
+
+namespace cortex_m {
+namespace native {
+
+using KernelRuntimeContext = torch::executor::KernelRuntimeContext;
+
+namespace {
+
+// CMSIS-NN's arm_pad_s8 operates on NHWC-style 4-D dims, so at most rank 4.
+constexpr size_t kMaxSupportedDims = 4;
+
+} // namespace
+
+// Out-variant pad kernel backed by CMSIS-NN's arm_pad_s8.
+// `pre_pad`/`post_pad` are 4-element arrays (right-aligned for rank < 4);
+// `pad_value` is the already-quantized int8 fill value. Only int8 (Char)
+// tensors are supported. Failures are reported via context.fail().
+Tensor& pad_out(
+    KernelRuntimeContext& context,
+    const Tensor& input,
+    const Int64ArrayRef pre_pad,
+    const Int64ArrayRef post_pad,
+    int64_t pad_value,
+    Tensor& out) {
+  if (input.scalar_type() != ScalarType::Char ||
+      out.scalar_type() != ScalarType::Char) {
+    ET_LOG(
+        Error,
+        "pad_out: only int8 tensors are supported (input=%d, out=%d)",
+        static_cast<int>(input.scalar_type()),
+        static_cast<int>(out.scalar_type()));
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  const size_t rank = input.dim();
+  if (rank == 0 || rank > kMaxSupportedDims) {
+    ET_LOG(
+        Error,
+        "pad_out: expected tensor rank in [1, %zu], got %zu",
+        kMaxSupportedDims,
+        rank);
+    context.fail(Error::InvalidArgument);
+    return out;
+  }
+
+  // Right-align the input shape into 4-D dims, e.g. (H, W) -> (1, 1, H, W).
+  const size_t offset = kMaxSupportedDims - rank;
+
+  cmsis_nn_dims input_dims = {1, 1, 1, 1};
+  // Treats the {n, h, w, c} int32_t members as a contiguous array so the
+  // trailing `rank` entries can be filled generically.
+  int32_t* d = &input_dims.n;
+  for (size_t i = 0; i < rank; ++i) {
+    d[offset + i] = static_cast<int32_t>(input.size(i));
+  }
+
+  cmsis_nn_dims cmsis_pre_pad = {
+      static_cast<int32_t>(pre_pad[0]),
+      static_cast<int32_t>(pre_pad[1]),
+      static_cast<int32_t>(pre_pad[2]),
+      static_cast<int32_t>(pre_pad[3])};
+  cmsis_nn_dims cmsis_post_pad = {
+      static_cast<int32_t>(post_pad[0]),
+      static_cast<int32_t>(post_pad[1]),
+      static_cast<int32_t>(post_pad[2]),
+      static_cast<int32_t>(post_pad[3])};
+
+  const int8_t* input_data = input.const_data_ptr<int8_t>();
+  int8_t* output_data = out.mutable_data_ptr<int8_t>();
+
+  const arm_cmsis_nn_status status = arm_pad_s8(
+      input_data,
+      output_data,
+      static_cast<int8_t>(pad_value),
+      &input_dims,
+      &cmsis_pre_pad,
+      &cmsis_post_pad);
+
+  if (status != ARM_CMSIS_NN_SUCCESS) {
+    ET_LOG(
+        Error,
+        "pad_out: arm_pad_s8 failed with status [%d]",
+        static_cast<int>(status));
+    context.fail(Error::Internal);
+    return out;
+  }
+
+  return out;
+}
+
+} // namespace native
+} // namespace cortex_m
diff --git a/backends/cortex_m/ops/operators.py b/backends/cortex_m/ops/operators.py
index 9c8d7a8771b..d02ac49348c 100644
--- a/backends/cortex_m/ops/operators.py
+++ b/backends/cortex_m/ops/operators.py
@@ -498,6 +498,46 @@ def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
     return input.permute(tuple(perm)).contiguous()
 
 
+# ===================================================================
+# PAD OPERATION DEFINITION
+# ===================================================================
+lib.define("pad(Tensor input, int[] pre_pad, int[] post_pad, int pad_value) -> Tensor")
+lib.define(
+    "pad.out(Tensor input, int[] pre_pad, int[] post_pad, int pad_value, "
+    "*, Tensor(a!) out) -> Tensor(a!)"
+)
+
+
+@register_fake("cortex_m::pad")
+def pad_meta(
+    input: torch.Tensor,
+    pre_pad: list[int],
+    post_pad: list[int],
+    pad_value: int,
+) -> torch.Tensor:
+    rank = input.dim()
+    offset = 4 - rank
+    output_shape = list(input.shape)
+    for i in range(rank):
+        output_shape[i] += pre_pad[offset + i] + post_pad[offset + i]
+    return torch.empty(output_shape, dtype=input.dtype, device=input.device)
+
+
+@impl(lib, "pad", "CompositeExplicitAutograd")
+def pad_impl(
+    input: torch.Tensor,
+    pre_pad: list[int],
+    post_pad: list[int],
+    pad_value: int,
+) -> torch.Tensor:
+    rank = input.dim()
+    offset = 4 - rank
+    padding = []
+    for i in reversed(range(rank)):
+        padding.extend([pre_pad[offset + i], post_pad[offset + i]])
+    return F.pad(input, padding, mode="constant", value=pad_value)
+
+
 # ===================================================================
 # QUANTIZED CONV2D OPERATION DEFINITION
 # ===================================================================
diff --git a/backends/cortex_m/ops/operators.yaml b/backends/cortex_m/ops/operators.yaml
index 767d221429a..4f09b30bd4e 100644
--- a/backends/cortex_m/ops/operators.yaml
+++ b/backends/cortex_m/ops/operators.yaml
@@ -59,6 +59,12 @@
   - arg_meta: null
     kernel_name: cortex_m::transpose_out
 
+- func: cortex_m::pad.out(Tensor input, int[] pre_pad, int[] post_pad, int pad_value, *, Tensor(a!) out) -> Tensor(a!)
+  variants: function
+  kernels:
+  - arg_meta: null
+    kernel_name: cortex_m::pad_out
+
 - func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
diff --git a/backends/cortex_m/passes/quantized_op_fusion_pass.py b/backends/cortex_m/passes/quantized_op_fusion_pass.py
index 19da9f89cc5..8978c60a30e 100644
--- a/backends/cortex_m/passes/quantized_op_fusion_pass.py
+++ b/backends/cortex_m/passes/quantized_op_fusion_pass.py
@@ -11,6 +11,7 @@
 import torch
 from executorch.backends.cortex_m.passes.passes_utils import (
     quantize_multiplier_aot,
+    quantize_val,
     SHIFT_INT8,
 )
 from executorch.backends.cortex_m.quantizer.quantization_configs import (
@@ -374,6 +375,39 @@
 
         return exir_ops.edge.cortex_m.quantized_avg_pool2d.default, args
 
+    def _get_pad_replacement(self, args, meta):
+        input_qparams = meta.data.get("input_qparams", {})
+        if not input_qparams:
+            return exir_ops.edge.aten.constant_pad_nd.default, args
+
+        scale = float(input_qparams[0].scale)
+        zero_point = int(input_qparams[0].zp)
+
+        padding = self._unwrap_argument(args[1])
+        pad_value_raw = self._unwrap_argument(args[2]) if len(args) > 2 else 0
+        pad_value_float = float(pad_value_raw)
+
+        quantized_pad_value = int(
+            quantize_val(pad_value_float, scale, zero_point, -128, 127)
+        )
+
+        rank = len(args[0].data.shape)
+        assert 1 <= rank <= 4, f"cortex_m pad: expected rank in [1, 4], got {rank}"
+        n_pairs = len(padding) // 2
+        assert (
+            len(padding) % 2 == 0 and n_pairs <= rank
+        ), f"cortex_m pad: invalid padding length {len(padding)} for rank {rank}"
+
+        pre_pad = [0, 0, 0, 0]
+        post_pad = [0, 0, 0, 0]
+        for i in range(n_pairs):
+            dim_4d = 3 - i
+            pre_pad[dim_4d] = int(padding[2 * i])
+            post_pad[dim_4d] = int(padding[2 * i + 1])
+
+        new_args = (args[0], pre_pad, post_pad, int(quantized_pad_value))
+        return exir_ops.edge.cortex_m.pad.default, new_args
+
     def call_operator(
         self,
         op: EdgeOpOverload,
@@ -399,6 +433,8 @@
                 op, args = self._get_permute_replacement(args, meta)
             case exir_ops.edge.aten.avg_pool2d.default:
                 op, args = self._get_avg_pool2d_replacement(args, meta)
+            case exir_ops.edge.aten.constant_pad_nd.default:
+                op, args = self._get_pad_replacement(args, meta)
             case _:
                 pass
 
diff --git a/backends/cortex_m/quantizer/quantizer.py b/backends/cortex_m/quantizer/quantizer.py
index d6f3514d0ce..2d1a837ca0e 100644
--- a/backends/cortex_m/quantizer/quantizer.py
+++ b/backends/cortex_m/quantizer/quantizer.py
@@ -256,6 +256,9 @@ class SharedQspecQuantizer(Quantizer):
         torch.ops.aten._unsafe_view.default,
         torch.ops.aten.unflatten.int,
         torch.ops.aten.flatten.using_ints,
+        # Padding
+        torch.ops.aten.pad.default,
+        torch.ops.aten.constant_pad_nd.default,
     ]
 
     def __init__(self, targets: Optional[List[OpOverload]] = None) -> None:
diff --git a/backends/cortex_m/test/ops/test_pad.py b/backends/cortex_m/test/ops/test_pad.py
new file mode 100644
index 00000000000..cbfa858f24c
--- /dev/null
+++ b/backends/cortex_m/test/ops/test_pad.py
@@ -0,0 +1,88 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+
+import torch
+import torch.nn.functional as F
+from executorch.backends.arm.test.common import parametrize
+from executorch.backends.cortex_m.test.tester import (
+    CortexMTester,
+    McuTestCase,
+    ramp_tensor,
+)
+
+OPS_BEFORE_PASSES = {
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
+    "executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default": 1,
+}
+
+OPS_AFTER_PASSES = {
+    "executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_pad_default": 1,
+    "executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
+}
+
+
+class CortexMPad(torch.nn.Module):
+    ops_before_transforms = OPS_BEFORE_PASSES
+    ops_after_transforms = OPS_AFTER_PASSES
+
+    def __init__(self, padding, value=0.0):
+        super().__init__()
+        self.padding = padding
+        self.value = value
+
+    def forward(self, x):
+        return F.pad(x, self.padding, mode="constant", value=self.value)
+
+
+test_cases = {
+    "pad_rank4_all_dims": McuTestCase(
+        CortexMPad((1, 1, 2, 2, 1, 0, 0, 1)),
+        (ramp_tensor(-0.5, 0.5, (1, 2, 3, 4)),),
+    ),
+    "pad_rank4_last_two_dims": McuTestCase(
+        CortexMPad((1, 2, 3, 4)),
+        (ramp_tensor(-1.0, 1.0, (1, 3, 4, 5)),),
+    ),
+    "pad_rank3": McuTestCase(
+        CortexMPad((1, 1, 2, 2)),
+        (ramp_tensor(-0.5, 0.5, (2, 3, 4)),),
+    ),
+    "pad_rank2": McuTestCase(
+        CortexMPad((1, 2, 3, 4)),
+        (ramp_tensor(-1.0, 1.0, (3, 5)),),
+    ),
+    "pad_rank1": McuTestCase(
+        CortexMPad((2, 3)),
+        (ramp_tensor(0.0, 1.0, (6,)),),
+    ),
+    "pad_nonzero_value": McuTestCase(
+        CortexMPad((1, 1), value=0.5),
+        (ramp_tensor(-1.0, 1.0, (2, 4)),),
+    ),
+    "pad_zero_padding": McuTestCase(
+        CortexMPad((0, 0, 0, 0)),
+        (ramp_tensor(-0.5, 0.5, (2, 3, 4, 5)),),
+    ),
+}
+
+
+@parametrize("test_case", test_cases)
+def test_dialect_pad(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_dialect(
+        test_case.model.ops_before_transforms,
+        test_case.model.ops_after_transforms,
+        qtol=0,
+    )
+
+
+@parametrize("test_case", test_cases)
+def test_implementation_pad(test_case):
+    tester = CortexMTester(test_case.model, test_case.example_inputs)
+    tester.test_implementation(qtol=0)