1 change: 1 addition & 0 deletions backends/cortex_m/CMakeLists.txt
@@ -67,6 +67,7 @@ set(_cortex_m_kernels__srcs
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_maximum.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_softmax.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_transpose.cpp
${CMAKE_CURRENT_SOURCE_DIR}/ops/op_pad.cpp
)

# Generate C++ bindings to register kernels into Executorch
98 changes: 98 additions & 0 deletions backends/cortex_m/ops/op_pad.cpp
@@ -0,0 +1,98 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#include "cortex_m_ops_common.h"

extern "C" {
#include "arm_nnfunctions.h"
}

namespace cortex_m {
namespace native {

using KernelRuntimeContext = torch::executor::KernelRuntimeContext;

namespace {

// arm_pad_s8 operates on 4-D {n, h, w, c} shapes.
constexpr size_t kMaxSupportedDims = 4;

} // namespace

Tensor& pad_out(
KernelRuntimeContext& context,
const Tensor& input,
const Int64ArrayRef pre_pad,
const Int64ArrayRef post_pad,
int64_t pad_value,
Tensor& out) {
if (input.scalar_type() != ScalarType::Char ||
out.scalar_type() != ScalarType::Char) {
ET_LOG(
Error,
"pad_out: only int8 tensors are supported (input=%d, out=%d)",
static_cast<int>(input.scalar_type()),
static_cast<int>(out.scalar_type()));
context.fail(Error::InvalidArgument);
return out;
}

const size_t rank = input.dim();
if (rank == 0 || rank > kMaxSupportedDims) {
ET_LOG(
Error,
"pad_out: expected tensor rank in [1, %zu], got %zu",
kMaxSupportedDims,
rank);
context.fail(Error::InvalidArgument);
return out;
}

// Right-align the shape into CMSIS-NN's 4-D {n, h, w, c} layout: a rank-2
// (3, 5) tensor becomes {1, 1, 3, 5}.
const size_t offset = kMaxSupportedDims - rank;

cmsis_nn_dims input_dims = {1, 1, 1, 1};
// cmsis_nn_dims stores n, h, w, c as four contiguous int32_t fields, so
// they can be indexed as an array.
int32_t* d = &input_dims.n;
for (size_t i = 0; i < rank; ++i) {
  d[offset + i] = static_cast<int32_t>(input.size(i));
}

// The AOT fusion pass emits 4-element, 4-D-aligned pad arrays (see
// _get_pad_replacement in quantized_op_fusion_pass.py).
cmsis_nn_dims cmsis_pre_pad = {
    static_cast<int32_t>(pre_pad[0]),
    static_cast<int32_t>(pre_pad[1]),
    static_cast<int32_t>(pre_pad[2]),
    static_cast<int32_t>(pre_pad[3])};
cmsis_nn_dims cmsis_post_pad = {
    static_cast<int32_t>(post_pad[0]),
    static_cast<int32_t>(post_pad[1]),
    static_cast<int32_t>(post_pad[2]),
    static_cast<int32_t>(post_pad[3])};

const int8_t* input_data = input.const_data_ptr<int8_t>();
int8_t* output_data = out.mutable_data_ptr<int8_t>();

const arm_cmsis_nn_status status = arm_pad_s8(
input_data,
output_data,
static_cast<int8_t>(pad_value),
&input_dims,
&cmsis_pre_pad,
&cmsis_post_pad);

if (status != ARM_CMSIS_NN_SUCCESS) {
ET_LOG(
Error,
"pad_out: arm_pad_s8 failed with status [%d]",
static_cast<int>(status));
context.fail(Error::Internal);
return out;
}

return out;
}

} // namespace native
} // namespace cortex_m
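
For readers checking the shape math: a minimal Python sketch of the right-alignment the kernel performs, assuming 4-element pad arrays; align_to_4d is an illustrative helper, not part of this PR.

def align_to_4d(shape, pre_pad, post_pad):
    # Right-align a rank-R shape into the 4-D {n, h, w, c} layout,
    # e.g. (3, 5) -> [1, 1, 3, 5], then apply the per-dim padding.
    offset = 4 - len(shape)
    dims = [1, 1, 1, 1]
    for i, s in enumerate(shape):
        dims[offset + i] = s
    return [dims[k] + pre_pad[k] + post_pad[k] for k in range(4)]

# Padding the last dim of a (3, 5) tensor by 1 on each side gives [1, 1, 3, 7].
assert align_to_4d((3, 5), [0, 0, 0, 1], [0, 0, 0, 1]) == [1, 1, 3, 7]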
40 changes: 40 additions & 0 deletions backends/cortex_m/ops/operators.py
@@ -498,6 +498,46 @@ def transpose_impl(input: torch.Tensor, perm) -> torch.Tensor:
return input.permute(tuple(perm)).contiguous()


# ===================================================================
# PAD OPERATION DEFINITION
# ===================================================================
lib.define("pad(Tensor input, int[] pre_pad, int[] post_pad, int pad_value) -> Tensor")
lib.define(
"pad.out(Tensor input, int[] pre_pad, int[] post_pad, int pad_value, "
"*, Tensor(a!) out) -> Tensor(a!)"
)


@register_fake("cortex_m::pad")
def pad_meta(
input: torch.Tensor,
pre_pad: list[int],
post_pad: list[int],
pad_value: int,
) -> torch.Tensor:
rank = input.dim()
# pre_pad/post_pad are 4-D aligned (length 4); right-align the input's
# dims against them.
offset = 4 - rank
output_shape = list(input.shape)
for i in range(rank):
output_shape[i] += pre_pad[offset + i] + post_pad[offset + i]
return torch.empty(output_shape, dtype=input.dtype, device=input.device)


@impl(lib, "pad", "CompositeExplicitAutograd")
def pad_impl(
input: torch.Tensor,
pre_pad: list[int],
post_pad: list[int],
pad_value: int,
) -> torch.Tensor:
rank = input.dim()
offset = 4 - rank
# F.pad takes (before, after) pairs starting from the last dimension.
padding = []
for i in reversed(range(rank)):
    padding.extend([pre_pad[offset + i], post_pad[offset + i]])
return F.pad(input, padding, mode="constant", value=pad_value)
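
A quick sanity check of the ordering above, with illustrative values: the 4-D-aligned pads must turn into F.pad's last-dim-first pairs.

import torch
import torch.nn.functional as F

x = torch.zeros(2, 3)
# 4-D-aligned pads: 2 before the second-to-last dim, 1 before/after the last.
pre_pad, post_pad = [0, 0, 2, 1], [0, 0, 0, 1]
# pad_impl reverses the dims, producing F.pad pairs [1, 1, 2, 0].
y = F.pad(x, [1, 1, 2, 0], mode="constant", value=0.0)
assert y.shape == (4, 5)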


# ===================================================================
# QUANTIZED CONV2D OPERATION DEFINITION
# ===================================================================
6 changes: 6 additions & 0 deletions backends/cortex_m/ops/operators.yaml
@@ -59,6 +59,12 @@
- arg_meta: null
kernel_name: cortex_m::transpose_out

- func: cortex_m::pad.out(Tensor input, int[] pre_pad, int[] post_pad, int pad_value, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: cortex_m::pad_out

- func: cortex_m::quantized_conv2d.out(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, int input_offset, int output_offset, Tensor requantize_multipliers, Tensor requantize_shifts, int activation_min, int activation_max, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
36 changes: 36 additions & 0 deletions backends/cortex_m/passes/quantized_op_fusion_pass.py
@@ -11,6 +11,7 @@
import torch
from executorch.backends.cortex_m.passes.passes_utils import (
quantize_multiplier_aot,
quantize_val,
SHIFT_INT8,
)
from executorch.backends.cortex_m.quantizer.quantization_configs import (
@@ -374,6 +375,39 @@ def _get_avg_pool2d_replacement(self, args, meta):

return exir_ops.edge.cortex_m.quantized_avg_pool2d.default, args

def _get_pad_replacement(self, args, meta):
input_qparams = meta.data.get("input_qparams", {})
if not input_qparams:
    # Input is not quantized; keep the generic aten pad.
    return exir_ops.edge.aten.constant_pad_nd.default, args

scale = float(input_qparams[0].scale)
zero_point = int(input_qparams[0].zp)

padding = self._unwrap_argument(args[1])
pad_value_raw = self._unwrap_argument(args[2]) if len(args) > 2 else 0
pad_value_float = float(pad_value_raw)

# Quantize the float pad value with the input's qparams so the padded
# region is exact in the int8 domain.
quantized_pad_value = int(
    quantize_val(pad_value_float, scale, zero_point, -128, 127)
)

rank = len(args[0].data.shape)
assert 1 <= rank <= 4, f"cortex_m pad: expected rank in [1, 4], got {rank}"
n_pairs = len(padding) // 2
assert (
len(padding) % 2 == 0 and n_pairs <= rank
), f"cortex_m pad: invalid padding length {len(padding)} for rank {rank}"

pre_pad = [0, 0, 0, 0]
post_pad = [0, 0, 0, 0]
for i in range(n_pairs):
    # constant_pad_nd pairs run last-dim-first; map pair i onto the
    # corresponding 4-D-aligned dimension.
    dim_4d = 3 - i
    pre_pad[dim_4d] = int(padding[2 * i])
    post_pad[dim_4d] = int(padding[2 * i + 1])

new_args = (args[0], pre_pad, post_pad, int(quantized_pad_value))
return exir_ops.edge.cortex_m.pad.default, new_args
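
A worked example of the conversion above, with illustrative values: constant_pad_nd pairs run last-dim-first, so on a rank-3 input, padding (1, 2, 3, 4) pads the last dim by (1, 2) and the second-to-last by (3, 4). Assuming quantize_val is the usual round(x / scale) + zero_point with clamping, a float pad value of 0.5 at scale 0.05 and zero point 3 quantizes to round(0.5 / 0.05) + 3 = 13.

padding = [1, 2, 3, 4]
pre_pad, post_pad = [0, 0, 0, 0], [0, 0, 0, 0]
for i in range(len(padding) // 2):
    dim_4d = 3 - i  # pair 0 -> last 4-D dim, pair 1 -> second-to-last
    pre_pad[dim_4d] = padding[2 * i]
    post_pad[dim_4d] = padding[2 * i + 1]
assert pre_pad == [0, 0, 3, 1] and post_pad == [0, 0, 4, 2]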

def call_operator(
self,
op: EdgeOpOverload,
@@ -399,6 +433,8 @@ def call_operator(
op, args = self._get_permute_replacement(args, meta)
case exir_ops.edge.aten.avg_pool2d.default:
op, args = self._get_avg_pool2d_replacement(args, meta)
case exir_ops.edge.aten.constant_pad_nd.default:
op, args = self._get_pad_replacement(args, meta)
case _:
pass

3 changes: 3 additions & 0 deletions backends/cortex_m/quantizer/quantizer.py
@@ -256,6 +256,9 @@ class SharedQspecQuantizer(Quantizer):
torch.ops.aten._unsafe_view.default,
torch.ops.aten.unflatten.int,
torch.ops.aten.flatten.using_ints,
# Padding
torch.ops.aten.pad.default,
torch.ops.aten.constant_pad_nd.default,
]

def __init__(self, targets: Optional[List[OpOverload]] = None) -> None:
88 changes: 88 additions & 0 deletions backends/cortex_m/test/ops/test_pad.py
@@ -0,0 +1,88 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.


import torch
import torch.nn.functional as F
from executorch.backends.arm.test.common import parametrize
from executorch.backends.cortex_m.test.tester import (
CortexMTester,
McuTestCase,
ramp_tensor,
)

OPS_BEFORE_PASSES = {
"executorch_exir_dialects_edge__ops_quantized_decomposed_quantize_per_tensor_default": 2,
"executorch_exir_dialects_edge__ops_quantized_decomposed_dequantize_per_tensor_default": 2,
"executorch_exir_dialects_edge__ops_aten_constant_pad_nd_default": 1,
}

OPS_AFTER_PASSES = {
"executorch_exir_dialects_edge__ops_cortex_m_quantize_per_tensor_default": 1,
"executorch_exir_dialects_edge__ops_cortex_m_pad_default": 1,
"executorch_exir_dialects_edge__ops_cortex_m_dequantize_per_tensor_default": 1,
}


class CortexMPad(torch.nn.Module):
ops_before_transforms = OPS_BEFORE_PASSES
ops_after_transforms = OPS_AFTER_PASSES

def __init__(self, padding, value=0.0):
super().__init__()
self.padding = padding
self.value = value

def forward(self, x):
return F.pad(x, self.padding, mode="constant", value=self.value)


test_cases = {
"pad_rank4_all_dims": McuTestCase(
CortexMPad((1, 1, 2, 2, 1, 0, 0, 1)),
(ramp_tensor(-0.5, 0.5, (1, 2, 3, 4)),),
),
"pad_rank4_last_two_dims": McuTestCase(
CortexMPad((1, 2, 3, 4)),
(ramp_tensor(-1.0, 1.0, (1, 3, 4, 5)),),
),
"pad_rank3": McuTestCase(
CortexMPad((1, 1, 2, 2)),
(ramp_tensor(-0.5, 0.5, (2, 3, 4)),),
),
"pad_rank2": McuTestCase(
CortexMPad((1, 2, 3, 4)),
(ramp_tensor(-1.0, 1.0, (3, 5)),),
),
"pad_rank1": McuTestCase(
CortexMPad((2, 3)),
(ramp_tensor(0.0, 1.0, (6,)),),
),
"pad_nonzero_value": McuTestCase(
CortexMPad((1, 1), value=0.5),
(ramp_tensor(-1.0, 1.0, (2, 4)),),
),
"pad_zero_padding": McuTestCase(
CortexMPad((0, 0, 0, 0)),
(ramp_tensor(-0.5, 0.5, (2, 3, 4, 5)),),
),
}


@parametrize("test_case", test_cases)
def test_dialect_pad(test_case):
tester = CortexMTester(test_case.model, test_case.example_inputs)
tester.test_dialect(
test_case.model.ops_before_transforms,
test_case.model.ops_after_transforms,
qtol=0,
)


@parametrize("test_case", test_cases)
def test_implementation_pad(test_case):
tester = CortexMTester(test_case.model, test_case.example_inputs)
tester.test_implementation(qtol=0)
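
Assuming a standard ExecuTorch development checkout with pytest installed, the cases above would typically be run with, e.g.:

python -m pytest backends/cortex_m/test/ops/test_pad.py -k pad_rank2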