4 changes: 2 additions & 2 deletions backends/cadence/aot/functions_hifi.yaml
@@ -286,12 +286,12 @@
- func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
-kernel_name: impl::HiFi::native::im2row_out
+kernel_name: impl::HiFi::im2row_out

- func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
kernels:
- arg_meta: null
-kernel_name: impl::HiFi::native::im2row_per_tensor_out
+kernel_name: impl::HiFi::im2row_per_tensor_out

- func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
variants: function
12 changes: 1 addition & 11 deletions backends/cadence/hifi/kernels/CMakeLists.txt
@@ -8,25 +8,15 @@
add_library(
cadence_kernels
kernels.cpp
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c
-${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
+${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
)
# Let files say "include <executorch/path/to/header.h>".
set(_common_include_directories
157 changes: 0 additions & 157 deletions backends/cadence/hifi/kernels/kernels.h
@@ -18,41 +18,9 @@ using executorch::runtime::Result;

/* Potential NNLIB function/APIs */

extern "C" WORD32 xa_nn_broadcast_32_32(
WORD32* __restrict__ p_out,
const int* const out_shape,
WORD32* __restrict__ p_in,
const int* const in_shape,
int num_dims);

extern "C" WORD32 xa_nn_concat_32_32(
WORD32* __restrict__ p_out,
const WORD32* const p_out_shape,
const WORD32** pp_inps,
const WORD32* const* pp_inps_shape,
WORD32 num_out_dims,
WORD32 num_inp,
WORD32 num_inp_dims,
WORD32 axis);

extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" void
xa_nn_elm_atan2_f32(FLOAT32* z, const FLOAT32* y, const FLOAT32* x, WORD32 N);

extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp,
const FLOAT32* __restrict__ p_min,
const FLOAT32* __restrict__ p_max,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
@@ -63,14 +31,6 @@ extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(
const FLOAT32* __restrict__ p_max,
const WORD32* const p_max_shape);

extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
@@ -87,22 +47,6 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
const WORD32* const p_inp2_shape,
WORD32 mode);

extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(
WORD8* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
WORD32 num_elm,
WORD32 kernel_type);

extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
WORD8* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
WORD32 kernel_type);

extern "C" WORD32 xa_nn_elm_fmod_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
@@ -123,42 +67,6 @@ extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool(
const WORD8* __restrict__ p_inp2,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" void xa_nn_elm_pow_f32(
FLOAT32* __restrict__ z,
const FLOAT32* __restrict__ x,
@@ -179,23 +87,6 @@ extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape);

extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const FLOAT32* __restrict__ p_inp1,
const FLOAT32* __restrict__ p_inp2,
const unsigned char* __restrict__ p_condition,
WORD32 num_elm);

extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
FLOAT32* __restrict__ p_out,
const WORD32* const p_out_shape,
const FLOAT32* __restrict__ p_inp1,
const WORD32* const p_inp1_shape,
const FLOAT32* __restrict__ p_inp2,
const WORD32* const p_inp2_shape,
const unsigned char* __restrict__ p_condition,
const WORD32* const p_condition_shape);

extern "C" WORD32 xa_nn_im2row_quantized(
const WORD8* __restrict__ data_im,
const WORD32 in_zero_point,
@@ -229,60 +120,12 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
WORD32 num_axis_dims,
void* __restrict__ p_scratch_in);

extern "C" WORD32 xa_nn_transpose_32_32(
WORD32* __restrict__ p_out,
const WORD32* const p_out_shape,
const WORD32* __restrict__ p_inp,
const WORD32* const p_inp_shape,
const WORD32* __restrict__ p_permute_vec,
WORD32 num_out_dims,
WORD32 num_inp_dims);

namespace impl {
namespace HiFi {
namespace kernels {

void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size);

void memcpy(void* dst, const void* src, size_t num_bytes);

-WORD32 matmul_asym8uxasym8u_asym8u(
-UWORD8* __restrict__ p_out, // output uint8 matrix
-const UWORD8* __restrict__ p_mat1, // weight uint8 matrix
-const UWORD8* __restrict__ p_vec1, // input uint8 matrix
-const WORD32* __restrict__ p_bias, // bias int32 vec
-WORD32 rows, // rows of p_mat1
-WORD32 cols1, // columns of p_mat1
-WORD32 row_stride1, // row stride of p_mat1
-WORD32 vec_count, // rows of p_mat2
-WORD32 vec_offset, // vec_offset of p_mat2.
-WORD32 out_offset, // out_offset, i.e., offset of next output element
-WORD32 out_stride, // out_stride, i.e., stride to go to next output row
-WORD32 mat1_zero_bias, // zero_point of p_mat1
-WORD32 vec1_zero_bias, // zero_point of p_vec1
-const WORD32* __restrict__ out_multiplier,
-const WORD32* __restrict__ out_shift,
-WORD32 out_zero_bias,
-bool per_channel_quantized = false); // per-channel quantized weight

-WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
-UWORD8* __restrict__ p_out,
-const UWORD8* __restrict__ p_mat1,
-const UWORD8* __restrict__ p_mat2,
-const WORD32* __restrict__ p_bias,
-WORD32 rows,
-WORD32 cols,
-WORD32 row_stride,
-WORD32 vec_count,
-WORD32 vec_offset,
-WORD32 out_offset,
-WORD32 out_stride,
-WORD32 mat1_zero_bias,
-WORD32 vec1_zero_bias,
-WORD32 out_multiplier,
-WORD32 out_shift,
-WORD32 out_zero_bias);

template <typename T>
T quantize(const float x, float scale, int32_t zero_point);

8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/op_clamp.cpp
@@ -155,13 +155,13 @@ Tensor& clamp_Tensor_out(
inp_shape[i + off_inp] = in.size(i);
}

-WORD32 ret_val = xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
+WORD32 ret_val = xa_nn_elm_min_4D_Bcast_f32xf32_f32(
out_data, out_shape, inp_data, inp_shape, max_data, max_shape);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);

} else {
-WORD32 ret_val = xa_nn_elm_minimum_f32xf32_f32(
+WORD32 ret_val = xa_nn_elm_min_f32xf32_f32(
out_data, inp_data, max_data, out.numel());

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
@@ -192,13 +192,13 @@ Tensor& clamp_Tensor_out(
min_shape[i + off_min] = min.size(i);
for (int i = 0; i < inp_dim; i++)
inp_shape[i + off_inp] = in.size(i);
-WORD32 ret_val = xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
+WORD32 ret_val = xa_nn_elm_max_4D_Bcast_f32xf32_f32(
out_data, out_shape, inp_data, inp_shape, min_data, min_shape);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);

} else {
-WORD32 ret_val = xa_nn_elm_maximum_f32xf32_f32(
+WORD32 ret_val = xa_nn_elm_max_f32xf32_f32(
out_data, inp_data, min_data, out.numel());

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/op_eq.cpp
@@ -94,15 +94,15 @@ Tensor& eq_Tensor_out(
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 4);
+WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_EQUAL);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
} else {
int num_elm = out.numel();

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-p_out, p_inp1, p_inp2, num_elm, 4);
+WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+p_out, p_inp1, p_inp2, num_elm, COMPARE_EQUAL);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
}
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/op_ge.cpp
@@ -94,15 +94,15 @@ Tensor& ge_Tensor_out(
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 0);
+WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_GREATEREQUAL);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
} else {
int num_elm = out.numel();

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-p_out, p_inp1, p_inp2, num_elm, 0);
+WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATEREQUAL);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
}
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/op_gt.cpp
@@ -96,15 +96,15 @@ Tensor& gt_Tensor_out(
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 1);
+WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_GREATER);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
} else {
int num_elm = out.numel();

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-p_out, p_inp1, p_inp2, num_elm, 1);
+WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATER);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
}
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/op_le.cpp
@@ -95,15 +95,15 @@ Tensor& le_Tensor_out(
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 2);
+WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_LESSEREQUAL);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
} else {
int num_elm = out.numel();

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-p_out, p_inp1, p_inp2, num_elm, 2);
+WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSEREQUAL);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
}
8 changes: 4 additions & 4 deletions backends/cadence/hifi/operators/op_lt.cpp
@@ -93,15 +93,15 @@ Tensor& lt_Tensor_out(
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 3);
+WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_LESSER);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
} else {
int num_elm = out.numel();

-WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-p_out, p_inp1, p_inp2, num_elm, 3);
+WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSER);

ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
}
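For context, the five comparison ops above (eq, ge, gt, le, lt) now funnel into a single nnlib compare kernel selected by a mode argument instead of the old magic numbers. Below is a minimal reference model of that kernel, a sketch rather than nnlib's implementation: the prototype is inferred from the call sites and the removed xa_nn_elm_greater_lesser_equal_f32xf32_f32 declaration, and the COMPARE_* values shown simply mirror the magic numbers the old calls passed (ge=0, gt=1, le=2, lt=3, eq=4); the real constants and optimized kernels come from the updated nnlib headers.

#include <cstdio>

// Assumed stand-ins for nnlib's typedefs (normally from its type headers).
typedef signed char WORD8;
typedef int WORD32;
typedef float FLOAT32;

// Mode names follow the COMPARE_* identifiers used in the diff; the numeric
// values are placeholders mirroring the old magic numbers and may differ
// from nnlib's actual definitions.
enum CompareMode : WORD32 {
  COMPARE_GREATEREQUAL = 0,
  COMPARE_GREATER = 1,
  COMPARE_LESSEREQUAL = 2,
  COMPARE_LESSER = 3,
  COMPARE_EQUAL = 4,
};

// Reference semantics of the non-broadcast compare kernel: writes one 0/1
// byte per element and returns 0 on success (cf. ET_KERNEL_CHECK above).
WORD32 compare_f32_reference(
    WORD8* p_out,
    const FLOAT32* p_inp1,
    const FLOAT32* p_inp2,
    WORD32 num_elm,
    WORD32 kernel_type) {
  for (WORD32 i = 0; i < num_elm; ++i) {
    bool r = false;
    switch (kernel_type) {
      case COMPARE_GREATEREQUAL: r = p_inp1[i] >= p_inp2[i]; break;
      case COMPARE_GREATER:      r = p_inp1[i] >  p_inp2[i]; break;
      case COMPARE_LESSEREQUAL:  r = p_inp1[i] <= p_inp2[i]; break;
      case COMPARE_LESSER:       r = p_inp1[i] <  p_inp2[i]; break;
      case COMPARE_EQUAL:        r = p_inp1[i] == p_inp2[i]; break;
      default: return -1;  // unsupported mode
    }
    p_out[i] = r ? 1 : 0;
  }
  return 0;
}

int main() {
  // Mirrors the non-broadcast branch of eq_Tensor_out above.
  const FLOAT32 a[4] = {1.f, 2.f, 3.f, 4.f};
  const FLOAT32 b[4] = {1.f, 5.f, 3.f, 0.f};
  WORD8 out[4];
  WORD32 ret = compare_f32_reference(out, a, b, 4, COMPARE_EQUAL);
  std::printf("ret=%d out=%d %d %d %d\n", ret, out[0], out[1], out[2], out[3]);
  // Expected: ret=0 out=1 0 1 0
  return 0;
}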
4 changes: 2 additions & 2 deletions backends/cadence/hifi/operators/op_maximum.cpp
@@ -141,10 +141,10 @@ Tensor& maximum_out(
for (int i = 0; i < b.dim(); i++)
inp2_shape[i + off_b] = b.size(i);

-xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
+xa_nn_elm_max_4D_Bcast_f32xf32_f32(
out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
} else {
-xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel());
+xa_nn_elm_max_f32xf32_f32(out_data, a_data, b_data, out.numel());
}
return out;
}