diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index f713d0a3227..01e11068dad 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -286,12 +286,12 @@ - func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::native::im2row_out + kernel_name: impl::HiFi::im2row_out - func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null - kernel_name: impl::HiFi::native::im2row_per_tensor_out + kernel_name: impl::HiFi::im2row_per_tensor_out - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!) variants: function diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index c366cecbe0c..a7ea9f5963c 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -8,25 +8,15 @@ add_library( cadence_kernels kernels.cpp - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c - ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c ) # Let files say "include ". set(_common_include_directories diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 6a3dcd1d245..ce392be8aa3 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -18,41 +18,9 @@ using executorch::runtime::Result; /* Potential NNLIB function/APIs */ -extern "C" WORD32 xa_nn_broadcast_32_32( - WORD32* __restrict__ p_out, - const int* const out_shape, - WORD32* __restrict__ p_in, - const int* const in_shape, - int num_dims); - -extern "C" WORD32 xa_nn_concat_32_32( - WORD32* __restrict__ p_out, - const WORD32* const p_out_shape, - const WORD32** pp_inps, - const WORD32* const* pp_inps_shape, - WORD32 num_out_dims, - WORD32 num_inp, - WORD32 num_inp_dims, - WORD32 axis); - -extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape); - extern "C" void xa_nn_elm_atan2_f32(FLOAT32* z, const FLOAT32* y, const FLOAT32* x, WORD32 N); -extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32( - FLOAT32* __restrict__ p_out, - const FLOAT32* __restrict__ p_inp, - const FLOAT32* __restrict__ p_min, - const FLOAT32* __restrict__ p_max, - WORD32 num_elm); - extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -63,14 +31,6 @@ extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32( const FLOAT32* __restrict__ p_max, const WORD32* const p_max_shape); -extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape); - extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, @@ -87,22 +47,6 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( const WORD32* const p_inp2_shape, WORD32 mode); -extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32( - WORD8* __restrict__ p_out, - const FLOAT32* __restrict__ p_inp1, - const FLOAT32* __restrict__ p_inp2, - WORD32 num_elm, - WORD32 kernel_type); - -extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - WORD8* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape, - WORD32 kernel_type); - extern "C" WORD32 xa_nn_elm_fmod_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, @@ -123,42 +67,6 @@ extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool( const WORD8* __restrict__ p_inp2, WORD32 num_elm); -extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const FLOAT32* __restrict__ p_inp1, - const FLOAT32* __restrict__ p_inp2, - WORD32 num_elm); - -extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape); - -extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const FLOAT32* __restrict__ p_inp1, - const FLOAT32* __restrict__ p_inp2, - WORD32 num_elm); - -extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape); - -extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape); - extern "C" void xa_nn_elm_pow_f32( FLOAT32* __restrict__ z, const FLOAT32* __restrict__ x, @@ -179,23 +87,6 @@ extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); -extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const FLOAT32* __restrict__ p_inp1, - const FLOAT32* __restrict__ p_inp2, - const unsigned char* __restrict__ p_condition, - WORD32 num_elm); - -extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( - FLOAT32* __restrict__ p_out, - const WORD32* const p_out_shape, - const FLOAT32* __restrict__ p_inp1, - const WORD32* const p_inp1_shape, - const FLOAT32* __restrict__ p_inp2, - const WORD32* const p_inp2_shape, - const unsigned char* __restrict__ p_condition, - const WORD32* const p_condition_shape); - extern "C" WORD32 xa_nn_im2row_quantized( const WORD8* __restrict__ data_im, const WORD32 in_zero_point, @@ -229,60 +120,12 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( WORD32 num_axis_dims, void* __restrict__ p_scratch_in); -extern "C" WORD32 xa_nn_transpose_32_32( - WORD32* __restrict__ p_out, - const WORD32* const p_out_shape, - const WORD32* __restrict__ p_inp, - const WORD32* const p_inp_shape, - const WORD32* __restrict__ p_permute_vec, - WORD32 num_out_dims, - WORD32 num_inp_dims); - namespace impl { namespace HiFi { namespace kernels { void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size); -void memcpy(void* dst, const void* src, size_t num_bytes); - -WORD32 matmul_asym8uxasym8u_asym8u( - UWORD8* __restrict__ p_out, // output uint8 matrix - const UWORD8* __restrict__ p_mat1, // weight uint8 matrix - const UWORD8* __restrict__ p_vec1, // input uint8 matrix - const WORD32* __restrict__ p_bias, // bias int32 vec - WORD32 rows, // rows of p_mat1 - WORD32 cols1, // columns of p_mat1 - WORD32 row_stride1, // row stride of p_mat1 - WORD32 vec_count, // rows of p_mat2 - WORD32 vec_offset, // vec_offset of p_mat2. - WORD32 out_offset, // out_offset, i.e., offset of next output element - WORD32 out_stride, // out_stride, i.e., stride to go to next output row - WORD32 mat1_zero_bias, // zero_point of p_mat1 - WORD32 vec1_zero_bias, // zero_point of p_vec1 - const WORD32* __restrict__ out_multiplier, - const WORD32* __restrict__ out_shift, - WORD32 out_zero_bias, - bool per_channel_quantized = false); // per-channel quantized weight - -WORD32 xa_nn_matmul_asym8uxasym8u_asym8u( - UWORD8* __restrict__ p_out, - const UWORD8* __restrict__ p_mat1, - const UWORD8* __restrict__ p_mat2, - const WORD32* __restrict__ p_bias, - WORD32 rows, - WORD32 cols, - WORD32 row_stride, - WORD32 vec_count, - WORD32 vec_offset, - WORD32 out_offset, - WORD32 out_stride, - WORD32 mat1_zero_bias, - WORD32 vec1_zero_bias, - WORD32 out_multiplier, - WORD32 out_shift, - WORD32 out_zero_bias); - template T quantize(const float x, float scale, int32_t zero_point); diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index e3d5c8914a4..6485bea1007 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -155,13 +155,13 @@ Tensor& clamp_Tensor_out( inp_shape[i + off_inp] = in.size(i); } - WORD32 ret_val = xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + WORD32 ret_val = xa_nn_elm_min_4D_Bcast_f32xf32_f32( out_data, out_shape, inp_data, inp_shape, max_data, max_shape); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { - WORD32 ret_val = xa_nn_elm_minimum_f32xf32_f32( + WORD32 ret_val = xa_nn_elm_min_f32xf32_f32( out_data, inp_data, max_data, out.numel()); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); @@ -192,13 +192,13 @@ Tensor& clamp_Tensor_out( min_shape[i + off_min] = min.size(i); for (int i = 0; i < inp_dim; i++) inp_shape[i + off_inp] = in.size(i); - WORD32 ret_val = xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + WORD32 ret_val = xa_nn_elm_max_4D_Bcast_f32xf32_f32( out_data, out_shape, inp_data, inp_shape, min_data, min_shape); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { - WORD32 ret_val = xa_nn_elm_maximum_f32xf32_f32( + WORD32 ret_val = xa_nn_elm_max_f32xf32_f32( out_data, inp_data, min_data, out.numel()); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); diff --git a/backends/cadence/hifi/operators/op_eq.cpp b/backends/cadence/hifi/operators/op_eq.cpp index a76b910e379..6d75ab51a96 100644 --- a/backends/cadence/hifi/operators/op_eq.cpp +++ b/backends/cadence/hifi/operators/op_eq.cpp @@ -94,15 +94,15 @@ Tensor& eq_Tensor_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 4); + WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32( + p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_EQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { int num_elm = out.numel(); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32( - p_out, p_inp1, p_inp2, num_elm, 4); + WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32( + p_out, p_inp1, p_inp2, num_elm, COMPARE_EQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } diff --git a/backends/cadence/hifi/operators/op_ge.cpp b/backends/cadence/hifi/operators/op_ge.cpp index 5d9111b5312..bf2e2562d73 100644 --- a/backends/cadence/hifi/operators/op_ge.cpp +++ b/backends/cadence/hifi/operators/op_ge.cpp @@ -94,15 +94,15 @@ Tensor& ge_Tensor_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 0); + WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32( + p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_GREATEREQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { int num_elm = out.numel(); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32( - p_out, p_inp1, p_inp2, num_elm, 0); + WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32( + p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATEREQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } diff --git a/backends/cadence/hifi/operators/op_gt.cpp b/backends/cadence/hifi/operators/op_gt.cpp index 5995dba3bed..1e054caf8ea 100644 --- a/backends/cadence/hifi/operators/op_gt.cpp +++ b/backends/cadence/hifi/operators/op_gt.cpp @@ -96,15 +96,15 @@ Tensor& gt_Tensor_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 1); + WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32( + p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_GREATER); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { int num_elm = out.numel(); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32( - p_out, p_inp1, p_inp2, num_elm, 1); + WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32( + p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATER); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } diff --git a/backends/cadence/hifi/operators/op_le.cpp b/backends/cadence/hifi/operators/op_le.cpp index fb224b84369..fb2189c7b4c 100644 --- a/backends/cadence/hifi/operators/op_le.cpp +++ b/backends/cadence/hifi/operators/op_le.cpp @@ -95,15 +95,15 @@ Tensor& le_Tensor_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 2); + WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32( + p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_LESSEREQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { int num_elm = out.numel(); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32( - p_out, p_inp1, p_inp2, num_elm, 2); + WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32( + p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSEREQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } diff --git a/backends/cadence/hifi/operators/op_lt.cpp b/backends/cadence/hifi/operators/op_lt.cpp index bbff9cc0aee..bfab9236964 100644 --- a/backends/cadence/hifi/operators/op_lt.cpp +++ b/backends/cadence/hifi/operators/op_lt.cpp @@ -93,15 +93,15 @@ Tensor& lt_Tensor_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 3); + WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32( + p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_LESSER); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { int num_elm = out.numel(); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32( - p_out, p_inp1, p_inp2, num_elm, 3); + WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32( + p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSER); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp index 1882967f81a..e84fdcb7988 100644 --- a/backends/cadence/hifi/operators/op_maximum.cpp +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -141,10 +141,10 @@ Tensor& maximum_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + xa_nn_elm_max_4D_Bcast_f32xf32_f32( out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); } else { - xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + xa_nn_elm_max_f32xf32_f32(out_data, a_data, b_data, out.numel()); } return out; } diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp index 1f069b362fd..4385c6f6fc9 100644 --- a/backends/cadence/hifi/operators/op_minimum.cpp +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -141,10 +141,10 @@ Tensor& minimum_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + xa_nn_elm_min_4D_Bcast_f32xf32_f32( out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); } else { - xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + xa_nn_elm_min_f32xf32_f32(out_data, a_data, b_data, out.numel()); } return out; } diff --git a/backends/cadence/hifi/operators/op_ne.cpp b/backends/cadence/hifi/operators/op_ne.cpp index f183a42452a..aa782e3c0f8 100644 --- a/backends/cadence/hifi/operators/op_ne.cpp +++ b/backends/cadence/hifi/operators/op_ne.cpp @@ -95,15 +95,15 @@ Tensor& ne_Tensor_out( for (int i = 0; i < b.dim(); i++) inp2_shape[i + off_b] = b.size(i); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32( - p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 5); + WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32( + p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_NOTEQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } else { int num_elm = out.numel(); - WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32( - p_out, p_inp1, p_inp2, num_elm, 5); + WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32( + p_out, p_inp1, p_inp2, num_elm, COMPARE_NOTEQUAL); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index af06c17f50f..c45d03841a9 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -114,49 +114,22 @@ Tensor& where_self_out( for (int i = 0; i < cond.dim(); i++) con_shape[i + off_c] = cond.size(i); - if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || - con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) { - void* p_scratch = (void*)kernels::allocate_temp_memory( - ctx, - (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) * - sizeof(int)); - - ET_KERNEL_CHECK(ctx, p_scratch != nullptr, MemoryAllocationFailed, out); - - const unsigned char* p_brd_cond = (const unsigned char*)p_scratch; - xa_nn_broadcast_8_8( - (WORD8* __restrict__)p_brd_cond, - out_shape, - (const WORD8* __restrict__)con, - con_shape, - 4); - - for (int i = 0; i < 4; i++) { - con_shape[i] = out_shape[i]; - } - xa_nn_elm_where_broadcast_4D_f32xf32_f32( - out_data, - out_shape, - a_data, - inp1_shape, - b_data, - inp2_shape, - p_brd_cond, - con_shape); - - } else { - xa_nn_elm_where_broadcast_4D_f32xf32_f32( - out_data, - out_shape, - a_data, - inp1_shape, - b_data, - inp2_shape, - con, - con_shape); - } + xa_nn_elm_select_broadcast_4D_32x32_32( + (WORD32*)out_data, + out_shape, + (WORD32*)a_data, + inp1_shape, + (WORD32*)b_data, + inp2_shape, + con, + con_shape); } else { - xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); + xa_nn_elm_select_32x32_32( + (WORD32*)out_data, + (WORD32*)a_data, + (WORD32*)b_data, + con, + out.numel()); } return out; } diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c deleted file mode 100644 index cad3f1a25bb..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -/* - * xa_nn_broadcast_8_8.c - */ - -#include "xa_nnlib_common.h" -//#include "xa_nn_basic_state.h" - -#include -#include - -#include "stdio.h" - -/* - * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c - */ - -#define NUMDIMS_MAX 8 - -typedef struct bcast_expansion_struct_{ - size_t load_num_elem; - int replicate_loadedElm_times; - int repeat_operation; -} bcast_expansion_rule ; - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src); - -void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) -{ - char *dest = (char *)dest1; - char *src = (char *)src1; - int n = (int)n1; - ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; - int i; - void *orig_dest = dest; - - if (n < 32) { - return memcpy(dest, src, n); - } - - if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned - s_align_addr = (ae_int16x4 *) src; - d_align_addr = (ae_int16x4 *) dest; - for (i=0; i>3; i++) { - d_align_addr[i] = s_align_addr[i]; - } - - for (i=(n&~7); i>3; i++) { - AE_LA16X4_IP(t, s_align, s_align_addr); - AE_LA16X4_IP(t2, s_align, s_align_addr); - AE_SA16X4_IP(t, d_align, d_align_addr); - AE_SA16X4_IP(t2, d_align, d_align_addr); - } - AE_SA64POS_FP(d_align, d_align_addr); - ae_int16 *s_src = (ae_int16 *) src; - ae_int16 *s_dest = (ae_int16 *) dest; - for (i=8*i; i8, -1); - - int i = 0; - - /* Check for valid IO shapes */ - for(i=0; i=0){ - - /* Find the sub-matrix size */ - while(in_shape[dim] != 1 && dim>=0){ - num_elem_load *= out_shape[dim]; - dim--; - } - - /* Find the number of times this sub-matrix needs to be copied */ - num_copy_times = 1; - while(in_shape[dim] == 1 && dim>=0){ - num_copy_times *= out_shape[dim]; - dim--; - } - - /* Find the number of times the above copy needs to be repeated */ - num_repeat = 1; - while(in_shape[dim] != 1 && dim>=0){ - num_repeat *= 1 * out_shape[dim]; - dim--; - } - - bcast_expansion_steps[k].load_num_elem = num_elem_load; - bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; - bcast_expansion_steps[k].repeat_operation = num_repeat; - k++; - - num_elem_load = num_elem_load * num_copy_times * num_repeat; - } - - res = broadcast_node_32(bcast_expansion_steps, num_dims-1, - p_out, p_in); - (void)res; /* Unused return value */ - - return 0; -} - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src) { - int step_itr=0, rep_itr=0; - int i=0, j=0, k=0; - bcast_expansion_rule *step = NULL; - - // ignore steps that are null - while(steps[step_id].repeat_operation == 0 && step_id>0){ - step_id--; - } - - // step is now the parent node for this iteration - step = &steps[step_id]; - size_t numLoadedElm = step->load_num_elem; - - WORD32 *cp_dst = dst; - WORD32 *cp_src = src; - WORD32 *cp_src_temp=NULL; - WORD32 *cp_dst_temp=NULL; - - if(numLoadedElm>32){ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; jrepeat_operation; j++){ - for(i=0; ireplicate_loadedElm_times; i++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } - else{ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - for(k=0; k<(int)numLoadedElm; k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - } - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; j < step->repeat_operation; j++){ - for(i=0; i < step->replicate_loadedElm_times; i++){ - for(k=0; k<(int)(numLoadedElm); k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - - } - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c deleted file mode 100644 index 34a7111ee78..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -/* - * xa_nn_broadcast_32_32.c - */ - -#include "xa_nnlib_common.h" -//#include "xa_nn_basic_state.h" - -#include -#include - -#include "stdio.h" - -/* - * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c - */ - -#define NUMDIMS_MAX 8 - -typedef struct bcast_expansion_struct_{ - size_t load_num_elem; - int replicate_loadedElm_times; - int repeat_operation; -} bcast_expansion_rule ; - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src); - -void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) -{ - char *dest = (char *)dest1; - char *src = (char *)src1; - int n = (int)n1; - ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; - int i; - void *orig_dest = dest; - - if (n < 32) { - return memcpy(dest, src, n); - } - - if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned - s_align_addr = (ae_int16x4 *) src; - d_align_addr = (ae_int16x4 *) dest; - for (i=0; i>3; i++) { - d_align_addr[i] = s_align_addr[i]; - } - - for (i=(n&~7); i>3; i++) { - AE_LA16X4_IP(t, s_align, s_align_addr); - AE_LA16X4_IP(t2, s_align, s_align_addr); - AE_SA16X4_IP(t, d_align, d_align_addr); - AE_SA16X4_IP(t2, d_align, d_align_addr); - } - AE_SA64POS_FP(d_align, d_align_addr); - ae_int16 *s_src = (ae_int16 *) src; - ae_int16 *s_dest = (ae_int16 *) dest; - for (i=8*i; i8, -1); - - int i = 0; - - /* Check for valid IO shapes */ - for(i=0; i=0){ - - /* Find the sub-matrix size */ - while(in_shape[dim] != 1 && dim>=0){ - num_elem_load *= out_shape[dim]; - dim--; - } - - /* Find the number of times this sub-matrix needs to be copied */ - num_copy_times = 1; - while(in_shape[dim] == 1 && dim>=0){ - num_copy_times *= out_shape[dim]; - dim--; - } - - /* Find the number of times the above copy needs to be repeated */ - num_repeat = 1; - while(in_shape[dim] != 1 && dim>=0){ - num_repeat *= 1 * out_shape[dim]; - dim--; - } - - bcast_expansion_steps[k].load_num_elem = num_elem_load; - bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; - bcast_expansion_steps[k].repeat_operation = num_repeat; - k++; - - num_elem_load = num_elem_load * num_copy_times * num_repeat; - } - - res = broadcast_node_32(bcast_expansion_steps, num_dims-1, - p_out, p_in); - (void)res; /* Unused return value */ - - return 0; -} - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src) { - int step_itr=0, rep_itr=0; - int i=0, j=0, k=0; - bcast_expansion_rule *step = NULL; - - // ignore steps that are null - while(steps[step_id].repeat_operation == 0 && step_id>0){ - step_id--; - } - - // step is now the parent node for this iteration - step = &steps[step_id]; - size_t numLoadedElm = step->load_num_elem; - - WORD32 *cp_dst = dst; - WORD32 *cp_src = src; - WORD32 *cp_src_temp=NULL; - WORD32 *cp_dst_temp=NULL; - - if(numLoadedElm>32){ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; jrepeat_operation; j++){ - for(i=0; ireplicate_loadedElm_times; i++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } - else{ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - for(k=0; k<(int)numLoadedElm; k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - } - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; j < step->repeat_operation; j++){ - for(i=0; i < step->replicate_loadedElm_times; i++){ - for(k=0; k<(int)(numLoadedElm); k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - - } - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c deleted file mode 100644 index 3b73e30db42..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c +++ /dev/null @@ -1,195 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ - - -#include "xa_type_def.h" -#include "xa_nn_common.h" -#include "xa_nnlib_kernels_api.h" -#include "xa_nnlib_common_macros.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_common.h" - -WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out - ,const WORD32 *const p_out_shape - ,const WORD32 **pp_inps - ,const WORD32 *const *pp_inps_shape - ,WORD32 num_out_dims - ,WORD32 num_inp - ,WORD32 num_inp_dims - ,WORD32 axis) -{ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(pp_inps, -1); - XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1); - XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1); - //Validate Arguments - XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1); - XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1); - XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1); - XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1); - - int i = 0, j = 0; - for(i = 0; i < num_out_dims; i++) - { - XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1); - } - - if(axis < 0) - axis = num_out_dims + axis; - - WORD32 concat_size = 0; - for (i = 0; i < num_inp; i++) - { - XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1); - XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1); - XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1); -#pragma loop_count min=1 - for(j = 0; j < num_out_dims; j++) - { - XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1); - } - - XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1); - concat_size += pp_inps_shape[i][axis]; - } - - XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1); - - //Calculate outer and inner size for axis - WORD32 outer_size = 1; -#pragma no_simd - for(int i = 0; i < axis; i++) - { - outer_size *= p_out_shape[i]; - } - - WORD32 base_inner_size = 1; -#pragma no_simd - for(int i = axis + 1; i < num_out_dims; i++) - { - base_inner_size *= p_out_shape[i]; - } - - WORD32 *ptmp_out = p_out; - for(int i = 0; i < num_inp; i++) - { - const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size; - WORD32 *output_ptr = ptmp_out; - const WORD32* input_ptr = pp_inps[i]; - - if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0) - && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0)) - { - if(copy_size <= 8) - { - const ae_f32 *pae_inp = (const ae_f32 *)input_ptr; - for(int k = 0; k < outer_size; k++) - { - ae_f32 *pae_out = (ae_f32 *)output_ptr; -#pragma concurrent -#pragma no_simd - for(int ic = 0; ic < copy_size; ic++) - { - *pae_out++ = *pae_inp++; - } - output_ptr += concat_size * base_inner_size; - } - } - else - { - for(int k = 0; k < outer_size; k++) - { - const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; - ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; - ae_valign inp_a, out_a; - inp_a = AE_LA64_PP(pae_inp); - out_a = AE_ZALIGN64(); - for(int ic = 0; ic < (copy_size >> 1); ic++) - { - ae_int32x2 d0; - AE_LA32X2_IP(d0, inp_a, pae_inp); - AE_SA32X2_IP(d0, out_a, pae_out); - } - AE_SA64POS_FP(out_a, pae_out); - const ae_f32 *puae_inp = (const ae_f32 *)pae_inp; - ae_f32 *puae_out = (ae_f32 *)pae_out; -#pragma concurrent - for(int ic = 0; ic < (copy_size & 1); ic++) - { - puae_out[copy_size - 1] = puae_inp[copy_size - 1]; - } - input_ptr += copy_size; - output_ptr += concat_size * base_inner_size; - } - } - } - else - { - if(copy_size <= 6) - { - for(int k = 0; k < outer_size; k++) - { -#pragma concurrent -#pragma no_unroll - for(int ic = 0; ic < copy_size; ic++) - { - output_ptr[ic] = *input_ptr++; - } - output_ptr += concat_size * base_inner_size; - } - } - else - { - for(int k = 0; k < outer_size; k++) - { - const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; - ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; - ae_valign inp_a, out_a; - inp_a = AE_LA64_PP(pae_inp); - out_a = AE_ZALIGN64(); - -#pragma concurrent - for(int ic = 0; ic < copy_size >> 1; ic++) - { - ae_int32x2 d0; - AE_LA32X2_IP(d0, inp_a, pae_inp); - AE_SA32X2_IP(d0, out_a, pae_out); - } - AE_SA64POS_FP(out_a, pae_out); - - for(int ic = 0; ic < (copy_size & 1); ic++) - { - output_ptr[copy_size - 1] = input_ptr[copy_size - 1]; - } - input_ptr += copy_size; - output_ptr += concat_size * base_inner_size; - } - } - } - ptmp_out += copy_size; - } - return 0; -} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c deleted file mode 100644 index 2a18d57e99f..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c +++ /dev/null @@ -1,426 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 + inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_ADD_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_ADD_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_ADD_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 + inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_ADD_SX2(x1, x2); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_ADD_SX2(x1, x2); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_ADD_S(a0, b0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_add_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 + inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_add_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_add_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_add_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_add_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; - -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c index 3d8106eead6..db7154610d3 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c @@ -25,98 +25,21 @@ #include "xa_nnlib_err_chk.h" #include "xa_nnlib_kernels_api.h" - #if !HAVE_VFPU DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_clamp_f32xf32xf32_f32, + WORD32, xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32, ( - FLOAT32 *p_out, - const FLOAT32 *p_inp, - const FLOAT32 *p_min, - const FLOAT32 *p_max, - WORD32 num_elm + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp, + const WORD32 *const p_inp_shape, + const FLOAT32 * __restrict__ p_min, + const WORD32 *const p_min_shape, + const FLOAT32 * __restrict__ p_max, + const WORD32 *const p_max_shape ) ) #else -WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp, - const FLOAT32 * __restrict__ p_min, - const FLOAT32 * __restrict__ p_max, - WORD32 num_elm) -{ - - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp, -1); - XA_NNLIB_ARG_CHK_PTR(p_min, -1); - XA_NNLIB_ARG_CHK_PTR(p_max, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_min, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_max, sizeof(FLOAT32), -1); - /* Basic Parameter checks */ - XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); - - int i; - xtfloatx2 *inp = (xtfloatx2 *)p_inp; - xtfloatx2 *min = (xtfloatx2 *)p_min; - xtfloatx2 *max = (xtfloatx2 *)p_max; - xtfloatx2 *out = (xtfloatx2 *)p_out; - - xtfloatx2 x1, d_min, d_max, y; - - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp)&7) == 0) && ((((unsigned)p_min)&7) == 0) && ((((unsigned)p_max)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp, 2*sizeof(FLOAT32)); - XT_LSX2IP(d_min, min, 2*sizeof(FLOAT32)); - XT_LSX2IP(d_max, max, 2*sizeof(FLOAT32)); - - y = XT_MAX_SX2(x1, d_min); - y = XT_MIN_SX2(y, d_max); - - XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); - } - } - else - { - ae_valign inp_a, min_a, max_a, out_a; - - inp_a = XT_LASX2PP(inp); - min_a = XT_LASX2PP(min); - max_a = XT_LASX2PP(max); - out_a = AE_ZALIGN64(); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp_a, inp); - XT_LASX2IP(d_min, min_a, min); - XT_LASX2IP(d_max, max_a, max); - - y = XT_MAX_SX2(x1, d_min); - y = XT_MIN_SX2(y, d_max); - - XT_SASX2IP(y, out_a, out); - } - XT_SASX2POSFP(out_a, out); - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a3, a; - XT_LSIP(a1, (xtfloat *)inp, 0); - XT_LSIP(a2, (xtfloat *)min, 0); - XT_LSIP(a3, (xtfloat *)max, 0); - a = XT_MAX_S(a1, a2); - a = XT_MIN_S(a, a3); - XT_SSI(a, (xtfloat *)out, 0); - } - return 0; -} - static void internal_elm_clamp_broadcast_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out, const FLOAT32 * __restrict__ p_min, const FLOAT32 * __restrict__ p_max, @@ -794,4 +717,4 @@ WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(FLOAT32 * __restrict__ p_out } return 0; } -#endif +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c deleted file mode 100644 index 16fc23f59de..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c +++ /dev/null @@ -1,441 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ - -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -//#include "xa_nn_basic_state.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_div_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 - inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 - inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x1, x2); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x1, x2); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(a0, b0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_div_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 - inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_div_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_div_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_div_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_div_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c deleted file mode 100644 index 7d95e536c9e..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c +++ /dev/null @@ -1,845 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nnlib_err_chk.h" - -#if !HAVE_VFPU -DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_maximum_f32xf32_f32, - ( - FLOAT32 *p_out, - const FLOAT32 *p_inp1, - const FLOAT32 *p_inp2, - WORD32 num_elm - ) - ) -#else -WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm) -{ - - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - /* Basic Parameter checks */ - XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); - - int i; - xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; - xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; - xtfloatx2 *out = (xtfloatx2 *)p_out; - xtfloatx2 x1, x2, y; - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - y = XT_MAX_SX2(x2, x1); - XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - out_a = AE_ZALIGN64(); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - y = XT_MAX_SX2(x2, x1); - XT_SASX2IP(y, out_a, out); - } - XT_SASX2POSFP(out_a, out); - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - a = XT_MAX_S(a1, a2); - XT_SSI(a, (xtfloat *)out, 0); - } - return 0; -} -#endif - -#if HAVE_VFPU -static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_MAX_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_MAX_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_MAX_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } -} - -static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_maximum_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_maximum_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_maximum_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_maximum_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif - -#if !HAVE_VFPU -DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_minimum_f32xf32_f32, - ( - FLOAT32 *p_out, - const FLOAT32 *p_inp1, - const FLOAT32 *p_inp2, - WORD32 num_elm - ) - ) -#else -WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm) -{ - - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - /* Basic Parameter checks */ - XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); - - int i; - xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; - xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; - xtfloatx2 *out = (xtfloatx2 *)p_out; - xtfloatx2 x1, x2, y; - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - y = XT_MIN_SX2(x2, x1); - XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - out_a = AE_ZALIGN64(); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - y = XT_MIN_SX2(x2, x1); - XT_SASX2IP(y, out_a, out); - } - XT_SASX2POSFP(out_a, out); - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - a = XT_MIN_S(a1, a2); - XT_SSI(a, (xtfloat *)out, 0); - } - return 0; -} -#endif - -#if HAVE_VFPU -static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_MIN_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_MIN_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_MIN_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } -} - -static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_minimum_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_minimum_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_minimum_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_minimum_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} - -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c deleted file mode 100644 index e11fccbba52..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c +++ /dev/null @@ -1,359 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_MUL_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_MUL_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_MUL_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } -} - -static void internal_elm_mul_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_mul_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_mul_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_mul_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_mul_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c deleted file mode 100644 index 840a027f7a7..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c +++ /dev/null @@ -1,838 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - - -#if !HAVE_VFPU -DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_where_f32xf32_f32, - ( - FLOAT32 *p_out, - const FLOAT32 *p_inp1, - const FLOAT32 *p_inp2, - const unsigned char *__restrict__ condition, - WORD32 num_elm - ) - ) -#else -WORD32 xa_nn_elm_where_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - const unsigned char *__restrict__ p_condition, - WORD32 num_elm) -{ - - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - /* Basic Parameter checks */ - XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); - - int i; - xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; - xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; - xtfloatx2 *out = (xtfloatx2 *)p_out; - unsigned char *condition = p_condition; - xtfloatx2 x1, x2, y; - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x1, con); - XT_MOVF_SX2 (y, x2, con); - XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - out_a = AE_ZALIGN64(); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x1, con); - XT_MOVF_SX2 (y, x2, con); - XT_SASX2IP(y, out_a, out); - } - XT_SASX2POSFP(out_a, out); - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - con1 = XT_L8UI(condition, 0); - xtbool s = AE_MOVBA(con1); - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - XT_MOVT_S(a, a1, s); - XT_MOVF_S(a, a2, s); - XT_SSI(a, (xtfloat *)out, 0); - } - return 0; -} - -static void internal_elm_where_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - const unsigned char * __restrict__ p_condition, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - unsigned char *condition = p_condition; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - - /* For out = condition ? inp2 :inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - x1 = XT_LSI((xtfloat *)p_a, 0); - - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - - if((((unsigned)p_c)&7) == 0) - { - for(i=0; i> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - /* For out = condition ? inp2 :inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - condition = &p_condition[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x2, con); - XT_MOVF_SX2 (y, x1, con); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x2, con); - XT_MOVF_SX2 (y, x1, con); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, 0); - XT_LSIP(b0, (xtfloat *)p_b, 0); - con1 = XT_L8UI(condition, 0); - xtbool s = AE_MOVBA(con1); - XT_MOVT_S(c0, b0, s); - XT_MOVF_S(c0, a0, s); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For out = condition ? inp1 :inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - condition = &p_condition[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x1, con); - XT_MOVF_SX2 (y, x2, con); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x1, con); - XT_MOVF_SX2 (y, x2, con); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, 0); - XT_LSIP(b0, (xtfloat *)p_b, 0); - con1 = XT_L8UI(condition, 0); - xtbool s = AE_MOVBA(con1); - XT_MOVT_S(c0, a0, s); - XT_MOVF_S(c0, b0, s); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_where_broadcast_both_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - const unsigned char * __restrict__ p_condition, - WORD32 out_lc, - WORD32 in_lc) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - unsigned char *condition = p_condition; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - unsigned char con1, con2; - xtbool2 con = int32_rtor_xtbool2(0x00000003); - - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)p_inp1; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - condition = &p_condition[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x1, con); - XT_MOVF_SX2 (y, x2, con); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - con1 = XT_L8UI(condition, 0); - condition++; - con2 = XT_L8UI(condition, 0); - condition++; - con = AE_MOVBA1X2(con1, con2); - XT_MOVT_SX2 (y, x1, con); - XT_MOVF_SX2 (y, x2, con); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, 0); - XT_LSIP(b0, (xtfloat *)p_b, 0); - con1 = XT_L8UI(condition, 0); - xtbool s = AE_MOVBA(con1); - XT_MOVT_S(c0, a0, s); - XT_MOVF_S(c0, b0, s); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } -} - -WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * __restrict__ p_inp2, - const WORD32 *const p_inp2_shape, - const unsigned char *__restrict__ p_condition, - const WORD32 *const p_condition_shape - ) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - XA_NNLIB_ARG_CHK_PTR(p_condition, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_condition_shape, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_condition, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_condition_shape, sizeof(WORD32), -1); - - /* Check shapes */ - int i; - xtbool sign_flag; - for(i = 0; i < 4; i++) - { - if((p_inp1_shape[i] != p_inp2_shape[i]) && ((p_inp1_shape[i] != 1) && (p_inp2_shape[i] != 1))) - { - return -1; - } - } - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] == 1) - { - inp1_strides[i] = 0; - need_broadcast = 1; - } - else - { - inp1_const &= 0; - } - if(p_inp2_shape[i] == 1) - { - inp2_strides[i] = 0; - need_broadcast = 1; - } - else - { - inp2_const &= 0; - } - } - - int itr0, itr1, itr2; - FLOAT32 *p_out_tmp = p_out; - const unsigned char *__restrict p_condition_temp = p_condition; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_where_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - p_condition, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if((inp1_strides[3] == 1)&& (inp2_strides[3] == 1)) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if((inp1_strides[2] == 0) && (inp2_strides[2] == 0)) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_where_broadcast_both_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - p_condition_temp, - out_lc, - in_lc); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - p_condition_temp += in_lc * out_lc; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else - { - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_where_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - p_condition_temp, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - p_condition_temp += in_lc * out_lc; - } - - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - if((inp1_const == 1)&&(inp2_const == 1)) - { - internal_elm_where_broadcast_both_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_condition_temp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3]); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_where_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_condition_temp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - } - else - { - sign_flag = 0; - if((inp1_strides[3] == 0) && (inp2_strides[3] == 0)) - { - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_where_broadcast_both_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_condition_temp, - p_out_shape[3]); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - p_condition_temp += p_out_shape[3]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else - { - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_where_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_condition_temp, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - p_condition_temp += p_out_shape[3]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - } - return 0; -} - -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c deleted file mode 100644 index 792b152e1fa..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c +++ /dev/null @@ -1,2028 +0,0 @@ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - - -#if !HAVE_VFPU -DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_greater_lesser_equal_f32xf32_f32, - ( - WORD8 *y, - const FLOAT32 *x1, - const FLOAT32 *x2, - WORD32 N, - WORD32 kernel_type - ) - ) -#else -WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - WORD32 kernel_type) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - /* Basic Parameter checks */ - XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); - - int i; - xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; - xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; - //xtfloatx2 *out = (xtfloatx2 *)p_out; - UWORD8 *out = p_out; - xtfloatx2 x1, x2, y; - xtbool check; - - xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); - - if(kernel_type == 0) - { - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLE_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLE_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - - a = XT_SUB_S(a2, a1); - - check = 0; - if(a <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *out++ = store; - } - } - else if(kernel_type == 1) - { - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLT_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLT_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - - a = XT_SUB_S(a2, a1); - - check = 0; - if(a < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *out++ = store; - } - } - else if(kernel_type == 2) - { - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLE_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLE_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - - a = XT_SUB_S(a1, a2); - - check = 0; - if(a <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *out++ = store; - } - } - else if(kernel_type == 3) - { - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLT_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLT_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - - a = XT_SUB_S(a1, a2); - - check = 0; - if(a < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *out++ = store; - } - } - else if(kernel_type == 4) - { - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *out++ = store1; - - uint8_t store0 = val & 0x1; - *out++ = store0; - } - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - - //a = XT_SUB_S(a2, a1); - - check = 0; - if(a1 == a2) - check = 1; - - uint8_t store = AE_MOVAB(check); - *out++ = store; - } - } - else if(kernel_type == 5) - { - ae_int32x2 ones = AE_MOVDA32(1); - if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) - { - for(i=0;i < num_elm>>1;i++) - { - XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - ae_int32x2 store = AE_ZERO32(); - AE_MOVF32X2(store, ones, check); - - *out++ = AE_MOVAD32_H(store); - *out++ = AE_MOVAD32_L(store); - } - } - else - { - ae_valign inp1_a, inp2_a, out_a; - - inp1_a = XT_LASX2PP(inp1); - inp2_a = XT_LASX2PP(inp2); - /* Each iteration of loop is independent so safe to use concurrent pragma */ -#pragma concurrent - for(i=0;i < num_elm>>1;i++) - { - XT_LASX2IP(x1, inp1_a, inp1); - XT_LASX2IP(x2, inp2_a, inp2); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - ae_int32x2 store = AE_ZERO32(); - AE_MOVF32X2(store, ones, check); - - *out++ = AE_MOVAD32_H(store); - *out++ = AE_MOVAD32_L(store); - } - } - // Remainder Loop - if (num_elm & 1) - { - xtfloat a1, a2, a; - XT_LSIP(a1, (xtfloat *)inp1, 0); - XT_LSIP(a2, (xtfloat *)inp2, 0); - - a = XT_SUB_S(a2, a1); - - check = 0; - if(a != 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *out++ = store; - } - } - - return 0; -} -#endif - -#if HAVE_VFPU -static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag, - WORD32 kernel_type) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - - xtbool check; - - xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 - inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc]; - - if(kernel_type == 0) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLE_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLE_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(a0, b0); - - check = 0; - - if(c0 <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 1) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLT_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLT_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(a0, b0); - - check = 0; - - if(c0 < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 2) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLE_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLE_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(b0, a0); - - check = 0; - - if(c0 <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 3) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLT_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLT_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(b0, a0); - - check = 0; - - if(c0 < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 4) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - //c0 = XT_SUB_S(a0, b0); - - check = 0; - - if(a0 == b0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 5) - { - ae_int32x2 ones = AE_MOVDA32(1); - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - ae_int32x2 store = AE_ZERO32(); - AE_MOVF32X2(store, ones, check); - - *p_c++ = AE_MOVAD32_H(store); - *p_c++ = AE_MOVAD32_L(store); - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - ae_int32x2 store = AE_ZERO32(); - AE_MOVF32X2(store, ones, check); - - *p_c++ = AE_MOVAD32_H(store); - *p_c++ = AE_MOVAD32_L(store); - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(a0, b0); - - check = 0; - - if(c0 != 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - } - } - /* For computing inp1 - inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc]; - - if(kernel_type == 0) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLE_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLE_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(b0, a0); - - check = 0; - - if(c0 <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if (kernel_type == 1) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLT_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = XT_OLT_SX2(x2, x1); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(b0, a0); - - check = 0; - - if(c0 < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 2) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLE_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLE_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(a0, b0); - - check = 0; - - if(c0 <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 3) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLT_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x1, x2); - xtbool2 check = XT_OLT_SX2(x1, x2); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(a0, b0); - - check = 0; - - if(c0 < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 4) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - uint8_t val = AE_MOVAB2(check); - - uint8_t store1 = (val >> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - //c0 = XT_SUB_S(b0, a0); - - check = 0; - - if(a0 == b0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 5) - { - ae_int32x2 ones = AE_MOVDA32(1); - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32)); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - ae_int32x2 store = AE_ZERO32(); - AE_MOVF32X2(store, ones, check); - - *p_c++ = AE_MOVAD32_H(store); - *p_c++ = AE_MOVAD32_L(store); - } - } - else - { - ae_valign vinp1, vinp2; - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - - //y = XT_SUB_SX2(x2, x1); - xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2)); - - ae_int32x2 store = AE_ZERO32(); - AE_MOVF32X2(store, ones, check); - - *p_c++ = AE_MOVAD32_H(store); - *p_c++ = AE_MOVAD32_L(store); - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_SUB_S(b0, a0); - - check = 0; - - if(c0 != 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - } - } -} - -static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag, - WORD32 kernel_type) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - - xtbool check; - - UWORD8 * p_c = p_out; - xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32()); - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 - inp1 */ - if(sign_flag){ - if(kernel_type == 0) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(a0_7, x2); - - check = 0; - - if(out <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 1) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(a0_7, x2); - - check = 0; - - if(out < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 2) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(x2, a0_7); - - check = 0; - - if(out <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 3) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(x2, a0_7); - - check = 0; - - if(out < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 4) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(a0_7, x2); - - check = 0; - - if(out == 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 5) - { - ae_int32x2 ones = AE_MOVDA32(1); - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(x2, a0_7); - - check = 0; - - if(out <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 1) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(x2, a0_7); - - check = 0; - - if(out < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 2) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(a0_7, x2); - - check = 0; - - if(out <= 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 3) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(a0_7, x2); - - check = 0; - - if(out < 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - else if(kernel_type == 4) - { - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - else - { - ae_valign inp1_a, out_a; - inp1_a = XT_LASX2PP(p_a); - - for(i=0; i> 1) & 0x1; - *p_c++ = store1; - - uint8_t store0 = val & 0x1; - *p_c++ = store0; - } - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); - out = XT_SUB_S(x2, a0_7); - - check = 0; - - if(out == 0) - check = 1; - - uint8_t store = AE_MOVAB(check); - *p_c++ = store; - } - } - } -} -#endif - -#if !HAVE_VFPU -DISCARD_FUN_FOR_NONVOID_RETURN( - WORD32, xa_nn_elm_greaterequal_broadcast_4D_f32xf32_f32, - ( - WORD8 * p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * p_inp2, - const WORD32 *const p_inp2_shape, - WORD32 kernel_type - ) - ) -#else -WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(WORD8 * __restrict__ p_out, - const WORD32 *const p_out_shape, - const FLOAT32 * __restrict__ p_inp1, - const WORD32 *const p_inp1_shape, - const FLOAT32 * __restrict__ p_inp2, - const WORD32 *const p_inp2_shape, - WORD32 kernel_type) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1); - - /* Check shapes */ - int i; - xtbool sign_flag; - for(i = 0; i < 4; i++) - { - if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) || - (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - UWORD8 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag, - kernel_type); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag, - kernel_type); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_greater_lesser_equal_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag, - kernel_type); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_greater_lesser_equal_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag, - kernel_type); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c deleted file mode 100644 index 5b3ed385568..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c +++ /dev/null @@ -1,241 +0,0 @@ -#include "xa_nnlib_common.h" -#include "stdio.h" -/* - * Currently only supports upto 5D input tensors. - * 1/2/3/4 D input tensors will be scaled up to 5D. - * For example, 2x3 -> 1x1x1x2x3. - */ - -WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out - ,const WORD32 *const p_out_shape - ,const WORD32 * __restrict__ p_inp - ,const WORD32 *const p_inp_shape - ,const WORD32 * __restrict__ p_permute_vec - ,WORD32 num_out_dims - ,WORD32 num_inp_dims) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp, -1); - XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); - - /* Invalid input checks */ - XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1); - XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1); - - int itr = 0; - for(itr=0; itr < num_inp_dims; itr++) - { - XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1); - } - for(itr=0; itr < num_out_dims; itr++) - { - XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1); - } - - - /* Output shape provided must be correct based on input - * shape and permute values */ - for(itr=0; itr < num_out_dims; itr++) - { - int output_dim = p_out_shape[itr]; - int expected_dim = p_inp_shape[p_permute_vec[itr]]; - XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1); - } - - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); - - /* Shift all dim with 1 in the outer part */ - int eff_output_shape[5]; - int eff_permute_vec[5]; - - for(int i = 0; i < num_out_dims; i++) - { - eff_output_shape[i] = p_out_shape[i]; - eff_permute_vec[i] = p_permute_vec[i]; - } - - int one_i=num_out_dims-1, non_one_i=num_out_dims-1; - while(one_i > 0 && non_one_i >=0){ - while(one_i > 0 && eff_output_shape[one_i]!=1){ - one_i--; - } - non_one_i = one_i; - while(non_one_i >= 0 && eff_output_shape[non_one_i]==1) - { - non_one_i--; - } - if(one_i > 0 && non_one_i >=0){ - int temp; - /*swap output_shape*/ - { - temp = eff_output_shape[one_i]; - eff_output_shape[one_i] = eff_output_shape[non_one_i]; - eff_output_shape[non_one_i] = temp; - } - /*swap permute_vec*/ - { - temp = eff_permute_vec[one_i]; - eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; - eff_permute_vec[non_one_i] = temp; - } - - } - } - - /* Promoting lesser dim tensors to 5D tensors. - * Also updating the permute_vec and shapes as needed for optimization */ - int p_5D_inp_shape[5] = {1, 1, 1, 1, 1}; - int p_5D_out_shape[5] = {1, 1, 1, 1, 1}; - int p_5D_permute_vec[5] = {0, 1, 2, 3, 4}; - - /* Check if any inner inp dimension is same in the output */ - int last_dim_same = 1, last_n_same_dim = 0; - itr = num_inp_dims - 1; - while(itr >= 0) - { - last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; - last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0; - itr--; - } - - int dims_added = 5 - num_inp_dims; - itr = num_inp_dims - 1; - int same_count = last_n_same_dim; - int count = 4; - while(itr >= 0) - { - p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr]; - p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr]; - same_count--; - itr--; - count = (same_count > 0) ? count : count - 1; - } - - itr = num_inp_dims - 1; - same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; - count = 4; - while(itr >= 0) - { - p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; - same_count--; - itr--; - count--; - } - - int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; - int inp_dim1, inp_dim2, inp_dim3, inp_dim4; - int inp_stride[5]; - - out_dim0 = p_5D_out_shape[0]; - out_dim1 = p_5D_out_shape[1]; - out_dim2 = p_5D_out_shape[2]; - out_dim3 = p_5D_out_shape[3]; - out_dim4 = p_5D_out_shape[4]; - - inp_dim1 = p_5D_inp_shape[1]; - inp_dim2 = p_5D_inp_shape[2]; - inp_dim3 = p_5D_inp_shape[3]; - inp_dim4 = p_5D_inp_shape[4]; - - inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4; - inp_stride[1] = inp_dim2*inp_dim3*inp_dim4; - inp_stride[2] = inp_dim3*inp_dim4; - inp_stride[3] = inp_dim4; - inp_stride[4] = 1; - - if(last_n_same_dim) - { - int itr0, itr1, itr2, itr3, itr4; - WORD32 *p_inp0 = (WORD32 *)p_inp; - for(itr0 = 0; itr0 < out_dim0; itr0++) - { - WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); -#pragma loop_count min=1 - for(itr1 = 0; itr1 < out_dim1; itr1++) - { - WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); -#pragma loop_count min=1 - for(itr2 = 0; itr2 < out_dim2; itr2++) - { - WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); -#pragma loop_count min=1 - for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) - { - WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); - ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); - ae_valign a_inp = AE_LA64_PP(pae_i); - ae_valign a_out = AE_ZALIGN64(); - ae_int32x2 d0; - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - AE_LA32X2_IP(d0, a_inp, pae_i); - AE_SA32X2_IP(d0, a_out, pae_o); - } - AE_SA64POS_FP(a_out, pae_o); - ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); - ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); -#pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - puae_o[itr4] = puae_i[itr4]; - } - } - } - } - } - } - else - { - int itr0, itr1, itr2, itr3, itr4; - WORD32 *p_inp0 = (WORD32 *)p_inp; - for(itr0 = 0; itr0 < out_dim0; itr0++) - { - WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); - for(itr1 = 0; itr1 < out_dim1; itr1++) - { - WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); - for(itr2 = 0; itr2 < out_dim2; itr2++) - { - WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); - for(itr3 = 0; itr3 < out_dim3; itr3++) - { - WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - - ae_valign a_out = AE_ZALIGN64(); - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - ae_int32x2 d0, d1; - ae_int32x2 tmp0; - - d0 = AE_L32_X((ae_int32 *)p_inp4, 0); - p_inp4 += inp_stride[p_5D_permute_vec[4]]; - d1 = AE_L32_X((ae_int32 *)p_inp4, 0); - p_inp4 += inp_stride[p_5D_permute_vec[4]]; - - tmp0 = AE_SEL32_HH(d0, d1); - - AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out); - } - AE_SA64POS_FP(a_out, p_out); -#pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - *p_out++ = *p_inp4; - } - } - } - } - } - } - - return 0; -}