diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 6a3dcd1d245..228355882a4 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -18,23 +18,6 @@ using executorch::runtime::Result; /* Potential NNLIB function/APIs */ -extern "C" WORD32 xa_nn_broadcast_32_32( - WORD32* __restrict__ p_out, - const int* const out_shape, - WORD32* __restrict__ p_in, - const int* const in_shape, - int num_dims); - -extern "C" WORD32 xa_nn_concat_32_32( - WORD32* __restrict__ p_out, - const WORD32* const p_out_shape, - const WORD32** pp_inps, - const WORD32* const* pp_inps_shape, - WORD32 num_out_dims, - WORD32 num_inp, - WORD32 num_inp_dims, - WORD32 axis); - extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c deleted file mode 100644 index cad3f1a25bb..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -/* - * xa_nn_broadcast_8_8.c - */ - -#include "xa_nnlib_common.h" -//#include "xa_nn_basic_state.h" - -#include -#include - -#include "stdio.h" - -/* - * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c - */ - -#define NUMDIMS_MAX 8 - -typedef struct bcast_expansion_struct_{ - size_t load_num_elem; - int replicate_loadedElm_times; - int repeat_operation; -} bcast_expansion_rule ; - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src); - -void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) -{ - char *dest = (char *)dest1; - char *src = (char *)src1; - int n = (int)n1; - ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; - int i; - void *orig_dest = dest; - - if (n < 32) { - return memcpy(dest, src, n); - } - - if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned - s_align_addr = (ae_int16x4 *) src; - d_align_addr = (ae_int16x4 *) dest; - for (i=0; i>3; i++) { - d_align_addr[i] = s_align_addr[i]; - } - - for (i=(n&~7); i>3; i++) { - AE_LA16X4_IP(t, s_align, s_align_addr); - AE_LA16X4_IP(t2, s_align, s_align_addr); - AE_SA16X4_IP(t, d_align, d_align_addr); - AE_SA16X4_IP(t2, d_align, d_align_addr); - } - AE_SA64POS_FP(d_align, d_align_addr); - ae_int16 *s_src = (ae_int16 *) src; - ae_int16 *s_dest = (ae_int16 *) dest; - for (i=8*i; i8, -1); - - int i = 0; - - /* Check for valid IO shapes */ - for(i=0; i=0){ - - /* Find the sub-matrix size */ - while(in_shape[dim] != 1 && dim>=0){ - num_elem_load *= out_shape[dim]; - dim--; - } - - /* Find the number of times this sub-matrix needs to be copied */ - num_copy_times = 1; - while(in_shape[dim] == 1 && dim>=0){ - num_copy_times *= out_shape[dim]; - dim--; - } - - /* Find the number of times the above copy needs to be repeated */ - num_repeat = 1; - while(in_shape[dim] != 1 && dim>=0){ - num_repeat *= 1 * out_shape[dim]; - dim--; - } - - bcast_expansion_steps[k].load_num_elem = num_elem_load; - bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; - bcast_expansion_steps[k].repeat_operation = num_repeat; - k++; - - num_elem_load = num_elem_load * num_copy_times * num_repeat; - } - - res = broadcast_node_32(bcast_expansion_steps, num_dims-1, - p_out, p_in); - (void)res; /* Unused return value */ - - return 0; -} - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src) { - int step_itr=0, rep_itr=0; - int i=0, j=0, k=0; - bcast_expansion_rule *step = NULL; - - // ignore steps that are null - while(steps[step_id].repeat_operation == 0 && step_id>0){ - step_id--; - } - - // step is now the parent node for this iteration - step = &steps[step_id]; - size_t numLoadedElm = step->load_num_elem; - - WORD32 *cp_dst = dst; - WORD32 *cp_src = src; - WORD32 *cp_src_temp=NULL; - WORD32 *cp_dst_temp=NULL; - - if(numLoadedElm>32){ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; jrepeat_operation; j++){ - for(i=0; ireplicate_loadedElm_times; i++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } - else{ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - for(k=0; k<(int)numLoadedElm; k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - } - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; j < step->repeat_operation; j++){ - for(i=0; i < step->replicate_loadedElm_times; i++){ - for(k=0; k<(int)(numLoadedElm); k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - - } - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c deleted file mode 100644 index 34a7111ee78..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c +++ /dev/null @@ -1,313 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -/* - * xa_nn_broadcast_32_32.c - */ - -#include "xa_nnlib_common.h" -//#include "xa_nn_basic_state.h" - -#include -#include - -#include "stdio.h" - -/* - * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c - */ - -#define NUMDIMS_MAX 8 - -typedef struct bcast_expansion_struct_{ - size_t load_num_elem; - int replicate_loadedElm_times; - int repeat_operation; -} bcast_expansion_rule ; - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src); - -void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) -{ - char *dest = (char *)dest1; - char *src = (char *)src1; - int n = (int)n1; - ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; - int i; - void *orig_dest = dest; - - if (n < 32) { - return memcpy(dest, src, n); - } - - if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned - s_align_addr = (ae_int16x4 *) src; - d_align_addr = (ae_int16x4 *) dest; - for (i=0; i>3; i++) { - d_align_addr[i] = s_align_addr[i]; - } - - for (i=(n&~7); i>3; i++) { - AE_LA16X4_IP(t, s_align, s_align_addr); - AE_LA16X4_IP(t2, s_align, s_align_addr); - AE_SA16X4_IP(t, d_align, d_align_addr); - AE_SA16X4_IP(t2, d_align, d_align_addr); - } - AE_SA64POS_FP(d_align, d_align_addr); - ae_int16 *s_src = (ae_int16 *) src; - ae_int16 *s_dest = (ae_int16 *) dest; - for (i=8*i; i8, -1); - - int i = 0; - - /* Check for valid IO shapes */ - for(i=0; i=0){ - - /* Find the sub-matrix size */ - while(in_shape[dim] != 1 && dim>=0){ - num_elem_load *= out_shape[dim]; - dim--; - } - - /* Find the number of times this sub-matrix needs to be copied */ - num_copy_times = 1; - while(in_shape[dim] == 1 && dim>=0){ - num_copy_times *= out_shape[dim]; - dim--; - } - - /* Find the number of times the above copy needs to be repeated */ - num_repeat = 1; - while(in_shape[dim] != 1 && dim>=0){ - num_repeat *= 1 * out_shape[dim]; - dim--; - } - - bcast_expansion_steps[k].load_num_elem = num_elem_load; - bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; - bcast_expansion_steps[k].repeat_operation = num_repeat; - k++; - - num_elem_load = num_elem_load * num_copy_times * num_repeat; - } - - res = broadcast_node_32(bcast_expansion_steps, num_dims-1, - p_out, p_in); - (void)res; /* Unused return value */ - - return 0; -} - -WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, - WORD32 *dst, WORD32 *src) { - int step_itr=0, rep_itr=0; - int i=0, j=0, k=0; - bcast_expansion_rule *step = NULL; - - // ignore steps that are null - while(steps[step_id].repeat_operation == 0 && step_id>0){ - step_id--; - } - - // step is now the parent node for this iteration - step = &steps[step_id]; - size_t numLoadedElm = step->load_num_elem; - - WORD32 *cp_dst = dst; - WORD32 *cp_src = src; - WORD32 *cp_src_temp=NULL; - WORD32 *cp_dst_temp=NULL; - - if(numLoadedElm>32){ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; jrepeat_operation; j++){ - for(i=0; ireplicate_loadedElm_times; i++){ - xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } - else{ - if(step_id > 0){ - for(step_itr=0; step_itrrepeat_operation; step_itr++){ - src = broadcast_node_32(steps, step_id-1, dst, src); - cp_src = dst; - cp_dst = dst + numLoadedElm; - for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ - for(k=0; k<(int)numLoadedElm; k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - } - cp_dst += numLoadedElm; - } - dst = cp_dst; - } - return src; - } else { - if(numLoadedElm == 1){ - for(j=0; jrepeat_operation; j++){ -// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); - for(i = 0; i < step->replicate_loadedElm_times; i++) - cp_dst[i] = cp_src[0]; - cp_dst += step->replicate_loadedElm_times; - cp_src++; - } - } else { - for(j=0; j < step->repeat_operation; j++){ - for(i=0; i < step->replicate_loadedElm_times; i++){ - for(k=0; k<(int)(numLoadedElm); k++){ - cp_src_temp = cp_src; - cp_dst_temp = cp_dst; - cp_dst_temp[k] = cp_src_temp[k]; - - } - cp_dst += numLoadedElm; - } - cp_src += numLoadedElm; - } - } - return cp_src; - } - } -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c deleted file mode 100644 index 3b73e30db42..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c +++ /dev/null @@ -1,195 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ - - -#include "xa_type_def.h" -#include "xa_nn_common.h" -#include "xa_nnlib_kernels_api.h" -#include "xa_nnlib_common_macros.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_common.h" - -WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out - ,const WORD32 *const p_out_shape - ,const WORD32 **pp_inps - ,const WORD32 *const *pp_inps_shape - ,WORD32 num_out_dims - ,WORD32 num_inp - ,WORD32 num_inp_dims - ,WORD32 axis) -{ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(pp_inps, -1); - XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1); - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1); - XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1); - //Validate Arguments - XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1); - XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1); - XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1); - XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1); - - int i = 0, j = 0; - for(i = 0; i < num_out_dims; i++) - { - XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1); - } - - if(axis < 0) - axis = num_out_dims + axis; - - WORD32 concat_size = 0; - for (i = 0; i < num_inp; i++) - { - XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1); - XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1); - XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1); -#pragma loop_count min=1 - for(j = 0; j < num_out_dims; j++) - { - XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1); - } - - XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1); - concat_size += pp_inps_shape[i][axis]; - } - - XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1); - - //Calculate outer and inner size for axis - WORD32 outer_size = 1; -#pragma no_simd - for(int i = 0; i < axis; i++) - { - outer_size *= p_out_shape[i]; - } - - WORD32 base_inner_size = 1; -#pragma no_simd - for(int i = axis + 1; i < num_out_dims; i++) - { - base_inner_size *= p_out_shape[i]; - } - - WORD32 *ptmp_out = p_out; - for(int i = 0; i < num_inp; i++) - { - const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size; - WORD32 *output_ptr = ptmp_out; - const WORD32* input_ptr = pp_inps[i]; - - if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0) - && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0)) - { - if(copy_size <= 8) - { - const ae_f32 *pae_inp = (const ae_f32 *)input_ptr; - for(int k = 0; k < outer_size; k++) - { - ae_f32 *pae_out = (ae_f32 *)output_ptr; -#pragma concurrent -#pragma no_simd - for(int ic = 0; ic < copy_size; ic++) - { - *pae_out++ = *pae_inp++; - } - output_ptr += concat_size * base_inner_size; - } - } - else - { - for(int k = 0; k < outer_size; k++) - { - const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; - ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; - ae_valign inp_a, out_a; - inp_a = AE_LA64_PP(pae_inp); - out_a = AE_ZALIGN64(); - for(int ic = 0; ic < (copy_size >> 1); ic++) - { - ae_int32x2 d0; - AE_LA32X2_IP(d0, inp_a, pae_inp); - AE_SA32X2_IP(d0, out_a, pae_out); - } - AE_SA64POS_FP(out_a, pae_out); - const ae_f32 *puae_inp = (const ae_f32 *)pae_inp; - ae_f32 *puae_out = (ae_f32 *)pae_out; -#pragma concurrent - for(int ic = 0; ic < (copy_size & 1); ic++) - { - puae_out[copy_size - 1] = puae_inp[copy_size - 1]; - } - input_ptr += copy_size; - output_ptr += concat_size * base_inner_size; - } - } - } - else - { - if(copy_size <= 6) - { - for(int k = 0; k < outer_size; k++) - { -#pragma concurrent -#pragma no_unroll - for(int ic = 0; ic < copy_size; ic++) - { - output_ptr[ic] = *input_ptr++; - } - output_ptr += concat_size * base_inner_size; - } - } - else - { - for(int k = 0; k < outer_size; k++) - { - const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; - ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; - ae_valign inp_a, out_a; - inp_a = AE_LA64_PP(pae_inp); - out_a = AE_ZALIGN64(); - -#pragma concurrent - for(int ic = 0; ic < copy_size >> 1; ic++) - { - ae_int32x2 d0; - AE_LA32X2_IP(d0, inp_a, pae_inp); - AE_SA32X2_IP(d0, out_a, pae_out); - } - AE_SA64POS_FP(out_a, pae_out); - - for(int ic = 0; ic < (copy_size & 1); ic++) - { - output_ptr[copy_size - 1] = input_ptr[copy_size - 1]; - } - input_ptr += copy_size; - output_ptr += concat_size * base_inner_size; - } - } - } - ptmp_out += copy_size; - } - return 0; -} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c deleted file mode 100644 index 2a18d57e99f..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c +++ /dev/null @@ -1,426 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 + inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_ADD_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_ADD_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_ADD_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 + inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_ADD_SX2(x1, x2); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_ADD_SX2(x1, x2); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_ADD_S(a0, b0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_add_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 + inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_add_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_add_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_add_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_add_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; - -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c deleted file mode 100644 index 16fc23f59de..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c +++ /dev/null @@ -1,441 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ - -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -//#include "xa_nn_basic_state.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_div_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - /* For computing inp2 - inp1 */ - if(sign_flag){ - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } - /* For computing inp1 - inp2 */ - else - { - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_DIV_SX2(x1, x2); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_DIV_SX2(x1, x2); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_DIV_S(a0, b0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } - } -} - -static void internal_elm_div_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - /* For computing inp2 - inp1 */ - if(sign_flag){ - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_div_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_div_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_div_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_div_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c deleted file mode 100644 index e11fccbba52..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c +++ /dev/null @@ -1,359 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_type_def.h" -#include "xa_nnlib_common_fpu.h" -#include "xa_nn_common.h" -#include "xa_nnlib_err_chk.h" -#include "xa_nnlib_kernels_api.h" - -#if HAVE_VFPU -static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 out_lc, - WORD32 in_lc, - xtbool sign_flag) -{ - int i, j; - - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - int num_simd2_ops; - int num_scalar_ops; - - if(out_lc) - { - num_simd2_ops = in_lc >> 1; - num_scalar_ops = in_lc & 1; - } - else - { - num_simd2_ops = (in_lc >> 2) << 1; - num_scalar_ops = in_lc & 3; - } - - xtfloatx2 x1, x2, y; - xtfloat a0, b0, c0; - - for(i = 0; i < out_lc; i++) - { - p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; - p_b = (xtfloatx2 *)p_inp2; - p_c = (xtfloatx2 *)&p_out[i * in_lc]; - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(j = 0; j < num_simd2_ops; j++) - { - XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); - XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); - y = XT_MUL_SX2(x2, x1); - XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); - } - } - else - { - ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); - vinp1 = XT_LASX2PP(p_a); - vinp2 = XT_LASX2PP(p_b); - for(j = 0; j < num_simd2_ops; j++) - { - XT_LASX2IP(x1, vinp1, p_a); - XT_LASX2IP(x2, vinp2, p_b); - y = XT_MUL_SX2(x2, x1); - XT_SASX2IP(y, out_a, p_c); - } - XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); - } - if(num_scalar_ops !=0) - { - XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); - XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); - c0 = XT_MUL_S(b0, a0); - XT_SSI(c0, (xtfloat *)p_c, 0); - } - } -} - -static void internal_elm_mul_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, - const FLOAT32 * __restrict__ p_inp1, - const FLOAT32 * __restrict__ p_inp2, - WORD32 num_elm, - xtbool sign_flag) -{ - int i; - xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; - xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; - xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; - - const int num_simd2_ops = num_elm >> 1; - const int num_scalar_ops = num_elm & 1; - - xtfloat a0_7, out; - xtfloatx2 x1, x2, y; - x2 = XT_LSI((xtfloat *)p_b, 0); - - if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) - { - for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) - { - return -1; - } - } - - WORD32 inp1_strides[4], inp2_strides[4]; - inp1_strides[3] = 1; - inp2_strides[3] = 1; - for(i = 2; i >= 0; i--) - { - ae_int32x2 d_str, d_shape; - d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); - d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); - d_str = AE_MULP32X2(d_str, d_shape); - inp1_strides[i] = AE_MOVAD32_H(d_str); - inp2_strides[i] = AE_MOVAD32_L(d_str); - } - - int need_broadcast = 0; - int inp1_const = 1, inp2_const = 1; - for(i = 0; i < 4; i++) - { - if(p_inp1_shape[i] != p_inp2_shape[i]) - { - if(p_inp1_shape[i] == 1) - inp1_strides[i] = 0; - else - inp2_strides[i] = 0; - - need_broadcast = 1; - } - if(p_inp1_shape[i] != 1) - inp1_const &= 0; - if(p_inp2_shape[i] != 1) - inp2_const &= 0; - } - int itr0, itr1, itr2; - - FLOAT32 *p_out_tmp = p_out; - const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; - const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; - if(need_broadcast == 0) - { - sign_flag = 0; - internal_elm_mul_broadcast_2D_f32xf32_f32( - p_out, - p_inp1, - p_inp2, - 1, - p_out_shape[0] * inp1_strides[0], - sign_flag); - } - else if(inp1_strides[3] == inp2_strides[3]) - { - WORD32 in_lc, out_lc; - sign_flag = 0; - in_lc = p_out_shape[2] * p_out_shape[3]; - out_lc = 1; - if(inp1_strides[2] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[2]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - else if(inp2_strides[2] == 0) - { - in_lc = p_out_shape[3]; - out_lc = p_out_shape[2]; - } - - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - internal_elm_mul_broadcast_2D_f32xf32_f32( - p_out_tmp, - p_inp1_tmp0, - p_inp2_tmp0, - out_lc, - in_lc, - sign_flag); - p_out_tmp += in_lc * out_lc; - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - else if(inp1_const == 1 || inp2_const == 1) - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - sign_flag = 1; - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - } - internal_elm_mul_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp, - p_inp2_tmp, - p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], - sign_flag); - } - else - { - sign_flag = 0; - if(inp1_strides[3] == 0) - { - const FLOAT32 *tmp; - tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; - sign_flag = 1; - int tmp_strides[3]; - tmp_strides[0] = inp1_strides[0]; - tmp_strides[1] = inp1_strides[1]; - tmp_strides[2] = inp1_strides[2]; - - inp1_strides[0] = inp2_strides[0]; - inp1_strides[1] = inp2_strides[1]; - inp1_strides[2] = inp2_strides[2]; - - inp2_strides[0] = tmp_strides[0]; - inp2_strides[1] = tmp_strides[1]; - inp2_strides[2] = tmp_strides[2]; - } - for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) - { - const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; - const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; - for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) - { - const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; - const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; - for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) - { - { - internal_elm_mul_broadcast_f32xf32_f32( - p_out_tmp, - p_inp1_tmp1, - p_inp2_tmp1, - p_out_shape[3], - sign_flag); - } - p_out_tmp += p_out_shape[3]; - p_inp1_tmp1 += inp1_strides[2]; - p_inp2_tmp1 += inp2_strides[2]; - } - p_inp1_tmp0 += inp1_strides[1]; - p_inp2_tmp0 += inp2_strides[1]; - } - p_inp1_tmp += inp1_strides[0]; - p_inp2_tmp += inp2_strides[0]; - } - } - return 0; -} -#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c deleted file mode 100644 index 5b3ed385568..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c +++ /dev/null @@ -1,241 +0,0 @@ -#include "xa_nnlib_common.h" -#include "stdio.h" -/* - * Currently only supports upto 5D input tensors. - * 1/2/3/4 D input tensors will be scaled up to 5D. - * For example, 2x3 -> 1x1x1x2x3. - */ - -WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out - ,const WORD32 *const p_out_shape - ,const WORD32 * __restrict__ p_inp - ,const WORD32 *const p_inp_shape - ,const WORD32 * __restrict__ p_permute_vec - ,WORD32 num_out_dims - ,WORD32 num_inp_dims) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp, -1); - XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); - - /* Invalid input checks */ - XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1); - XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1); - - int itr = 0; - for(itr=0; itr < num_inp_dims; itr++) - { - XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1); - } - for(itr=0; itr < num_out_dims; itr++) - { - XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1); - } - - - /* Output shape provided must be correct based on input - * shape and permute values */ - for(itr=0; itr < num_out_dims; itr++) - { - int output_dim = p_out_shape[itr]; - int expected_dim = p_inp_shape[p_permute_vec[itr]]; - XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1); - } - - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); - - /* Shift all dim with 1 in the outer part */ - int eff_output_shape[5]; - int eff_permute_vec[5]; - - for(int i = 0; i < num_out_dims; i++) - { - eff_output_shape[i] = p_out_shape[i]; - eff_permute_vec[i] = p_permute_vec[i]; - } - - int one_i=num_out_dims-1, non_one_i=num_out_dims-1; - while(one_i > 0 && non_one_i >=0){ - while(one_i > 0 && eff_output_shape[one_i]!=1){ - one_i--; - } - non_one_i = one_i; - while(non_one_i >= 0 && eff_output_shape[non_one_i]==1) - { - non_one_i--; - } - if(one_i > 0 && non_one_i >=0){ - int temp; - /*swap output_shape*/ - { - temp = eff_output_shape[one_i]; - eff_output_shape[one_i] = eff_output_shape[non_one_i]; - eff_output_shape[non_one_i] = temp; - } - /*swap permute_vec*/ - { - temp = eff_permute_vec[one_i]; - eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; - eff_permute_vec[non_one_i] = temp; - } - - } - } - - /* Promoting lesser dim tensors to 5D tensors. - * Also updating the permute_vec and shapes as needed for optimization */ - int p_5D_inp_shape[5] = {1, 1, 1, 1, 1}; - int p_5D_out_shape[5] = {1, 1, 1, 1, 1}; - int p_5D_permute_vec[5] = {0, 1, 2, 3, 4}; - - /* Check if any inner inp dimension is same in the output */ - int last_dim_same = 1, last_n_same_dim = 0; - itr = num_inp_dims - 1; - while(itr >= 0) - { - last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; - last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0; - itr--; - } - - int dims_added = 5 - num_inp_dims; - itr = num_inp_dims - 1; - int same_count = last_n_same_dim; - int count = 4; - while(itr >= 0) - { - p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr]; - p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr]; - same_count--; - itr--; - count = (same_count > 0) ? count : count - 1; - } - - itr = num_inp_dims - 1; - same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; - count = 4; - while(itr >= 0) - { - p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; - same_count--; - itr--; - count--; - } - - int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; - int inp_dim1, inp_dim2, inp_dim3, inp_dim4; - int inp_stride[5]; - - out_dim0 = p_5D_out_shape[0]; - out_dim1 = p_5D_out_shape[1]; - out_dim2 = p_5D_out_shape[2]; - out_dim3 = p_5D_out_shape[3]; - out_dim4 = p_5D_out_shape[4]; - - inp_dim1 = p_5D_inp_shape[1]; - inp_dim2 = p_5D_inp_shape[2]; - inp_dim3 = p_5D_inp_shape[3]; - inp_dim4 = p_5D_inp_shape[4]; - - inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4; - inp_stride[1] = inp_dim2*inp_dim3*inp_dim4; - inp_stride[2] = inp_dim3*inp_dim4; - inp_stride[3] = inp_dim4; - inp_stride[4] = 1; - - if(last_n_same_dim) - { - int itr0, itr1, itr2, itr3, itr4; - WORD32 *p_inp0 = (WORD32 *)p_inp; - for(itr0 = 0; itr0 < out_dim0; itr0++) - { - WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); -#pragma loop_count min=1 - for(itr1 = 0; itr1 < out_dim1; itr1++) - { - WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); -#pragma loop_count min=1 - for(itr2 = 0; itr2 < out_dim2; itr2++) - { - WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); -#pragma loop_count min=1 - for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) - { - WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); - ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); - ae_valign a_inp = AE_LA64_PP(pae_i); - ae_valign a_out = AE_ZALIGN64(); - ae_int32x2 d0; - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - AE_LA32X2_IP(d0, a_inp, pae_i); - AE_SA32X2_IP(d0, a_out, pae_o); - } - AE_SA64POS_FP(a_out, pae_o); - ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); - ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); -#pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - puae_o[itr4] = puae_i[itr4]; - } - } - } - } - } - } - else - { - int itr0, itr1, itr2, itr3, itr4; - WORD32 *p_inp0 = (WORD32 *)p_inp; - for(itr0 = 0; itr0 < out_dim0; itr0++) - { - WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); - for(itr1 = 0; itr1 < out_dim1; itr1++) - { - WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); - for(itr2 = 0; itr2 < out_dim2; itr2++) - { - WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); - for(itr3 = 0; itr3 < out_dim3; itr3++) - { - WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - - ae_valign a_out = AE_ZALIGN64(); - for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) - { - ae_int32x2 d0, d1; - ae_int32x2 tmp0; - - d0 = AE_L32_X((ae_int32 *)p_inp4, 0); - p_inp4 += inp_stride[p_5D_permute_vec[4]]; - d1 = AE_L32_X((ae_int32 *)p_inp4, 0); - p_inp4 += inp_stride[p_5D_permute_vec[4]]; - - tmp0 = AE_SEL32_HH(d0, d1); - - AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out); - } - AE_SA64POS_FP(a_out, p_out); -#pragma loop_count max=3 - for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) - { - *p_out++ = *p_inp4; - } - } - } - } - } - } - - return 0; -} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c deleted file mode 100644 index b069035dc90..00000000000 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c +++ /dev/null @@ -1,232 +0,0 @@ -/******************************************************************************* -* Copyright (c) 2018-2024 Cadence Design Systems, Inc. -* -* Permission is hereby granted, free of charge, to any person obtaining -* a copy of this software and associated documentation files (the -* "Software"), to use this Software with Cadence processor cores only and -* not with any other processors and platforms, subject to -* the following conditions: -* -* The above copyright notice and this permission notice shall be included -* in all copies or substantial portions of the Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY -* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, -* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE -* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - -******************************************************************************/ -#include "xa_nnlib_common.h" - -#include - -/* - * Currently only supports upto 5D input tensors. - * 1/2/3/4 D input tensors will be scaled up to 5D. - * For example, 2x3 -> 1x1x1x2x3. - */ - -WORD32 xa_nn_transpose_8_8(WORD8 * __restrict__ p_out - ,const WORD32 *const p_out_shape - ,const WORD8 * __restrict__ p_inp - ,const WORD32 *const p_inp_shape - ,const WORD32 * __restrict__ p_permute_vec - ,WORD32 num_out_dims - ,WORD32 num_inp_dims) -{ - /* NULL pointer checks */ - XA_NNLIB_ARG_CHK_PTR(p_out, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp, -1); - XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1); - XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); - XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); - - /* Invalid input checks */ - XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1); - XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1); - - int itr = 0; - for(itr=0; itr < num_inp_dims; itr++) - { - XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1); - } - for(itr=0; itr < num_out_dims; itr++) - { - XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1); - } - - /* Output shape provided must be correct based on input - * shape and permute values */ - for(itr=0; itr < num_out_dims; itr++) - { - int output_dim = p_out_shape[itr]; - int expected_dim = p_inp_shape[p_permute_vec[itr]]; - XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1); - } - - /* Pointer alignment checks */ - XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); - XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); - - /* Shift all dim with 1 in the outer part */ - int eff_output_shape[5]; - int eff_permute_vec[5]; - - for(int i = 0; i < num_out_dims; i++) - { - eff_output_shape[i] = p_out_shape[i]; - eff_permute_vec[i] = p_permute_vec[i]; - } - - int one_i=num_out_dims-1, non_one_i=num_out_dims-1; - while(one_i > 0 && non_one_i >=0){ - while(one_i > 0 && eff_output_shape[one_i]!=1){ - one_i--; - } - non_one_i = one_i; - while(non_one_i >= 0 && eff_output_shape[non_one_i]==1) - { - non_one_i--; - } - if(one_i > 0 && non_one_i >=0){ - int temp; - /*swap output_shape*/ - { - temp = eff_output_shape[one_i]; - eff_output_shape[one_i] = eff_output_shape[non_one_i]; - eff_output_shape[non_one_i] = temp; - } - /*swap permute_vec*/ - { - temp = eff_permute_vec[one_i]; - eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; - eff_permute_vec[non_one_i] = temp; - } - - } - } - - - /* Promoting lesser dim tensors to 5D tensors. - * Also updating the permute_vec and shapes as needed for optimization */ - int p_5D_inp_shape[5] = {1, 1, 1, 1, 1}; - int p_5D_out_shape[5] = {1, 1, 1, 1, 1}; - int p_5D_permute_vec[5] = {0, 1, 2, 3, 4}; - - /* Check if any inner inp dimension is same in the output */ - int last_dim_same = 1, last_n_same_dim = 0; - itr = num_inp_dims - 1; - while(itr >= 0) - { - last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; - last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0; - itr--; - } - - int dims_added = 5 - num_inp_dims; - itr = num_inp_dims - 1; - int same_count = last_n_same_dim; - int count = 4; - while(itr >= 0) - { - p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr]; - p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr]; - same_count--; - itr--; - count = (same_count > 0) ? count : count - 1; - } - - itr = num_inp_dims - 1; - same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; - count = 4; - while(itr >= 0) - { - p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; - same_count--; - itr--; - count--; - } - - int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; - int inp_dim1, inp_dim2, inp_dim3, inp_dim4; - int inp_stride[5]; - - out_dim0 = p_5D_out_shape[0]; - out_dim1 = p_5D_out_shape[1]; - out_dim2 = p_5D_out_shape[2]; - out_dim3 = p_5D_out_shape[3]; - out_dim4 = p_5D_out_shape[4]; - - inp_dim1 = p_5D_inp_shape[1]; - inp_dim2 = p_5D_inp_shape[2]; - inp_dim3 = p_5D_inp_shape[3]; - inp_dim4 = p_5D_inp_shape[4]; - - inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4; - inp_stride[1] = inp_dim2*inp_dim3*inp_dim4; - inp_stride[2] = inp_dim3*inp_dim4; - inp_stride[3] = inp_dim4; - inp_stride[4] = 1; - - if(last_n_same_dim) - { - int itr0, itr1, itr2, itr3; - WORD8 *p_inp0 = (WORD8*)p_inp; - for(itr0 = 0; itr0 < out_dim0; itr0++) - { - WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); -#pragma loop_count min=1 - for(itr1 = 0; itr1 < out_dim1; itr1++) - { - WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); -#pragma loop_count min=1 - for(itr2 = 0; itr2 < out_dim2; itr2++) - { - WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); -#pragma loop_count min=1 - for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) - { - WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - memcpy(p_out, p_inp4, out_dim4); - } - } - } - } - } - else - { - int itr0, itr1, itr2, itr3, itr4; - WORD8 *p_inp0 = (WORD8*)p_inp; - for(itr0 = 0; itr0 < out_dim0; itr0++) - { - WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); - for(itr1 = 0; itr1 < out_dim1; itr1++) - { - WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); - for(itr2 = 0; itr2 < out_dim2; itr2++) - { - WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); - for(itr3 = 0; itr3 < out_dim3; itr3++) - { - WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); - for(itr4 = 0; itr4 < out_dim4; itr4++) - { - WORD8 d0 = *(p_inp4); - p_inp4 += inp_stride[p_5D_permute_vec[4]]; - *p_out++ = d0; - - } - } - } - } - } - } - - return 0; -}