diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 6a3dcd1d245..228355882a4 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -18,23 +18,6 @@ using executorch::runtime::Result;
 
 /* Potential NNLIB function/APIs */
 
-extern "C" WORD32 xa_nn_broadcast_32_32(
-    WORD32* __restrict__ p_out,
-    const int* const out_shape,
-    WORD32* __restrict__ p_in,
-    const int* const in_shape,
-    int num_dims);
-
-extern "C" WORD32 xa_nn_concat_32_32(
-    WORD32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const WORD32** pp_inps,
-    const WORD32* const* pp_inps_shape,
-    WORD32 num_out_dims,
-    WORD32 num_inp,
-    WORD32 num_inp_dims,
-    WORD32 axis);
-
 extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
     FLOAT32* __restrict__ p_out,
     const WORD32* const p_out_shape,
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
deleted file mode 100644
index cad3f1a25bb..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-/*
- * xa_nn_broadcast_8_8.c
- */
-
-#include "xa_nnlib_common.h"
-//#include "xa_nn_basic_state.h"
-
-#include<string.h>
-#include<stdbool.h>
-
-#include "stdio.h"
-
-/*
- * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c
- */
-
-#define NUMDIMS_MAX 8
-
-typedef struct bcast_expansion_struct_{
-    size_t load_num_elem;
-    int    replicate_loadedElm_times;
-    int    repeat_operation;
-} bcast_expansion_rule ;
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src);
-
-void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1)
-{
-  char *dest = (char *)dest1;
-  char *src = (char *)src1;
-  int n = (int)n1;
-  ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr;
-  int i;
-  void *orig_dest = dest;
-
-  if (n < 32) {
-    return memcpy(dest, src, n);
-  }
-
-  if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned
-    s_align_addr = (ae_int16x4 *) src;
-    d_align_addr = (ae_int16x4 *) dest;
-    for (i=0; i<n>>3; i++) {
-        d_align_addr[i] = s_align_addr[i];
-    }
-
-    for (i=(n&~7); i<n; i++) {
-      dest[i] = src[i];
-    }
-    return orig_dest;
-  }
-
-  if ( (((int) dest) %2) || (((int) src) %2)) { // 16-bit aligned
-    if ( (((int) dest) %2) && (((int) src) %2)) { // 16-bit aligned
-      *dest++ = *src++;
-       n--;
-    } else {
-      #if 0
-      return memcpy(dest, src, n);
-      #else
-        ae_int32x2 *pOut = (ae_int32x2 *)dest;
-        ae_int32x2 *pInp = (ae_int32x2 *)src;
-        ae_valign alignIn, alignOut;
-        alignIn = AE_LA64_PP(pInp);
-        alignOut = AE_ZALIGN64();
-        ae_int24x2 d0;
-        int Nby6 =  AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(n, 0x2AAAAAAB)));
-        int remainder_start = 6*Nby6;
-
-        for(i=0;i<Nby6;i++)
-        {
-          AE_LA24X2_IP(d0, alignIn, pInp);
-          AE_SA24X2_IP(d0, alignOut, pOut);
-        }
-        AE_SA64POS_FP(alignOut, pOut);
-        /* remainder loop */
-        for(i=remainder_start; i < n; i++){
-          dest[i] = src[i];
-      }
-      return orig_dest;
-      #endif
-    }
-  }
-  int n2 = n/2;
-  ae_valign d_align = AE_ZALIGN64();
-  d_align_addr = (ae_int16x4 *) dest;
-  s_align_addr = (ae_int16x4 *) src;
-  ae_valign s_align = AE_LA64_PP(s_align_addr);
-  ae_int16x4 t,t2;
-  for (i=0; i<n2>>3; i++) {
-      AE_LA16X4_IP(t, s_align, s_align_addr);
-      AE_LA16X4_IP(t2, s_align, s_align_addr);
-      AE_SA16X4_IP(t, d_align, d_align_addr);
-      AE_SA16X4_IP(t2, d_align, d_align_addr);
-  }
-  AE_SA64POS_FP(d_align, d_align_addr);
-  ae_int16 *s_src = (ae_int16 *) src;
-  ae_int16 *s_dest = (ae_int16 *) dest;
-  for (i=8*i; i<n2; i++) {
-    s_dest[i] = s_src[i];
-  }
-  if (n % 2) {
-    dest[n-1] = src[n-1];
-  }
-  return orig_dest;
-} /* xa_nn_memcpy */
-
-WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out,      /* pointer to write broadcasted output data to */
-        const int *const out_shape,         /* output shape resulting after broadcast */
-
-        WORD32* __restrict__ p_in,    /* pointer to unextended input data */
-        const int * const in_shape,         /* input shape */
-        int num_dims)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(out_shape, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_in, -1);
-    XA_NNLIB_ARG_CHK_PTR(in_shape, -1);
-
-    /* IO shape pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(in_shape, sizeof(WORD32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(out_shape, sizeof(WORD32), -1);
-
-    /* Check if number of dims is valid */
-    XA_NNLIB_ARG_CHK_COND(num_dims<=0 || num_dims>8, -1);
-
-    int i = 0;
-
-    /* Check for valid IO shapes */
-    for(i=0; i<num_dims; i++){
-        XA_NNLIB_CHK_COND(in_shape[i]<=0, -1);
-        XA_NNLIB_CHK_COND(out_shape[i]<=0, -1);
-    }
-
-    /* Check if input shape can be broadcasted to requested output shape */
-    for(i=0; i<num_dims; i++){
-        if(in_shape[i] != out_shape[i]){
-            /* in_shape is either same as out_shape or 1 */
-            XA_NNLIB_CHK_COND( in_shape[i] != 1, -1);
-        }
-    }
-
-    /* bcast_expansion_steps contains a sequence to steps execute for a broadcast op */
-    bcast_expansion_rule bcast_expansion_steps[NUMDIMS_MAX] = {{0}};
-
-    int k=0;
-    int dim=0;
-    const void *res=0;
-
-    int num_elem_load = 1;
-    int num_copy_times = 1;
-    int num_repeat = 1;
-
-    dim = num_dims-1;
-    while(dim>=0){
-
-        /* Find the sub-matrix size */
-        while(in_shape[dim] != 1 && dim>=0){
-            num_elem_load *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times this sub-matrix needs to be copied */
-        num_copy_times = 1;
-        while(in_shape[dim] == 1 && dim>=0){
-            num_copy_times *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times the above copy needs to be repeated */
-        num_repeat = 1;
-        while(in_shape[dim] != 1 && dim>=0){
-            num_repeat *= 1 * out_shape[dim];
-            dim--;
-        }
-
-        bcast_expansion_steps[k].load_num_elem  = num_elem_load;
-        bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times;
-        bcast_expansion_steps[k].repeat_operation = num_repeat;
-        k++;
-
-        num_elem_load = num_elem_load * num_copy_times * num_repeat;
-    }
-
-    res = broadcast_node_32(bcast_expansion_steps, num_dims-1,
-            p_out, p_in);
-    (void)res; /* Unused return value */
-
-    return 0;
-}
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src) {
-    int step_itr=0, rep_itr=0;
-    int i=0, j=0, k=0;
-    bcast_expansion_rule *step = NULL;
-
-    // ignore steps that are null
-    while(steps[step_id].repeat_operation == 0 && step_id>0){
-        step_id--;
-    }
-
-    // step is now the parent node for this iteration
-    step = &steps[step_id];
-    size_t numLoadedElm = step->load_num_elem;
-
-    WORD32 *cp_dst = dst;
-    WORD32 *cp_src = src;
-    WORD32 *cp_src_temp=NULL;
-    WORD32 *cp_dst_temp=NULL;
-
-    if(numLoadedElm>32){
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j<step->repeat_operation; j++){
-                    for(i=0; i<step->replicate_loadedElm_times; i++){
-                        xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-    else{
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    for(k=0; k<(int)numLoadedElm; k++){
-                        cp_src_temp = cp_src;
-                        cp_dst_temp = cp_dst;
-                        cp_dst_temp[k] = cp_src_temp[k];
-                    }
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j < step->repeat_operation; j++){
-                    for(i=0; i < step->replicate_loadedElm_times; i++){
-                        for(k=0; k<(int)(numLoadedElm); k++){
-                            cp_src_temp = cp_src;
-                            cp_dst_temp = cp_dst;
-                            cp_dst_temp[k] = cp_src_temp[k];
-
-                        }
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c
deleted file mode 100644
index 34a7111ee78..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-/*
- * xa_nn_broadcast_32_32.c
- */
-
-#include "xa_nnlib_common.h"
-//#include "xa_nn_basic_state.h"
-
-#include<string.h>
-#include<stdbool.h>
-
-#include "stdio.h"
-
-/*
- * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c
- */
-
-#define NUMDIMS_MAX 8
-
-typedef struct bcast_expansion_struct_{
-    size_t load_num_elem;
-    int    replicate_loadedElm_times;
-    int    repeat_operation;
-} bcast_expansion_rule ;
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src);
-
-void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1)
-{
-  char *dest = (char *)dest1;
-  char *src = (char *)src1;
-  int n = (int)n1;
-  ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr;
-  int i;
-  void *orig_dest = dest;
-
-  if (n < 32) {
-    return memcpy(dest, src, n);
-  }
-
-  if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned
-    s_align_addr = (ae_int16x4 *) src;
-    d_align_addr = (ae_int16x4 *) dest;
-    for (i=0; i<n>>3; i++) {
-        d_align_addr[i] = s_align_addr[i];
-    }
-
-    for (i=(n&~7); i<n; i++) {
-      dest[i] = src[i];
-    }
-    return orig_dest;
-  }
-
-  if ( (((int) dest) %2) || (((int) src) %2)) { // 16-bit aligned
-    if ( (((int) dest) %2) && (((int) src) %2)) { // 16-bit aligned
-      *dest++ = *src++;
-       n--;
-    } else {
-      #if 0
-      return memcpy(dest, src, n);
-      #else
-        ae_int32x2 *pOut = (ae_int32x2 *)dest;
-        ae_int32x2 *pInp = (ae_int32x2 *)src;
-        ae_valign alignIn, alignOut;
-        alignIn = AE_LA64_PP(pInp);
-        alignOut = AE_ZALIGN64();
-        ae_int24x2 d0;
-        int Nby6 =  AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(n, 0x2AAAAAAB)));
-        int remainder_start = 6*Nby6;
-
-        for(i=0;i<Nby6;i++)
-        {
-          AE_LA24X2_IP(d0, alignIn, pInp);
-          AE_SA24X2_IP(d0, alignOut, pOut);
-        }
-        AE_SA64POS_FP(alignOut, pOut);
-        /* remainder loop */
-        for(i=remainder_start; i < n; i++){
-          dest[i] = src[i];
-      }
-      return orig_dest;
-      #endif
-    }
-  }
-  int n2 = n/2;
-  ae_valign d_align = AE_ZALIGN64();
-  d_align_addr = (ae_int16x4 *) dest;
-  s_align_addr = (ae_int16x4 *) src;
-  ae_valign s_align = AE_LA64_PP(s_align_addr);
-  ae_int16x4 t,t2;
-  for (i=0; i<n2>>3; i++) {
-      AE_LA16X4_IP(t, s_align, s_align_addr);
-      AE_LA16X4_IP(t2, s_align, s_align_addr);
-      AE_SA16X4_IP(t, d_align, d_align_addr);
-      AE_SA16X4_IP(t2, d_align, d_align_addr);
-  }
-  AE_SA64POS_FP(d_align, d_align_addr);
-  ae_int16 *s_src = (ae_int16 *) src;
-  ae_int16 *s_dest = (ae_int16 *) dest;
-  for (i=8*i; i<n2; i++) {
-    s_dest[i] = s_src[i];
-  }
-  if (n % 2) {
-    dest[n-1] = src[n-1];
-  }
-  return orig_dest;
-} /* xa_nn_memcpy */
-
-WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out,      /* pointer to write broadcasted output data to */
-        const int *const out_shape,         /* output shape resulting after broadcast */
-
-        WORD32* __restrict__ p_in,    /* pointer to unextended input data */
-        const int * const in_shape,         /* input shape */
-        int num_dims)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(out_shape, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_in, -1);
-    XA_NNLIB_ARG_CHK_PTR(in_shape, -1);
-
-    /* IO shape pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(in_shape, sizeof(WORD32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(out_shape, sizeof(WORD32), -1);
-
-    /* Check if number of dims is valid */
-    XA_NNLIB_ARG_CHK_COND(num_dims<=0 || num_dims>8, -1);
-
-    int i = 0;
-
-    /* Check for valid IO shapes */
-    for(i=0; i<num_dims; i++){
-        XA_NNLIB_CHK_COND(in_shape[i]<=0, -1);
-        XA_NNLIB_CHK_COND(out_shape[i]<=0, -1);
-    }
-
-    /* Check if input shape can be broadcasted to requested output shape */
-    for(i=0; i<num_dims; i++){
-        if(in_shape[i] != out_shape[i]){
-            /* in_shape is either same as out_shape or 1 */
-            XA_NNLIB_CHK_COND( in_shape[i] != 1, -1);
-        }
-    }
-
-    /* bcast_expansion_steps contains a sequence to steps execute for a broadcast op */
-    bcast_expansion_rule bcast_expansion_steps[NUMDIMS_MAX] = {{0}};
-
-    int k=0;
-    int dim=0;
-    const void *res=0;
-
-    int num_elem_load = 1;
-    int num_copy_times = 1;
-    int num_repeat = 1;
-
-    dim = num_dims-1;
-    while(dim>=0){
-
-        /* Find the sub-matrix size */
-        while(in_shape[dim] != 1 && dim>=0){
-            num_elem_load *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times this sub-matrix needs to be copied */
-        num_copy_times = 1;
-        while(in_shape[dim] == 1 && dim>=0){
-            num_copy_times *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times the above copy needs to be repeated */
-        num_repeat = 1;
-        while(in_shape[dim] != 1 && dim>=0){
-            num_repeat *= 1 * out_shape[dim];
-            dim--;
-        }
-
-        bcast_expansion_steps[k].load_num_elem  = num_elem_load;
-        bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times;
-        bcast_expansion_steps[k].repeat_operation = num_repeat;
-        k++;
-
-        num_elem_load = num_elem_load * num_copy_times * num_repeat;
-    }
-
-    res = broadcast_node_32(bcast_expansion_steps, num_dims-1,
-            p_out, p_in);
-    (void)res; /* Unused return value */
-
-    return 0;
-}
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src) {
-    int step_itr=0, rep_itr=0;
-    int i=0, j=0, k=0;
-    bcast_expansion_rule *step = NULL;
-
-    // ignore steps that are null
-    while(steps[step_id].repeat_operation == 0 && step_id>0){
-        step_id--;
-    }
-
-    // step is now the parent node for this iteration
-    step = &steps[step_id];
-    size_t numLoadedElm = step->load_num_elem;
-
-    WORD32 *cp_dst = dst;
-    WORD32 *cp_src = src;
-    WORD32 *cp_src_temp=NULL;
-    WORD32 *cp_dst_temp=NULL;
-
-    if(numLoadedElm>32){
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j<step->repeat_operation; j++){
-                    for(i=0; i<step->replicate_loadedElm_times; i++){
-                        xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-    else{
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    for(k=0; k<(int)numLoadedElm; k++){
-                        cp_src_temp = cp_src;
-                        cp_dst_temp = cp_dst;
-                        cp_dst_temp[k] = cp_src_temp[k];
-                    }
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j < step->repeat_operation; j++){
-                    for(i=0; i < step->replicate_loadedElm_times; i++){
-                        for(k=0; k<(int)(numLoadedElm); k++){
-                            cp_src_temp = cp_src;
-                            cp_dst_temp = cp_dst;
-                            cp_dst_temp[k] = cp_src_temp[k];
-
-                        }
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
deleted file mode 100644
index 3b73e30db42..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-
-
-#include "xa_type_def.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_kernels_api.h"
-#include "xa_nnlib_common_macros.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_common.h"
-
-WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out
-                        ,const WORD32 *const p_out_shape
-                        ,const WORD32 **pp_inps
-                        ,const WORD32 *const *pp_inps_shape
-                        ,WORD32 num_out_dims
-                        ,WORD32 num_inp
-                        ,WORD32 num_inp_dims
-                        ,WORD32 axis)
-{
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(pp_inps, -1);
-  XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1);
-  //Validate Arguments
-  XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1);
-  XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1);
-  XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1);
-  XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1);
-
-  int i = 0, j = 0;
-  for(i = 0; i < num_out_dims; i++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1);
-  }
-
-  if(axis < 0)
-    axis = num_out_dims + axis;
-
-  WORD32 concat_size = 0;
-  for (i = 0; i < num_inp; i++)
-  {
-    XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1);
-    XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1);
-    XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1);
-#pragma loop_count min=1
-    for(j = 0; j < num_out_dims; j++)
-    {
-      XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1);
-    }
-    
-    XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1);
-    concat_size += pp_inps_shape[i][axis];
-  }
-
-  XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1);
-
-  //Calculate outer and inner size for axis
-  WORD32 outer_size = 1;
-#pragma no_simd
-  for(int i = 0; i < axis; i++)
-  {
-    outer_size *= p_out_shape[i];
-  }
-
-  WORD32 base_inner_size = 1;
-#pragma no_simd
-  for(int i = axis + 1; i < num_out_dims; i++)
-  {
-    base_inner_size *= p_out_shape[i];
-  }
-
-  WORD32 *ptmp_out = p_out;
-  for(int i = 0; i < num_inp; i++)
-  {
-    const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size;
-    WORD32 *output_ptr = ptmp_out;
-    const WORD32* input_ptr = pp_inps[i];
-
-    if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0)
-      && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0))
-    {
-      if(copy_size <= 8)
-      {
-        const ae_f32 *pae_inp = (const ae_f32 *)input_ptr;
-        for(int k = 0; k < outer_size; k++)
-        {
-          ae_f32 *pae_out = (ae_f32 *)output_ptr;
-#pragma concurrent
-#pragma no_simd
-          for(int ic = 0; ic < copy_size; ic++)
-          {
-            *pae_out++ = *pae_inp++;
-          }
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-      else
-      {
-        for(int k = 0; k < outer_size; k++)
-        {
-          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
-          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
-          ae_valign inp_a, out_a;
-          inp_a = AE_LA64_PP(pae_inp);
-          out_a = AE_ZALIGN64();
-          for(int ic = 0; ic < (copy_size >> 1); ic++)
-          {
-            ae_int32x2 d0;
-            AE_LA32X2_IP(d0, inp_a, pae_inp);
-            AE_SA32X2_IP(d0, out_a, pae_out);
-          }
-          AE_SA64POS_FP(out_a, pae_out);
-          const ae_f32 *puae_inp = (const ae_f32 *)pae_inp;
-          ae_f32 *puae_out = (ae_f32 *)pae_out;
-#pragma concurrent
-          for(int ic = 0; ic < (copy_size & 1); ic++)
-          {
-            puae_out[copy_size - 1] = puae_inp[copy_size - 1];
-          }
-          input_ptr += copy_size;
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-    }
-    else
-    {
-      if(copy_size <= 6)
-      {
-        for(int k = 0; k < outer_size; k++)
-        {
-#pragma concurrent
-#pragma no_unroll
-          for(int ic = 0; ic < copy_size; ic++)
-          {
-            output_ptr[ic] = *input_ptr++;
-          }
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-      else
-      {
-        for(int k = 0; k < outer_size; k++)
-        {
-          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
-          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
-          ae_valign inp_a, out_a;
-          inp_a = AE_LA64_PP(pae_inp);
-          out_a = AE_ZALIGN64();
-
-#pragma concurrent
-          for(int ic = 0; ic < copy_size >> 1; ic++)
-          {
-            ae_int32x2 d0;
-            AE_LA32X2_IP(d0, inp_a, pae_inp);
-            AE_SA32X2_IP(d0, out_a, pae_out);
-          }
-          AE_SA64POS_FP(out_a, pae_out);
-          
-          for(int ic = 0; ic < (copy_size & 1); ic++)
-          {
-            output_ptr[copy_size - 1] = input_ptr[copy_size - 1];
-          }
-          input_ptr += copy_size;
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-    }
-    ptmp_out += copy_size;
-  }
-  return 0;
-}
\ No newline at end of file
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
deleted file mode 100644
index 2a18d57e99f..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-
-#if HAVE_VFPU
-static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2;
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-
-  /* For computing inp2 + inp1 */
-  if(sign_flag){
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_ADD_SX2(x2, x1);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_ADD_SX2(x2, x1);
-          XT_SASX2IP(y, out_a, p_c);
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_ADD_S(b0, a0);
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }
-    }
-  }
-  /* For computing inp1 + inp2 */
-  else
-  {
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_ADD_SX2(x1, x2);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_ADD_SX2(x1, x2);
-          XT_SASX2IP(y, out_a, p_c);
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_ADD_S(a0, b0);
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }
-    }
-  }
-}
-
-static void internal_elm_add_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2;
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-
-  /* For computing inp2 + inp1 */
-  if(sign_flag){
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_ADD_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_ADD_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_ADD_S(x2, a0_7);
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-  /* For computing inp1 + inp2 */
-  else
-  {
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_ADD_SX2(x1, x2);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_ADD_SX2(x1, x2);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_ADD_S(a0_7, x2);
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-}
-#endif
-
-WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_add_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_add_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_add_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_add_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3],
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
deleted file mode 100644
index 16fc23f59de..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-//#include "xa_nn_basic_state.h"
-#include "xa_nnlib_kernels_api.h"
-
-#if HAVE_VFPU
-static void internal_elm_div_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-
-  /* For computing inp2 - inp1 */   
-  if(sign_flag){  
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_DIV_SX2(x2, x1);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_DIV_SX2(x2, x1);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_DIV_S(b0, a0);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }
-  }
-  /* For computing inp1 - inp2 */   
-  else
-  {
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_DIV_SX2(x1, x2);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_DIV_SX2(x1, x2);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_DIV_S(a0, b0);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }  
-  }
-}
-
-static void internal_elm_div_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  /* For computing inp2 - inp1 */      
-  if(sign_flag){
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_DIV_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();      
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_DIV_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-    }  
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_DIV_S(x2, a0_7);   
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-  /* For computing inp1 - inp2 */   
-  else
-  {
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_DIV_SX2(x1, x2);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();       
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_DIV_SX2(x1, x2);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_DIV_S(a0_7, x2);   
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }    
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_div_broadcast_4D_f32xf32_f32,
-             (
-                      FLOAT32 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape
-              )
-           )
-#else           
-WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_div_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_div_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_div_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_div_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
deleted file mode 100644
index e11fccbba52..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-
-#if HAVE_VFPU
-static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
- 
-  for(i = 0; i < out_lc; i++)
-  {
-    p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-    p_b = (xtfloatx2 *)p_inp2;
-    p_c = (xtfloatx2 *)&p_out[i * in_lc];
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-        y = XT_MUL_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-      vinp1 = XT_LASX2PP(p_a);
-      vinp2 = XT_LASX2PP(p_b);
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LASX2IP(x1, vinp1, p_a);
-        XT_LASX2IP(x2, vinp2, p_b);
-        y = XT_MUL_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c); 
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-      XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-      c0 = XT_MUL_S(b0, a0);   
-      XT_SSI(c0, (xtfloat *)p_c, 0);
-    }
-  }
-}
-
-static void internal_elm_mul_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-  {
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-      y = XT_MUL_SX2(x2, x1);
-      XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-    }
-  }
-  else
-  {
-    ae_valign inp1_a, out_a;
-    inp1_a = XT_LASX2PP(p_a);
-    out_a = AE_ZALIGN64();      
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LASX2IP(x1, inp1_a, p_a);
-      y = XT_MUL_SX2(x2, x1);
-      XT_SASX2IP(y, out_a, p_c);
-    }
-    XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-  }  
-  if(num_scalar_ops !=0)
-  {
-    XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-    out = XT_MUL_S(x2, a0_7);   
-    XT_SSI(out, (xtfloat *)p_c, 0);
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_mul_broadcast_4D_f32xf32_f32,
-             (
-                      FLOAT32 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape
-              )
-           )
-#else           
-WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_mul_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_mul_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_mul_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_mul_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
deleted file mode 100644
index 5b3ed385568..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
+++ /dev/null
@@ -1,241 +0,0 @@
-#include "xa_nnlib_common.h"
-#include "stdio.h"
-/*
- * Currently only supports upto 5D input tensors.
- * 1/2/3/4 D input tensors will be scaled up to 5D.
- * For example, 2x3 -> 1x1x1x2x3.
- */
-
-WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
-                    ,const WORD32 *const p_out_shape
-                    ,const WORD32 * __restrict__ p_inp
-                    ,const WORD32 *const p_inp_shape
-                    ,const WORD32 * __restrict__ p_permute_vec
-                    ,WORD32 num_out_dims
-                    ,WORD32 num_inp_dims)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
-
-  /* Invalid input checks */
-  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
-  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
-
-  int itr = 0;
-  for(itr=0; itr < num_inp_dims; itr++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
-  }
-  for(itr=0; itr < num_out_dims; itr++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
-  }
-
-
-  /* Output shape provided must be correct based on input
-   * shape and permute values */
-  for(itr=0; itr < num_out_dims; itr++)
-  {
-    int output_dim = p_out_shape[itr];
-    int expected_dim = p_inp_shape[p_permute_vec[itr]];
-    XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
-  }
-
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
-
-  /* Shift all dim with 1 in the outer part */
-  int eff_output_shape[5];
-  int eff_permute_vec[5];
-
-  for(int i = 0; i < num_out_dims; i++)
-  {
-    eff_output_shape[i] = p_out_shape[i];
-    eff_permute_vec[i] = p_permute_vec[i];
-  }
-
-  int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
-  while(one_i > 0 && non_one_i >=0){
-    while(one_i > 0 && eff_output_shape[one_i]!=1){
-      one_i--;
-    }
-    non_one_i = one_i;
-    while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
-    {
-      non_one_i--;
-    }
-    if(one_i > 0 && non_one_i >=0){
-      int temp;
-      /*swap output_shape*/
-      {
-        temp = eff_output_shape[one_i];
-        eff_output_shape[one_i] = eff_output_shape[non_one_i];
-        eff_output_shape[non_one_i] = temp;
-      }
-      /*swap permute_vec*/
-      {
-        temp = eff_permute_vec[one_i];
-        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
-        eff_permute_vec[non_one_i] = temp;
-      }
-
-    }
-  }
-
-  /* Promoting lesser dim tensors to 5D tensors.
-   * Also updating the permute_vec and shapes as needed for optimization */
-  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
-  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
-  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
-
-  /* Check if any inner inp dimension is same in the output */
-  int last_dim_same = 1, last_n_same_dim = 0;
-  itr = num_inp_dims - 1;
-  while(itr >= 0)
-  {
-    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
-    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
-    itr--;
-  }
-
-  int dims_added = 5 - num_inp_dims;
-  itr = num_inp_dims - 1;
-  int same_count = last_n_same_dim;
-  int count = 4;
-  while(itr >= 0)
-  {
-    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
-    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
-    same_count--;
-    itr--;
-    count = (same_count > 0) ? count : count - 1;
-  }
-
-  itr = num_inp_dims - 1;
-  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
-  count = 4;
-  while(itr >= 0)
-  {
-    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
-    same_count--;
-    itr--;
-    count--;
-  }
-
-  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
-  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
-  int inp_stride[5];
-
-  out_dim0 = p_5D_out_shape[0];
-  out_dim1 = p_5D_out_shape[1];
-  out_dim2 = p_5D_out_shape[2];
-  out_dim3 = p_5D_out_shape[3];
-  out_dim4 = p_5D_out_shape[4];
-
-  inp_dim1 = p_5D_inp_shape[1];
-  inp_dim2 = p_5D_inp_shape[2];
-  inp_dim3 = p_5D_inp_shape[3];
-  inp_dim4 = p_5D_inp_shape[4];
-
-  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
-  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
-  inp_stride[2] = inp_dim3*inp_dim4;
-  inp_stride[3] = inp_dim4;
-  inp_stride[4] = 1;
-
-  if(last_n_same_dim)
-  {
-    int itr0, itr1, itr2, itr3, itr4;
-    WORD32 *p_inp0 = (WORD32 *)p_inp;
-    for(itr0 = 0; itr0 < out_dim0; itr0++)
-    {
-      WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
-#pragma loop_count min=1
-      for(itr1 = 0; itr1 < out_dim1; itr1++)
-      {
-        WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
-#pragma loop_count min=1
-        for(itr2 = 0; itr2 < out_dim2; itr2++)
-        {
-          WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
-#pragma loop_count min=1
-          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
-          {
-            WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-            ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
-            ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
-            ae_valign a_inp = AE_LA64_PP(pae_i);
-            ae_valign a_out = AE_ZALIGN64();
-            ae_int32x2 d0;
-            for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-            {
-              AE_LA32X2_IP(d0, a_inp, pae_i);
-              AE_SA32X2_IP(d0, a_out, pae_o);
-            }
-            AE_SA64POS_FP(a_out, pae_o);
-            ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
-            ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
-#pragma loop_count max=3
-            for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-            {
-              puae_o[itr4] = puae_i[itr4];
-            }
-          }
-        }
-      }
-    }
-  }
-  else
-  {
-    int itr0, itr1, itr2, itr3, itr4;
-    WORD32 *p_inp0 = (WORD32 *)p_inp;
-    for(itr0 = 0; itr0 < out_dim0; itr0++)
-    {
-      WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
-      for(itr1 = 0; itr1 < out_dim1; itr1++)
-      {
-        WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
-        for(itr2 = 0; itr2 < out_dim2; itr2++)
-        {
-          WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
-          for(itr3 = 0; itr3 < out_dim3; itr3++)
-          {
-            WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-
-            ae_valign a_out = AE_ZALIGN64();
-            for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-            {
-              ae_int32x2 d0, d1;
-              ae_int32x2 tmp0;
-
-              d0 = AE_L32_X((ae_int32 *)p_inp4, 0);
-              p_inp4 += inp_stride[p_5D_permute_vec[4]];
-              d1 = AE_L32_X((ae_int32 *)p_inp4, 0);
-              p_inp4 += inp_stride[p_5D_permute_vec[4]];
-
-              tmp0 = AE_SEL32_HH(d0, d1);
-
-              AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out);
-            }
-            AE_SA64POS_FP(a_out, p_out);
-#pragma loop_count max=3
-            for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-            {
-              *p_out++ = *p_inp4;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  return 0;
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c
deleted file mode 100644
index b069035dc90..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_8.c
+++ /dev/null
@@ -1,232 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_nnlib_common.h"
-
-#include <string.h>
-
-/*
- * Currently only supports upto 5D input tensors.
- * 1/2/3/4 D input tensors will be scaled up to 5D.
- * For example, 2x3 -> 1x1x1x2x3.
- */
-
-WORD32 xa_nn_transpose_8_8(WORD8 * __restrict__ p_out
-                    ,const WORD32 *const p_out_shape
-                    ,const WORD8 * __restrict__ p_inp
-                    ,const WORD32 *const p_inp_shape
-                    ,const WORD32 * __restrict__ p_permute_vec
-                    ,WORD32 num_out_dims
-                    ,WORD32 num_inp_dims)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
-
-  /* Invalid input checks */
-  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
-  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
-
-  int itr = 0;
-  for(itr=0; itr < num_inp_dims; itr++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
-  }
-  for(itr=0; itr < num_out_dims; itr++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
-  }
-
-  /* Output shape provided must be correct based on input
-   * shape and permute values */
-  for(itr=0; itr < num_out_dims; itr++)
-  {
-    int output_dim = p_out_shape[itr];
-    int expected_dim = p_inp_shape[p_permute_vec[itr]];
-    XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
-  }
-
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD8), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD8), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
-
-  /* Shift all dim with 1 in the outer part */
-  int eff_output_shape[5];
-  int eff_permute_vec[5];
-
-  for(int i = 0; i < num_out_dims; i++)
-  {
-    eff_output_shape[i] = p_out_shape[i];
-    eff_permute_vec[i] = p_permute_vec[i];
-  }
-  
-  int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
-  while(one_i > 0 && non_one_i >=0){
-    while(one_i > 0 && eff_output_shape[one_i]!=1){
-      one_i--;
-    }
-    non_one_i = one_i;
-    while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
-    {
-      non_one_i--;
-    }
-    if(one_i > 0 && non_one_i >=0){
-      int temp;
-      /*swap output_shape*/
-      {
-        temp = eff_output_shape[one_i];
-        eff_output_shape[one_i] = eff_output_shape[non_one_i];
-        eff_output_shape[non_one_i] = temp;
-      }
-      /*swap permute_vec*/
-      {
-        temp = eff_permute_vec[one_i];
-        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
-        eff_permute_vec[non_one_i] = temp;
-      }
-      
-    }
-  }
-
-
-  /* Promoting lesser dim tensors to 5D tensors. 
-   * Also updating the permute_vec and shapes as needed for optimization */
-  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
-  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
-  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
-  
-  /* Check if any inner inp dimension is same in the output */
-  int last_dim_same = 1, last_n_same_dim = 0;
-  itr = num_inp_dims - 1;
-  while(itr >= 0)
-  {
-    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
-    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
-    itr--;
-  }
-  
-  int dims_added = 5 - num_inp_dims;
-  itr = num_inp_dims - 1;
-  int same_count = last_n_same_dim;
-  int count = 4;
-  while(itr >= 0)
-  {
-    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
-    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
-    same_count--;
-    itr--;
-    count = (same_count > 0) ? count : count - 1;
-  }
-  
-  itr = num_inp_dims - 1;
-  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
-  count = 4;
-  while(itr >= 0)
-  {
-    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
-    same_count--;
-    itr--;
-    count--;
-  }
-  
-  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
-  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
-  int inp_stride[5];
-
-  out_dim0 = p_5D_out_shape[0]; 
-  out_dim1 = p_5D_out_shape[1]; 
-  out_dim2 = p_5D_out_shape[2]; 
-  out_dim3 = p_5D_out_shape[3];
-  out_dim4 = p_5D_out_shape[4];
-
-  inp_dim1 = p_5D_inp_shape[1]; 
-  inp_dim2 = p_5D_inp_shape[2]; 
-  inp_dim3 = p_5D_inp_shape[3];
-  inp_dim4 = p_5D_inp_shape[4];
-
-  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
-  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
-  inp_stride[2] = inp_dim3*inp_dim4;
-  inp_stride[3] = inp_dim4;
-  inp_stride[4] = 1;
-
-  if(last_n_same_dim)
-  {
-    int itr0, itr1, itr2, itr3;
-    WORD8 *p_inp0 = (WORD8*)p_inp;
-    for(itr0 = 0; itr0 < out_dim0; itr0++)
-    {
-      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
-#pragma loop_count min=1
-      for(itr1 = 0; itr1 < out_dim1; itr1++)
-      {
-        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
-#pragma loop_count min=1
-        for(itr2 = 0; itr2 < out_dim2; itr2++)
-        {
-          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
-#pragma loop_count min=1
-          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
-          {
-            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-            memcpy(p_out, p_inp4, out_dim4);
-          }
-        }
-      }
-    }
-  }
-  else
-  {
-    int itr0, itr1, itr2, itr3, itr4;
-    WORD8 *p_inp0 = (WORD8*)p_inp;
-    for(itr0 = 0; itr0 < out_dim0; itr0++)
-    {
-      WORD8 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
-      for(itr1 = 0; itr1 < out_dim1; itr1++)
-      {
-        WORD8 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
-        for(itr2 = 0; itr2 < out_dim2; itr2++)
-        {
-          WORD8 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
-          for(itr3 = 0; itr3 < out_dim3; itr3++)
-          {
-            WORD8 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-            for(itr4 = 0; itr4 < out_dim4; itr4++)
-            {
-              WORD8 d0 = *(p_inp4);
-              p_inp4 += inp_stride[p_5D_permute_vec[4]];
-              *p_out++ = d0;
-
-            }
-          }
-        }
-      }
-    }
-  }
-
-  return 0;
-}