diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index f713d0a3227..01e11068dad 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -286,12 +286,12 @@
 - func: cadence::im2row.out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, Tensor in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::native::im2row_out
+      kernel_name: impl::HiFi::im2row_out
 
 - func: cadence::im2row.per_tensor_out(Tensor input, int[2] kernel_size, int[2] dilation, int[2] padding, int[2] stride, int in_zero_point, bool channel_last=False, *, Tensor(a!) out) -> Tensor(a!)
   kernels:
     - arg_meta: null
-      kernel_name: impl::HiFi::native::im2row_per_tensor_out
+      kernel_name: impl::HiFi::im2row_per_tensor_out
 
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
index c366cecbe0c..a7ea9f5963c 100644
--- a/backends/cadence/hifi/kernels/CMakeLists.txt
+++ b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -8,25 +8,15 @@
 add_library(
   cadence_kernels
   kernels.cpp
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_fmod_broadcast_f32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_logicalxor_bool_bool.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_im2row.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
-  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
 )
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h
index 6a3dcd1d245..ce392be8aa3 100644
--- a/backends/cadence/hifi/kernels/kernels.h
+++ b/backends/cadence/hifi/kernels/kernels.h
@@ -18,41 +18,9 @@ using executorch::runtime::Result;
 
 /* Potential NNLIB function/APIs */
 
-extern "C" WORD32 xa_nn_broadcast_32_32(
-    WORD32* __restrict__ p_out,
-    const int* const out_shape,
-    WORD32* __restrict__ p_in,
-    const int* const in_shape,
-    int num_dims);
-
-extern "C" WORD32 xa_nn_concat_32_32(
-    WORD32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const WORD32** pp_inps,
-    const WORD32* const* pp_inps_shape,
-    WORD32 num_out_dims,
-    WORD32 num_inp,
-    WORD32 num_inp_dims,
-    WORD32 axis);
-
-extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
 extern "C" void
 xa_nn_elm_atan2_f32(FLOAT32* z, const FLOAT32* y, const FLOAT32* x, WORD32 N);
 
-extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const FLOAT32* __restrict__ p_inp,
-    const FLOAT32* __restrict__ p_min,
-    const FLOAT32* __restrict__ p_max,
-    WORD32 num_elm);
-
 extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(
     FLOAT32* __restrict__ p_out,
     const WORD32* const p_out_shape,
@@ -63,14 +31,6 @@ extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(
     const FLOAT32* __restrict__ p_max,
     const WORD32* const p_max_shape);
 
-extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
 extern "C" WORD32 xa_nn_elm_div_mode_f32xf32_f32(
     FLOAT32* __restrict__ p_out,
     const FLOAT32* __restrict__ p_inp1,
@@ -87,22 +47,6 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32(
     const WORD32* const p_inp2_shape,
     WORD32 mode);
 
-extern "C" WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-    WORD8* __restrict__ p_out,
-    const FLOAT32* __restrict__ p_inp1,
-    const FLOAT32* __restrict__ p_inp2,
-    WORD32 num_elm,
-    WORD32 kernel_type);
-
-extern "C" WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-    WORD8* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape,
-    WORD32 kernel_type);
-
 extern "C" WORD32 xa_nn_elm_fmod_f32xf32_f32(
     FLOAT32* __restrict__ p_out,
     const FLOAT32* __restrict__ p_inp1,
@@ -123,42 +67,6 @@ extern "C" WORD32 xa_nn_elm_logicalxor_boolxbool_bool(
     const WORD8* __restrict__ p_inp2,
     WORD32 num_elm);
 
-extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const FLOAT32* __restrict__ p_inp1,
-    const FLOAT32* __restrict__ p_inp2,
-    WORD32 num_elm);
-
-extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
-extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const FLOAT32* __restrict__ p_inp1,
-    const FLOAT32* __restrict__ p_inp2,
-    WORD32 num_elm);
-
-extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
-extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape);
-
 extern "C" void xa_nn_elm_pow_f32(
     FLOAT32* __restrict__ z,
     const FLOAT32* __restrict__ x,
@@ -179,23 +87,6 @@ extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(
     const FLOAT32* __restrict__ p_inp2,
     const WORD32* const p_inp2_shape);
 
-extern "C" WORD32 xa_nn_elm_where_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const FLOAT32* __restrict__ p_inp1,
-    const FLOAT32* __restrict__ p_inp2,
-    const unsigned char* __restrict__ p_condition,
-    WORD32 num_elm);
-
-extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(
-    FLOAT32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const FLOAT32* __restrict__ p_inp1,
-    const WORD32* const p_inp1_shape,
-    const FLOAT32* __restrict__ p_inp2,
-    const WORD32* const p_inp2_shape,
-    const unsigned char* __restrict__ p_condition,
-    const WORD32* const p_condition_shape);
-
 extern "C" WORD32 xa_nn_im2row_quantized(
     const WORD8* __restrict__ data_im,
     const WORD32 in_zero_point,
@@ -229,60 +120,12 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32(
     WORD32 num_axis_dims,
     void* __restrict__ p_scratch_in);
 
-extern "C" WORD32 xa_nn_transpose_32_32(
-    WORD32* __restrict__ p_out,
-    const WORD32* const p_out_shape,
-    const WORD32* __restrict__ p_inp,
-    const WORD32* const p_inp_shape,
-    const WORD32* __restrict__ p_permute_vec,
-    WORD32 num_out_dims,
-    WORD32 num_inp_dims);
-
 namespace impl {
 namespace HiFi {
 namespace kernels {
 
 void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size);
 
-void memcpy(void* dst, const void* src, size_t num_bytes);
-
-WORD32 matmul_asym8uxasym8u_asym8u(
-    UWORD8* __restrict__ p_out, // output uint8 matrix
-    const UWORD8* __restrict__ p_mat1, // weight uint8 matrix
-    const UWORD8* __restrict__ p_vec1, // input uint8 matrix
-    const WORD32* __restrict__ p_bias, // bias int32 vec
-    WORD32 rows, // rows of p_mat1
-    WORD32 cols1, // columns of p_mat1
-    WORD32 row_stride1, // row stride of p_mat1
-    WORD32 vec_count, // rows of p_mat2
-    WORD32 vec_offset, // vec_offset of p_mat2.
-    WORD32 out_offset, // out_offset, i.e., offset of next output element
-    WORD32 out_stride, // out_stride, i.e., stride to go to next output row
-    WORD32 mat1_zero_bias, // zero_point of p_mat1
-    WORD32 vec1_zero_bias, // zero_point of p_vec1
-    const WORD32* __restrict__ out_multiplier,
-    const WORD32* __restrict__ out_shift,
-    WORD32 out_zero_bias,
-    bool per_channel_quantized = false); // per-channel quantized weight
-
-WORD32 xa_nn_matmul_asym8uxasym8u_asym8u(
-    UWORD8* __restrict__ p_out,
-    const UWORD8* __restrict__ p_mat1,
-    const UWORD8* __restrict__ p_mat2,
-    const WORD32* __restrict__ p_bias,
-    WORD32 rows,
-    WORD32 cols,
-    WORD32 row_stride,
-    WORD32 vec_count,
-    WORD32 vec_offset,
-    WORD32 out_offset,
-    WORD32 out_stride,
-    WORD32 mat1_zero_bias,
-    WORD32 vec1_zero_bias,
-    WORD32 out_multiplier,
-    WORD32 out_shift,
-    WORD32 out_zero_bias);
-
 template <typename T>
 T quantize(const float x, float scale, int32_t zero_point);
 
diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp
index e3d5c8914a4..6485bea1007 100644
--- a/backends/cadence/hifi/operators/op_clamp.cpp
+++ b/backends/cadence/hifi/operators/op_clamp.cpp
@@ -155,13 +155,13 @@ Tensor& clamp_Tensor_out(
           inp_shape[i + off_inp] = in.size(i);
         }
 
-        WORD32 ret_val = xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
+        WORD32 ret_val = xa_nn_elm_min_4D_Bcast_f32xf32_f32(
             out_data, out_shape, inp_data, inp_shape, max_data, max_shape);
 
         ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
 
       } else {
-        WORD32 ret_val = xa_nn_elm_minimum_f32xf32_f32(
+        WORD32 ret_val = xa_nn_elm_min_f32xf32_f32(
             out_data, inp_data, max_data, out.numel());
 
         ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
@@ -192,13 +192,13 @@ Tensor& clamp_Tensor_out(
           min_shape[i + off_min] = min.size(i);
         for (int i = 0; i < inp_dim; i++)
           inp_shape[i + off_inp] = in.size(i);
-        WORD32 ret_val = xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
+        WORD32 ret_val = xa_nn_elm_max_4D_Bcast_f32xf32_f32(
             out_data, out_shape, inp_data, inp_shape, min_data, min_shape);
 
         ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
 
       } else {
-        WORD32 ret_val = xa_nn_elm_maximum_f32xf32_f32(
+        WORD32 ret_val = xa_nn_elm_max_f32xf32_f32(
             out_data, inp_data, min_data, out.numel());
 
         ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
diff --git a/backends/cadence/hifi/operators/op_eq.cpp b/backends/cadence/hifi/operators/op_eq.cpp
index a76b910e379..6d75ab51a96 100644
--- a/backends/cadence/hifi/operators/op_eq.cpp
+++ b/backends/cadence/hifi/operators/op_eq.cpp
@@ -94,15 +94,15 @@ Tensor& eq_Tensor_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 4);
+      WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_EQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     } else {
       int num_elm = out.numel();
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-          p_out, p_inp1, p_inp2, num_elm, 4);
+      WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+          p_out, p_inp1, p_inp2, num_elm, COMPARE_EQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     }
diff --git a/backends/cadence/hifi/operators/op_ge.cpp b/backends/cadence/hifi/operators/op_ge.cpp
index 5d9111b5312..bf2e2562d73 100644
--- a/backends/cadence/hifi/operators/op_ge.cpp
+++ b/backends/cadence/hifi/operators/op_ge.cpp
@@ -94,15 +94,15 @@ Tensor& ge_Tensor_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 0);
+      WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_GREATEREQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     } else {
       int num_elm = out.numel();
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-          p_out, p_inp1, p_inp2, num_elm, 0);
+      WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+          p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATEREQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     }
diff --git a/backends/cadence/hifi/operators/op_gt.cpp b/backends/cadence/hifi/operators/op_gt.cpp
index 5995dba3bed..1e054caf8ea 100644
--- a/backends/cadence/hifi/operators/op_gt.cpp
+++ b/backends/cadence/hifi/operators/op_gt.cpp
@@ -96,15 +96,15 @@ Tensor& gt_Tensor_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 1);
+      WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_GREATER);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     } else {
       int num_elm = out.numel();
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-          p_out, p_inp1, p_inp2, num_elm, 1);
+      WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+          p_out, p_inp1, p_inp2, num_elm, COMPARE_GREATER);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     }
diff --git a/backends/cadence/hifi/operators/op_le.cpp b/backends/cadence/hifi/operators/op_le.cpp
index fb224b84369..fb2189c7b4c 100644
--- a/backends/cadence/hifi/operators/op_le.cpp
+++ b/backends/cadence/hifi/operators/op_le.cpp
@@ -95,15 +95,15 @@ Tensor& le_Tensor_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 2);
+      WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_LESSEREQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     } else {
       int num_elm = out.numel();
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-          p_out, p_inp1, p_inp2, num_elm, 2);
+      WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+          p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSEREQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     }
diff --git a/backends/cadence/hifi/operators/op_lt.cpp b/backends/cadence/hifi/operators/op_lt.cpp
index bbff9cc0aee..bfab9236964 100644
--- a/backends/cadence/hifi/operators/op_lt.cpp
+++ b/backends/cadence/hifi/operators/op_lt.cpp
@@ -93,15 +93,15 @@ Tensor& lt_Tensor_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 3);
+      WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_LESSER);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     } else {
       int num_elm = out.numel();
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-          p_out, p_inp1, p_inp2, num_elm, 3);
+      WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+          p_out, p_inp1, p_inp2, num_elm, COMPARE_LESSER);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     }
diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp
index 1882967f81a..e84fdcb7988 100644
--- a/backends/cadence/hifi/operators/op_maximum.cpp
+++ b/backends/cadence/hifi/operators/op_maximum.cpp
@@ -141,10 +141,10 @@ Tensor& maximum_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(
+      xa_nn_elm_max_4D_Bcast_f32xf32_f32(
           out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
     } else {
-      xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel());
+      xa_nn_elm_max_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
     return out;
   }
diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp
index 1f069b362fd..4385c6f6fc9 100644
--- a/backends/cadence/hifi/operators/op_minimum.cpp
+++ b/backends/cadence/hifi/operators/op_minimum.cpp
@@ -141,10 +141,10 @@ Tensor& minimum_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(
+      xa_nn_elm_min_4D_Bcast_f32xf32_f32(
           out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape);
     } else {
-      xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel());
+      xa_nn_elm_min_f32xf32_f32(out_data, a_data, b_data, out.numel());
     }
     return out;
   }
diff --git a/backends/cadence/hifi/operators/op_ne.cpp b/backends/cadence/hifi/operators/op_ne.cpp
index f183a42452a..aa782e3c0f8 100644
--- a/backends/cadence/hifi/operators/op_ne.cpp
+++ b/backends/cadence/hifi/operators/op_ne.cpp
@@ -95,15 +95,15 @@ Tensor& ne_Tensor_out(
       for (int i = 0; i < b.dim(); i++)
         inp2_shape[i + off_b] = b.size(i);
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(
-          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, 5);
+      WORD32 ret_val = xa_nn_elm_compare_broadcast_4D_f32xf32_f32(
+          p_out, out_shape, p_inp1, inp1_shape, p_inp2, inp2_shape, COMPARE_NOTEQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     } else {
       int num_elm = out.numel();
 
-      WORD32 ret_val = xa_nn_elm_greater_lesser_equal_f32xf32_f32(
-          p_out, p_inp1, p_inp2, num_elm, 5);
+      WORD32 ret_val = xa_nn_elm_compare_f32xf32_f32(
+          p_out, p_inp1, p_inp2, num_elm, COMPARE_NOTEQUAL);
 
       ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out);
     }
diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp
index af06c17f50f..c45d03841a9 100644
--- a/backends/cadence/hifi/operators/op_where.cpp
+++ b/backends/cadence/hifi/operators/op_where.cpp
@@ -114,49 +114,22 @@ Tensor& where_self_out(
       for (int i = 0; i < cond.dim(); i++)
         con_shape[i + off_c] = cond.size(i);
 
-      if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] ||
-          con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) {
-        void* p_scratch = (void*)kernels::allocate_temp_memory(
-            ctx,
-            (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) *
-                sizeof(int));
-
-        ET_KERNEL_CHECK(ctx, p_scratch != nullptr, MemoryAllocationFailed, out);
-
-        const unsigned char* p_brd_cond = (const unsigned char*)p_scratch;
-        xa_nn_broadcast_8_8(
-            (WORD8* __restrict__)p_brd_cond,
-            out_shape,
-            (const WORD8* __restrict__)con,
-            con_shape,
-            4);
-
-        for (int i = 0; i < 4; i++) {
-          con_shape[i] = out_shape[i];
-        }
-        xa_nn_elm_where_broadcast_4D_f32xf32_f32(
-            out_data,
-            out_shape,
-            a_data,
-            inp1_shape,
-            b_data,
-            inp2_shape,
-            p_brd_cond,
-            con_shape);
-
-      } else {
-        xa_nn_elm_where_broadcast_4D_f32xf32_f32(
-            out_data,
-            out_shape,
-            a_data,
-            inp1_shape,
-            b_data,
-            inp2_shape,
-            con,
-            con_shape);
-      }
+      xa_nn_elm_select_broadcast_4D_32x32_32(
+          (WORD32*)out_data,
+          out_shape,
+          (WORD32*)a_data,
+          inp1_shape,
+          (WORD32*)b_data,
+          inp2_shape,
+          con,
+          con_shape);
     } else {
-      xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel());
+      xa_nn_elm_select_32x32_32(
+          (WORD32*)out_data,
+          (WORD32*)a_data,
+          (WORD32*)b_data,
+          con,
+          out.numel());
     }
     return out;
   }
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
deleted file mode 100644
index cad3f1a25bb..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-/*
- * xa_nn_broadcast_8_8.c
- */
-
-#include "xa_nnlib_common.h"
-//#include "xa_nn_basic_state.h"
-
-#include<string.h>
-#include<stdbool.h>
-
-#include "stdio.h"
-
-/*
- * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c
- */
-
-#define NUMDIMS_MAX 8
-
-typedef struct bcast_expansion_struct_{
-    size_t load_num_elem;
-    int    replicate_loadedElm_times;
-    int    repeat_operation;
-} bcast_expansion_rule ;
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src);
-
-void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1)
-{
-  char *dest = (char *)dest1;
-  char *src = (char *)src1;
-  int n = (int)n1;
-  ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr;
-  int i;
-  void *orig_dest = dest;
-
-  if (n < 32) {
-    return memcpy(dest, src, n);
-  }
-
-  if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned
-    s_align_addr = (ae_int16x4 *) src;
-    d_align_addr = (ae_int16x4 *) dest;
-    for (i=0; i<n>>3; i++) {
-        d_align_addr[i] = s_align_addr[i];
-    }
-
-    for (i=(n&~7); i<n; i++) {
-      dest[i] = src[i];
-    }
-    return orig_dest;
-  }
-
-  if ( (((int) dest) %2) || (((int) src) %2)) { // 16-bit aligned
-    if ( (((int) dest) %2) && (((int) src) %2)) { // 16-bit aligned
-      *dest++ = *src++;
-       n--;
-    } else {
-      #if 0
-      return memcpy(dest, src, n);
-      #else
-        ae_int32x2 *pOut = (ae_int32x2 *)dest;
-        ae_int32x2 *pInp = (ae_int32x2 *)src;
-        ae_valign alignIn, alignOut;
-        alignIn = AE_LA64_PP(pInp);
-        alignOut = AE_ZALIGN64();
-        ae_int24x2 d0;
-        int Nby6 =  AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(n, 0x2AAAAAAB)));
-        int remainder_start = 6*Nby6;
-
-        for(i=0;i<Nby6;i++)
-        {
-          AE_LA24X2_IP(d0, alignIn, pInp);
-          AE_SA24X2_IP(d0, alignOut, pOut);
-        }
-        AE_SA64POS_FP(alignOut, pOut);
-        /* remainder loop */
-        for(i=remainder_start; i < n; i++){
-          dest[i] = src[i];
-      }
-      return orig_dest;
-      #endif
-    }
-  }
-  int n2 = n/2;
-  ae_valign d_align = AE_ZALIGN64();
-  d_align_addr = (ae_int16x4 *) dest;
-  s_align_addr = (ae_int16x4 *) src;
-  ae_valign s_align = AE_LA64_PP(s_align_addr);
-  ae_int16x4 t,t2;
-  for (i=0; i<n2>>3; i++) {
-      AE_LA16X4_IP(t, s_align, s_align_addr);
-      AE_LA16X4_IP(t2, s_align, s_align_addr);
-      AE_SA16X4_IP(t, d_align, d_align_addr);
-      AE_SA16X4_IP(t2, d_align, d_align_addr);
-  }
-  AE_SA64POS_FP(d_align, d_align_addr);
-  ae_int16 *s_src = (ae_int16 *) src;
-  ae_int16 *s_dest = (ae_int16 *) dest;
-  for (i=8*i; i<n2; i++) {
-    s_dest[i] = s_src[i];
-  }
-  if (n % 2) {
-    dest[n-1] = src[n-1];
-  }
-  return orig_dest;
-} /* xa_nn_memcpy */
-
-WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out,      /* pointer to write broadcasted output data to */
-        const int *const out_shape,         /* output shape resulting after broadcast */
-
-        WORD32* __restrict__ p_in,    /* pointer to unextended input data */
-        const int * const in_shape,         /* input shape */
-        int num_dims)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(out_shape, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_in, -1);
-    XA_NNLIB_ARG_CHK_PTR(in_shape, -1);
-
-    /* IO shape pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(in_shape, sizeof(WORD32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(out_shape, sizeof(WORD32), -1);
-
-    /* Check if number of dims is valid */
-    XA_NNLIB_ARG_CHK_COND(num_dims<=0 || num_dims>8, -1);
-
-    int i = 0;
-
-    /* Check for valid IO shapes */
-    for(i=0; i<num_dims; i++){
-        XA_NNLIB_CHK_COND(in_shape[i]<=0, -1);
-        XA_NNLIB_CHK_COND(out_shape[i]<=0, -1);
-    }
-
-    /* Check if input shape can be broadcasted to requested output shape */
-    for(i=0; i<num_dims; i++){
-        if(in_shape[i] != out_shape[i]){
-            /* in_shape is either same as out_shape or 1 */
-            XA_NNLIB_CHK_COND( in_shape[i] != 1, -1);
-        }
-    }
-
-    /* bcast_expansion_steps contains a sequence to steps execute for a broadcast op */
-    bcast_expansion_rule bcast_expansion_steps[NUMDIMS_MAX] = {{0}};
-
-    int k=0;
-    int dim=0;
-    const void *res=0;
-
-    int num_elem_load = 1;
-    int num_copy_times = 1;
-    int num_repeat = 1;
-
-    dim = num_dims-1;
-    while(dim>=0){
-
-        /* Find the sub-matrix size */
-        while(in_shape[dim] != 1 && dim>=0){
-            num_elem_load *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times this sub-matrix needs to be copied */
-        num_copy_times = 1;
-        while(in_shape[dim] == 1 && dim>=0){
-            num_copy_times *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times the above copy needs to be repeated */
-        num_repeat = 1;
-        while(in_shape[dim] != 1 && dim>=0){
-            num_repeat *= 1 * out_shape[dim];
-            dim--;
-        }
-
-        bcast_expansion_steps[k].load_num_elem  = num_elem_load;
-        bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times;
-        bcast_expansion_steps[k].repeat_operation = num_repeat;
-        k++;
-
-        num_elem_load = num_elem_load * num_copy_times * num_repeat;
-    }
-
-    res = broadcast_node_32(bcast_expansion_steps, num_dims-1,
-            p_out, p_in);
-    (void)res; /* Unused return value */
-
-    return 0;
-}
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src) {
-    int step_itr=0, rep_itr=0;
-    int i=0, j=0, k=0;
-    bcast_expansion_rule *step = NULL;
-
-    // ignore steps that are null
-    while(steps[step_id].repeat_operation == 0 && step_id>0){
-        step_id--;
-    }
-
-    // step is now the parent node for this iteration
-    step = &steps[step_id];
-    size_t numLoadedElm = step->load_num_elem;
-
-    WORD32 *cp_dst = dst;
-    WORD32 *cp_src = src;
-    WORD32 *cp_src_temp=NULL;
-    WORD32 *cp_dst_temp=NULL;
-
-    if(numLoadedElm>32){
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j<step->repeat_operation; j++){
-                    for(i=0; i<step->replicate_loadedElm_times; i++){
-                        xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-    else{
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    for(k=0; k<(int)numLoadedElm; k++){
-                        cp_src_temp = cp_src;
-                        cp_dst_temp = cp_dst;
-                        cp_dst_temp[k] = cp_src_temp[k];
-                    }
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j < step->repeat_operation; j++){
-                    for(i=0; i < step->replicate_loadedElm_times; i++){
-                        for(k=0; k<(int)(numLoadedElm); k++){
-                            cp_src_temp = cp_src;
-                            cp_dst_temp = cp_dst;
-                            cp_dst_temp[k] = cp_src_temp[k];
-
-                        }
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c
deleted file mode 100644
index 34a7111ee78..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c
+++ /dev/null
@@ -1,313 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-/*
- * xa_nn_broadcast_32_32.c
- */
-
-#include "xa_nnlib_common.h"
-//#include "xa_nn_basic_state.h"
-
-#include<string.h>
-#include<stdbool.h>
-
-#include "stdio.h"
-
-/*
- * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c
- */
-
-#define NUMDIMS_MAX 8
-
-typedef struct bcast_expansion_struct_{
-    size_t load_num_elem;
-    int    replicate_loadedElm_times;
-    int    repeat_operation;
-} bcast_expansion_rule ;
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src);
-
-void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1)
-{
-  char *dest = (char *)dest1;
-  char *src = (char *)src1;
-  int n = (int)n1;
-  ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr;
-  int i;
-  void *orig_dest = dest;
-
-  if (n < 32) {
-    return memcpy(dest, src, n);
-  }
-
-  if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned
-    s_align_addr = (ae_int16x4 *) src;
-    d_align_addr = (ae_int16x4 *) dest;
-    for (i=0; i<n>>3; i++) {
-        d_align_addr[i] = s_align_addr[i];
-    }
-
-    for (i=(n&~7); i<n; i++) {
-      dest[i] = src[i];
-    }
-    return orig_dest;
-  }
-
-  if ( (((int) dest) %2) || (((int) src) %2)) { // 16-bit aligned
-    if ( (((int) dest) %2) && (((int) src) %2)) { // 16-bit aligned
-      *dest++ = *src++;
-       n--;
-    } else {
-      #if 0
-      return memcpy(dest, src, n);
-      #else
-        ae_int32x2 *pOut = (ae_int32x2 *)dest;
-        ae_int32x2 *pInp = (ae_int32x2 *)src;
-        ae_valign alignIn, alignOut;
-        alignIn = AE_LA64_PP(pInp);
-        alignOut = AE_ZALIGN64();
-        ae_int24x2 d0;
-        int Nby6 =  AE_MOVAD32_H(AE_MOVINT32X2_FROMINT64(AE_MUL32_LL(n, 0x2AAAAAAB)));
-        int remainder_start = 6*Nby6;
-
-        for(i=0;i<Nby6;i++)
-        {
-          AE_LA24X2_IP(d0, alignIn, pInp);
-          AE_SA24X2_IP(d0, alignOut, pOut);
-        }
-        AE_SA64POS_FP(alignOut, pOut);
-        /* remainder loop */
-        for(i=remainder_start; i < n; i++){
-          dest[i] = src[i];
-      }
-      return orig_dest;
-      #endif
-    }
-  }
-  int n2 = n/2;
-  ae_valign d_align = AE_ZALIGN64();
-  d_align_addr = (ae_int16x4 *) dest;
-  s_align_addr = (ae_int16x4 *) src;
-  ae_valign s_align = AE_LA64_PP(s_align_addr);
-  ae_int16x4 t,t2;
-  for (i=0; i<n2>>3; i++) {
-      AE_LA16X4_IP(t, s_align, s_align_addr);
-      AE_LA16X4_IP(t2, s_align, s_align_addr);
-      AE_SA16X4_IP(t, d_align, d_align_addr);
-      AE_SA16X4_IP(t2, d_align, d_align_addr);
-  }
-  AE_SA64POS_FP(d_align, d_align_addr);
-  ae_int16 *s_src = (ae_int16 *) src;
-  ae_int16 *s_dest = (ae_int16 *) dest;
-  for (i=8*i; i<n2; i++) {
-    s_dest[i] = s_src[i];
-  }
-  if (n % 2) {
-    dest[n-1] = src[n-1];
-  }
-  return orig_dest;
-} /* xa_nn_memcpy */
-
-WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out,      /* pointer to write broadcasted output data to */
-        const int *const out_shape,         /* output shape resulting after broadcast */
-
-        WORD32* __restrict__ p_in,    /* pointer to unextended input data */
-        const int * const in_shape,         /* input shape */
-        int num_dims)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(out_shape, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_in, -1);
-    XA_NNLIB_ARG_CHK_PTR(in_shape, -1);
-
-    /* IO shape pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(in_shape, sizeof(WORD32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(out_shape, sizeof(WORD32), -1);
-
-    /* Check if number of dims is valid */
-    XA_NNLIB_ARG_CHK_COND(num_dims<=0 || num_dims>8, -1);
-
-    int i = 0;
-
-    /* Check for valid IO shapes */
-    for(i=0; i<num_dims; i++){
-        XA_NNLIB_CHK_COND(in_shape[i]<=0, -1);
-        XA_NNLIB_CHK_COND(out_shape[i]<=0, -1);
-    }
-
-    /* Check if input shape can be broadcasted to requested output shape */
-    for(i=0; i<num_dims; i++){
-        if(in_shape[i] != out_shape[i]){
-            /* in_shape is either same as out_shape or 1 */
-            XA_NNLIB_CHK_COND( in_shape[i] != 1, -1);
-        }
-    }
-
-    /* bcast_expansion_steps contains a sequence to steps execute for a broadcast op */
-    bcast_expansion_rule bcast_expansion_steps[NUMDIMS_MAX] = {{0}};
-
-    int k=0;
-    int dim=0;
-    const void *res=0;
-
-    int num_elem_load = 1;
-    int num_copy_times = 1;
-    int num_repeat = 1;
-
-    dim = num_dims-1;
-    while(dim>=0){
-
-        /* Find the sub-matrix size */
-        while(in_shape[dim] != 1 && dim>=0){
-            num_elem_load *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times this sub-matrix needs to be copied */
-        num_copy_times = 1;
-        while(in_shape[dim] == 1 && dim>=0){
-            num_copy_times *= out_shape[dim];
-            dim--;
-        }
-
-        /* Find the number of times the above copy needs to be repeated */
-        num_repeat = 1;
-        while(in_shape[dim] != 1 && dim>=0){
-            num_repeat *= 1 * out_shape[dim];
-            dim--;
-        }
-
-        bcast_expansion_steps[k].load_num_elem  = num_elem_load;
-        bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times;
-        bcast_expansion_steps[k].repeat_operation = num_repeat;
-        k++;
-
-        num_elem_load = num_elem_load * num_copy_times * num_repeat;
-    }
-
-    res = broadcast_node_32(bcast_expansion_steps, num_dims-1,
-            p_out, p_in);
-    (void)res; /* Unused return value */
-
-    return 0;
-}
-
-WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id,
-        WORD32 *dst, WORD32 *src) {
-    int step_itr=0, rep_itr=0;
-    int i=0, j=0, k=0;
-    bcast_expansion_rule *step = NULL;
-
-    // ignore steps that are null
-    while(steps[step_id].repeat_operation == 0 && step_id>0){
-        step_id--;
-    }
-
-    // step is now the parent node for this iteration
-    step = &steps[step_id];
-    size_t numLoadedElm = step->load_num_elem;
-
-    WORD32 *cp_dst = dst;
-    WORD32 *cp_src = src;
-    WORD32 *cp_src_temp=NULL;
-    WORD32 *cp_dst_temp=NULL;
-
-    if(numLoadedElm>32){
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j<step->repeat_operation; j++){
-                    for(i=0; i<step->replicate_loadedElm_times; i++){
-                        xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm);
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-    else{
-        if(step_id > 0){
-            for(step_itr=0; step_itr<step->repeat_operation; step_itr++){
-                src = broadcast_node_32(steps, step_id-1, dst, src);
-                cp_src = dst;
-                cp_dst = dst + numLoadedElm;
-                for(rep_itr=1; rep_itr<step->replicate_loadedElm_times; rep_itr++){
-                    for(k=0; k<(int)numLoadedElm; k++){
-                        cp_src_temp = cp_src;
-                        cp_dst_temp = cp_dst;
-                        cp_dst_temp[k] = cp_src_temp[k];
-                    }
-                    cp_dst += numLoadedElm;
-                }
-                dst = cp_dst;
-            }
-            return src;
-        } else {
-            if(numLoadedElm == 1){
-                for(j=0; j<step->repeat_operation; j++){
-//                    memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times);
-                	for(i = 0; i < step->replicate_loadedElm_times; i++)
-                		cp_dst[i] = cp_src[0];
-                    cp_dst += step->replicate_loadedElm_times;
-                    cp_src++;
-                }
-            } else {
-                for(j=0; j < step->repeat_operation; j++){
-                    for(i=0; i < step->replicate_loadedElm_times; i++){
-                        for(k=0; k<(int)(numLoadedElm); k++){
-                            cp_src_temp = cp_src;
-                            cp_dst_temp = cp_dst;
-                            cp_dst_temp[k] = cp_src_temp[k];
-
-                        }
-                        cp_dst += numLoadedElm;
-                    }
-                    cp_src += numLoadedElm;
-                }
-            }
-            return cp_src;
-        }
-    }
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
deleted file mode 100644
index 3b73e30db42..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c
+++ /dev/null
@@ -1,195 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-
-
-#include "xa_type_def.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_kernels_api.h"
-#include "xa_nnlib_common_macros.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_common.h"
-
-WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out
-                        ,const WORD32 *const p_out_shape
-                        ,const WORD32 **pp_inps
-                        ,const WORD32 *const *pp_inps_shape
-                        ,WORD32 num_out_dims
-                        ,WORD32 num_inp
-                        ,WORD32 num_inp_dims
-                        ,WORD32 axis)
-{
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(pp_inps, -1);
-  XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1);
-  //Validate Arguments
-  XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1);
-  XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1);
-  XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1);
-  XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1);
-
-  int i = 0, j = 0;
-  for(i = 0; i < num_out_dims; i++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1);
-  }
-
-  if(axis < 0)
-    axis = num_out_dims + axis;
-
-  WORD32 concat_size = 0;
-  for (i = 0; i < num_inp; i++)
-  {
-    XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1);
-    XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1);
-    XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1);
-#pragma loop_count min=1
-    for(j = 0; j < num_out_dims; j++)
-    {
-      XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1);
-    }
-    
-    XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1);
-    concat_size += pp_inps_shape[i][axis];
-  }
-
-  XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1);
-
-  //Calculate outer and inner size for axis
-  WORD32 outer_size = 1;
-#pragma no_simd
-  for(int i = 0; i < axis; i++)
-  {
-    outer_size *= p_out_shape[i];
-  }
-
-  WORD32 base_inner_size = 1;
-#pragma no_simd
-  for(int i = axis + 1; i < num_out_dims; i++)
-  {
-    base_inner_size *= p_out_shape[i];
-  }
-
-  WORD32 *ptmp_out = p_out;
-  for(int i = 0; i < num_inp; i++)
-  {
-    const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size;
-    WORD32 *output_ptr = ptmp_out;
-    const WORD32* input_ptr = pp_inps[i];
-
-    if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0)
-      && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0))
-    {
-      if(copy_size <= 8)
-      {
-        const ae_f32 *pae_inp = (const ae_f32 *)input_ptr;
-        for(int k = 0; k < outer_size; k++)
-        {
-          ae_f32 *pae_out = (ae_f32 *)output_ptr;
-#pragma concurrent
-#pragma no_simd
-          for(int ic = 0; ic < copy_size; ic++)
-          {
-            *pae_out++ = *pae_inp++;
-          }
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-      else
-      {
-        for(int k = 0; k < outer_size; k++)
-        {
-          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
-          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
-          ae_valign inp_a, out_a;
-          inp_a = AE_LA64_PP(pae_inp);
-          out_a = AE_ZALIGN64();
-          for(int ic = 0; ic < (copy_size >> 1); ic++)
-          {
-            ae_int32x2 d0;
-            AE_LA32X2_IP(d0, inp_a, pae_inp);
-            AE_SA32X2_IP(d0, out_a, pae_out);
-          }
-          AE_SA64POS_FP(out_a, pae_out);
-          const ae_f32 *puae_inp = (const ae_f32 *)pae_inp;
-          ae_f32 *puae_out = (ae_f32 *)pae_out;
-#pragma concurrent
-          for(int ic = 0; ic < (copy_size & 1); ic++)
-          {
-            puae_out[copy_size - 1] = puae_inp[copy_size - 1];
-          }
-          input_ptr += copy_size;
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-    }
-    else
-    {
-      if(copy_size <= 6)
-      {
-        for(int k = 0; k < outer_size; k++)
-        {
-#pragma concurrent
-#pragma no_unroll
-          for(int ic = 0; ic < copy_size; ic++)
-          {
-            output_ptr[ic] = *input_ptr++;
-          }
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-      else
-      {
-        for(int k = 0; k < outer_size; k++)
-        {
-          const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr;
-          ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr;
-          ae_valign inp_a, out_a;
-          inp_a = AE_LA64_PP(pae_inp);
-          out_a = AE_ZALIGN64();
-
-#pragma concurrent
-          for(int ic = 0; ic < copy_size >> 1; ic++)
-          {
-            ae_int32x2 d0;
-            AE_LA32X2_IP(d0, inp_a, pae_inp);
-            AE_SA32X2_IP(d0, out_a, pae_out);
-          }
-          AE_SA64POS_FP(out_a, pae_out);
-          
-          for(int ic = 0; ic < (copy_size & 1); ic++)
-          {
-            output_ptr[copy_size - 1] = input_ptr[copy_size - 1];
-          }
-          input_ptr += copy_size;
-          output_ptr += concat_size * base_inner_size;
-        }
-      }
-    }
-    ptmp_out += copy_size;
-  }
-  return 0;
-}
\ No newline at end of file
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
deleted file mode 100644
index 2a18d57e99f..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c
+++ /dev/null
@@ -1,426 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-
-#if HAVE_VFPU
-static void internal_elm_add_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2;
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-
-  /* For computing inp2 + inp1 */
-  if(sign_flag){
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_ADD_SX2(x2, x1);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_ADD_SX2(x2, x1);
-          XT_SASX2IP(y, out_a, p_c);
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_ADD_S(b0, a0);
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }
-    }
-  }
-  /* For computing inp1 + inp2 */
-  else
-  {
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_ADD_SX2(x1, x2);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_ADD_SX2(x1, x2);
-          XT_SASX2IP(y, out_a, p_c);
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_ADD_S(a0, b0);
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }
-    }
-  }
-}
-
-static void internal_elm_add_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2;
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-
-  /* For computing inp2 + inp1 */
-  if(sign_flag){
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_ADD_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_ADD_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_ADD_S(x2, a0_7);
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-  /* For computing inp1 + inp2 */
-  else
-  {
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_ADD_SX2(x1, x2);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32));
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_ADD_SX2(x1, x2);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_ADD_S(a0_7, x2);
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-}
-#endif
-
-WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_add_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_add_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_add_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_add_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3],
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-
-}
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
index 3d8106eead6..db7154610d3 100644
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
+++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c
@@ -25,98 +25,21 @@
 #include "xa_nnlib_err_chk.h"
 #include "xa_nnlib_kernels_api.h"
 
-
 #if !HAVE_VFPU
 DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_clamp_f32xf32xf32_f32,
+             WORD32, xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32,
              (
-                FLOAT32 *p_out,
-                const FLOAT32 *p_inp,
-                const FLOAT32 *p_min,
-                const FLOAT32 *p_max,
-                WORD32 num_elm
+                  FLOAT32 * __restrict__ p_out,
+                  const WORD32 *const p_out_shape,
+                  const FLOAT32 * __restrict__ p_inp,
+                  const WORD32 *const p_inp_shape,
+                  const FLOAT32 * __restrict__ p_min,
+                  const WORD32 *const p_min_shape,
+                  const FLOAT32 * __restrict__ p_max,
+                  const WORD32 *const p_max_shape
               )
            )
 #else
-WORD32 xa_nn_elm_clamp_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out,
-                               const FLOAT32 * __restrict__ p_inp,
-                               const FLOAT32 * __restrict__ p_min,
-                               const FLOAT32 * __restrict__ p_max,
-                               WORD32 num_elm)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_min, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_max, -1);
-    /* Pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_min, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_max, sizeof(FLOAT32), -1);
-    /* Basic Parameter checks */
-    XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1);
-
-    int i;
-    xtfloatx2 *inp = (xtfloatx2 *)p_inp;
-    xtfloatx2 *min = (xtfloatx2 *)p_min;
-    xtfloatx2 *max = (xtfloatx2 *)p_max;
-    xtfloatx2 *out =  (xtfloatx2 *)p_out;
-
-    xtfloatx2 x1, d_min, d_max, y;
-
-    if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp)&7) == 0) && ((((unsigned)p_min)&7) == 0) && ((((unsigned)p_max)&7) == 0))
-    {
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LSX2IP(x1, inp, 2*sizeof(FLOAT32));
-            XT_LSX2IP(d_min, min, 2*sizeof(FLOAT32));
-            XT_LSX2IP(d_max, max, 2*sizeof(FLOAT32));
-
-            y = XT_MAX_SX2(x1, d_min);
-            y = XT_MIN_SX2(y, d_max);
-
-            XT_SSX2IP( y, out,  2*sizeof(FLOAT32));
-        }
-    }
-    else
-    {
-        ae_valign inp_a, min_a, max_a, out_a;
-
-        inp_a = XT_LASX2PP(inp);
-        min_a = XT_LASX2PP(min);
-        max_a = XT_LASX2PP(max);
-        out_a = AE_ZALIGN64();
-        /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LASX2IP(x1, inp_a, inp);
-            XT_LASX2IP(d_min, min_a, min);
-            XT_LASX2IP(d_max, max_a, max);
-
-            y = XT_MAX_SX2(x1, d_min);
-            y = XT_MIN_SX2(y, d_max);
-
-            XT_SASX2IP(y, out_a, out);
-        }
-        XT_SASX2POSFP(out_a, out);
-    }
-    // Remainder Loop
-    if (num_elm & 1)
-    {
-        xtfloat a1, a2, a3, a;
-        XT_LSIP(a1, (xtfloat *)inp, 0);
-        XT_LSIP(a2, (xtfloat *)min, 0);
-        XT_LSIP(a3, (xtfloat *)max, 0);
-        a = XT_MAX_S(a1, a2); 
-        a = XT_MIN_S(a, a3); 
-        XT_SSI(a, (xtfloat *)out, 0);
-    }
-    return 0;
-}
-
 static void internal_elm_clamp_broadcast_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out,
                     const    FLOAT32 * __restrict__ p_min,
                     const    FLOAT32 * __restrict__ p_max,
@@ -794,4 +717,4 @@ WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(FLOAT32 * __restrict__ p_out
   }
   return 0;
 }
-#endif
+#endif
\ No newline at end of file
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
deleted file mode 100644
index 16fc23f59de..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
+++ /dev/null
@@ -1,441 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-//#include "xa_nn_basic_state.h"
-#include "xa_nnlib_kernels_api.h"
-
-#if HAVE_VFPU
-static void internal_elm_div_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-
-  /* For computing inp2 - inp1 */   
-  if(sign_flag){  
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_DIV_SX2(x2, x1);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_DIV_SX2(x2, x1);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_DIV_S(b0, a0);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }
-  }
-  /* For computing inp1 - inp2 */   
-  else
-  {
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          y = XT_DIV_SX2(x1, x2);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          y = XT_DIV_SX2(x1, x2);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-        XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-        c0 = XT_DIV_S(a0, b0);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }  
-  }
-}
-
-static void internal_elm_div_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  /* For computing inp2 - inp1 */      
-  if(sign_flag){
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_DIV_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();      
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_DIV_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-    }  
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_DIV_S(x2, a0_7);   
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-  /* For computing inp1 - inp2 */   
-  else
-  {
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        y = XT_DIV_SX2(x1, x2);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();       
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        y = XT_DIV_SX2(x1, x2);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      out = XT_DIV_S(a0_7, x2);   
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }    
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_div_broadcast_4D_f32xf32_f32,
-             (
-                      FLOAT32 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape
-              )
-           )
-#else           
-WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_div_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_div_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_div_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_div_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
deleted file mode 100644
index 7d95e536c9e..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c
+++ /dev/null
@@ -1,845 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nnlib_err_chk.h"
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_maximum_f32xf32_f32,
-             (
-                FLOAT32 *p_out,
-                const FLOAT32 *p_inp1,
-                const FLOAT32 *p_inp2,
-                WORD32 num_elm
-              )
-           )
-#else
-WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                               const FLOAT32 * __restrict__ p_inp1,
-                               const FLOAT32 * __restrict__ p_inp2,
-                               WORD32 num_elm)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-    /* Pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-    /* Basic Parameter checks */
-    XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1);
-
-    int i;
-    xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1;
-    xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2;
-    xtfloatx2 *out =  (xtfloatx2 *)p_out;
-    xtfloatx2 x1, x2, y;
-    unsigned char con1, con2;
-    xtbool2 con = int32_rtor_xtbool2(0x00000003);
-
-    if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-    {
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-            y = XT_MAX_SX2(x2, x1);
-            XT_SSX2IP( y, out,  2*sizeof(FLOAT32));
-        }
-    }
-    else
-    {
-        ae_valign inp1_a, inp2_a, out_a;
-
-        inp1_a = XT_LASX2PP(inp1);
-        inp2_a = XT_LASX2PP(inp2);
-        out_a = AE_ZALIGN64();
-        /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            y = XT_MAX_SX2(x2, x1);
-            XT_SASX2IP(y, out_a, out);
-        }
-        XT_SASX2POSFP(out_a, out);
-    }
-    // Remainder Loop
-    if (num_elm & 1)
-    {
-        xtfloat a1, a2, a;
-        XT_LSIP(a1, (xtfloat *)inp1, 0);
-        XT_LSIP(a2, (xtfloat *)inp2, 0);
-        a = XT_MAX_S(a1, a2);   
-        XT_SSI(a, (xtfloat *)out, 0);
-    }
-    return 0;
-}
-#endif
-
-#if HAVE_VFPU
-static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
- 
-  for(i = 0; i < out_lc; i++)
-  {
-    p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-    p_b = (xtfloatx2 *)p_inp2;
-    p_c = (xtfloatx2 *)&p_out[i * in_lc];
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-        y = XT_MAX_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-      vinp1 = XT_LASX2PP(p_a);
-      vinp2 = XT_LASX2PP(p_b);
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LASX2IP(x1, vinp1, p_a);
-        XT_LASX2IP(x2, vinp2, p_b);
-        y = XT_MAX_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c); 
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-      XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-      c0 = XT_MAX_S(b0, a0);   
-      XT_SSI(c0, (xtfloat *)p_c, 0);
-    }
-  }
-}
-
-static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-  {
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-      y = XT_MAX_SX2(x2, x1);
-      XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-    }
-  }
-  else
-  {
-    ae_valign inp1_a, out_a;
-    inp1_a = XT_LASX2PP(p_a);
-    out_a = AE_ZALIGN64();      
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LASX2IP(x1, inp1_a, p_a);
-      y = XT_MAX_SX2(x2, x1);
-      XT_SASX2IP(y, out_a, p_c);
-    }
-    XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-  }  
-  if(num_scalar_ops !=0)
-  {
-    XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-    out = XT_MAX_S(x2, a0_7);   
-    XT_SSI(out, (xtfloat *)p_c, 0);
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_maximum_broadcast_4D_f32xf32_f32,
-             (
-                      FLOAT32 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape
-              )
-           )
-#else           
-WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_maximum_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_maximum_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_maximum_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_maximum_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_minimum_f32xf32_f32,
-             (
-                FLOAT32 *p_out,
-                const FLOAT32 *p_inp1,
-                const FLOAT32 *p_inp2,
-                WORD32 num_elm
-              )
-           )
-#else
-WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                               const FLOAT32 * __restrict__ p_inp1,
-                               const FLOAT32 * __restrict__ p_inp2,
-                               WORD32 num_elm)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-    /* Pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-    /* Basic Parameter checks */
-    XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1);
-
-    int i;
-    xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1;
-    xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2;
-    xtfloatx2 *out =  (xtfloatx2 *)p_out;
-    xtfloatx2 x1, x2, y;
-    unsigned char con1, con2;
-    xtbool2 con = int32_rtor_xtbool2(0x00000003);
-
-    if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-    {
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-            y = XT_MIN_SX2(x2, x1);
-            XT_SSX2IP( y, out,  2*sizeof(FLOAT32));
-        }
-    }
-    else
-    {
-        ae_valign inp1_a, inp2_a, out_a;
-
-        inp1_a = XT_LASX2PP(inp1);
-        inp2_a = XT_LASX2PP(inp2);
-        out_a = AE_ZALIGN64();
-        /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            y = XT_MIN_SX2(x2, x1);
-            XT_SASX2IP(y, out_a, out);
-        }
-        XT_SASX2POSFP(out_a, out);
-    }
-    // Remainder Loop
-    if (num_elm & 1)
-    {
-        xtfloat a1, a2, a;
-        XT_LSIP(a1, (xtfloat *)inp1, 0);
-        XT_LSIP(a2, (xtfloat *)inp2, 0);
-        a = XT_MIN_S(a1, a2);   
-        XT_SSI(a, (xtfloat *)out, 0);
-    }
-    return 0;
-}
-#endif
-
-#if HAVE_VFPU
-static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
- 
-  for(i = 0; i < out_lc; i++)
-  {
-    p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-    p_b = (xtfloatx2 *)p_inp2;
-    p_c = (xtfloatx2 *)&p_out[i * in_lc];
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-        y = XT_MIN_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-      vinp1 = XT_LASX2PP(p_a);
-      vinp2 = XT_LASX2PP(p_b);
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LASX2IP(x1, vinp1, p_a);
-        XT_LASX2IP(x2, vinp2, p_b);
-        y = XT_MIN_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c); 
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-      XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-      c0 = XT_MIN_S(b0, a0);   
-      XT_SSI(c0, (xtfloat *)p_c, 0);
-    }
-  }
-}
-
-static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-  {
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-      y = XT_MIN_SX2(x2, x1);
-      XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-    }
-  }
-  else
-  {
-    ae_valign inp1_a, out_a;
-    inp1_a = XT_LASX2PP(p_a);
-    out_a = AE_ZALIGN64();      
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LASX2IP(x1, inp1_a, p_a);
-      y = XT_MIN_SX2(x2, x1);
-      XT_SASX2IP(y, out_a, p_c);
-    }
-    XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-  }  
-  if(num_scalar_ops !=0)
-  {
-    XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-    out = XT_MIN_S(x2, a0_7);   
-    XT_SSI(out, (xtfloat *)p_c, 0);
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_minimum_broadcast_4D_f32xf32_f32,
-             (
-                      FLOAT32 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape
-              )
-           )
-#else           
-WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_minimum_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_minimum_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_minimum_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_minimum_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
deleted file mode 100644
index e11fccbba52..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
+++ /dev/null
@@ -1,359 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-
-#if HAVE_VFPU
-static void internal_elm_mul_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
- 
-  for(i = 0; i < out_lc; i++)
-  {
-    p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-    p_b = (xtfloatx2 *)p_inp2;
-    p_c = (xtfloatx2 *)&p_out[i * in_lc];
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-        y = XT_MUL_SX2(x2, x1);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-      vinp1 = XT_LASX2PP(p_a);
-      vinp2 = XT_LASX2PP(p_b);
-      for(j = 0; j < num_simd2_ops; j++)
-      {
-        XT_LASX2IP(x1, vinp1, p_a);
-        XT_LASX2IP(x2, vinp2, p_b);
-        y = XT_MUL_SX2(x2, x1);
-        XT_SASX2IP(y, out_a, p_c); 
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-      XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-      c0 = XT_MUL_S(b0, a0);   
-      XT_SSI(c0, (xtfloat *)p_c, 0);
-    }
-  }
-}
-
-static void internal_elm_mul_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-  {
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-      y = XT_MUL_SX2(x2, x1);
-      XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-    }
-  }
-  else
-  {
-    ae_valign inp1_a, out_a;
-    inp1_a = XT_LASX2PP(p_a);
-    out_a = AE_ZALIGN64();      
-    for(i=0; i<num_simd2_ops; i++)
-    {
-      XT_LASX2IP(x1, inp1_a, p_a);
-      y = XT_MUL_SX2(x2, x1);
-      XT_SASX2IP(y, out_a, p_c);
-    }
-    XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-  }  
-  if(num_scalar_ops !=0)
-  {
-    XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-    out = XT_MUL_S(x2, a0_7);   
-    XT_SSI(out, (xtfloat *)p_c, 0);
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_mul_broadcast_4D_f32xf32_f32,
-             (
-                      FLOAT32 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape
-              )
-           )
-#else           
-WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  FLOAT32 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_mul_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_mul_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_mul_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_mul_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
deleted file mode 100644
index 840a027f7a7..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
+++ /dev/null
@@ -1,838 +0,0 @@
-/*******************************************************************************
-* Copyright (c) 2018-2024 Cadence Design Systems, Inc.
-*
-* Permission is hereby granted, free of charge, to any person obtaining
-* a copy of this software and associated documentation files (the
-* "Software"), to use this Software with Cadence processor cores only and
-* not with any other processors and platforms, subject to
-* the following conditions:
-*
-* The above copyright notice and this permission notice shall be included
-* in all copies or substantial portions of the Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
-* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
-* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
-* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-******************************************************************************/
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_where_f32xf32_f32,
-             (
-                FLOAT32 *p_out,
-                const FLOAT32 *p_inp1,
-                const FLOAT32 *p_inp2,
-                const unsigned char *__restrict__ condition,
-                WORD32 num_elm
-              )
-           )
-#else
-WORD32 xa_nn_elm_where_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                               const FLOAT32 * __restrict__ p_inp1,
-                               const FLOAT32 * __restrict__ p_inp2,
-                               const unsigned char *__restrict__ p_condition,
-                               WORD32 num_elm)
-{
-
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-    /* Pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-    /* Basic Parameter checks */
-    XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1);
-
-    int i;
-    xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1;
-    xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2;
-    xtfloatx2 *out =  (xtfloatx2 *)p_out;
-    unsigned char *condition = p_condition;
-    xtfloatx2 x1, x2, y;
-    unsigned char con1, con2;
-    xtbool2 con = int32_rtor_xtbool2(0x00000003);
-
-    if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-    {
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-            con1 = XT_L8UI(condition, 0);
-            condition++;
-            con2 = XT_L8UI(condition, 0);
-            condition++;
-            con = AE_MOVBA1X2(con1, con2);
-            XT_MOVT_SX2 (y, x1, con);
-            XT_MOVF_SX2 (y, x2, con);
-            XT_SSX2IP( y, out,  2*sizeof(FLOAT32));
-        }
-    }
-    else
-    {
-        ae_valign inp1_a, inp2_a, out_a;
-
-        inp1_a = XT_LASX2PP(inp1);
-        inp2_a = XT_LASX2PP(inp2);
-        out_a = AE_ZALIGN64();
-        /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-        for(i=0;i < num_elm>>1;i++)
-        {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            con1 = XT_L8UI(condition, 0);
-            condition++;
-            con2 = XT_L8UI(condition, 0);
-            condition++;
-            con = AE_MOVBA1X2(con1, con2);
-            XT_MOVT_SX2 (y, x1, con);
-            XT_MOVF_SX2 (y, x2, con);
-            XT_SASX2IP(y, out_a, out);
-        }
-        XT_SASX2POSFP(out_a, out);
-    }
-    // Remainder Loop
-    if (num_elm & 1)
-    {
-        xtfloat a1, a2, a;
-        con1 = XT_L8UI(condition, 0);
-        xtbool s = AE_MOVBA(con1);
-        XT_LSIP(a1, (xtfloat *)inp1, 0);
-        XT_LSIP(a2, (xtfloat *)inp2, 0);
-        XT_MOVT_S(a, a1, s);
-        XT_MOVF_S(a, a2, s);
-        XT_SSI(a, (xtfloat *)out, 0);
-    }
-    return 0;
-}
-
-static void internal_elm_where_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                    const    unsigned char * __restrict__ p_condition,
-                             WORD32  num_elm,
-                             xtbool  sign_flag)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-  unsigned char *condition = p_condition;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-
-  unsigned char con1, con2;
-  xtbool2 con = int32_rtor_xtbool2(0x00000003);
-
-  /* For out = condition ? inp2 :inp1 */
-  if(sign_flag){
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        con1 = XT_L8UI(condition, 0);
-        condition++;
-        con2 = XT_L8UI(condition, 0);
-        condition++;
-        con = AE_MOVBA1X2(con1, con2);
-        XT_MOVT_SX2 (y, x2, con);
-        XT_MOVF_SX2 (y, x1, con);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();      
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        con1 = XT_L8UI(condition, 0);
-        condition++;
-        con2 = XT_L8UI(condition, 0);
-        condition++;
-        con = AE_MOVBA1X2(con1, con2);
-        XT_MOVT_SX2 (y, x2, con);
-        XT_MOVF_SX2 (y, x1, con);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);   
-    }  
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      con1 = XT_L8UI(condition, 0);
-      xtbool s = AE_MOVBA(con1);
-      XT_MOVT_S(out, x2, s);
-      XT_MOVF_S(out, a0_7, s);  
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-  }
-  /* For out = condition ? inp1 :inp2 */
-  else
-  {
-    if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-        con1 = XT_L8UI(condition, 0);
-        condition++;
-        con2 = XT_L8UI(condition, 0);
-        condition++;
-        con = AE_MOVBA1X2(con1, con2);
-        XT_MOVT_SX2 (y, x1, con);
-        XT_MOVF_SX2 (y, x2, con);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign inp1_a, out_a;
-      inp1_a = XT_LASX2PP(p_a);
-      out_a = AE_ZALIGN64();       
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        XT_LASX2IP(x1, inp1_a, p_a);
-        con1 = XT_L8UI(condition, 0);
-        condition++;
-        con2 = XT_L8UI(condition, 0);
-        condition++;
-        con = AE_MOVBA1X2(con1, con2);
-        XT_MOVT_SX2 (y, x1, con);
-        XT_MOVF_SX2 (y, x2, con);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-      con1 = XT_L8UI(condition, 0);
-      xtbool s = AE_MOVBA(con1);
-      XT_MOVT_S(out, a0_7, s);
-      XT_MOVF_S(out, x2, s);    
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }    
-  }
-}
-
-static void internal_elm_where_broadcast_both_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                    const    unsigned char * __restrict__ p_condition,
-                             WORD32  num_elm)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-  unsigned char *condition = p_condition;
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-  x1 = XT_LSI((xtfloat *)p_a, 0);
-
-  unsigned char con1, con2;
-  xtbool2 con = int32_rtor_xtbool2(0x00000003);
-
-    if((((unsigned)p_c)&7) == 0)
-    {
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        con1 = XT_L8UI(condition, 0);
-        condition++;
-        con2 = XT_L8UI(condition, 0);
-        condition++;
-        con = AE_MOVBA1X2(con1, con2);
-        XT_MOVT_SX2 (y, x1, con);
-        XT_MOVF_SX2 (y, x2, con);
-        XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-      }
-    }
-    else
-    {
-      ae_valign out_a;
-      out_a = AE_ZALIGN64();       
-      for(i=0; i<num_simd2_ops; i++)
-      {
-        con1 = XT_L8UI(condition, 0);
-        condition++;
-        con2 = XT_L8UI(condition, 0);
-        condition++;
-        con = AE_MOVBA1X2(con1, con2);
-        XT_MOVT_SX2 (y, x1, con);
-        XT_MOVF_SX2 (y, x2, con);
-        XT_SASX2IP(y, out_a, p_c);
-      }
-      XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-    }
-    if(num_scalar_ops !=0)
-    {
-      con1 = XT_L8UI(condition, 0);
-      xtbool s = AE_MOVBA(con1);
-      XT_MOVT_S(out, x1, s);
-      XT_MOVF_S(out, x2, s);    
-      XT_SSI(out, (xtfloat *)p_c, 0);
-    }
-}
-
-static void internal_elm_where_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                    const    unsigned char * __restrict__ p_condition,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-  unsigned char *condition = p_condition;
-  
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-    unsigned char con1, con2;
-    xtbool2 con = int32_rtor_xtbool2(0x00000003);
-  /* For out = condition ? inp2 :inp1 */   
-  if(sign_flag){  
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      condition = &p_condition[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          con1 = XT_L8UI(condition, 0);
-          condition++;
-          con2 = XT_L8UI(condition, 0);
-          condition++;
-          con = AE_MOVBA1X2(con1, con2);
-          XT_MOVT_SX2 (y, x2, con);
-          XT_MOVF_SX2 (y, x1, con);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          con1 = XT_L8UI(condition, 0);
-          condition++;
-          con2 = XT_L8UI(condition, 0);
-          condition++;
-          con = AE_MOVBA1X2(con1, con2);
-          XT_MOVT_SX2 (y, x2, con);
-          XT_MOVF_SX2 (y, x1, con);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, 0);
-        XT_LSIP(b0, (xtfloat *)p_b, 0);
-        con1 = XT_L8UI(condition, 0);
-        xtbool s = AE_MOVBA(con1);
-        XT_MOVT_S(c0, b0, s);
-        XT_MOVF_S(c0, a0, s);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }
-  }
-  /* For out = condition ? inp1 :inp2 */
-  else
-  {
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      condition = &p_condition[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          con1 = XT_L8UI(condition, 0);
-          condition++;
-          con2 = XT_L8UI(condition, 0);
-          condition++;
-          con = AE_MOVBA1X2(con1, con2);
-          XT_MOVT_SX2 (y, x1, con);
-          XT_MOVF_SX2 (y, x2, con);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          con1 = XT_L8UI(condition, 0);
-          condition++;
-          con2 = XT_L8UI(condition, 0);
-          condition++;
-          con = AE_MOVBA1X2(con1, con2);
-          XT_MOVT_SX2 (y, x1, con);
-          XT_MOVF_SX2 (y, x2, con);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, 0);
-        XT_LSIP(b0, (xtfloat *)p_b, 0);
-        con1 = XT_L8UI(condition, 0);
-        xtbool s = AE_MOVBA(con1);
-        XT_MOVT_S(c0, a0, s);
-        XT_MOVF_S(c0, b0, s);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }  
-  }
-}
-
-static void internal_elm_where_broadcast_both_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                    const    unsigned char * __restrict__ p_condition,
-                             WORD32  out_lc,
-                             WORD32  in_lc)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  xtfloatx2  *__restrict__  p_c =  (xtfloatx2 *)p_out;
-  unsigned char *condition = p_condition;
-  
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-    unsigned char con1, con2;
-    xtbool2 con = int32_rtor_xtbool2(0x00000003);
-
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)p_inp1;
-      p_b = (xtfloatx2 *)p_inp2;
-      p_c = (xtfloatx2 *)&p_out[i * in_lc];
-      condition = &p_condition[i * in_lc];
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32));
-          con1 = XT_L8UI(condition, 0);
-          condition++;
-          con2 = XT_L8UI(condition, 0);
-          condition++;
-          con = AE_MOVBA1X2(con1, con2);
-          XT_MOVT_SX2 (y, x1, con);
-          XT_MOVF_SX2 (y, x2, con);
-          XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); 
-        }
-      }
-      else
-      {
-        ae_valign vinp1, vinp2, out_a = AE_ZALIGN64();
-        vinp1 = XT_LASX2PP(p_a);
-        vinp2 = XT_LASX2PP(p_b);
-
-        for(j = 0; j < num_simd2_ops; j++)
-        {
-          XT_LASX2IP(x1, vinp1, p_a);
-          XT_LASX2IP(x2, vinp2, p_b);
-          con1 = XT_L8UI(condition, 0);
-          condition++;
-          con2 = XT_L8UI(condition, 0);
-          condition++;
-          con = AE_MOVBA1X2(con1, con2);
-          XT_MOVT_SX2 (y, x1, con);
-          XT_MOVF_SX2 (y, x2, con);
-          XT_SASX2IP(y, out_a, p_c); 
-        }
-        XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c);
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0, (xtfloat *)p_a, 0);
-        XT_LSIP(b0, (xtfloat *)p_b, 0);
-        con1 = XT_L8UI(condition, 0);
-        xtbool s = AE_MOVBA(con1);
-        XT_MOVT_S(c0, a0, s);
-        XT_MOVF_S(c0, b0, s);   
-        XT_SSI(c0, (xtfloat *)p_c, 0);
-      }      
-    }  
-}
-
-WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape,
-                      const unsigned char *__restrict__ p_condition,
-                      const WORD32 *const p_condition_shape
-                      )
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_condition, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_condition_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_condition, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_condition_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i]) && ((p_inp1_shape[i] != 1) && (p_inp2_shape[i] != 1)))
-    {
-      return -1;
-    }
-  }
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-      if(p_inp1_shape[i] == 1)
-      {
-          inp1_strides[i] = 0;
-          need_broadcast = 1;
-      }
-      else
-      {
-          inp1_const &= 0;
-      }
-      if(p_inp2_shape[i] == 1)
-      {
-          inp2_strides[i] = 0;
-          need_broadcast = 1;
-      }
-      else
-      {
-          inp2_const &= 0;
-      }
-  }
-
-  int itr0, itr1, itr2;
-  FLOAT32 *p_out_tmp = p_out;
-  const unsigned char *__restrict p_condition_temp = p_condition;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_where_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                p_condition,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag);
-  }
-  else if((inp1_strides[3] == 1)&& (inp2_strides[3] == 1))
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if((inp1_strides[2] == 0) && (inp2_strides[2] == 0))
-    {
-        in_lc = p_out_shape[3];
-        out_lc = p_out_shape[2];
-        for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-        {
-          const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-          const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-          for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-          {
-            internal_elm_where_broadcast_both_2D_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp0,
-                p_inp2_tmp0,
-                p_condition_temp,
-                out_lc,
-                in_lc);
-            p_out_tmp += in_lc * out_lc;
-            p_inp1_tmp0 += inp1_strides[1];
-            p_inp2_tmp0 += inp2_strides[1];
-            p_condition_temp += in_lc * out_lc;
-          }
-          p_inp1_tmp += inp1_strides[0];
-          p_inp2_tmp += inp2_strides[0];
-        }
-    }
-    else
-    {
-        if(inp1_strides[2] == 0)
-        {
-          const FLOAT32 *tmp;
-          tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-          sign_flag = 1;
-          int tmp_strides[2];
-          tmp_strides[0] = inp1_strides[0];
-          tmp_strides[1] = inp1_strides[1];
-
-          inp1_strides[0] = inp2_strides[0];
-          inp1_strides[1] = inp2_strides[1];
-
-          inp2_strides[0] = tmp_strides[0];
-          inp2_strides[1] = tmp_strides[1];
-          in_lc = p_out_shape[3];
-          out_lc = p_out_shape[2];
-        }
-        else if(inp2_strides[2] == 0)
-        {
-          in_lc = p_out_shape[3];
-          out_lc = p_out_shape[2];
-        }
-
-        for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-        {
-          const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-          const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-          for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-          {
-            internal_elm_where_broadcast_2D_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp0,
-                p_inp2_tmp0,
-                p_condition_temp,
-                out_lc,
-                in_lc,
-                sign_flag);
-            p_out_tmp += in_lc * out_lc;
-            p_inp1_tmp0 += inp1_strides[1];
-            p_inp2_tmp0 += inp2_strides[1];
-            p_condition_temp += in_lc * out_lc;
-          }
-
-          p_inp1_tmp += inp1_strides[0];
-          p_inp2_tmp += inp2_strides[0];
-        }
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    if((inp1_const == 1)&&(inp2_const == 1))
-    {
-        internal_elm_where_broadcast_both_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp,
-            p_inp2_tmp,
-            p_condition_temp,
-            p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3]);
-    }
-    else
-    {
-        sign_flag = 0;
-        if(inp1_strides[3] == 0)
-        {
-          sign_flag = 1;
-          const FLOAT32 *tmp;
-          tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-        }
-        internal_elm_where_broadcast_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp,
-            p_inp2_tmp,
-            p_condition_temp,
-            p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-            sign_flag);
-    }
-  }
-  else
-  {
-    sign_flag = 0;
-    if((inp1_strides[3] == 0) && (inp2_strides[3] == 0))
-    {
-        for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-        {
-          const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-          const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-          for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-          {
-            const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-            const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-            for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-            {
-              {
-                internal_elm_where_broadcast_both_f32xf32_f32(
-                    p_out_tmp,
-                    p_inp1_tmp1,
-                    p_inp2_tmp1,
-                    p_condition_temp,
-                    p_out_shape[3]);
-              }
-              p_out_tmp += p_out_shape[3];
-              p_inp1_tmp1 += inp1_strides[2];
-              p_inp2_tmp1 += inp2_strides[2];
-              p_condition_temp += p_out_shape[3];
-            }
-            p_inp1_tmp0 += inp1_strides[1];
-            p_inp2_tmp0 += inp2_strides[1];
-          }
-          p_inp1_tmp += inp1_strides[0];
-          p_inp2_tmp += inp2_strides[0];
-        }
-    }
-    else
-    {
-        if(inp1_strides[3] == 0)
-        {
-          const FLOAT32 *tmp;
-          tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-          sign_flag = 1;
-          int tmp_strides[3];
-          tmp_strides[0] = inp1_strides[0];
-          tmp_strides[1] = inp1_strides[1];
-          tmp_strides[2] = inp1_strides[2];
-
-          inp1_strides[0] = inp2_strides[0];
-          inp1_strides[1] = inp2_strides[1];
-          inp1_strides[2] = inp2_strides[2];
-
-          inp2_strides[0] = tmp_strides[0];
-          inp2_strides[1] = tmp_strides[1];
-          inp2_strides[2] = tmp_strides[2];
-        }
-        for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-        {
-          const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-          const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-          for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-          {
-            const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-            const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-            for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-            {
-              {
-                internal_elm_where_broadcast_f32xf32_f32(
-                    p_out_tmp,
-                    p_inp1_tmp1,
-                    p_inp2_tmp1,
-                    p_condition_temp,
-                    p_out_shape[3], 
-                    sign_flag);
-              }
-              p_out_tmp += p_out_shape[3];
-              p_inp1_tmp1 += inp1_strides[2];
-              p_inp2_tmp1 += inp2_strides[2];
-              p_condition_temp += p_out_shape[3];
-            }
-            p_inp1_tmp0 += inp1_strides[1];
-            p_inp2_tmp0 += inp2_strides[1];
-          }
-          p_inp1_tmp += inp1_strides[0];
-          p_inp2_tmp += inp2_strides[0];
-        }
-    }
-  }
-  return 0;
-}
-
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c
deleted file mode 100644
index 792b152e1fa..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_greater_lesser_equal_f32.c
+++ /dev/null
@@ -1,2028 +0,0 @@
-#include "xa_type_def.h"
-#include "xa_nnlib_common_fpu.h"
-#include "xa_nn_common.h"
-#include "xa_nnlib_err_chk.h"
-#include "xa_nnlib_kernels_api.h"
-
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_greater_lesser_equal_f32xf32_f32,
-             (
-                WORD8 *y,
-                const FLOAT32 *x1,
-                const FLOAT32 *x2,
-                WORD32 N,
-                WORD32 kernel_type
-              )
-           )
-#else
-WORD32 xa_nn_elm_greater_lesser_equal_f32xf32_f32(WORD8 * __restrict__ p_out,
-                               const FLOAT32 * __restrict__ p_inp1,
-                               const FLOAT32 * __restrict__ p_inp2,
-                               WORD32 num_elm,
-                               WORD32 kernel_type)
-{
-    /* NULL pointer checks */
-    XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-    XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-    /* Pointer alignment checks */
-    XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-    XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-    /* Basic Parameter checks */
-    XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1);
-
-    int i;
-    xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1;
-    xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2;
-    //xtfloatx2 *out =  (xtfloatx2 *)p_out;
-    UWORD8 *out = p_out;
-    xtfloatx2 x1, x2, y;
-    xtbool check;
-    
-    xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32());
-
-    if(kernel_type == 0)
-    {    
-      if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-      {
-          for(i=0;i < num_elm>>1;i++)
-          {
-              XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-              XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-              
-              //y = XT_SUB_SX2(x2, x1);
-              xtbool2 check = XT_OLE_SX2(x2, x1);
-              
-              uint8_t val = AE_MOVAB2(check);
-              
-              uint8_t store1 = (val >> 1) & 0x1;
-              *out++ = store1;
-              
-              uint8_t store0 = val & 0x1;
-              *out++ = store0;
-          }
-      }
-      else
-      {
-          ae_valign inp1_a, inp2_a, out_a;
-  
-          inp1_a = XT_LASX2PP(inp1);
-          inp2_a = XT_LASX2PP(inp2);
-          /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-          for(i=0;i < num_elm>>1;i++)
-          {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLE_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *out++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *out++ = store0;
-          }
-      }
-      // Remainder Loop
-      if (num_elm & 1)
-      {
-          xtfloat a1, a2, a;
-          XT_LSIP(a1, (xtfloat *)inp1, 0);
-          XT_LSIP(a2, (xtfloat *)inp2, 0);
-          
-          a = XT_SUB_S(a2, a1);
-          
-          check = 0;        
-          if(a <= 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *out++ = store;
-      }
-    }
-    else if(kernel_type == 1)
-    {
-      if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-      {
-          for(i=0;i < num_elm>>1;i++)
-          {
-              XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-              XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-              
-              //y = XT_SUB_SX2(x2, x1);
-              xtbool2 check = XT_OLT_SX2(x2, x1);
-              
-              uint8_t val = AE_MOVAB2(check);
-              
-              uint8_t store1 = (val >> 1) & 0x1;
-              *out++ = store1;
-              
-              uint8_t store0 = val & 0x1;
-              *out++ = store0;
-          }
-      }
-      else
-      {
-          ae_valign inp1_a, inp2_a, out_a;
-  
-          inp1_a = XT_LASX2PP(inp1);
-          inp2_a = XT_LASX2PP(inp2);
-          /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-          for(i=0;i < num_elm>>1;i++)
-          {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLT_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *out++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *out++ = store0;
-          }
-      }
-      // Remainder Loop
-      if (num_elm & 1)
-      {
-          xtfloat a1, a2, a;
-          XT_LSIP(a1, (xtfloat *)inp1, 0);
-          XT_LSIP(a2, (xtfloat *)inp2, 0);
-          
-          a = XT_SUB_S(a2, a1);
-          
-          check = 0;        
-          if(a < 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *out++ = store;
-      }  
-    }
-    else if(kernel_type == 2)
-    {
-      if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-      {
-          for(i=0;i < num_elm>>1;i++)
-          {
-              XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-              XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-              
-              //y = XT_SUB_SX2(x1, x2);
-              xtbool2 check = XT_OLE_SX2(x1, x2);
-              
-              uint8_t val = AE_MOVAB2(check);
-              
-              uint8_t store1 = (val >> 1) & 0x1;
-              *out++ = store1;
-              
-              uint8_t store0 = val & 0x1;
-              *out++ = store0;
-          }
-      }
-      else
-      {
-          ae_valign inp1_a, inp2_a, out_a;
-  
-          inp1_a = XT_LASX2PP(inp1);
-          inp2_a = XT_LASX2PP(inp2);
-          /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-          for(i=0;i < num_elm>>1;i++)
-          {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLE_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *out++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *out++ = store0;
-          }
-      }
-      // Remainder Loop
-      if (num_elm & 1)
-      {
-          xtfloat a1, a2, a;
-          XT_LSIP(a1, (xtfloat *)inp1, 0);
-          XT_LSIP(a2, (xtfloat *)inp2, 0);
-          
-          a = XT_SUB_S(a1, a2);
-          
-          check = 0;        
-          if(a <= 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *out++ = store;
-      }
-    }
-    else if(kernel_type == 3)
-    {
-      if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-      {
-          for(i=0;i < num_elm>>1;i++)
-          {
-              XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-              XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-              
-              //y = XT_SUB_SX2(x1, x2);
-              xtbool2 check = XT_OLT_SX2(x1, x2);
-              
-              uint8_t val = AE_MOVAB2(check);
-              
-              uint8_t store1 = (val >> 1) & 0x1;
-              *out++ = store1;
-              
-              uint8_t store0 = val & 0x1;
-              *out++ = store0;
-          }
-      }
-      else
-      {
-          ae_valign inp1_a, inp2_a, out_a;
-  
-          inp1_a = XT_LASX2PP(inp1);
-          inp2_a = XT_LASX2PP(inp2);
-          /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-          for(i=0;i < num_elm>>1;i++)
-          {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLT_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *out++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *out++ = store0;
-          }
-      }
-      // Remainder Loop
-      if (num_elm & 1)
-      {
-          xtfloat a1, a2, a;
-          XT_LSIP(a1, (xtfloat *)inp1, 0);
-          XT_LSIP(a2, (xtfloat *)inp2, 0);
-          
-          a = XT_SUB_S(a1, a2);
-          
-          check = 0;        
-          if(a < 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *out++ = store;
-      }
-    }
-    else if(kernel_type == 4)
-    {
-      if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-      {
-          for(i=0;i < num_elm>>1;i++)
-          {
-              XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-              XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-              
-              //y = XT_SUB_SX2(x2, x1);
-              xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-              
-              uint8_t val = AE_MOVAB2(check);
-              
-              uint8_t store1 = (val >> 1) & 0x1;
-              *out++ = store1;
-              
-              uint8_t store0 = val & 0x1;
-              *out++ = store0;
-          }
-      }
-      else
-      {
-          ae_valign inp1_a, inp2_a, out_a;
-  
-          inp1_a = XT_LASX2PP(inp1);
-          inp2_a = XT_LASX2PP(inp2);
-          /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-          for(i=0;i < num_elm>>1;i++)
-          {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *out++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *out++ = store0;
-          }
-      }
-      // Remainder Loop
-      if (num_elm & 1)
-      {
-          xtfloat a1, a2, a;
-          XT_LSIP(a1, (xtfloat *)inp1, 0);
-          XT_LSIP(a2, (xtfloat *)inp2, 0);
-          
-          //a = XT_SUB_S(a2, a1);
-          
-          check = 0;        
-          if(a1 == a2)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *out++ = store;
-      }
-    }
-    else if(kernel_type == 5)
-    {
-      ae_int32x2 ones = AE_MOVDA32(1);
-      if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0))
-      {
-          for(i=0;i < num_elm>>1;i++)
-          {
-              XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32));
-              XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32));
-              
-              //y = XT_SUB_SX2(x2, x1);
-              xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-              
-              ae_int32x2 store = AE_ZERO32();
-              AE_MOVF32X2(store, ones, check);
-              
-              *out++ = AE_MOVAD32_H(store);
-              *out++ = AE_MOVAD32_L(store);
-          }
-      }
-      else
-      {
-          ae_valign inp1_a, inp2_a, out_a;
-  
-          inp1_a = XT_LASX2PP(inp1);
-          inp2_a = XT_LASX2PP(inp2);
-          /* Each iteration of loop is independent so safe to use concurrent pragma */
-#pragma concurrent
-          for(i=0;i < num_elm>>1;i++)
-          {
-            XT_LASX2IP(x1, inp1_a, inp1);
-            XT_LASX2IP(x2, inp2_a, inp2);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            ae_int32x2 store = AE_ZERO32();
-            AE_MOVF32X2(store, ones, check);
-            
-            *out++ = AE_MOVAD32_H(store);
-            *out++ = AE_MOVAD32_L(store);
-          }
-      }
-      // Remainder Loop
-      if (num_elm & 1)
-      {
-          xtfloat a1, a2, a;
-          XT_LSIP(a1, (xtfloat *)inp1, 0);
-          XT_LSIP(a2, (xtfloat *)inp2, 0);
-          
-          a = XT_SUB_S(a2, a1);
-          
-          check = 0;        
-          if(a != 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *out++ = store;
-      }
-    }
-
-    return 0;
-}
-#endif
-
-#if HAVE_VFPU
-static void internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(UWORD8 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  out_lc,
-                             WORD32  in_lc,
-                             xtbool  sign_flag,
-                             WORD32 kernel_type)
-{
-  int i, j;
-
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  
-  xtbool check;
-  
-  xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32());
-
-  int num_simd2_ops;
-  int num_scalar_ops;
-
-  if(out_lc)
-  {
-    num_simd2_ops = in_lc >> 1;
-    num_scalar_ops = in_lc & 1;
-  }
-  else
-  {
-    num_simd2_ops = (in_lc >> 2) << 1;
-    num_scalar_ops = in_lc & 3;
-  }
-
-    xtfloatx2 x1, x2, y;
-    xtfloat a0, b0, c0;
-
-  /* For computing inp2 - inp1 */   
-  if(sign_flag){  
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc];
-      
-      if(kernel_type == 0)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLE_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLE_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(a0, b0);   
-          
-          check = 0;
-          
-          if(c0 <= 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 1)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLT_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLT_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(a0, b0);   
-          
-          check = 0;
-          
-          if(c0 < 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 2)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLE_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLE_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(b0, a0);   
-          
-          check = 0;
-          
-          if(c0 <= 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 3)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLT_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLT_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(b0, a0);   
-          
-          check = 0;
-          
-          if(c0 < 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 4)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          //c0 = XT_SUB_S(a0, b0);   
-          
-          check = 0;
-          
-          if(a0 == b0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 5)
-      {
-        ae_int32x2 ones = AE_MOVDA32(1);
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            ae_int32x2 store = AE_ZERO32();
-            AE_MOVF32X2(store, ones, check);
-            
-            *p_c++ = AE_MOVAD32_H(store);
-            *p_c++ = AE_MOVAD32_L(store);
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            ae_int32x2 store = AE_ZERO32();
-            AE_MOVF32X2(store, ones, check);
-            
-            *p_c++ = AE_MOVAD32_H(store);
-            *p_c++ = AE_MOVAD32_L(store);
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(a0, b0);   
-          
-          check = 0;
-          
-          if(c0 != 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-    }
-  }
-  /* For computing inp1 - inp2 */   
-  else
-  {
-    for(i = 0; i < out_lc; i++)
-    {
-      p_a = (xtfloatx2 *)&p_inp1[i * in_lc];
-      p_b = (xtfloatx2 *)p_inp2;
-      UWORD8 *p_c = (UWORD8 *)&p_out[i * in_lc];
-      
-      if(kernel_type == 0)
-      {    
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLE_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-  
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLE_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(b0, a0);   
-          
-          check = 0;
-          
-          if(c0 <= 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if (kernel_type == 1)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLT_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-  
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = XT_OLT_SX2(x2, x1);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(b0, a0);   
-          
-          check = 0;
-          
-          if(c0 < 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 2)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLE_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-  
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLE_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(a0, b0);   
-          
-          check = 0;
-          
-          if(c0 <= 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 3)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLT_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-  
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x1, x2);
-            xtbool2 check = XT_OLT_SX2(x1, x2);
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(a0, b0);   
-          
-          check = 0;
-          
-          if(c0 < 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 4)
-      {
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-  
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            uint8_t val = AE_MOVAB2(check);
-            
-            uint8_t store1 = (val >> 1) & 0x1;
-            *p_c++ = store1;
-            
-            uint8_t store0 = val & 0x1;
-            *p_c++ = store0;
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          //c0 = XT_SUB_S(b0, a0);   
-          
-          check = 0;
-          
-          if(a0 == b0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-      else if(kernel_type == 5)
-      {
-        ae_int32x2 ones = AE_MOVDA32(1);
-        if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-        {
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LSX2IP(x1, p_a, 2*sizeof(FLOAT32));
-            XT_LSX2IP(x2, p_b, 2*sizeof(FLOAT32));
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            ae_int32x2 store = AE_ZERO32();
-            AE_MOVF32X2(store, ones, check);
-            
-            *p_c++ = AE_MOVAD32_H(store);
-            *p_c++ = AE_MOVAD32_L(store);
-          }
-        }
-        else
-        {
-          ae_valign vinp1, vinp2;
-          vinp1 = XT_LASX2PP(p_a);
-          vinp2 = XT_LASX2PP(p_b);
-  
-          for(j = 0; j < num_simd2_ops; j++)
-          {
-            XT_LASX2IP(x1, vinp1, p_a);
-            XT_LASX2IP(x2, vinp2, p_b);
-            
-            //y = XT_SUB_SX2(x2, x1);
-            xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-            
-            ae_int32x2 store = AE_ZERO32();
-            AE_MOVF32X2(store, ones, check);
-            
-            *p_c++ = AE_MOVAD32_H(store);
-            *p_c++ = AE_MOVAD32_L(store);
-          }
-        }
-        if(num_scalar_ops !=0)
-        {
-          XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32));
-          XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32));
-          c0 = XT_SUB_S(b0, a0);   
-          
-          check = 0;
-          
-          if(c0 != 0)
-            check = 1;
-          
-          uint8_t store = AE_MOVAB(check);
-          *p_c++ = store;
-        }
-      }
-    }  
-  }
-}
-
-static void internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(UWORD8 * __restrict__ p_out,
-                    const    FLOAT32 * __restrict__ p_inp1,
-                    const    FLOAT32 * __restrict__ p_inp2,
-                             WORD32  num_elm,
-                             xtbool  sign_flag,
-                             WORD32 kernel_type)
-{
-  int i;
-  xtfloatx2  * __restrict__ p_a = (xtfloatx2 *)p_inp1;
-  xtfloatx2  * __restrict__ p_b = (xtfloatx2 *)p_inp2; 
-  
-  xtbool check;
-  
-  UWORD8 * p_c = p_out;
-  xtfloatx2 float_0 = XT_MOV_SX2(AE_ZERO32());
-
-  const int num_simd2_ops = num_elm >> 1;
-  const int num_scalar_ops = num_elm & 1;
-
-  xtfloat a0_7, out;
-  xtfloatx2 x1, x2, y;
-  x2 = XT_LSI((xtfloat *)p_b, 0);
-        
-  /* For computing inp2 - inp1 */      
-  if(sign_flag){
-    if(kernel_type == 0)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLE_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);   
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLE_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }  
-      }  
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(a0_7, x2);   
-        
-        check = 0;
-          
-        if(out <= 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 1)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLT_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);   
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLT_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }  
-      }  
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(a0_7, x2);   
-        
-        check = 0;
-          
-        if(out < 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 2)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLE_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);   
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLE_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }  
-      }  
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(x2, a0_7);   
-        
-        check = 0;
-          
-        if(out <= 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 3)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLT_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);   
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLT_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }  
-      }  
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(x2, a0_7);   
-        
-        check = 0;
-          
-        if(out < 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 4)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);   
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }  
-      }  
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(a0_7, x2);   
-        
-        check = 0;
-          
-        if(out == 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 5)
-    {
-      ae_int32x2 ones = AE_MOVDA32(1);
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-          
-          ae_int32x2 store = AE_ZERO32();
-          AE_MOVF32X2(store, ones, check);
-          
-          *p_c++ = AE_MOVAD32_H(store);
-          *p_c++ = AE_MOVAD32_L(store); 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);   
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-          
-          ae_int32x2 store = AE_ZERO32();
-          AE_MOVF32X2(store, ones, check);
-          
-          *p_c++ = AE_MOVAD32_H(store);
-          *p_c++ = AE_MOVAD32_L(store);
-        }  
-      }  
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(a0_7, x2);   
-        
-        check = 0;
-          
-        if(out != 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-  }
-  /* For computing inp1 - inp2 */   
-  else
-  {
-    if(kernel_type == 0)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLE_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);
-        
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLE_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(x2, a0_7);   
-        
-        check = 0;
-          
-        if(out <= 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 1)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLT_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);
-        
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = XT_OLT_SX2(x2, x1);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(x2, a0_7);   
-        
-        check = 0;
-          
-        if(out < 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 2)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLE_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);
-        
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLE_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(a0_7, x2);   
-        
-        check = 0;
-          
-        if(out <= 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 3)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x1, x2);
-          
-          xtbool2 check = XT_OLT_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);
-        
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x1, x2);
-          
-        xtbool2 check = XT_OLT_SX2(x1, x2);
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(a0_7, x2);   
-        
-        check = 0;
-          
-        if(out < 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-    else if(kernel_type == 4)
-    {
-      if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0))
-      {
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32));
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0; 
-        }
-      }
-      else
-      {
-        ae_valign inp1_a, out_a;
-        inp1_a = XT_LASX2PP(p_a);
-        
-        for(i=0; i<num_simd2_ops; i++)
-        {
-          XT_LASX2IP(x1, inp1_a, p_a);
-          //y = XT_SUB_SX2(x2, x1);
-          
-          xtbool2 check = AE_EQ32(XT_AE_MOVINT32X2_FROMXTFLOATX2(x1), XT_AE_MOVINT32X2_FROMXTFLOATX2(x2));
-          
-          uint8_t val = AE_MOVAB2(check);
-          
-          uint8_t store1 = (val >> 1) & 0x1;
-          *p_c++ = store1;
-          
-          uint8_t store0 = val & 0x1;
-          *p_c++ = store0;
-        }
-      }
-      if(num_scalar_ops !=0)
-      {
-        XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32));
-        out = XT_SUB_S(x2, a0_7);   
-        
-        check = 0;
-          
-        if(out == 0)
-          check = 1;
-          
-        uint8_t store = AE_MOVAB(check);
-        *p_c++ = store;
-      }
-    }
-  }
-}
-#endif
-
-#if !HAVE_VFPU
-DISCARD_FUN_FOR_NONVOID_RETURN(
-             WORD32, xa_nn_elm_greaterequal_broadcast_4D_f32xf32_f32,
-             (
-                      WORD8 * p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * p_inp2,
-                      const WORD32 *const p_inp2_shape,
-                      WORD32 kernel_type
-              )
-           )
-#else           
-WORD32 xa_nn_elm_greater_lesser_equal_broadcast_4D_f32xf32_f32(WORD8 * __restrict__ p_out,
-                      const WORD32 *const p_out_shape,
-                      const FLOAT32 * __restrict__ p_inp1,
-                      const WORD32 *const p_inp1_shape,
-                      const FLOAT32 * __restrict__ p_inp2,
-                      const WORD32 *const p_inp2_shape,
-                      WORD32 kernel_type)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1);
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(UWORD8), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1);
-
-  /* Check shapes */
-  int i;
-  xtbool sign_flag;
-  for(i = 0; i < 4; i++)
-  {
-    if((p_inp1_shape[i] != p_inp2_shape[i] && p_inp1_shape[i] != 1 && p_inp2_shape[i] != 1) ||
-       (p_out_shape[i] != (p_inp1_shape[i] > p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i])))
-    {
-      return -1;
-    }
-  }
-
-  WORD32 inp1_strides[4], inp2_strides[4];
-  inp1_strides[3] = 1;
-  inp2_strides[3] = 1;
-  for(i = 2; i >= 0; i--)
-  {
-    ae_int32x2 d_str, d_shape;
-    d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]);
-    d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]);
-    d_str = AE_MULP32X2(d_str, d_shape);
-    inp1_strides[i] = AE_MOVAD32_H(d_str);
-    inp2_strides[i] = AE_MOVAD32_L(d_str);
-  }
-
-  int need_broadcast = 0;
-  int inp1_const = 1, inp2_const = 1;
-  for(i = 0; i < 4; i++)
-  {
-    if(p_inp1_shape[i] != p_inp2_shape[i])
-    {
-      if(p_inp1_shape[i] == 1)
-        inp1_strides[i] = 0;
-      else
-        inp2_strides[i] = 0;
-
-      need_broadcast = 1;
-    }
-    if(p_inp1_shape[i] != 1)
-      inp1_const &= 0;
-    if(p_inp2_shape[i] != 1)
-      inp2_const &= 0;
-  }
-  int itr0, itr1, itr2;
-
-  UWORD8 *p_out_tmp = p_out;
-  const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1;
-  const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2;
-  if(need_broadcast == 0)
-  {
-    sign_flag = 0;
-    internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(
-                p_out,
-                p_inp1,
-                p_inp2,
-                1,
-                p_out_shape[0] * inp1_strides[0],
-                sign_flag,
-                kernel_type);
-  }
-  else if(inp1_strides[3] == inp2_strides[3])
-  {
-    WORD32 in_lc, out_lc;
-    sign_flag = 0;
-    in_lc = p_out_shape[2] * p_out_shape[3];
-    out_lc = 1;
-    if(inp1_strides[2] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[2];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-    else if(inp2_strides[2] == 0)
-    {
-      in_lc = p_out_shape[3];
-      out_lc = p_out_shape[2];
-    }
-
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        internal_elm_greater_lesser_equal_broadcast_2D_f32xf32_f32(
-            p_out_tmp,
-            p_inp1_tmp0,
-            p_inp2_tmp0,
-            out_lc,
-            in_lc,
-            sign_flag,
-            kernel_type);
-        p_out_tmp += in_lc * out_lc;
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  else if(inp1_const == 1 || inp2_const == 1)
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      sign_flag = 1;
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-    }
-    internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(
-        p_out_tmp,
-        p_inp1_tmp,
-        p_inp2_tmp,
-        p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3],
-        sign_flag,
-        kernel_type);
-  }
-  else
-  {
-    sign_flag = 0;
-    if(inp1_strides[3] == 0)
-    {
-      const FLOAT32 *tmp;
-      tmp = p_inp1_tmp;   p_inp1_tmp = p_inp2_tmp;    p_inp2_tmp = tmp;
-      sign_flag = 1;
-      int tmp_strides[3];
-      tmp_strides[0] = inp1_strides[0];
-      tmp_strides[1] = inp1_strides[1];
-      tmp_strides[2] = inp1_strides[2];
-
-      inp1_strides[0] = inp2_strides[0];
-      inp1_strides[1] = inp2_strides[1];
-      inp1_strides[2] = inp2_strides[2];
-
-      inp2_strides[0] = tmp_strides[0];
-      inp2_strides[1] = tmp_strides[1];
-      inp2_strides[2] = tmp_strides[2];
-    }
-    for(itr0 = 0; itr0 < p_out_shape[0]; itr0++)
-    {
-      const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp;
-      const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp;
-      for(itr1 = 0; itr1 < p_out_shape[1]; itr1++)
-      {
-        const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0;
-        const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0;
-        for(itr2 = 0; itr2 < p_out_shape[2]; itr2++)
-        {
-          {
-            internal_elm_greater_lesser_equal_broadcast_f32xf32_f32(
-                p_out_tmp,
-                p_inp1_tmp1,
-                p_inp2_tmp1,
-                p_out_shape[3], 
-                sign_flag,
-                kernel_type);
-          }
-          p_out_tmp += p_out_shape[3];
-          p_inp1_tmp1 += inp1_strides[2];
-          p_inp2_tmp1 += inp2_strides[2];
-        }
-        p_inp1_tmp0 += inp1_strides[1];
-        p_inp2_tmp0 += inp2_strides[1];
-      }
-      p_inp1_tmp += inp1_strides[0];
-      p_inp2_tmp += inp2_strides[0];
-    }
-  }
-  return 0;
-}
-#endif
diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
deleted file mode 100644
index 5b3ed385568..00000000000
--- a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c
+++ /dev/null
@@ -1,241 +0,0 @@
-#include "xa_nnlib_common.h"
-#include "stdio.h"
-/*
- * Currently only supports upto 5D input tensors.
- * 1/2/3/4 D input tensors will be scaled up to 5D.
- * For example, 2x3 -> 1x1x1x2x3.
- */
-
-WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out
-                    ,const WORD32 *const p_out_shape
-                    ,const WORD32 * __restrict__ p_inp
-                    ,const WORD32 *const p_inp_shape
-                    ,const WORD32 * __restrict__ p_permute_vec
-                    ,WORD32 num_out_dims
-                    ,WORD32 num_inp_dims)
-{
-  /* NULL pointer checks */
-  XA_NNLIB_ARG_CHK_PTR(p_out, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1);
-  XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1);
-
-  /* Invalid input checks */
-  XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1);
-  XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1);
-
-  int itr = 0;
-  for(itr=0; itr < num_inp_dims; itr++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1);
-  }
-  for(itr=0; itr < num_out_dims; itr++)
-  {
-    XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1);
-  }
-
-
-  /* Output shape provided must be correct based on input
-   * shape and permute values */
-  for(itr=0; itr < num_out_dims; itr++)
-  {
-    int output_dim = p_out_shape[itr];
-    int expected_dim = p_inp_shape[p_permute_vec[itr]];
-    XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1);
-  }
-
-  /* Pointer alignment checks */
-  XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1);
-  XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1);
-
-  /* Shift all dim with 1 in the outer part */
-  int eff_output_shape[5];
-  int eff_permute_vec[5];
-
-  for(int i = 0; i < num_out_dims; i++)
-  {
-    eff_output_shape[i] = p_out_shape[i];
-    eff_permute_vec[i] = p_permute_vec[i];
-  }
-
-  int one_i=num_out_dims-1, non_one_i=num_out_dims-1;
-  while(one_i > 0 && non_one_i >=0){
-    while(one_i > 0 && eff_output_shape[one_i]!=1){
-      one_i--;
-    }
-    non_one_i = one_i;
-    while(non_one_i >= 0 && eff_output_shape[non_one_i]==1)
-    {
-      non_one_i--;
-    }
-    if(one_i > 0 && non_one_i >=0){
-      int temp;
-      /*swap output_shape*/
-      {
-        temp = eff_output_shape[one_i];
-        eff_output_shape[one_i] = eff_output_shape[non_one_i];
-        eff_output_shape[non_one_i] = temp;
-      }
-      /*swap permute_vec*/
-      {
-        temp = eff_permute_vec[one_i];
-        eff_permute_vec[one_i] = eff_permute_vec[non_one_i];
-        eff_permute_vec[non_one_i] = temp;
-      }
-
-    }
-  }
-
-  /* Promoting lesser dim tensors to 5D tensors.
-   * Also updating the permute_vec and shapes as needed for optimization */
-  int p_5D_inp_shape[5] = {1, 1, 1, 1, 1};
-  int p_5D_out_shape[5] = {1, 1, 1, 1, 1};
-  int p_5D_permute_vec[5] = {0, 1, 2, 3, 4};
-
-  /* Check if any inner inp dimension is same in the output */
-  int last_dim_same = 1, last_n_same_dim = 0;
-  itr = num_inp_dims - 1;
-  while(itr >= 0)
-  {
-    last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim;
-    last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0;
-    itr--;
-  }
-
-  int dims_added = 5 - num_inp_dims;
-  itr = num_inp_dims - 1;
-  int same_count = last_n_same_dim;
-  int count = 4;
-  while(itr >= 0)
-  {
-    p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr];
-    p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr];
-    same_count--;
-    itr--;
-    count = (same_count > 0) ? count : count - 1;
-  }
-
-  itr = num_inp_dims - 1;
-  same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0;
-  count = 4;
-  while(itr >= 0)
-  {
-    p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added;
-    same_count--;
-    itr--;
-    count--;
-  }
-
-  int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4;
-  int inp_dim1, inp_dim2, inp_dim3, inp_dim4;
-  int inp_stride[5];
-
-  out_dim0 = p_5D_out_shape[0];
-  out_dim1 = p_5D_out_shape[1];
-  out_dim2 = p_5D_out_shape[2];
-  out_dim3 = p_5D_out_shape[3];
-  out_dim4 = p_5D_out_shape[4];
-
-  inp_dim1 = p_5D_inp_shape[1];
-  inp_dim2 = p_5D_inp_shape[2];
-  inp_dim3 = p_5D_inp_shape[3];
-  inp_dim4 = p_5D_inp_shape[4];
-
-  inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4;
-  inp_stride[1] = inp_dim2*inp_dim3*inp_dim4;
-  inp_stride[2] = inp_dim3*inp_dim4;
-  inp_stride[3] = inp_dim4;
-  inp_stride[4] = 1;
-
-  if(last_n_same_dim)
-  {
-    int itr0, itr1, itr2, itr3, itr4;
-    WORD32 *p_inp0 = (WORD32 *)p_inp;
-    for(itr0 = 0; itr0 < out_dim0; itr0++)
-    {
-      WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
-#pragma loop_count min=1
-      for(itr1 = 0; itr1 < out_dim1; itr1++)
-      {
-        WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
-#pragma loop_count min=1
-        for(itr2 = 0; itr2 < out_dim2; itr2++)
-        {
-          WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
-#pragma loop_count min=1
-          for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4)
-          {
-            WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-            ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4);
-            ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out);
-            ae_valign a_inp = AE_LA64_PP(pae_i);
-            ae_valign a_out = AE_ZALIGN64();
-            ae_int32x2 d0;
-            for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-            {
-              AE_LA32X2_IP(d0, a_inp, pae_i);
-              AE_SA32X2_IP(d0, a_out, pae_o);
-            }
-            AE_SA64POS_FP(a_out, pae_o);
-            ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i);
-            ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o);
-#pragma loop_count max=3
-            for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-            {
-              puae_o[itr4] = puae_i[itr4];
-            }
-          }
-        }
-      }
-    }
-  }
-  else
-  {
-    int itr0, itr1, itr2, itr3, itr4;
-    WORD32 *p_inp0 = (WORD32 *)p_inp;
-    for(itr0 = 0; itr0 < out_dim0; itr0++)
-    {
-      WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]);
-      for(itr1 = 0; itr1 < out_dim1; itr1++)
-      {
-        WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]);
-        for(itr2 = 0; itr2 < out_dim2; itr2++)
-        {
-          WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]);
-          for(itr3 = 0; itr3 < out_dim3; itr3++)
-          {
-            WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]);
-
-            ae_valign a_out = AE_ZALIGN64();
-            for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++)
-            {
-              ae_int32x2 d0, d1;
-              ae_int32x2 tmp0;
-
-              d0 = AE_L32_X((ae_int32 *)p_inp4, 0);
-              p_inp4 += inp_stride[p_5D_permute_vec[4]];
-              d1 = AE_L32_X((ae_int32 *)p_inp4, 0);
-              p_inp4 += inp_stride[p_5D_permute_vec[4]];
-
-              tmp0 = AE_SEL32_HH(d0, d1);
-
-              AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out);
-            }
-            AE_SA64POS_FP(a_out, p_out);
-#pragma loop_count max=3
-            for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++)
-            {
-              *p_out++ = *p_inp4;
-            }
-          }
-        }
-      }
-    }
-  }
-
-  return 0;
-}