From 9557e0e3bbd019327142bf4ce52168ad625e1a1d Mon Sep 17 00:00:00 2001
From: JulianJuelg <julian.juelg@gmx.de>
Date: Tue, 27 Jan 2026 21:07:02 +0100
Subject: [PATCH 1/3] first vector api implementations + benchmarking suite

---
 .../runtime/codegen/LibSpoofPrimitives.java   | 287 +++++++++++++++++-
 .../primitives_vector_api/BenchCase.java      |  46 +++
 .../primitives_vector_api/BenchUtil.java      |  63 ++++
 .../primitives_vector_api/Ctx.java            |  33 ++
 .../PrimitivePerfSuite.java                   |  43 +++
 .../codegen/performance_tests/benchUtil.java  |  36 +++
 .../rowMaxsVectMultTest.java                  |  95 ++++++
 .../performance_tests/vectDivAddTest.java     | 100 ++++++
 .../performance_tests/vectEqualWriteTest.java |  61 ++++
 .../performance_tests/vectSumTest.java        |  74 +++++
 10 files changed, 833 insertions(+), 5 deletions(-)
 create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
 create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
 create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
 create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java
 create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java
diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
index ebb42676f0e..214226497f0 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
@@ -28,10 +28,15 @@
 import org.apache.sysds.runtime.functionobjects.IntegerDivide;
 import org.apache.sysds.runtime.functionobjects.Modulus;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col;
 import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling;
 import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
-import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.FloatVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
 
 /**
  * This library contains all vector primitives that are used in 
@@ -45,6 +50,12 @@ public class LibSpoofPrimitives
 	private static IntegerDivide intDiv = IntegerDivide.getFnObject();
 	private static Modulus mod = Modulus.getFnObject();
 	private static BitwAnd bwAnd = BitwAnd.getBitwAndFnObject();
+
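+	// Vector API species; jdk.incubator.vector is an incubator module and must be enabled with --add-modules jdk.incubator.vector at compile time and at run time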
+	private static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+	private static final VectorSpecies<Float> FSPECIES = FloatVector.SPECIES_PREFERRED;
+	private static final int vLen = SPECIES.length();
+
 	
 	//global pool of reusable vectors, individual operations set up their own thread-local
 	//ring buffers of reusable vectors with specific number of vectors and vector sizes 
@@ -56,7 +67,7 @@ public class LibSpoofPrimitives
 		@Override protected SparseVectorBuffer initialValue() { return new SparseVectorBuffer(0,0,0); }
 	};
 
-	public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+	public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
 		double val = Double.NEGATIVE_INFINITY;
 		int j=0;
 		for( int i = ai; i < ai+len; i++ )
@@ -64,6 +75,78 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int
 		return val;
 	}
 
+	public static double scalarrowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) {
+		float val = Float.NEGATIVE_INFINITY; // max of a[ai+k]*b[bi+k] for k in [0,len); -Infinity when len <= 0
+		int j=bi; // start at the caller-supplied offset into b, matching rowMaxsVectMultFloat
+		for( int i = ai; i < ai+len; i++ )
+			val = Math.max(a[i]*b[j++], val);
+		return val; // float result widened to double by the return type
+	}
+
+	public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+		double maxVal = Double.NEGATIVE_INFINITY;
+	
+		int i = 0;
+		int upper = SPECIES.loopBound(len);
+	
+		// per-lane running maximum, seeded with -Infinity
+		DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+	
+		// NOTE(review): indexing of b differs from scalarrowMaxsVectMult.
+		// The scalar kernel reads b[j++] starting at j=0 and never uses bi,
+		// whereas this kernel reads b[bi + i]; results can differ when bi != 0.
+		// TODO confirm which indexing the callers of rowMaxsVectMult rely on.
+		for (; i < upper; i += vLen) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i);
+			DoubleVector prod = va.mul(vb);
+			vmax = vmax.max(prod);
+		}
+	
+		// Reduce vector lanes to a scalar max (stays -Infinity if the vector loop did not run)
+		maxVal = vmax.reduceLanes(VectorOperators.MAX);
+	
+		// Scalar tail for the elements in [upper, len)
+		for (; i < len; i++) {
+			maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
+		}
+	
+		return maxVal;
+	}
+
+	public static double rowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) {
+		float maxVal = Float.NEGATIVE_INFINITY;
+	
+		int i = 0;
+		int upper = FSPECIES.loopBound(len);
+	
+		// per-lane running maximum, seeded with -Infinity
+		FloatVector vmax = FloatVector.broadcast(FSPECIES, Float.NEGATIVE_INFINITY);
+	
+		// NOTE(review): b is read at b[bi + i], i.e. the offset bi is honored here.
+		// The double scalar kernel scalarrowMaxsVectMult in this class reads b
+		// from index 0 and never uses bi, so the two are only comparable for bi == 0.
+		// TODO confirm the intended indexing.
+		for (; i < upper; i += FSPECIES.length()) {
+			FloatVector va = FloatVector.fromArray(FSPECIES, a, ai + i);
+			FloatVector vb = FloatVector.fromArray(FSPECIES, b, bi + i);
+			FloatVector prod = va.mul(vb);
+			vmax = vmax.max(prod);
+		}
+	
+		// Reduce vector lanes to a scalar max (stays -Infinity if the vector loop did not run)
+		maxVal = vmax.reduceLanes(VectorOperators.MAX);
+	
+		// Scalar tail for the elements in [upper, len)
+		for (; i < len; i++) {
+			maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
+		}
+	
+		return maxVal; // computed in float, widened to double by the return type
+	}
+	
+
+
 	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
 		double val = Double.NEGATIVE_INFINITY;
 		for( int i = ai; i < ai+len; i++ )
@@ -295,7 +378,9 @@ public static double[] vectCbindWrite(double[] a, double[] b, int[] aix, int ai,
 	 * @param len number of processed elements
 	 * @return sum value
 	 */
-	public static double vectSum(double[] a, int ai, int len) { 
+
+	// scalar reference implementation of vectSum, kept for comparison with the Vector API version
+	public static double scalarvectSum(double[] a, int ai, int len) { 
 		double val = 0;
 		final int bn = len%8;
 		
@@ -313,6 +398,113 @@ public static double vectSum(double[] a, int ai, int len) {
 		//scalar result
 		return val; 
 	} 
+
+	public static double scalarvectSumFloat(float[] a, int ai, int len) { 
+		float val = 0;
+		final int bn = len%8;
+		
+		//compute rest
+		for( int i = ai; i < ai+bn; i++ )
+			val += a[ i ];
+		
+		//unrolled 8-block (for better instruction-level parallelism)
+		for( int i = ai+bn; i < ai+len; i+=8 ) {
+			//read 64B cacheline of a, compute cval' = sum(a) + cval
+			val += a[ i+0 ] + a[ i+1 ] + a[ i+2 ] + a[ i+3 ]
+			     + a[ i+4 ] + a[ i+5 ] + a[ i+6 ] + a[ i+7 ];
+		}
+		
+		//scalar result
+		return val; 
+	}
+	public static double vectSum(double[] a, int ai, int len) {
+        double sum = 0d;
+        int i = 0;
+
+        DoubleVector acc = DoubleVector.zero(SPECIES); // one partial sum per lane
+
+		// largest multiple of the species length (vLen) that is <= len
+        int upperBound = SPECIES.loopBound(len);
+
+        for (; i < upperBound; i += SPECIES.length()) {
+            DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+            acc = acc.add(v);
+        }
+
+        // reduce the per-lane partial sums into a single scalar
+        sum += acc.reduceLanes(VectorOperators.ADD);
+
+        // scalar tail for the elements in [upperBound, len)
+        for (; i < len; i++) {
+            sum += a[ai + i];
+        }
+
+        return sum;
+    }
+
+	public static double rowMaxsVectMultVec2Acc(double[] a, double[] b, int ai, int bi, int len) {
+		int i = 0;
+		int upper = SPECIES.loopBound(len);
+	
+		DoubleVector vmax1 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+		DoubleVector vmax2 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+	
+		// step = 2 vectors per iteration
+		int step = vLen * 2;
+	
+		for (; i + step <= upper; i += step) {
+			DoubleVector va1 = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vb1 = DoubleVector.fromArray(SPECIES, b, bi + i);
+			vmax1 = vmax1.max(va1.mul(vb1));
+	
+			DoubleVector va2 = DoubleVector.fromArray(SPECIES, a, ai + i + vLen);
+			DoubleVector vb2 = DoubleVector.fromArray(SPECIES, b, bi + i + vLen);
+			vmax2 = vmax2.max(va2.mul(vb2));
+		}
+	
+		// finish remaining vector loop
+		for (; i < upper; i += vLen) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i);
+			vmax1 = vmax1.max(va.mul(vb));
+		}
+	
+		// combine both accumulators
+		DoubleVector vmax = vmax1.max(vmax2);
+		double maxVal = vmax.reduceLanes(VectorOperators.MAX);
+	
+		// tail
+		for (; i < len; i++) {
+			maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
+		}
+	
+		return maxVal;
+	}
+	
+	public static double vectSumFloat(float[] a, int ai, int len) {
+        float sum = 0; // accumulated in float precision; widened to double only on return
+        int i = 0;
+
+        FloatVector acc = FloatVector.zero(FSPECIES); // one partial sum per lane
+
+		// largest multiple of FSPECIES.length() that is <= len (not vLen, which is the double lane count)
+        int upperBound = FSPECIES.loopBound(len);
+
+        for (; i < upperBound; i += FSPECIES.length()) {
+            FloatVector v = FloatVector.fromArray(FSPECIES, a, ai + i);
+            acc = acc.add(v);
+        }
+
+        // reduce the per-lane partial sums into a single scalar
+        sum += acc.reduceLanes(VectorOperators.ADD);
+
+        // scalar tail for the elements in [upperBound, len)
+        for (; i < len; i++) {
+            sum += a[ai + i];
+        }
+
+        return sum;
+    }
 	
 	public static double vectSum(double[] avals, int[] aix, int ai, int alen, int len) {
 		//forward to dense as column indexes not required here
@@ -373,10 +565,68 @@ public static double vectMean(double[] avals, int[] aix, int ai, int alen, int l
 	
 	//custom vector div
 	
-	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+	public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  a[j] / bval;
 	}
+
+	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		// Computes c[ci + i] += a[ai + i] / bval for i in [0, len);
+		// nothing to do for len <= 0
+		if (len <= 0) return;
+
+		// Divide per lane rather than multiplying by a hoisted reciprocal: a * (1.0 / bval)
+		// rounds twice and is not always bit-identical to a / bval (see scalarvectDivAdd).
+
+		// divisor broadcast to every lane; SPECIES is the class-level preferred species
+		final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+		final int upperBound = SPECIES.loopBound(len);
+
+		int i = 0;
+
+		// Vector loop over the largest multiple of vLen that fits into len
+		for (; i < upperBound; i += vLen) {
+			// load a and c
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i);
+
+			// vc += va / bval
+			vc = vc.add(va.div(vb));
+
+			// store result back to c
+			vc.intoArray(c, ci + i);
+		}
+
+		// Scalar tail for the elements in [upperBound, len)
+		for (; i < len; i++) {
+			c[ci + i] += a[ai + i] / bval;
+		}
+	}
+
+	public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		if (len <= 0) return;
+	
+		final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+		final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+	
+		int i = 0;
+		final int upperBound = SPECIES.loopBound(len);
+	
+		for (; i < upperBound; i += SPECIES.length()) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i);
+	
+			vc = vc.add(va.div(vb));
+	
+			vc.intoArray(c, ci + i);
+		}
+	
+		for (; i < len; i++) {
+			c[ci + i] += a[ai + i] / bval;
+		}
+	}
+	
+
 	
 	public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
@@ -1607,12 +1857,39 @@ public static void vectEqualAdd(double bval, double[] a, double[] c, int[] aix,
 		vectEqualAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
+	public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++)
 			c[j] = (a[ai] == bval) ? 1 : 0;
 		return c;
 	}
+	public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+	
+		int i = 0;
+		int upper = SPECIES.loopBound(len);
+	
+		DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+		DoubleVector zeros = DoubleVector.zero(SPECIES);
+		DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
+	
+		for (; i < upper; i += vLen) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			var mask = va.compare(VectorOperators.EQ, vb);
+	
+			// out = (va == vb) ? 1.0 : 0.0
+			DoubleVector out = zeros.blend(ones, mask);
+			out.intoArray(c, i);
+		}
+	
+		// tail
+		for (; i < len; i++) {
+			c[i] = (a[ai + i] == bval) ? 1 : 0;
+		}
+	
+		return c;
+	}
+	
 	
 	public static double[] vectEqualWrite(double bval, double[] a, int ai, int len) {
 		return vectEqualWrite(a, bval, ai, len);
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
new file mode 100644
index 00000000000..b748642171d
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
@@ -0,0 +1,46 @@
+package org.apache.sysds.performance.primitives_vector_api;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+public enum BenchCase {
+    VECT_SUM(
+      "vectSum dense",
+      OutKind.SCALAR_DOUBLE,
+      ctx -> ctx.initDenseA(),
+      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectSum(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.scalarRes;
+             },
+      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectSum(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.vectorRes;},
+      ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+    ),
+  
+    VECT_DIV_ADD(
+      "vectDivAdd dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> ctx.initDenseAandC(),
+      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len),
+      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    );
+    public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE }
+    public final String name;
+    public final java.util.function.Consumer<Ctx> setup;
+    public final java.util.function.Consumer<Ctx> scalar;
+    public final java.util.function.Consumer<Ctx> vector;
+    public final java.util.function.Consumer<Ctx> verify;
+    public final OutKind outKind;
+
+  
+    BenchCase(String name,
+              OutKind outKind,
+              java.util.function.Consumer<Ctx> setup,
+              java.util.function.Consumer<Ctx> scalar,
+              java.util.function.Consumer<Ctx> vector,
+              java.util.function.Consumer<Ctx> verify) {
+      this.name = name; this.outKind = outKind; this.setup = setup; this.scalar = scalar; this.vector = vector; this.verify = verify;
+    }
+  }
+  
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
new file mode 100644
index 00000000000..12af0df27e1
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java
@@ -0,0 +1,63 @@
+package org.apache.sysds.performance.primitives_vector_api;
+
+
+public class BenchUtil {
+    public static volatile double blackhole;
+  
+    public static void warmup(Runnable r,int iters ) {
+      for (int i = 0; i < iters; i++) r.run();
+    }
+  
+    public static double measure(Runnable r,int iters) {
+      System.gc();
+      long t0 = System.nanoTime();
+      for (int i = 0; i < iters; i++) r.run();
+      long t1 = System.nanoTime();
+      return (t1 - t0) / (double) iters;
+    }
+
+    // ---- args helpers ----
+    public static int argInt(String[] args, String key, int def) {
+        for (int i = 0; i < args.length - 1; i++)
+        if (args[i].equals(key))
+            return Integer.parseInt(args[i + 1]);
+        return def;
+    }
+
+    public static String argStr(String[] args, String key, String def) {
+        for (int i = 0; i < args.length - 1; i++)
+        if (args[i].equals(key))
+            return args[i + 1];
+        return def;
+    }
+  
+    public static double maxAbsDiff(double[] a, double[] b) {
+      double m = 0;
+      for (int i = 0; i < a.length; i++)
+        m = Math.max(m, Math.abs(a[i] - b[i]));
+      return m;
+    }
+  
+    public static void printScalarDouble(String name,
+        double nsScalar, double nsVector,
+        double scalarRes, double vectorRes,
+        boolean ok) {
+  
+      double speedup = nsScalar / nsVector;
+      System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " +
+                        "s=%.6g v=%.6g | %s%n",
+          name, nsScalar, nsVector, speedup, scalarRes, vectorRes, ok ? "OK" : "FAIL");
+    }
+  
+    public static void printArrayDiff(String name,
+        double nsScalar, double nsVector,
+        double maxDiff,
+        boolean ok) {
+  
+      double speedup = nsScalar / nsVector;
+      System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " +
+                        "maxDiff=%.6g | %s%n",
+          name, nsScalar, nsVector, speedup, maxDiff, ok ? "OK" : "FAIL");
+    }
+  }
+  
\ No newline at end of file
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
new file mode 100644
index 00000000000..84c66266c8f
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
@@ -0,0 +1,33 @@
+package org.apache.sysds.performance.primitives_vector_api;
+
+public class Ctx {
+    public int len;
+    public double[] a, cInit, cScalar, cVector;
+    public double bval;
+  
+    public double scalarRes, vectorRes;
+    public double maxDiff;
+    public boolean ok;
+  
+    void initDenseA() {
+      a = new double[len];
+      for (int i = 0; i < len; i++) a[i] = (i % 10) - 5;
+    }
+  
+    void initDenseAandC() {
+      initDenseA();
+      cInit = new double[len];
+      for (int i = 0; i < len; i++) cInit[i] = (i % 10) - 5;
+      cScalar = java.util.Arrays.copyOf(cInit, len);
+      cVector = java.util.Arrays.copyOf(cInit, len);
+      bval = 1.234567;
+    }
+  
+    void resetC() {
+      if (cInit != null) {
+        System.arraycopy(cInit, 0, cScalar, 0, len);
+        System.arraycopy(cInit, 0, cVector, 0, len);
+      }
+    }
+  }
+  
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
new file mode 100644
index 00000000000..c478c7edfb7
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
@@ -0,0 +1,43 @@
+package org.apache.sysds.performance.primitives_vector_api;
+
+public class PrimitivePerfSuite {
+  public static void main(String[] args) {
+    //int len = BenchUtil.argInt(args, "--len", 262_144);
+    int len = BenchUtil.argInt(args, "--len", 1_000_000);
+    int warmup = BenchUtil.argInt(args, "--warmup", 10_000);
+    int iters = BenchUtil.argInt(args, "--iters", 2000);
+    String filter = BenchUtil.argStr(args, "--filter", "");
+
+    for (BenchCase bc : BenchCase.values()) {
+      if (!filter.isEmpty() && !bc.name.contains(filter)) continue;
+
+      Ctx ctx = new Ctx();
+      ctx.len = len;
+      bc.setup.accept(ctx);
+
+      // warm scalar
+      ctx.resetC(); 
+      BenchUtil.warmup(() -> {bc.scalar.accept(ctx); },warmup);
+      ctx.resetC();
+      double nsScalar = BenchUtil.measure(() -> { bc.scalar.accept(ctx); }, iters);
+
+      // warm vector
+      ctx.resetC(); 
+      BenchUtil.warmup(() -> {bc.vector.accept(ctx); }, warmup);
+      ctx.resetC();
+      double nsVector = BenchUtil.measure(() -> {bc.vector.accept(ctx); }, iters);
+
+      // verify once
+      ctx.resetC(); bc.scalar.accept(ctx);
+      bc.vector.accept(ctx);
+      bc.verify.accept(ctx);
+
+      if (bc.outKind == BenchCase.OutKind.SCALAR_DOUBLE) {
+        BenchUtil.printScalarDouble(bc.name, nsScalar, nsVector, ctx.scalarRes, ctx.vectorRes, ctx.ok);
+      } else {
+        BenchUtil.printArrayDiff(bc.name, nsScalar, nsVector, ctx.maxDiff, ctx.ok);
+      }
+      
+    }
+  }
+}
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java
new file mode 100644
index 00000000000..4c2bd230349
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java
@@ -0,0 +1,36 @@
+package org.apache.sysds.test.component.codegen.performance_tests;
+
+
+public class benchUtil {
+
+    public static void warmup(Runnable r, int iters) {
+        for (int i = 0; i < iters; i++) {
+            r.run();
+        }
+    }
+
+    /** returns ns per call */
+    public static double measure(Runnable r, int iters) {
+        long t0 = System.nanoTime();
+        for (int i = 0; i < iters; i++) {
+            r.run();
+        }
+        long t1 = System.nanoTime();
+        return (t1 - t0) / (double) iters;
+    }
+
+    public static double checksum(double[] x) {
+        double s = 0;
+        for (double v : x) s += v;
+        return s;
+    }
+
+    public static double maxAbsDiff(double[] a, double[] b) {
+        double m = 0;
+        for (int i = 0; i < a.length; i++) {
+            m = Math.max(m, Math.abs(a[i] - b[i]));
+        }
+        return m;
+    }
+}
+
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java
new file mode 100644
index 00000000000..c2cd8f068f4
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java
@@ -0,0 +1,95 @@
+package org.apache.sysds.test.component.codegen.performance_tests;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+
+public class rowMaxsVectMultTest {
+    public static void main(String[] args) {
+        int len = 1_000_000;
+        double[] a = new double[len];
+        for (int i = 0; i < len; i++)
+            a[i] = (i % 10) - 5;
+        double[] b = new double[len];
+        for (int i = 0; i < len; i++)
+            b[i] = (i % 10) - 5;
+
+        float[] a_f = new float[len];
+        for (int i = 0; i < len; i++)
+            a_f[i] = (i % 10) - 5;
+        float[] b_f = new float[len];
+        for (int i = 0; i < len; i++)
+            b_f[i] = (i % 10) - 5;
+
+
+
+        // warm up
+        for (int i = 0; i < 20_000; i++) {
+            LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len);
+            LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len);
+            LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len);
+            LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len);
+            LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len);
+        }
+
+        // measure
+        long t2_0 = System.nanoTime();
+        double s2 = 0;
+        for (int i = 0; i < 2000; i++)
+            s2 += LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len);
+        long t2_1 = System.nanoTime();
+
+        System.out.println("Vector MaxVal=" + s2/2000);
+        System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0));
+        
+        // measure
+        long t1_0 = System.nanoTime();
+        double s1 = 0;
+        for (int i = 0; i < 2000; i++)
+            s1 += LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len);
+        long t1_1 = System.nanoTime();
+
+        System.out.println("Scalar MaxVal Sum=" + s1/2000);
+        System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0));
+
+
+        // measure
+        long t3_0 = System.nanoTime();
+        double s3 = 0;
+        for (int i = 0; i < 2000; i++)
+            s3 += LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len);
+        long t3_1 = System.nanoTime();
+
+        System.out.println("Vector Float MaxVal=" + s3/2000);
+        System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0));
+
+        // measure
+        long t4_0 = System.nanoTime();
+        double s4 = 0;
+        for (int i = 0; i < 2000; i++)
+            s4 += LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len);
+        long t4_1 = System.nanoTime();
+
+        System.out.println("Scalar Float MaxVal=" + s4/2000);
+        System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0));
+
+        // measure
+        long t5_0 = System.nanoTime();
+        double s5 = 0;
+        for (int i = 0; i < 2000; i++)
+            s5 += LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len);
+        long t5_1 = System.nanoTime();
+
+        System.out.println("Vector 2acc MaxVal=" + s5/2000);
+        System.out.println("Time per call (ns): " + ((t5_1 - t5_0) / 2000.0));
+
+    
+
+    }
+}
+/* 
+Scalar Sum=-1.0E9
+Time per call (ns): 142774.5625
+Vector Sum=-1.0E9
+Time per call (ns): 468854.25
+Vector Float Sum=-1.0E9
+Time per call (ns): 274727.3545
+*/
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java
new file mode 100644
index 00000000000..a43496d6a8d
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java
@@ -0,0 +1,100 @@
+
+package org.apache.sysds.test.component.codegen.performance_tests;
+import java.util.Arrays;
+
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+
+public class vectDivAddTest { // manual micro-benchmark: scalar vs. Vector API vectDivAdd kernels
+    public static void main(String[] args) {
+        //final int len = 32_768;
+        final int len = 262_144;
+        //final int len = 1_000_000;
+
+        final double[] a = new double[len];
+        final double[] cInit = new double[len];
+
+        for (int i = 0; i < len; i++) {
+            a[i] = (i % 10) - 5;
+            cInit[i] = (i % 10) - 5;
+        }
+
+        final double bval = 1.234567; // NOT 1.0
+
+        double[] cScalar = Arrays.copyOf(cInit, len);
+        double[] cVector = Arrays.copyOf(cInit, len);
+        double[] cVectorPureDiv = Arrays.copyOf(cInit, len);
+
+        // Warm up scalar only
+        for (int i = 0; i < 200; i++) {
+            LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len);
+        }
+
+        // Warm up vector only
+        for (int i = 0; i < 200; i++) {
+            LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len);
+        }
+
+        // Warm up pure div vector only
+        for (int i = 0; i < 200; i++) {
+            LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len);
+        }
+
+        // Reset for measurement
+        cScalar = Arrays.copyOf(cInit, len);
+
+        // Measure scalar
+        long t0 = System.nanoTime();
+        for (int i = 0; i < 2000; i++) {
+            LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len);
+        }
+        long t1 = System.nanoTime();
+
+        // Reset for measurement
+        cVector = Arrays.copyOf(cInit, len);
+
+        // Measure vector
+        long t2 = System.nanoTime();
+        for (int i = 0; i < 2000; i++) {
+            LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len);
+        }
+        long t3 = System.nanoTime();
+
+        // Compare correctness
+        double maxDiff = 0;
+        double sumScalar = 0, sumVector = 0;
+        for (int i = 0; i < len; i++) {
+            maxDiff = Math.max(maxDiff, Math.abs(cScalar[i] - cVector[i]));
+            sumScalar += cScalar[i];
+            sumVector += cVector[i];
+        }
+
+
+         // Reset for measurement
+         cVectorPureDiv = Arrays.copyOf(cInit, len);
+
+         // Measure vector
+         long t4 = System.nanoTime();
+         for (int i = 0; i < 2000; i++) {
+             LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len);
+         }
+         long t5 = System.nanoTime();
+ 
+         // Compare correctness element-wise against the scalar result;
+         // the difference is folded into the same maxDiff as the vector variant above
+         // (comparing against the checksum sumScalar would make maxDiff meaningless)
+         double sum_Vector_pure_div = 0;
+         for (int i = 0; i < len; i++) {
+             maxDiff = Math.max(maxDiff, Math.abs(cScalar[i] - cVectorPureDiv[i]));
+             sum_Vector_pure_div += cVectorPureDiv[i];
+         }
+
+        System.out.println("Scalar time per call (ns): " + ((t1 - t0) / 2000.0));
+        System.out.println("Vector time per call (ns): " + ((t3 - t2) / 2000.0));
+        System.out.println("pure vector div time per call (ns): " + ((t5 - t4) / 2000.0));
+        System.out.println("maxDiff: " + maxDiff);
+        System.out.println("checksum scalar: " + sumScalar);
+        System.out.println("checksum vector: " + sumVector);
+        System.out.println("checksum pure vector div : " + sum_Vector_pure_div);
+    }
+}
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java
new file mode 100644
index 00000000000..be5666a6847
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java
@@ -0,0 +1,61 @@
+
+package org.apache.sysds.test.component.codegen.performance_tests;
+import java.util.Arrays;
+
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+
+public class vectEqualWriteTest {
+    public static void main(String[] args) {
+        //final int len = 32_768;
+        //final int len = 262_144;
+        final int len = 1_000_000;
+        //final int len = 1_000_000;
+
+        final double[] aInit = new double[len];
+
+        for (int i = 0; i < len; i++) {
+            aInit[i] = (i % 10) - 5;
+        }
+
+        final double bval = 1.234567; // NOT 1.0
+
+        double[] aScalar = Arrays.copyOf(aInit, len);
+        double[] aVector = Arrays.copyOf(aInit, len);
+
+        // Warm up scalar only
+        for (int i = 0; i < 200; i++) {
+            LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len);
+        }
+
+        // Warm up vector only
+        for (int i = 0; i < 200; i++) {
+            LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len);
+        }
+
+        // Reset for measurement
+        aScalar = Arrays.copyOf(aInit, len);
+
+        // Measure scalar
+        long t0 = System.nanoTime();
+        for (int i = 0; i < 2000; i++) {
+            LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len);
+        }
+        long t1 = System.nanoTime();
+        System.out.println("Scalar");
+        System.out.println("Time per call (ns): " + ((t1- t0) / 2000.0));
+        
+
+        // Reset for measurement
+        aVector = Arrays.copyOf(aInit, len);
+
+        // Measure vector
+        long t2 = System.nanoTime();
+        for (int i = 0; i < 2000; i++) {
+            LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len);
+        }
+        long t3 = System.nanoTime();
+        System.out.println("Vector");
+        System.out.println("Time per call (ns): " + ((t3- t2) / 2000.0));
+    }
+}
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java
new file mode 100644
index 00000000000..90fb36192c8
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java
@@ -0,0 +1,74 @@
+package org.apache.sysds.test.component.codegen.performance_tests;
+import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
+
+
+public class vectSumTest {
+    // Ad-hoc timing harness: runs the four LibSpoofPrimitives sum variants on the same
+    // 1M-element input and prints, per variant, the accumulated sum and ns per call.
+    public static void main(String[] args) {
+        int len = 1_000_000;
+        double[] a = new double[len];
+        for (int i = 0; i < len; i++)
+            a[i] = (i % 10) - 5;
+        float[] a_f = new float[len];
+        for (int i = 0; i < len; i++)
+            a_f[i] = (i % 10) - 5;
+
+        // warm up
+        for (int i = 0; i < 20_000; i++) {
+            LibSpoofPrimitives.vectSum(a, 0, len);
+            LibSpoofPrimitives.scalarvectSum(a, 0, len);
+            LibSpoofPrimitives.vectSumFloat(a_f, 0, len);
+            LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len);
+        }
+
+        // measure scalar double
+        long t2_0 = System.nanoTime();
+        double s2 = 0;
+        for (int i = 0; i < 2000; i++)
+            s2 += LibSpoofPrimitives.scalarvectSum(a, 0, len);
+        long t2_1 = System.nanoTime();
+
+        System.out.println("Scalar Sum=" + s2);
+        System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0));
+
+        // measure vector double
+        long t1_0 = System.nanoTime();
+        double s1 = 0;
+        for (int i = 0; i < 2000; i++)
+            s1 += LibSpoofPrimitives.vectSum(a, 0, len);
+        long t1_1 = System.nanoTime();
+
+        System.out.println("Vector Sum=" + s1);
+        System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0));
+
+        // measure vector float
+        long t3_0 = System.nanoTime();
+        double s3 = 0;
+        for (int i = 0; i < 2000; i++)
+            s3 += LibSpoofPrimitives.vectSumFloat(a_f, 0, len);
+        long t3_1 = System.nanoTime();
+
+        System.out.println("Vector Float Sum=" + s3);
+        System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0));
+
+        // measure scalar float
+        long t4_0 = System.nanoTime();
+        double s4 = 0;
+        for (int i = 0; i < 2000; i++)
+            s4 += LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len);
+        long t4_1 = System.nanoTime();
+
+        // print the accumulated sum like the other three sections (was s4/2000)
+        System.out.println("Scalar Float Sum=" + s4);
+        System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0));
+    }
+}
+/* 
+Scalar Sum=-1.0E9
+Time per call (ns): 142774.5625
+Vector Sum=-1.0E9
+Time per call (ns): 468854.25
+Vector Float Sum=-1.0E9
+Time per call (ns): 274727.3545
+*/

From 2c6f30df1a88c90011dcbec73c9d9f057810c370 Mon Sep 17 00:00:00 2001
From: JulianJuelg <julian.juelg@gmx.de>
Date: Fri, 30 Jan 2026 18:40:53 +0100
Subject: [PATCH 2/3] all vector api implementation of dense primitives  a)
 multiplyAdd, b) div, c) aggregations, d) comparisons

---
 .../runtime/codegen/LibSpoofPrimitives.java   | 838 ++++++++++++++----
 .../runtime/matrix/data/LibMatrixMult.java    |  39 +
 .../primitives_vector_api/BenchCase.java      | 377 +++++++-
 .../primitives_vector_api/Ctx.java            |  36 +-
 .../PrimitivePerfSuite.java                   |   3 +-
 5 files changed, 1089 insertions(+), 204 deletions(-)

diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
index 214226497f0..c89c734fa81 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
@@ -37,6 +37,7 @@
 import jdk.incubator.vector.FloatVector;
 import jdk.incubator.vector.VectorOperators;
 import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorMask;
 
 /**
  * This library contains all vector primitives that are used in 
@@ -75,27 +76,15 @@ public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int b
 		return val;
 	}
 
-	public static double scalarrowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) {
-		float val = Float.NEGATIVE_INFINITY;
-		int j=0;
-		for( int i = ai; i < ai+len; i++ )
-			val = Math.max(a[i]*b[j++], val);
-		return val;
-	}
-
 	public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
 		double maxVal = Double.NEGATIVE_INFINITY;
 	
 		int i = 0;
 		int upper = SPECIES.loopBound(len);
 	
-		// vector accumulator for max
 		DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
-	
-		// IMPORTANT:
-		// Your original code uses b[j++] starting at 0 (ignores bi).
-		// I assume that is a bug/oversight, so I use b[bi + i].
-		// If you *must* keep exact old semantics, replace (bi + i) with just i.
+
+		//unrolled vLen-block  (for better instruction-level parallelism)
 		for (; i < upper; i += vLen) {
 			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
 			DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i);
@@ -103,10 +92,9 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int
 			vmax = vmax.max(prod);
 		}
 	
-		// Reduce vector lanes to a scalar max
 		maxVal = vmax.reduceLanes(VectorOperators.MAX);
 	
-		// Tail
+		//rest, not aligned to vLen-blocks
 		for (; i < len; i++) {
 			maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
 		}
@@ -114,44 +102,37 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int
 		return maxVal;
 	}
 
-	public static double rowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) {
-		float maxVal = Float.NEGATIVE_INFINITY;
-	
+	// scalar reference for the indexed case: max of a[k]*b[aix[k]] over k in [ai, ai+len); bi is not read
+	public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+		double best = Double.NEGATIVE_INFINITY;
+		for( int k = ai, end = ai+len; k < end; k++ )
+			best = Math.max(a[k]*b[aix[k]], best);
+		return best;
+	}
+
+	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+		double scalarMax = Double.NEGATIVE_INFINITY;
+
 		int i = 0;
-		int upper = FSPECIES.loopBound(len);
-	
-		// vector accumulator for max
-		FloatVector vmax = FloatVector.broadcast(FSPECIES, Float.NEGATIVE_INFINITY);
-	
-		// IMPORTANT:
-		// Your original code uses b[j++] starting at 0 (ignores bi).
-		// I assume that is a bug/oversight, so I use b[bi + i].
-		// If you *must* keep exact old semantics, replace (bi + i) with just i.
-		for (; i < upper; i += FSPECIES.length()) {
-			FloatVector va = FloatVector.fromArray(FSPECIES, a, ai + i);
-			FloatVector vb = FloatVector.fromArray(FSPECIES, b, bi + i);
-			FloatVector prod = va.mul(vb);
+		int upperBound = SPECIES.loopBound(len);
+		DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+
+		//unrolled vLen-block  (for better instruction-level parallelism)
+		for (; i < upperBound; i += SPECIES.length()) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + i);
+			DoubleVector prod = va.mul(vb);
 			vmax = vmax.max(prod);
 		}
-	
-		// Reduce vector lanes to a scalar max
-		maxVal = vmax.reduceLanes(VectorOperators.MAX);
-	
-		// Tail
+		scalarMax = Math.max(scalarMax, vmax.reduceLanes(VectorOperators.MAX));
+
+		//rest, not aligned to vLen-blocks
 		for (; i < len; i++) {
-			maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
+			double prod = a[ai + i] * b[aix[ai + i]];
+			if (prod > scalarMax)
+				scalarMax = prod;
 		}
-	
-		return maxVal;
-	}
-	
-
-
-	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
-		double val = Double.NEGATIVE_INFINITY;
-		for( int i = ai; i < ai+len; i++ )
-			val = Math.max(a[i]*b[aix[i]], val);
-		return val;
+		return scalarMax;
 	}
 
 	// forwarded calls to LibMatrixMult
@@ -399,110 +380,24 @@ public static double scalarvectSum(double[] a, int ai, int len) {
 		return val; 
 	} 
 
-	public static double scalarvectSumFloat(float[] a, int ai, int len) { 
-		float val = 0;
-		final int bn = len%8;
-		
-		//compute rest
-		for( int i = ai; i < ai+bn; i++ )
-			val += a[ i ];
-		
-		//unrolled 8-block (for better instruction-level parallelism)
-		for( int i = ai+bn; i < ai+len; i+=8 ) {
-			//read 64B cacheline of a, compute cval' = sum(a) + cval
-			val += a[ i+0 ] + a[ i+1 ] + a[ i+2 ] + a[ i+3 ]
-			     + a[ i+4 ] + a[ i+5 ] + a[ i+6 ] + a[ i+7 ];
-		}
-		
-		//scalar result
-		return val; 
-	}
 	public static double vectSum(double[] a, int ai, int len) {
         double sum = 0d;
         int i = 0;
 
         DoubleVector acc = DoubleVector.zero(SPECIES);
-
-		// largest multiple of vLen <= len
         int upperBound = SPECIES.loopBound(len);
 
+		//unrolled vLen-block  (for better instruction-level parallelism)
         for (; i < upperBound; i += SPECIES.length()) {
             DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
             acc = acc.add(v);
         }
-
-        // reduce vector lanes into scalar
-        sum += acc.reduceLanes(VectorOperators.ADD);
-
-        // tail (remaining elements)
-        for (; i < len; i++) {
-            sum += a[ai + i];
-        }
-
-        return sum;
-    }
-
-	public static double rowMaxsVectMultVec2Acc(double[] a, double[] b, int ai, int bi, int len) {
-		int i = 0;
-		int upper = SPECIES.loopBound(len);
-	
-		DoubleVector vmax1 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
-		DoubleVector vmax2 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
-	
-		// step = 2 vectors per iteration
-		int step = vLen * 2;
-	
-		for (; i + step <= upper; i += step) {
-			DoubleVector va1 = DoubleVector.fromArray(SPECIES, a, ai + i);
-			DoubleVector vb1 = DoubleVector.fromArray(SPECIES, b, bi + i);
-			vmax1 = vmax1.max(va1.mul(vb1));
-	
-			DoubleVector va2 = DoubleVector.fromArray(SPECIES, a, ai + i + vLen);
-			DoubleVector vb2 = DoubleVector.fromArray(SPECIES, b, bi + i + vLen);
-			vmax2 = vmax2.max(va2.mul(vb2));
-		}
-	
-		// finish remaining vector loop
-		for (; i < upper; i += vLen) {
-			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
-			DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i);
-			vmax1 = vmax1.max(va.mul(vb));
-		}
-	
-		// combine both accumulators
-		DoubleVector vmax = vmax1.max(vmax2);
-		double maxVal = vmax.reduceLanes(VectorOperators.MAX);
-	
-		// tail
-		for (; i < len; i++) {
-			maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]);
-		}
-	
-		return maxVal;
-	}
-	
-	public static double vectSumFloat(float[] a, int ai, int len) {
-        float sum = 0;
-        int i = 0;
-
-        FloatVector acc = FloatVector.zero(FSPECIES);
-
-		// largest multiple of vLen <= len
-        int upperBound = FSPECIES.loopBound(len);
-
-        for (; i < upperBound; i += FSPECIES.length()) {
-            FloatVector v = FloatVector.fromArray(FSPECIES, a, ai + i);
-            acc = acc.add(v);
-        }
-
-        // reduce vector lanes into scalar
         sum += acc.reduceLanes(VectorOperators.ADD);
 
-        // tail (remaining elements)
+        //rest, not aligned to vLen-blocks
         for (; i < len; i++) {
             sum += a[ai + i];
         }
-
         return sum;
     }
 	
@@ -519,36 +414,93 @@ public static double vectSumsq(double[] avals, int[] aix, int ai, int alen, int
 		return LibMatrixMult.dotProduct(avals, avals, ai, ai, alen);
 	}
 	
-	public static double vectMin(double[] a, int ai, int len) { 
+	public static double scalarvectMin(double[] a, int ai, int len) { 
 		double val = Double.POSITIVE_INFINITY;
 		for( int i = ai; i < ai+len; i++ )
 			val = Math.min(a[i], val);
 		return val; 
 	}
+
+	public static double vectMin(double[] a, int ai, int len) {
+		// lane-wise running minimum, seeded with +Infinity (also the result for len == 0)
+		DoubleVector lanesMin = DoubleVector.broadcast(SPECIES, Double.POSITIVE_INFINITY);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//main loop: fold one vector of a into the running minimum per iteration
+		while (pos < vecEnd) {
+			lanesMin = lanesMin.min(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			pos += vLen;
+		}
+		double result = lanesMin.reduceLanes(VectorOperators.MIN);
+
+		//scalar tail for the elements past the last full vector
+		for (; pos < len; pos++)
+			result = Math.min(result, a[ai + pos]);
+		return result;
+	}
 	
 	public static double vectMin(double[] avals, int[] aix, int ai, int alen, int len) {
 		double val = vectMin(avals, ai, alen);
 		return (alen<len) ? Math.min(val, 0) : val;
 	}
 	
-	public static double vectMax(double[] a, int ai, int len) { 
+	public static double scalarvectMax(double[] a, int ai, int len) { 
 		double val = Double.NEGATIVE_INFINITY;
 		for( int i = ai; i < ai+len; i++ )
 			val = Math.max(a[i], val);
 		return val; 
 	} 
+
+	public static double vectMax(double[] a, int ai, int len) {
+		// lane-wise running maximum, seeded with -Infinity (also the result for len == 0)
+		DoubleVector lanesMax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//main loop: fold one vector of a into the running maximum per iteration
+		while (pos < vecEnd) {
+			lanesMax = lanesMax.max(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			pos += vLen;
+		}
+		double result = lanesMax.reduceLanes(VectorOperators.MAX);
+
+		//scalar tail for the elements past the last full vector
+		for (; pos < len; pos++)
+			result = Math.max(a[ai + pos], result);
+		return result;
+	}
 	
 	public static double vectMax(double[] avals, int[] aix, int ai, int alen, int len) {
 		double val = vectMax(avals, ai, alen);
 		return (alen<len) ? Math.max(val, 0) : val;
 	}
 	
-	public static double vectCountnnz(double[] a, int ai, int len) { 
+	public static double scalarvectCountnnz(double[] a, int ai, int len) { 
 		int count = 0;
 		for( int i = ai; i < ai+len; i++ )
 			count += (a[i] != 0) ? 1 : 0;
 		return count;
 	} 
+	public static double vectCountnnz(double[] a, int ai, int len) {
+		// counts the entries of a[ai, ai+len) that are != 0; i is always relative to ai
+		int count = 0;
+		int i = 0;
+		int upperBound = SPECIES.loopBound(len);
+		DoubleVector vzero = DoubleVector.zero(SPECIES);
+		//vectorized main loop: compare one vector against zero and count the set mask lanes
+		for (; i < upperBound; i += vLen) {
+			DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+			VectorMask<Double> nz = v.compare(VectorOperators.NE, vzero);
+			count += nz.trueCount();
+		}
+
+		//rest, not aligned to vLen-blocks (must apply the ai offset too; a[i] read the wrong range)
+		for(;i<len;i++){
+			count += (a[ai + i] != 0) ? 1 : 0;
+		}
+		return count;
+	}
 	
 	public static double vectCountnnz(double[] avals, int[] aix, int ai, int alen, int len) {
 		//pure meta data operation
@@ -570,39 +522,27 @@ public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai,
 			c[ci] +=  a[j] / bval;
 	}
 
-	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		// Handle trivial case
-		if (len <= 0) return;
-
-		// Preferred SIMD width for the current CPU (AVX2/AVX-512/etc.)
-		final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { 
+		final double inv = 1.0 / bval; 
+		final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); 
+		int i = 0; final int upperBound = SPECIES.loopBound(len); 
 
-		// Hoist reciprocal (1 division instead of len divisions)
-		final double inv = 1.0 / bval;
-		final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv);
-
-		int i = 0;
-		final int upperBound = SPECIES.loopBound(len);
-
-		// Vector loop
-		for (; i < upperBound; i += SPECIES.length()) {
-			// load a and c
-			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
-			DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i);
-
-			// vc += va * inv
-			vc = vc.add(va.mul(vinv));
-
-			// store result back to c
-			vc.intoArray(c, ci + i);
-		}
-
-		// Tail loop
-		for (; i < len; i++) {
+		//unrolled vLen-block (for better instruction-level parallelism) 
+		for (; i < upperBound; i += vLen) { 
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); 
+			DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); 
+			vc = vc.add(va.mul(vinv)); vc.intoArray(c, ci + i); 
+		} 
+		
+		//rest, not aligned to vLen-blocks 
+		for (; i < len; i++) { 
 			c[ci + i] += a[ai + i] * inv;
-		}
+		} 
 	}
 
+
+
+	// for comparison
 	public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		if (len <= 0) return;
 	
@@ -628,42 +568,172 @@ public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai
 	
 
 	
-	public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
+	public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  bval / a[j];
 	}
 
-	public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+	public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
+		// c[ci+k] += bval / a[ai+k] for k in [0, len)
+		final DoubleVector numer = DoubleVector.broadcast(SPECIES, bval);
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		//main loop: load, divide, accumulate and store one vector per iteration
+		for (; k < vecEnd; k += vLen) {
+			DoubleVector quot = numer.div(DoubleVector.fromArray(SPECIES, a, ai + k));
+			DoubleVector.fromArray(SPECIES, c, ci + k).add(quot).intoArray(c, ci + k);
+		}
+
+		//scalar tail for the elements past the last full vector
+		while (k < len) {
+			c[ci + k] += bval / a[ai + k];
+			k++;
+		}
+	}
+
+
+	public static void scalarvectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		for( int j = ai; j < ai+alen; j++ )
 			c[ci + aix[j]] += a[j] / bval;
 	}
+
+	// sparse: c[ci + aix[k]] += a[k] / bval for k in [ai, ai+alen); len is not read
+	public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+		// divide directly: a*(1.0/bval) rounds twice and can differ from a/bval
+		// (49.0*(1.0/49.0) != 1.0), which would diverge from scalarvectDivAdd
+		int i = 0;
+		int upperBound = SPECIES.loopBound(alen);
+		DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+
+		//vectorized main loop: divide one vector of values per iteration
+		for (; i < upperBound; i += vLen) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			DoubleVector vcontrib = va.div(vb);
+
+			// scatter-add lane-by-lane
+			for (int lane = 0; lane < vLen; lane++) {
+				int idx = ci + aix[ai + i + lane];
+				c[idx] += vcontrib.lane(lane);
+			}
+		}
+
+		//rest, not aligned to vLen-blocks
+		for(; i<alen; i++){
+			c[ci + aix[ai + i]] += a[ai + i] / bval;
+		}
+	}
+
 	
-	public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+	public static void scalarvectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		for( int j = ai; j < ai+alen; j++ )
 			c[ci + aix[j]] += bval / a[j];
 	}
+
+	//sparse: c[ci + aix[k]] += bval / a[k] for k in [ai, ai+alen)
+	//(the len parameter is not read)
+	public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+		final DoubleVector numer = DoubleVector.broadcast(SPECIES, bval);
+		final int vecEnd = SPECIES.loopBound(alen);
+		int k = 0;
+
+		//main loop: divide one vector of values, then scatter the lanes into c
+		while (k < vecEnd) {
+			DoubleVector quot = numer.div(DoubleVector.fromArray(SPECIES, a, ai + k));
+
+			//aix supplies the offset into c for each value; add lane by lane
+			for (int lane = 0; lane < vLen; lane++)
+				c[ci + aix[ai + k + lane]] += quot.lane(lane);
+			k += vLen;
+		}
+
+		//scalar tail for the values past the last full vector
+		for (; k < alen; k++) {
+			c[ci + aix[ai + k]] += bval / a[ai + k];
+		}
+	}
+
 	
-	public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
+	public static double[] scalarvectDivWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++)
 			c[j] = a[ai+j] / bval;
 		return c;
 	}
+
+	public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
+		double[] c = allocVector(len, false);
+		// true division: a*(1.0/bval) rounds twice and can differ from a/bval (49.0*(1.0/49.0) != 1.0)
+		final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
+		int i = 0;
+		int upper = SPECIES.loopBound(len);
+
+		//vectorized main loop: c[i..] = a[ai+i..] / bval, one vector per iteration
+		for (; i < upper; i += vLen) {
+			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
+			va.div(vb).intoArray(c, i);
+		}
+
+		//rest, not aligned to vLen-blocks
+		for (; i < len; i++) {
+			c[i] = a[ai + i] / bval;
+		}
+		return c;
+	}
+
 	
-	public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
+	public static double[] scalarvectDivWrite(double bval, double[] a, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++)
 			c[j] = bval / a[ai + j];
 		return c;
 	}
+
+	public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
+		// returns c with c[k] = bval / a[ai+k] for k in [0, len)
+		final double[] out = allocVector(len, false);
+		final DoubleVector numer = DoubleVector.broadcast(SPECIES, bval);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//main loop: one vector division and store per iteration
+		int k = 0;
+		while (k < vecEnd) {
+			numer.div(DoubleVector.fromArray(SPECIES, a, ai + k)).intoArray(out, k);
+			k += vLen;
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++)
+			out[k] = bval / a[ai + k];
+		return out;
+	}
 	
-	public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] scalarvectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++)
 			c[j] = a[ai + j] / b[bi + j];
 		return c;
 	}
 
+	public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+		// returns c with c[k] = a[ai+k] / b[bi+k] for k in [0, len)
+		final double[] out = allocVector(len, false);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//main loop: element-wise division of one vector pair per iteration
+		int k = 0;
+		for (; k < vecEnd; k += vLen) {
+			DoubleVector numer = DoubleVector.fromArray(SPECIES, a, ai + k);
+			DoubleVector denom = DoubleVector.fromArray(SPECIES, b, bi + k);
+			numer.div(denom).intoArray(out, k);
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++)
+			out[k] = a[ai + k] / b[bi + k];
+		return out;
+	}
+
 	public static double[] vectDivWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) {
 		double init = (bval != 0) ? 0 : Double.NaN;
 		double[] c = allocVector(len, true, init);
@@ -1731,22 +1801,42 @@ public static double[] vectPow2Write(double[] a, int[] aix, int ai, int alen, in
 	
 	//custom mult2
 	
-	public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
+	public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] +=  a[j] + a[j];
 	}
+
+	public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) { // forwards to LibMatrixMult with a constant factor of 2.0
+		LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len); // presumably c[ci+k] += 2.0*a[ai+k]; callee not shown here - TODO confirm
+	}
 	
 	public static void vectMult2Add(double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		for( int j = ai; j < ai+alen; j++ )
 			c[ci + aix[j]] += a[j] + a[j];
 	}
 	
-	public static double[] vectMult2Write(double[] a, int ai, int len) {
+	public static double[] scalarvectMult2Write(double[] a, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++)
 			c[j] = a[ai] + a[ai];
 		return c;
 	}
+	public static double[] vectMult2Write(double[] a, int ai, int len) { // returns a newly obtained vector c filled by LibMatrixMult
+		double[] c = allocVector(len, false);
+		LibMatrixMult.vectMultiplyWrite(2.0,a,c,ai,0,len); // presumably c[k] = 2.0*a[ai+k], written from offset 0; callee not shown here - TODO confirm
+		return c;
+	}
+	public static double[] vectMult2Write_dedicated(double[] a, int ai, int len) { // NOTE(review): looks like a benchmarking variant of vectMult2Write - confirm it is still needed
+		double[] c = allocVector(len, false);
+		return LibMatrixMult.vectMult2Write(a,c,ai,len); // callee not shown here; presumably fills and returns c - TODO confirm
+	}
+	public static double[] vectMult2Write_dedicated_2(double[] a, int ai, int len) { // NOTE(review): looks like a second benchmarking variant - confirm it is still needed
+		double[] c = allocVector(len, false);
+		return LibMatrixMult.vectMult2Write_dedicated_2(a,c,ai,len); // callee not shown here; presumably fills and returns c - TODO confirm
+	}
+
+	
+	
 	
 	public static double[] vectMult2Write(double[] a, int[] aix, int ai, int alen, int len) {
 		double[] c = allocVector(len, true);
@@ -1835,10 +1925,35 @@ public static double[] vectSigmoidWrite(double[] a, int[] aix, int ai, int alen,
 	
 	//custom vector equal
 	
-	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+	public static void scalarvectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] += (a[j] == bval) ? 1 : 0;
 	}
+	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		// c[ci+k] += (a[ai+k] == bval) ? 1 : 0 for k in [0, len)
+		final DoubleVector needle = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector zero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		//main loop: compare one vector against bval and add the 0/1 result to c
+		while (k < vecEnd) {
+			DoubleVector src = DoubleVector.fromArray(SPECIES, a, ai + k);
+			DoubleVector acc = DoubleVector.fromArray(SPECIES, c, ci + k);
+
+			//blend picks 1.0 in the lanes where the mask is set, 0.0 elsewhere
+			VectorMask<Double> hit = src.compare(VectorOperators.EQ, needle);
+			acc.add(zero.blend(one, hit)).intoArray(c, ci + k);
+			k += vLen;
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++) {
+			c[ci + k] += (a[ai + k] == bval) ? 1.0 : 0.0;
+		}
+	}
+	
 	
 	public static void vectEqualAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		vectEqualAdd(a, bval, c, ai, ci, len);
@@ -1865,28 +1980,24 @@ public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int
 	}
 	public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
-	
 		int i = 0;
 		int upper = SPECIES.loopBound(len);
-	
 		DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
 		DoubleVector zeros = DoubleVector.zero(SPECIES);
 		DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0);
 	
+		//unrolled vLen-block  (for better instruction-level parallelism)
 		for (; i < upper; i += vLen) {
 			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
 			var mask = va.compare(VectorOperators.EQ, vb);
-	
-			// out = (va == vb) ? 1.0 : 0.0
 			DoubleVector out = zeros.blend(ones, mask);
 			out.intoArray(c, i);
 		}
 	
-		// tail
+		//rest, not aligned to vLen-blocks
 		for (; i < len; i++) {
 			c[i] = (a[ai + i] == bval) ? 1 : 0;
 		}
-	
 		return c;
 	}
 	
@@ -1895,13 +2006,37 @@ public static double[] vectEqualWrite(double bval, double[] a, int ai, int len)
 		return vectEqualWrite(a, bval, ai, len);
 	}
 	
-	public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] == b[bi]) ? 1 : 0;
 		return c;
 	}
 
+	public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+		// returns c with c[k] = (a[ai+k] == b[bi+k]) ? 1 : 0 for k in [0, len)
+		final double[] out = allocVector(len, false);
+		final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector zero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//main loop: lane-wise equality of one vector pair, stored as 0/1
+		int k = 0;
+		while (k < vecEnd) {
+			DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+			DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+			VectorMask<Double> same = lhs.compare(VectorOperators.EQ, rhs);
+			zero.blend(one, same).intoArray(out, k);
+			k += vLen;
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++) {
+			out[k] = (a[ai + k] == b[bi + k]) ? 1.0 : 0.0;
+		}
+		return out;
+	}
+
 	public static double[] vectEqualWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) {
 		double init = (bval == 0) ? 1 : 0;
 		double[] c = allocVector(len, true, init);
@@ -1931,10 +2066,33 @@ public static double[] vectEqualWrite(double[] a, double[] b, int ai, int[] bix,
 	
 	//custom vector not equal
 	
-	public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+	public static void scalarvectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] += (a[j] != bval) ? 1 : 0;
 	}
+	public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		// c[ci+k] += (a[ai+k] != bval) ? 1 : 0 for k in [0, len)
+		final DoubleVector needle = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector zero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		//main loop: compare one vector against bval and add the 0/1 result to c
+		while (k < vecEnd) {
+			DoubleVector src = DoubleVector.fromArray(SPECIES, a, ai + k);
+			DoubleVector acc = DoubleVector.fromArray(SPECIES, c, ci + k);
+
+			VectorMask<Double> differs = src.compare(VectorOperators.NE, needle);
+			acc.add(zero.blend(one, differs)).intoArray(c, ci + k);
+			k += vLen;
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++) {
+			c[ci + k] += (a[ai + k] != bval) ? 1.0 : 0.0;
+		}
+	}
 	
 	public static void vectNotequalAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		vectNotequalAdd(a, bval, c, ai, ci, len);
@@ -1953,24 +2111,74 @@ public static void vectNotequalAdd(double bval, double[] a, double[] c, int[] ai
 		vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) {
+	public static double[] scalarvectNotequalWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++)
 			c[j] = (a[ai] != bval) ? 1 : 0;
 		return c;
 	}
+
+	public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) {
+		// returns c with c[k] = (a[ai+k] != bval) ? 1 : 0 for k in [0, len)
+		final double[] out = allocVector(len, false);
+		final DoubleVector needle = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector zero = DoubleVector.zero(SPECIES);
+
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//main loop: lane-wise inequality against bval, stored as 0/1
+		int k = 0;
+		while (k < vecEnd) {
+			DoubleVector src = DoubleVector.fromArray(SPECIES, a, ai + k);
+			VectorMask<Double> differs = src.compare(VectorOperators.NE, needle);
+			zero.blend(one, differs).intoArray(out, k);
+			k += vLen;
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++) {
+			out[k] = (a[ai + k] != bval) ? 1.0 : 0.0;
+		}
+		return out;
+	}
 	
 	public static double[] vectNotequalWrite(double bval, double[] a, int ai, int len) {
 		return vectNotequalWrite(a, bval, ai, len);
 	}
 	
-	public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] scalarvectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] != b[bi]) ? 1 : 0;
 		return c;
 	}
 
+	public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+		// returns c with c[k] = (a[ai+k] != b[bi+k]) ? 1 : 0 for k in [0, len)
+		final double[] out = allocVector(len, false);
+		final DoubleVector one = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector zero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+
+		//main loop: lane-wise inequality of one vector pair, stored as 0/1
+		int k = 0;
+		while (k < vecEnd) {
+			DoubleVector lhs = DoubleVector.fromArray(SPECIES, a, ai + k);
+			DoubleVector rhs = DoubleVector.fromArray(SPECIES, b, bi + k);
+
+			VectorMask<Double> differs = lhs.compare(VectorOperators.NE, rhs);
+			zero.blend(one, differs).intoArray(out, k);
+			k += vLen;
+		}
+
+		//scalar tail for the elements past the last full vector
+		for (; k < len; k++) {
+			out[k] = (a[ai + k] != b[bi + k]) ? 1.0 : 0.0;
+		}
+		return out;
+	}
+
 	public static double[] vectNotequalWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) {
 		double init = (bval != 0) ? 1 : 0;
 		double[] c = allocVector(len, true, init);
@@ -1999,10 +2207,34 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int[] b
 	
 	//custom vector less
 	
-	public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+	public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] += (a[j] < bval) ? 1 : 0;
 	}
+	public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		// accumulating indicator: c[ci+k] += 1.0 wherever a[ai+k] < bval, and += 0.0 otherwise
+		final DoubleVector vThreshold = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			final DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + k);
+			// per-lane increment: 1.0 where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.LT, vThreshold);
+			vc.add(vZero.blend(vOne, hit)).intoArray(c, ci + k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[ci + k] += (a[ai + k] < bval) ? 1.0 : 0.0;
+		}
+	}
 	
 	public static void vectLessAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		vectGreaterequalAdd(a, bval, c, ai, ci, len);
@@ -2021,24 +2253,81 @@ public static void vectLessAdd(double bval, double[] a, double[] c, int[] aix, i
 		vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] vectLessWrite(double[] a, double bval, int ai, int len) {
+	public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++)
 			c[j] = (a[ai] < bval) ? 1 : 0;
 		return c;
 	}
+
+
+	public static double[] vectLessWrite(double[] a, double bval, int ai, int len) {
+		// element-wise indicator: result[k] = 1.0 if a[ai+k] < bval, otherwise 0.0
+		double[] c = allocVector(len, false);
+		final DoubleVector vThreshold = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			// pick 1.0 in the lanes where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.LT, vThreshold);
+			vZero.blend(vOne, hit).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = (a[ai + k] < bval) ? 1.0 : 0.0;
+		}
+		return c;
+	}
+
 	
 	public static double[] vectLessWrite(double bval, double[] a, int ai, int len) {
 		return vectGreaterequalWrite(a, bval, ai, len);
 	}
 	
-	public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] < b[bi]) ? 1 : 0;
 		return c;
 	}
 
+	public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
+		// element-wise indicator: result[k] = 1.0 if a[ai+k] < b[bi+k], otherwise 0.0
+		double[] c = allocVector(len, false);
+
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			final DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + k);
+			// pick 1.0 in the lanes where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.LT, vb);
+			vZero.blend(vOne, hit).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = (a[ai + k] < b[bi + k]) ? 1.0 : 0.0;
+		}
+		return c;
+	}
+
 	public static double[] vectLessWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) {
 		double init = (bval > 0) ? 1 : 0;
 		double[] c = allocVector(len, true, init);
@@ -2067,10 +2356,35 @@ public static double[] vectLessWrite(double[] a, double[] b, int ai, int[] bix,
 	
 	//custom vector less equal
 	
-	public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+	public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] += (a[j] <= bval) ? 1 : 0;
 	}
+
+	public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		// accumulating indicator: c[ci+k] += 1.0 wherever a[ai+k] <= bval, and += 0.0 otherwise
+		final DoubleVector vThreshold = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			final DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + k);
+			// per-lane increment: 1.0 where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.LE, vThreshold);
+			vc.add(vZero.blend(vOne, hit)).intoArray(c, ci + k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[ci + k] += (a[ai + k] <= bval) ? 1.0 : 0.0;
+		}
+	}
 	
 	public static void vectLessequalAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		vectGreaterAdd(a, bval, c, ai, ci, len);
@@ -2089,24 +2403,78 @@ public static void vectLessequalAdd(double bval, double[] a, double[] c, int[] a
 		vectGreaterAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) {
+	public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++)
 			c[j] = (a[ai] <= bval) ? 1 : 0;
 		return c;
 	}
+	public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) {
+		// element-wise indicator: result[k] = 1.0 if a[ai+k] <= bval, otherwise 0.0
+		double[] c = allocVector(len, false);
+		final DoubleVector vThreshold = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			// pick 1.0 in the lanes where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.LE, vThreshold);
+			vZero.blend(vOne, hit).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = (a[ai + k] <= bval) ? 1.0 : 0.0;
+		}
+		return c;
+	}
 	
 	public static double[] vectLessequalWrite(double bval, double[] a, int ai, int len) {
 		return vectGreaterWrite(a, bval, ai, len);
 	}
 	
-	public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] <= b[bi]) ? 1 : 0;
 		return c;
 	}
 
+	public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+		// element-wise indicator: result[k] = 1.0 if a[ai+k] <= b[bi+k], otherwise 0.0
+		double[] c = allocVector(len, false);
+
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			final DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + k);
+			// pick 1.0 in the lanes where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.LE, vb);
+			vZero.blend(vOne, hit).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = (a[ai + k] <= b[bi + k]) ? 1.0 : 0.0;
+		}
+		return c;
+	}
+
 	public static double[] vectLessequalWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) {
 		double init = (bval >= 0) ? 1 : 0;
 		double[] c = allocVector(len, true, init);
@@ -2135,10 +2503,35 @@ public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int[]
 
 	//custom vector greater
 	
-	public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+	public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		for( int j = ai; j < ai+len; j++, ci++)
 			c[ci] += (a[j] > bval) ? 1 : 0;
 	}
+
+	public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		// accumulating indicator: c[ci+k] += 1.0 wherever a[ai+k] > bval, and += 0.0 otherwise
+		final DoubleVector vThreshold = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			final DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + k);
+			// per-lane increment: 1.0 where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.GT, vThreshold);
+			vc.add(vZero.blend(vOne, hit)).intoArray(c, ci + k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[ci + k] += (a[ai + k] > bval) ? 1.0 : 0.0;
+		}
+	}
 	
 	public static void vectGreaterAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		vectLessequalAdd(a, bval, c, ai, ci, len);
@@ -2157,24 +2550,75 @@ public static void vectGreaterAdd(double bval, double[] a, double[] c, int[] aix
 		vectLessequalAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) {
+	public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++)
 			c[j] = (a[ai] > bval) ? 1 : 0;
 		return c;
 	}
+	public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) {
+		// element-wise indicator: result[k] = 1.0 if a[ai+k] > bval, otherwise 0.0
+		double[] c = allocVector(len, false);
+		final DoubleVector vThreshold = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			// pick 1.0 in the lanes where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.GT, vThreshold);
+			vZero.blend(vOne, hit).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = (a[ai + k] > bval) ? 1.0 : 0.0;
+		}
+		return c;
+	}
 	
 	public static double[] vectGreaterWrite(double bval, double[] a, int ai, int len) {
 		return vectLessWrite(a, bval, ai, len);
 	}
 	
-	public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] > b[bi]) ? 1 : 0;
 		return c;
 	}
 
+	public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+		// element-wise indicator: result[k] = 1.0 if a[ai+k] > b[bi+k], otherwise 0.0
+		double[] c = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+
+		// vectorized main part (not unrolled); the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			final DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + k);
+			// pick 1.0 in the lanes where the comparison holds, 0.0 elsewhere
+			final VectorMask<Double> hit = va.compare(VectorOperators.GT, vb);
+			vZero.blend(vOne, hit).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = (a[ai + k] > b[bi + k]) ? 1.0 : 0.0;
+		}
+		return c;
+	}
+
 	public static double[] vectGreaterWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) {
 		double init = (bval < 0) ? 1 : 0;
 		double[] c = allocVector(len, true, init);
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index cfdf21255e7..9417e5134e8 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -4019,6 +4019,45 @@ public static void vectMultiplyWrite( final double[] a, double[] b, double[] c,
 			c[ ci+bix[j+7] ] = a[ ai+bix[j+7] ] * b[ j+7 ];
 		}
 	}
+	// Experimental kernel (tagged "test" by its author): c[k] = a[ai+k] + a[ai+k] for k in [0, len); returns c.
+	public static double[] vectMult2Write(double[] a,double[] c, int ai, int len) {
+		// offsets below vecEnd are covered by whole vectors, the remainder by the scalar tail
+		final int vecEnd = SPECIES.loopBound(len);
+		int k = 0;
+		// vectorized main part; the step vLen is declared outside this hunk
+		while (k < vecEnd) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			va.add(va).intoArray(c, k);
+			k += vLen;
+		}
+
+		// scalar tail for the elements that do not fill a whole vector
+		for (; k < len; k++) {
+			c[k] = a[ai + k] + a[ai + k];
+		}
+		return c;
+	}
+	public static double[] vectMult2Write_dedicated_2(double[] a, double[] c, int ai, int len) {
+		// Writes c[k] = a[ai+k] + a[ai+k] for k in [0, len) and returns c.
+		// The leftover elements are peeled off first, so the vector loop needs no tail.
+		final int head = len % vLen;
+		int k = 0;
+
+		// scalar prefix: the first len % vLen elements
+		for (; k < head; k++) {
+			c[k] = a[ai + k] + a[ai + k];
+		}
+
+		// vector part: starts at head and advances by vLen, so it stops exactly at len
+		while (k < len) {
+			final DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + k);
+			va.add(va).intoArray(c, k);
+			k += vLen;
+		}
+		return c;
+	}
+	
+	
 
 	public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len){
 
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
index b748642171d..9cd67051b1e 100644
--- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
@@ -1,7 +1,11 @@
 package org.apache.sysds.performance.primitives_vector_api;
+import org.apache.sysds.performance.primitives_vector_api.BenchCase.OutKind;
 import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
 
 public enum BenchCase {
+
+    // Aggregations
+
     VECT_SUM(
       "vectSum dense",
       OutKind.SCALAR_DOUBLE,
@@ -13,18 +17,387 @@ public enum BenchCase {
               BenchUtil.blackhole = ctx.vectorRes;},
       ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
     ),
-  
+
+
+    ROWS_MAXS_VECT_MULT( // dense a*b row maximum; both results also go to the blackhole sink, like the other scalar-result cases
+      "rowMaxsVectMult dense",
+      OutKind.SCALAR_DOUBLE,
+      ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len); BenchUtil.blackhole = ctx.scalarRes;},
+      ctx -> {ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len); BenchUtil.blackhole = ctx.vectorRes;},
+      ctx -> {
+        ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
+      }
+    ),
+
+    ROWS_MAXS_VECT_MULT_AIX(
+      "rowMaxsVectMult_aix dense",
+      OutKind.SCALAR_DOUBLE,
+      ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();},
+      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len);
+        BenchUtil.blackhole = ctx.scalarRes;
+            },
+      ctx -> {
+        ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len);
+        BenchUtil.blackhole = ctx.vectorRes;
+            },
+      ctx -> {
+        ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
+      }
+    ),
+    VECT_MIN(
+      "vectMin dense",
+      OutKind.SCALAR_DOUBLE,
+      ctx -> ctx.initDenseA(),
+      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMin(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.scalarRes;
+             },
+      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMin(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.vectorRes;},
+      ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+    ),
+
+    VECT_MAX(
+      "vectMax dense",
+      OutKind.SCALAR_DOUBLE,
+      ctx -> ctx.initDenseA(),
+      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMax(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.scalarRes;
+             },
+      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMax(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.vectorRes;},
+      ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+    ),
+    VECT_COUNTNNZ(
+      "vectCountnnz dense",
+      OutKind.SCALAR_DOUBLE,
+      ctx -> ctx.initDenseA(),
+      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectCountnnz(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.scalarRes;
+             },
+      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectCountnnz(ctx.a, 0, ctx.len);
+              BenchUtil.blackhole = ctx.vectorRes;},
+      ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
+    ),
+
+    // Divisions
+
     VECT_DIV_ADD(
       "vectDivAdd dense",
       OutKind.ARRAY_DOUBLE,
-      ctx -> ctx.initDenseAandC(),
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, // initDenseAandC_mutable() already fills a via initDenseADiv()
       ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len),
       ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
         ctx.ok = ctx.maxDiff <= 1e-9;
       }
+    ),
+
+    VECT_DIV_ADD_2(
+      "vectDivAdd2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len),
+      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+
+    VECT_DIV_ADD_SPARSE(
+      "vectDivAdd sparse",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+
+
+    VECT_DIV_ADD_SPARSE2(
+      "vectDivAdd2 sparse",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+
+    VECT_DIV_WRITE(
+      "vectDivWrite dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_DIV_WRITE2(
+      "vectDivWrite2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ), 
+    VECT_DIV_WRITE3(
+      "vectDivWrite3 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseBDiv();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+
+    // Comparisons
+
+    VECT_EQUAL_WRITE(
+      "vectEqualWrite dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_EQUAL_ADD(
+      "vectEqualAdd dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_EQUAL_WRITE2( // NOTE(review): issues the same (a, bval) calls as VECT_EQUAL_WRITE; only the setup differs
+      "vectEqualWrite2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA(); ctx.initbval();}, // the other *_WRITE2 cases compare a[] against b[] - confirm whether that was intended here
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_NOTEQUAL_ADD(
+      "vectNotequalAdd dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectNotequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> LibSpoofPrimitives.vectNotequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_NOTEQUAL_WRITE(
+      "vectNotequalWrite dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA(); ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_NOTEQUAL_WRITE2(
+      "vectNotequalWrite2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA();  ctx.initDenseB(); ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.b, 0 ,0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.b, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_LESS_ADD(
+      "vectLessAdd dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_LESS_WRITE(
+      "vectLessWrite dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA();  ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.bval, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_LESS_WRITE2(
+      "vectLessWrite2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_LESSEQUAL_ADD(
+      "vectLessequalAdd dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_LESSEQUAL_WRITE(
+      "vectLessequalWrite dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA();  ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.bval, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_LESSEQUAL_WRITE2(
+      "vectLessequalWrite2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.b, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+
+    VECT_GREATER_ADD(
+      "vectGreaterAdd dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
+      ctx -> LibSpoofPrimitives.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_GREATER_WRITE(
+      "vectGreaterWrite dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA();  ctx.initbval();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.bval, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_GREATER_WRITE2(
+      "vectGreaterWrite2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseA(); ctx.initDenseB();},
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.b, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_DIV_ADD_pure( // compares the scalar baseline against the pureDivvectDivAdd variant
+      "vectDivAddpure dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, // initDenseAandC_mutable() already fills a via initDenseADiv()
+      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len),
+      ctx -> LibSpoofPrimitives.pureDivvectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+
+    // vectMult2
+
+    VECT_Mult2_ADD(
+      "vectMult2Add dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); },
+      ctx -> LibSpoofPrimitives.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len),
+      ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_Mult2_WRITE(
+      "vectMult2Write dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); },
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write(ctx.a, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_Mult2_WRITE_DEDICATED(
+      "vectMult2Write_dedicated dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); },
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated(ctx.a, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
+    ),
+    VECT_Mult2_WRITE_DEDICATED2(
+      "vectMult2Write_dedicated2 dense",
+      OutKind.ARRAY_DOUBLE,
+      ctx -> {ctx.initDenseAandC_mutable(); },
+      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated_2(ctx.a, 0,ctx.len),
+      ctx -> {
+        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
+        ctx.ok = ctx.maxDiff <= 1e-9;
+      }
     );
+
+
+
+
+
+
+
     public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE }
     public final String name;
     public final java.util.function.Consumer<Ctx> setup;
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
index 84c66266c8f..d32ca3433e9 100644
--- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java
@@ -2,26 +2,54 @@
 
 public class Ctx {
     public int len;
-    public double[] a, cInit, cScalar, cVector;
+    public double[] a, cInit,b,c, cScalar, cVector;
     public double bval;
   
     public double scalarRes, vectorRes;
     public double maxDiff;
     public boolean ok;
+    public int[] a_int;
   
     void initDenseA() {
       a = new double[len];
       for (int i = 0; i < len; i++) a[i] = (i % 10) - 5;
     }
+    void initDenseB() { // (re)allocates b with len entries
+      b = new double[len];
+      for (int i = 0; i < len; i++) b[i] = (i % 10) - 5; // repeating pattern -5..4, includes zeros
+    }
+    void initDenseC() { // (re)allocates c with len entries
+      c = new double[len];
+      for (int i = 0; i < len; i++) c[i] = (i % 10) - 5; // repeating pattern -5..4, includes zeros
+    }
+    void initDenseAInt() { // (re)allocates a_int with len entries
+      a_int = new int[len];
+      for (int i = 0; i < len; i++) a_int[i] = i; // identity index map: a_int[i] == i
+    }
+    void initbval(){ // fixed scalar operand that the bench cases pass as ctx.bval
+      bval = 1.234567;
+    }
+    void initDenseADiv() { // zero-free fill of a, so a can also serve as a divisor
+      a = new double[len];
+      for (int i = 0; i < len; i++) {
+          a[i] = ((i % 10) + 1);  // Range: 1 to 10 (no zeros)
+      }
+    }
+    void initDenseBDiv() { // zero-free fill of b, so b can serve as a divisor
+      b = new double[len];
+      for (int k = 0; k < b.length; k++) b[k] = 1 + (k % 10); // values 1..10
+    }
+  
   
-    void initDenseAandC() {
-      initDenseA();
+    void initDenseAandC_mutable() { // sets a, cInit, cScalar and cVector; bval is no longer set here
+      initDenseADiv(); // a gets the zero-free pattern 1..10
       cInit = new double[len];
       for (int i = 0; i < len; i++) cInit[i] = (i % 10) - 5;
       cScalar = java.util.Arrays.copyOf(cInit, len);
       cVector = java.util.Arrays.copyOf(cInit, len);
-      bval = 1.234567;
     }
+
+
   
     void resetC() {
       if (cInit != null) {
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
index c478c7edfb7..6dcb6797f30 100644
--- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java
@@ -1,11 +1,12 @@
 package org.apache.sysds.performance.primitives_vector_api;
 
+
 public class PrimitivePerfSuite {
   public static void main(String[] args) {
     //int len = BenchUtil.argInt(args, "--len", 262_144);
     int len = BenchUtil.argInt(args, "--len", 1_000_000);
     int warmup = BenchUtil.argInt(args, "--warmup", 10_000);
-    int iters = BenchUtil.argInt(args, "--iters", 2000);
+    int iters = BenchUtil.argInt(args, "--iters", 100);
     String filter = BenchUtil.argStr(args, "--filter", "");
 
     for (BenchCase bc : BenchCase.values()) {

From a881e55d2e7b51e829156e77b6904d50e59a9ad5 Mon Sep 17 00:00:00 2001
From: JulianJuelg <julian.juelg@gmx.de>
Date: Fri, 30 Jan 2026 23:11:02 +0100
Subject: [PATCH 3/3] Replace codegen primitives with vector api implementation
 if faster; add all primitives implementations to benchmarking suite

---
 .../runtime/codegen/LibSpoofPrimitives.java   | 203 +----
 .../primitives_vector_api/BenchCase.java      | 151 +--
 .../backup_primitives_for_benchmark.java      | 856 ++++++++++++++++++
 .../codegen/performance_tests/benchUtil.java  |  36 -
 .../rowMaxsVectMultTest.java                  |  95 --
 .../performance_tests/vectDivAddTest.java     | 100 --
 .../performance_tests/vectEqualWriteTest.java |  61 --
 .../performance_tests/vectSumTest.java        |  74 --
 8 files changed, 916 insertions(+), 660 deletions(-)
 create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java
 delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java

diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
index c89c734fa81..a66d8f2dcaa 100644
--- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
+++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java
@@ -68,14 +68,6 @@ public class LibSpoofPrimitives
 		@Override protected SparseVectorBuffer initialValue() { return new SparseVectorBuffer(0,0,0); }
 	};
 
-	public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
-		double val = Double.NEGATIVE_INFINITY;
-		int j=0;
-		for( int i = ai; i < ai+len; i++ )
-			val = Math.max(a[i]*b[j++], val);
-		return val;
-	}
-
 	public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
 		double maxVal = Double.NEGATIVE_INFINITY;
 	
@@ -103,14 +95,15 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int
 	}
 
 	// note: parameter bi unused
-	public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
 		double val = Double.NEGATIVE_INFINITY;
 		for( int i = ai; i < ai+len; i++ )
 			val = Math.max(a[i]*b[aix[i]], val);
 		return val;
 	}
 
-	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double rowMaxsVectMult_vector_api(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
 		double scalarMax = Double.NEGATIVE_INFINITY;
 
 		int i = 0;
@@ -360,8 +353,7 @@ public static double[] vectCbindWrite(double[] a, double[] b, int[] aix, int ai,
 	 * @return sum value
 	 */
 
-	// scalar function 
-	public static double scalarvectSum(double[] a, int ai, int len) { 
+	public static double vectSum(double[] a, int ai, int len) { 
 		double val = 0;
 		final int bn = len%8;
 		
@@ -379,8 +371,8 @@ public static double scalarvectSum(double[] a, int ai, int len) {
 		//scalar result
 		return val; 
 	} 
-
-	public static double vectSum(double[] a, int ai, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double vectSum_vector_api(double[] a, int ai, int len) {
         double sum = 0d;
         int i = 0;
 
@@ -445,12 +437,6 @@ public static double vectMin(double[] avals, int[] aix, int ai, int alen, int le
 		return (alen<len) ? Math.min(val, 0) : val;
 	}
 	
-	public static double scalarvectMax(double[] a, int ai, int len) { 
-		double val = Double.NEGATIVE_INFINITY;
-		for( int i = ai; i < ai+len; i++ )
-			val = Math.max(a[i], val);
-		return val; 
-	} 
 
 	public static double vectMax(double[] a, int ai, int len) {
 		int i = 0;
@@ -476,12 +462,7 @@ public static double vectMax(double[] avals, int[] aix, int ai, int alen, int le
 		return (alen<len) ? Math.max(val, 0) : val;
 	}
 	
-	public static double scalarvectCountnnz(double[] a, int ai, int len) { 
-		int count = 0;
-		for( int i = ai; i < ai+len; i++ )
-			count += (a[i] != 0) ? 1 : 0;
-		return count;
-	} 
+
 	public static double vectCountnnz(double[] a, int ai, int len) {	
 		int count = 0;
 		int i = 0;
@@ -516,11 +497,6 @@ public static double vectMean(double[] avals, int[] aix, int ai, int alen, int l
 	}
 	
 	//custom vector div
-	
-	public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] +=  a[j] / bval;
-	}
 
 	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { 
 		final double inv = 1.0 / bval; 
@@ -539,40 +515,8 @@ public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int c
 			c[ci + i] += a[ai + i] * inv;
 		} 
 	}
-
-
-
-	// for comparison
-	public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		if (len <= 0) return;
-	
-		final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
-		final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
-	
-		int i = 0;
-		final int upperBound = SPECIES.loopBound(len);
-	
-		for (; i < upperBound; i += SPECIES.length()) {
-			DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i);
-			DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i);
-	
-			vc = vc.add(va.div(vb));
-	
-			vc.intoArray(c, ci + i);
-		}
-	
-		for (; i < len; i++) {
-			c[ci + i] += a[ai + i] / bval;
-		}
-	}
 	
 
-	
-	public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] +=  bval / a[j];
-	}
-
 	public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
 		int i = 0;
 		int upperBound = SPECIES.loopBound(len);
@@ -593,13 +537,13 @@ public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int c
 	}
 
 
-	public static void scalarvectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+	public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		for( int j = ai; j < ai+alen; j++ )
 			c[ci + aix[j]] += a[j] / bval;
 	}
 
-	// sparse
-	public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static void vectDivAdd_vector_api(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 
 		final double inv = 1.0 / bval;
 		int i = 0;
@@ -625,13 +569,13 @@ public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, in
 	}
 
 	
-	public static void scalarvectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+	public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		for( int j = ai; j < ai+alen; j++ )
 			c[ci + aix[j]] += bval / a[j];
 	}
 
-	//sparse
-	public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static void vectDivAdd_vector_api(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		int i = 0;
 		int upperBound = SPECIES.loopBound(alen);
 		DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
@@ -654,14 +598,15 @@ public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, in
 	}
 
 	
-	public static double[] scalarvectDivWrite(double[] a, double bval, int ai, int len) {
+	public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++)
 			c[j] = a[ai+j] / bval;
 		return c;
 	}
 
-	public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double[] vectDivWrite_vector_api(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		final double inv = 1.0 / bval;
 		final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv);
@@ -682,14 +627,15 @@ public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
 	}
 
 	
-	public static double[] scalarvectDivWrite(double bval, double[] a, int ai, int len) {
+	public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++)
 			c[j] = bval / a[ai + j];
 		return c;
 	}
 
-	public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double[] vectDivWrite_vector_api(double bval, double[] a, int ai, int len) {
 		double[] c = allocVector(len, false);
 		final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval);
 		int i = 0;
@@ -708,14 +654,15 @@ public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
 		return c;
 	}
 	
-	public static double[] scalarvectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++)
 			c[j] = a[ai + j] / b[bi + j];
 		return c;
 	}
 
-	public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double[] vectDivWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		int i = 0;
 		int upper = SPECIES.loopBound(len);
@@ -1800,11 +1747,6 @@ public static double[] vectPow2Write(double[] a, int[] aix, int ai, int alen, in
 	}
 	
 	//custom mult2
-	
-	public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] +=  a[j] + a[j];
-	}
 
 	public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) {
 		LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len);
@@ -1815,29 +1757,13 @@ public static void vectMult2Add(double[] a, double[] c, int[] aix, int ai, int c
 			c[ci + aix[j]] += a[j] + a[j];
 	}
 	
-	public static double[] scalarvectMult2Write(double[] a, int ai, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++)
-			c[j] = a[ai] + a[ai];
-		return c;
-	}
 	public static double[] vectMult2Write(double[] a, int ai, int len) {
 		double[] c = allocVector(len, false);
 		LibMatrixMult.vectMultiplyWrite(2.0,a,c,ai,0,len);
 		return c;
 	}
-	public static double[] vectMult2Write_dedicated(double[] a, int ai, int len) {
-		double[] c = allocVector(len, false);
-		return LibMatrixMult.vectMult2Write(a,c,ai,len);
-	}
-	public static double[] vectMult2Write_dedicated_2(double[] a, int ai, int len) {
-		double[] c = allocVector(len, false);
-		return LibMatrixMult.vectMult2Write_dedicated_2(a,c,ai,len);
-	}
 
 	
-	
-	
 	public static double[] vectMult2Write(double[] a, int[] aix, int ai, int alen, int len) {
 		double[] c = allocVector(len, true);
 		for( int j = ai; j < ai+alen; j++ )
@@ -1925,10 +1851,6 @@ public static double[] vectSigmoidWrite(double[] a, int[] aix, int ai, int alen,
 	
 	//custom vector equal
 	
-	public static void scalarvectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] += (a[j] == bval) ? 1 : 0;
-	}
 	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		int i = 0;
 		int upper = SPECIES.loopBound(len);
@@ -1972,12 +1894,6 @@ public static void vectEqualAdd(double bval, double[] a, double[] c, int[] aix,
 		vectEqualAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++)
-			c[j] = (a[ai] == bval) ? 1 : 0;
-		return c;
-	}
 	public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
 		double[] c = allocVector(len, false);
 		int i = 0;
@@ -2006,12 +1922,6 @@ public static double[] vectEqualWrite(double bval, double[] a, int ai, int len)
 		return vectEqualWrite(a, bval, ai, len);
 	}
 	
-	public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++, bi++)
-			c[j] = (a[ai] == b[bi]) ? 1 : 0;
-		return c;
-	}
 
 	public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
         double[] c = allocVector(len, false);
@@ -2066,10 +1976,6 @@ public static double[] vectEqualWrite(double[] a, double[] b, int ai, int[] bix,
 	
 	//custom vector not equal
 	
-	public static void scalarvectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] += (a[j] != bval) ? 1 : 0;
-	}
 	public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
 		final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
@@ -2110,13 +2016,6 @@ public static void vectNotequalAdd(double[] a, double bval, double[] c, int[] ai
 	public static void vectNotequalAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
-	
-	public static double[] scalarvectNotequalWrite(double[] a, double bval, int ai, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++)
-			c[j] = (a[ai] != bval) ? 1 : 0;
-		return c;
-	}
 
 	public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) {
         double[] c = allocVector(len, false);
@@ -2147,14 +2046,15 @@ public static double[] vectNotequalWrite(double bval, double[] a, int ai, int le
 		return vectNotequalWrite(a, bval, ai, len);
 	}
 	
-	public static double[] scalarvectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] != b[bi]) ? 1 : 0;
 		return c;
 	}
 
-	public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
 		final DoubleVector zeros = DoubleVector.zero(SPECIES);
@@ -2207,10 +2107,6 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int[] b
 	
 	//custom vector less
 	
-	public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] += (a[j] < bval) ? 1 : 0;
-	}
 	public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
 		final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
@@ -2252,14 +2148,6 @@ public static void vectLessAdd(double[] a, double bval, double[] c, int[] aix, i
 	public static void vectLessAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
 		vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
-	
-	public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++)
-			c[j] = (a[ai] < bval) ? 1 : 0;
-		return c;
-	}
-
 
 	public static double[] vectLessWrite(double[] a, double bval, int ai, int len) {
         double[] c = allocVector(len, false);
@@ -2292,13 +2180,6 @@ public static double[] vectLessWrite(double[] a, double bval, int ai, int len) {
 	public static double[] vectLessWrite(double bval, double[] a, int ai, int len) {
 		return vectGreaterequalWrite(a, bval, ai, len);
 	}
-	
-	public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++, bi++)
-			c[j] = (a[ai] < b[bi]) ? 1 : 0;
-		return c;
-	}
 
 	public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
@@ -2355,11 +2236,6 @@ public static double[] vectLessWrite(double[] a, double[] b, int ai, int[] bix,
 	}
 	
 	//custom vector less equal
-	
-	public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] += (a[j] <= bval) ? 1 : 0;
-	}
 
 	public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
@@ -2403,12 +2279,6 @@ public static void vectLessequalAdd(double bval, double[] a, double[] c, int[] a
 		vectGreaterAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++)
-			c[j] = (a[ai] <= bval) ? 1 : 0;
-		return c;
-	}
 	public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) {
         double[] c = allocVector(len, false);
         final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
@@ -2439,13 +2309,6 @@ public static double[] vectLessequalWrite(double[] a, double bval, int ai, int l
 	public static double[] vectLessequalWrite(double bval, double[] a, int ai, int len) {
 		return vectGreaterWrite(a, bval, ai, len);
 	}
-	
-	public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++, bi++)
-			c[j] = (a[ai] <= b[bi]) ? 1 : 0;
-		return c;
-	}
 
 	public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
@@ -2503,11 +2366,6 @@ public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int[]
 
 	//custom vector greater
 	
-	public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
-		for( int j = ai; j < ai+len; j++, ci++)
-			c[ci] += (a[j] > bval) ? 1 : 0;
-	}
-
 	public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
 		final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
 		final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
@@ -2550,12 +2408,6 @@ public static void vectGreaterAdd(double bval, double[] a, double[] c, int[] aix
 		vectLessequalAdd(a, bval, c, aix, ai, ci, alen, len);
 	}
 	
-	public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) {
-		double[] c = allocVector(len, false);
-		for( int j = 0; j < len; j++, ai++)
-			c[j] = (a[ai] > bval) ? 1 : 0;
-		return c;
-	}
 	public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) {
         double[] c = allocVector(len, false);
         final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
@@ -2586,14 +2438,15 @@ public static double[] vectGreaterWrite(double bval, double[] a, int ai, int len
 		return vectLessWrite(a, bval, ai, len);
 	}
 	
-	public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+	public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		for( int j = 0; j < len; j++, ai++, bi++)
 			c[j] = (a[ai] > b[bi]) ? 1 : 0;
 		return c;
 	}
 
-	public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) {
+	// not in use: vector api implementation slower than scalar loop version
+	public static double[] vectGreaterWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
 		double[] c = allocVector(len, false);
 		final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
 		final DoubleVector zeros = DoubleVector.zero(SPECIES);
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
index 9cd67051b1e..c428f6782a9 100644
--- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java
@@ -2,6 +2,7 @@
 import org.apache.sysds.performance.primitives_vector_api.BenchCase.OutKind;
 import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
 
+
 public enum BenchCase {
 
     // Aggregations
@@ -10,10 +11,10 @@ public enum BenchCase {
       "vectSum dense",
       OutKind.SCALAR_DOUBLE,
       ctx -> ctx.initDenseA(),
-      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectSum(ctx.a, 0, ctx.len);
+      ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectSum(ctx.a, 0, ctx.len);
               BenchUtil.blackhole = ctx.scalarRes;
              },
-      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectSum(ctx.a, 0, ctx.len);
+      ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectSum(ctx.a, 0, ctx.len);
               BenchUtil.blackhole = ctx.vectorRes;},
       ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
     ),
@@ -23,8 +24,8 @@ public enum BenchCase {
       "rowMaxsVectMult dense",
       OutKind.SCALAR_DOUBLE,
       ctx -> {ctx.initDenseA(); ctx.initDenseB();},
-      ctx -> ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len),
-      ctx -> ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len),
+      ctx -> ctx.scalarRes = backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len),
+      ctx -> ctx.vectorRes = backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len),
       ctx -> {
         ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
       }
@@ -34,37 +35,26 @@ public enum BenchCase {
       "rowMaxsVectMult_aix dense",
       OutKind.SCALAR_DOUBLE,
       ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();},
-      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len);
+      ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len);
         BenchUtil.blackhole = ctx.scalarRes;
             },
       ctx -> {
-        ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len);
+        ctx.vectorRes = backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len);
         BenchUtil.blackhole = ctx.vectorRes;
             },
       ctx -> {
         ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;
       }
     ),
-    VECT_MIN(
-      "vectMin dense",
-      OutKind.SCALAR_DOUBLE,
-      ctx -> ctx.initDenseA(),
-      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMin(ctx.a, 0, ctx.len);
-              BenchUtil.blackhole = ctx.scalarRes;
-             },
-      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMin(ctx.a, 0, ctx.len);
-              BenchUtil.blackhole = ctx.vectorRes;},
-      ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
-    ),
 
     VECT_MAX(
       "vectMax dense",
       OutKind.SCALAR_DOUBLE,
       ctx -> ctx.initDenseA(),
-      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMax(ctx.a, 0, ctx.len);
+      ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectMax(ctx.a, 0, ctx.len);
               BenchUtil.blackhole = ctx.scalarRes;
              },
-      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMax(ctx.a, 0, ctx.len);
+      ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectMax(ctx.a, 0, ctx.len);
               BenchUtil.blackhole = ctx.vectorRes;},
       ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
     ),
@@ -72,10 +62,10 @@ public enum BenchCase {
       "vectCountnnz dense",
       OutKind.SCALAR_DOUBLE,
       ctx -> ctx.initDenseA(),
-      ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectCountnnz(ctx.a, 0, ctx.len);
+      ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectCountnnz(ctx.a, 0, ctx.len);
               BenchUtil.blackhole = ctx.scalarRes;
              },
-      ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectCountnnz(ctx.a, 0, ctx.len);
+      ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectCountnnz(ctx.a, 0, ctx.len);
               BenchUtil.blackhole = ctx.vectorRes;},
       ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;}
     ),
@@ -86,8 +76,8 @@ public enum BenchCase {
       "vectDivAdd dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();},
-      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len),
-      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len),
+      ctx -> backup_primitives_for_benchmark.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
         ctx.ok = ctx.maxDiff <= 1e-9;
@@ -98,7 +88,7 @@ public enum BenchCase {
       "vectDivAdd2 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len),
       ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -110,7 +100,7 @@ public enum BenchCase {
       "vectDivAdd sparse",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
-      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> LibSpoofPrimitives.vectDivAdd_vector_api(ctx.a, ctx.bval, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), // sparse vectDivAdd itself is now the scalar loop
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -123,7 +113,7 @@ public enum BenchCase {
       "vectDivAdd2 sparse",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
-      ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len),
+      ctx -> LibSpoofPrimitives.vectDivAdd_vector_api(ctx.bval, ctx.a, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), // sparse vectDivAdd itself is now the scalar loop
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -135,7 +125,7 @@ public enum BenchCase {
       "vectDivWrite dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite_vector_api(ctx.a, ctx.bval, 0,ctx.len), // vectDivWrite itself is now the scalar loop
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -146,7 +136,7 @@ public enum BenchCase {
       "vectDivWrite2 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len),
+      ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite_vector_api(ctx.bval, ctx.a, 0,ctx.len), // vectDivWrite itself is now the scalar loop
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -157,7 +147,7 @@ public enum BenchCase {
       "vectDivWrite3 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseBDiv();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -171,7 +161,7 @@ public enum BenchCase {
       "vectEqualWrite dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -182,7 +172,7 @@ public enum BenchCase {
       "vectEqualAdd dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
       ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -193,51 +183,18 @@ public enum BenchCase {
       "vectEqualWrite2 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
         ctx.ok = ctx.maxDiff <= 1e-9;
       }
     ),
-    VECT_NOTEQUAL_ADD(
-      "vectNotequalAdd dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectNotequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
-      ctx -> LibSpoofPrimitives.vectNotequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
-    ),
-    VECT_NOTEQUAL_WRITE(
-      "vectNotequalWrite dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseA(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
-    ),
-    VECT_NOTEQUAL_WRITE2(
-      "vectNotequalWrite2 dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseA();  ctx.initDenseB(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.b, 0 ,0 ,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.b, 0, 0, ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
-    ),
     VECT_LESS_ADD(
       "vectLessAdd dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
       ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -248,7 +205,7 @@ public enum BenchCase {
       "vectLessWrite dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA();  ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.bval, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -259,7 +216,7 @@ public enum BenchCase {
       "vectLessWrite2 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b, 0, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -270,7 +227,7 @@ public enum BenchCase {
       "vectLessequalAdd dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
       ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -281,7 +238,7 @@ public enum BenchCase {
       "vectLessequalWrite dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA();  ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.bval, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -292,7 +249,7 @@ public enum BenchCase {
       "vectLessequalWrite2 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA(); ctx.initDenseB();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.b, 0, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -304,7 +261,7 @@ public enum BenchCase {
       "vectGreaterAdd dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();},
-      ctx -> LibSpoofPrimitives.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len),
       ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -315,7 +272,7 @@ public enum BenchCase {
       "vectGreaterWrite dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA();  ctx.initbval();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.bval, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
@@ -326,24 +283,13 @@ public enum BenchCase {
       "vectGreaterWrite2 dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseA(); ctx.initDenseB();},
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
+      ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len),
       ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.b, 0, 0, ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
         ctx.ok = ctx.maxDiff <= 1e-9;
       }
     ),
-    VECT_DIV_ADD_pure(
-      "vectDivAddpure dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();},
-      ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len),
-      ctx -> LibSpoofPrimitives.pureDivvectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
-    ),
 
     // vectMult2
 
@@ -351,45 +297,12 @@ public enum BenchCase {
       "vectMult2Add dense",
       OutKind.ARRAY_DOUBLE,
       ctx -> {ctx.initDenseAandC_mutable(); },
-      ctx -> LibSpoofPrimitives.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len),
+      ctx -> backup_primitives_for_benchmark.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len),
       ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0, 0,ctx.len),
       ctx -> {
         ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
         ctx.ok = ctx.maxDiff <= 1e-9;
       }
-    ),
-    VECT_Mult2_WRITE(
-      "vectMult2Write dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseAandC_mutable(); },
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write(ctx.a, 0,ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
-    ),
-    VECT_Mult2_WRITE_DEDICATED(
-      "vectMult2Write_dedicated dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseAandC_mutable(); },
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated(ctx.a, 0,ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
-    ),
-    VECT_Mult2_WRITE_DEDICATED2(
-      "vectMult2Write_dedicated2 dense",
-      OutKind.ARRAY_DOUBLE,
-      ctx -> {ctx.initDenseAandC_mutable(); },
-      ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len),
-      ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated_2(ctx.a, 0,ctx.len),
-      ctx -> {
-        ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector);
-        ctx.ok = ctx.maxDiff <= 1e-9;
-      }
     );
 
 
diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
new file mode 100644
index 00000000000..d0086eb9f66
--- /dev/null
+++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java
@@ -0,0 +1,856 @@
+package org.apache.sysds.performance.primitives_vector_api;
+
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+
+
+
+import java.util.Arrays;
+
+import org.apache.commons.math3.util.FastMath;
+import org.apache.sysds.runtime.data.DenseBlockFP64;
+import org.apache.sysds.runtime.data.SparseRowVector;
+import org.apache.sysds.runtime.functionobjects.BitwAnd;
+import org.apache.sysds.runtime.functionobjects.IntegerDivide;
+import org.apache.sysds.runtime.functionobjects.Modulus;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNN;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col;
+import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+
+import jdk.incubator.vector.DoubleVector;
+import jdk.incubator.vector.VectorOperators;
+import jdk.incubator.vector.VectorSpecies;
+import jdk.incubator.vector.VectorMask;
+
+
+public class backup_primitives_for_benchmark {
+
+    // Vector API initializations
+	private static final VectorSpecies<Double> SPECIES = DoubleVector.SPECIES_PREFERRED;
+	private static final int vLen = SPECIES.length();
+
+    /**
+     * Obtains a double vector of the given length; when reset is set,
+     * all entries are overwritten with zero.
+     */
+    public static double[] allocVector(int len, boolean reset) {
+		final double zeroFill = 0;
+		return allocVector(len, reset, zeroFill);
+	}
+	
+	/**
+	 * Obtains a double vector of the given length, preferring a vector from
+	 * the thread-local buffer over a new allocation.
+	 *
+	 * @param len requested vector length
+	 * @param reset if true, every entry is set to resetVal before returning
+	 * @param resetVal fill value used when reset is true
+	 * @return vector of length len
+	 */
+	protected static double[] allocVector(int len, boolean reset, double resetVal) {
+		//ask the thread-local ring buffer for a vector of matching size
+		double[] result = memPool.get().next(len);
+
+		//nothing suitable pooled: allocate a fresh vector
+		if( result == null )
+			result = new double[len];
+
+		//contents are only overwritten on request
+		if( reset )
+			Arrays.fill(result, resetVal);
+		return result;
+	}
+    /**
+     * Ring buffer of pre-allocated double vectors of up to two distinct
+     * lengths, handed out round-robin by {@link #next(int)}.
+     */
+    private static class VectorBuffer {
+		private static final int MAX_SIZE = 512*1024; //4MB
+		private final double[][] _data;
+		private int _pos;
+		private final int _len1;
+		private final int _len2;
+		
+		/**
+		 * @param num number of vectors to pre-allocate per distinct length
+		 * @param len1 first vector length (capped at MAX_SIZE)
+		 * @param len2 second vector length (capped at MAX_SIZE); vectors of
+		 *        this length are only allocated if it is positive and differs from len1
+		 */
+		public VectorBuffer(int num, int len1, int len2) {
+			//best effort size restriction since large intermediates
+			//not necessarily used (num refers to the total number)
+			len1 = Math.min(len1, MAX_SIZE);
+			len2 = Math.min(len2, MAX_SIZE);
+			//pre-allocate ring buffer
+			int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+			_data = new double[lnum][];
+			for( int i=0; i<num; i++ ) {
+				if( lnum > num ) {
+					_data[2*i] = new double[len1];
+					_data[2*i+1] = new double[len2];
+				}
+				else {
+					_data[i] = new double[len1];
+				}
+			}
+			_pos = -1;
+			_len1 = len1;
+			_len2 = len2;
+		}
+
+		/**
+		 * Returns the next pooled vector of exactly the given length.
+		 *
+		 * @param len requested vector length
+		 * @return a pooled vector of length len, or null if the buffer holds none
+		 */
+		public double[] next(int len) {
+			if( _len1!=len && _len2!=len )
+				return null;
+			//bounded scan: visit every slot at most once so that an empty buffer
+			//or a length without pre-allocated vectors (e.g., len2<=0) yields null
+			//instead of an out-of-bounds access or an endless loop
+			for( int k=0; k<_data.length; k++ ) {
+				_pos = (_pos+1>=_data.length) ? 0 : _pos+1;
+				if( _data[_pos].length==len )
+					return _data[_pos];
+			}
+			return null;
+		}
+
+		/**
+		 * Checks whether this buffer was created with the same (uncapped)
+		 * configuration, i.e., same lengths and same number of slots.
+		 */
+		@SuppressWarnings("unused")
+		public boolean isReusable(int num, int len1, int len2) {
+			int lnum = (len2>0 && len1!=len2) ? 2*num : num;
+			return (_len1 == len1 && _len2 == len2
+				&& _data.length == lnum);
+		}
+	}
+    private static ThreadLocal<VectorBuffer> memPool = new ThreadLocal<>() {
+		@Override protected VectorBuffer initialValue() { return new VectorBuffer(0,0,0); }
+	};
+
+    /**
+     * Scalar baseline: c[ci+k] += a[ai+k] / bval for k in [0, len).
+     */
+    public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int k = 0; k < len; k++ )
+			c[ci + k] += a[ai + k] / bval;
+	}
+
+	/**
+	 * Vector API variant: c[ci+k] += a[ai+k] * (1/bval) for k in [0, len).
+	 * The division is carried out as a multiplication with the reciprocal of bval.
+	 */
+	public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final double recip = 1.0 / bval;
+		final DoubleVector vRecip = DoubleVector.broadcast(SPECIES, recip);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks
+		while( pos < vecEnd ) {
+			DoubleVector scaled = DoubleVector.fromArray(SPECIES, a, ai + pos).mul(vRecip);
+			DoubleVector.fromArray(SPECIES, c, ci + pos).add(scaled).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			c[ci + pos] += a[ai + pos] * recip;
+			pos++;
+		}
+	}
+
+    /**
+     * Scalar baseline: c[ci+k] += bval / a[ai+k] for k in [0, len).
+     */
+    public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
+		for( int k = 0; k < len; k++ )
+			c[ci + k] += bval / a[ai + k];
+	}
+
+	/**
+	 * Vector API variant: c[ci+k] += bval / a[ai+k] for k in [0, len).
+	 */
+	public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) {
+		final DoubleVector vNumer = DoubleVector.broadcast(SPECIES, bval);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks
+		while( pos < vecEnd ) {
+			DoubleVector quot = vNumer.div(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			DoubleVector.fromArray(SPECIES, c, ci + pos).add(quot).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			c[ci + pos] += bval / a[ai + pos];
+			pos++;
+		}
+	}
+
+    public static void scalarvectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+		for( int j = ai; j < ai+alen; j++ )
+			c[ci + aix[j]] += a[j] / bval;
+	}
+
+	// not in use: vector api implementation slower than scalar loop version
+	/**
+	 * Vector API variant for sparse input: for each of the alen entries starting
+	 * at ai, adds a[j] * (1/bval) onto c[ci + aix[j]]. Parameter len is not read.
+	 */
+	public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+		final double recip = 1.0 / bval;
+		final DoubleVector vRecip = DoubleVector.broadcast(SPECIES, recip);
+		final int vecEnd = SPECIES.loopBound(alen);
+		int pos = 0;
+
+		//full vLen-blocks: vectorized scaling, then per-lane scatter into c
+		while( pos < vecEnd ) {
+			DoubleVector scaled = DoubleVector.fromArray(SPECIES, a, ai + pos).mul(vRecip);
+			for( int l = 0; l < vLen; l++ )
+				c[ci + aix[ai + pos + l]] += scaled.lane(l);
+			pos += vLen;
+		}
+
+		//remaining entries that do not fill a vLen-block
+		while( pos < alen ) {
+			c[ci + aix[ai + pos]] += a[ai + pos] * recip;
+			pos++;
+		}
+	}
+
+    public static void scalarvectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+		for( int j = ai; j < ai+alen; j++ )
+			c[ci + aix[j]] += bval / a[j];
+	}
+
+	// not in use: vector api implementation slower than scalar loop version
+	/**
+	 * Vector API variant for sparse input: for each of the alen entries starting
+	 * at ai, adds bval / a[j] onto c[ci + aix[j]]. Parameter len is not read.
+	 */
+	public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) {
+		final DoubleVector vNumer = DoubleVector.broadcast(SPECIES, bval);
+		final int vecEnd = SPECIES.loopBound(alen);
+		int pos = 0;
+
+		//full vLen-blocks: vectorized division, then per-lane scatter into c
+		while( pos < vecEnd ) {
+			DoubleVector quot = vNumer.div(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			for( int l = 0; l < vLen; l++ )
+				c[ci + aix[ai + pos + l]] += quot.lane(l);
+			pos += vLen;
+		}
+
+		//remaining entries that do not fill a vLen-block
+		while( pos < alen ) {
+			c[ci + aix[ai + pos]] += bval / a[ai + pos];
+			pos++;
+		}
+	}
+    /**
+     * Scalar baseline: returns a vector of length len holding a[ai+k] / bval.
+     */
+    public static double[] scalarvectDivWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		int src = ai;
+		for( int k = 0; k < len; k++ )
+			out[k] = a[src++] / bval;
+		return out;
+	}
+
+	// not in use: vector api implementation slower than scalar loop version
+	/**
+	 * Vector API variant: returns a vector of length len holding a[ai+k] * (1/bval).
+	 */
+	public static double[] vectDivWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final double recip = 1.0 / bval;
+		final DoubleVector vRecip = DoubleVector.broadcast(SPECIES, recip);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks
+		while( pos < vecEnd ) {
+			DoubleVector.fromArray(SPECIES, a, ai + pos).mul(vRecip).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			out[pos] = a[ai + pos] * recip;
+			pos++;
+		}
+		return out;
+	}
+    /**
+     * Scalar baseline: returns a vector of length len holding bval / a[ai+k].
+     */
+    public static double[] scalarvectDivWrite(double bval, double[] a, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		int src = ai;
+		for( int k = 0; k < len; k++ )
+			out[k] = bval / a[src++];
+		return out;
+	}
+
+	// not in use: vector api implementation slower than scalar loop version
+	/**
+	 * Vector API variant: returns a vector of length len holding bval / a[ai+k].
+	 */
+	public static double[] vectDivWrite(double bval, double[] a, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vNumer = DoubleVector.broadcast(SPECIES, bval);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks
+		while( pos < vecEnd ) {
+			vNumer.div(DoubleVector.fromArray(SPECIES, a, ai + pos)).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			out[pos] = bval / a[ai + pos];
+			pos++;
+		}
+		return out;
+	}
+    /**
+     * Scalar baseline: returns a vector of length len holding a[ai+k] / b[bi+k].
+     */
+    public static double[] scalarvectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		int pa = ai;
+		int pb = bi;
+		for( int k = 0; k < len; k++ )
+			out[k] = a[pa++] / b[pb++];
+		return out;
+	}
+
+	// not in use: vector api implementation slower than scalar loop version
+	/**
+	 * Vector API variant: returns a vector of length len holding a[ai+k] / b[bi+k].
+	 */
+	public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks
+		while( pos < vecEnd ) {
+			DoubleVector numer = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector denom = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			numer.div(denom).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			out[pos] = a[ai + pos] / b[bi + pos];
+			pos++;
+		}
+		return out;
+	}
+    /**
+     * Scalar baseline: maximum of the element-wise products a[ai+k] * b[bi+k]
+     * for k in [0, len).
+     *
+     * @param a left input array
+     * @param b right input array
+     * @param ai start offset into a
+     * @param bi start offset into b
+     * @param len number of element pairs
+     * @return the maximum product, or negative infinity if len is not positive
+     */
+    public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+		double val = Double.NEGATIVE_INFINITY;
+		//start reading b at bi (not 0) so that the result agrees with
+		//rowMaxsVectMult for non-zero offsets
+		int j = bi;
+		for( int i = ai; i < ai+len; i++ )
+			val = Math.max(a[i]*b[j++], val);
+		return val;
+	}
+
+	/**
+	 * Vector API variant: maximum of the element-wise products a[ai+k] * b[bi+k]
+	 * for k in [0, len); negative infinity if len is not positive.
+	 */
+	public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vBest = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+		int pos = 0;
+
+		//full vLen-blocks: lane-wise running maximum of the products
+		while( pos < vecEnd ) {
+			DoubleVector left = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector right = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			vBest = vBest.max(left.mul(right));
+			pos += vLen;
+		}
+
+		//collapse lanes into a single maximum
+		double best = vBest.reduceLanes(VectorOperators.MAX);
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			best = Math.max(best, a[ai + pos] * b[bi + pos]);
+			pos++;
+		}
+		return best;
+	}
+    // note: parameter bi unused
+	/**
+	 * Scalar baseline for sparse input: maximum of a[i] * b[aix[i]] over
+	 * i in [ai, ai+len); negative infinity if len is not positive.
+	 */
+	public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+		final int end = ai + len;
+		double best = Double.NEGATIVE_INFINITY;
+		int i = ai;
+		while( i < end ) {
+			best = Math.max(a[i] * b[aix[i]], best);
+			i++;
+		}
+		return best;
+	}
+
+	// not in use: vector api implementation slower than scalar loop version
+	/**
+	 * Vector API variant for sparse input: maximum of a[ai+k] * b[aix[ai+k]]
+	 * for k in [0, len). Parameter bi is not read.
+	 */
+	public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vBest = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+		int pos = 0;
+
+		//full vLen-blocks: values of b are gathered through the index array
+		while( pos < vecEnd ) {
+			DoubleVector vals = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector gathered = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + pos);
+			vBest = vBest.max(vals.mul(gathered));
+			pos += SPECIES.length();
+		}
+		double best = Math.max(Double.NEGATIVE_INFINITY, vBest.reduceLanes(VectorOperators.MAX));
+
+		//remaining entries that do not fill a vLen-block
+		while( pos < len ) {
+			final double prod = a[ai + pos] * b[aix[ai + pos]];
+			best = (prod > best) ? prod : best;
+			pos++;
+		}
+		return best;
+    }
+	
+
+    /**
+     * Scalar baseline: sum of a[ai..ai+len), accumulated as a leading
+     * remainder of len%8 elements followed by blocks of 8 elements.
+     */
+    public static double scalarvectSum(double[] a, int ai, int len) {
+		final int rest = len % 8;
+		final int blockStart = ai + rest;
+		final int end = ai + len;
+		double acc = 0;
+
+		//leading remainder
+		for( int p = ai; p < blockStart; p++ )
+			acc += a[p];
+
+		//blocks of 8: sum each block first, then add it to the accumulator
+		for( int p = blockStart; p < end; p += 8 ) {
+			final double block = a[p] + a[p+1] + a[p+2] + a[p+3]
+				+ a[p+4] + a[p+5] + a[p+6] + a[p+7];
+			acc += block;
+		}
+		return acc;
+	}
+	
+	/**
+	 * Vector API variant: sum of a[ai..ai+len), accumulated lane-wise over
+	 * full vector blocks and then element-wise over the remainder.
+	 */
+	public static double vectSum(double[] a, int ai, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vAcc = DoubleVector.zero(SPECIES);
+		int pos = 0;
+
+		//full vLen-blocks
+		while( pos < vecEnd ) {
+			vAcc = vAcc.add(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			pos += SPECIES.length();
+		}
+
+		//collapse lanes into a single partial sum
+		double total = 0d;
+		total += vAcc.reduceLanes(VectorOperators.ADD);
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			total += a[ai + pos];
+			pos++;
+		}
+		return total;
+    }
+    /**
+     * Scalar baseline: maximum of a[ai..ai+len); negative infinity if len
+     * is not positive.
+     */
+    public static double scalarvectMax(double[] a, int ai, int len) {
+		final int end = ai + len;
+		double best = Double.NEGATIVE_INFINITY;
+		int i = ai;
+		while( i < end ) {
+			best = Math.max(a[i], best);
+			i++;
+		}
+		return best;
+	}
+
+	/**
+	 * Vector API variant: maximum of a[ai..ai+len); negative infinity if len
+	 * is not positive.
+	 */
+	public static double vectMax(double[] a, int ai, int len) {
+		final int vecEnd = SPECIES.loopBound(len);
+		DoubleVector vBest = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY);
+		int pos = 0;
+
+		//full vLen-blocks: lane-wise running maximum
+		while( pos < vecEnd ) {
+			vBest = vBest.max(DoubleVector.fromArray(SPECIES, a, ai + pos));
+			pos += vLen;
+		}
+		double best = vBest.reduceLanes(VectorOperators.MAX);
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			best = Math.max(a[ai + pos], best);
+			pos++;
+		}
+		return best;
+	}
+    /**
+     * Scalar baseline: number of entries in a[ai..ai+len) that are not equal
+     * to zero, returned as a double.
+     */
+    public static double scalarvectCountnnz(double[] a, int ai, int len) {
+		final int end = ai + len;
+		int nnz = 0;
+		for( int i = ai; i < end; i++ ) {
+			if( a[i] != 0 )
+				nnz++;
+		}
+		return nnz;
+	}
+	/**
+	 * Vector API variant: number of entries in a[ai..ai+len) that are not
+	 * equal to zero, returned as a double.
+	 *
+	 * @param a input array
+	 * @param ai start offset into a
+	 * @param len number of elements to inspect
+	 * @return count of non-zero entries
+	 */
+	public static double vectCountnnz(double[] a, int ai, int len) {
+		int count = 0;
+		int i = 0;
+		final int upperBound = SPECIES.loopBound(len);
+		final DoubleVector vzero = DoubleVector.zero(SPECIES);
+
+		//full vLen-blocks: count lanes that compare not-equal to zero
+		for (; i < upperBound; i += vLen) {
+			DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i);
+			VectorMask<Double> nz = v.compare(VectorOperators.NE, vzero);
+			count += nz.trueCount();
+		}
+
+		//rest, not aligned to vLen-blocks; i is relative to ai, so the
+		//offset has to be applied here exactly as in the vector loop
+		for (; i < len; i++) {
+			count += (a[ai + i] != 0) ? 1 : 0;
+		}
+		return count;
+	}
+    /**
+     * Scalar baseline: increments c[ci+k] by one for every k in [0, len)
+     * where a[ai+k] equals bval, and by zero otherwise.
+     */
+    public static void scalarvectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int k = 0; k < len; k++ ) {
+			final int inc = (a[ai + k] == bval) ? 1 : 0;
+			c[ci + k] += inc;
+		}
+	}
+	/**
+	 * Vector API variant: adds 1.0 onto c[ci+k] for every k in [0, len)
+	 * where a[ai+k] equals bval, and 0.0 otherwise.
+	 */
+	public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final DoubleVector vRef = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks: turn the comparison mask into 0/1 increments
+		while( pos < vecEnd ) {
+			DoubleVector vals = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector target = DoubleVector.fromArray(SPECIES, c, ci + pos);
+			VectorMask<Double> hit = vals.compare(VectorOperators.EQ, vRef);
+			target.add(vZero.blend(vOne, hit)).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			c[ci + pos] += (a[ai + pos] == bval) ? 1.0 : 0.0;
+			pos++;
+		}
+	}
+    /**
+     * Scalar baseline: returns a vector of length len holding 1 where
+     * a[ai+k] equals bval and 0 otherwise.
+     */
+    public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] == bval) ? 1 : 0;
+		return out;
+	}
+        /**
+         * Vector API variant: returns a vector of length len holding 1 where
+         * a[ai+k] equals bval and 0 otherwise.
+         */
+        public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vRef = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks: turn the comparison mask into 0/1 values
+		while( pos < vecEnd ) {
+			VectorMask<Double> hit = DoubleVector.fromArray(SPECIES, a, ai + pos)
+				.compare(VectorOperators.EQ, vRef);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			out[pos] = (a[ai + pos] == bval) ? 1 : 0;
+			pos++;
+		}
+		return out;
+	}
+            /**
+             * Scalar baseline: returns a vector of length len holding 1 where
+             * a[ai+k] equals b[bi+k] and 0 otherwise.
+             */
+            public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] == b[bi + k]) ? 1 : 0;
+		return out;
+	}
+        
+    /**
+     * Vector API variant: returns a vector of length len holding 1.0 where
+     * a[ai+k] equals b[bi+k] and 0.0 otherwise.
+     */
+    public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks: turn the comparison mask into 0/1 values
+		while( pos < vecEnd ) {
+			DoubleVector left = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector right = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> hit = left.compare(VectorOperators.EQ, right);
+			vZero.blend(vOne, hit).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			out[pos] = (a[ai + pos] == b[bi + pos]) ? 1.0 : 0.0;
+			pos++;
+		}
+		return out;
+	}
+            /**
+             * Scalar loop implementation: returns a vector of length len holding
+             * 1 where a[ai+k] differs from b[bi+k] and 0 otherwise.
+             */
+            public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		for( int k = 0; k < len; k++ )
+			out[k] = (a[ai + k] != b[bi + k]) ? 1 : 0;
+		return out;
+	}
+        
+            // not in use: vector api implementation slower than scalar loop version
+            /**
+             * Vector API variant: returns a vector of length len holding 1.0 where
+             * a[ai+k] differs from b[bi+k] and 0.0 otherwise.
+             */
+            public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) {
+		final double[] out = allocVector(len, false);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks: turn the comparison mask into 0/1 values
+		while( pos < vecEnd ) {
+			DoubleVector left = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector right = DoubleVector.fromArray(SPECIES, b, bi + pos);
+			VectorMask<Double> differs = left.compare(VectorOperators.NE, right);
+			vZero.blend(vOne, differs).intoArray(out, pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			out[pos] = (a[ai + pos] != b[bi + pos]) ? 1.0 : 0.0;
+			pos++;
+		}
+		return out;
+	}
+
+
+                /**
+                 * Scalar baseline: increments c[ci+k] by one for every k in [0, len)
+                 * where a[ai+k] is less than bval, and by zero otherwise.
+                 */
+                public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		for( int k = 0; k < len; k++ ) {
+			final int inc = (a[ai + k] < bval) ? 1 : 0;
+			c[ci + k] += inc;
+		}
+	}
+    /**
+     * Vector API variant: adds 1.0 onto c[ci+k] for every k in [0, len)
+     * where a[ai+k] is less than bval, and 0.0 otherwise.
+     */
+    public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) {
+		final DoubleVector vRef = DoubleVector.broadcast(SPECIES, bval);
+		final DoubleVector vOne = DoubleVector.broadcast(SPECIES, 1.0);
+		final DoubleVector vZero = DoubleVector.zero(SPECIES);
+		final int vecEnd = SPECIES.loopBound(len);
+		int pos = 0;
+
+		//full vLen-blocks: turn the comparison mask into 0/1 increments
+		while( pos < vecEnd ) {
+			DoubleVector vals = DoubleVector.fromArray(SPECIES, a, ai + pos);
+			DoubleVector target = DoubleVector.fromArray(SPECIES, c, ci + pos);
+			VectorMask<Double> below = vals.compare(VectorOperators.LT, vRef);
+			target.add(vZero.blend(vOne, below)).intoArray(c, ci + pos);
+			pos += vLen;
+		}
+
+		//remaining elements that do not fill a vLen-block
+		while( pos < len ) {
+			c[ci + pos] += (a[ai + pos] < bval) ? 1.0 : 0.0;
+			pos++;
+		}
+	}
+
+
+    public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) { // scalar loop: c[j] = 1 where a[ai+j] < bval, else 0, for j in [0,len)
+                        double[] c = allocVector(len, false); // result buffer; entries 0..len-1 are all assigned by the loop below
+                        for( int j = 0; j < len; j++, ai++) // ai advances in lockstep with j
+                            c[j] = (a[ai] < bval) ? 1 : 0;
+                        return c;
+                    }
+                
+                
+    public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { // Vector API variant: c[i] = 1.0 where a[ai+i] < bval, else 0.0, for i in [0,len)
+                        double[] c = allocVector(len, false);
+                        final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+                        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                
+                        int i = 0;
+                        int upper = SPECIES.loopBound(len);
+                        // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                        // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                        for (; i < upper; i += vLen) {
+                            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                            // mask marks lanes with a < bval; blend takes 1.0 in those lanes and keeps 0.0 elsewhere
+                            VectorMask<Double> lt = aVec.compare(VectorOperators.LT, bVec);
+                            DoubleVector out = zeros.blend(ones, lt);
+                
+                            out.intoArray(c, i);
+                        }
+                
+                        // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                        for (; i < len; i++) {
+                            c[i] = (a[ai + i] < bval) ? 1.0 : 0.0;
+                        }
+                
+                        return c;
+                    }
+
+                    public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) { // scalar loop: c[j] = 1 where a[ai+j] < b[bi+j], else 0, for j in [0,len)
+                        double[] c = allocVector(len, false); // result buffer; entries 0..len-1 are all assigned by the loop below
+                        for( int j = 0; j < len; j++, ai++, bi++) // ai and bi advance in lockstep with j
+                            c[j] = (a[ai] < b[bi]) ? 1 : 0;
+                        return c;
+                    }
+                
+                    public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { // Vector API variant: c[i] = 1.0 where a[ai+i] < b[bi+i], else 0.0, for i in [0,len)
+                        double[] c = allocVector(len, false);
+                
+                        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                
+                        int i = 0;
+                        int upper = SPECIES.loopBound(len);
+                        // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                        // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                        for (; i < upper; i += vLen) {
+                            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                            DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+                            // mask marks lanes with a < b; blend takes 1.0 in those lanes and keeps 0.0 elsewhere
+                            VectorMask<Double> lt = aVec.compare(VectorOperators.LT, bVec);
+                            DoubleVector out = zeros.blend(ones, lt);
+                
+                            out.intoArray(c, i);
+                        }
+                
+                        // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                        for (; i < len; i++) {
+                        c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0;
+                        }
+                
+                        return c;
+                        }
+                        public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { // scalar loop: adds 1 to c[ci+k] where a[ai+k] <= bval, for k in [0,len)
+                            for( int j = ai; j < ai+len; j++, ci++) // j walks a[ai..ai+len), ci walks c in lockstep
+                                c[ci] += (a[j] <= bval) ? 1 : 0; // a 0 is still added (not skipped) when the comparison is false
+                        }
+                    
+                        public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { // Vector API variant: adds 1.0 to c[ci+i] where a[ai+i] <= bval, for i in [0,len)
+                            final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+                            final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                            final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                    
+                            int i = 0;
+                            int upper = SPECIES.loopBound(len);
+                            // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                            // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                            for (; i < upper; i += vLen) {
+                                DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                                DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i);
+                                // mask marks lanes with a <= bval; inc is 1.0 in those lanes and 0.0 elsewhere
+                                VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+                                DoubleVector inc = zeros.blend(ones, le);
+                    
+                                cVec.add(inc).intoArray(c, ci + i);
+                            }
+                    
+                            // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                            for (; i < len; i++) {
+                                c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0;
+                            }
+                            }
+                            public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) { // scalar loop: c[j] = 1 where a[ai+j] <= bval, else 0, for j in [0,len)
+                                double[] c = allocVector(len, false); // result buffer; entries 0..len-1 are all assigned by the loop below
+                                for( int j = 0; j < len; j++, ai++) // ai advances in lockstep with j
+                                    c[j] = (a[ai] <= bval) ? 1 : 0;
+                                return c;
+                            }
+                            public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { // Vector API variant: c[i] = 1.0 where a[ai+i] <= bval, else 0.0, for i in [0,len)
+                                double[] c = allocVector(len, false);
+                                final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+                                final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                                final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                        
+                                int i = 0;
+                                int upper = SPECIES.loopBound(len);
+                                // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                                // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                                for (; i < upper; i += vLen) {
+                                    DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                                    // mask marks lanes with a <= bval; blend takes 1.0 in those lanes and keeps 0.0 elsewhere
+                                    VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+                                    DoubleVector out = zeros.blend(ones, le);
+                        
+                                    out.intoArray(c, i);
+                                }
+                        
+                                // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                                for (; i < len; i++) {
+                                    c[i] = (a[ai + i] <= bval) ? 1.0 : 0.0;
+                                }
+                        
+                                return c;
+                            }
+                            public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { // scalar loop: c[j] = 1 where a[ai+j] <= b[bi+j], else 0, for j in [0,len)
+                                double[] c = allocVector(len, false); // result buffer; entries 0..len-1 are all assigned by the loop below
+                                for( int j = 0; j < len; j++, ai++, bi++) // ai and bi advance in lockstep with j
+                                    c[j] = (a[ai] <= b[bi]) ? 1 : 0;
+                                return c;
+                            }
+                        
+                            public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { // Vector API variant: c[i] = 1.0 where a[ai+i] <= b[bi+i], else 0.0, for i in [0,len)
+                                double[] c = allocVector(len, false);
+                        
+                                final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                                final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                        
+                                int i = 0;
+                                int upper = SPECIES.loopBound(len);
+                                // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                                // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                                for (; i < upper; i += vLen) {
+                                DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                                DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+                                // mask marks lanes with a <= b; blend takes 1.0 in those lanes and keeps 0.0 elsewhere
+                                VectorMask<Double> le = aVec.compare(VectorOperators.LE, bVec);
+                                DoubleVector out = zeros.blend(ones, le);
+                        
+                                out.intoArray(c, i);
+                                }
+                        
+                                // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                                for (; i < len; i++) {
+                                c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0;
+                                }
+                        
+                                return c;
+                                }
+                                public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { // scalar loop: adds 1 to c[ci+k] where a[ai+k] > bval, for k in [0,len)
+                                    for( int j = ai; j < ai+len; j++, ci++) // j walks a[ai..ai+len), ci walks c in lockstep
+                                        c[ci] += (a[j] > bval) ? 1 : 0; // a 0 is still added (not skipped) when the comparison is false
+                                }
+                            
+                                public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { // Vector API variant: adds 1.0 to c[ci+i] where a[ai+i] > bval, for i in [0,len)
+                                    final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+                                    final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                                    final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                            
+                                    int i = 0;
+                                    int upper = SPECIES.loopBound(len);
+                                    // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                                    // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                                    for (; i < upper; i += vLen) {
+                                        DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                                        DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i);
+                                        // mask marks lanes with a > bval; inc is 1.0 in those lanes and 0.0 elsewhere
+                                        VectorMask<Double> gt = aVec.compare(VectorOperators.GT, bVec);
+                                        DoubleVector inc = zeros.blend(ones, gt);
+                            
+                                        cVec.add(inc).intoArray(c, ci + i);
+                                    }
+                            
+                                    // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                                    for (; i < len; i++) {
+                                        c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0;
+                                    }
+                                    }
+                                    public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) { // scalar loop: c[j] = 1 where a[ai+j] > bval, else 0, for j in [0,len)
+                                        double[] c = allocVector(len, false); // result buffer; entries 0..len-1 are all assigned by the loop below
+                                        for( int j = 0; j < len; j++, ai++) // ai advances in lockstep with j
+                                            c[j] = (a[ai] > bval) ? 1 : 0;
+                                        return c;
+                                    }
+                                    public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { // Vector API variant: c[i] = 1.0 where a[ai+i] > bval, else 0.0, for i in [0,len)
+                                        double[] c = allocVector(len, false);
+                                        final DoubleVector bVec  = DoubleVector.broadcast(SPECIES, bval);
+                                        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                                        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                                
+                                        int i = 0;
+                                        int upper = SPECIES.loopBound(len);
+                                        // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                                        // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                                        for (; i < upper; i += vLen) {
+                                            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                                            // mask marks lanes with a > bval; blend takes 1.0 in those lanes and keeps 0.0 elsewhere
+                                            VectorMask<Double> gt = aVec.compare(VectorOperators.GT, bVec);
+                                            DoubleVector out = zeros.blend(ones, gt);
+                                
+                                            out.intoArray(c, i);
+                                        }
+                                
+                                        // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                                        for (; i < len; i++) {
+                                            c[i] = (a[ai + i] > bval) ? 1.0 : 0.0;
+                                        }
+                                        return c;
+                                    }
+                                    public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) { // scalar loop: adds a[ai+k] + a[ai+k] to c[ci+k], for k in [0,len)
+                                        for( int j = ai; j < ai+len; j++, ci++) // j walks a[ai..ai+len), ci walks c in lockstep
+                                            c[ci] +=  a[j] + a[j]; // doubling written as a self-addition rather than a multiply
+                                    }
+                                
+                                    public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) { // delegates entirely to LibMatrixMult with a constant factor of 2.0
+                                        LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len); // presumably c[ci+k] += 2.0 * a[ai+k] for k in [0,len) - verify against LibMatrixMult
+                                    }
+
+                                    public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { // scalar loop: c[j] = 1 where a[ai+j] > b[bi+j], else 0, for j in [0,len)
+                                        double[] c = allocVector(len, false); // result buffer; entries 0..len-1 are all assigned by the loop below
+                                        for( int j = 0; j < len; j++, ai++, bi++) // ai and bi advance in lockstep with j
+                                            c[j] = (a[ai] > b[bi]) ? 1 : 0;
+                                        return c;
+                                    }
+                                
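+                                    // NOTE(review): originally marked "not in use: vector api implementation slower than scalar loop version", but this overload holds the primary name vectGreaterWrite (the scalar loop is scalarvectGreaterWrite above) - confirm which variant is meant to be active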
+                                    public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { // Vector API variant: c[i] = 1.0 where a[ai+i] > b[bi+i], else 0.0, for i in [0,len)
+                                        double[] c = allocVector(len, false);
+                                        final DoubleVector ones  = DoubleVector.broadcast(SPECIES, 1.0);
+                                        final DoubleVector zeros = DoubleVector.zero(SPECIES);
+                                
+                                        int i = 0;
+                                        int upper = SPECIES.loopBound(len);
+                                        // upper is len rounded down to a multiple of the species lane count (VectorSpecies.loopBound)
+                                        // main loop: one full vector per iteration, no manual unrolling; assumes vLen == SPECIES.length() - TODO confirm
+                                        for (; i < upper; i += vLen) {
+                                            DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i);
+                                            DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i);
+                                            // mask marks lanes with a > b; blend takes 1.0 in those lanes and keeps 0.0 elsewhere
+                                            VectorMask<Double> gt = aVec.compare(VectorOperators.GT, bVec);
+                                            DoubleVector out = zeros.blend(ones, gt);
+                                
+                                            out.intoArray(c, i);
+                                        }
+                                
+                                        // scalar tail: remaining elements in [upper, len) that do not fill a whole vector
+                                        for (; i < len; i++) {
+                                            c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0;
+                                        }
+                                        return c;
+                                        }
+    
+}
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java
deleted file mode 100644
index 4c2bd230349..00000000000
--- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java
+++ /dev/null
@@ -1,36 +0,0 @@
-package org.apache.sysds.test.component.codegen.performance_tests;
-
-
-public class benchUtil {
-
-    public static void warmup(Runnable r, int iters) {
-        for (int i = 0; i < iters; i++) {
-            r.run();
-        }
-    }
-
-    /** returns ns per call */
-    public static double measure(Runnable r, int iters) {
-        long t0 = System.nanoTime();
-        for (int i = 0; i < iters; i++) {
-            r.run();
-        }
-        long t1 = System.nanoTime();
-        return (t1 - t0) / (double) iters;
-    }
-
-    public static double checksum(double[] x) {
-        double s = 0;
-        for (double v : x) s += v;
-        return s;
-    }
-
-    public static double maxAbsDiff(double[] a, double[] b) {
-        double m = 0;
-        for (int i = 0; i < a.length; i++) {
-            m = Math.max(m, Math.abs(a[i] - b[i]));
-        }
-        return m;
-    }
-}
-
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java
deleted file mode 100644
index c2cd8f068f4..00000000000
--- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java
+++ /dev/null
@@ -1,95 +0,0 @@
-package org.apache.sysds.test.component.codegen.performance_tests;
-import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
-
-
-public class rowMaxsVectMultTest {
-    public static void main(String[] args) {
-        int len = 1_000_000;
-        double[] a = new double[len];
-        for (int i = 0; i < len; i++)
-            a[i] = (i % 10) - 5;
-        double[] b = new double[len];
-        for (int i = 0; i < len; i++)
-            b[i] = (i % 10) - 5;
-
-        float[] a_f = new float[len];
-        for (int i = 0; i < len; i++)
-            a_f[i] = (i % 10) - 5;
-        float[] b_f = new float[len];
-        for (int i = 0; i < len; i++)
-            b_f[i] = (i % 10) - 5;
-
-
-
-        // warm up
-        for (int i = 0; i < 20_000; i++) {
-            LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len);
-            LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len);
-            LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len);
-            LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len);
-            LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len);
-        }
-
-        // measure
-        long t2_0 = System.nanoTime();
-        double s2 = 0;
-        for (int i = 0; i < 2000; i++)
-            s2 += LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len);
-        long t2_1 = System.nanoTime();
-
-        System.out.println("Vector MaxVal=" + s2/2000);
-        System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0));
-        
-        // measure
-        long t1_0 = System.nanoTime();
-        double s1 = 0;
-        for (int i = 0; i < 2000; i++)
-            s1 += LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len);
-        long t1_1 = System.nanoTime();
-
-        System.out.println("Scalar MaxVal Sum=" + s1/2000);
-        System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0));
-
-
-        // measure
-        long t3_0 = System.nanoTime();
-        double s3 = 0;
-        for (int i = 0; i < 2000; i++)
-            s3 += LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len);
-        long t3_1 = System.nanoTime();
-
-        System.out.println("Vector Float MaxVal=" + s3/2000);
-        System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0));
-
-        // measure
-        long t4_0 = System.nanoTime();
-        double s4 = 0;
-        for (int i = 0; i < 2000; i++)
-            s4 += LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len);
-        long t4_1 = System.nanoTime();
-
-        System.out.println("Scalar Float MaxVal=" + s4/2000);
-        System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0));
-
-        // measure
-        long t5_0 = System.nanoTime();
-        double s5 = 0;
-        for (int i = 0; i < 2000; i++)
-            s5 += LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len);
-        long t5_1 = System.nanoTime();
-
-        System.out.println("Vector 2acc MaxVal=" + s5/2000);
-        System.out.println("Time per call (ns): " + ((t5_1 - t5_0) / 2000.0));
-
-    
-
-    }
-}
-/* 
-Scalar Sum=-1.0E9
-Time per call (ns): 142774.5625
-Vector Sum=-1.0E9
-Time per call (ns): 468854.25
-Vector Float Sum=-1.0E9
-Time per call (ns): 274727.3545
-*/
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java
deleted file mode 100644
index a43496d6a8d..00000000000
--- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java
+++ /dev/null
@@ -1,100 +0,0 @@
-
-package org.apache.sysds.test.component.codegen.performance_tests;
-import java.util.Arrays;
-
-import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
-
-
-public class vectDivAddTest {
-    public static void main(String[] args) {
-        //final int len = 32_768;
-        final int len = 262_144;
-        //final int len = 1_000_000;
-
-        final double[] a = new double[len];
-        final double[] cInit = new double[len];
-
-        for (int i = 0; i < len; i++) {
-            a[i] = (i % 10) - 5;
-            cInit[i] = (i % 10) - 5;
-        }
-
-        final double bval = 1.234567; // NOT 1.0
-
-        double[] cScalar = Arrays.copyOf(cInit, len);
-        double[] cVector = Arrays.copyOf(cInit, len);
-        double[] cVectorPureDiv = Arrays.copyOf(cInit, len);
-
-        // Warm up scalar only
-        for (int i = 0; i < 200; i++) {
-            LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len);
-        }
-
-        // Warm up vector only
-        for (int i = 0; i < 200; i++) {
-            LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len);
-        }
-
-        // Warm up pure div vector only
-        for (int i = 0; i < 200; i++) {
-            LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len);
-        }
-
-        // Reset for measurement
-        cScalar = Arrays.copyOf(cInit, len);
-
-        // Measure scalar
-        long t0 = System.nanoTime();
-        for (int i = 0; i < 2000; i++) {
-            LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len);
-        }
-        long t1 = System.nanoTime();
-
-        // Reset for measurement
-        cVector = Arrays.copyOf(cInit, len);
-
-        // Measure vector
-        long t2 = System.nanoTime();
-        for (int i = 0; i < 2000; i++) {
-            LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len);
-        }
-        long t3 = System.nanoTime();
-
-        // Compare correctness
-        double maxDiff = 0;
-        double sumScalar = 0, sumVector = 0;
-        for (int i = 0; i < len; i++) {
-            maxDiff = Math.max(maxDiff, Math.abs(cScalar[i] - cVector[i]));
-            sumScalar += cScalar[i];
-            sumVector += cVector[i];
-        }
-
-
-         // Reset for measurement
-         cVectorPureDiv = Arrays.copyOf(cInit, len);
-
-         // Measure vector
-         long t4 = System.nanoTime();
-         for (int i = 0; i < 2000; i++) {
-             LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len);
-         }
-         long t5 = System.nanoTime();
- 
-         // Compare correctness
-
-         double sum_prev = sumScalar + sumVector;
-         double sum_Vector_pure_div = 0;
-         for (int i = 0; i < len; i++) {
-             maxDiff = Math.max(maxDiff, Math.abs(sumScalar - cVectorPureDiv[i]));
-             sum_Vector_pure_div += cVectorPureDiv[i];
-         }
-
-        System.out.println("Scalar time per call (ns): " + ((t1 - t0) / 2000.0));
-        System.out.println("Vector time per call (ns): " + ((t3 - t2) / 2000.0));
-        System.out.println("pure vector div time per call (ns): " + ((t5 - t4) / 2000.0));
-        System.out.println("maxDiff: " + maxDiff);
-        System.out.println("checksum scalar: " + sumScalar);
-        System.out.println("checksum vector: " + sumVector);
-        System.out.println("checksum pure vector div : " + sum_Vector_pure_div);
-    }
-}
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java
deleted file mode 100644
index be5666a6847..00000000000
--- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java
+++ /dev/null
@@ -1,61 +0,0 @@
-
-package org.apache.sysds.test.component.codegen.performance_tests;
-import java.util.Arrays;
-
-import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
-
-
-public class vectEqualWriteTest {
-    public static void main(String[] args) {
-        //final int len = 32_768;
-        //final int len = 262_144;
-        final int len = 1_000_000;
-        //final int len = 1_000_000;
-
-        final double[] aInit = new double[len];
-
-        for (int i = 0; i < len; i++) {
-            aInit[i] = (i % 10) - 5;
-        }
-
-        final double bval = 1.234567; // NOT 1.0
-
-        double[] aScalar = Arrays.copyOf(aInit, len);
-        double[] aVector = Arrays.copyOf(aInit, len);
-
-        // Warm up scalar only
-        for (int i = 0; i < 200; i++) {
-            LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len);
-        }
-
-        // Warm up vector only
-        for (int i = 0; i < 200; i++) {
-            LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len);
-        }
-
-        // Reset for measurement
-        aScalar = Arrays.copyOf(aInit, len);
-
-        // Measure scalar
-        long t0 = System.nanoTime();
-        for (int i = 0; i < 2000; i++) {
-            LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len);
-        }
-        long t1 = System.nanoTime();
-        System.out.println("Scalar");
-        System.out.println("Time per call (ns): " + ((t1- t0) / 2000.0));
-        
-
-        // Reset for measurement
-        aVector = Arrays.copyOf(aInit, len);
-
-        // Measure vector
-        long t2 = System.nanoTime();
-        for (int i = 0; i < 2000; i++) {
-            LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len);
-        }
-        long t3 = System.nanoTime();
-        System.out.println("Vector");
-        System.out.println("Time per call (ns): " + ((t3- t2) / 2000.0));
-    }
-}
diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java
deleted file mode 100644
index 90fb36192c8..00000000000
--- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java
+++ /dev/null
@@ -1,74 +0,0 @@
-package org.apache.sysds.test.component.codegen.performance_tests;
-import org.apache.sysds.runtime.codegen.LibSpoofPrimitives;
-
-
-public class vectSumTest {
-    public static void main(String[] args) {
-        int len = 1_000_000;
-        double[] a = new double[len];
-        for (int i = 0; i < len; i++)
-            a[i] = (i % 10) - 5;
-        float[] a_f = new float[len];
-        for (int i = 0; i < len; i++)
-            a_f[i] = (i % 10) - 5;
-
-        // warm up
-        for (int i = 0; i < 20_000; i++) {
-            LibSpoofPrimitives.vectSum(a, 0, len);
-            LibSpoofPrimitives.scalarvectSum(a, 0, len);
-            LibSpoofPrimitives.vectSumFloat(a_f, 0, len);
-            LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len);
-        }
-
-
-        // measure
-        long t2_0 = System.nanoTime();
-        double s2 = 0;
-        for (int i = 0; i < 2000; i++)
-            s2 += LibSpoofPrimitives.scalarvectSum(a, 0, len);
-        long t2_1 = System.nanoTime();
-
-        System.out.println("Scalar Sum=" + s2);
-        System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0));
-        
-        // measure
-        long t1_0 = System.nanoTime();
-        double s1 = 0;
-        for (int i = 0; i < 2000; i++)
-            s1 += LibSpoofPrimitives.vectSum(a, 0, len);
-        long t1_1 = System.nanoTime();
-
-        System.out.println("Vector Sum=" + s1);
-        System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0));
-
-        // measure
-        long t3_0 = System.nanoTime();
-        double s3 = 0;
-        for (int i = 0; i < 2000; i++)
-            s3 += LibSpoofPrimitives.vectSumFloat(a_f, 0, len);
-        long t3_1 = System.nanoTime();
-
-        System.out.println("Vector Float Sum=" + s3);
-        System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0));
-
-
-        // measure
-        long t4_0 = System.nanoTime();
-        double s4 = 0;
-        for (int i = 0; i < 2000; i++)
-            s4 += LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len);
-        long t4_1 = System.nanoTime();
-
-        System.out.println("Scalar Float Sum=" + s4/2000);
-        System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0));
-
-    }
-}
-/* 
-Scalar Sum=-1.0E9
-Time per call (ns): 142774.5625
-Vector Sum=-1.0E9
-Time per call (ns): 468854.25
-Vector Float Sum=-1.0E9
-Time per call (ns): 274727.3545
-*/