From 9557e0e3bbd019327142bf4ce52168ad625e1a1d Mon Sep 17 00:00:00 2001 From: JulianJuelg Date: Tue, 27 Jan 2026 21:07:02 +0100 Subject: [PATCH 1/3] first vector api implementaions + benchmarking suit --- .../runtime/codegen/LibSpoofPrimitives.java | 287 +++++++++++++++++- .../primitives_vector_api/BenchCase.java | 46 +++ .../primitives_vector_api/BenchUtil.java | 63 ++++ .../primitives_vector_api/Ctx.java | 33 ++ .../PrimitivePerfSuite.java | 43 +++ .../codegen/performance_tests/benchUtil.java | 36 +++ .../rowMaxsVectMultTest.java | 95 ++++++ .../performance_tests/vectDivAddTest.java | 100 ++++++ .../performance_tests/vectEqualWriteTest.java | 61 ++++ .../performance_tests/vectSumTest.java | 74 +++++ 10 files changed, 833 insertions(+), 5 deletions(-) create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java create mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java index ebb42676f0e..214226497f0 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java @@ -28,10 +28,15 @@ import org.apache.sysds.runtime.functionobjects.IntegerDivide; import org.apache.sysds.runtime.functionobjects.Modulus; import org.apache.sysds.runtime.matrix.data.LibMatrixDNN; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType; import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col; import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling; import org.apache.sysds.runtime.matrix.data.LibMatrixMult; -import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; /** * This library contains all vector primitives that are used in @@ -45,6 +50,12 @@ public class LibSpoofPrimitives private static IntegerDivide intDiv = IntegerDivide.getFnObject(); private static Modulus mod = Modulus.getFnObject(); private static BitwAnd bwAnd = BitwAnd.getBitwAndFnObject(); + + // Vector API initializations + private static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + private static final VectorSpecies FSPECIES = FloatVector.SPECIES_PREFERRED; + private static final int vLen = SPECIES.length(); + //global pool of reusable vectors, individual operations set up their own thread-local //ring buffers of reusable vectors with specific number of vectors and vector sizes @@ -56,7 +67,7 @@ public class LibSpoofPrimitives @Override protected SparseVectorBuffer initialValue() { return new SparseVectorBuffer(0,0,0); } }; - public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { + public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { double val = Double.NEGATIVE_INFINITY; int j=0; for( int i = ai; i < ai+len; i++ ) @@ -64,6 +75,78 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int return val; } + public static double scalarrowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) { + float val = Float.NEGATIVE_INFINITY; + int j=0; + for( int i = ai; i < ai+len; i++ ) + val = Math.max(a[i]*b[j++], val); + return val; + } + + public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { + double maxVal = Double.NEGATIVE_INFINITY; + + int i = 0; + int upper = SPECIES.loopBound(len); + + // vector accumulator for max + DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + // IMPORTANT: + // Your original code uses b[j++] starting at 0 (ignores bi). + // I assume that is a bug/oversight, so I use b[bi + i]. + // If you *must* keep exact old semantics, replace (bi + i) with just i. + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i); + DoubleVector prod = va.mul(vb); + vmax = vmax.max(prod); + } + + // Reduce vector lanes to a scalar max + maxVal = vmax.reduceLanes(VectorOperators.MAX); + + // Tail + for (; i < len; i++) { + maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); + } + + return maxVal; + } + + public static double rowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) { + float maxVal = Float.NEGATIVE_INFINITY; + + int i = 0; + int upper = FSPECIES.loopBound(len); + + // vector accumulator for max + FloatVector vmax = FloatVector.broadcast(FSPECIES, Float.NEGATIVE_INFINITY); + + // IMPORTANT: + // Your original code uses b[j++] starting at 0 (ignores bi). + // I assume that is a bug/oversight, so I use b[bi + i]. + // If you *must* keep exact old semantics, replace (bi + i) with just i. + for (; i < upper; i += FSPECIES.length()) { + FloatVector va = FloatVector.fromArray(FSPECIES, a, ai + i); + FloatVector vb = FloatVector.fromArray(FSPECIES, b, bi + i); + FloatVector prod = va.mul(vb); + vmax = vmax.max(prod); + } + + // Reduce vector lanes to a scalar max + maxVal = vmax.reduceLanes(VectorOperators.MAX); + + // Tail + for (; i < len; i++) { + maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); + } + + return maxVal; + } + + + public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { double val = Double.NEGATIVE_INFINITY; for( int i = ai; i < ai+len; i++ ) @@ -295,7 +378,9 @@ public static double[] vectCbindWrite(double[] a, double[] b, int[] aix, int ai, * @param len number of processed elements * @return sum value */ - public static double vectSum(double[] a, int ai, int len) { + + // scalar function + public static double scalarvectSum(double[] a, int ai, int len) { double val = 0; final int bn = len%8; @@ -313,6 +398,113 @@ public static double vectSum(double[] a, int ai, int len) { //scalar result return val; } + + public static double scalarvectSumFloat(float[] a, int ai, int len) { + float val = 0; + final int bn = len%8; + + //compute rest + for( int i = ai; i < ai+bn; i++ ) + val += a[ i ]; + + //unrolled 8-block (for better instruction-level parallelism) + for( int i = ai+bn; i < ai+len; i+=8 ) { + //read 64B cacheline of a, compute cval' = sum(a) + cval + val += a[ i+0 ] + a[ i+1 ] + a[ i+2 ] + a[ i+3 ] + + a[ i+4 ] + a[ i+5 ] + a[ i+6 ] + a[ i+7 ]; + } + + //scalar result + return val; + } + public static double vectSum(double[] a, int ai, int len) { + double sum = 0d; + int i = 0; + + DoubleVector acc = DoubleVector.zero(SPECIES); + + // largest multiple of vLen <= len + int upperBound = SPECIES.loopBound(len); + + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + acc = acc.add(v); + } + + // reduce vector lanes into scalar + sum += acc.reduceLanes(VectorOperators.ADD); + + // tail (remaining elements) + for (; i < len; i++) { + sum += a[ai + i]; + } + + return sum; + } + + public static double rowMaxsVectMultVec2Acc(double[] a, double[] b, int ai, int bi, int len) { + int i = 0; + int upper = SPECIES.loopBound(len); + + DoubleVector vmax1 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + DoubleVector vmax2 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + // step = 2 vectors per iteration + int step = vLen * 2; + + for (; i + step <= upper; i += step) { + DoubleVector va1 = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vb1 = DoubleVector.fromArray(SPECIES, b, bi + i); + vmax1 = vmax1.max(va1.mul(vb1)); + + DoubleVector va2 = DoubleVector.fromArray(SPECIES, a, ai + i + vLen); + DoubleVector vb2 = DoubleVector.fromArray(SPECIES, b, bi + i + vLen); + vmax2 = vmax2.max(va2.mul(vb2)); + } + + // finish remaining vector loop + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i); + vmax1 = vmax1.max(va.mul(vb)); + } + + // combine both accumulators + DoubleVector vmax = vmax1.max(vmax2); + double maxVal = vmax.reduceLanes(VectorOperators.MAX); + + // tail + for (; i < len; i++) { + maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); + } + + return maxVal; + } + + public static double vectSumFloat(float[] a, int ai, int len) { + float sum = 0; + int i = 0; + + FloatVector acc = FloatVector.zero(FSPECIES); + + // largest multiple of vLen <= len + int upperBound = FSPECIES.loopBound(len); + + for (; i < upperBound; i += FSPECIES.length()) { + FloatVector v = FloatVector.fromArray(FSPECIES, a, ai + i); + acc = acc.add(v); + } + + // reduce vector lanes into scalar + sum += acc.reduceLanes(VectorOperators.ADD); + + // tail (remaining elements) + for (; i < len; i++) { + sum += a[ai + i]; + } + + return sum; + } public static double vectSum(double[] avals, int[] aix, int ai, int alen, int len) { //forward to dense as column indexes not required here @@ -373,10 +565,68 @@ public static double vectMean(double[] avals, int[] aix, int ai, int alen, int l //custom vector div - public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) c[ci] += a[j] / bval; } + + public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + // Handle trivial case + if (len <= 0) return; + + // Preferred SIMD width for the current CPU (AVX2/AVX-512/etc.) + final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + + // Hoist reciprocal (1 division instead of len divisions) + final double inv = 1.0 / bval; + final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); + + int i = 0; + final int upperBound = SPECIES.loopBound(len); + + // Vector loop + for (; i < upperBound; i += SPECIES.length()) { + // load a and c + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + + // vc += va * inv + vc = vc.add(va.mul(vinv)); + + // store result back to c + vc.intoArray(c, ci + i); + } + + // Tail loop + for (; i < len; i++) { + c[ci + i] += a[ai + i] * inv; + } + } + + public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + if (len <= 0) return; + + final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + + int i = 0; + final int upperBound = SPECIES.loopBound(len); + + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + + vc = vc.add(va.div(vb)); + + vc.intoArray(c, ci + i); + } + + for (; i < len; i++) { + c[ci + i] += a[ai + i] / bval; + } + } + + public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) @@ -1607,12 +1857,39 @@ public static void vectEqualAdd(double bval, double[] a, double[] c, int[] aix, vectEqualAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { + public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++) c[j] = (a[ai] == bval) ? 1 : 0; return c; } + public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + + int i = 0; + int upper = SPECIES.loopBound(len); + + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + DoubleVector zeros = DoubleVector.zero(SPECIES); + DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + var mask = va.compare(VectorOperators.EQ, vb); + + // out = (va == vb) ? 1.0 : 0.0 + DoubleVector out = zeros.blend(ones, mask); + out.intoArray(c, i); + } + + // tail + for (; i < len; i++) { + c[i] = (a[ai + i] == bval) ? 1 : 0; + } + + return c; + } + public static double[] vectEqualWrite(double bval, double[] a, int ai, int len) { return vectEqualWrite(a, bval, ai, len); diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java new file mode 100644 index 00000000000..b748642171d --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java @@ -0,0 +1,46 @@ +package org.apache.sysds.performance.primitives_vector_api; +import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + +public enum BenchCase { + VECT_SUM( + "vectSum dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectSum(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectSum(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + + VECT_DIV_ADD( + "vectDivAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> ctx.initDenseAandC(), + ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ); + public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE } + public final String name; + public final java.util.function.Consumer setup; + public final java.util.function.Consumer scalar; + public final java.util.function.Consumer vector; + public final java.util.function.Consumer verify; + public final OutKind outKind; + + + BenchCase(String name, + OutKind outKind, + java.util.function.Consumer setup, + java.util.function.Consumer scalar, + java.util.function.Consumer vector, + java.util.function.Consumer verify) { + this.name = name; this.outKind = outKind; this.setup = setup; this.scalar = scalar; this.vector = vector; this.verify = verify; + } + } + diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java new file mode 100644 index 00000000000..12af0df27e1 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchUtil.java @@ -0,0 +1,63 @@ +package org.apache.sysds.performance.primitives_vector_api; + + +public class BenchUtil { + public static volatile double blackhole; + + public static void warmup(Runnable r,int iters ) { + for (int i = 0; i < iters; i++) r.run(); + } + + public static double measure(Runnable r,int iters) { + System.gc(); + long t0 = System.nanoTime(); + for (int i = 0; i < iters; i++) r.run(); + long t1 = System.nanoTime(); + return (t1 - t0) / (double) iters; + } + + // ---- args helpers ---- + public static int argInt(String[] args, String key, int def) { + for (int i = 0; i < args.length - 1; i++) + if (args[i].equals(key)) + return Integer.parseInt(args[i + 1]); + return def; + } + + public static String argStr(String[] args, String key, String def) { + for (int i = 0; i < args.length - 1; i++) + if (args[i].equals(key)) + return args[i + 1]; + return def; + } + + public static double maxAbsDiff(double[] a, double[] b) { + double m = 0; + for (int i = 0; i < a.length; i++) + m = Math.max(m, Math.abs(a[i] - b[i])); + return m; + } + + public static void printScalarDouble(String name, + double nsScalar, double nsVector, + double scalarRes, double vectorRes, + boolean ok) { + + double speedup = nsScalar / nsVector; + System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " + + "s=%.6g v=%.6g | %s%n", + name, nsScalar, nsVector, speedup, scalarRes, vectorRes, ok ? "OK" : "FAIL"); + } + + public static void printArrayDiff(String name, + double nsScalar, double nsVector, + double maxDiff, + boolean ok) { + + double speedup = nsScalar / nsVector; + System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " + + "maxDiff=%.6g | %s%n", + name, nsScalar, nsVector, speedup, maxDiff, ok ? "OK" : "FAIL"); + } + } + \ No newline at end of file diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java new file mode 100644 index 00000000000..84c66266c8f --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java @@ -0,0 +1,33 @@ +package org.apache.sysds.performance.primitives_vector_api; + +public class Ctx { + public int len; + public double[] a, cInit, cScalar, cVector; + public double bval; + + public double scalarRes, vectorRes; + public double maxDiff; + public boolean ok; + + void initDenseA() { + a = new double[len]; + for (int i = 0; i < len; i++) a[i] = (i % 10) - 5; + } + + void initDenseAandC() { + initDenseA(); + cInit = new double[len]; + for (int i = 0; i < len; i++) cInit[i] = (i % 10) - 5; + cScalar = java.util.Arrays.copyOf(cInit, len); + cVector = java.util.Arrays.copyOf(cInit, len); + bval = 1.234567; + } + + void resetC() { + if (cInit != null) { + System.arraycopy(cInit, 0, cScalar, 0, len); + System.arraycopy(cInit, 0, cVector, 0, len); + } + } + } + diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java new file mode 100644 index 00000000000..c478c7edfb7 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java @@ -0,0 +1,43 @@ +package org.apache.sysds.performance.primitives_vector_api; + +public class PrimitivePerfSuite { + public static void main(String[] args) { + //int len = BenchUtil.argInt(args, "--len", 262_144); + int len = BenchUtil.argInt(args, "--len", 1_000_000); + int warmup = BenchUtil.argInt(args, "--warmup", 10_000); + int iters = BenchUtil.argInt(args, "--iters", 2000); + String filter = BenchUtil.argStr(args, "--filter", ""); + + for (BenchCase bc : BenchCase.values()) { + if (!filter.isEmpty() && !bc.name.contains(filter)) continue; + + Ctx ctx = new Ctx(); + ctx.len = len; + bc.setup.accept(ctx); + + // warm scalar + ctx.resetC(); + BenchUtil.warmup(() -> {bc.scalar.accept(ctx); },warmup); + ctx.resetC(); + double nsScalar = BenchUtil.measure(() -> { bc.scalar.accept(ctx); }, iters); + + // warm vector + ctx.resetC(); + BenchUtil.warmup(() -> {bc.vector.accept(ctx); }, warmup); + ctx.resetC(); + double nsVector = BenchUtil.measure(() -> {bc.vector.accept(ctx); }, iters); + + // verify once + ctx.resetC(); bc.scalar.accept(ctx); + bc.vector.accept(ctx); + bc.verify.accept(ctx); + + if (bc.outKind == BenchCase.OutKind.SCALAR_DOUBLE) { + BenchUtil.printScalarDouble(bc.name, nsScalar, nsVector, ctx.scalarRes, ctx.vectorRes, ctx.ok); + } else { + BenchUtil.printArrayDiff(bc.name, nsScalar, nsVector, ctx.maxDiff, ctx.ok); + } + + } + } +} diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java new file mode 100644 index 00000000000..4c2bd230349 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java @@ -0,0 +1,36 @@ +package org.apache.sysds.test.component.codegen.performance_tests; + + +public class benchUtil { + + public static void warmup(Runnable r, int iters) { + for (int i = 0; i < iters; i++) { + r.run(); + } + } + + /** returns ns per call */ + public static double measure(Runnable r, int iters) { + long t0 = System.nanoTime(); + for (int i = 0; i < iters; i++) { + r.run(); + } + long t1 = System.nanoTime(); + return (t1 - t0) / (double) iters; + } + + public static double checksum(double[] x) { + double s = 0; + for (double v : x) s += v; + return s; + } + + public static double maxAbsDiff(double[] a, double[] b) { + double m = 0; + for (int i = 0; i < a.length; i++) { + m = Math.max(m, Math.abs(a[i] - b[i])); + } + return m; + } +} + diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java new file mode 100644 index 00000000000..c2cd8f068f4 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java @@ -0,0 +1,95 @@ +package org.apache.sysds.test.component.codegen.performance_tests; +import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + + +public class rowMaxsVectMultTest { + public static void main(String[] args) { + int len = 1_000_000; + double[] a = new double[len]; + for (int i = 0; i < len; i++) + a[i] = (i % 10) - 5; + double[] b = new double[len]; + for (int i = 0; i < len; i++) + b[i] = (i % 10) - 5; + + float[] a_f = new float[len]; + for (int i = 0; i < len; i++) + a_f[i] = (i % 10) - 5; + float[] b_f = new float[len]; + for (int i = 0; i < len; i++) + b_f[i] = (i % 10) - 5; + + + + // warm up + for (int i = 0; i < 20_000; i++) { + LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len); + LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len); + LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len); + LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len); + LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len); + } + + // measure + long t2_0 = System.nanoTime(); + double s2 = 0; + for (int i = 0; i < 2000; i++) + s2 += LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len); + long t2_1 = System.nanoTime(); + + System.out.println("Vector MaxVal=" + s2/2000); + System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0)); + + // measure + long t1_0 = System.nanoTime(); + double s1 = 0; + for (int i = 0; i < 2000; i++) + s1 += LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len); + long t1_1 = System.nanoTime(); + + System.out.println("Scalar MaxVal Sum=" + s1/2000); + System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0)); + + + // measure + long t3_0 = System.nanoTime(); + double s3 = 0; + for (int i = 0; i < 2000; i++) + s3 += LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len); + long t3_1 = System.nanoTime(); + + System.out.println("Vector Float MaxVal=" + s3/2000); + System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0)); + + // measure + long t4_0 = System.nanoTime(); + double s4 = 0; + for (int i = 0; i < 2000; i++) + s4 += LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len); + long t4_1 = System.nanoTime(); + + System.out.println("Scalar Float MaxVal=" + s4/2000); + System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0)); + + // measure + long t5_0 = System.nanoTime(); + double s5 = 0; + for (int i = 0; i < 2000; i++) + s5 += LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len); + long t5_1 = System.nanoTime(); + + System.out.println("Vector 2acc MaxVal=" + s5/2000); + System.out.println("Time per call (ns): " + ((t5_1 - t5_0) / 2000.0)); + + + + } +} +/* +Scalar Sum=-1.0E9 +Time per call (ns): 142774.5625 +Vector Sum=-1.0E9 +Time per call (ns): 468854.25 +Vector Float Sum=-1.0E9 +Time per call (ns): 274727.3545 +*/ diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java new file mode 100644 index 00000000000..a43496d6a8d --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java @@ -0,0 +1,100 @@ + +package org.apache.sysds.test.component.codegen.performance_tests; +import java.util.Arrays; + +import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + + +public class vectDivAddTest { + public static void main(String[] args) { + //final int len = 32_768; + final int len = 262_144; + //final int len = 1_000_000; + + final double[] a = new double[len]; + final double[] cInit = new double[len]; + + for (int i = 0; i < len; i++) { + a[i] = (i % 10) - 5; + cInit[i] = (i % 10) - 5; + } + + final double bval = 1.234567; // NOT 1.0 + + double[] cScalar = Arrays.copyOf(cInit, len); + double[] cVector = Arrays.copyOf(cInit, len); + double[] cVectorPureDiv = Arrays.copyOf(cInit, len); + + // Warm up scalar only + for (int i = 0; i < 200; i++) { + LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len); + } + + // Warm up vector only + for (int i = 0; i < 200; i++) { + LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len); + } + + // Warm up pure div vector only + for (int i = 0; i < 200; i++) { + LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len); + } + + // Reset for measurement + cScalar = Arrays.copyOf(cInit, len); + + // Measure scalar + long t0 = System.nanoTime(); + for (int i = 0; i < 2000; i++) { + LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len); + } + long t1 = System.nanoTime(); + + // Reset for measurement + cVector = Arrays.copyOf(cInit, len); + + // Measure vector + long t2 = System.nanoTime(); + for (int i = 0; i < 2000; i++) { + LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len); + } + long t3 = System.nanoTime(); + + // Compare correctness + double maxDiff = 0; + double sumScalar = 0, sumVector = 0; + for (int i = 0; i < len; i++) { + maxDiff = Math.max(maxDiff, Math.abs(cScalar[i] - cVector[i])); + sumScalar += cScalar[i]; + sumVector += cVector[i]; + } + + + // Reset for measurement + cVectorPureDiv = Arrays.copyOf(cInit, len); + + // Measure vector + long t4 = System.nanoTime(); + for (int i = 0; i < 2000; i++) { + LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len); + } + long t5 = System.nanoTime(); + + // Compare correctness + + double sum_prev = sumScalar + sumVector; + double sum_Vector_pure_div = 0; + for (int i = 0; i < len; i++) { + maxDiff = Math.max(maxDiff, Math.abs(sumScalar - cVectorPureDiv[i])); + sum_Vector_pure_div += cVectorPureDiv[i]; + } + + System.out.println("Scalar time per call (ns): " + ((t1 - t0) / 2000.0)); + System.out.println("Vector time per call (ns): " + ((t3 - t2) / 2000.0)); + System.out.println("pure vector div time per call (ns): " + ((t5 - t4) / 2000.0)); + System.out.println("maxDiff: " + maxDiff); + System.out.println("checksum scalar: " + sumScalar); + System.out.println("checksum vector: " + sumVector); + System.out.println("checksum pure vector div : " + sum_Vector_pure_div); + } +} diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java new file mode 100644 index 00000000000..be5666a6847 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java @@ -0,0 +1,61 @@ + +package org.apache.sysds.test.component.codegen.performance_tests; +import java.util.Arrays; + +import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + + +public class vectEqualWriteTest { + public static void main(String[] args) { + //final int len = 32_768; + //final int len = 262_144; + final int len = 1_000_000; + //final int len = 1_000_000; + + final double[] aInit = new double[len]; + + for (int i = 0; i < len; i++) { + aInit[i] = (i % 10) - 5; + } + + final double bval = 1.234567; // NOT 1.0 + + double[] aScalar = Arrays.copyOf(aInit, len); + double[] aVector = Arrays.copyOf(aInit, len); + + // Warm up scalar only + for (int i = 0; i < 200; i++) { + LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len); + } + + // Warm up vector only + for (int i = 0; i < 200; i++) { + LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len); + } + + // Reset for measurement + aScalar = Arrays.copyOf(aInit, len); + + // Measure scalar + long t0 = System.nanoTime(); + for (int i = 0; i < 2000; i++) { + LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len); + } + long t1 = System.nanoTime(); + System.out.println("Scalar"); + System.out.println("Time per call (ns): " + ((t1- t0) / 2000.0)); + + + // Reset for measurement + aVector = Arrays.copyOf(aInit, len); + + // Measure vector + long t2 = System.nanoTime(); + for (int i = 0; i < 2000; i++) { + LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len); + } + long t3 = System.nanoTime(); + System.out.println("Vector"); + System.out.println("Time per call (ns): " + ((t3- t2) / 2000.0)); + } +} diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java new file mode 100644 index 00000000000..90fb36192c8 --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java @@ -0,0 +1,74 @@ +package org.apache.sysds.test.component.codegen.performance_tests; +import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + + +public class vectSumTest { + public static void main(String[] args) { + int len = 1_000_000; + double[] a = new double[len]; + for (int i = 0; i < len; i++) + a[i] = (i % 10) - 5; + float[] a_f = new float[len]; + for (int i = 0; i < len; i++) + a_f[i] = (i % 10) - 5; + + // warm up + for (int i = 0; i < 20_000; i++) { + LibSpoofPrimitives.vectSum(a, 0, len); + LibSpoofPrimitives.scalarvectSum(a, 0, len); + LibSpoofPrimitives.vectSumFloat(a_f, 0, len); + LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len); + } + + + // measure + long t2_0 = System.nanoTime(); + double s2 = 0; + for (int i = 0; i < 2000; i++) + s2 += LibSpoofPrimitives.scalarvectSum(a, 0, len); + long t2_1 = System.nanoTime(); + + System.out.println("Scalar Sum=" + s2); + System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0)); + + // measure + long t1_0 = System.nanoTime(); + double s1 = 0; + for (int i = 0; i < 2000; i++) + s1 += LibSpoofPrimitives.vectSum(a, 0, len); + long t1_1 = System.nanoTime(); + + System.out.println("Vector Sum=" + s1); + System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0)); + + // measure + long t3_0 = System.nanoTime(); + double s3 = 0; + for (int i = 0; i < 2000; i++) + s3 += LibSpoofPrimitives.vectSumFloat(a_f, 0, len); + long t3_1 = System.nanoTime(); + + System.out.println("Vector Float Sum=" + s3); + System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0)); + + + // measure + long t4_0 = System.nanoTime(); + double s4 = 0; + for (int i = 0; i < 2000; i++) + s4 += LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len); + long t4_1 = System.nanoTime(); + + System.out.println("Scalar Float Sum=" + s4/2000); + System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0)); + + } +} +/* +Scalar Sum=-1.0E9 +Time per call (ns): 142774.5625 +Vector Sum=-1.0E9 +Time per call (ns): 468854.25 +Vector Float Sum=-1.0E9 +Time per call (ns): 274727.3545 +*/ From 2c6f30df1a88c90011dcbec73c9d9f057810c370 Mon Sep 17 00:00:00 2001 From: JulianJuelg Date: Fri, 30 Jan 2026 18:40:53 +0100 Subject: [PATCH 2/3] all vector api implementation of dense primitives a) multiplyAdd, b) div, c) aggregations, d) comparisons --- .../runtime/codegen/LibSpoofPrimitives.java | 838 ++++++++++++++---- .../runtime/matrix/data/LibMatrixMult.java | 39 + .../primitives_vector_api/BenchCase.java | 377 +++++++- .../primitives_vector_api/Ctx.java | 36 +- .../PrimitivePerfSuite.java | 3 +- 5 files changed, 1089 insertions(+), 204 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java index 214226497f0..c89c734fa81 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java @@ -37,6 +37,7 @@ import jdk.incubator.vector.FloatVector; import jdk.incubator.vector.VectorOperators; import jdk.incubator.vector.VectorSpecies; +import jdk.incubator.vector.VectorMask; /** * This library contains all vector primitives that are used in @@ -75,27 +76,15 @@ public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int b return val; } - public static double scalarrowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) { - float val = Float.NEGATIVE_INFINITY; - int j=0; - for( int i = ai; i < ai+len; i++ ) - val = Math.max(a[i]*b[j++], val); - return val; - } - public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { double maxVal = Double.NEGATIVE_INFINITY; int i = 0; int upper = SPECIES.loopBound(len); - // vector accumulator for max DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); - - // IMPORTANT: - // Your original code uses b[j++] starting at 0 (ignores bi). - // I assume that is a bug/oversight, so I use b[bi + i]. - // If you *must* keep exact old semantics, replace (bi + i) with just i. + + //unrolled vLen-block (for better instruction-level parallelism) for (; i < upper; i += vLen) { DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i); @@ -103,10 +92,9 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int vmax = vmax.max(prod); } - // Reduce vector lanes to a scalar max maxVal = vmax.reduceLanes(VectorOperators.MAX); - // Tail + //rest, not aligned to vLen-blocks for (; i < len; i++) { maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); } @@ -114,44 +102,37 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int return maxVal; } - public static double rowMaxsVectMultFloat(float[] a, float[] b, int ai, int bi, int len) { - float maxVal = Float.NEGATIVE_INFINITY; - + // note: parameter bi unused + public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { + double val = Double.NEGATIVE_INFINITY; + for( int i = ai; i < ai+len; i++ ) + val = Math.max(a[i]*b[aix[i]], val); + return val; + } + + public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { + double scalarMax = Double.NEGATIVE_INFINITY; + int i = 0; - int upper = FSPECIES.loopBound(len); - - // vector accumulator for max - FloatVector vmax = FloatVector.broadcast(FSPECIES, Float.NEGATIVE_INFINITY); - - // IMPORTANT: - // Your original code uses b[j++] starting at 0 (ignores bi). - // I assume that is a bug/oversight, so I use b[bi + i]. - // If you *must* keep exact old semantics, replace (bi + i) with just i. - for (; i < upper; i += FSPECIES.length()) { - FloatVector va = FloatVector.fromArray(FSPECIES, a, ai + i); - FloatVector vb = FloatVector.fromArray(FSPECIES, b, bi + i); - FloatVector prod = va.mul(vb); + int upperBound = SPECIES.loopBound(len); + DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + i); + DoubleVector prod = va.mul(vb); vmax = vmax.max(prod); } - - // Reduce vector lanes to a scalar max - maxVal = vmax.reduceLanes(VectorOperators.MAX); - - // Tail + scalarMax = Math.max(scalarMax, vmax.reduceLanes(VectorOperators.MAX)); + + //rest, not aligned to vLen-blocks for (; i < len; i++) { - maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); + double prod = a[ai + i] * b[aix[ai + i]]; + if (prod > scalarMax) + scalarMax = prod; } - - return maxVal; - } - - - - public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { - double val = Double.NEGATIVE_INFINITY; - for( int i = ai; i < ai+len; i++ ) - val = Math.max(a[i]*b[aix[i]], val); - return val; + return scalarMax; } // forwarded calls to LibMatrixMult @@ -399,110 +380,24 @@ public static double scalarvectSum(double[] a, int ai, int len) { return val; } - public static double scalarvectSumFloat(float[] a, int ai, int len) { - float val = 0; - final int bn = len%8; - - //compute rest - for( int i = ai; i < ai+bn; i++ ) - val += a[ i ]; - - //unrolled 8-block (for better instruction-level parallelism) - for( int i = ai+bn; i < ai+len; i+=8 ) { - //read 64B cacheline of a, compute cval' = sum(a) + cval - val += a[ i+0 ] + a[ i+1 ] + a[ i+2 ] + a[ i+3 ] - + a[ i+4 ] + a[ i+5 ] + a[ i+6 ] + a[ i+7 ]; - } - - //scalar result - return val; - } public static double vectSum(double[] a, int ai, int len) { double sum = 0d; int i = 0; DoubleVector acc = DoubleVector.zero(SPECIES); - - // largest multiple of vLen <= len int upperBound = SPECIES.loopBound(len); + //unrolled vLen-block (for better instruction-level parallelism) for (; i < upperBound; i += SPECIES.length()) { DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); acc = acc.add(v); } - - // reduce vector lanes into scalar - sum += acc.reduceLanes(VectorOperators.ADD); - - // tail (remaining elements) - for (; i < len; i++) { - sum += a[ai + i]; - } - - return sum; - } - - public static double rowMaxsVectMultVec2Acc(double[] a, double[] b, int ai, int bi, int len) { - int i = 0; - int upper = SPECIES.loopBound(len); - - DoubleVector vmax1 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); - DoubleVector vmax2 = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); - - // step = 2 vectors per iteration - int step = vLen * 2; - - for (; i + step <= upper; i += step) { - DoubleVector va1 = DoubleVector.fromArray(SPECIES, a, ai + i); - DoubleVector vb1 = DoubleVector.fromArray(SPECIES, b, bi + i); - vmax1 = vmax1.max(va1.mul(vb1)); - - DoubleVector va2 = DoubleVector.fromArray(SPECIES, a, ai + i + vLen); - DoubleVector vb2 = DoubleVector.fromArray(SPECIES, b, bi + i + vLen); - vmax2 = vmax2.max(va2.mul(vb2)); - } - - // finish remaining vector loop - for (; i < upper; i += vLen) { - DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); - DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i); - vmax1 = vmax1.max(va.mul(vb)); - } - - // combine both accumulators - DoubleVector vmax = vmax1.max(vmax2); - double maxVal = vmax.reduceLanes(VectorOperators.MAX); - - // tail - for (; i < len; i++) { - maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); - } - - return maxVal; - } - - public static double vectSumFloat(float[] a, int ai, int len) { - float sum = 0; - int i = 0; - - FloatVector acc = FloatVector.zero(FSPECIES); - - // largest multiple of vLen <= len - int upperBound = FSPECIES.loopBound(len); - - for (; i < upperBound; i += FSPECIES.length()) { - FloatVector v = FloatVector.fromArray(FSPECIES, a, ai + i); - acc = acc.add(v); - } - - // reduce vector lanes into scalar sum += acc.reduceLanes(VectorOperators.ADD); - // tail (remaining elements) + //rest, not aligned to vLen-blocks for (; i < len; i++) { sum += a[ai + i]; } - return sum; } @@ -519,36 +414,93 @@ public static double vectSumsq(double[] avals, int[] aix, int ai, int alen, int return LibMatrixMult.dotProduct(avals, avals, ai, ai, alen); } - public static double vectMin(double[] a, int ai, int len) { + public static double scalarvectMin(double[] a, int ai, int len) { double val = Double.POSITIVE_INFINITY; for( int i = ai; i < ai+len; i++ ) val = Math.min(a[i], val); return val; } + + public static double vectMin(double[] a, int ai, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vmin = DoubleVector.broadcast(SPECIES, Double.POSITIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + vmin = vmin.min(v); + } + double minVal = vmin.reduceLanes(VectorOperators.MIN); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + minVal = Math.min(minVal, a[ai + i]); + } + return minVal; + } public static double vectMin(double[] avals, int[] aix, int ai, int alen, int len) { double val = vectMin(avals, ai, alen); return (alen nz = v.compare(VectorOperators.NE, vzero); + count += nz.trueCount(); + } + + //rest, not aligned to vLen-blocks + for(;i SPECIES = DoubleVector.SPECIES_PREFERRED; + public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final double inv = 1.0 / bval; + final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); + int i = 0; final int upperBound = SPECIES.loopBound(len); - // Hoist reciprocal (1 division instead of len divisions) - final double inv = 1.0 / bval; - final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); - - int i = 0; - final int upperBound = SPECIES.loopBound(len); - - // Vector loop - for (; i < upperBound; i += SPECIES.length()) { - // load a and c - DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); - DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); - - // vc += va * inv - vc = vc.add(va.mul(vinv)); - - // store result back to c - vc.intoArray(c, ci + i); - } - - // Tail loop - for (; i < len; i++) { + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + vc = vc.add(va.mul(vinv)); vc.intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { c[ci + i] += a[ai + i] * inv; - } + } } + + + // for comparison public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { if (len <= 0) return; @@ -628,42 +568,172 @@ public static void pureDivvectDivAdd(double[] a, double bval, double[] c, int ai - public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { + public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) c[ci] += bval / a[j]; } - public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) { + public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + vc = vc.add(vb.div(va)); + vc.intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (;i eq = aVec.compare(VectorOperators.EQ, bVec); + + DoubleVector inc = zeros.blend(ones, eq); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] == bval) ? 1.0 : 0.0; + } + } + public static void vectEqualAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectEqualAdd(a, bval, c, ai, ci, len); @@ -1865,28 +1980,24 @@ public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int } public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); - int i = 0; int upper = SPECIES.loopBound(len); - DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); DoubleVector zeros = DoubleVector.zero(SPECIES); DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + //unrolled vLen-block (for better instruction-level parallelism) for (; i < upper; i += vLen) { DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); var mask = va.compare(VectorOperators.EQ, vb); - - // out = (va == vb) ? 1.0 : 0.0 DoubleVector out = zeros.blend(ones, mask); out.intoArray(c, i); } - // tail + //rest, not aligned to vLen-blocks for (; i < len; i++) { c[i] = (a[ai + i] == bval) ? 1 : 0; } - return c; } @@ -1895,13 +2006,37 @@ public static double[] vectEqualWrite(double bval, double[] a, int ai, int len) return vectEqualWrite(a, bval, ai, len); } - public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] == b[bi]) ? 1 : 0; return c; } + public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + VectorMask eq = aVec.compare(VectorOperators.EQ, bVec); + DoubleVector out = zeros.blend(ones, eq); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + public static double[] vectEqualWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval == 0) ? 1 : 0; double[] c = allocVector(len, true, init); @@ -1931,10 +2066,33 @@ public static double[] vectEqualWrite(double[] a, double[] b, int ai, int[] bix, //custom vector not equal - public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + public static void scalarvectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) c[ci] += (a[j] != bval) ? 1 : 0; } + public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector inc = zeros.blend(ones, ne); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] != bval) ? 1.0 : 0.0; + } + } public static void vectNotequalAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectNotequalAdd(a, bval, c, ai, ci, len); @@ -1953,24 +2111,74 @@ public static void vectNotequalAdd(double bval, double[] a, double[] c, int[] ai vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) { + public static double[] scalarvectNotequalWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++) c[j] = (a[ai] != bval) ? 1 : 0; return c; } + + public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector out = zeros.blend(ones, ne); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] != bval) ? 1.0 : 0.0; + } + return c; + } public static double[] vectNotequalWrite(double bval, double[] a, int ai, int len) { return vectNotequalWrite(a, bval, ai, len); } - public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] scalarvectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] != b[bi]) ? 1 : 0; return c; } + public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector out = zeros.blend(ones, ne); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] != b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + public static double[] vectNotequalWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval != 0) ? 1 : 0; double[] c = allocVector(len, true, init); @@ -1999,10 +2207,34 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int[] b //custom vector less - public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) c[ci] += (a[j] < bval) ? 1 : 0; } + public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector inc = zeros.blend(ones, lt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] < bval) ? 1.0 : 0.0; + } + } public static void vectLessAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectGreaterequalAdd(a, bval, c, ai, ci, len); @@ -2021,24 +2253,81 @@ public static void vectLessAdd(double bval, double[] a, double[] c, int[] aix, i vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { + public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++) c[j] = (a[ai] < bval) ? 1 : 0; return c; } + + + public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < bval) ? 1.0 : 0.0; + } + + return c; + } + public static double[] vectLessWrite(double bval, double[] a, int ai, int len) { return vectGreaterequalWrite(a, bval, ai, len); } - public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] < b[bi]) ? 1 : 0; return c; } + public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0; + } + + return c; + } + public static double[] vectLessWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval > 0) ? 1 : 0; double[] c = allocVector(len, true, init); @@ -2067,10 +2356,35 @@ public static double[] vectLessWrite(double[] a, double[] b, int ai, int[] bix, //custom vector less equal - public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) c[ci] += (a[j] <= bval) ? 1 : 0; } + + public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector inc = zeros.blend(ones, le); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0; + } + } public static void vectLessequalAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectGreaterAdd(a, bval, c, ai, ci, len); @@ -2089,24 +2403,78 @@ public static void vectLessequalAdd(double bval, double[] a, double[] c, int[] a vectGreaterAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { + public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++) c[j] = (a[ai] <= bval) ? 1 : 0; return c; } + public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= bval) ? 1.0 : 0.0; + } + + return c; + } public static double[] vectLessequalWrite(double bval, double[] a, int ai, int len) { return vectGreaterWrite(a, bval, ai, len); } - public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] <= b[bi]) ? 1 : 0; return c; } + public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0; + } + + return c; + } + public static double[] vectLessequalWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval >= 0) ? 1 : 0; double[] c = allocVector(len, true, init); @@ -2135,10 +2503,35 @@ public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int[] //custom vector greater - public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { for( int j = ai; j < ai+len; j++, ci++) c[ci] += (a[j] > bval) ? 1 : 0; } + + public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector inc = zeros.blend(ones, gt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0; + } + } public static void vectGreaterAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectLessequalAdd(a, bval, c, ai, ci, len); @@ -2157,24 +2550,75 @@ public static void vectGreaterAdd(double bval, double[] a, double[] c, int[] aix vectLessequalAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { + public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++) c[j] = (a[ai] > bval) ? 1 : 0; return c; } + public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > bval) ? 1.0 : 0.0; + } + return c; + } public static double[] vectGreaterWrite(double bval, double[] a, int ai, int len) { return vectLessWrite(a, bval, ai, len); } - public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] > b[bi]) ? 1 : 0; return c; } + public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + public static double[] vectGreaterWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval < 0) ? 1 : 0; double[] c = allocVector(len, true, init); diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index cfdf21255e7..9417e5134e8 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -4019,6 +4019,45 @@ public static void vectMultiplyWrite( final double[] a, double[] b, double[] c, c[ ci+bix[j+7] ] = a[ ai+bix[j+7] ] * b[ j+7 ]; } } + // test + public static double[] vectMult2Write(double[] a,double[] c, int ai, int len) { + + int i = 0; + int upper = SPECIES.loopBound(len); + + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + va.add(va).intoArray(c, i); + } + + for (; i < len; i++) { + double x = a[ai + i]; + c[i] = x + x; + } + + return c; + } + public static double[] vectMult2Write_dedicated_2(double[] a, double[] c, int ai, int len) { + + final int bn = len % vLen; + + // scalar prefix so the vector loop is an exact multiple of vLen + for (int j = 0; j < bn; j++) { + double x = a[ai + j]; + c[j] = x + x; + } + + // vector loop: j runs over multiples of vLen, no tail afterwards + for (int j = bn; j < len; j += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + j); + va.add(va).intoArray(c, j); + // or: va.mul(2.0) via broadcast if you prefer + } + + return c; + } + + public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len){ diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java index b748642171d..9cd67051b1e 100644 --- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java @@ -1,7 +1,11 @@ package org.apache.sysds.performance.primitives_vector_api; +import org.apache.sysds.performance.primitives_vector_api.BenchCase.OutKind; import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; public enum BenchCase { + + // Aggregations + VECT_SUM( "vectSum dense", OutKind.SCALAR_DOUBLE, @@ -13,18 +17,387 @@ public enum BenchCase { BenchUtil.blackhole = ctx.vectorRes;}, ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} ), - + + + ROWS_MAXS_VECT_MULT( + "rowMaxsVectMult dense", + OutKind.SCALAR_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB();}, + ctx -> ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9; + } + ), + + ROWS_MAXS_VECT_MULT_AIX( + "rowMaxsVectMult_aix dense", + OutKind.SCALAR_DOUBLE, + ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();}, + ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> { + ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); + BenchUtil.blackhole = ctx.vectorRes; + }, + ctx -> { + ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9; + } + ), + VECT_MIN( + "vectMin dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMin(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMin(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + + VECT_MAX( + "vectMax dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMax(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMax(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + VECT_COUNTNNZ( + "vectCountnnz dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectCountnnz(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectCountnnz(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + + // Divisions + VECT_DIV_ADD( "vectDivAdd dense", OutKind.ARRAY_DOUBLE, - ctx -> ctx.initDenseAandC(), + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();}, ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); ctx.ok = ctx.maxDiff <= 1e-9; } + ), + + VECT_DIV_ADD_2( + "vectDivAdd2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_DIV_ADD_SPARSE( + "vectDivAdd sparse", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + + VECT_DIV_ADD_SPARSE2( + "vectDivAdd2 sparse", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_DIV_WRITE( + "vectDivWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_DIV_WRITE2( + "vectDivWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_DIV_WRITE3( + "vectDivWrite3 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseBDiv();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + // Comparisons + + VECT_EQUAL_WRITE( + "vectEqualWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_EQUAL_ADD( + "vectEqualAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_EQUAL_WRITE2( + "vectEqualWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_NOTEQUAL_ADD( + "vectNotequalAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectNotequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectNotequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_NOTEQUAL_WRITE( + "vectNotequalWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_NOTEQUAL_WRITE2( + "vectNotequalWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.b, 0 ,0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESS_ADD( + "vectLessAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESS_WRITE( + "vectLessWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.bval, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESS_WRITE2( + "vectLessWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESSEQUAL_ADD( + "vectLessequalAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESSEQUAL_WRITE( + "vectLessequalWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.bval, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESSEQUAL_WRITE2( + "vectLessequalWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_GREATER_ADD( + "vectGreaterAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> LibSpoofPrimitives.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_GREATER_WRITE( + "vectGreaterWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.bval, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_GREATER_WRITE2( + "vectGreaterWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB();}, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_DIV_ADD_pure( + "vectDivAddpure dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();}, + ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), + ctx -> LibSpoofPrimitives.pureDivvectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + // vectMult2 + + VECT_Mult2_ADD( + "vectMult2Add dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); }, + ctx -> LibSpoofPrimitives.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_Mult2_WRITE( + "vectMult2Write dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); }, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write(ctx.a, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_Mult2_WRITE_DEDICATED( + "vectMult2Write_dedicated dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); }, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated(ctx.a, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_Mult2_WRITE_DEDICATED2( + "vectMult2Write_dedicated2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); }, + ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated_2(ctx.a, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } ); + + + + + + + public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE } public final String name; public final java.util.function.Consumer setup; diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java index 84c66266c8f..d32ca3433e9 100644 --- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/Ctx.java @@ -2,26 +2,54 @@ public class Ctx { public int len; - public double[] a, cInit, cScalar, cVector; + public double[] a, cInit,b,c, cScalar, cVector; public double bval; public double scalarRes, vectorRes; public double maxDiff; public boolean ok; + public int[] a_int; void initDenseA() { a = new double[len]; for (int i = 0; i < len; i++) a[i] = (i % 10) - 5; } + void initDenseB() { + b = new double[len]; + for (int i = 0; i < len; i++) b[i] = (i % 10) - 5; + } + void initDenseC() { + c = new double[len]; + for (int i = 0; i < len; i++) c[i] = (i % 10) - 5; + } + void initDenseAInt() { + a_int = new int[len]; + for (int i = 0; i < len; i++) a_int[i] = i;; + } + void initbval(){ + bval = 1.234567; + } + void initDenseADiv() { + a = new double[len]; + for (int i = 0; i < len; i++) { + a[i] = ((i % 10) + 1); // Range: 1 to 10 (no zeros) + } + } + void initDenseBDiv() { + b = new double[len]; + for (int i = 0; i < len; i++) b[i] = ((i % 10) + 1); + } + - void initDenseAandC() { - initDenseA(); + void initDenseAandC_mutable() { + initDenseADiv(); cInit = new double[len]; for (int i = 0; i < len; i++) cInit[i] = (i % 10) - 5; cScalar = java.util.Arrays.copyOf(cInit, len); cVector = java.util.Arrays.copyOf(cInit, len); - bval = 1.234567; } + + void resetC() { if (cInit != null) { diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java index c478c7edfb7..6dcb6797f30 100644 --- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java @@ -1,11 +1,12 @@ package org.apache.sysds.performance.primitives_vector_api; + public class PrimitivePerfSuite { public static void main(String[] args) { //int len = BenchUtil.argInt(args, "--len", 262_144); int len = BenchUtil.argInt(args, "--len", 1_000_000); int warmup = BenchUtil.argInt(args, "--warmup", 10_000); - int iters = BenchUtil.argInt(args, "--iters", 2000); + int iters = BenchUtil.argInt(args, "--iters", 100); String filter = BenchUtil.argStr(args, "--filter", ""); for (BenchCase bc : BenchCase.values()) { From a881e55d2e7b51e829156e77b6904d50e59a9ad5 Mon Sep 17 00:00:00 2001 From: JulianJuelg Date: Fri, 30 Jan 2026 23:11:02 +0100 Subject: [PATCH 3/3] Replace codegen primitives with vector api implementation if faster; add all primitives implementations to benchmarking suite --- .../runtime/codegen/LibSpoofPrimitives.java | 203 +---- .../primitives_vector_api/BenchCase.java | 151 +-- .../backup_primitives_for_benchmark.java | 856 ++++++++++++++++++ .../codegen/performance_tests/benchUtil.java | 36 - .../rowMaxsVectMultTest.java | 95 -- .../performance_tests/vectDivAddTest.java | 100 -- .../performance_tests/vectEqualWriteTest.java | 61 -- .../performance_tests/vectSumTest.java | 74 -- 8 files changed, 916 insertions(+), 660 deletions(-) create mode 100644 src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java delete mode 100644 src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java index c89c734fa81..a66d8f2dcaa 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java @@ -68,14 +68,6 @@ public class LibSpoofPrimitives @Override protected SparseVectorBuffer initialValue() { return new SparseVectorBuffer(0,0,0); } }; - public static double scalarrowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { - double val = Double.NEGATIVE_INFINITY; - int j=0; - for( int i = ai; i < ai+len; i++ ) - val = Math.max(a[i]*b[j++], val); - return val; - } - public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { double maxVal = Double.NEGATIVE_INFINITY; @@ -103,14 +95,15 @@ public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int } // note: parameter bi unused - public static double scalarrowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { + public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { double val = Double.NEGATIVE_INFINITY; for( int i = ai; i < ai+len; i++ ) val = Math.max(a[i]*b[aix[i]], val); return val; } - public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double rowMaxsVectMult_vector_api(double[] a, double[] b, int[] aix, int ai, int bi, int len) { double scalarMax = Double.NEGATIVE_INFINITY; int i = 0; @@ -360,8 +353,7 @@ public static double[] vectCbindWrite(double[] a, double[] b, int[] aix, int ai, * @return sum value */ - // scalar function - public static double scalarvectSum(double[] a, int ai, int len) { + public static double vectSum(double[] a, int ai, int len) { double val = 0; final int bn = len%8; @@ -379,8 +371,8 @@ public static double scalarvectSum(double[] a, int ai, int len) { //scalar result return val; } - - public static double vectSum(double[] a, int ai, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double vectSum_vector_api(double[] a, int ai, int len) { double sum = 0d; int i = 0; @@ -445,12 +437,6 @@ public static double vectMin(double[] avals, int[] aix, int ai, int alen, int le return (alen SPECIES = DoubleVector.SPECIES_PREFERRED; - final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); - - int i = 0; - final int upperBound = SPECIES.loopBound(len); - - for (; i < upperBound; i += SPECIES.length()) { - DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); - DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); - - vc = vc.add(va.div(vb)); - - vc.intoArray(c, ci + i); - } - - for (; i < len; i++) { - c[ci + i] += a[ai + i] / bval; - } - } - - public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += bval / a[j]; - } - public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { int i = 0; int upperBound = SPECIES.loopBound(len); @@ -593,13 +537,13 @@ public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int c } - public static void scalarvectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) { + public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) { for( int j = ai; j < ai+alen; j++ ) c[ci + aix[j]] += a[j] / bval; } - // sparse - public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) { + // not in use: vector api implementation slower than scalar loop version + public static void vectDivAdd_vector_api(double[] a, double bval, double[] c, int[] aix, int ai, int ci, int alen, int len) { final double inv = 1.0 / bval; int i = 0; @@ -625,13 +569,13 @@ public static void vectDivAdd(double[] a, double bval, double[] c, int[] aix, in } - public static void scalarvectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { + public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { for( int j = ai; j < ai+alen; j++ ) c[ci + aix[j]] += bval / a[j]; } - //sparse - public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { + // not in use: vector api implementation slower than scalar loop version + public static void vectDivAdd_vector_api(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { int i = 0; int upperBound = SPECIES.loopBound(alen); DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); @@ -654,14 +598,15 @@ public static void vectDivAdd(double bval, double[] a, double[] c, int[] aix, in } - public static double[] scalarvectDivWrite(double[] a, double bval, int ai, int len) { + public static double[] vectDivWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++) c[j] = a[ai+j] / bval; return c; } - public static double[] vectDivWrite(double[] a, double bval, int ai, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double[] vectDivWrite_vector_api(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); final double inv = 1.0 / bval; final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); @@ -682,14 +627,15 @@ public static double[] vectDivWrite(double[] a, double bval, int ai, int len) { } - public static double[] scalarvectDivWrite(double bval, double[] a, int ai, int len) { + public static double[] vectDivWrite(double bval, double[] a, int ai, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++) c[j] = bval / a[ai + j]; return c; } - public static double[] vectDivWrite(double bval, double[] a, int ai, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double[] vectDivWrite_vector_api(double bval, double[] a, int ai, int len) { double[] c = allocVector(len, false); final DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); int i = 0; @@ -708,14 +654,15 @@ public static double[] vectDivWrite(double bval, double[] a, int ai, int len) { return c; } - public static double[] scalarvectDivWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++) c[j] = a[ai + j] / b[bi + j]; return c; } - public static double[] vectDivWrite(double[] a, double[] b, int ai, int bi, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double[] vectDivWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); int i = 0; int upper = SPECIES.loopBound(len); @@ -1800,11 +1747,6 @@ public static double[] vectPow2Write(double[] a, int[] aix, int ai, int alen, in } //custom mult2 - - public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += a[j] + a[j]; - } public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) { LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len); @@ -1815,29 +1757,13 @@ public static void vectMult2Add(double[] a, double[] c, int[] aix, int ai, int c c[ci + aix[j]] += a[j] + a[j]; } - public static double[] scalarvectMult2Write(double[] a, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = a[ai] + a[ai]; - return c; - } public static double[] vectMult2Write(double[] a, int ai, int len) { double[] c = allocVector(len, false); LibMatrixMult.vectMultiplyWrite(2.0,a,c,ai,0,len); return c; } - public static double[] vectMult2Write_dedicated(double[] a, int ai, int len) { - double[] c = allocVector(len, false); - return LibMatrixMult.vectMult2Write(a,c,ai,len); - } - public static double[] vectMult2Write_dedicated_2(double[] a, int ai, int len) { - double[] c = allocVector(len, false); - return LibMatrixMult.vectMult2Write_dedicated_2(a,c,ai,len); - } - - public static double[] vectMult2Write(double[] a, int[] aix, int ai, int alen, int len) { double[] c = allocVector(len, true); for( int j = ai; j < ai+alen; j++ ) @@ -1925,10 +1851,6 @@ public static double[] vectSigmoidWrite(double[] a, int[] aix, int ai, int alen, //custom vector equal - public static void scalarvectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] == bval) ? 1 : 0; - } public static void vectEqualAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { int i = 0; int upper = SPECIES.loopBound(len); @@ -1972,12 +1894,6 @@ public static void vectEqualAdd(double bval, double[] a, double[] c, int[] aix, vectEqualAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] == bval) ? 1 : 0; - return c; - } public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); int i = 0; @@ -2006,12 +1922,6 @@ public static double[] vectEqualWrite(double bval, double[] a, int ai, int len) return vectEqualWrite(a, bval, ai, len); } - public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++, bi++) - c[j] = (a[ai] == b[bi]) ? 1 : 0; - return c; - } public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); @@ -2066,10 +1976,6 @@ public static double[] vectEqualWrite(double[] a, double[] b, int ai, int[] bix, //custom vector not equal - public static void scalarvectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] != bval) ? 1 : 0; - } public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); @@ -2110,13 +2016,6 @@ public static void vectNotequalAdd(double[] a, double bval, double[] c, int[] ai public static void vectNotequalAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len); } - - public static double[] scalarvectNotequalWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] != bval) ? 1 : 0; - return c; - } public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); @@ -2147,14 +2046,15 @@ public static double[] vectNotequalWrite(double bval, double[] a, int ai, int le return vectNotequalWrite(a, bval, ai, len); } - public static double[] scalarvectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] != b[bi]) ? 1 : 0; return c; } - public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); final DoubleVector zeros = DoubleVector.zero(SPECIES); @@ -2207,10 +2107,6 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int[] b //custom vector less - public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] < bval) ? 1 : 0; - } public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); @@ -2252,14 +2148,6 @@ public static void vectLessAdd(double[] a, double bval, double[] c, int[] aix, i public static void vectLessAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len); } - - public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] < bval) ? 1 : 0; - return c; - } - public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); @@ -2292,13 +2180,6 @@ public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { public static double[] vectLessWrite(double bval, double[] a, int ai, int len) { return vectGreaterequalWrite(a, bval, ai, len); } - - public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++, bi++) - c[j] = (a[ai] < b[bi]) ? 1 : 0; - return c; - } public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); @@ -2355,11 +2236,6 @@ public static double[] vectLessWrite(double[] a, double[] b, int ai, int[] bix, } //custom vector less equal - - public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] <= bval) ? 1 : 0; - } public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); @@ -2403,12 +2279,6 @@ public static void vectLessequalAdd(double bval, double[] a, double[] c, int[] a vectGreaterAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] <= bval) ? 1 : 0; - return c; - } public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); @@ -2439,13 +2309,6 @@ public static double[] vectLessequalWrite(double[] a, double bval, int ai, int l public static double[] vectLessequalWrite(double bval, double[] a, int ai, int len) { return vectGreaterWrite(a, bval, ai, len); } - - public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++, bi++) - c[j] = (a[ai] <= b[bi]) ? 1 : 0; - return c; - } public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); @@ -2503,11 +2366,6 @@ public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int[] //custom vector greater - public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] > bval) ? 1 : 0; - } - public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); @@ -2550,12 +2408,6 @@ public static void vectGreaterAdd(double bval, double[] a, double[] c, int[] aix vectLessequalAdd(a, bval, c, aix, ai, ci, alen, len); } - public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] > bval) ? 1 : 0; - return c; - } public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); @@ -2586,14 +2438,15 @@ public static double[] vectGreaterWrite(double bval, double[] a, int ai, int len return vectLessWrite(a, bval, ai, len); } - public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); for( int j = 0; j < len; j++, ai++, bi++) c[j] = (a[ai] > b[bi]) ? 1 : 0; return c; } - public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + // not in use: vector api implementation slower than scalar loop version + public static double[] vectGreaterWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); final DoubleVector zeros = DoubleVector.zero(SPECIES); diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java index 9cd67051b1e..c428f6782a9 100644 --- a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java @@ -2,6 +2,7 @@ import org.apache.sysds.performance.primitives_vector_api.BenchCase.OutKind; import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + public enum BenchCase { // Aggregations @@ -10,10 +11,10 @@ public enum BenchCase { "vectSum dense", OutKind.SCALAR_DOUBLE, ctx -> ctx.initDenseA(), - ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectSum(ctx.a, 0, ctx.len); + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectSum(ctx.a, 0, ctx.len); BenchUtil.blackhole = ctx.scalarRes; }, - ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectSum(ctx.a, 0, ctx.len); + ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectSum(ctx.a, 0, ctx.len); BenchUtil.blackhole = ctx.vectorRes;}, ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} ), @@ -23,8 +24,8 @@ public enum BenchCase { "rowMaxsVectMult dense", OutKind.SCALAR_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initDenseB();}, - ctx -> ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), - ctx -> ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> ctx.scalarRes = backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> ctx.vectorRes = backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), ctx -> { ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9; } @@ -34,37 +35,26 @@ public enum BenchCase { "rowMaxsVectMult_aix dense", OutKind.SCALAR_DOUBLE, ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();}, - ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); BenchUtil.blackhole = ctx.scalarRes; }, ctx -> { - ctx.vectorRes = LibSpoofPrimitives.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); + ctx.vectorRes = backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); BenchUtil.blackhole = ctx.vectorRes; }, ctx -> { ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9; } ), - VECT_MIN( - "vectMin dense", - OutKind.SCALAR_DOUBLE, - ctx -> ctx.initDenseA(), - ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMin(ctx.a, 0, ctx.len); - BenchUtil.blackhole = ctx.scalarRes; - }, - ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMin(ctx.a, 0, ctx.len); - BenchUtil.blackhole = ctx.vectorRes;}, - ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} - ), VECT_MAX( "vectMax dense", OutKind.SCALAR_DOUBLE, ctx -> ctx.initDenseA(), - ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectMax(ctx.a, 0, ctx.len); + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectMax(ctx.a, 0, ctx.len); BenchUtil.blackhole = ctx.scalarRes; }, - ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectMax(ctx.a, 0, ctx.len); + ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectMax(ctx.a, 0, ctx.len); BenchUtil.blackhole = ctx.vectorRes;}, ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} ), @@ -72,10 +62,10 @@ public enum BenchCase { "vectCountnnz dense", OutKind.SCALAR_DOUBLE, ctx -> ctx.initDenseA(), - ctx -> {ctx.scalarRes = LibSpoofPrimitives.scalarvectCountnnz(ctx.a, 0, ctx.len); + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectCountnnz(ctx.a, 0, ctx.len); BenchUtil.blackhole = ctx.scalarRes; }, - ctx -> {ctx.vectorRes = LibSpoofPrimitives.vectCountnnz(ctx.a, 0, ctx.len); + ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectCountnnz(ctx.a, 0, ctx.len); BenchUtil.blackhole = ctx.vectorRes;}, ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} ), @@ -86,8 +76,8 @@ public enum BenchCase { "vectDivAdd dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();}, - ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), - ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), + ctx -> backup_primitives_for_benchmark.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); ctx.ok = ctx.maxDiff <= 1e-9; @@ -98,7 +88,7 @@ public enum BenchCase { "vectDivAdd2 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len), ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -110,7 +100,7 @@ public enum BenchCase { "vectDivAdd sparse", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -123,7 +113,7 @@ public enum BenchCase { "vectDivAdd2 sparse", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -135,7 +125,7 @@ public enum BenchCase { "vectDivWrite dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -146,7 +136,7 @@ public enum BenchCase { "vectDivWrite2 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -157,7 +147,7 @@ public enum BenchCase { "vectDivWrite3 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseBDiv();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -171,7 +161,7 @@ public enum BenchCase { "vectEqualWrite dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -182,7 +172,7 @@ public enum BenchCase { "vectEqualAdd dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -193,51 +183,18 @@ public enum BenchCase { "vectEqualWrite2 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); ctx.ok = ctx.maxDiff <= 1e-9; } ), - VECT_NOTEQUAL_ADD( - "vectNotequalAdd dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectNotequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), - ctx -> LibSpoofPrimitives.vectNotequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } - ), - VECT_NOTEQUAL_WRITE( - "vectNotequalWrite dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseA(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len), - ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.bval, 0,ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } - ), - VECT_NOTEQUAL_WRITE2( - "vectNotequalWrite2 dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectNotequalWrite(ctx.a, ctx.b, 0 ,0 ,ctx.len), - ctx -> ctx.cVector = LibSpoofPrimitives.vectNotequalWrite(ctx.a, ctx.b, 0, 0, ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } - ), VECT_LESS_ADD( "vectLessAdd dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -248,7 +205,7 @@ public enum BenchCase { "vectLessWrite dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.bval, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -259,7 +216,7 @@ public enum BenchCase { "vectLessWrite2 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b, 0, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -270,7 +227,7 @@ public enum BenchCase { "vectLessequalAdd dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -281,7 +238,7 @@ public enum BenchCase { "vectLessequalWrite dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.bval, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -292,7 +249,7 @@ public enum BenchCase { "vectLessequalWrite2 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initDenseB();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.b, 0, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -304,7 +261,7 @@ public enum BenchCase { "vectGreaterAdd dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, - ctx -> LibSpoofPrimitives.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -315,7 +272,7 @@ public enum BenchCase { "vectGreaterWrite dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initbval();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.bval, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); @@ -326,24 +283,13 @@ public enum BenchCase { "vectGreaterWrite2 dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseA(); ctx.initDenseB();}, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.b, 0, 0, ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); ctx.ok = ctx.maxDiff <= 1e-9; } ), - VECT_DIV_ADD_pure( - "vectDivAddpure dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();}, - ctx -> LibSpoofPrimitives.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), - ctx -> LibSpoofPrimitives.pureDivvectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } - ), // vectMult2 @@ -351,45 +297,12 @@ public enum BenchCase { "vectMult2Add dense", OutKind.ARRAY_DOUBLE, ctx -> {ctx.initDenseAandC_mutable(); }, - ctx -> LibSpoofPrimitives.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len), + ctx -> backup_primitives_for_benchmark.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len), ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0, 0,ctx.len), ctx -> { ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); ctx.ok = ctx.maxDiff <= 1e-9; } - ), - VECT_Mult2_WRITE( - "vectMult2Write dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseAandC_mutable(); }, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len), - ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write(ctx.a, 0,ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } - ), - VECT_Mult2_WRITE_DEDICATED( - "vectMult2Write_dedicated dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseAandC_mutable(); }, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len), - ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated(ctx.a, 0,ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } - ), - VECT_Mult2_WRITE_DEDICATED2( - "vectMult2Write_dedicated2 dense", - OutKind.ARRAY_DOUBLE, - ctx -> {ctx.initDenseAandC_mutable(); }, - ctx -> ctx.cScalar = LibSpoofPrimitives.scalarvectMult2Write(ctx.a, 0,ctx.len), - ctx -> ctx.cVector = LibSpoofPrimitives.vectMult2Write_dedicated_2(ctx.a, 0,ctx.len), - ctx -> { - ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); - ctx.ok = ctx.maxDiff <= 1e-9; - } ); diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java new file mode 100644 index 00000000000..d0086eb9f66 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java @@ -0,0 +1,856 @@ +package org.apache.sysds.performance.primitives_vector_api; + +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; + + + +import java.util.Arrays; + +import org.apache.commons.math3.util.FastMath; +import org.apache.sysds.runtime.data.DenseBlockFP64; +import org.apache.sysds.runtime.data.SparseRowVector; +import org.apache.sysds.runtime.functionobjects.BitwAnd; +import org.apache.sysds.runtime.functionobjects.IntegerDivide; +import org.apache.sysds.runtime.functionobjects.Modulus; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNN; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling; +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; +import jdk.incubator.vector.VectorMask; + + +public class backup_primitives_for_benchmark { + + // Vector API initializations + private static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + private static final int vLen = SPECIES.length(); + + public static double[] allocVector(int len, boolean reset) { + return allocVector(len, reset, 0); + } + + protected static double[] allocVector(int len, boolean reset, double resetVal) { + VectorBuffer buff = memPool.get(); + + //find next matching vector in ring buffer or + //allocate new vector if required + double[] vect = buff.next(len); + if( vect == null ) + vect = new double[len]; + + //reset vector if required + if( reset ) + Arrays.fill(vect, resetVal); + return vect; + } + private static class VectorBuffer { + private static final int MAX_SIZE = 512*1024; //4MB + private final double[][] _data; + private int _pos; + private int _len1; + private int _len2; + + public VectorBuffer(int num, int len1, int len2) { + //best effort size restriction since large intermediates + //not necessarily used (num refers to the total number) + len1 = Math.min(len1, MAX_SIZE); + len2 = Math.min(len2, MAX_SIZE); + //pre-allocate ring buffer + int lnum = (len2>0 && len1!=len2) ? 2*num : num; + _data = new double[lnum][]; + for( int i=0; i num ) { + _data[2*i] = new double[len1]; + _data[2*i+1] = new double[len2]; + } + else { + _data[i] = new double[len1]; + } + } + _pos = -1; + _len1 = len1; + _len2 = len2; + } + public double[] next(int len) { + if( _len1!=len && _len2!=len ) + return null; + do { + _pos = (_pos+1>=_data.length) ? 0 : _pos+1; + } while( _data[_pos].length!=len ); + return _data[_pos]; + } + @SuppressWarnings("unused") + public boolean isReusable(int num, int len1, int len2) { + int lnum = (len2>0 && len1!=len2) ? 2*num : num; + return (_len1 == len1 && _len2 == len2 + && _data.length == lnum); + } + } + private static ThreadLocal memPool = new ThreadLocal<>() { + @Override protected VectorBuffer initialValue() { return new VectorBuffer(0,0,0); } + }; + + public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += a[j] / bval; + } + + public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final double inv = 1.0 / bval; + final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); + int i = 0; final int upperBound = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + vc = vc.add(va.mul(vinv)); vc.intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += a[ai + i] * inv; + } + } + + public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += bval / a[j]; + } + + public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + vc = vc.add(vb.div(va)); + vc.intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (;i scalarMax) + scalarMax = prod; + } + return scalarMax; + } + + + public static double scalarvectSum(double[] a, int ai, int len) { + double val = 0; + final int bn = len%8; + + //compute rest + for( int i = ai; i < ai+bn; i++ ) + val += a[ i ]; + + //unrolled 8-block (for better instruction-level parallelism) + for( int i = ai+bn; i < ai+len; i+=8 ) { + //read 64B cacheline of a, compute cval' = sum(a) + cval + val += a[ i+0 ] + a[ i+1 ] + a[ i+2 ] + a[ i+3 ] + + a[ i+4 ] + a[ i+5 ] + a[ i+6 ] + a[ i+7 ]; + } + + //scalar result + return val; + } + + public static double vectSum(double[] a, int ai, int len) { + double sum = 0d; + int i = 0; + + DoubleVector acc = DoubleVector.zero(SPECIES); + int upperBound = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + acc = acc.add(v); + } + sum += acc.reduceLanes(VectorOperators.ADD); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + sum += a[ai + i]; + } + return sum; + } + public static double scalarvectMax(double[] a, int ai, int len) { + double val = Double.NEGATIVE_INFINITY; + for( int i = ai; i < ai+len; i++ ) + val = Math.max(a[i], val); + return val; + } + + public static double vectMax(double[] a, int ai, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + vmax = vmax.max(v); + } + double maxVal = vmax.reduceLanes(VectorOperators.MAX); + + //rest, not aligned to vLen-blocks + for(;i nz = v.compare(VectorOperators.NE, vzero); + count += nz.trueCount(); + } + + //rest, not aligned to vLen-blocks + for(;i eq = aVec.compare(VectorOperators.EQ, bVec); + + DoubleVector inc = zeros.blend(ones, eq); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] == bval) ? 1.0 : 0.0; + } + } + public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] == bval) ? 1 : 0; + return c; + } + public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + int i = 0; + int upper = SPECIES.loopBound(len); + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + DoubleVector zeros = DoubleVector.zero(SPECIES); + DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + var mask = va.compare(VectorOperators.EQ, vb); + DoubleVector out = zeros.blend(ones, mask); + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == bval) ? 1 : 0; + } + return c; + } + public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] == b[bi]) ? 1 : 0; + return c; + } + + public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + VectorMask eq = aVec.compare(VectorOperators.EQ, bVec); + DoubleVector out = zeros.blend(ones, eq); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] != b[bi]) ? 1 : 0; + return c; + } + + // not in use: vector api implementation slower than scalar loop version +public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector out = zeros.blend(ones, ne); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] != b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + + + public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += (a[j] < bval) ? 1 : 0; + } + public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector inc = zeros.blend(ones, lt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] < bval) ? 1.0 : 0.0; + } + } + + + public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] < bval) ? 1 : 0; + return c; + } + + + public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < bval) ? 1.0 : 0.0; + } + + return c; + } + + public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] < b[bi]) ? 1 : 0; + return c; + } + + public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0; + } + + return c; + } + public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += (a[j] <= bval) ? 1 : 0; + } + + public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector inc = zeros.blend(ones, le); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0; + } + } + public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] <= bval) ? 1 : 0; + return c; + } + public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= bval) ? 1.0 : 0.0; + } + + return c; + } + public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] <= b[bi]) ? 1 : 0; + return c; + } + + public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0; + } + + return c; + } + public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += (a[j] > bval) ? 1 : 0; + } + + public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector inc = zeros.blend(ones, gt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0; + } + } + public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] > bval) ? 1 : 0; + return c; + } + public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > bval) ? 1.0 : 0.0; + } + return c; + } + public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += a[j] + a[j]; + } + + public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) { + LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len); + } + + public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] > b[bi]) ? 1 : 0; + return c; + } + + // not in use: vector api implementation slower than scalar loop version + public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + +} diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java deleted file mode 100644 index 4c2bd230349..00000000000 --- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/benchUtil.java +++ /dev/null @@ -1,36 +0,0 @@ -package org.apache.sysds.test.component.codegen.performance_tests; - - -public class benchUtil { - - public static void warmup(Runnable r, int iters) { - for (int i = 0; i < iters; i++) { - r.run(); - } - } - - /** returns ns per call */ - public static double measure(Runnable r, int iters) { - long t0 = System.nanoTime(); - for (int i = 0; i < iters; i++) { - r.run(); - } - long t1 = System.nanoTime(); - return (t1 - t0) / (double) iters; - } - - public static double checksum(double[] x) { - double s = 0; - for (double v : x) s += v; - return s; - } - - public static double maxAbsDiff(double[] a, double[] b) { - double m = 0; - for (int i = 0; i < a.length; i++) { - m = Math.max(m, Math.abs(a[i] - b[i])); - } - return m; - } -} - diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java deleted file mode 100644 index c2cd8f068f4..00000000000 --- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/rowMaxsVectMultTest.java +++ /dev/null @@ -1,95 +0,0 @@ -package org.apache.sysds.test.component.codegen.performance_tests; -import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; - - -public class rowMaxsVectMultTest { - public static void main(String[] args) { - int len = 1_000_000; - double[] a = new double[len]; - for (int i = 0; i < len; i++) - a[i] = (i % 10) - 5; - double[] b = new double[len]; - for (int i = 0; i < len; i++) - b[i] = (i % 10) - 5; - - float[] a_f = new float[len]; - for (int i = 0; i < len; i++) - a_f[i] = (i % 10) - 5; - float[] b_f = new float[len]; - for (int i = 0; i < len; i++) - b_f[i] = (i % 10) - 5; - - - - // warm up - for (int i = 0; i < 20_000; i++) { - LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len); - LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len); - LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len); - LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len); - LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len); - } - - // measure - long t2_0 = System.nanoTime(); - double s2 = 0; - for (int i = 0; i < 2000; i++) - s2 += LibSpoofPrimitives.rowMaxsVectMult(a, b, 0,0,len); - long t2_1 = System.nanoTime(); - - System.out.println("Vector MaxVal=" + s2/2000); - System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0)); - - // measure - long t1_0 = System.nanoTime(); - double s1 = 0; - for (int i = 0; i < 2000; i++) - s1 += LibSpoofPrimitives.scalarrowMaxsVectMult(a, b,0,0, len); - long t1_1 = System.nanoTime(); - - System.out.println("Scalar MaxVal Sum=" + s1/2000); - System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0)); - - - // measure - long t3_0 = System.nanoTime(); - double s3 = 0; - for (int i = 0; i < 2000; i++) - s3 += LibSpoofPrimitives.rowMaxsVectMultFloat(a_f, b_f,0,0, len); - long t3_1 = System.nanoTime(); - - System.out.println("Vector Float MaxVal=" + s3/2000); - System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0)); - - // measure - long t4_0 = System.nanoTime(); - double s4 = 0; - for (int i = 0; i < 2000; i++) - s4 += LibSpoofPrimitives.scalarrowMaxsVectMultFloat(a_f, b_f,0,0, len); - long t4_1 = System.nanoTime(); - - System.out.println("Scalar Float MaxVal=" + s4/2000); - System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0)); - - // measure - long t5_0 = System.nanoTime(); - double s5 = 0; - for (int i = 0; i < 2000; i++) - s5 += LibSpoofPrimitives.rowMaxsVectMultVec2Acc(a, b,0,0, len); - long t5_1 = System.nanoTime(); - - System.out.println("Vector 2acc MaxVal=" + s5/2000); - System.out.println("Time per call (ns): " + ((t5_1 - t5_0) / 2000.0)); - - - - } -} -/* -Scalar Sum=-1.0E9 -Time per call (ns): 142774.5625 -Vector Sum=-1.0E9 -Time per call (ns): 468854.25 -Vector Float Sum=-1.0E9 -Time per call (ns): 274727.3545 -*/ diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java deleted file mode 100644 index a43496d6a8d..00000000000 --- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectDivAddTest.java +++ /dev/null @@ -1,100 +0,0 @@ - -package org.apache.sysds.test.component.codegen.performance_tests; -import java.util.Arrays; - -import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; - - -public class vectDivAddTest { - public static void main(String[] args) { - //final int len = 32_768; - final int len = 262_144; - //final int len = 1_000_000; - - final double[] a = new double[len]; - final double[] cInit = new double[len]; - - for (int i = 0; i < len; i++) { - a[i] = (i % 10) - 5; - cInit[i] = (i % 10) - 5; - } - - final double bval = 1.234567; // NOT 1.0 - - double[] cScalar = Arrays.copyOf(cInit, len); - double[] cVector = Arrays.copyOf(cInit, len); - double[] cVectorPureDiv = Arrays.copyOf(cInit, len); - - // Warm up scalar only - for (int i = 0; i < 200; i++) { - LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len); - } - - // Warm up vector only - for (int i = 0; i < 200; i++) { - LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len); - } - - // Warm up pure div vector only - for (int i = 0; i < 200; i++) { - LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len); - } - - // Reset for measurement - cScalar = Arrays.copyOf(cInit, len); - - // Measure scalar - long t0 = System.nanoTime(); - for (int i = 0; i < 2000; i++) { - LibSpoofPrimitives.scalarvectDivAdd(a, bval, cScalar, 0, 0, len); - } - long t1 = System.nanoTime(); - - // Reset for measurement - cVector = Arrays.copyOf(cInit, len); - - // Measure vector - long t2 = System.nanoTime(); - for (int i = 0; i < 2000; i++) { - LibSpoofPrimitives.vectDivAdd(a, bval, cVector, 0, 0, len); - } - long t3 = System.nanoTime(); - - // Compare correctness - double maxDiff = 0; - double sumScalar = 0, sumVector = 0; - for (int i = 0; i < len; i++) { - maxDiff = Math.max(maxDiff, Math.abs(cScalar[i] - cVector[i])); - sumScalar += cScalar[i]; - sumVector += cVector[i]; - } - - - // Reset for measurement - cVectorPureDiv = Arrays.copyOf(cInit, len); - - // Measure vector - long t4 = System.nanoTime(); - for (int i = 0; i < 2000; i++) { - LibSpoofPrimitives.pureDivvectDivAdd(a, bval, cVectorPureDiv, 0, 0, len); - } - long t5 = System.nanoTime(); - - // Compare correctness - - double sum_prev = sumScalar + sumVector; - double sum_Vector_pure_div = 0; - for (int i = 0; i < len; i++) { - maxDiff = Math.max(maxDiff, Math.abs(sumScalar - cVectorPureDiv[i])); - sum_Vector_pure_div += cVectorPureDiv[i]; - } - - System.out.println("Scalar time per call (ns): " + ((t1 - t0) / 2000.0)); - System.out.println("Vector time per call (ns): " + ((t3 - t2) / 2000.0)); - System.out.println("pure vector div time per call (ns): " + ((t5 - t4) / 2000.0)); - System.out.println("maxDiff: " + maxDiff); - System.out.println("checksum scalar: " + sumScalar); - System.out.println("checksum vector: " + sumVector); - System.out.println("checksum pure vector div : " + sum_Vector_pure_div); - } -} diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java deleted file mode 100644 index be5666a6847..00000000000 --- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectEqualWriteTest.java +++ /dev/null @@ -1,61 +0,0 @@ - -package org.apache.sysds.test.component.codegen.performance_tests; -import java.util.Arrays; - -import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; - - -public class vectEqualWriteTest { - public static void main(String[] args) { - //final int len = 32_768; - //final int len = 262_144; - final int len = 1_000_000; - //final int len = 1_000_000; - - final double[] aInit = new double[len]; - - for (int i = 0; i < len; i++) { - aInit[i] = (i % 10) - 5; - } - - final double bval = 1.234567; // NOT 1.0 - - double[] aScalar = Arrays.copyOf(aInit, len); - double[] aVector = Arrays.copyOf(aInit, len); - - // Warm up scalar only - for (int i = 0; i < 200; i++) { - LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len); - } - - // Warm up vector only - for (int i = 0; i < 200; i++) { - LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len); - } - - // Reset for measurement - aScalar = Arrays.copyOf(aInit, len); - - // Measure scalar - long t0 = System.nanoTime(); - for (int i = 0; i < 2000; i++) { - LibSpoofPrimitives.scalarvectEqualWrite(aScalar, bval, 0,len); - } - long t1 = System.nanoTime(); - System.out.println("Scalar"); - System.out.println("Time per call (ns): " + ((t1- t0) / 2000.0)); - - - // Reset for measurement - aVector = Arrays.copyOf(aInit, len); - - // Measure vector - long t2 = System.nanoTime(); - for (int i = 0; i < 2000; i++) { - LibSpoofPrimitives.vectEqualWrite(aVector, bval, 0,len); - } - long t3 = System.nanoTime(); - System.out.println("Vector"); - System.out.println("Time per call (ns): " + ((t3- t2) / 2000.0)); - } -} diff --git a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java b/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java deleted file mode 100644 index 90fb36192c8..00000000000 --- a/src/test/java/org/apache/sysds/test/component/codegen/performance_tests/vectSumTest.java +++ /dev/null @@ -1,74 +0,0 @@ -package org.apache.sysds.test.component.codegen.performance_tests; -import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; - - -public class vectSumTest { - public static void main(String[] args) { - int len = 1_000_000; - double[] a = new double[len]; - for (int i = 0; i < len; i++) - a[i] = (i % 10) - 5; - float[] a_f = new float[len]; - for (int i = 0; i < len; i++) - a_f[i] = (i % 10) - 5; - - // warm up - for (int i = 0; i < 20_000; i++) { - LibSpoofPrimitives.vectSum(a, 0, len); - LibSpoofPrimitives.scalarvectSum(a, 0, len); - LibSpoofPrimitives.vectSumFloat(a_f, 0, len); - LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len); - } - - - // measure - long t2_0 = System.nanoTime(); - double s2 = 0; - for (int i = 0; i < 2000; i++) - s2 += LibSpoofPrimitives.scalarvectSum(a, 0, len); - long t2_1 = System.nanoTime(); - - System.out.println("Scalar Sum=" + s2); - System.out.println("Time per call (ns): " + ((t2_1 - t2_0) / 2000.0)); - - // measure - long t1_0 = System.nanoTime(); - double s1 = 0; - for (int i = 0; i < 2000; i++) - s1 += LibSpoofPrimitives.vectSum(a, 0, len); - long t1_1 = System.nanoTime(); - - System.out.println("Vector Sum=" + s1); - System.out.println("Time per call (ns): " + ((t1_1 - t1_0) / 2000.0)); - - // measure - long t3_0 = System.nanoTime(); - double s3 = 0; - for (int i = 0; i < 2000; i++) - s3 += LibSpoofPrimitives.vectSumFloat(a_f, 0, len); - long t3_1 = System.nanoTime(); - - System.out.println("Vector Float Sum=" + s3); - System.out.println("Time per call (ns): " + ((t3_1 - t3_0) / 2000.0)); - - - // measure - long t4_0 = System.nanoTime(); - double s4 = 0; - for (int i = 0; i < 2000; i++) - s4 += LibSpoofPrimitives.scalarvectSumFloat(a_f,0, len); - long t4_1 = System.nanoTime(); - - System.out.println("Scalar Float Sum=" + s4/2000); - System.out.println("Time per call (ns): " + ((t4_1 - t4_0) / 2000.0)); - - } -} -/* -Scalar Sum=-1.0E9 -Time per call (ns): 142774.5625 -Vector Sum=-1.0E9 -Time per call (ns): 468854.25 -Vector Float Sum=-1.0E9 -Time per call (ns): 274727.3545 -*/