diff --git a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java index ebb42676f0e..a66d8f2dcaa 100644 --- a/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java +++ b/src/main/java/org/apache/sysds/runtime/codegen/LibSpoofPrimitives.java @@ -28,10 +28,16 @@ import org.apache.sysds.runtime.functionobjects.IntegerDivide; import org.apache.sysds.runtime.functionobjects.Modulus; import org.apache.sysds.runtime.matrix.data.LibMatrixDNN; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType; import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col; import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling; import org.apache.sysds.runtime.matrix.data.LibMatrixMult; -import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.FloatVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; +import jdk.incubator.vector.VectorMask; /** * This library contains all vector primitives that are used in @@ -45,6 +51,12 @@ public class LibSpoofPrimitives private static IntegerDivide intDiv = IntegerDivide.getFnObject(); private static Modulus mod = Modulus.getFnObject(); private static BitwAnd bwAnd = BitwAnd.getBitwAndFnObject(); + + // Vector API initializations + private static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + private static final VectorSpecies FSPECIES = FloatVector.SPECIES_PREFERRED; + private static final int vLen = SPECIES.length(); + //global pool of reusable vectors, individual operations set up their own thread-local //ring buffers of reusable vectors with specific number of vectors and vector sizes @@ -57,13 +69,32 @@ public class LibSpoofPrimitives }; public static double rowMaxsVectMult(double[] a, double[] b, int ai, int bi, int len) { - double val = Double.NEGATIVE_INFINITY; - int 
j=0; - for( int i = ai; i < ai+len; i++ ) - val = Math.max(a[i]*b[j++], val); - return val; + double maxVal = Double.NEGATIVE_INFINITY; + + int i = 0; + int upper = SPECIES.loopBound(len); + + DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vb = DoubleVector.fromArray(SPECIES, b, bi + i); + DoubleVector prod = va.mul(vb); + vmax = vmax.max(prod); + } + + maxVal = vmax.reduceLanes(VectorOperators.MAX); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + maxVal = Math.max(maxVal, a[ai + i] * b[bi + i]); + } + + return maxVal; } + // note: parameter bi unused public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, int bi, int len) { double val = Double.NEGATIVE_INFINITY; for( int i = ai; i < ai+len; i++ ) @@ -71,6 +102,32 @@ public static double rowMaxsVectMult(double[] a, double[] b, int[] aix, int ai, return val; } + // not in use: vector api implementation slower than scalar loop version + public static double rowMaxsVectMult_vector_api(double[] a, double[] b, int[] aix, int ai, int bi, int len) { + double scalarMax = Double.NEGATIVE_INFINITY; + + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vb = DoubleVector.fromArray(SPECIES, b, 0, aix, ai + i); + DoubleVector prod = va.mul(vb); + vmax = vmax.max(prod); + } + scalarMax = Math.max(scalarMax, vmax.reduceLanes(VectorOperators.MAX)); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + double prod = a[ai + i] * b[aix[ai + i]]; + if (prod > scalarMax) + scalarMax = prod; + } + 
return scalarMax; + } + // forwarded calls to LibMatrixMult public static double dotProduct(double[] a, double[] b, int ai, int bi, int len) { if( a == null || b == null ) return 0; @@ -295,6 +352,7 @@ public static double[] vectCbindWrite(double[] a, double[] b, int[] aix, int ai, * @param len number of processed elements * @return sum value */ + public static double vectSum(double[] a, int ai, int len) { double val = 0; final int bn = len%8; @@ -313,6 +371,27 @@ public static double vectSum(double[] a, int ai, int len) { //scalar result return val; } + // not in use: vector api implementation slower than scalar loop version + public static double vectSum_vector_api(double[] a, int ai, int len) { + double sum = 0d; + int i = 0; + + DoubleVector acc = DoubleVector.zero(SPECIES); + int upperBound = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + acc = acc.add(v); + } + sum += acc.reduceLanes(VectorOperators.ADD); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + sum += a[ai + i]; + } + return sum; + } public static double vectSum(double[] avals, int[] aix, int ai, int alen, int len) { //forward to dense as column indexes not required here @@ -327,36 +406,82 @@ public static double vectSumsq(double[] avals, int[] aix, int ai, int alen, int return LibMatrixMult.dotProduct(avals, avals, ai, ai, alen); } - public static double vectMin(double[] a, int ai, int len) { + public static double scalarvectMin(double[] a, int ai, int len) { double val = Double.POSITIVE_INFINITY; for( int i = ai; i < ai+len; i++ ) val = Math.min(a[i], val); return val; } + + public static double vectMin(double[] a, int ai, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vmin = DoubleVector.broadcast(SPECIES, Double.POSITIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level 
parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + vmin = vmin.min(v); + } + double minVal = vmin.reduceLanes(VectorOperators.MIN); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + minVal = Math.min(minVal, a[ai + i]); + } + return minVal; + } public static double vectMin(double[] avals, int[] aix, int ai, int alen, int len) { double val = vectMin(avals, ai, alen); return (alen nz = v.compare(VectorOperators.NE, vzero); + count += nz.trueCount(); + } + + //rest, not aligned to vLen-blocks + for(;i eq = aVec.compare(VectorOperators.EQ, bVec); + + DoubleVector inc = zeros.blend(ones, eq); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] == bval) ? 1.0 : 0.0; + } + } + public static void vectEqualAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectEqualAdd(a, bval, c, ai, ci, len); @@ -1609,21 +1896,56 @@ public static void vectEqualAdd(double bval, double[] a, double[] c, int[] aix, public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] == bval) ? 1 : 0; + int i = 0; + int upper = SPECIES.loopBound(len); + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + DoubleVector zeros = DoubleVector.zero(SPECIES); + DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + var mask = va.compare(VectorOperators.EQ, vb); + DoubleVector out = zeros.blend(ones, mask); + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == bval) ? 
1 : 0; + } return c; } + public static double[] vectEqualWrite(double bval, double[] a, int ai, int len) { return vectEqualWrite(a, bval, ai, len); } + public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++, bi++) - c[j] = (a[ai] == b[bi]) ? 1 : 0; - return c; - } + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + VectorMask eq = aVec.compare(VectorOperators.EQ, bVec); + DoubleVector out = zeros.blend(ones, eq); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == b[bi + i]) ? 1.0 : 0.0; + } + return c; + } public static double[] vectEqualWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval == 0) ? 1 : 0; @@ -1655,8 +1977,27 @@ public static double[] vectEqualWrite(double[] a, double[] b, int ai, int[] bix, //custom vector not equal public static void vectNotequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] != bval) ? 
1 : 0; + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector inc = zeros.blend(ones, ne); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] != bval) ? 1.0 : 0.0; + } } public static void vectNotequalAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { @@ -1675,13 +2016,31 @@ public static void vectNotequalAdd(double[] a, double bval, double[] c, int[] ai public static void vectNotequalAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { vectNotequalAdd(a, bval, c, aix, ai, ci, alen, len); } - + public static double[] vectNotequalWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] != bval) ? 
1 : 0; - return c; - } + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector out = zeros.blend(ones, ne); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] != bval) ? 1.0 : 0.0; + } + return c; + } public static double[] vectNotequalWrite(double bval, double[] a, int ai, int len) { return vectNotequalWrite(a, bval, ai, len); @@ -1694,6 +2053,32 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, return c; } + // not in use: vector api implementation slower than scalar loop version + public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector out = zeros.blend(ones, ne); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] != b[bi + i]) ? 
1.0 : 0.0; + } + return c; + } + public static double[] vectNotequalWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval != 0) ? 1 : 0; double[] c = allocVector(len, true, init); @@ -1723,9 +2108,29 @@ public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int[] b //custom vector less public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] < bval) ? 1 : 0; - } + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector inc = zeros.blend(ones, lt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] < bval) ? 1.0 : 0.0; + } + } public static void vectLessAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectGreaterequalAdd(a, bval, c, ai, ci, len); @@ -1743,24 +2148,66 @@ public static void vectLessAdd(double[] a, double bval, double[] c, int[] aix, i public static void vectLessAdd(double bval, double[] a, double[] c, int[] aix, int ai, int ci, int alen, int len) { vectGreaterequalAdd(a, bval, c, aix, ai, ci, alen, len); } - + public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] < bval) ? 
1 : 0; - return c; - } + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < bval) ? 1.0 : 0.0; + } + + return c; + } + public static double[] vectLessWrite(double bval, double[] a, int ai, int len) { return vectGreaterequalWrite(a, bval, ai, len); } - + public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++, bi++) - c[j] = (a[ai] < b[bi]) ? 1 : 0; + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0; + } + return c; - } + } public static double[] vectLessWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval > 0) ? 
1 : 0; @@ -1789,11 +2236,31 @@ public static double[] vectLessWrite(double[] a, double[] b, int ai, int[] bix, } //custom vector less equal - + public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] <= bval) ? 1 : 0; - } + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector inc = zeros.blend(ones, le); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0; + } + } public static void vectLessequalAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectGreaterAdd(a, bval, c, ai, ci, len); @@ -1813,22 +2280,63 @@ public static void vectLessequalAdd(double bval, double[] a, double[] c, int[] a } public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] <= bval) ? 
1 : 0; - return c; - } + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= bval) ? 1.0 : 0.0; + } + + return c; + } public static double[] vectLessequalWrite(double bval, double[] a, int ai, int len) { return vectGreaterWrite(a, bval, ai, len); } - + public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++, bi++) - c[j] = (a[ai] <= b[bi]) ? 1 : 0; + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0; + } + return c; - } + } public static double[] vectLessequalWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval >= 0) ? 
1 : 0; @@ -1859,9 +2367,29 @@ public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int[] //custom vector greater public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { - for( int j = ai; j < ai+len; j++, ci++) - c[ci] += (a[j] > bval) ? 1 : 0; - } + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector inc = zeros.blend(ones, gt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0; + } + } public static void vectGreaterAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { vectLessequalAdd(a, bval, c, ai, ci, len); @@ -1881,11 +2409,30 @@ public static void vectGreaterAdd(double bval, double[] a, double[] c, int[] aix } public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { - double[] c = allocVector(len, false); - for( int j = 0; j < len; j++, ai++) - c[j] = (a[ai] > bval) ? 
1 : 0; - return c; - } + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > bval) ? 1.0 : 0.0; + } + return c; + } public static double[] vectGreaterWrite(double bval, double[] a, int ai, int len) { return vectLessWrite(a, bval, ai, len); @@ -1898,6 +2445,33 @@ public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, return c; } + // not in use: vector api implementation slower than scalar loop version + public static double[] vectGreaterWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + public static double[] vectGreaterWrite(double[] a, double bval, int[] aix, int ai, int alen, int len) { double init = (bval < 0) ? 
1 : 0; double[] c = allocVector(len, true, init); diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index cfdf21255e7..9417e5134e8 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -4019,6 +4019,45 @@ public static void vectMultiplyWrite( final double[] a, double[] b, double[] c, c[ ci+bix[j+7] ] = a[ ai+bix[j+7] ] * b[ j+7 ]; } } + // test + public static double[] vectMult2Write(double[] a,double[] c, int ai, int len) { + + int i = 0; + int upper = SPECIES.loopBound(len); + + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + va.add(va).intoArray(c, i); + } + + for (; i < len; i++) { + double x = a[ai + i]; + c[i] = x + x; + } + + return c; + } + public static double[] vectMult2Write_dedicated_2(double[] a, double[] c, int ai, int len) { + + final int bn = len % vLen; + + // scalar prefix so the vector loop is an exact multiple of vLen + for (int j = 0; j < bn; j++) { + double x = a[ai + j]; + c[j] = x + x; + } + + // vector loop: j runs over multiples of vLen, no tail afterwards + for (int j = bn; j < len; j += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + j); + va.add(va).intoArray(c, j); + // or: va.mul(2.0) via broadcast if you prefer + } + + return c; + } + + public static void vectMultiply(double[] a, double[] c, int ai, int ci, final int len){ diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java new file mode 100644 index 00000000000..c428f6782a9 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/BenchCase.java @@ -0,0 +1,332 @@ +package org.apache.sysds.performance.primitives_vector_api; +import 
org.apache.sysds.performance.primitives_vector_api.BenchCase.OutKind; +import org.apache.sysds.runtime.codegen.LibSpoofPrimitives; + + +public enum BenchCase { + + // Aggregations + + VECT_SUM( + "vectSum dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectSum(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectSum(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + + + ROWS_MAXS_VECT_MULT( + "rowMaxsVectMult dense", + OutKind.SCALAR_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB();}, + ctx -> ctx.scalarRes = backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> ctx.vectorRes = backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9; + } + ), + + ROWS_MAXS_VECT_MULT_AIX( + "rowMaxsVectMult_aix dense", + OutKind.SCALAR_DOUBLE, + ctx -> {ctx.initDenseA();ctx.initDenseB();ctx.initDenseAInt();}, + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarrowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> { + ctx.vectorRes = backup_primitives_for_benchmark.rowMaxsVectMult(ctx.a, ctx.b, ctx.a_int,0,0,ctx.len); + BenchUtil.blackhole = ctx.vectorRes; + }, + ctx -> { + ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9; + } + ), + + VECT_MAX( + "vectMax dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectMax(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectMax(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + 
VECT_COUNTNNZ( + "vectCountnnz dense", + OutKind.SCALAR_DOUBLE, + ctx -> ctx.initDenseA(), + ctx -> {ctx.scalarRes = backup_primitives_for_benchmark.scalarvectCountnnz(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.scalarRes; + }, + ctx -> {ctx.vectorRes = backup_primitives_for_benchmark.vectCountnnz(ctx.a, 0, ctx.len); + BenchUtil.blackhole = ctx.vectorRes;}, + ctx -> {ctx.ok = Math.abs(ctx.scalarRes - ctx.vectorRes) <= 1e-9;} + ), + + // Divisions + + VECT_DIV_ADD( + "vectDivAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseADiv();}, + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, 0, 0, ctx.len), + ctx -> backup_primitives_for_benchmark.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_DIV_ADD_2( + "vectDivAdd2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, 0, 0, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_DIV_ADD_SPARSE( + "vectDivAdd sparse", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();}, + ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.a, ctx.bval, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.a, ctx.bval, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + + VECT_DIV_ADD_SPARSE2( + "vectDivAdd2 sparse", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initDenseAInt(); ctx.initbval();}, + 
ctx -> backup_primitives_for_benchmark.scalarvectDivAdd(ctx.bval, ctx.a, ctx.cScalar, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> LibSpoofPrimitives.vectDivAdd(ctx.bval, ctx.a, ctx.cVector, ctx.a_int, 0, 0,ctx.len, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_DIV_WRITE( + "vectDivWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_DIV_WRITE2( + "vectDivWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.bval, ctx.a, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.bval, ctx.a, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_DIV_WRITE3( + "vectDivWrite3 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval(); ctx.initDenseBDiv();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectDivWrite(ctx.a, ctx.b, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + // Comparisons + + VECT_EQUAL_WRITE( + "vectEqualWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { 
+ ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_EQUAL_ADD( + "vectEqualAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> backup_primitives_for_benchmark.scalarvectEqualAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectEqualAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_EQUAL_WRITE2( + "vectEqualWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectEqualWrite(ctx.a, ctx.bval, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESS_ADD( + "vectLessAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> backup_primitives_for_benchmark.scalarvectLessAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectLessAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESS_WRITE( + "vectLessWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.bval, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESS_WRITE2( + "vectLessWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB(); ctx.initbval();}, + ctx -> ctx.cScalar = 
backup_primitives_for_benchmark.scalarvectLessWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESSEQUAL_ADD( + "vectLessequalAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> backup_primitives_for_benchmark.scalarvectLessequalAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectLessequalAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESSEQUAL_WRITE( + "vectLessequalWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.bval, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_LESSEQUAL_WRITE2( + "vectLessequalWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectLessequalWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectLessequalWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + VECT_GREATER_ADD( + "vectGreaterAdd dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); ctx.initbval();}, + ctx -> backup_primitives_for_benchmark.scalarvectGreaterAdd(ctx.a, ctx.bval, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectGreaterAdd(ctx.a, ctx.bval,ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, 
ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_GREATER_WRITE( + "vectGreaterWrite dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initbval();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.bval, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.bval, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + VECT_GREATER_WRITE2( + "vectGreaterWrite2 dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseA(); ctx.initDenseB();}, + ctx -> ctx.cScalar = backup_primitives_for_benchmark.scalarvectGreaterWrite(ctx.a, ctx.b, 0, 0 ,ctx.len), + ctx -> ctx.cVector = LibSpoofPrimitives.vectGreaterWrite(ctx.a, ctx.b, 0, 0, ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ), + + // vectMult2 + + VECT_Mult2_ADD( + "vectMult2Add dense", + OutKind.ARRAY_DOUBLE, + ctx -> {ctx.initDenseAandC_mutable(); }, + ctx -> backup_primitives_for_benchmark.scalarvectMult2Add(ctx.a, ctx.cScalar,0, 0,ctx.len), + ctx -> LibSpoofPrimitives.vectMult2Add(ctx.a, ctx.cVector, 0, 0,ctx.len), + ctx -> { + ctx.maxDiff = BenchUtil.maxAbsDiff(ctx.cScalar, ctx.cVector); + ctx.ok = ctx.maxDiff <= 1e-9; + } + ); + + + + + + + + public enum OutKind { SCALAR_DOUBLE, ARRAY_DOUBLE } + public final String name; + public final java.util.function.Consumer setup; + public final java.util.function.Consumer scalar; + public final java.util.function.Consumer vector; + public final java.util.function.Consumer verify; + public final OutKind outKind; + + + BenchCase(String name, + OutKind outKind, + java.util.function.Consumer setup, + java.util.function.Consumer scalar, + java.util.function.Consumer vector, + java.util.function.Consumer verify) { + this.name = name; this.outKind = outKind; this.setup = setup; this.scalar = scalar; this.vector = vector; this.verify = 
/**
 * Helpers shared by the primitive micro-benchmarks: warm-up and timing loops,
 * minimal command-line parsing, result comparison and report printing.
 */
public class BenchUtil {
	// NOTE(review): presumably a sink to defeat dead-code elimination; nothing in this class writes it.
	public static volatile double blackhole;

	/**
	 * Runs {@code r} {@code iters} times without timing it.
	 *
	 * @param r     code to execute
	 * @param iters number of executions; values &lt;= 0 run nothing
	 */
	public static void warmup(Runnable r, int iters) {
		for (int i = 0; i < iters; i++)
			r.run();
	}

	/**
	 * Times {@code iters} back-to-back executions of {@code r} after requesting a GC.
	 *
	 * @param r     code to execute
	 * @param iters number of timed executions
	 * @return average wall-clock nanoseconds per execution (NaN when {@code iters} is 0)
	 */
	public static double measure(Runnable r, int iters) {
		System.gc();
		long t0 = System.nanoTime();
		for (int i = 0; i < iters; i++)
			r.run();
		long t1 = System.nanoTime();
		return (t1 - t0) / (double) iters;
	}

	// ---- args helpers ----

	/**
	 * Looks up {@code key} in {@code args} and parses the following token as an int.
	 *
	 * @param args command-line arguments
	 * @param key  option name, e.g. {@code "--len"}
	 * @param def  value returned when the key is absent or has no following token
	 * @return the parsed value or {@code def}
	 * @throws NumberFormatException if the token after {@code key} is not an integer
	 */
	public static int argInt(String[] args, String key, int def) {
		for (int i = 0; i < args.length - 1; i++)
			if (args[i].equals(key))
				return Integer.parseInt(args[i + 1]);
		return def;
	}

	/**
	 * Looks up {@code key} in {@code args} and returns the following token.
	 *
	 * @param args command-line arguments
	 * @param key  option name, e.g. {@code "--filter"}
	 * @param def  value returned when the key is absent or has no following token
	 * @return the token after {@code key} or {@code def}
	 */
	public static String argStr(String[] args, String key, String def) {
		for (int i = 0; i < args.length - 1; i++)
			if (args[i].equals(key))
				return args[i + 1];
		return def;
	}

	/**
	 * Returns the largest absolute element-wise difference between two result arrays.
	 * Positions where both arrays hold the same value, including matching NaN or
	 * same-signed infinities, contribute zero, so two kernels that agree on a
	 * non-finite result are not reported as diverging. A NaN on only one side
	 * makes the result NaN.
	 *
	 * @param a first result array
	 * @param b second result array
	 * @return maximum of {@code |a[i] - b[i]|} over all positions, 0 for empty arrays
	 * @throws IllegalArgumentException if the arrays differ in length
	 */
	public static double maxAbsDiff(double[] a, double[] b) {
		if (a.length != b.length)
			throw new IllegalArgumentException(
				"Cannot compare arrays of different length: " + a.length + " vs " + b.length);
		double m = 0;
		for (int i = 0; i < a.length; i++) {
			// Double.compare treats NaN==NaN and Inf==Inf as equal, whereas a[i]-b[i] would be NaN
			if (Double.compare(a[i], b[i]) == 0)
				continue;
			m = Math.max(m, Math.abs(a[i] - b[i]));
		}
		return m;
	}

	/**
	 * Prints one report line for a benchmark that produces a single double.
	 *
	 * @param name      benchmark label
	 * @param nsScalar  average ns per run of the scalar kernel
	 * @param nsVector  average ns per run of the vector kernel
	 * @param scalarRes result of the scalar kernel
	 * @param vectorRes result of the vector kernel
	 * @param ok        whether verification passed
	 */
	public static void printScalarDouble(String name,
			double nsScalar, double nsVector,
			double scalarRes, double vectorRes,
			boolean ok) {
		double speedup = nsScalar / nsVector;
		System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " +
				"s=%.6g v=%.6g | %s%n",
			name, nsScalar, nsVector, speedup, scalarRes, vectorRes, ok ? "OK" : "FAIL");
	}

	/**
	 * Prints one report line for a benchmark that produces an array.
	 *
	 * @param name     benchmark label
	 * @param nsScalar average ns per run of the scalar kernel
	 * @param nsVector average ns per run of the vector kernel
	 * @param maxDiff  largest element-wise deviation between both outputs
	 * @param ok       whether verification passed
	 */
	public static void printArrayDiff(String name,
			double nsScalar, double nsVector,
			double maxDiff,
			boolean ok) {
		double speedup = nsScalar / nsVector;
		System.out.printf("%s | scalar %.1f ns | vector %.1f ns | speedup %.3fx | " +
				"maxDiff=%.6g | %s%n",
			name, nsScalar, nsVector, speedup, maxDiff, ok ? "OK" : "FAIL");
	}
}
System.arraycopy(cInit, 0, cScalar, 0, len); + System.arraycopy(cInit, 0, cVector, 0, len); + } + } + } + diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java new file mode 100644 index 00000000000..6dcb6797f30 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/PrimitivePerfSuite.java @@ -0,0 +1,44 @@ +package org.apache.sysds.performance.primitives_vector_api; + + +public class PrimitivePerfSuite { + public static void main(String[] args) { + //int len = BenchUtil.argInt(args, "--len", 262_144); + int len = BenchUtil.argInt(args, "--len", 1_000_000); + int warmup = BenchUtil.argInt(args, "--warmup", 10_000); + int iters = BenchUtil.argInt(args, "--iters", 100); + String filter = BenchUtil.argStr(args, "--filter", ""); + + for (BenchCase bc : BenchCase.values()) { + if (!filter.isEmpty() && !bc.name.contains(filter)) continue; + + Ctx ctx = new Ctx(); + ctx.len = len; + bc.setup.accept(ctx); + + // warm scalar + ctx.resetC(); + BenchUtil.warmup(() -> {bc.scalar.accept(ctx); },warmup); + ctx.resetC(); + double nsScalar = BenchUtil.measure(() -> { bc.scalar.accept(ctx); }, iters); + + // warm vector + ctx.resetC(); + BenchUtil.warmup(() -> {bc.vector.accept(ctx); }, warmup); + ctx.resetC(); + double nsVector = BenchUtil.measure(() -> {bc.vector.accept(ctx); }, iters); + + // verify once + ctx.resetC(); bc.scalar.accept(ctx); + bc.vector.accept(ctx); + bc.verify.accept(ctx); + + if (bc.outKind == BenchCase.OutKind.SCALAR_DOUBLE) { + BenchUtil.printScalarDouble(bc.name, nsScalar, nsVector, ctx.scalarRes, ctx.vectorRes, ctx.ok); + } else { + BenchUtil.printArrayDiff(bc.name, nsScalar, nsVector, ctx.maxDiff, ctx.ok); + } + + } + } +} diff --git a/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java 
b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java new file mode 100644 index 00000000000..d0086eb9f66 --- /dev/null +++ b/src/test/java/org/apache/sysds/performance/primitives_vector_api/backup_primitives_for_benchmark.java @@ -0,0 +1,856 @@ +package org.apache.sysds.performance.primitives_vector_api; + +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; + + + +import java.util.Arrays; + +import org.apache.commons.math3.util.FastMath; +import org.apache.sysds.runtime.data.DenseBlockFP64; +import org.apache.sysds.runtime.data.SparseRowVector; +import org.apache.sysds.runtime.functionobjects.BitwAnd; +import org.apache.sysds.runtime.functionobjects.IntegerDivide; +import org.apache.sysds.runtime.functionobjects.Modulus; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNN; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNN.PoolingType; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNNIm2Col; +import org.apache.sysds.runtime.matrix.data.LibMatrixDNNPooling; +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; + +import jdk.incubator.vector.DoubleVector; +import jdk.incubator.vector.VectorOperators; +import jdk.incubator.vector.VectorSpecies; +import jdk.incubator.vector.VectorMask; + + +public class backup_primitives_for_benchmark { + + // Vector API initializations + private static final VectorSpecies SPECIES = DoubleVector.SPECIES_PREFERRED; + private static final int vLen = SPECIES.length(); + + public static double[] allocVector(int len, boolean reset) { + return allocVector(len, reset, 0); + } + + protected static double[] allocVector(int len, boolean reset, double resetVal) { + VectorBuffer buff = memPool.get(); + + //find next matching vector in ring buffer or + //allocate new vector if required + double[] vect = buff.next(len); + if( vect == null ) + vect = new double[len]; + + //reset vector if required + if( reset ) + Arrays.fill(vect, resetVal); + return vect; + } + 
private static class VectorBuffer { + private static final int MAX_SIZE = 512*1024; //4MB + private final double[][] _data; + private int _pos; + private int _len1; + private int _len2; + + public VectorBuffer(int num, int len1, int len2) { + //best effort size restriction since large intermediates + //not necessarily used (num refers to the total number) + len1 = Math.min(len1, MAX_SIZE); + len2 = Math.min(len2, MAX_SIZE); + //pre-allocate ring buffer + int lnum = (len2>0 && len1!=len2) ? 2*num : num; + _data = new double[lnum][]; + for( int i=0; i num ) { + _data[2*i] = new double[len1]; + _data[2*i+1] = new double[len2]; + } + else { + _data[i] = new double[len1]; + } + } + _pos = -1; + _len1 = len1; + _len2 = len2; + } + public double[] next(int len) { + if( _len1!=len && _len2!=len ) + return null; + do { + _pos = (_pos+1>=_data.length) ? 0 : _pos+1; + } while( _data[_pos].length!=len ); + return _data[_pos]; + } + @SuppressWarnings("unused") + public boolean isReusable(int num, int len1, int len2) { + int lnum = (len2>0 && len1!=len2) ? 
2*num : num; + return (_len1 == len1 && _len2 == len2 + && _data.length == lnum); + } + } + private static ThreadLocal memPool = new ThreadLocal<>() { + @Override protected VectorBuffer initialValue() { return new VectorBuffer(0,0,0); } + }; + + public static void scalarvectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += a[j] / bval; + } + + public static void vectDivAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final double inv = 1.0 / bval; + final DoubleVector vinv = DoubleVector.broadcast(SPECIES, inv); + int i = 0; final int upperBound = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + vc = vc.add(va.mul(vinv)); vc.intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += a[ai + i] * inv; + } + } + + public static void scalarvectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += bval / a[j]; + } + + public static void vectDivAdd(double bval, double[] a, double[] c, int ai, int ci, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector vc = DoubleVector.fromArray(SPECIES, c, ci + i); + vc = vc.add(vb.div(va)); + vc.intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (;i scalarMax) + scalarMax = prod; + } + return scalarMax; + } + + + public static double scalarvectSum(double[] a, int ai, int len) { + double val = 0; + final int bn = len%8; + + //compute rest + for( 
int i = ai; i < ai+bn; i++ ) + val += a[ i ]; + + //unrolled 8-block (for better instruction-level parallelism) + for( int i = ai+bn; i < ai+len; i+=8 ) { + //read 64B cacheline of a, compute cval' = sum(a) + cval + val += a[ i+0 ] + a[ i+1 ] + a[ i+2 ] + a[ i+3 ] + + a[ i+4 ] + a[ i+5 ] + a[ i+6 ] + a[ i+7 ]; + } + + //scalar result + return val; + } + + public static double vectSum(double[] a, int ai, int len) { + double sum = 0d; + int i = 0; + + DoubleVector acc = DoubleVector.zero(SPECIES); + int upperBound = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += SPECIES.length()) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + acc = acc.add(v); + } + sum += acc.reduceLanes(VectorOperators.ADD); + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + sum += a[ai + i]; + } + return sum; + } + public static double scalarvectMax(double[] a, int ai, int len) { + double val = Double.NEGATIVE_INFINITY; + for( int i = ai; i < ai+len; i++ ) + val = Math.max(a[i], val); + return val; + } + + public static double vectMax(double[] a, int ai, int len) { + int i = 0; + int upperBound = SPECIES.loopBound(len); + DoubleVector vmax = DoubleVector.broadcast(SPECIES, Double.NEGATIVE_INFINITY); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upperBound; i += vLen) { + DoubleVector v = DoubleVector.fromArray(SPECIES, a, ai + i); + vmax = vmax.max(v); + } + double maxVal = vmax.reduceLanes(VectorOperators.MAX); + + //rest, not aligned to vLen-blocks + for(;i nz = v.compare(VectorOperators.NE, vzero); + count += nz.trueCount(); + } + + //rest, not aligned to vLen-blocks + for(;i eq = aVec.compare(VectorOperators.EQ, bVec); + + DoubleVector inc = zeros.blend(ones, eq); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] == bval) ? 
1.0 : 0.0; + } + } + public static double[] scalarvectEqualWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] == bval) ? 1 : 0; + return c; + } + public static double[] vectEqualWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + int i = 0; + int upper = SPECIES.loopBound(len); + DoubleVector vb = DoubleVector.broadcast(SPECIES, bval); + DoubleVector zeros = DoubleVector.zero(SPECIES); + DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector va = DoubleVector.fromArray(SPECIES, a, ai + i); + var mask = va.compare(VectorOperators.EQ, vb); + DoubleVector out = zeros.blend(ones, mask); + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == bval) ? 1 : 0; + } + return c; + } + public static double[] scalarvectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] == b[bi]) ? 1 : 0; + return c; + } + + public static double[] vectEqualWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + VectorMask eq = aVec.compare(VectorOperators.EQ, bVec); + DoubleVector out = zeros.blend(ones, eq); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] == b[bi + i]) ? 
1.0 : 0.0; + } + return c; + } + public static double[] vectNotequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] != b[bi]) ? 1 : 0; + return c; + } + + // not in use: vector api implementation slower than scalar loop version +public static double[] vectNotequalWrite_vector_api(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask ne = aVec.compare(VectorOperators.NE, bVec); + DoubleVector out = zeros.blend(ones, ne); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] != b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + + + public static void scalarvectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += (a[j] < bval) ? 
1 : 0; + } + public static void vectLessAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector inc = zeros.blend(ones, lt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] < bval) ? 1.0 : 0.0; + } + } + + + public static double[] scalarvectLessWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] < bval) ? 1 : 0; + return c; + } + + + public static double[] vectLessWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < bval) ? 
1.0 : 0.0; + } + + return c; + } + + public static double[] scalarvectLessWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] < b[bi]) ? 1 : 0; + return c; + } + + public static double[] vectLessWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask lt = aVec.compare(VectorOperators.LT, bVec); + DoubleVector out = zeros.blend(ones, lt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] < b[bi + i]) ? 1.0 : 0.0; + } + + return c; + } + public static void scalarvectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += (a[j] <= bval) ? 
1 : 0; + } + + public static void vectLessequalAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector inc = zeros.blend(ones, le); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] <= bval) ? 1.0 : 0.0; + } + } + public static double[] scalarvectLessequalWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] <= bval) ? 1 : 0; + return c; + } + public static double[] vectLessequalWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= bval) ? 
1.0 : 0.0; + } + + return c; + } + public static double[] scalarvectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] <= b[bi]) ? 1 : 0; + return c; + } + + public static double[] vectLessequalWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask le = aVec.compare(VectorOperators.LE, bVec); + DoubleVector out = zeros.blend(ones, le); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] <= b[bi + i]) ? 1.0 : 0.0; + } + + return c; + } + public static void scalarvectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += (a[j] > bval) ? 
1 : 0; + } + + public static void vectGreaterAdd(double[] a, double bval, double[] c, int ai, int ci, int len) { + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector cVec = DoubleVector.fromArray(SPECIES, c, ci + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector inc = zeros.blend(ones, gt); + + cVec.add(inc).intoArray(c, ci + i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[ci + i] += (a[ai + i] > bval) ? 1.0 : 0.0; + } + } + public static double[] scalarvectGreaterWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++) + c[j] = (a[ai] > bval) ? 1 : 0; + return c; + } + public static double[] vectGreaterWrite(double[] a, double bval, int ai, int len) { + double[] c = allocVector(len, false); + final DoubleVector bVec = DoubleVector.broadcast(SPECIES, bval); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > bval) ? 
1.0 : 0.0; + } + return c; + } + public static void scalarvectMult2Add(double[] a, double[] c, int ai, int ci, int len) { + for( int j = ai; j < ai+len; j++, ci++) + c[ci] += a[j] + a[j]; + } + + public static void vectMult2Add(double[] a, double[] c, int ai, int ci, int len) { + LibMatrixMult.vectMultiplyAdd(2.0,a,c,ai,ci,len); + } + + public static double[] scalarvectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + for( int j = 0; j < len; j++, ai++, bi++) + c[j] = (a[ai] > b[bi]) ? 1 : 0; + return c; + } + + // not in use: vector api implementation slower than scalar loop version + public static double[] vectGreaterWrite(double[] a, double[] b, int ai, int bi, int len) { + double[] c = allocVector(len, false); + final DoubleVector ones = DoubleVector.broadcast(SPECIES, 1.0); + final DoubleVector zeros = DoubleVector.zero(SPECIES); + + int i = 0; + int upper = SPECIES.loopBound(len); + + //unrolled vLen-block (for better instruction-level parallelism) + for (; i < upper; i += vLen) { + DoubleVector aVec = DoubleVector.fromArray(SPECIES, a, ai + i); + DoubleVector bVec = DoubleVector.fromArray(SPECIES, b, bi + i); + + VectorMask gt = aVec.compare(VectorOperators.GT, bVec); + DoubleVector out = zeros.blend(ones, gt); + + out.intoArray(c, i); + } + + //rest, not aligned to vLen-blocks + for (; i < len; i++) { + c[i] = (a[ai + i] > b[bi + i]) ? 1.0 : 0.0; + } + return c; + } + +}