diff --git a/pom.xml b/pom.xml
index 08669868aa1..191888c389c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -1577,5 +1577,18 @@
fastdoubleparser
0.9.0
+
+
+ org.openjdk.jmh
+ jmh-core
+ 1.37
+ test
+
+
+ org.openjdk.jmh
+ jmh-generator-annprocess
+ 1.37
+ test
+
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index cfdf21255e7..0b17c1bd4f1 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -1321,6 +1321,65 @@ private static void matrixMultDenseDenseOutSparse(MatrixBlock m1, MatrixBlock m2
}
}
+// private static void matrixMultDenseDenseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2,
+// int rl, int ru) {
+// final DenseBlock a = m1.getDenseBlock();
+// final DenseBlock b = m2.getDenseBlock();
+// final SparseBlock c = ret.getSparseBlock();
+// final int m = m1.rlen; // rows left
+// final int cd = m1.clen; // common dim
+// final int n = m2.clen;
+//
+// final int rl1 = pm2 ? 0 : rl;
+// final int ru1 = pm2 ? m : ru;
+// final int rl2 = pm2 ? rl : 0;
+// final int ru2 = pm2 ? ru : cd;
+//
+// final int blocksizeK = 32;
+// final int blocksizeI = 32;
+//
+// // Dense temp buffer for vectorized accumulation
+// final double[] tempRow = new double[n];
+//
+// for(int bi = rl1; bi < ru1; bi += blocksizeI) {
+// final int bimin = Math.min(ru1, bi + blocksizeI);
+// for(int i = bi; i < bimin; i++) {
+// Arrays.fill(tempRow, 0);
+//
+// final double[] avals = a.values(i);
+// final int aix = a.pos(i);
+//
+// for(int bk = rl2; bk < ru2; bk += blocksizeK) {
+// final int bkmin = Math.min(ru2, bk + blocksizeK);
+//
+// for(int k = bk; k < bkmin; k++) { // common dimension
+// final double aval = avals[aix + k];
+// if(aval == 0) continue;
+//
+// final DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval);
+//
+// final double[] bvals = b.values(k);
+// final int bpos = b.pos(k);
+//
+// int j = 0;
+// for(; j <= n - vLen; j += vLen) {
+// DoubleVector bVec = DoubleVector.fromArray(SPECIES, bvals, bpos + j);
+// DoubleVector cVec = DoubleVector.fromArray(SPECIES, tempRow, j);
+// cVec = bVec.fma(aVec, cVec);
+// cVec.intoArray(tempRow, j);
+// }
+//
+// // Scalar tail for remaining elements
+// for(; j < n; j++) {
+// tempRow[j] += aval * bvals[bpos + j];
+// }
+// }
+// }
+//
+// c.setIndexRange(i, 0, n, tempRow, 0, n);
+// }
+// }
+// }
private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2,
int rl, int ru) {
@@ -1362,6 +1421,69 @@ private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m
}
}
+// private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2,
+// int rl, int ru) {
+// final DenseBlock a = m1.getDenseBlock();
+// final SparseBlock b = m2.getSparseBlock();
+// final SparseBlock c = ret.getSparseBlock();
+// final int m = m1.rlen; // rows left
+// final int cd = m1.clen; // common dim
+// final int n = m2.clen;
+//
+// final int rl1 = pm2 ? 0 : rl;
+// final int ru1 = pm2 ? m : ru;
+// final int rl2 = pm2 ? rl : 0;
+// final int ru2 = pm2 ? ru : cd;
+//
+// final int blocksizeK = 32;
+// final int blocksizeI = 32;
+//
+// // Dense temp buffer for vectorized accumulation (one per row)
+// final double[] tempRow = new double[n];
+//
+// for(int bi = rl1; bi < ru1; bi += blocksizeI) {
+// final int bimin = Math.min(ru1, bi + blocksizeI);
+// for(int i = bi; i < bimin; i++) {
+//
+// Arrays.fill(tempRow, 0);
+// final double[] avals = a.values(i);
+// final int aix = a.pos(i);
+//
+// for(int bk = rl2; bk < ru2; bk += blocksizeK) {
+// final int bkmin = Math.min(ru2, bk + blocksizeK);
+// for(int k = bk; k < bkmin; k++) {
+//
+// final double aval = avals[aix + k];
+// if (aval == 0 || b.isEmpty(k)) {
+// continue;
+// }
+//
+// final int[] bIdx = b.indexes(k);
+// final double[] bVals = b.values(k);
+// final int bPos = b.pos(k);
+// final int bLen = b.size(k);
+//
+// int j = 0;
+// for (; j <= bLen - vLen; j += vLen) {
+// DoubleVector bVec = DoubleVector.fromArray(SPECIES, bVals, bPos + j);
+// DoubleVector scaled = bVec.mul(aval);
+//
+// for(int lane = 0; lane < vLen; lane++) {
+// tempRow[bIdx[bPos + j + lane]] += scaled.lane(lane);
+// }
+// }
+//
+// for (; j < bLen; j++) {
+// tempRow[bIdx[bPos + j]] += aval * bVals[bPos + j];
+// }
+// }
+// }
+//
+// c.setIndexRange(i, 0, n, tempRow, 0, n);
+// }
+// }
+// }
+
private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl,
int ru) {
DenseBlock a = m1.getDenseBlock();
@@ -1413,6 +1535,57 @@ private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2
}
}
+// private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2,
+// int rl, int ru) {
+// DenseBlock a = m1.getDenseBlock();
+// DenseBlock c = ret.getDenseBlock();
+// int m = m1.rlen;
+// int cd = m1.clen;
+//
+// // MATRIX-MATRIX (VV, MV not applicable here because V always dense)
+// SparseBlock b = m2.sparseBlock;
+//
+// if( pm2 && m==1 ) { //VECTOR-MATRIX
+// //parallelization over rows in rhs matrix
+// double[] avals = a.valuesAt(0); //vector
+// double[] cvals = c.valuesAt(0); //vector
+// for( int k=rl; k= cd ) {
+// matrixMultSparseDenseMVShortRHS(a, b, c, cd, rl, ru);
+// return;
+// }
+//
+// //sparse matrix-vector w/ cache blocking (keep front of positions)
+// double[] bvals = b.valuesAt(0);
+// double[] cvals = c.valuesAt(0);
+// int[] curk = new int[blocksizeI];
+//
+// for( int bi = rl; bi < ru; bi+=blocksizeI ) {
+// Arrays.fill(curk, 0); //reset positions
+// for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk data() {
tests.add(new Object[]{1000, 1000, 1000, 0.005, 0.6, 6, true});
+ tests.add(new Object[]{1000, 4096, 1, 0.02, 0.6, 1, false});
}
catch(Exception e) {
e.printStackTrace();