diff --git a/pom.xml b/pom.xml index 08669868aa1..191888c389c 100644 --- a/pom.xml +++ b/pom.xml @@ -1577,5 +1577,18 @@ fastdoubleparser 0.9.0 + + + org.openjdk.jmh + jmh-core + 1.37 + test + + + org.openjdk.jmh + jmh-generator-annprocess + 1.37 + test + diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index cfdf21255e7..0b17c1bd4f1 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -1321,6 +1321,65 @@ private static void matrixMultDenseDenseOutSparse(MatrixBlock m1, MatrixBlock m2 } } +// private static void matrixMultDenseDenseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, +// int rl, int ru) { +// final DenseBlock a = m1.getDenseBlock(); +// final DenseBlock b = m2.getDenseBlock(); +// final SparseBlock c = ret.getSparseBlock(); +// final int m = m1.rlen; // rows left +// final int cd = m1.clen; // common dim +// final int n = m2.clen; +// +// final int rl1 = pm2 ? 0 : rl; +// final int ru1 = pm2 ? m : ru; +// final int rl2 = pm2 ? rl : 0; +// final int ru2 = pm2 ? ru : cd; +// +// final int blocksizeK = 32; +// final int blocksizeI = 32; +// +// // Dense temp buffer for vectorized accumulation +// final double[] tempRow = new double[n]; +// +// for(int bi = rl1; bi < ru1; bi += blocksizeI) { +// final int bimin = Math.min(ru1, bi + blocksizeI); +// for(int i = bi; i < bimin; i++) { +// Arrays.fill(tempRow, 0); +// +// final double[] avals = a.values(i); +// final int aix = a.pos(i); +// +// for(int bk = rl2; bk < ru2; bk += blocksizeK) { +// final int bkmin = Math.min(ru2, bk + blocksizeK); +// +// for(int k = bk; k < bkmin; k++) { // common dimension +// final double aval = avals[aix + k]; +// if(aval == 0) continue; +// +// final DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval); +// +// final double[] bvals = b.values(k); +// final int bpos = b.pos(k); +// +// int j = 0; +// for(; j <= n - vLen; j += vLen) { +// DoubleVector bVec = DoubleVector.fromArray(SPECIES, bvals, bpos + j); +// DoubleVector cVec = DoubleVector.fromArray(SPECIES, tempRow, j); +// cVec = bVec.fma(aVec, cVec); +// cVec.intoArray(tempRow, j); +// } +// +// // Scalar tail for remaining elements +// for(; j < n; j++) { +// tempRow[j] += aval * bvals[bpos + j]; +// } +// } +// } +// +// c.setIndexRange(i, 0, n, tempRow, 0, n); +// } +// } +// } private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) { @@ -1362,6 +1421,69 @@ private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m } } +// private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, +// int rl, int ru) { +// final DenseBlock a = m1.getDenseBlock(); +// final SparseBlock b = m2.getSparseBlock(); +// final SparseBlock c = ret.getSparseBlock(); +// final int m = m1.rlen; // rows left +// final int cd = m1.clen; // common dim +// final int n = m2.clen; +// +// final int rl1 = pm2 ? 0 : rl; +// final int ru1 = pm2 ? m : ru; +// final int rl2 = pm2 ? rl : 0; +// final int ru2 = pm2 ? ru : cd; +// +// final int blocksizeK = 32; +// final int blocksizeI = 32; +// +// // Dense temp buffer for vectorized accumulation (one per row) +// final double[] tempRow = new double[n]; +// +// for(int bi = rl1; bi < ru1; bi += blocksizeI) { +// final int bimin = Math.min(ru1, bi + blocksizeI); +// for(int i = bi; i < bimin; i++) { +// +// Arrays.fill(tempRow, 0); +// final double[] avals = a.values(i); +// final int aix = a.pos(i); +// +// for(int bk = rl2; bk < ru2; bk += blocksizeK) { +// final int bkmin = Math.min(ru2, bk + blocksizeK); +// for(int k = bk; k < bkmin; k++) { +// +// final double aval = avals[aix + k]; +// if (aval == 0 || b.isEmpty(k)) { +// continue; +// } +// +// final int[] bIdx = b.indexes(k); +// final double[] bVals = b.values(k); +// final int bPos = b.pos(k); +// final int bLen = b.size(k); +// +// int j = 0; +// for (; j <= bLen - vLen; j += vLen) { +// DoubleVector bVec = DoubleVector.fromArray(SPECIES, bVals, bPos + j); +// DoubleVector scaled = bVec.mul(aval); +// +// for(int lane = 0; lane < vLen; lane++) { +// tempRow[bIdx[bPos + j + lane]] += scaled.lane(lane); +// } +// } +// +// for (; j < bLen; j++) { +// tempRow[bIdx[bPos + j]] += aval * bVals[bPos + j]; +// } +// } +// } +// +// c.setIndexRange(i, 0, n, tempRow, 0, n); +// } +// } +// } + private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) { DenseBlock a = m1.getDenseBlock(); @@ -1413,6 +1535,57 @@ private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2 } } +// private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, +// int rl, int ru) { +// DenseBlock a = m1.getDenseBlock(); +// DenseBlock c = ret.getDenseBlock(); +// int m = m1.rlen; +// int cd = m1.clen; +// +// // MATRIX-MATRIX (VV, MV not applicable here because V always dense) +// SparseBlock b = m2.sparseBlock; +// +// if( pm2 && m==1 ) { //VECTOR-MATRIX +// //parallelization over rows in rhs matrix +// double[] avals = a.valuesAt(0); //vector +// double[] cvals = c.valuesAt(0); //vector +// for( int k=rl; k= cd ) { +// matrixMultSparseDenseMVShortRHS(a, b, c, cd, rl, ru); +// return; +// } +// +// //sparse matrix-vector w/ cache blocking (keep front of positions) +// double[] bvals = b.valuesAt(0); +// double[] cvals = c.valuesAt(0); +// int[] curk = new int[blocksizeI]; +// +// for( int bi = rl; bi < ru; bi+=blocksizeI ) { +// Arrays.fill(curk, 0); //reset positions +// for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk data() { tests.add(new Object[]{1000, 1000, 1000, 0.005, 0.6, 6, true}); + tests.add(new Object[]{1000, 4096, 1, 0.02, 0.6, 1, false}); } catch(Exception e) { e.printStackTrace();