From 644b2578f28c3a12ce6a0c3e4e69c24eb32c1513 Mon Sep 17 00:00:00 2001 From: Paul Pohlitz Date: Wed, 28 Jan 2026 18:54:55 +0100 Subject: [PATCH 1/2] adds rewritten matrix multiplication kernels and test for matrixMultSparseDenseMVTallRHS --- .../runtime/matrix/data/LibMatrixMult.java | 246 +++++++++++++++++- .../component/matrix/MatrixMultiplyTest.java | 3 +- 2 files changed, 247 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java index cfdf21255e7..0b17c1bd4f1 100644 --- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java +++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java @@ -1321,6 +1321,65 @@ private static void matrixMultDenseDenseOutSparse(MatrixBlock m1, MatrixBlock m2 } } +// private static void matrixMultDenseDenseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, +// int rl, int ru) { +// final DenseBlock a = m1.getDenseBlock(); +// final DenseBlock b = m2.getDenseBlock(); +// final SparseBlock c = ret.getSparseBlock(); +// final int m = m1.rlen; // rows left +// final int cd = m1.clen; // common dim +// final int n = m2.clen; +// +// final int rl1 = pm2 ? 0 : rl; +// final int ru1 = pm2 ? m : ru; +// final int rl2 = pm2 ? rl : 0; +// final int ru2 = pm2 ? ru : cd; +// +// final int blocksizeK = 32; +// final int blocksizeI = 32; +// +// // Dense temp buffer for vectorized accumulation +// final double[] tempRow = new double[n]; +// +// for(int bi = rl1; bi < ru1; bi += blocksizeI) { +// final int bimin = Math.min(ru1, bi + blocksizeI); +// for(int i = bi; i < bimin; i++) { +// Arrays.fill(tempRow, 0); +// +// final double[] avals = a.values(i); +// final int aix = a.pos(i); +// +// for(int bk = rl2; bk < ru2; bk += blocksizeK) { +// final int bkmin = Math.min(ru2, bk + blocksizeK); +// +// for(int k = bk; k < bkmin; k++) { // common dimension +// final double aval = avals[aix + k]; +// if(aval == 0) continue; +// +// final DoubleVector aVec = DoubleVector.broadcast(SPECIES, aval); +// +// final double[] bvals = b.values(k); +// final int bpos = b.pos(k); +// +// int j = 0; +// for(; j <= n - vLen; j += vLen) { +// DoubleVector bVec = DoubleVector.fromArray(SPECIES, bvals, bpos + j); +// DoubleVector cVec = DoubleVector.fromArray(SPECIES, tempRow, j); +// cVec = bVec.fma(aVec, cVec); +// cVec.intoArray(tempRow, j); +// } +// +// // Scalar tail for remaining elements +// for(; j < n; j++) { +// tempRow[j] += aval * bvals[bpos + j]; +// } +// } +// } +// +// c.setIndexRange(i, 0, n, tempRow, 0, n); +// } +// } +// } private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) { @@ -1362,6 +1421,69 @@ private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m } } +// private static void matrixMultDenseSparseOutSparse(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, +// int rl, int ru) { +// final DenseBlock a = m1.getDenseBlock(); +// final SparseBlock b = m2.getSparseBlock(); +// final SparseBlock c = ret.getSparseBlock(); +// final int m = m1.rlen; // rows left +// final int cd = m1.clen; // common dim +// final int n = m2.clen; +// +// final int rl1 = pm2 ? 0 : rl; +// final int ru1 = pm2 ? m : ru; +// final int rl2 = pm2 ? rl : 0; +// final int ru2 = pm2 ? ru : cd; +// +// final int blocksizeK = 32; +// final int blocksizeI = 32; +// +// // Dense temp buffer for vectorized accumulation (one per row) +// final double[] tempRow = new double[n]; +// +// for(int bi = rl1; bi < ru1; bi += blocksizeI) { +// final int bimin = Math.min(ru1, bi + blocksizeI); +// for(int i = bi; i < bimin; i++) { +// +// Arrays.fill(tempRow, 0); +// final double[] avals = a.values(i); +// final int aix = a.pos(i); +// +// for(int bk = rl2; bk < ru2; bk += blocksizeK) { +// final int bkmin = Math.min(ru2, bk + blocksizeK); +// for(int k = bk; k < bkmin; k++) { +// +// final double aval = avals[aix + k]; +// if (aval == 0 || b.isEmpty(k)) { +// continue; +// } +// +// final int[] bIdx = b.indexes(k); +// final double[] bVals = b.values(k); +// final int bPos = b.pos(k); +// final int bLen = b.size(k); +// +// int j = 0; +// for (; j <= bLen - vLen; j += vLen) { +// DoubleVector bVec = DoubleVector.fromArray(SPECIES, bVals, bPos + j); +// DoubleVector scaled = bVec.mul(aval); +// +// for(int lane = 0; lane < vLen; lane++) { +// tempRow[bIdx[bPos + j + lane]] += scaled.lane(lane); +// } +// } +// +// for (; j < bLen; j++) { +// tempRow[bIdx[bPos + j]] += aval * bVals[bPos + j]; +// } +// } +// } +// +// c.setIndexRange(i, 0, n, tempRow, 0, n); +// } +// } +// } + private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, int rl, int ru) { DenseBlock a = m1.getDenseBlock(); @@ -1413,6 +1535,57 @@ private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2 } } +// private static void matrixMultDenseSparseOutDense(MatrixBlock m1, MatrixBlock m2, MatrixBlock ret, boolean pm2, +// int rl, int ru) { +// DenseBlock a = m1.getDenseBlock(); +// DenseBlock c = ret.getDenseBlock(); +// int m = m1.rlen; +// int cd = m1.clen; +// +// // MATRIX-MATRIX (VV, MV not applicable here because V always dense) +// SparseBlock b = m2.sparseBlock; +// +// if( pm2 && m==1 ) { //VECTOR-MATRIX +// //parallelization over rows in rhs matrix +// double[] avals = a.valuesAt(0); //vector +// double[] cvals = c.valuesAt(0); //vector +// for( int k=rl; k= cd ) { +// matrixMultSparseDenseMVShortRHS(a, b, c, cd, rl, ru); +// return; +// } +// +// //sparse matrix-vector w/ cache blocking (keep front of positions) +// double[] bvals = b.valuesAt(0); +// double[] cvals = c.valuesAt(0); +// int[] curk = new int[blocksizeI]; +// +// for( int bi = rl; bi < ru; bi+=blocksizeI ) { +// Arrays.fill(curk, 0); //reset positions +// for( int bk=0, bimin = Math.min(ru, bi+blocksizeI); bk data() { tests.add(new Object[]{1000, 1000, 1000, 0.005, 0.6, 6, true}); + tests.add(new Object[]{1000, 4096, 1, 0.02, 0.6, 1, false}); } catch(Exception e) { e.printStackTrace(); From b626acd5765db3fd023763dc89599578f2cd80a3 Mon Sep 17 00:00:00 2001 From: Paul Pohlitz Date: Wed, 28 Jan 2026 19:00:46 +0100 Subject: [PATCH 2/2] [DO NOT MERGE] Temporary benchmark script for reviewer verification --- pom.xml | 13 +++ .../component/matrix/BenchmarkMatMult.java | 93 +++++++++++++++++++ 2 files changed, 106 insertions(+) create mode 100644 src/test/java/org/apache/sysds/test/component/matrix/BenchmarkMatMult.java diff --git a/pom.xml b/pom.xml index 08669868aa1..191888c389c 100644 --- a/pom.xml +++ b/pom.xml @@ -1577,5 +1577,18 @@ fastdoubleparser 0.9.0 + + + org.openjdk.jmh + jmh-core + 1.37 + test + + + org.openjdk.jmh + jmh-generator-annprocess + 1.37 + test + diff --git a/src/test/java/org/apache/sysds/test/component/matrix/BenchmarkMatMult.java b/src/test/java/org/apache/sysds/test/component/matrix/BenchmarkMatMult.java new file mode 100644 index 00000000000..eedbb34c4cc --- /dev/null +++ b/src/test/java/org/apache/sysds/test/component/matrix/BenchmarkMatMult.java @@ -0,0 +1,93 @@ +package org.apache.sysds.test.component.matrix; + +import java.util.concurrent.TimeUnit; + +import org.apache.sysds.runtime.matrix.data.LibMatrixMult; +import org.apache.sysds.runtime.matrix.data.MatrixBlock; +import org.apache.sysds.test.TestUtils; +import org.openjdk.jmh.annotations.*; +import org.openjdk.jmh.infra.Blackhole; +import org.openjdk.jmh.results.format.ResultFormatType; +import org.openjdk.jmh.runner.Runner; +import org.openjdk.jmh.runner.RunnerException; +import org.openjdk.jmh.runner.options.Options; +import org.openjdk.jmh.runner.options.OptionsBuilder; + +@State(Scope.Thread) +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@Warmup(iterations = 5, time = 1, timeUnit = TimeUnit.SECONDS) +@Measurement(iterations = 10, time = 1, timeUnit = TimeUnit.SECONDS) +@Fork(1) +public class BenchmarkMatMult { + + private MatrixBlock left; + private MatrixBlock right; + private MatrixBlock result; + + @Param({"1024", "2048", "4096", "8192"}) + public int m; + + @Param({"1"}) + public int cd; + + @Param({"0.5", "0.75", "1"}) + public double sparsityLeft; + + @Param({"0.001", "0.01", "0.1", "0.2"}) + public double sparsityRight; + + @Param({"1024", "2048", "4096", "8192"}) + public int n; + + // Executes once per benchmark + @Setup(Level.Trial) + public void setup() { + left = TestUtils.ceil(TestUtils.generateTestMatrixBlock(m, cd, -10, 10, sparsityLeft, 13)); + if (left.isInSparseFormat()) { + throw new IllegalStateException("Left matrix is sparse but must be dense for m: " + m + ", cd: " + cd + ", sparsity: " + sparsityLeft); + } + + MatrixBlock rightOriginal = TestUtils.ceil(TestUtils.generateTestMatrixBlock(cd, n, -10, 10, sparsityRight, 14)); + if (rightOriginal.isInSparseFormat()) { + right = new MatrixBlock(); + right.copy(rightOriginal, false); + if (right.isEmptyBlock(false)) { + throw new IllegalStateException("Right is empty after copy"); + } + if (right.isEmptyBlock() || right.getNonZeros() != rightOriginal.getNonZeros()) { + throw new IllegalStateException("Right has weird setup after copying"); + } + } else { + throw new IllegalStateException("Right is dense originally"); + } + + result = new MatrixBlock(m, n, true); + result.allocateBlock(); + if (!result.isInSparseFormat()) { + throw new IllegalStateException("Expected result is not in sparse format"); + } + } + + @Setup(Level.Iteration) + public void clearResult() { + result.reset(m, n, true); + } + + @Benchmark + public void benchmarkDenseDenseOutSparse(Blackhole bh) { +// LibMatrixMult.matrixMultDenseDenseOutSparse(left, right, result, false, 0, m); + bh.consume(result); + } + + public static void main(String[] args) throws RunnerException { + String tag = "vector-api-mult-dense-dense-out-sparse" + System.currentTimeMillis(); + Options opt = new OptionsBuilder() + .include(BenchmarkMatMult.class.getName()) + .jvmArgs("--add-modules=jdk.incubator.vector") + .result(tag + ".csv") + .resultFormat(ResultFormatType.CSV) + .build(); + new Runner(opt).run(); + } +}