From 11bd20355061b1c947b6c716c4685a4b66115350 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Fri, 12 Dec 2025 10:11:09 +0100 Subject: [PATCH 1/7] Initial commit: preliminary tests, some helper functions --- scripts/builtin/hdbscan.dml | 132 ++++++++++++++++++ scripts/builtin/hdbscanApply.dml | 0 .../functions/builtin/BuiltinHDBSCANTest.java | 0 src/test/scripts/functions/builtin/hdbscan.R | 0 .../scripts/functions/builtin/hdbscan.dml | 20 +++ .../functions/builtin/hdbscanApply.dml | 0 test_build_mst.dml | 46 ++++++ test_kth_smallest.dml | 14 ++ test_mutual_reachability.dml | 26 ++++ 9 files changed, 238 insertions(+) create mode 100644 scripts/builtin/hdbscan.dml create mode 100644 scripts/builtin/hdbscanApply.dml create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java create mode 100644 src/test/scripts/functions/builtin/hdbscan.R create mode 100644 src/test/scripts/functions/builtin/hdbscan.dml create mode 100644 src/test/scripts/functions/builtin/hdbscanApply.dml create mode 100644 test_build_mst.dml create mode 100644 test_kth_smallest.dml create mode 100644 test_mutual_reachability.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml new file mode 100644 index 00000000000..ca48c7fdfc6 --- /dev/null +++ b/scripts/builtin/hdbscan.dml @@ -0,0 +1,132 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# The hdbscan function is used to perform the hdbscan clustering +# algorithm using knn-based mutual reachability distance and minimum spanning tree. +# +# INPUT: +# ------------------------------------------------------------ +# X The input Matrix to do hdbscan on. +# minPts Minimum number of points for core distance computation. (Defaults to 5) +# minClSize Minimum cluster size (Defaults to minPts) +# ------------------------------------------------------------ +# +# OUTPUT: +# ------------------------------------------------------------ +# clusterMems Cluster labels for each point +# clusterModel The cluster centroids for prediction +# ------------------------------------------------------------ + +# TODO: m,s , f? +m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -1) + return (Matrix[Double] clusterMems, Matrix[Double] clusterModel) +{ + if(minPts < 2) { + stop("HDBSCAN: minPts should be at least 2") + } + + if(minClSize < 0) { + minClSize = minPts + } + + n = nrow(X) + d = ncol(X) + + if(n < minPts) { + stop("HDBSCAN: Number of data points should be at least minPts") + } + + distances = dist(X) + + coreDistances = matrix(0, rows=n, cols=1) + for(i in 1:n) { + kthDist = computeKthSmallest(distances[i,], minPts) + coreDistances[i] = kthDist + } + + mutualReachDist = computeMutualReachability(distances, coreDistances) + + [mstEdges, mstWeights] = buildMST(mutualReachDist, n) + + # TODO: build cluster hierarchy + # TODO: get stable cluster with stability score + # TODO: build cluster model + + # temp dummy values + clusterMems = matrix(1, rows=n, cols=1) + clusterModel = X +} + + +computeKthSmallest = function(Matrix[Double] array, Integer k) + return (Double res) +{ + sorted = order(target=array, by=1, decreasing=FALSE) + res = as.scalar(sorted[k+1, 1]) +} + + +computeMutualReachability = function(Matrix[Double] distances, Matrix[Double] coreDistances) + return (Matrix[Double] mutualReach) +{ + # mutualReach(i,j) = max(dist(i,j), coreDist(i), coreDist(j)) + # Diagonal is set to zero. + + n = nrow(distances) + + coreDistRow = t(coreDistances) + coreDistCol = coreDistances + + maxCoreRow = (distances > coreDistRow) * distances + (distances <= coreDistRow) * coreDistRow + mutualReach = (maxCoreRow > coreDistCol) * maxCoreRow + (maxCoreRow <= coreDistCol) * coreDistCol + + mutualReach = mutualReach * (1 - diag(matrix(1, rows=n, cols=1))) +} + + +buildMST = function(Matrix[Double] distances, Integer n) + return (Matrix[Double] edges, Matrix[Double] weights) +{ + edges = matrix(0, rows=n-1, cols=2) + weights = matrix(0, rows=n-1, cols=1) + + inMST = matrix(0, rows=n, cols=1) + inMST[1] = 1 + + minDist = distances[1,] + minDist = t(minDist) + + for(i in 1:(n-1)) { + candidates = minDist + inMST * 1e15 + minIdx = as.scalar(rowIndexMin(t(candidates))) + minWeight = as.scalar(minDist[minIdx]) + + # Find which node in MST connects to minIdx + connectIdx = as.scalar(rowIndexMin(distances[minIdx,] + t(1-inMST) * 1e15)) + edges[i,1] = minIdx + edges[i,2] = connectIdx + weights[i] = minWeight + + inMST[minIdx] = 1 + newDists = distances[minIdx,] + minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists) + } +} \ No newline at end of file diff --git a/scripts/builtin/hdbscanApply.dml b/scripts/builtin/hdbscanApply.dml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/scripts/functions/builtin/hdbscan.R b/src/test/scripts/functions/builtin/hdbscan.R new file mode 100644 index 00000000000..e69de29bb2d diff --git a/src/test/scripts/functions/builtin/hdbscan.dml b/src/test/scripts/functions/builtin/hdbscan.dml new file mode 100644 index 00000000000..cc59154f3c3 --- /dev/null +++ b/src/test/scripts/functions/builtin/hdbscan.dml @@ -0,0 +1,20 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- diff --git a/src/test/scripts/functions/builtin/hdbscanApply.dml b/src/test/scripts/functions/builtin/hdbscanApply.dml new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test_build_mst.dml b/test_build_mst.dml new file mode 100644 index 00000000000..a8dd7a44ff9 --- /dev/null +++ b/test_build_mst.dml @@ -0,0 +1,46 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 4 +# / | \ +# / | \ +# / (2) (5) +# | | \ +# | | \ +# (2) 1-(3)--3 +# | | / +# \ | (4) +# \ (1) / +# \ | / +# \|/ +# 2 + +distances = matrix(0, rows=4, cols=4) +distances[1,2] = 1 +distances[2,1] = 1 + +distances[1,3] = 3 +distances[3,1] = 3 + +distances[1,4] = 2 +distances[4,1] = 2 + +distances[2,3] = 4 +distances[3,2] = 4 + +distances[2,4] = 2 +distances[4,2] = 2 + +distances[3,4] = 5 +distances[4,3] = 5 + +[edges, weights] = hdb::buildMST(distances, 4) + +totalWeight = sum(weights) + +test_pass = (nrow(edges) == 3) & (totalWeight == 6) + +if(test_pass) { + print("Passed") +} else { + print("Failes") +} \ No newline at end of file diff --git a/test_kth_smallest.dml b/test_kth_smallest.dml new file mode 100644 index 00000000000..b5065230f26 --- /dev/null +++ b/test_kth_smallest.dml @@ -0,0 +1,14 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +array = matrix("0 1 4 2", rows=4, cols=1) + +res1 = hdb::computeKthSmallest(array, 1) +res2 = hdb::computeKthSmallest(array, 2) + +test_pass = (res1 == 1) & (res2 == 2) + +if(test_pass) { + print("Passed") +} else { + print("Failed") +} \ No newline at end of file diff --git a/test_mutual_reachability.dml b/test_mutual_reachability.dml new file mode 100644 index 00000000000..2e86ed28311 --- /dev/null +++ b/test_mutual_reachability.dml @@ -0,0 +1,26 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +distances = matrix("0 1 5 1 0 4 5 4 0", rows=3, cols=3) +coreDistances = matrix("1 1 4", rows=3, cols=1) + +mutualReach = hdb::computeMutualReachability(distances, coreDistances) + +diagSum = sum(diag(mutualReach)) + +val_AB = as.scalar(mutualReach[1,2]) +val_AC = as.scalar(mutualReach[1,3]) +val_BC = as.scalar(mutualReach[2,3]) + +sym_AB = as.scalar(mutualReach[2,1]) +sym_AC = as.scalar(mutualReach[3,1]) +sym_BC = as.scalar(mutualReach[3,2]) + +test1_pass = (val_AB == 1) & (val_AC == 5) & (val_BC == 4) & + (diagSum == 0) & + (val_AB == sym_AB) & (val_AC == sym_AC) & (val_BC == sym_BC) + +if(test1_pass) { + print("Passed") +} else { + print("Failed") +} From 39b3a4a45aa3db4e4d9ecd329ce7d227c791266a Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Mon, 29 Dec 2025 17:34:07 +0100 Subject: [PATCH 2/7] Hierarchy builder with union-find --- scripts/builtin/hdbscan.dml | 88 ++++++++++++++++++++++++++++- test_build_hierarchy.dml | 77 +++++++++++++++++++++++++ test_build_mst.dml | 2 +- test_union_find.dml | 108 ++++++++++++++++++++++++++++++++++++ 4 files changed, 273 insertions(+), 2 deletions(-) create mode 100644 test_build_hierarchy.dml create mode 100644 test_union_find.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml index ca48c7fdfc6..3cb00e021ce 100644 --- a/scripts/builtin/hdbscan.dml +++ b/scripts/builtin/hdbscan.dml @@ -66,7 +66,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = - [mstEdges, mstWeights] = buildMST(mutualReachDist, n) - # TODO: build cluster hierarchy + [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n) + # TODO: get stable cluster with stability score # TODO: build cluster model @@ -129,4 +130,89 @@ buildMST = function(Matrix[Double] distances, Integer n) newDists = distances[minIdx,] minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists) } +} + +# Union-find utils + +find = function(Matrix[Double] parent, Integer x) + return (Integer root) +{ + root = x + while(as.scalar(parent[root]) != root) { + root = as.integer(as.scalar(parent[root])) + } +} + +union = function(Matrix[Double] parent, Matrix[Double] rank, + Integer x, Integer y, Matrix[Double] size) + return (Matrix[Double] newParent, Matrix[Double] newRank, Matrix[Double] newSize) +{ + newParent = parent + newRank = rank + newSize = size + + rankX = as.scalar(rank[x]) + rankY = as.scalar(rank[y]) + + # Calculate combined size + combinedSize = as.scalar(size[x]) + as.scalar(size[y]) + + if(rankX < rankY) { + newParent[x] = y + newSize[y] = combinedSize # Update size at new root + } + else if(rankX > rankY) { + newParent[y] = x + newSize[x] = combinedSize # Update size at new root + } + else { + newParent[y] = x + newRank[x] = rankX + 1 + newSize[x] = combinedSize # Update size at new root + } +} + +buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n) + return (Matrix[Double] hierarchy, Matrix[Double] sizes) +{ + # sort edges by weight in ascending order + # to build the hierarchy from dense cores outward + sorted = order(target=weights, by=1, decreasing=FALSE) + + # parent[i] = i, meaning each point is its own parent in the beginning + parent = seq(1, n) + # tree depth when unioning (icnreases only when merged trees are of same height) + rank = matrix(0, rows=n, cols=1) + # each cluster is 1 in the beginning + clusterSize = matrix(1, rows=2*n-1, cols=1) # Space for all possible cluster IDs + + # hierarcy[i, :] = [cluster1_id, cluster2_id, merge_distance] + hierarchy = matrix(0, rows=n-1, cols=3) + + # sizes[i] = size of the cluster created by merge i + sizes = matrix(0, rows=n-1, cols=1) + + nextCluster = n + 1 + + for(i in 1:(n-1)) { + idx = as.scalar(sorted[i]) + u = as.scalar(edges[idx, 1]) + v = as.scalar(edges[idx, 2]) + w = as.scalar(weights[idx]) + + root_u = find(parent, u) + root_v = find(parent, v) + + if(root_u != root_v) { + hierarchy[i,1] = root_u + hierarchy[i,2] = root_v + hierarchy[i,3] = w + sizes[i] = as.scalar(clusterSize[root_u]) + as.scalar(clusterSize[root_v]) + + [parent, rank, clusterSize] = union(parent, rank, root_u, root_v, clusterSize) + + nextCluster = nextCluster + 1 + } + } + } \ No newline at end of file diff --git a/test_build_hierarchy.dml b/test_build_hierarchy.dml new file mode 100644 index 00000000000..b59aa288619 --- /dev/null +++ b/test_build_hierarchy.dml @@ -0,0 +1,77 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 4 +# / | \ +# / | \ +# / (2) (5) +# | | \ +# | | \ +# (2) 1-(3)--3 +# | | / +# \ | (4) +# \ (1) / +# \ | / +# \|/ +# 2 + +distances = matrix(0, rows=4, cols=4) +distances[1,2] = 1 +distances[2,1] = 1 + +distances[1,3] = 3 +distances[3,1] = 3 + +distances[1,4] = 2 +distances[4,1] = 2 + +distances[2,3] = 4 +distances[3,2] = 4 + +distances[2,4] = 2 +distances[4,2] = 2 + +distances[3,4] = 5 +distances[4,3] = 5 + +[edges, weights] = hdb::buildMST(distances, 4) + +[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, 4) + +print("Hierarchy (format: [cluster1, cluster2, merge_distance]):") +print(toString(hierarchy)) + +# Should have n-1 merge operations for n nodes +num_merges = nrow(hierarchy) +print("Number of merges: " + num_merges + " (should be 3)") +test1 = (num_merges == 3) + +# Merge distances should be in ascending order (or equal) +# Because we process edges from low weight to high weight +dist1 = as.scalar(hierarchy[1,3]) +dist2 = as.scalar(hierarchy[2,3]) +dist3 = as.scalar(hierarchy[3,3]) +print("\nMerge distances: [" + dist1 + ", " + dist2 + ", " + dist3 + "]" + " (Should be in ascending order)") +test2 = (dist1 <= dist2) & (dist2 <= dist3) + +# Cluster sizes should increase +size1 = as.scalar(sizes[1]) +size2 = as.scalar(sizes[2]) +size3 = as.scalar(sizes[3]) +print("\nCluster sizes: [" + size1 + ", " + size2 + ", " + size3 + "]" + " (Should be increasing)") +test3 = (size1 <= size2) & (size2 <= size3) + +# Final size should equal total number of nodes +print("Final cluster size: " + size3 + " (should be 4)") +test4 = (size3 == 4) + +# First merge should be size 2 +print("First merge size: " + size1 + " (should be 2)") +test5 = (size1 == 2) + +test_pass = test1 & test2 & test3 & test4 & test5 + +if(test_pass) { + print("Passed") +} else { + print("Failed") +} \ No newline at end of file diff --git a/test_build_mst.dml b/test_build_mst.dml index a8dd7a44ff9..e2ed10ab040 100644 --- a/test_build_mst.dml +++ b/test_build_mst.dml @@ -42,5 +42,5 @@ test_pass = (nrow(edges) == 3) & (totalWeight == 6) if(test_pass) { print("Passed") } else { - print("Failes") + print("Failed") } \ No newline at end of file diff --git a/test_union_find.dml b/test_union_find.dml new file mode 100644 index 00000000000..98eb14c7141 --- /dev/null +++ b/test_union_find.dml @@ -0,0 +1,108 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 1 -> 1 +# 2 -> 1 +# 3 -> 2 -> 1 +# 4 -> 4 + +n = 4 +parent = matrix(0, rows=n, cols=1) +parent[1] = 1 +parent[2] = 1 +parent[3] = 2 +parent[4] = 4 + +root1 = hdb::find(parent, 1) +root2 = hdb::find(parent, 2) +root3 = hdb::find(parent, 3) +root4 = hdb::find(parent, 4) + +test_find = (root1 == 1) & (root2 == 1) & (root3 == 1) & (root4 == 4) + +if(test_find) { + print("Passed") +} else { + print("Failed") +} + +# ensure union() attaches lower-rank root under higher-rank root +# rank[1] < rank[2] => parent[1] becomes 2 + +parentA = matrix(0, rows=n, cols=1) +parentA[1] = 1 +parentA[2] = 2 +parentA[3] = 3 +parentA[4] = 4 + +rankA = matrix(0, rows=n, cols=1) +rankA[1] = 0 +rankA[2] = 1 +rankA[3] = 0 +rankA[4] = 0 + +sizeA = matrix(1, rows=n, cols=1) + +[newParentA, newRankA, newSizeA] = hdb::union(parentA, rankA, 1, 2, sizeA) +test_unionA = (as.scalar(newParentA[1]) == 2) & (as.scalar(newParentA[2]) == 2) + +if(test_unionA) { + print("Passed") +} else { + print("Failed") +} + +# ensure union() attaches lower-rank root under higher-rank root +# rank[1] > rank[2] => parent[2] becomes 1 + +parentB = matrix(0, rows=n, cols=1) +parentB[1] = 1 +parentB[2] = 2 +parentB[3] = 3 +parentB[4] = 4 + +rankB = matrix(0, rows=n, cols=1) +rankB[1] = 2 +rankB[2] = 1 +rankB[3] = 0 +rankB[4] = 0 + +sizeB = matrix(1, rows=n, cols=1) + +[newParentB, newRankB, newSizeB] = hdb::union(parentB, rankB, 1, 2, sizeB) +test_unionB = (as.scalar(newParentB[2]) == 1) & (as.scalar(newParentB[1]) == 1) + +if(test_unionB) { + print("Passed") +} else { + print("Failed") +} + +# ensure union() increments if height is same + +parentC = matrix(0, rows=n, cols=1) +parentC[1] = 1 +parentC[2] = 2 +parentC[3] = 3 +parentC[4] = 4 + +rankC = matrix(0, rows=n, cols=1) +rankC[1] = 0 +rankC[2] = 0 +rankC[3] = 0 +rankC[4] = 0 + +sizeC = matrix(1, rows=n, cols=1) + +[newParentC, newRankC, newSizeC] = hdb::union(parentC, rankC, 1, 2, sizeC) + +test_unionC_parent = (as.scalar(newParentC[2]) == 1) +test_unionC_rank = (as.scalar(newRankC[1]) == 1) + +test_unionC = test_unionC_parent & test_unionC_rank + +if(test_unionC) { + print("Passed") +} else { + print("Failed") + print(rankC[1]) +} \ No newline at end of file From 82ed4c0cf517e3cf0a9cffa846490ad723bf0b31 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Sun, 18 Jan 2026 21:11:12 +0100 Subject: [PATCH 3/7] hierarchy with linkage --- scripts/builtin/hdbscan.dml | 115 ++++++++++++++++++++++-------------- test_build_hierarchy.dml | 46 ++++++++++++++- test_union_find.dml | 41 +++++++++++-- 3 files changed, 151 insertions(+), 51 deletions(-) diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml index 3cb00e021ce..1029256836c 100644 --- a/scripts/builtin/hdbscan.dml +++ b/scripts/builtin/hdbscan.dml @@ -143,35 +143,44 @@ find = function(Matrix[Double] parent, Integer x) } } -union = function(Matrix[Double] parent, Matrix[Double] rank, - Integer x, Integer y, Matrix[Double] size) - return (Matrix[Double] newParent, Matrix[Double] newRank, Matrix[Double] newSize) +union = function(Matrix[Double] parent, Matrix[Double] rank, + Integer x, Integer y, Matrix[Double] size, + Matrix[Double] compToNode, Integer newId) + return (Matrix[Double] newParent, Matrix[Double] newRank, + Matrix[Double] newSize, Matrix[Double] newCompToNode, Integer newRoot) { - newParent = parent - newRank = rank - newSize = size - - rankX = as.scalar(rank[x]) - rankY = as.scalar(rank[y]) - - # Calculate combined size - combinedSize = as.scalar(size[x]) + as.scalar(size[y]) - - if(rankX < rankY) { - newParent[x] = y - newSize[y] = combinedSize # Update size at new root - } - else if(rankX > rankY) { - newParent[y] = x - newSize[x] = combinedSize # Update size at new root - } - else { - newParent[y] = x - newRank[x] = rankX + 1 - newSize[x] = combinedSize # Update size at new root - } + newParent = parent + newRank = rank + newSize = size + newCompToNode = compToNode + + rankX = as.scalar(rank[x,1]) + rankY = as.scalar(rank[y,1]) + + combinedSize = as.scalar(size[x,1]) + as.scalar(size[y,1]) + + if(rankX < rankY) { + newParent[x] = y + newSize[y,1] = combinedSize + newCompToNode[y,1] = newId + newRoot = y + } + else if(rankX > rankY) { + newParent[y] = x + newSize[x,1] = combinedSize + newCompToNode[x,1] = newId + newRoot = x + } + else { + newParent[y] = x + newRank[x,1] = rankX + 1 + newSize[x,1] = combinedSize + newCompToNode[x,1] = newId + newRoot = x + } } + buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n) return (Matrix[Double] hierarchy, Matrix[Double] sizes) { @@ -180,11 +189,21 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer sorted = order(target=weights, by=1, decreasing=FALSE) # parent[i] = i, meaning each point is its own parent in the beginning - parent = seq(1, n) + parent = seq(1, n) # tree depth when unioning (icnreases only when merged trees are of same height) rank = matrix(0, rows=n, cols=1) - # each cluster is 1 in the beginning - clusterSize = matrix(1, rows=2*n-1, cols=1) # Space for all possible cluster IDs + + # initially, each component is a single point, so root i maps to leaf node i. + # Later, when two components merge, the new component root will map to a new + # internal linkage node id (n+1, n+2, ..., 2n-1) + compToNode = seq(1, n) + + # size (number of original points) for each linkage-tree node id, + # leaves 1..n have size 1. + nodeSize = matrix(0, rows=2*n-1, cols=1) + for(j in 1:n) { + nodeSize[j,1] = 1 + } # hierarcy[i, :] = [cluster1_id, cluster2_id, merge_distance] hierarchy = matrix(0, rows=n-1, cols=3) @@ -192,27 +211,33 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer # sizes[i] = size of the cluster created by merge i sizes = matrix(0, rows=n-1, cols=1) - nextCluster = n + 1 - + row = 1 for(i in 1:(n-1)) { - idx = as.scalar(sorted[i]) - u = as.scalar(edges[idx, 1]) - v = as.scalar(edges[idx, 2]) - w = as.scalar(weights[idx]) + idx = as.integer(as.scalar(sorted[i,1])) + u = as.integer(as.scalar(edges[idx,1])) + v = as.integer(as.scalar(edges[idx,2])) + w = as.scalar(weights[idx,1]) root_u = find(parent, u) root_v = find(parent, v) if(root_u != root_v) { - hierarchy[i,1] = root_u - hierarchy[i,2] = root_v - hierarchy[i,3] = w - sizes[i] = as.scalar(clusterSize[root_u]) + as.scalar(clusterSize[root_v]) - - [parent, rank, clusterSize] = union(parent, rank, root_u, root_v, clusterSize) - - nextCluster = nextCluster + 1 + left = as.integer(as.scalar(compToNode[root_u])) + right = as.integer(as.scalar(compToNode[root_v])) + + newId = n + row + + hierarchy[row,1] = left + hierarchy[row,2] = right + hierarchy[row,3] = w + + nodeSize[newId,1] = as.scalar(nodeSize[left,1]) + as.scalar(nodeSize[right,1]) + sizes[row,1] = as.scalar(nodeSize[newId,1]) + + [parent, rank, ufSize, compToNode, newRoot] = + union(parent, rank, root_u, root_v, nodeSize, compToNode, newId) + + row = row + 1 } } - -} \ No newline at end of file +} diff --git a/test_build_hierarchy.dml b/test_build_hierarchy.dml index b59aa288619..bc586bd97ac 100644 --- a/test_build_hierarchy.dml +++ b/test_build_hierarchy.dml @@ -68,7 +68,51 @@ test4 = (size3 == 4) print("First merge size: " + size1 + " (should be 2)") test5 = (size1 == 2) -test_pass = test1 & test2 & test3 & test4 & test5 +# New classic-linkage checks +n = 4 +hasInternal = (sum(hierarchy[,1] > n) + sum(hierarchy[,2] > n)) > 0 +print("Has internal node ids (>n): " + hasInternal + " (should be true)") +test6 = hasInternal + +# Check that child ids are within valid range +maxChild1 = max(hierarchy[,1]) +maxChild2 = max(hierarchy[,2]) +maxChild = maxChild1 +if(maxChild2 > maxChild) { maxChild = maxChild2 } + +print("Max child id: " + maxChild + " (should be <= " + (2*n-2) + ")") +test7 = (maxChild <= (2*n-2)) + +# Check that internal ids are “created in order” +test8 = TRUE +for(r in 1:(n-1)) { + child1 = as.scalar(hierarchy[r,1]) + child2 = as.scalar(hierarchy[r,2]) + newId = n + r + ok = (child1 < newId) & (child2 < newId) + test8 = test8 & ok +} +print("Children reference only existing nodes: " + test8 + " (should be true)") + +# Recompute node sizes from hierarchy and verify +nodeSize = matrix(0, rows=2*n-1, cols=1) +for(j in 1:n) { nodeSize[j,1] = 1 } + +test9 = TRUE +for(r in 1:(n-1)) { + left = as.integer(as.scalar(hierarchy[r,1])) + right = as.integer(as.scalar(hierarchy[r,2])) + newId = n + r + + expected = as.scalar(nodeSize[left,1]) + as.scalar(nodeSize[right,1]) + nodeSize[newId,1] = expected + + ok = (as.scalar(sizes[r,1]) == expected) + test9 = test9 & ok +} +print("sizes[r] equals sum of child sizes: " + test9 + " (should be true)") + +test_pass = test1 & test2 & test3 & test4 & test5 & test6 & test7 & test8 & test9 if(test_pass) { print("Passed") diff --git a/test_union_find.dml b/test_union_find.dml index 98eb14c7141..9c4f7a2d5f9 100644 --- a/test_union_find.dml +++ b/test_union_find.dml @@ -28,6 +28,8 @@ if(test_find) { # ensure union() attaches lower-rank root under higher-rank root # rank[1] < rank[2] => parent[1] becomes 2 +newId = 5 # arbitrary "new linkage node id" for this unit test + parentA = matrix(0, rows=n, cols=1) parentA[1] = 1 parentA[2] = 2 @@ -42,7 +44,13 @@ rankA[4] = 0 sizeA = matrix(1, rows=n, cols=1) -[newParentA, newRankA, newSizeA] = hdb::union(parentA, rankA, 1, 2, sizeA) +compToNodeA = matrix(0, rows=n, cols=1) +compToNodeA[1] = 1 +compToNodeA[2] = 2 +compToNodeA[3] = 3 +compToNodeA[4] = 4 + +[newParentA, newRankA, newSizeA, newCompToNodeA, newRootA] = hdb::union(parentA, rankA, 1, 2, sizeA, compToNodeA, newId) test_unionA = (as.scalar(newParentA[1]) == 2) & (as.scalar(newParentA[2]) == 2) if(test_unionA) { @@ -68,7 +76,13 @@ rankB[4] = 0 sizeB = matrix(1, rows=n, cols=1) -[newParentB, newRankB, newSizeB] = hdb::union(parentB, rankB, 1, 2, sizeB) +compToNodeB = matrix(0, rows=n, cols=1) +compToNodeB[1] = 1 +compToNodeB[2] = 2 +compToNodeB[3] = 3 +compToNodeB[4] = 4 + +[newParentB, newRankB, newSizeB, newCompToNodeB, newRootB] = hdb::union(parentB, rankB, 1, 2, sizeB, compToNodeB, newId) test_unionB = (as.scalar(newParentB[2]) == 1) & (as.scalar(newParentB[1]) == 1) if(test_unionB) { @@ -77,6 +91,16 @@ if(test_unionB) { print("Failed") } +# check linkage +test_unionB = test_unionB & (newRootB == 1) & (as.scalar(newCompToNodeB[1]) == newId) + +if(test_unionB) { + print("Passed") +} else { + print("Failed") +} + + # ensure union() increments if height is same parentC = matrix(0, rows=n, cols=1) @@ -93,16 +117,23 @@ rankC[4] = 0 sizeC = matrix(1, rows=n, cols=1) -[newParentC, newRankC, newSizeC] = hdb::union(parentC, rankC, 1, 2, sizeC) +compToNodeC = matrix(0, rows=n, cols=1) +compToNodeC[1] = 1 +compToNodeC[2] = 2 +compToNodeC[3] = 3 +compToNodeC[4] = 4 + +[newParentC, newRankC, newSizeC, newCompToNodeC, newRootC] = hdb::union(parentC, rankC, 1, 2, sizeC, compToNodeC, newId) test_unionC_parent = (as.scalar(newParentC[2]) == 1) test_unionC_rank = (as.scalar(newRankC[1]) == 1) test_unionC = test_unionC_parent & test_unionC_rank +test_unionC = test_unionC & (newRootC == 1) & (as.scalar(newCompToNodeC[1]) == newId) + if(test_unionC) { print("Passed") } else { print("Failed") - print(rankC[1]) -} \ No newline at end of file +} From e42bdb56d819fe9b1271629537f2cda376f075d6 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Sun, 25 Jan 2026 16:04:21 +0100 Subject: [PATCH 4/7] extract stabilit clusters --- scripts/builtin/hdbscan.dml | 165 +++++++++++++++++++++++++++++++- test_extract_stabe_clusters.dml | 136 ++++++++++++++++++++++++++ 2 files changed, 296 insertions(+), 5 deletions(-) create mode 100644 test_extract_stabe_clusters.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml index 1029256836c..d1926dbb190 100644 --- a/scripts/builtin/hdbscan.dml +++ b/scripts/builtin/hdbscan.dml @@ -58,7 +58,7 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = - coreDistances = matrix(0, rows=n, cols=1) for(i in 1:n) { - kthDist = computeKthSmallest(distances[i,], minPts) + kthDist = computeKthSmallest(t(distances[i,]), minPts) # Add t() here! coreDistances[i] = kthDist } @@ -68,7 +68,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = - [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n) - # TODO: get stable cluster with stability score + [clusterMems, stabilities] = extractStableClusters(hierarchy, mstWeights, n, minClSize) + # TODO: build cluster model # temp dummy values @@ -184,9 +185,9 @@ union = function(Matrix[Double] parent, Matrix[Double] rank, buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n) return (Matrix[Double] hierarchy, Matrix[Double] sizes) { - # sort edges by weight in ascending order - # to build the hierarchy from dense cores outward - sorted = order(target=weights, by=1, decreasing=FALSE) + # create indexed weights to preserve original positions after sorting + indexedWeights = cbind(seq(1, nrow(weights)), weights) + sorted = order(target=indexedWeights, by=2, decreasing=FALSE) # parent[i] = i, meaning each point is its own parent in the beginning parent = seq(1, n) @@ -241,3 +242,157 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer } } } + +getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeId) + return (Matrix[Double] leaves) +{ + if(nodeId <= n) { + leaves = matrix(nodeId, rows=1, cols=1) + } else { + mergeIdx = nodeId - n + left = as.integer(as.scalar(hierarchy[mergeIdx,1])) + right = as.integer(as.scalar(hierarchy[mergeIdx,2])) + + leftLeaves = getLeafDescendants(hierarchy, n, left) + rightLeaves = getLeafDescendants(hierarchy, n, right) + + leaves = rbind(leftLeaves, rightLeaves) + } +} + +extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weights, + Integer n, Integer minClSize) + return (Matrix[Double] labels, Matrix[Double] stabilities) +{ + numMerges = n - 1 # hierarchical tree over n points has exactly n-1 merge events + numNodes = 2*n - 1 # total nodes in the dendogram + + # convert distances to lambda (density) + lambda = matrix(0, rows=numMerges, cols=1) + for(i in 1:numMerges) { + dist = as.scalar(hierarchy[i,3]) + if(dist > 0) { + lambda[i,1] = 1.0 / dist + } else { + lambda[i,1] = 1e15 + } + } + + lambda_birth = matrix(1e15, rows=numNodes, cols=1) + lambda_death = matrix(0, rows=numNodes, cols=1) + cluster_size = matrix(0, rows=numNodes, cols=1) + + # initialize the leaf nodes to have cluster size 1 + for(i in 1:n) { + cluster_size[i,1] = 1 + } + + for(i in 1:numMerges) { + left = as.integer(as.scalar(hierarchy[i,1])) + right = as.integer(as.scalar(hierarchy[i,2])) + newId = n + i + merge_lambda = as.scalar(lambda[i,1]) + + # cluster newId starts existing as its own cluster at this density level + # and that's why the children get their det set at the same density + lambda_birth[newId,1] = merge_lambda + lambda_death[left,1] = merge_lambda + lambda_death[right,1] = merge_lambda + cluster_size[newId,1] = as.scalar(cluster_size[left,1]) + as.scalar(cluster_size[right,1]) + } + + # root cluster exists all the way + rootId = 2*n - 1 + lambda_death[rootId,1] = 0 + + # compute own stability for each internal node + # NOTE: If the cluster is big enough, we assign stability. + # The more long-lived it is (birth - death) and + # the larger it is, the more stable it is. + stability = matrix(0, rows=numNodes, cols=1) + for(nodeId in (n+1):numNodes) { + size = as.scalar(cluster_size[nodeId,1]) + birth = as.scalar(lambda_birth[nodeId,1]) + death = as.scalar(lambda_death[nodeId,1]) + if(size >= minClSize) { + stability[nodeId,1] = size * (birth - death) + } + } + + # compute subtree stability (best achievable from each subtree) + subtree_stability = matrix(0, rows=numNodes, cols=1) + + # leaf nodes have 0 subtree stability + for(i in 1:n) { + subtree_stability[i,1] = 0 + } + + # process merges in order (bottom-up) + for(i in 1:numMerges) { + nodeId = n + i + left = as.integer(as.scalar(hierarchy[i,1])) + right = as.integer(as.scalar(hierarchy[i,2])) + + children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1]) + own_stab = as.scalar(stability[nodeId,1]) + + # Subtree stability is the best we can achieve from this subtree + if(children_subtree > own_stab) { + subtree_stability[nodeId,1] = children_subtree + } else { + subtree_stability[nodeId,1] = own_stab + } + } + + # select clusters + selected = matrix(0, rows=numNodes, cols=1) + selected[rootId,1] = 1 + + i = numMerges + while(i >= 1) { + nodeId = n + i + + if(as.scalar(selected[nodeId,1]) == 1) { + left = as.integer(as.scalar(hierarchy[i,1])) + right = as.integer(as.scalar(hierarchy[i,2])) + + children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1]) + own_stab = as.scalar(stability[nodeId,1]) + parent_size = as.scalar(cluster_size[nodeId,1]) + + # select children if they have higher subtree stability + if(parent_size < minClSize | children_subtree > own_stab) { + selected[nodeId,1] = 0 + selected[left,1] = 1 + selected[right,1] = 1 + } + } + + i = i - 1 + } + + # assign labels + labels = matrix(-1, rows=n, cols=1) + cluster_id = 1 + + for(nodeId in 1:numNodes) { + if(as.scalar(selected[nodeId,1]) == 1) { + size = as.scalar(cluster_size[nodeId,1]) + + if(size >= minClSize) { + leaves = getLeafDescendants(hierarchy, n, nodeId) + + for(j in 1:nrow(leaves)) { + leafId = as.integer(as.scalar(leaves[j,1])) + if(leafId >= 1 & leafId <= n) { + labels[leafId,1] = cluster_id + } + } + + cluster_id = cluster_id + 1 + } + } + } + + stabilities = stability +} diff --git a/test_extract_stabe_clusters.dml b/test_extract_stabe_clusters.dml new file mode 100644 index 00000000000..95f6b1e3d8b --- /dev/null +++ b/test_extract_stabe_clusters.dml @@ -0,0 +1,136 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 6 point example with clear cluster structure +# 1,2,3 form tight cluster A, 4,5,6 form tight cluster B, A and B are far apart (10) + +n = 6 +distances = matrix(10, rows=n, cols=n) + +# points 1,2,3 (cluster A) +distances[1,2] = 1 +distances[2,1] = 1 + +distances[1,3] = 2 +distances[3,1] = 2 + +distances[2,3] = 1 +distances[3,2] = 1 + +# points 4,5,6 (cluster B) +distances[4,5] = 1 +distances[5,4] = 1 + +distances[4,6] = 2 +distances[6,4] = 2 + +distances[5,6] = 1 +distances[6,5] = 1 + +# zero diagonal (to self) +for(i in 1:n) { + distances[i,i] = 0 +} + + +print("\nBuilding MST") +expected_edges = matrix("2 1 3 2 6 3 5 6 4 5", rows=5, cols=2) +expected_weights = matrix("1 1 10 1 1", rows=5, cols=1) +[edges, weights] = hdb::buildMST(distances, n) +edges_match = (min(edges == expected_edges) == 1) +weights_match = (min(weights == expected_weights) == 1) +if (edges_match) { + print("Pass: edges match.") +} else { + print("Fail: edges don't match.") +} +if (weights_match) { + print("Pass: weights match.") +} else { + print("Fail: weights don't match.") +} +print("MST edges:\n" + toString(edges)) +print("MST weights:\n " + toString(weights)) + + +print("\nBuilding hierarchy") +[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n) +expected_hierarchy = matrix("2 1 1 3 7 1 5 6 1 4 9 1 10 8 10", rows=5, cols=3) +expected_sizes = matrix("2 3 2 3 6", rows=5, cols=1) +hierachy_match = (min(hierarchy == expected_hierarchy) == 1) +sizes_match = (min(sizes == expected_sizes) == 1) +if (hierachy_match) { + print("Pass: hierachy mathes.") +} else { + print("Fail: hierarchy doesn't match.") +} +if (sizes_match) { + print("Pass: sizes match.") +} else { + print("Fail: sizes don't match.") +} +print("Hierarchy:\n" + toString(hierarchy)) +print("Sizes:\n" + toString(sizes)) + + +print("\nExtracting stable clusters with minClSize=2") +[labels, stabilities] = hdb::extractStableClusters(hierarchy, weights, n, 2) +expected_labels = matrix("1 1 1 2 2 2", rows=6, cols=1) +expected_stabilites = matrix("0 0 0 0 0 0 0 2.7 0 2.7 0.6", rows=n*2-1, cols=1) +labels_match = (min(labels == expected_labels) == 1) +tolerance = 1e-10 +stabilities_match = max(abs(stabilities - expected_stabilites)) < tolerance +if (labels_match) { + print("Pass: labels match.") +} else { + print("Fail: labels don't match.") +} +if (stabilities_match) { + print("Pass: stabilities match.") +} else { + print("Fail: stabilities don't match.") +} +print("Cluster labels:\n" + toString(labels)) +print("Top stabilities:\n" + toString(stabilities)) + + + +# check results (we have some duplicate logic here, but anyway) +num_clusters = max(labels) +num_noise = sum(labels == -1) + +print("\nNumber of clusters found: " + num_clusters) +print("Number of noise points: " + num_noise) + +# should find 2 clusters +test1 = (num_clusters == 2) +print("Found 2 clusters: " + test1) + +# no points should be noise +test2 = (num_noise == 0) +print("No noise points: " + test2) + +# points 1,2,3 should be in same cluster +label1 = as.scalar(labels[1]) +label2 = as.scalar(labels[2]) +label3 = as.scalar(labels[3]) +test3 = (label1 == label2) & (label2 == label3) & (label1 > 0) +print("points 1,2,3 in same cluster: " + test3) + +# points 4,5,6 should be in same +label4 = as.scalar(labels[4]) +label5 = as.scalar(labels[5]) +label6 = as.scalar(labels[6]) +test4 = (label4 == label5) & (label5 == label6) & (label4 > 0) +print("Points 4,5,6 in same cluster: " + test4) + +# clusters A and B should be different +test5 = (label1 != label4) +print("Two clusters are different: " + test5) + +test_pass = edges_match & weights_match & hierachy_match & sizes_match & labels_match & stabilities_match & test1 & test2 & test3 & test4 & test5 + +if(test_pass) { + print("\nAll tests passed\n") +} else { + print("\nTests failed\n") +} \ No newline at end of file From 06cc04ce093ce2a10c779305303b01a569ee0676 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Sun, 25 Jan 2026 16:41:36 +0100 Subject: [PATCH 5/7] Model builder --- scripts/builtin/hdbscan.dml | 78 +++++++++++++++++++--- test_cluster_model.dml | 129 ++++++++++++++++++++++++++++++++++++ 2 files changed, 197 insertions(+), 10 deletions(-) create mode 100644 test_cluster_model.dml diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml index d1926dbb190..5d989574a99 100644 --- a/scripts/builtin/hdbscan.dml +++ b/scripts/builtin/hdbscan.dml @@ -68,13 +68,11 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = - [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n) - [clusterMems, stabilities] = extractStableClusters(hierarchy, mstWeights, n, minClSize) - - # TODO: build cluster model - - # temp dummy values - clusterMems = matrix(1, rows=n, cols=1) - clusterModel = X + [clusterMems, stabilities, clusterToNode] = extractStableClusters(hierarchy, mstWeights, n, minClSize) + + [centroids, clusterInfo] = buildClusterModel(X, clusterMems, stabilities, clusterToNode) + + clusterModel = centroids } @@ -133,8 +131,8 @@ buildMST = function(Matrix[Double] distances, Integer n) } } -# Union-find utils +# Union-find utils find = function(Matrix[Double] parent, Integer x) return (Integer root) { @@ -144,6 +142,7 @@ find = function(Matrix[Double] parent, Integer x) } } + union = function(Matrix[Double] parent, Matrix[Double] rank, Integer x, Integer y, Matrix[Double] size, Matrix[Double] compToNode, Integer newId) @@ -243,6 +242,8 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer } } + +# Leaf descendants util getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeId) return (Matrix[Double] leaves) { @@ -260,9 +261,10 @@ getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeI } } + extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weights, Integer n, Integer minClSize) - return (Matrix[Double] labels, Matrix[Double] stabilities) + return (Matrix[Double] labels, Matrix[Double] stabilities, Matrix[Double] clusterToNode) { numMerges = n - 1 # hierarchical tree over n points has exactly n-1 merge events numNodes = 2*n - 1 # total nodes in the dendogram @@ -374,12 +376,18 @@ extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weight # assign labels labels = matrix(-1, rows=n, cols=1) cluster_id = 1 - + + # while tracking which node each cluster comes from + maxClusters = sum(selected) # upper bound on number of clusters + clusterToNode = matrix(0, rows=maxClusters, cols=1) + for(nodeId in 1:numNodes) { if(as.scalar(selected[nodeId,1]) == 1) { size = as.scalar(cluster_size[nodeId,1]) if(size >= minClSize) { + clusterToNode[cluster_id,1] = nodeId # ADD THIS LINE + leaves = getLeafDescendants(hierarchy, n, nodeId) for(j in 1:nrow(leaves)) { @@ -394,5 +402,55 @@ extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weight } } + # trim clusterToNode to actual number of clusters + numActualClusters = cluster_id - 1 + if(numActualClusters > 0) { + clusterToNode = clusterToNode[1:numActualClusters,] + } else { + clusterToNode = matrix(0, rows=0, cols=1) + } + stabilities = stability + } + + +buildClusterModel = function(Matrix[Double] X, Matrix[Double] labels, + Matrix[Double] stabilities, Matrix[Double] clusterToNode) + return (Matrix[Double] centroids, Matrix[Double] clusterInfo) +{ + n = nrow(X) + d = ncol(X) + + numClusters = as.integer(max(labels)) + + if(numClusters <= 0) { + # No clusters found, return empty model + centroids = matrix(0, rows=0, cols=d) + clusterInfo = matrix(0, rows=0, cols=2) + } else { + centroids = matrix(0, rows=numClusters, cols=d) + clusterInfo = matrix(0, rows=numClusters, cols=2) + + for(c in 1:numClusters) { + # find all points in cluster c + mask = (labels == c) + clusterSize = sum(mask) + + if(clusterSize > 0) { + # get centroid as mean of all points in cluster + clusterSum = t(mask) %*% X + centroids[c,] = clusterSum / clusterSize + + # store cluster metadata: [size, stability] + clusterInfo[c,1] = clusterSize + + # get stability for this cluster from the mapping + if(nrow(clusterToNode) >= c & as.scalar(clusterToNode[c,1]) > 0) { + nodeId = as.integer(as.scalar(clusterToNode[c,1])) + clusterInfo[c,2] = as.scalar(stabilities[nodeId,1]) + } + } + } + } +} \ No newline at end of file diff --git a/test_cluster_model.dml b/test_cluster_model.dml new file mode 100644 index 00000000000..6bc37d89b37 --- /dev/null +++ b/test_cluster_model.dml @@ -0,0 +1,129 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# 3 clear clusters +# A: points around (0, 0) +# B: points around (10, 10) +# C: points around (20, 0) + +n = 12 +d = 2 +X = matrix(0, rows=n, cols=d) + +# A +X[1,] = matrix("0.0 0.0", rows=1, cols=2) +X[2,] = matrix("0.5 0.5", rows=1, cols=2) +X[3,] = matrix("-0.5 0.5", rows=1, cols=2) +X[4,] = matrix("0.0 -0.5", rows=1, cols=2) + +# B +X[5,] = matrix("10.0 10.0", rows=1, cols=2) +X[6,] = matrix("10.5 10.5", rows=1, cols=2) +X[7,] = matrix("9.5 10.5", rows=1, cols=2) +X[8,] = matrix("10.0 9.5", rows=1, cols=2) + +# C +X[9,] = matrix("20.0 0.0", rows=1, cols=2) +X[10,] = matrix("20.5 0.5", rows=1, cols=2) +X[11,] = matrix("19.5 0.5", rows=1, cols=2) +X[12,] = matrix("20.0 -0.5", rows=1, cols=2) + +print(toString(X)) + + +# get distances +distances = hdb::dist(X) + + +# get core distances +minPts = 3 +coreDistances = matrix(0, rows=n, cols=1) +for(i in 1:n) { + kthDist = hdb::computeKthSmallest(t(distances[i,]), minPts) + coreDistances[i] = kthDist +} + + +# get mutual reachability +mutualReach = hdb::computeMutualReachability(distances, coreDistances) + + +# get MST +[edges, weights] = hdb::buildMST(mutualReach, n) + + +# get hierarchy +[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n) + + +# get stable clusters +minClSize = 3 +[labels, stabilities, clusterToNode] = hdb::extractStableClusters(hierarchy, weights, n, minClSize) +expected_labels = matrix("1 1 1 1 2 2 2 2 3 3 3 3", rows=12, cols=1) +labels_match = (min(labels == expected_labels) == 1) +if (labels_match) { + print("Pass: labels match.") +} else { + print("Fail: labels don't match.") +} +print("Cluster labels:") +print(toString(labels)) + + +# get cluster model +[centroids, clusterInfo] = hdb::buildClusterModel(X, labels, stabilities, clusterToNode) + +print("\nCentroids:") +print(toString(centroids)) + +print("\nInfo [size, stability]:") +print(toString(clusterInfo)) + + +# check results +numClusters = nrow(centroids) +print("\nNumber of clusters found: " + numClusters) + + +# should find 3 clusters +test1 = (numClusters == 3) +print("Found 3 clusters: " + test1) + + +# each cluster should have 4 points +allSizesFour = TRUE +for(c in 1:numClusters) { + size = as.scalar(clusterInfo[c,1]) + allSizesFour = allSizesFour & (size == 4) +} +print("All clusters have size 4: " + allSizesFour) + + +test_A = min(sqrt(rowSums((centroids - matrix("0 0.125", 1, 2))^2))) < 0.001 +test_B = min(sqrt(rowSums((centroids - matrix("10 10.125", 1, 2))^2))) < 0.001 +test_C = min(sqrt(rowSums((centroids - matrix("20 0.125", 1, 2))^2))) < 0.001 +all_found = test_A & test_B & test_C +print("Expected centroids near expected positions: " + all_found) + + +# no noise (all assigned to clusters) +numNoise = sum(labels == -1) +test4 = (numNoise == 0) +print("No noise points: " + test4) + + +# stabilities are populated (not 0) +test_stability = TRUE +for(c in 1:numClusters) { + stab = as.scalar(clusterInfo[c,2]) + test_stability = test_stability & (stab > 0) +} +print("Stabilities populated: " + test_stability) + + +test_pass = test1 & allSizesFour & all_found & test4 & test_stability + +if(test_pass) { + print("\nAll tests passed") +} else { + print("\nTests failed") +} \ No newline at end of file From 1f98db12fcd7be22412ee4bb5251092241b7d7dc Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Sun, 25 Jan 2026 16:50:13 +0100 Subject: [PATCH 6/7] Overall hdbscan test --- test_hdbscan.dml | 51 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 test_hdbscan.dml diff --git a/test_hdbscan.dml b/test_hdbscan.dml new file mode 100644 index 00000000000..b26aa983e5c --- /dev/null +++ b/test_hdbscan.dml @@ -0,0 +1,51 @@ +source("scripts/builtin/hdbscan.dml") as hdb + +# A: 5 points tightly grouped around (0, 0) +# B: 5 points tightly grouped around (10, 0) +# Outliers: 2 points far from any cluster + +n = 12 +d = 2 +X = matrix(0, rows=n, cols=d) + +# A +X[1,] = matrix("0.0 0.0", rows=1, cols=2) +X[2,] = matrix("0.3 0.2", rows=1, cols=2) +X[3,] = matrix("-0.2 0.3", rows=1, cols=2) +X[4,] = matrix("0.1 -0.2", rows=1, cols=2) +X[5,] = matrix("-0.1 -0.1", rows=1, cols=2) + +# B +X[6,] = matrix("10.0 0.0", rows=1, cols=2) +X[7,] = matrix("10.3 0.2", rows=1, cols=2) +X[8,] = matrix("9.8 0.3", rows=1, cols=2) +X[9,] = matrix("10.1 -0.2", rows=1, cols=2) +X[10,] = matrix("9.9 -0.1", rows=1, cols=2) + +# Outliers +X[11,] = matrix("5.0 5.0", rows=1, cols=2) +X[12,] = matrix("-5.0 -5.0", rows=1, cols=2) + +print(toString(X)) + +minPts = 3 +minClSize = 4 + + +# run hdbscan +[labels, centroids] = hdb::m_hdbscan(X, minPts, minClSize) +expected_labels = matrix("1 1 1 1 1 2 2 2 2 2 -1 -1", rows=12, cols=1) +labels_match = (min(labels == expected_labels) == 1) +if (labels_match) { + print("Pass: labels match.") +} else { + print("Fail: labels don't match.") +} +test_A = min(sqrt(rowSums((centroids - matrix("0.020 0.040", 1, 2))^2))) < 0.001 +test_B = min(sqrt(rowSums((centroids - matrix("10.020 0.040", 1, 2))^2))) < 0.001 +all_found = test_A & test_B +print("Expected centroids near expected positions: " + all_found) +print("Cluster labels:") +print(toString(labels)) +print("\nCluster centroids:") +print(toString(centroids)) From 3fd34345b931ef712c2235f5efac47c2670726b4 Mon Sep 17 00:00:00 2001 From: anuunchin <88698977+anuunchin@users.noreply.github.com> Date: Sun, 25 Jan 2026 16:51:22 +0100 Subject: [PATCH 7/7] Remove wrong test location --- .../apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java deleted file mode 100644 index e69de29bb2d..00000000000