From 11bd20355061b1c947b6c716c4685a4b66115350 Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Fri, 12 Dec 2025 10:11:09 +0100
Subject: [PATCH 1/7] Initial commit: preliminary tests, some helper functions

---
 scripts/builtin/hdbscan.dml                   | 132 ++++++++++++++++++
 scripts/builtin/hdbscanApply.dml              |   0
 .../functions/builtin/BuiltinHDBSCANTest.java |   0
 src/test/scripts/functions/builtin/hdbscan.R  |   0
 .../scripts/functions/builtin/hdbscan.dml     |  20 +++
 .../functions/builtin/hdbscanApply.dml        |   0
 test_build_mst.dml                            |  46 ++++++
 test_kth_smallest.dml                         |  14 ++
 test_mutual_reachability.dml                  |  26 ++++
 9 files changed, 238 insertions(+)
 create mode 100644 scripts/builtin/hdbscan.dml
 create mode 100644 scripts/builtin/hdbscanApply.dml
 create mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java
 create mode 100644 src/test/scripts/functions/builtin/hdbscan.R
 create mode 100644 src/test/scripts/functions/builtin/hdbscan.dml
 create mode 100644 src/test/scripts/functions/builtin/hdbscanApply.dml
 create mode 100644 test_build_mst.dml
 create mode 100644 test_kth_smallest.dml
 create mode 100644 test_mutual_reachability.dml

diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml
new file mode 100644
index 00000000000..ca48c7fdfc6
--- /dev/null
+++ b/scripts/builtin/hdbscan.dml
@@ -0,0 +1,132 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# The hdbscan function is used to perform the hdbscan clustering
+# algorithm using knn-based mutual reachability distance and minimum spanning tree.
+#
+# INPUT:
+# ------------------------------------------------------------
+# X             The input Matrix to do hdbscan on.
+# minPts        Minimum number of points for core distance computation. (Defaults to 5)
+# minClSize     Minimum cluster size (Defaults to minPts)              
+# ------------------------------------------------------------
+#
+# OUTPUT:
+# ------------------------------------------------------------
+# clusterMems   Cluster labels for each point
+# clusterModel  The cluster centroids for prediction
+# ------------------------------------------------------------
+
+# TODO: m,s , f?
+m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -1)
+    return (Matrix[Double] clusterMems, Matrix[Double] clusterModel)
+{
+    if(minPts < 2) {
+        stop("HDBSCAN: minPts should be at least 2")
+    }
+
+    if(minClSize < 0) {
+        minClSize = minPts
+    }
+
+    n = nrow(X)
+    d = ncol(X)
+
+    if(n < minPts) {
+        stop("HDBSCAN: Number of data points should be at least minPts")
+    }
+
+    distances = dist(X)
+
+    coreDistances = matrix(0, rows=n, cols=1)
+    for(i in 1:n) {
+        kthDist = computeKthSmallest(distances[i,], minPts)
+        coreDistances[i] = kthDist
+    }
+   
+    mutualReachDist = computeMutualReachability(distances, coreDistances)
+
+    [mstEdges, mstWeights] = buildMST(mutualReachDist, n)
+
+    # TODO: build cluster hierarchy
+    # TODO: get stable cluster with stability score
+    # TODO: build cluster model
+    
+    # temp  dummy values
+    clusterMems = matrix(1, rows=n, cols=1)
+    clusterModel = X
+}
+
+
+computeKthSmallest = function(Matrix[Double] array, Integer k)
+    return (Double res)
+{
+  sorted = order(target=array, by=1, decreasing=FALSE)
+  res = as.scalar(sorted[k+1, 1])
+}
+
+
+computeMutualReachability = function(Matrix[Double] distances, Matrix[Double] coreDistances)
+    return (Matrix[Double] mutualReach)
+{
+    # mutualReach(i,j) = max(dist(i,j), coreDist(i), coreDist(j))
+    # Diagonal is set to zero.
+
+    n = nrow(distances)
+  
+    coreDistRow = t(coreDistances)
+    coreDistCol = coreDistances
+  
+    maxCoreRow = (distances > coreDistRow) * distances + (distances <= coreDistRow) * coreDistRow
+    mutualReach = (maxCoreRow > coreDistCol) * maxCoreRow + (maxCoreRow <= coreDistCol) * coreDistCol
+  
+    mutualReach = mutualReach * (1 - diag(matrix(1, rows=n, cols=1)))
+}
+
+
+buildMST = function(Matrix[Double] distances, Integer n)
+    return (Matrix[Double] edges, Matrix[Double] weights)
+{
+  edges = matrix(0, rows=n-1, cols=2)
+  weights = matrix(0, rows=n-1, cols=1)
+  
+  inMST = matrix(0, rows=n, cols=1)
+  inMST[1] = 1
+  
+  minDist = distances[1,]
+  minDist = t(minDist)
+  
+  for(i in 1:(n-1)) {
+    candidates = minDist + inMST * 1e15 
+    minIdx = as.scalar(rowIndexMin(t(candidates)))
+    minWeight = as.scalar(minDist[minIdx])
+    
+    # Find which node in MST connects to minIdx
+    connectIdx = as.scalar(rowIndexMin(distances[minIdx,] + t(1-inMST) * 1e15))
+    edges[i,1] = minIdx
+    edges[i,2] = connectIdx
+    weights[i] = minWeight
+    
+    inMST[minIdx] = 1
+    newDists = distances[minIdx,]
+    minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists)
+  }
+}
\ No newline at end of file
diff --git a/scripts/builtin/hdbscanApply.dml b/scripts/builtin/hdbscanApply.dml
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/test/scripts/functions/builtin/hdbscan.R b/src/test/scripts/functions/builtin/hdbscan.R
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/test/scripts/functions/builtin/hdbscan.dml b/src/test/scripts/functions/builtin/hdbscan.dml
new file mode 100644
index 00000000000..cc59154f3c3
--- /dev/null
+++ b/src/test/scripts/functions/builtin/hdbscan.dml
@@ -0,0 +1,20 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
diff --git a/src/test/scripts/functions/builtin/hdbscanApply.dml b/src/test/scripts/functions/builtin/hdbscanApply.dml
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/test_build_mst.dml b/test_build_mst.dml
new file mode 100644
index 00000000000..a8dd7a44ff9
--- /dev/null
+++ b/test_build_mst.dml
@@ -0,0 +1,46 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+#        4
+#      / | \
+#     /  |  \
+#    /  (2)  (5)
+#    |   |    \
+#    |   |     \
+#   (2)  1-(3)--3
+#    |   |    /
+#    \   |   (4)
+#     \  (1) /
+#      \ | /
+#       \|/
+#        2
+
+distances = matrix(0, rows=4, cols=4)
+distances[1,2] = 1 
+distances[2,1] = 1
+
+distances[1,3] = 3 
+distances[3,1] = 3
+
+distances[1,4] = 2  
+distances[4,1] = 2
+
+distances[2,3] = 4 
+distances[3,2] = 4
+
+distances[2,4] = 2
+distances[4,2] = 2
+
+distances[3,4] = 5 
+distances[4,3] = 5
+
+[edges, weights] = hdb::buildMST(distances, 4)
+
+totalWeight = sum(weights)
+
+test_pass = (nrow(edges) == 3) & (totalWeight == 6)
+
+if(test_pass) {
+    print("Passed")
+} else {
+    print("Failes")
+}
\ No newline at end of file
diff --git a/test_kth_smallest.dml b/test_kth_smallest.dml
new file mode 100644
index 00000000000..b5065230f26
--- /dev/null
+++ b/test_kth_smallest.dml
@@ -0,0 +1,14 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+array = matrix("0 1 4 2", rows=4, cols=1)
+
+res1 = hdb::computeKthSmallest(array, 1)
+res2 = hdb::computeKthSmallest(array, 2)
+
+test_pass = (res1 == 1) & (res2 == 2)
+
+if(test_pass) {
+    print("Passed")
+} else {
+    print("Failed")
+}
\ No newline at end of file
diff --git a/test_mutual_reachability.dml b/test_mutual_reachability.dml
new file mode 100644
index 00000000000..2e86ed28311
--- /dev/null
+++ b/test_mutual_reachability.dml
@@ -0,0 +1,26 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+distances = matrix("0 1 5 1 0 4 5 4 0", rows=3, cols=3)
+coreDistances = matrix("1 1 4", rows=3, cols=1)
+
+mutualReach = hdb::computeMutualReachability(distances, coreDistances)
+
+diagSum = sum(diag(mutualReach))
+
+val_AB = as.scalar(mutualReach[1,2])
+val_AC = as.scalar(mutualReach[1,3])
+val_BC = as.scalar(mutualReach[2,3])
+
+sym_AB = as.scalar(mutualReach[2,1])
+sym_AC = as.scalar(mutualReach[3,1])
+sym_BC = as.scalar(mutualReach[3,2])
+
+test1_pass = (val_AB == 1) & (val_AC == 5) & (val_BC == 4) & 
+             (diagSum == 0) & 
+             (val_AB == sym_AB) & (val_AC == sym_AC) & (val_BC == sym_BC)
+
+if(test1_pass) {
+    print("Passed")
+} else {
+    print("Failed")
+}

From 39b3a4a45aa3db4e4d9ecd329ce7d227c791266a Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Mon, 29 Dec 2025 17:34:07 +0100
Subject: [PATCH 2/7] Hierarchy builder with union-find

---
 scripts/builtin/hdbscan.dml |  88 ++++++++++++++++++++++++++++-
 test_build_hierarchy.dml    |  77 +++++++++++++++++++++++++
 test_build_mst.dml          |   2 +-
 test_union_find.dml         | 108 ++++++++++++++++++++++++++++++++++++
 4 files changed, 273 insertions(+), 2 deletions(-)
 create mode 100644 test_build_hierarchy.dml
 create mode 100644 test_union_find.dml

diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml
index ca48c7fdfc6..3cb00e021ce 100644
--- a/scripts/builtin/hdbscan.dml
+++ b/scripts/builtin/hdbscan.dml
@@ -66,7 +66,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
 
     [mstEdges, mstWeights] = buildMST(mutualReachDist, n)
 
-    # TODO: build cluster hierarchy
+    [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n)
+
     # TODO: get stable cluster with stability score
     # TODO: build cluster model
     
@@ -129,4 +130,89 @@ buildMST = function(Matrix[Double] distances, Integer n)
     newDists = distances[minIdx,]
     minDist = (minDist < t(newDists)) * minDist + (minDist >= t(newDists)) * t(newDists)
   }
+}
+
+# Union-find utils
+
+find = function(Matrix[Double] parent, Integer x)
+    return (Integer root)
+{
+    root = x
+    while(as.scalar(parent[root]) != root) {
+        root = as.integer(as.scalar(parent[root]))
+    }
+}
+
+union = function(Matrix[Double] parent, Matrix[Double] rank, 
+                Integer x, Integer y, Matrix[Double] size)
+    return (Matrix[Double] newParent, Matrix[Double] newRank, Matrix[Double] newSize)
+{
+  newParent = parent
+  newRank = rank
+  newSize = size
+  
+  rankX = as.scalar(rank[x])
+  rankY = as.scalar(rank[y])
+  
+  # Calculate combined size
+  combinedSize = as.scalar(size[x]) + as.scalar(size[y])
+  
+  if(rankX < rankY) {
+    newParent[x] = y
+    newSize[y] = combinedSize  # Update size at new root
+  }
+  else if(rankX > rankY) {
+    newParent[y] = x
+    newSize[x] = combinedSize  # Update size at new root
+  }
+  else {
+    newParent[y] = x
+    newRank[x] = rankX + 1
+    newSize[x] = combinedSize  # Update size at new root
+  }
+}
+
+buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n)
+    return (Matrix[Double] hierarchy, Matrix[Double] sizes)
+{
+    # sort edges by weight in ascending order
+    # to build the hierarchy from dense cores outward
+    sorted = order(target=weights, by=1, decreasing=FALSE)
+
+    # parent[i] = i, meaning each point is its own parent in the beginning
+    parent = seq(1, n) 
+    # tree depth when unioning (icnreases only when merged trees are of same height)
+    rank = matrix(0, rows=n, cols=1)
+    # each cluster is 1 in the beginning
+    clusterSize = matrix(1, rows=2*n-1, cols=1)  # Space for all possible cluster IDs
+
+    # hierarcy[i, :] = [cluster1_id, cluster2_id, merge_distance]
+    hierarchy = matrix(0, rows=n-1, cols=3)
+
+    # sizes[i] = size of the cluster created by merge i
+    sizes = matrix(0, rows=n-1, cols=1)
+
+    nextCluster = n + 1
+
+    for(i in 1:(n-1)) {
+        idx = as.scalar(sorted[i])
+        u = as.scalar(edges[idx, 1])
+        v = as.scalar(edges[idx, 2])
+        w = as.scalar(weights[idx])
+
+        root_u = find(parent, u)
+        root_v = find(parent, v)
+
+        if(root_u != root_v) {
+            hierarchy[i,1] = root_u
+            hierarchy[i,2] = root_v
+            hierarchy[i,3] = w
+            sizes[i] = as.scalar(clusterSize[root_u]) + as.scalar(clusterSize[root_v])
+            
+            [parent, rank, clusterSize] = union(parent, rank, root_u, root_v, clusterSize)
+
+            nextCluster = nextCluster + 1
+        }
+    }
+
 }
\ No newline at end of file
diff --git a/test_build_hierarchy.dml b/test_build_hierarchy.dml
new file mode 100644
index 00000000000..b59aa288619
--- /dev/null
+++ b/test_build_hierarchy.dml
@@ -0,0 +1,77 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+#        4
+#      / | \
+#     /  |  \
+#    /  (2)  (5)
+#    |   |    \
+#    |   |     \
+#   (2)  1-(3)--3
+#    |   |    /
+#    \   |   (4)
+#     \  (1) /
+#      \ | /
+#       \|/
+#        2
+
+distances = matrix(0, rows=4, cols=4)
+distances[1,2] = 1 
+distances[2,1] = 1
+
+distances[1,3] = 3 
+distances[3,1] = 3
+
+distances[1,4] = 2  
+distances[4,1] = 2
+
+distances[2,3] = 4 
+distances[3,2] = 4
+
+distances[2,4] = 2
+distances[4,2] = 2
+
+distances[3,4] = 5 
+distances[4,3] = 5
+
+[edges, weights] = hdb::buildMST(distances, 4)
+
+[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, 4)
+
+print("Hierarchy (format: [cluster1, cluster2, merge_distance]):")
+print(toString(hierarchy))
+
+# Should have n-1 merge operations for n nodes
+num_merges = nrow(hierarchy)
+print("Number of merges: " + num_merges + " (should be 3)")
+test1 = (num_merges == 3)
+
+# Merge distances should be in ascending order (or equal)
+# Because we process edges from low weight to high weight
+dist1 = as.scalar(hierarchy[1,3])
+dist2 = as.scalar(hierarchy[2,3])
+dist3 = as.scalar(hierarchy[3,3])
+print("\nMerge distances: [" + dist1 + ", " + dist2 + ", " + dist3 + "]" + " (Should be in ascending order)")
+test2 = (dist1 <= dist2) & (dist2 <= dist3)
+
+# Cluster sizes should increase
+size1 = as.scalar(sizes[1])
+size2 = as.scalar(sizes[2])
+size3 = as.scalar(sizes[3])
+print("\nCluster sizes: [" + size1 + ", " + size2 + ", " + size3 + "]" + " (Should be increasing)")
+test3 = (size1 <= size2) & (size2 <= size3)
+
+# Final size should equal total number of nodes
+print("Final cluster size: " + size3 + " (should be 4)")
+test4 = (size3 == 4)
+
+# First merge should be size 2
+print("First merge size: " + size1 + " (should be 2)")
+test5 = (size1 == 2)
+
+test_pass = test1 & test2 & test3 & test4 & test5
+
+if(test_pass) {
+    print("Passed")
+} else {
+    print("Failed")
+}
\ No newline at end of file
diff --git a/test_build_mst.dml b/test_build_mst.dml
index a8dd7a44ff9..e2ed10ab040 100644
--- a/test_build_mst.dml
+++ b/test_build_mst.dml
@@ -42,5 +42,5 @@ test_pass = (nrow(edges) == 3) & (totalWeight == 6)
 if(test_pass) {
     print("Passed")
 } else {
-    print("Failes")
+    print("Failed")
 }
\ No newline at end of file
diff --git a/test_union_find.dml b/test_union_find.dml
new file mode 100644
index 00000000000..98eb14c7141
--- /dev/null
+++ b/test_union_find.dml
@@ -0,0 +1,108 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+#   1 -> 1
+#   2 -> 1
+#   3 -> 2 -> 1
+#   4 -> 4
+
+n = 4
+parent = matrix(0, rows=n, cols=1)
+parent[1] = 1
+parent[2] = 1
+parent[3] = 2
+parent[4] = 4
+
+root1 = hdb::find(parent, 1)
+root2 = hdb::find(parent, 2)
+root3 = hdb::find(parent, 3)
+root4 = hdb::find(parent, 4)
+
+test_find = (root1 == 1) & (root2 == 1) & (root3 == 1) & (root4 == 4)
+
+if(test_find) {
+    print("Passed")
+} else {
+    print("Failed")
+}
+
+# ensure union() attaches lower-rank root under higher-rank root
+# rank[1] < rank[2]  => parent[1] becomes 2
+
+parentA = matrix(0, rows=n, cols=1)
+parentA[1] = 1
+parentA[2] = 2
+parentA[3] = 3
+parentA[4] = 4
+
+rankA = matrix(0, rows=n, cols=1)
+rankA[1] = 0
+rankA[2] = 1
+rankA[3] = 0
+rankA[4] = 0
+
+sizeA = matrix(1, rows=n, cols=1)
+
+[newParentA, newRankA, newSizeA] = hdb::union(parentA, rankA, 1, 2, sizeA)
+test_unionA = (as.scalar(newParentA[1]) == 2) & (as.scalar(newParentA[2]) == 2)
+
+if(test_unionA) {
+    print("Passed")
+} else {
+    print("Failed")
+}
+
+# ensure union() attaches lower-rank root under higher-rank root
+# rank[1] > rank[2]  => parent[2] becomes 1
+
+parentB = matrix(0, rows=n, cols=1)
+parentB[1] = 1
+parentB[2] = 2
+parentB[3] = 3
+parentB[4] = 4
+
+rankB = matrix(0, rows=n, cols=1)
+rankB[1] = 2
+rankB[2] = 1
+rankB[3] = 0
+rankB[4] = 0
+
+sizeB = matrix(1, rows=n, cols=1)
+
+[newParentB, newRankB, newSizeB] = hdb::union(parentB, rankB, 1, 2, sizeB)
+test_unionB = (as.scalar(newParentB[2]) == 1) & (as.scalar(newParentB[1]) == 1)
+
+if(test_unionB) {
+    print("Passed")
+} else {
+    print("Failed")
+}
+
+# ensure union() increments if height is same
+
+parentC = matrix(0, rows=n, cols=1)
+parentC[1] = 1
+parentC[2] = 2
+parentC[3] = 3
+parentC[4] = 4
+
+rankC = matrix(0, rows=n, cols=1)
+rankC[1] = 0
+rankC[2] = 0
+rankC[3] = 0
+rankC[4] = 0
+
+sizeC = matrix(1, rows=n, cols=1)
+
+[newParentC, newRankC, newSizeC] = hdb::union(parentC, rankC, 1, 2, sizeC)
+
+test_unionC_parent = (as.scalar(newParentC[2]) == 1)
+test_unionC_rank = (as.scalar(newRankC[1]) == 1)
+
+test_unionC = test_unionC_parent & test_unionC_rank
+
+if(test_unionC) {
+    print("Passed")
+} else {
+    print("Failed")
+    print(rankC[1])
+}
\ No newline at end of file

From 82ed4c0cf517e3cf0a9cffa846490ad723bf0b31 Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Sun, 18 Jan 2026 21:11:12 +0100
Subject: [PATCH 3/7] hierarchy with linkage

---
 scripts/builtin/hdbscan.dml | 115 ++++++++++++++++++++++--------------
 test_build_hierarchy.dml    |  46 ++++++++++++++-
 test_union_find.dml         |  41 +++++++++++--
 3 files changed, 151 insertions(+), 51 deletions(-)

diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml
index 3cb00e021ce..1029256836c 100644
--- a/scripts/builtin/hdbscan.dml
+++ b/scripts/builtin/hdbscan.dml
@@ -143,35 +143,44 @@ find = function(Matrix[Double] parent, Integer x)
     }
 }
 
-union = function(Matrix[Double] parent, Matrix[Double] rank, 
-                Integer x, Integer y, Matrix[Double] size)
-    return (Matrix[Double] newParent, Matrix[Double] newRank, Matrix[Double] newSize)
+union = function(Matrix[Double] parent, Matrix[Double] rank,
+                Integer x, Integer y, Matrix[Double] size,
+                Matrix[Double] compToNode, Integer newId)
+    return (Matrix[Double] newParent, Matrix[Double] newRank,
+            Matrix[Double] newSize, Matrix[Double] newCompToNode, Integer newRoot)
 {
-  newParent = parent
-  newRank = rank
-  newSize = size
-  
-  rankX = as.scalar(rank[x])
-  rankY = as.scalar(rank[y])
-  
-  # Calculate combined size
-  combinedSize = as.scalar(size[x]) + as.scalar(size[y])
-  
-  if(rankX < rankY) {
-    newParent[x] = y
-    newSize[y] = combinedSize  # Update size at new root
-  }
-  else if(rankX > rankY) {
-    newParent[y] = x
-    newSize[x] = combinedSize  # Update size at new root
-  }
-  else {
-    newParent[y] = x
-    newRank[x] = rankX + 1
-    newSize[x] = combinedSize  # Update size at new root
-  }
+    newParent = parent
+    newRank = rank
+    newSize = size
+    newCompToNode = compToNode
+
+    rankX = as.scalar(rank[x,1])
+    rankY = as.scalar(rank[y,1])
+
+    combinedSize = as.scalar(size[x,1]) + as.scalar(size[y,1])
+
+    if(rankX < rankY) {
+        newParent[x] = y
+        newSize[y,1] = combinedSize
+        newCompToNode[y,1] = newId
+        newRoot = y
+    }
+    else if(rankX > rankY) {
+        newParent[y] = x
+        newSize[x,1] = combinedSize
+        newCompToNode[x,1] = newId
+        newRoot = x
+    }
+    else {
+        newParent[y] = x
+        newRank[x,1] = rankX + 1
+        newSize[x,1] = combinedSize
+        newCompToNode[x,1] = newId
+        newRoot = x
+    }
 }
 
+
 buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n)
     return (Matrix[Double] hierarchy, Matrix[Double] sizes)
 {
@@ -180,11 +189,21 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer
     sorted = order(target=weights, by=1, decreasing=FALSE)
 
     # parent[i] = i, meaning each point is its own parent in the beginning
-    parent = seq(1, n) 
+    parent = seq(1, n)
     # tree depth when unioning (icnreases only when merged trees are of same height)
     rank = matrix(0, rows=n, cols=1)
-    # each cluster is 1 in the beginning
-    clusterSize = matrix(1, rows=2*n-1, cols=1)  # Space for all possible cluster IDs
+
+    # initially, each component is a single point, so root i maps to leaf node i.
+    # Later, when two components merge, the new component root will map to a new
+    # internal linkage node id (n+1, n+2, ..., 2n-1)
+    compToNode = seq(1, n)
+
+    # size (number of original points) for each linkage-tree node id,
+    # leaves 1..n have size 1.
+    nodeSize = matrix(0, rows=2*n-1, cols=1)
+    for(j in 1:n) {
+        nodeSize[j,1] = 1
+    }
 
     # hierarcy[i, :] = [cluster1_id, cluster2_id, merge_distance]
     hierarchy = matrix(0, rows=n-1, cols=3)
@@ -192,27 +211,33 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer
     # sizes[i] = size of the cluster created by merge i
     sizes = matrix(0, rows=n-1, cols=1)
 
-    nextCluster = n + 1
-
+    row = 1
     for(i in 1:(n-1)) {
-        idx = as.scalar(sorted[i])
-        u = as.scalar(edges[idx, 1])
-        v = as.scalar(edges[idx, 2])
-        w = as.scalar(weights[idx])
+        idx = as.integer(as.scalar(sorted[i,1]))
+        u = as.integer(as.scalar(edges[idx,1]))
+        v = as.integer(as.scalar(edges[idx,2]))
+        w = as.scalar(weights[idx,1])
 
         root_u = find(parent, u)
         root_v = find(parent, v)
 
         if(root_u != root_v) {
-            hierarchy[i,1] = root_u
-            hierarchy[i,2] = root_v
-            hierarchy[i,3] = w
-            sizes[i] = as.scalar(clusterSize[root_u]) + as.scalar(clusterSize[root_v])
-            
-            [parent, rank, clusterSize] = union(parent, rank, root_u, root_v, clusterSize)
-
-            nextCluster = nextCluster + 1
+            left  = as.integer(as.scalar(compToNode[root_u]))
+            right = as.integer(as.scalar(compToNode[root_v]))
+
+            newId = n + row
+
+            hierarchy[row,1] = left
+            hierarchy[row,2] = right
+            hierarchy[row,3] = w
+
+            nodeSize[newId,1] = as.scalar(nodeSize[left,1]) + as.scalar(nodeSize[right,1])
+            sizes[row,1] = as.scalar(nodeSize[newId,1])
+
+            [parent, rank, ufSize, compToNode, newRoot] =
+                union(parent, rank, root_u, root_v, nodeSize, compToNode, newId)
+
+            row = row + 1
         }
     }
-
-}
\ No newline at end of file
+}
diff --git a/test_build_hierarchy.dml b/test_build_hierarchy.dml
index b59aa288619..bc586bd97ac 100644
--- a/test_build_hierarchy.dml
+++ b/test_build_hierarchy.dml
@@ -68,7 +68,51 @@ test4 = (size3 == 4)
 print("First merge size: " + size1 + " (should be 2)")
 test5 = (size1 == 2)
 
-test_pass = test1 & test2 & test3 & test4 & test5
+# New classic-linkage checks
+n = 4
+hasInternal = (sum(hierarchy[,1] > n) + sum(hierarchy[,2] > n)) > 0
+print("Has internal node ids (>n): " + hasInternal + " (should be true)")
+test6 = hasInternal
+
+# Check that child ids are within valid range
+maxChild1 = max(hierarchy[,1])
+maxChild2 = max(hierarchy[,2])
+maxChild = maxChild1
+if(maxChild2 > maxChild) { maxChild = maxChild2 }
+
+print("Max child id: " + maxChild + " (should be <= " + (2*n-2) + ")")
+test7 = (maxChild <= (2*n-2))
+
+# Check that internal ids are “created in order”
+test8 = TRUE
+for(r in 1:(n-1)) {
+    child1 = as.scalar(hierarchy[r,1])
+    child2 = as.scalar(hierarchy[r,2])
+    newId = n + r
+    ok = (child1 < newId) & (child2 < newId)
+    test8 = test8 & ok
+}
+print("Children reference only existing nodes: " + test8 + " (should be true)")
+
+# Recompute node sizes from hierarchy and verify
+nodeSize = matrix(0, rows=2*n-1, cols=1)
+for(j in 1:n) { nodeSize[j,1] = 1 }
+
+test9 = TRUE
+for(r in 1:(n-1)) {
+    left = as.integer(as.scalar(hierarchy[r,1]))
+    right = as.integer(as.scalar(hierarchy[r,2]))
+    newId = n + r
+
+    expected = as.scalar(nodeSize[left,1]) + as.scalar(nodeSize[right,1])
+    nodeSize[newId,1] = expected
+
+    ok = (as.scalar(sizes[r,1]) == expected)
+    test9 = test9 & ok
+}
+print("sizes[r] equals sum of child sizes: " + test9 + " (should be true)")
+
+test_pass = test1 & test2 & test3 & test4 & test5 & test6 & test7 & test8 & test9
 
 if(test_pass) {
     print("Passed")
diff --git a/test_union_find.dml b/test_union_find.dml
index 98eb14c7141..9c4f7a2d5f9 100644
--- a/test_union_find.dml
+++ b/test_union_find.dml
@@ -28,6 +28,8 @@ if(test_find) {
 # ensure union() attaches lower-rank root under higher-rank root
 # rank[1] < rank[2]  => parent[1] becomes 2
 
+newId = 5  # arbitrary "new linkage node id" for this unit test
+
 parentA = matrix(0, rows=n, cols=1)
 parentA[1] = 1
 parentA[2] = 2
@@ -42,7 +44,13 @@ rankA[4] = 0
 
 sizeA = matrix(1, rows=n, cols=1)
 
-[newParentA, newRankA, newSizeA] = hdb::union(parentA, rankA, 1, 2, sizeA)
+compToNodeA = matrix(0, rows=n, cols=1)
+compToNodeA[1] = 1
+compToNodeA[2] = 2
+compToNodeA[3] = 3
+compToNodeA[4] = 4
+
+[newParentA, newRankA, newSizeA, newCompToNodeA, newRootA] = hdb::union(parentA, rankA, 1, 2, sizeA, compToNodeA, newId)
 test_unionA = (as.scalar(newParentA[1]) == 2) & (as.scalar(newParentA[2]) == 2)
 
 if(test_unionA) {
@@ -68,7 +76,13 @@ rankB[4] = 0
 
 sizeB = matrix(1, rows=n, cols=1)
 
-[newParentB, newRankB, newSizeB] = hdb::union(parentB, rankB, 1, 2, sizeB)
+compToNodeB = matrix(0, rows=n, cols=1)
+compToNodeB[1] = 1
+compToNodeB[2] = 2
+compToNodeB[3] = 3
+compToNodeB[4] = 4
+
+[newParentB, newRankB, newSizeB, newCompToNodeB, newRootB] = hdb::union(parentB, rankB, 1, 2, sizeB, compToNodeB, newId)
 test_unionB = (as.scalar(newParentB[2]) == 1) & (as.scalar(newParentB[1]) == 1)
 
 if(test_unionB) {
@@ -77,6 +91,16 @@ if(test_unionB) {
     print("Failed")
 }
 
+# check linkage
+test_unionB = test_unionB & (newRootB == 1) & (as.scalar(newCompToNodeB[1]) == newId)
+
+if(test_unionB) {
+    print("Passed")
+} else {
+    print("Failed")
+}
+
+
 # ensure union() increments if height is same
 
 parentC = matrix(0, rows=n, cols=1)
@@ -93,16 +117,23 @@ rankC[4] = 0
 
 sizeC = matrix(1, rows=n, cols=1)
 
-[newParentC, newRankC, newSizeC] = hdb::union(parentC, rankC, 1, 2, sizeC)
+compToNodeC = matrix(0, rows=n, cols=1)
+compToNodeC[1] = 1
+compToNodeC[2] = 2
+compToNodeC[3] = 3
+compToNodeC[4] = 4
+
+[newParentC, newRankC, newSizeC, newCompToNodeC, newRootC] = hdb::union(parentC, rankC, 1, 2, sizeC, compToNodeC, newId)
 
 test_unionC_parent = (as.scalar(newParentC[2]) == 1)
 test_unionC_rank = (as.scalar(newRankC[1]) == 1)
 
 test_unionC = test_unionC_parent & test_unionC_rank
 
+test_unionC = test_unionC & (newRootC == 1) & (as.scalar(newCompToNodeC[1]) == newId)
+
 if(test_unionC) {
     print("Passed")
 } else {
     print("Failed")
-    print(rankC[1])
-}
\ No newline at end of file
+}

From e42bdb56d819fe9b1271629537f2cda376f075d6 Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Sun, 25 Jan 2026 16:04:21 +0100
Subject: [PATCH 4/7] extract stabilit clusters

---
 scripts/builtin/hdbscan.dml     | 165 +++++++++++++++++++++++++++++++-
 test_extract_stabe_clusters.dml | 136 ++++++++++++++++++++++++++
 2 files changed, 296 insertions(+), 5 deletions(-)
 create mode 100644 test_extract_stabe_clusters.dml

diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml
index 1029256836c..d1926dbb190 100644
--- a/scripts/builtin/hdbscan.dml
+++ b/scripts/builtin/hdbscan.dml
@@ -58,7 +58,7 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
 
     coreDistances = matrix(0, rows=n, cols=1)
     for(i in 1:n) {
-        kthDist = computeKthSmallest(distances[i,], minPts)
+        kthDist = computeKthSmallest(t(distances[i,]), minPts)  # Add t() here!
         coreDistances[i] = kthDist
     }
    
@@ -68,7 +68,8 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
 
     [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n)
 
-    # TODO: get stable cluster with stability score
+    [clusterMems, stabilities] = extractStableClusters(hierarchy, mstWeights, n, minClSize)
+    
     # TODO: build cluster model
     
     # temp  dummy values
@@ -184,9 +185,9 @@ union = function(Matrix[Double] parent, Matrix[Double] rank,
 buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer n)
     return (Matrix[Double] hierarchy, Matrix[Double] sizes)
 {
-    # sort edges by weight in ascending order
-    # to build the hierarchy from dense cores outward
-    sorted = order(target=weights, by=1, decreasing=FALSE)
+    # create indexed weights to preserve original positions after sorting
+    indexedWeights = cbind(seq(1, nrow(weights)), weights)
+    sorted = order(target=indexedWeights, by=2, decreasing=FALSE)
 
     # parent[i] = i, meaning each point is its own parent in the beginning
     parent = seq(1, n)
@@ -241,3 +242,157 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer
         }
     }
 }
+
+getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeId)
+    return (Matrix[Double] leaves)
+{
+    if(nodeId <= n) {
+        leaves = matrix(nodeId, rows=1, cols=1)
+    } else {
+        mergeIdx = nodeId - n
+        left = as.integer(as.scalar(hierarchy[mergeIdx,1]))
+        right = as.integer(as.scalar(hierarchy[mergeIdx,2]))
+        
+        leftLeaves = getLeafDescendants(hierarchy, n, left)
+        rightLeaves = getLeafDescendants(hierarchy, n, right)
+        
+        leaves = rbind(leftLeaves, rightLeaves)
+    }
+}
+
+extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weights, 
+                                  Integer n, Integer minClSize)
+    return (Matrix[Double] labels, Matrix[Double] stabilities)
+{
+    numMerges = n - 1 # hierarchical tree over n points has exactly n-1 merge events
+    numNodes = 2*n - 1 # total nodes in the dendogram
+    
+    # convert distances to lambda (density)
+    lambda = matrix(0, rows=numMerges, cols=1)
+    for(i in 1:numMerges) {
+        dist = as.scalar(hierarchy[i,3])
+        if(dist > 0) {
+            lambda[i,1] = 1.0 / dist
+        } else {
+            lambda[i,1] = 1e15
+        }
+    }
+    
+    lambda_birth = matrix(1e15, rows=numNodes, cols=1)
+    lambda_death = matrix(0, rows=numNodes, cols=1)
+    cluster_size = matrix(0, rows=numNodes, cols=1)
+
+    # initialize the leaf nodes to have cluster size 1
+    for(i in 1:n) {
+        cluster_size[i,1] = 1
+    }
+    
+    for(i in 1:numMerges) {
+        left = as.integer(as.scalar(hierarchy[i,1]))
+        right = as.integer(as.scalar(hierarchy[i,2]))
+        newId = n + i
+        merge_lambda = as.scalar(lambda[i,1])
+        
+        # cluster newId starts existing as its own cluster at this density level
+        # and that's why the children get their det set at the same density
+        lambda_birth[newId,1] = merge_lambda
+        lambda_death[left,1] = merge_lambda
+        lambda_death[right,1] = merge_lambda
+        cluster_size[newId,1] = as.scalar(cluster_size[left,1]) + as.scalar(cluster_size[right,1])
+    }
+    
+    # root cluster exists all the way
+    rootId = 2*n - 1
+    lambda_death[rootId,1] = 0
+    
+    # compute own stability for each internal node
+    # NOTE: If the cluster is big enough, we assign stability.
+    # The more long-lived it is (birth - death) and 
+    # the larger it is, the more stable it is.
+    stability = matrix(0, rows=numNodes, cols=1)
+    for(nodeId in (n+1):numNodes) {
+        size = as.scalar(cluster_size[nodeId,1])
+        birth = as.scalar(lambda_birth[nodeId,1])
+        death = as.scalar(lambda_death[nodeId,1])
+        if(size >= minClSize) {
+            stability[nodeId,1] = size * (birth - death)
+        }
+    }
+    
+    # compute subtree stability (best achievable from each subtree)
+    subtree_stability = matrix(0, rows=numNodes, cols=1)
+    
+    # leaf nodes have 0 subtree stability
+    for(i in 1:n) {
+        subtree_stability[i,1] = 0
+    }
+    
+    # process merges in order (bottom-up)
+    for(i in 1:numMerges) {
+        nodeId = n + i
+        left = as.integer(as.scalar(hierarchy[i,1]))
+        right = as.integer(as.scalar(hierarchy[i,2]))
+        
+        children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1])
+        own_stab = as.scalar(stability[nodeId,1])
+        
+        # Subtree stability is the best we can achieve from this subtree
+        if(children_subtree > own_stab) {
+            subtree_stability[nodeId,1] = children_subtree
+        } else {
+            subtree_stability[nodeId,1] = own_stab
+        }
+    }
+    
+    # select clusters
+    selected = matrix(0, rows=numNodes, cols=1)
+    selected[rootId,1] = 1
+    
+    i = numMerges
+    while(i >= 1) {
+        nodeId = n + i
+        
+        if(as.scalar(selected[nodeId,1]) == 1) {
+            left = as.integer(as.scalar(hierarchy[i,1]))
+            right = as.integer(as.scalar(hierarchy[i,2]))
+            
+            children_subtree = as.scalar(subtree_stability[left,1]) + as.scalar(subtree_stability[right,1])
+            own_stab = as.scalar(stability[nodeId,1])
+            parent_size = as.scalar(cluster_size[nodeId,1])
+            
+            # select children if they have higher subtree stability
+            if(parent_size < minClSize | children_subtree > own_stab) {
+                selected[nodeId,1] = 0
+                selected[left,1] = 1
+                selected[right,1] = 1
+            }
+        }
+        
+        i = i - 1
+    }
+    
+    # assign labels
+    labels = matrix(-1, rows=n, cols=1)
+    cluster_id = 1
+    
+    for(nodeId in 1:numNodes) {
+        if(as.scalar(selected[nodeId,1]) == 1) {
+            size = as.scalar(cluster_size[nodeId,1])
+            
+            if(size >= minClSize) {
+                leaves = getLeafDescendants(hierarchy, n, nodeId)
+                
+                for(j in 1:nrow(leaves)) {
+                    leafId = as.integer(as.scalar(leaves[j,1]))
+                    if(leafId >= 1 & leafId <= n) {
+                        labels[leafId,1] = cluster_id
+                    }
+                }
+                
+                cluster_id = cluster_id + 1
+            }
+        }
+    }
+    
+    stabilities = stability
+}
diff --git a/test_extract_stabe_clusters.dml b/test_extract_stabe_clusters.dml
new file mode 100644
index 00000000000..95f6b1e3d8b
--- /dev/null
+++ b/test_extract_stabe_clusters.dml
@@ -0,0 +1,136 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 6 point example with clear cluster structure
+# 1,2,3 form tight cluster A, 4,5,6 form tight cluster B, A and B are far apart (10)
+
+n = 6
+distances = matrix(10, rows=n, cols=n)
+
+# points 1,2,3 (cluster A)
+distances[1,2] = 1
+distances[2,1] = 1
+
+distances[1,3] = 2
+distances[3,1] = 2
+
+distances[2,3] = 1
+distances[3,2] = 1
+
+# points 4,5,6 (cluster B)
+distances[4,5] = 1
+distances[5,4] = 1
+
+distances[4,6] = 2
+distances[6,4] = 2
+
+distances[5,6] = 1
+distances[6,5] = 1
+
+# zero diagonal (to self)
+for(i in 1:n) {
+    distances[i,i] = 0
+}
+
+
+print("\nBuilding MST")
+expected_edges = matrix("2 1  3 2  6 3  5 6  4 5", rows=5, cols=2)
+expected_weights = matrix("1 1 10 1 1", rows=5, cols=1)
+[edges, weights] = hdb::buildMST(distances, n)
+edges_match = (min(edges == expected_edges) == 1)
+weights_match = (min(weights == expected_weights) == 1)
+if (edges_match) {
+    print("Pass: edges match.")
+} else {
+    print("Fail: edges don't match.")
+}
+if (weights_match) {
+    print("Pass: weights match.")
+} else {
+    print("Fail: weights don't match.")
+}
+print("MST edges:\n" + toString(edges))
+print("MST weights:\n " + toString(weights))
+
+
+print("\nBuilding hierarchy")
+[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n)
+expected_hierarchy = matrix("2 1 1  3 7 1  5 6 1  4 9 1  10 8 10", rows=5, cols=3)
+expected_sizes = matrix("2 3 2 3 6", rows=5, cols=1)
+hierachy_match = (min(hierarchy == expected_hierarchy) == 1)
+sizes_match = (min(sizes == expected_sizes) == 1)
+if (hierachy_match) {
+    print("Pass: hierachy mathes.")
+} else {
+    print("Fail: hierarchy doesn't match.")
+}
+if (sizes_match) {
+    print("Pass: sizes match.")
+} else {
+    print("Fail: sizes don't match.")
+}
+print("Hierarchy:\n" + toString(hierarchy))
+print("Sizes:\n" + toString(sizes))
+
+
+print("\nExtracting stable clusters with minClSize=2")
+[labels, stabilities] = hdb::extractStableClusters(hierarchy, weights, n, 2)
+expected_labels = matrix("1 1 1 2 2 2", rows=6, cols=1)
+expected_stabilites = matrix("0 0 0 0 0 0 0 2.7 0 2.7 0.6", rows=n*2-1, cols=1)
+labels_match = (min(labels == expected_labels) == 1)
+tolerance = 1e-10
+stabilities_match = max(abs(stabilities - expected_stabilites)) < tolerance
+if (labels_match) {
+    print("Pass: labels match.")
+} else {
+    print("Fail: labels don't match.")
+}
+if (stabilities_match) {
+    print("Pass: stabilities match.")
+} else {
+    print("Fail: stabilities don't match.")
+}
+print("Cluster labels:\n" + toString(labels))
+print("Top stabilities:\n" + toString(stabilities))
+
+
+
+# check results (we have some duplicate logic here, but anyway)
+num_clusters = max(labels)
+num_noise = sum(labels == -1)
+
+print("\nNumber of clusters found: " + num_clusters)
+print("Number of noise points: " + num_noise)
+
+# should find 2 clusters
+test1 = (num_clusters == 2)
+print("Found 2 clusters: " + test1)
+
+# no points should be noise
+test2 = (num_noise == 0)
+print("No noise points: " + test2)
+
+# points 1,2,3 should be in same cluster
+label1 = as.scalar(labels[1])
+label2 = as.scalar(labels[2])
+label3 = as.scalar(labels[3])
+test3 = (label1 == label2) & (label2 == label3) & (label1 > 0)
+print("points 1,2,3 in same cluster: " + test3)
+
+# points 4,5,6 should be in same
+label4 = as.scalar(labels[4])
+label5 = as.scalar(labels[5])
+label6 = as.scalar(labels[6])
+test4 = (label4 == label5) & (label5 == label6) & (label4 > 0)
+print("Points 4,5,6 in same cluster: " + test4)
+
+# clusters A and B should be different
+test5 = (label1 != label4)
+print("Two clusters are different: " + test5)
+
+test_pass = edges_match & weights_match & hierachy_match & sizes_match & labels_match & stabilities_match & test1 & test2 & test3 & test4 & test5
+
+if(test_pass) {
+    print("\nAll tests passed\n")
+} else {
+    print("\nTests failed\n")
+}
\ No newline at end of file

From 06cc04ce093ce2a10c779305303b01a569ee0676 Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Sun, 25 Jan 2026 16:41:36 +0100
Subject: [PATCH 5/7] Model builder

---
 scripts/builtin/hdbscan.dml |  78 +++++++++++++++++++---
 test_cluster_model.dml      | 129 ++++++++++++++++++++++++++++++++++++
 2 files changed, 197 insertions(+), 10 deletions(-)
 create mode 100644 test_cluster_model.dml

diff --git a/scripts/builtin/hdbscan.dml b/scripts/builtin/hdbscan.dml
index d1926dbb190..5d989574a99 100644
--- a/scripts/builtin/hdbscan.dml
+++ b/scripts/builtin/hdbscan.dml
@@ -68,13 +68,11 @@ m_hdbscan = function(Matrix[Double] X, Integer minPts = 5, Integer minClSize = -
 
     [hierarchy, clusterSizes] = buildHierarchy(mstEdges, mstWeights, n)
 
-    [clusterMems, stabilities] = extractStableClusters(hierarchy, mstWeights, n, minClSize)
-    
-    # TODO: build cluster model
-    
-    # temp  dummy values
-    clusterMems = matrix(1, rows=n, cols=1)
-    clusterModel = X
+    [clusterMems, stabilities, clusterToNode] = extractStableClusters(hierarchy, mstWeights, n, minClSize)
+
+    [centroids, clusterInfo] = buildClusterModel(X, clusterMems, stabilities, clusterToNode)
+
+    clusterModel = centroids
 }
 
 
@@ -133,8 +131,8 @@ buildMST = function(Matrix[Double] distances, Integer n)
   }
 }
 
-# Union-find utils
 
+# Union-find utils
 find = function(Matrix[Double] parent, Integer x)
     return (Integer root)
 {
@@ -144,6 +142,7 @@ find = function(Matrix[Double] parent, Integer x)
     }
 }
 
+
 union = function(Matrix[Double] parent, Matrix[Double] rank,
                 Integer x, Integer y, Matrix[Double] size,
                 Matrix[Double] compToNode, Integer newId)
@@ -243,6 +242,8 @@ buildHierarchy = function(Matrix[Double] edges, Matrix[Double] weights, Integer
     }
 }
 
+
+# Leaf descendants util
 getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeId)
     return (Matrix[Double] leaves)
 {
@@ -260,9 +261,10 @@ getLeafDescendants = function(Matrix[Double] hierarchy, Integer n, Integer nodeI
     }
 }
 
+
 extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weights, 
                                   Integer n, Integer minClSize)
-    return (Matrix[Double] labels, Matrix[Double] stabilities)
+    return (Matrix[Double] labels, Matrix[Double] stabilities, Matrix[Double] clusterToNode)
 {
     numMerges = n - 1 # hierarchical tree over n points has exactly n-1 merge events
     numNodes = 2*n - 1 # total nodes in the dendogram
@@ -374,12 +376,18 @@ extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weight
     # assign labels
     labels = matrix(-1, rows=n, cols=1)
     cluster_id = 1
-    
+
+    # while tracking which node each cluster comes from
+    maxClusters = sum(selected)  # upper bound on number of clusters
+    clusterToNode = matrix(0, rows=maxClusters, cols=1)
+   
     for(nodeId in 1:numNodes) {
         if(as.scalar(selected[nodeId,1]) == 1) {
             size = as.scalar(cluster_size[nodeId,1])
             
             if(size >= minClSize) {
+                clusterToNode[cluster_id,1] = nodeId  # ADD THIS LINE
+                
                 leaves = getLeafDescendants(hierarchy, n, nodeId)
                 
                 for(j in 1:nrow(leaves)) {
@@ -394,5 +402,55 @@ extractStableClusters = function(Matrix[Double] hierarchy, Matrix[Double] weight
         }
     }
     
+    # trim clusterToNode to actual number of clusters
+    numActualClusters = cluster_id - 1
+    if(numActualClusters > 0) {
+        clusterToNode = clusterToNode[1:numActualClusters,]
+    } else {
+        clusterToNode = matrix(0, rows=0, cols=1)
+    }
+
     stabilities = stability
+
 }
+
+
+buildClusterModel = function(Matrix[Double] X, Matrix[Double] labels, 
+                             Matrix[Double] stabilities, Matrix[Double] clusterToNode)
+    return (Matrix[Double] centroids, Matrix[Double] clusterInfo)
+{
+    n = nrow(X)
+    d = ncol(X)
+
+    numClusters = as.integer(max(labels))
+
+    if(numClusters <= 0) {
+        # No clusters found, return empty model
+        centroids = matrix(0, rows=0, cols=d)
+        clusterInfo = matrix(0, rows=0, cols=2)
+    } else {
+        centroids = matrix(0, rows=numClusters, cols=d)
+        clusterInfo = matrix(0, rows=numClusters, cols=2)
+
+        for(c in 1:numClusters) {
+            # find all points in cluster c
+            mask = (labels == c)
+            clusterSize = sum(mask)
+
+            if(clusterSize > 0) {
+                # get centroid as mean of all points in cluster
+                clusterSum = t(mask) %*% X
+                centroids[c,] = clusterSum / clusterSize
+
+                # store cluster metadata: [size, stability]
+                clusterInfo[c,1] = clusterSize
+
+                # get stability for this cluster from the mapping
+                if(nrow(clusterToNode) >= c & as.scalar(clusterToNode[c,1]) > 0) {
+                    nodeId = as.integer(as.scalar(clusterToNode[c,1]))
+                    clusterInfo[c,2] = as.scalar(stabilities[nodeId,1])
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/test_cluster_model.dml b/test_cluster_model.dml
new file mode 100644
index 00000000000..6bc37d89b37
--- /dev/null
+++ b/test_cluster_model.dml
@@ -0,0 +1,129 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# 3 clear clusters
+# A: points around (0, 0)
+# B: points around (10, 10)
+# C: points around (20, 0)
+
+n = 12
+d = 2
+X = matrix(0, rows=n, cols=d)
+
+# A
+X[1,] = matrix("0.0 0.0", rows=1, cols=2)
+X[2,] = matrix("0.5 0.5", rows=1, cols=2)
+X[3,] = matrix("-0.5 0.5", rows=1, cols=2)
+X[4,] = matrix("0.0 -0.5", rows=1, cols=2)
+
+# B
+X[5,] = matrix("10.0 10.0", rows=1, cols=2)
+X[6,] = matrix("10.5 10.5", rows=1, cols=2)
+X[7,] = matrix("9.5 10.5", rows=1, cols=2)
+X[8,] = matrix("10.0 9.5", rows=1, cols=2)
+
+# C
+X[9,] = matrix("20.0 0.0", rows=1, cols=2)
+X[10,] = matrix("20.5 0.5", rows=1, cols=2)
+X[11,] = matrix("19.5 0.5", rows=1, cols=2)
+X[12,] = matrix("20.0 -0.5", rows=1, cols=2)
+
+print(toString(X))
+
+
+# get distances
+distances = hdb::dist(X)
+
+
+# get core distances
+minPts = 3
+coreDistances = matrix(0, rows=n, cols=1)
+for(i in 1:n) {
+    kthDist = hdb::computeKthSmallest(t(distances[i,]), minPts)
+    coreDistances[i] = kthDist
+}
+
+
+# get mutual reachability
+mutualReach = hdb::computeMutualReachability(distances, coreDistances)
+
+
+# get MST
+[edges, weights] = hdb::buildMST(mutualReach, n)
+
+
+# get hierarchy
+[hierarchy, sizes] = hdb::buildHierarchy(edges, weights, n)
+
+
+# get stable clusters
+minClSize = 3
+[labels, stabilities, clusterToNode] = hdb::extractStableClusters(hierarchy, weights, n, minClSize)
+expected_labels = matrix("1 1 1 1 2 2 2 2 3 3 3 3", rows=12, cols=1)
+labels_match = (min(labels == expected_labels) == 1)
+if (labels_match) {
+    print("Pass: labels match.")
+} else {
+    print("Fail: labels don't match.")
+}
+print("Cluster labels:")
+print(toString(labels))
+
+
+# get cluster model
+[centroids, clusterInfo] = hdb::buildClusterModel(X, labels, stabilities, clusterToNode)
+
+print("\nCentroids:")
+print(toString(centroids))
+
+print("\nInfo [size, stability]:")
+print(toString(clusterInfo))
+
+
+# check results
+numClusters = nrow(centroids)
+print("\nNumber of clusters found: " + numClusters)
+
+
+# should find 3 clusters
+test1 = (numClusters == 3)
+print("Found 3 clusters: " + test1)
+
+
+# each cluster should have 4 points
+allSizesFour = TRUE
+for(c in 1:numClusters) {
+    size = as.scalar(clusterInfo[c,1])
+    allSizesFour = allSizesFour & (size == 4)
+}
+print("All clusters have size 4: " + allSizesFour)
+
+
+test_A = min(sqrt(rowSums((centroids - matrix("0 0.125", 1, 2))^2))) < 0.001
+test_B = min(sqrt(rowSums((centroids - matrix("10 10.125", 1, 2))^2))) < 0.001
+test_C = min(sqrt(rowSums((centroids - matrix("20 0.125", 1, 2))^2))) < 0.001
+all_found = test_A & test_B & test_C
+print("Expected centroids near expected positions: " + all_found)
+
+
+# no noise (all assigned to clusters)
+numNoise = sum(labels == -1)
+test4 = (numNoise == 0)
+print("No noise points: " + test4)
+
+
+# stabilities are populated (not 0)
+test_stability = TRUE
+for(c in 1:numClusters) {
+    stab = as.scalar(clusterInfo[c,2])
+    test_stability = test_stability & (stab > 0)
+}
+print("Stabilities populated: " + test_stability)
+
+
+test_pass = test1 & allSizesFour & all_found & test4 & test_stability
+
+if(test_pass) {
+    print("\nAll tests passed")
+} else {
+    print("\nTests failed")
+}
\ No newline at end of file

From 1f98db12fcd7be22412ee4bb5251092241b7d7dc Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Sun, 25 Jan 2026 16:50:13 +0100
Subject: [PATCH 6/7] Overall hdbscan test

---
 test_hdbscan.dml | 51 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 test_hdbscan.dml

diff --git a/test_hdbscan.dml b/test_hdbscan.dml
new file mode 100644
index 00000000000..b26aa983e5c
--- /dev/null
+++ b/test_hdbscan.dml
@@ -0,0 +1,51 @@
+source("scripts/builtin/hdbscan.dml") as hdb
+
+# A: 5 points tightly grouped around (0, 0)
+# B: 5 points tightly grouped around (10, 0)
+# Outliers: 2 points far from any cluster
+
+n = 12
+d = 2
+X = matrix(0, rows=n, cols=d)
+
+# A
+X[1,] = matrix("0.0 0.0", rows=1, cols=2)
+X[2,] = matrix("0.3 0.2", rows=1, cols=2)
+X[3,] = matrix("-0.2 0.3", rows=1, cols=2)
+X[4,] = matrix("0.1 -0.2", rows=1, cols=2)
+X[5,] = matrix("-0.1 -0.1", rows=1, cols=2)
+
+# B
+X[6,] = matrix("10.0 0.0", rows=1, cols=2)
+X[7,] = matrix("10.3 0.2", rows=1, cols=2)
+X[8,] = matrix("9.8 0.3", rows=1, cols=2)
+X[9,] = matrix("10.1 -0.2", rows=1, cols=2)
+X[10,] = matrix("9.9 -0.1", rows=1, cols=2)
+
+# Outliers
+X[11,] = matrix("5.0 5.0", rows=1, cols=2)
+X[12,] = matrix("-5.0 -5.0", rows=1, cols=2)
+
+print(toString(X))
+
+minPts = 3
+minClSize = 4  
+
+
+# run hdbscan
+[labels, centroids] = hdb::m_hdbscan(X, minPts, minClSize)
+expected_labels = matrix("1 1 1 1 1 2 2 2 2 2 -1 -1", rows=12, cols=1)
+labels_match = (min(labels == expected_labels) == 1)
+if (labels_match) {
+    print("Pass: labels match.")
+} else {
+    print("Fail: labels don't match.")
+}
+test_A = min(sqrt(rowSums((centroids - matrix("0.020 0.040", 1, 2))^2))) < 0.001
+test_B = min(sqrt(rowSums((centroids - matrix("10.020 0.040", 1, 2))^2))) < 0.001
+all_found = test_A & test_B
+print("Expected centroids near expected positions: " + all_found)
+print("Cluster labels:")
+print(toString(labels))
+print("\nCluster centroids:")
+print(toString(centroids))

From 3fd34345b931ef712c2235f5efac47c2670726b4 Mon Sep 17 00:00:00 2001
From: anuunchin <88698977+anuunchin@users.noreply.github.com>
Date: Sun, 25 Jan 2026 16:51:22 +0100
Subject: [PATCH 7/7] Remove wrong test location

---
 .../apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java   | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java

diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinHDBSCANTest.java
deleted file mode 100644
index e69de29bb2d..00000000000