From 7e8192a9bc929b32318795299e0bb882bbb0e6a0 Mon Sep 17 00:00:00 2001
From: Arno Bock
Date: Mon, 26 Jan 2026 21:04:14 +0100
Subject: [PATCH 1/6] [SYSTEMDS-3928] added disjoint subnet masking based on FC layers

---
 scripts/builtin/independentSubnetTrain.dml       | 462 ++++++++++++++++++
 .../org/apache/sysds/common/Builtins.java        |   1 +
 .../builtin/indSubnetTest_mnist_lenet.dml        | 186 +++++++
 3 files changed, 649 insertions(+)
 create mode 100644 scripts/builtin/independentSubnetTrain.dml
 create mode 100644 src/test/scripts/functions/builtin/indSubnetTest_mnist_lenet.dml

diff --git a/scripts/builtin/independentSubnetTrain.dml b/scripts/builtin/independentSubnetTrain.dml
new file mode 100644
index 00000000000..a77d3a98e42
--- /dev/null
+++ b/scripts/builtin/independentSubnetTrain.dml
@@ -0,0 +1,462 @@
+m_independentSubnetTrain = function(
+    list[unknown] model,
+    matrix[double] features,
+    matrix[double] labels,
+    matrix[double] val_features,
+    matrix[double] val_labels,
+    string upd,
+    string agg,
+    string mode,
+    string utype,
+    int epochs,
+    int batchsize,
+    int j,
+    int k,
+    string scheme,
+    list[unknown] hyperparams,
+    boolean verbose,
+    int paramsPerLayer,
+    list[int] fullyConnectedLayers
+)
+return (list[unknown] model_out_2)
+{
+  # ------------------------------------------------------------
+  # Setup
+  # ------------------------------------------------------------
+  print("Entered IST function.")
+  model_out = model
+
+  P = length(model)
+  print("Parameters in model:")
+  print(P)
+
+  N = nrow(features)
+  print("Samples:")
+  print(N)
+
+  print("Is model length NOT divisible by paramsPerLayer?")
+  print(P %% paramsPerLayer != 0)
+  if (P %% paramsPerLayer != 0) {
+    stop("Model length not divisible by paramsPerLayer")
+  }
+
+  print("Model length check passed.")
+  L = as.integer(P / paramsPerLayer)  # total layers
+  print("Layers:")
+  print(L)
+
+  # obtain indices of FC layers
+  fcLayers = fullyConnectedLayers
+  print("FC layers:")
+  print(toString(fcLayers))
+
+  # I. determine shared parameters
+  isSharedParam = matrix(0, 1, P)
+
+  # create mask for all parameters of FC layers
+  isFC = matrix(0, rows=1, cols=L)
+  for (i in 1:length(fcLayers)) {
+    idx = as.integer(as.scalar(fcLayers[i]))
+    isFC[1, idx] = 1
+  }
+  print(toString(isFC))
+
+  isFC_rep = isFC
+  for (r in 2:paramsPerLayer) {
+    isFC_rep = cbind(isFC_rep, isFC)
+  }
+  print(toString(isFC_rep))
+  if (ncol(isFC_rep) != P) {
+    stop("Dimension mismatch for FC layer mask.")
+  }
+
+  # 1. all non-FC layers are shared
+  isSharedParam = 1 - isFC_rep
+  print(toString(isSharedParam))
+
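+  # Worked example (illustrative values, not taken from the surrounding code):
+  # with paramsPerLayer=2, L=4 and fcLayers=(3,4), the model list is assumed
+  # to be grouped by parameter type, i.e. (W1,W2,W3,W4, b1,b2,b3,b4), so that
+  # parameter slot k of layer l sits at index (k-1)*L + l. Then
+  #   isFC          = [0, 0, 1, 1]
+  #   isFC_rep      = [0, 0, 1, 1, 0, 0, 1, 1]
+  #   isSharedParam = [1, 1, 0, 0, 1, 1, 0, 0]   (after step 1)
+  # and step 2 below re-marks selected FC biases as shared, e.g. the output
+  # bias at index (2-1)*L + L = 8.
+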
+  # 2. FC bias parameters are shared in: the output layer or at the end of an FC block
+  for (paramId in seq(2, paramsPerLayer, 2)) {  # iterate bias blocks only
+    for (l in 1:L) {
+      if (as.scalar(isFC[1,l])==1 & l==L) {
+        p_out_bias = (paramId - 1) * L + L  # output bias is shared across subnets
+        isSharedParam[1, p_out_bias] = 1
+      }
+      else if (as.scalar(isFC[1,l])==1 & l gradients collide; must be handled by aggregation logic
+# ------------------------------------------------------------
+
+ist_create_disjoint_masks = function(
+    list[unknown] model,
+    int numSubnets,
+    int L,                           # total layers including output layer
+    list[int] fullyConnectedLayers,  # the indices of FC layers, starting from 1
+    int paramsPerFCLayer,
+    Matrix[Double] isFC)
+  return (list[unknown] masks)
+{
+  P = length(model)
+  modus = 0  # {0: matrix / 1: list}
+
+  # SANITY CHECKS: ensure the provided model can be masked correctly
+  if (as.integer(P / paramsPerFCLayer) != L) {
+    stop("Layer/parameter mismatch. Please make sure each layer has the same number of parameters.")
+  }
+  if (paramsPerFCLayer < 2 | paramsPerFCLayer %% 2 != 0) {
+    stop("At least one W/b pair must be present, and all parameters must come in W/b pairs.")
+  }
+
+  # I.) initialize and preallocate masks
+  masks = list()
+  for (s in 1:numSubnets) {
+    masks = append(masks, model);
+    #for (k in 1:P) {  # TODO might be a problem?
+    #  masks[s][k] = matrix(0, rows=nrow(model[p]), cols=ncol(model[p]))
+    #}
+  }
+  print("Masks now has the following length:")
+  print(length(masks))
+
+  # II.) determine FC layers
+  print("Forwarded fully connected layer information:")
+  print(toString(isFC))
+
+  # TODO NEW START - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+  masks_new_meta = matrix(0, rows=length(model), cols=4)  # columns = [start, end, rows, cols]
+  current_position = 1
+  for (p in 1:length(model)) {
+    M = as.matrix(model[p])
+    param_length = ncol(M) * nrow(M)  # as.scalar(ncol(M)) * as.scalar(nrow(M))
+
+    masks_new_meta[p,1] = current_position
+    masks_new_meta[p,2] = current_position + param_length - 1
+    masks_new_meta[p,3] = nrow(M)
+    masks_new_meta[p,4] = ncol(M)
+
+    current_position = current_position + param_length
+  }
+  mask_size = current_position - 1
+
+  # All subnets in one matrix
+  masks_new = matrix(0, rows=numSubnets, cols=mask_size)
+
+  print(toString(masks_new_meta))
+
+  # TODO NEW END - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
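+
+  # Illustrative example (hypothetical shapes, not derived from the model above):
+  # if model[p] is a 784 x 200 weight matrix and the running offset is 1, then
+  #   masks_new_meta[p,] = [1, 156800, 784, 200]
+  # and the next parameter starts at column 156801. masks_new keeps one such
+  # flattened mask row per subnet, so the mask of parameter p for subnet s can
+  # be recovered by reshaping masks_new[s, start:end] back to rows x cols using
+  # the offsets stored in masks_new_meta[p,] (see the reshape in section III. below).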
+
+  # III.) iterate all layers
+  for (l in 1:L) {
+    if (as.scalar(isFC[1,l]) == 1) {
+      print("Entered fully connected layer. The layer is:")
+      print(l)
+
+      W = as.matrix(model[l])
+      print("W (rows/cols):")
+      print(nrow(W))
+      print(ncol(W))
+
+      b = as.matrix(model[l+L])
+      print("b:")
+      print(ncol(b))
+
+      H = ncol(W);  # number of neurons (bias entries) in layer l
+      print("H:")
+      print(H)
+
+      # SANITY CHECK
+      if (nrow(b) != 1 | ncol(b) != H) {
+        print("Bias shape mismatch!")
+        print("b: " + nrow(b) + " x " + ncol(b))
+        print("expected: 1 x " + H)
+        stop("Invalid bias shape")
+      }
+      if (l != L & numSubnets > ncol(b)) {  # TODO change to "next layer is non-FC" logic
+        print("More subnets than available neurons in layer:")
+        print(l)
+        stop("Please use a wider model or decrease the number of subnets.")
+      }
+
+      randyOrton2 = rand(rows=H, cols=1)  # random keys for shuffling
+      allNeuronIndicesRandom = order(target=randyOrton2, by=1, decreasing=FALSE, index.return=TRUE)  # shuffled neuron indices
+      print("Length of random sample indices:")
+      print(length(allNeuronIndicesRandom))
+
+      # TODO FROM HERE ...
+      chunk_size = floor(H/numSubnets)  # number of neurons each subnet owns at least; TODO for l=L they are shared, so this value will be quite low or even below 1
+      remaining_neurons = H - chunk_size * numSubnets
+      print("Dividing all hidden layer neurons by the number of subnets, each subnet will own at least:")
+      print(chunk_size)
+      print("The following number of neurons remains and will be randomly assigned to the subnets (at most one per subnet):")
+      print(remaining_neurons)
+
+      amount_active_neurons = matrix(chunk_size, rows=numSubnets, cols=1)
+      if (remaining_neurons > 0) {
+        randomSubnetIndices = order(target=rand(rows=numSubnets, cols=1, seed=-1), by=1, decreasing=FALSE, index.return=TRUE)  # TODO replace seed for experiments
+        for (i in 1:remaining_neurons) {
+          sid = as.integer(as.scalar(randomSubnetIndices[i,1]))
+          amount_active_neurons[sid,1] = as.scalar(amount_active_neurons[sid,1]) + 1  # TODO use pmin()
+        }
+      }
+      print("Number of active neurons per subnet:")
+      print(toString(amount_active_neurons))
+
+      # e.g. (hypothetical values): H=10 and numSubnets=3 give chunk_size=3 with one
+      # remaining neuron assigned randomly; amount_active_neurons could be (4,3,3),
+      # giving end indices (4,7,10) and start indices (1,5,8).
+      neuron_end_indices = cumsum(amount_active_neurons)
+      neuron_start_indices = neuron_end_indices - amount_active_neurons + 1
+      print("Start indices of each subnet:")
+      print(toString(neuron_start_indices))
+
+      for (s in 1:numSubnets) {
+        print("Entered subnet:")
+        print(s)
+
+        # A. obtain owned neurons for this layer
+        start = as.integer(as.scalar(neuron_start_indices[s,1]))
+        end = as.integer(as.scalar(neuron_end_indices[s,1]))
+        current_b_indices = allNeuronIndicesRandom[start:end, 1]
+        print(length(current_b_indices))
+        # TODO ... UNTIL HERE: only required for else case (in bias case)
+
+        # B. create masked bias
+        if (l==L) {  # output layer
+          masked_b = matrix(1, rows=1, cols=ncol(b))
+        }
+        else if (l1 & as.scalar(isFC[1, l-1])==0) {  # previous layer is not FC  TODO works with || operator?
+          for (i in 1:nrow(current_b_indices)) {  # TODO vectorized possible?
+            idx = as.integer(as.scalar(current_b_indices[i,1]))
+            masked_W[1:nrow(W), idx] = matrix(1, rows=nrow(W), cols=1)
+          }
+        }
+        else {
+          # obtain active neurons of previous layer
+          p = L + (l-1)  # TODO investigate
+
+          if (modus==1) {
+            previous_masked_b_list = as.list(masks[s])
+            previous_masked_b = as.matrix(previous_masked_b_list[p])
+            print(toString(previous_masked_b))
+          } else {
+            start = as.integer(as.scalar(masks_new_meta[p,1]))
+            end = as.integer(as.scalar(masks_new_meta[p,2]))
+            r = as.integer(as.scalar(masks_new_meta[p,3]))
+            c = as.integer(as.scalar(masks_new_meta[p,4]))
+            print("The following info was obtained:")
+            print(start)
+            print(end)
+            print(r)
+            print(c)
+
+            vec = masks_new[s, start:end]
+            previous_masked_b = matrix(vec, rows=r, cols=c, byrow=TRUE)  # TODO might be unnecessary
+            print("Previous bias:")
+            print(toString(previous_masked_b))
+          }
+
+          # SANITY CHECK: dimensions match the previous layer
+          if (l > 1 & ncol(previous_masked_b) != nrow(W)) {
+            print("W/prev layer mismatch in layer l=" + l)
+            print("prev_b: " + nrow(previous_masked_b) + " x " + ncol(previous_masked_b))
+            print("W: " + nrow(W) + " x " + ncol(W))
+            stop("Invalid W shape wrt previous layer")
+          }
+
+          if (nrow(previous_masked_b)==1) { previous_masked_b = t(previous_masked_b) }
+          if (ncol(masked_b) == 1) { masked_b = t(masked_b) }
+
+          if (l==L) {  # output layer
+            masked_W = previous_masked_b %*% matrix(1, 1, ncol(masked_W))
+          }
+          else if (l