diff --git a/.github/workflows/ASan.yml b/.github/workflows/ASan.yml index 90a74a66..7c0b0a95 100644 --- a/.github/workflows/ASan.yml +++ b/.github/workflows/ASan.yml @@ -55,8 +55,9 @@ jobs: run: | export LD_PRELOAD=$(gcc -print-file-name=libasan.so) - echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" > src/Makevars - echo "PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars + # Replace PKG_CXXFLAGS in-place; preserve PKG_CPPFLAGS and PKG_LIBS + sed -i 's/^PKG_CXXFLAGS.*/PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer/' src/Makevars + echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars mkdir ~/.R echo "LDFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> ~/.R/Makevars diff --git a/.positai/settings.json b/.positai/settings.json index 7aba4f95..acec4998 100644 --- a/.positai/settings.json +++ b/.positai/settings.json @@ -28,6 +28,17 @@ "rm -f src/*.o src/*.dll": "allow", "tail *": "allow", "wc *": "allow" + }, + "external_directory": { + "C:/Users/pjjg18/GitHub/TreeSearch-a/*": "allow", + "C:/Users/pjjg18/.positai/skills/r-package-profiling/references/*": "allow", + "../Quartet/*": "allow", + "../TreeTools/*": "allow" + }, + "webfetch": { + "https://github.com/*": "allow", + "https://api.github.com/*": "allow", + "https://raw.githubusercontent.com/*": "allow" } }, "model": { diff --git a/AGENTS.md b/AGENTS.md index 7ed0028b..6d81e883 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -257,6 +257,20 @@ source("benchmark/bench-tree-distances.R") C++ compilation flags are controlled by `src/Makevars.win` (Windows) / `src/Makevars`. The package requires **C++17** (`CXX_STD = CXX17`). 
+### Documentation and spelling checks + +After any work that adds or modifies roxygen comments, Rd files, NEWS.md, or +vignettes, run: + +```r +devtools::check_man() # regenerates Rd files and checks for issues +spelling::spell_check_package() # flags potential misspellings +``` + +Legitimate technical terms, proper nouns, and code identifiers flagged by the +spell checker should be added to `inst/WORDLIST` (one word per line, +alphabetically sorted). Only fix actual typos in the source. + ### Code coverage Check that existing tests cover all new code. The GHA test suite uses codecov. @@ -271,6 +285,26 @@ covr::file_coverage(cov, "src/pairwise_distances.cpp") # per-file summary Aim for full line coverage of new C++ and R code. If a new code path is not exercised by the existing test suite, add targeted tests in `tests/testthat/`. +### ⚠ Exploratory / risky R code — use a subprocess + +When running experimental R code that may be slow, allocate large objects, +or involve complex loops (e.g., hill-climbing over tree space, brute-force +evaluation of many candidate trees), **run it in a subprocess** rather than +the interactive RStudio session. Long-running or memory-heavy computations +in the main session can freeze or crash RStudio. + +```r +# Write the experiment to a temp script, then run via Rscript: +writeLines(code, tmp <- tempfile(fileext = ".R")) +system2("Rscript", tmp, stdout = TRUE, stderr = TRUE) + +# Or use callr for structured subprocess execution: +callr::r(function() { ... }, timeout = 120) +``` + +This applies especially to prototype algorithm exploration (e.g., CID +hill-climbing over split space) where per-iteration cost is uncertain. + --- ## Completed Optimizations (this dev cycle) @@ -1143,6 +1177,52 @@ for the cross-pairs case. 
--- +## LinkingTo Header Exposure (`expose-lapjv` branch) + +Extracted LAP and MCI C++ APIs into `inst/include/TreeDist/` headers so that +downstream packages (e.g., TreeSearch) can use `LinkingTo: TreeDist`: + +| Header | Content | +|--------|---------| +| `types.h` | `cost`, `lap_dim`, `lap_row`, `lap_col`, constants | +| `cost_matrix.h` | `CostMatrix` class (Rcpp-free) | +| `lap_scratch.h` | `LapScratch` struct | +| `lap.h` | `lap()` declarations | +| `lap_impl.h` | `lap()` implementation (include in exactly one TU) | +| `mutual_clustering.h` | MCI declarations | +| `mutual_clustering_impl.h` | MCI implementation (include in exactly one TU) | + +`src/lap.h` is now a thin wrapper that includes `` and +re-exports types to global scope. + +### LAPJV codegen regression (diagnosed, partially mitigated) + +Including `lap_impl.h` in `lap.cpp` changed the TU context enough for GCC 14's +register allocator to produce ~8% more instructions in the Dijkstra hot loop, +causing a consistent 20–25% regression on standalone LAPJV (n ≥ 400). + +**Root cause:** the installed-header version of `CostMatrix` (in +`inst/include/TreeDist/cost_matrix.h`) has a different method set than main's +monolithic `src/lap.h` (extra methods like `setWithTranspose()`, `dim8()`; +missing test variants like `findRowSubminNaive`). This changes GCC's +optimization heuristics for the entire TU, even though `lap()` never calls +the differing methods. + +**Fix:** define `lap()` directly in `lap.cpp` (not via `#include `) +with `__attribute__((optimize("align-functions=64", "align-loops=16")))`. +The `lapjv()` wrapper fills the transposed buffer first (matching R's +column-major storage) then untransposes — restoring the cache-friendly pattern. + +**Residual:** ~5–9% on LAPJV 1999×1999 vs main (from the CostMatrix class +definition difference). Tree distance metrics are unaffected — they call +`lap()` from `pairwise_distances.cpp`, whose TU context is unchanged. 
+ +**Maintenance note:** if the `lap()` algorithm changes, update BOTH `src/lap.cpp` +and `inst/include/TreeDist/lap_impl.h`. The duplication is intentional — it +preserves the TU context that was profiled and tuned. + +--- + ## Remaining Optimization Opportunities - Sort+merge pre-scan for `rf_info_score`: **DONE** — replaced O(n²) loop with diff --git a/NAMESPACE b/NAMESPACE index 60386873..9516b984 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -141,6 +141,10 @@ export(StrainCol) export(SumOfRanges) export(SumOfVariances) export(SumOfVars) +export(TransferConsensus) +export(TransferDist) +export(TransferDistSplits) +export(TransferDistance) export(TreeDistPlot) export(TreeDistance) export(TreesConsistentWithTwoSplits) @@ -152,6 +156,7 @@ export(entropy_int) export(is.HPart) importFrom(Rdpack,reprompt) importFrom(TreeTools,AllAncestors) +importFrom(TreeTools,Consensus) importFrom(TreeTools,DropTip) importFrom(TreeTools,FirstMatchingSplit) importFrom(TreeTools,KeepTip) @@ -179,6 +184,7 @@ importFrom(TreeTools,RootTree) importFrom(TreeTools,SplitFrequency) importFrom(TreeTools,SplitInformation) importFrom(TreeTools,SplitsInBinaryTree) +importFrom(TreeTools,StarTree) importFrom(TreeTools,TipLabels) importFrom(TreeTools,TipsInSplits) importFrom(TreeTools,TreeIsRooted) diff --git a/NEWS.md b/NEWS.md index 8fc11b03..082fec34 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,23 @@ + +# Branch: `expose-lapjv` → merge target: `master` + +This branch exposes the LAPJV (Jonker–Volgenant) linear assignment solver +via C++ headers so that downstream packages can link to it (`LinkingTo`), +and implements a transfer consensus method (`TransferConsensus()`, +`TransferDist()`). It also tightens error handling by moving all +`Rcpp::stop()` / `Rf_error()` calls out of C++ implementation code +into R-level input validation. 
+ +# TreeDist 2.13.0.9000 + +This branch implements transfer consensus trees (`TransferConsensus()`) and +a corresponding transfer distance metric (`TransferDist()`), providing a +consensus method based on the transfer distance rather than the majority-rule +or strict approaches. Error handling in C++ code is tightened to validate +inputs at the R level rather than using `Rcpp::stop()` or `Rf_error()` in +implementation code. + + # TreeDist 2.13.0 (2026-03-17) ## New features @@ -68,7 +88,7 @@ Typical speedups over v2.12.0 for tree sets where most splits are shared - Cross-pairs computations (`tree1` vs `tree2` where both are lists) now use the same optimized batch path as all-pairs computations. -### KendallColijn distance +### Kendall & Colijn distance - `KCVector()` reimplemented in C++, giving ~220× speedup per tree. diff --git a/R/RcppExports.R b/R/RcppExports.R index 140dd0e9..9e4d02b2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -200,6 +200,77 @@ spr_table_7 <- function(sp1, sp2) { .Call(`_TreeDist_spr_table_7`, sp1, sp2) } +#' Transfer consensus (C++ implementation) +#' +#' @param splits_list List of raw matrices (one per tree), each from as.Splits(). +#' @param n_tip Number of tips. +#' @param scale Logical: use scaled transfer distance? +#' @param greedy_best_flag Logical: TRUE for "best", FALSE for "first". +#' @param init_majority Logical: TRUE to start from majority-rule splits. +#' +#' @return A `LogicalVector` of length n_splits indicating which pooled splits +#' are included in the consensus, plus attributes "raw_splits" (a raw matrix +#' of all unique splits) and "light_side" (integer vector). 
+#' @keywords internal +cpp_transfer_consensus <- function(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_threads = 1L) { + .Call(`_TreeDist_cpp_transfer_consensus`, splits_list, n_tip, scale, greedy_best_flag, init_majority, n_threads) +} + +#' @keywords internal +cpp_tc_profile <- function(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_iter, n_threads = 1L) { + .Call(`_TreeDist_cpp_tc_profile`, splits_list, n_tip, scale, greedy_best_flag, init_majority, n_iter, n_threads) +} + +#' Per-pair transfer dissimilarity +#' +#' @param x,y Raw matrices representing splits (from as.Splits()). +#' @param nTip Integer: number of tips. +#' +#' @return A list with components: +#' - score_scaled: scaled transfer dissimilarity (double) +#' - score_unscaled: unscaled transfer dissimilarity (double) +#' - `matching_xy`: integer vector, best match in y for each split in x (1-based, NA if sentinel) +#' - `matching_yx`: integer vector, best match in x for each split in y (1-based, NA if sentinel) +#' @keywords internal +cpp_transfer_dist <- function(x, y, nTip) { + .Call(`_TreeDist_cpp_transfer_dist`, x, y, nTip) +} + +#' @keywords internal +cpp_transfer_dist_scored <- function(x, y, nTip, scale) { + .Call(`_TreeDist_cpp_transfer_dist_scored`, x, y, nTip, scale) +} + +#' All-pairs transfer dissimilarity (OpenMP) +#' +#' @param splits_list List of raw matrices (one per tree). +#' @param n_tip Number of tips. +#' @param scale Logical: use scaled transfer dissimilarity? +#' @param n_threads Number of OpenMP threads. +#' +#' @return Numeric vector of length choose(N,2) in dist order. +#' @keywords internal +cpp_transfer_dist_all_pairs <- function(splits_list, n_tip, scale, n_threads) { + .Call(`_TreeDist_cpp_transfer_dist_all_pairs`, splits_list, n_tip, scale, n_threads) +} + +#' Cross-pairs transfer dissimilarity (OpenMP) +#' +#' @param splits_a,splits_b Lists of raw matrices. +#' @param n_tip Number of tips. 
+#' @param scale Logical: use scaled transfer dissimilarity? +#' @param n_threads Number of OpenMP threads. +#' +#' @return Numeric matrix of dimension `nA` x `nB`. +#' @keywords internal +cpp_transfer_dist_cross_pairs <- function(splits_a, splits_b, n_tip, scale, n_threads) { + .Call(`_TreeDist_cpp_transfer_dist_cross_pairs`, splits_a, splits_b, n_tip, scale, n_threads) +} + +cpp_mci_impl_score <- function(x, y, n_tips) { + .Call(`_TreeDist_cpp_mci_impl_score`, x, y, n_tips) +} + cpp_robinson_foulds_distance <- function(x, y, nTip) { .Call(`_TreeDist_cpp_robinson_foulds_distance`, x, y, nTip) } diff --git a/R/hierarchical_mutual_information.R b/R/hierarchical_mutual_information.R index 06948acd..862d5c93 100644 --- a/R/hierarchical_mutual_information.R +++ b/R/hierarchical_mutual_information.R @@ -66,6 +66,10 @@ HierarchicalMutualInfo <- function(tree1, tree2 = NULL, normalize = FALSE) { 1 } } else { + if (inherits(tree1, "phylo") && inherits(tree2, "phylo") && + NTip(tree1) != NTip(tree2)) { + stop("Trees must have the same number of leaves") + } hp2 <- as.HPart(tree2, tree1) hmi <- HMI_xptr(hp1, hp2) if (isFALSE(normalize)) { @@ -132,6 +136,8 @@ HH <- SelfHMI #' @rdname HierarchicalMutualInfo #' @export EHMI <- function(tree1, tree2, precision = 0.01, minResample = 36) { + if (minResample < 2L) stop("Must perform at least one resampling") + if (precision < 1e-8) stop("Tolerance too low") EHMI_xptr(as.HPart(tree1), as.HPart(tree2), as.numeric(precision), as.integer(minResample)) / log(2) } @@ -162,6 +168,8 @@ EHMI <- function(tree1, tree2, precision = 0.01, minResample = 36) { #' @rdname HierarchicalMutualInfo #' @export AHMI <- function(tree1, tree2, Mean = max, precision = 0.01, minResample = 36) { + if (minResample < 2L) stop("Must perform at least one resampling") + if (precision < 1e-8) stop("Tolerance too low") hp1 <- as.HPart(tree1) hp2 <- as.HPart(tree2, hp1) diff --git a/R/transfer_consensus.R b/R/transfer_consensus.R new file mode 100644 index 
00000000..384cff0d --- /dev/null +++ b/R/transfer_consensus.R @@ -0,0 +1,550 @@ +#' Consensus tree minimizing transfer distance +#' +#' Construct a consensus tree that minimizes the sum of transfer distances +#' to a set of input trees, using a greedy add-and-prune heuristic. +#' +#' Unlike the majority-rule consensus, which minimizes Robinson-Foulds +#' distance and can be highly unresolved when phylogenetic signal is low, +#' `TransferConsensus()` uses the finer-grained transfer distance +#' \insertCite{Lemoine2018}{TreeDist} to construct a more resolved consensus +#' tree. +#' +#' The algorithm pools all splits observed across input trees, computes +#' pairwise transfer distances between them, and greedily adds or removes +#' splits to minimize total transfer dissimilarity cost. The approach +#' follows \insertCite{Takazawa2026;textual}{TreeDist}, reimplemented for +#' 'TreeDist' infrastructure. +#' +#' @param trees An object of class `multiPhylo`: the input trees. +#' All trees must share the same tip labels. +#' @param scale Logical; if `TRUE` (default), use the scaled transfer +#' distance (normalized by light-side size minus one). If `FALSE`, use +#' the unscaled (raw Hamming) transfer distance. +#' @param greedy Character string specifying the greedy strategy: +#' `"best"` (default) picks the single highest-benefit action at each step; +#' `"first"` picks the first improving action encountered (faster but +#' potentially lower quality). +#' @param init Character string specifying the initial consensus: +#' `"empty"` (default) starts with no splits (purely additive); +#' `"majority"` starts with the majority-rule consensus and refines. +#' +#' @return A tree of class `phylo`. 
+#' +#' @references +#' \insertAllCited{} +#' +#' @examples +#' library(TreeTools) +#' # Generate bootstrap-like trees +#' trees <- as.phylo(1:20, nTip = 12) +#' +#' # Transfer consensus (more resolved than majority rule) +#' tc <- TransferConsensus(trees) +#' plot(tc) +#' +#' # Compare resolution +#' mr <- Consensus(trees, p = 0.5) +#' cat("Majority-rule splits:", NSplits(mr), "\n") +#' cat("Transfer consensus splits:", NSplits(tc), "\n") +#' +#' @importFrom TreeTools as.Splits TipLabels NSplits Consensus StarTree +#' @export +TransferConsensus <- function(trees, + scale = TRUE, + greedy = c("best", "first"), + init = c("empty", "majority")) { + greedy <- match.arg(greedy) + init <- match.arg(init) + + if (!inherits(trees, "multiPhylo")) { + stop("`trees` must be an object of class 'multiPhylo'.") + } + nTree <- length(trees) + if (nTree < 2L) stop("Need at least 2 trees.") + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + + # Convert each tree to a raw split matrix (TreeTools C++ internally) + splitsList <- lapply(trees, function(tr) { + sp <- as.Splits(tr, tipLabels) + unclass(sp) + }) + + # Delegate all work to C++ + nThreads <- max(1L, getOption("TreeDist.threads", + getOption("mc.cores", 1L))) + res <- cpp_transfer_consensus( + splitsList, nTip, scale, + greedy_best_flag = (greedy == "best"), + init_majority = (init == "majority"), + n_threads = nThreads + ) + + included <- res$included + if (!any(included)) { + return(StarTree(tipLabels)) + } + + rawSplits <- res$raw_splits[included, , drop = FALSE] + sp <- structure(rawSplits, nTip = nTip, tip.label = tipLabels, + class = "Splits") + as.phylo(sp) +} + + +# =========================================================================== +# Internal helpers +# =========================================================================== + +# popcount lookup for 0:255 +.POPCOUNT <- as.integer(sapply(0:255, function(x) sum(as.integer(intToBits(x))))) + +#' Pool unique splits, returning an integer 
(logical) matrix +#' @noRd +.PoolSplits <- function(trees, tipLabels) { + nTip <- length(tipLabels) + nTree <- length(trees) + + # Collect all splits as logical matrices and hash to unique set + env <- new.env(hash = TRUE, parent = emptyenv()) + splitsList <- list() + rawSplitsList <- list() + countVec <- integer(0) + treeMembers <- vector("list", nTree) + nextIdx <- 1L + + for (i in seq_len(nTree)) { + sp <- as.Splits(trees[[i]], tipLabels) + logMat <- as.logical(sp) # matrix: nSplits x nTip + # The Splits object is a raw matrix internally; preserve structure + rawMat <- unclass(sp) + if (is.null(dim(logMat))) { + logMat <- matrix(logMat, nrow = 1) + rawMat <- matrix(rawMat, nrow = 1) + } + nSp <- nrow(logMat) + # Canonicalise: ensure tip 1 is FALSE + nSp <- nrow(logMat) + members <- integer(nSp) + for (j in seq_len(nSp)) { + row <- logMat[j, ] + rawRow <- rawMat[j, ] + if (row[1]) { + row <- !row + rawRow <- .FlipRaw(rawRow, nTip) + } + key <- paste0(which(row), collapse = ",") + idx <- env[[key]] + if (is.null(idx)) { + env[[key]] <- nextIdx + splitsList[[nextIdx]] <- row + rawSplitsList[[nextIdx]] <- rawRow + countVec[nextIdx] <- 1L + members[j] <- nextIdx + nextIdx <- nextIdx + 1L + } else { + countVec[idx] <- countVec[idx] + 1L + members[j] <- idx + } + } + treeMembers[[i]] <- unique(members) + } + + nSplits <- length(splitsList) + splitMat <- matrix(FALSE, nrow = nSplits, ncol = nTip) + nBytes <- length(rawSplitsList[[1]]) + rawMat <- matrix(as.raw(0), nrow = nSplits, ncol = nBytes) + for (k in seq_len(nSplits)) { + splitMat[k, ] <- splitsList[[k]] + rawMat[k, ] <- rawSplitsList[[k]] + } + + lightSide <- pmin(rowSums(splitMat), nTip - rowSums(splitMat)) + + list( + splits = splitMat, # logical matrix nSplits x nTip + rawSplits = rawMat, # raw matrix nSplits x nBytes + counts = countVec, + lightSide = as.integer(lightSide), + treeMembers = treeMembers + ) +} + + +.FlipRaw <- function(rawVec, nTip) { + nBytes <- length(rawVec) + usedBits <- ((nTip - 1L) %% 8L) 
+ 1L + lastMask <- as.raw(sum(2^(0:(usedBits - 1L)))) + out <- xor(rawVec, as.raw(0xff)) + out[nBytes] <- out[nBytes] & lastMask + out +} + + +#' Pairwise transfer distance matrix using logical split matrix +#' Transfer distance = min(hamming(a, b), n - hamming(a, b)) +#' hamming(a,b) when both are logical = sum(xor(a,b)) +#' @noRd +.TransferDistMat <- function(splitMat, nTip) { + # splitMat is logical: nSplits x nTip + # hamming = number of differing positions + # Use tcrossprod trick: hamming(a,b) = sum(a) + sum(b) - 2*sum(a&b) + # = nAgreeing... actually let's just compute XOR directly. + # Faster: hamming = nTip - 2 * (a %*% t(b) + (1-a) %*% t(1-b)) + # = nTip - 2 * (a %*% t(b) + (nTip - rowSums(a) - colSums(b) + a %*% t(b))) + # Simpler: agreement = a %*% t(b) + (1-a) %*% t(1-b) + # = 2 * (a %*% t(b)) - rowSums(a) - rep(rowSums(b)) + nTip + # hamming = nTip - agreement + sm <- splitMat + 0L # convert to integer + ab <- tcrossprod(sm) # sm %*% t(sm) + rs <- rowSums(sm) + hamming <- nTip - 2L * ab + outer(rs, rs, "+") - nTip + # Simplifies to: hamming = outer(rs, rs, "+") - 2 * ab + # Wait, let me re-derive: + # agreement_ij = sum(a_i == b_j) = sum(a&b) + sum(!a & !b) + # = ab[i,j] + (nTip - rs[i] - rs[j] + ab[i,j]) + # = 2*ab[i,j] + nTip - rs[i] - rs[j] + # hamming_ij = nTip - agreement_ij = rs[i] + rs[j] - 2*ab[i,j] + hamming <- outer(rs, rs, "+") - 2L * ab + + # Transfer distance = min(hamming, nTip - hamming) + pmin(hamming, nTip - hamming) +} + + +#' Compute TD (transfer dissimilarity cost) for each split +#' @noRd +.ComputeTD <- function(DIST, sentDist, treeMembers, lightSide, nTree, scale) { + nSplits <- nrow(DIST) + TD <- numeric(nSplits) + pMinus1 <- lightSide - 1L + + for (i in seq_len(nTree)) { + idx <- treeMembers[[i]] + # For each split b, min distance to any split in tree i + if (length(idx) == 1L) { + minD <- DIST[, idx] + } else { + minD <- apply(DIST[, idx, drop = FALSE], 1, min) + } + # Also consider sentinel distance + minD <- pmin(minD, 
sentDist) + if (scale) { + TD <- TD + pmin(minD / pMinus1, 1) + } else { + TD <- TD + pmin(minD, pMinus1) + } + } + TD +} + + +#' Compatibility matrix via vectorized bipartition check +#' @noRd +.CompatMat <- function(splitMat, nTip) { + # Two splits compatible iff one of the 4 intersections (A&B, A&~B, ~A&B, ~A&~B) is empty + sm <- splitMat + 0L # integer 0/1 matrix + notSm <- 1L - sm + + # A&B: a[i,] & b[j,] -> any nonzero -> tcrossprod(sm, sm) > 0 + ab <- tcrossprod(sm, sm) > 0L + anb <- tcrossprod(sm, notSm) > 0L + nab <- tcrossprod(notSm, sm) > 0L + nanb <- tcrossprod(notSm, notSm) > 0L + + # Compatible if at least one intersection is empty + !ab | !anb | !nab | !nanb +} + + +#' Initialize MATCH/MATCH2 from currently included splits +#' @noRd +.InitMatches <- function(st, DIST, sentDist, lightSide, scale) { + nSplits <- length(st$MATCH) + incIdx <- which(st$incl) + pMinus1 <- lightSide - 1L + + if (length(incIdx) == 0L) return(invisible()) + + # For each split b, find 1st and 2nd closest among included + subDIST <- DIST[, incIdx, drop = FALSE] + for (b in seq_len(nSplits)) { + dists <- subDIST[b, ] + ord <- order(dists) + bestDist <- dists[ord[1]] + thresh <- pMinus1[b] + if (scale && bestDist / thresh >= 1) next + if (!scale && bestDist >= thresh) next + st$MATCH[b] <- incIdx[ord[1]] + if (length(ord) > 1L) { + secDist <- dists[ord[2]] + if (scale && secDist / thresh < 1) { + st$MATCH2[b] <- incIdx[ord[2]] + } else if (!scale && secDist < thresh) { + st$MATCH2[b] <- incIdx[ord[2]] + } + } + } +} + + +#' Get distance from split b to its match (NA = sentinel) +#' @noRd +.Dist <- function(b, idx, DIST, sentDist) { + if (is.na(idx)) sentDist[b] else DIST[b, idx] +} + + +#' Vectorized add-benefit: returns benefit for each candidate +#' @noRd +.AddBenefitVec <- function(candidates, st, DIST, sentDist, TD, counts, + lightSide, scale) { + nSplits <- length(st$MATCH) + pMinus1 <- lightSide - 1L + + # Current match distances for all splits + matchDist <- 
ifelse(is.na(st$MATCH), sentDist, .DiagDist(st$MATCH, DIST, sentDist)) + + benefits <- numeric(length(candidates)) + for (ci in seq_along(candidates)) { + c <- candidates[ci] + newDist <- DIST[, c] + if (scale) { + diff <- (matchDist - newDist) / pMinus1 + } else { + diff <- matchDist - newDist + } + diff[diff < 0] <- 0 + benefits[ci] <- sum(diff * counts) - TD[c] + } + benefits +} + +#' Helper: get DIST\[b, MATCH\[b\]\] for all b, vectorized +#' @noRd +.DiagDist <- function(matchVec, DIST, sentDist) { + n <- length(matchVec) + out <- numeric(n) + notNA <- !is.na(matchVec) + if (any(notNA)) { + out[notNA] <- DIST[cbind(which(notNA), matchVec[notNA])] + } + out[!notNA] <- sentDist[!notNA] + out +} + + +#' Vectorized remove-benefit +#' @noRd +.RemoveBenefitVec <- function(candidates, st, DIST, sentDist, TD, counts, + lightSide, scale) { + nSplits <- length(st$MATCH) + pMinus1 <- lightSide - 1L + + # For remove, only splits whose MATCH == candidate are affected + benefits <- numeric(length(candidates)) + matchDist <- .DiagDist(st$MATCH, DIST, sentDist) + match2Dist <- .DiagDist(st$MATCH2, DIST, sentDist) + + for (ci in seq_along(candidates)) { + c <- candidates[ci] + affected <- st$MATCH == c & !is.na(st$MATCH) + if (any(affected)) { + if (scale) { + fn_cost <- sum((DIST[affected, c] - match2Dist[affected]) / + pMinus1[affected] * counts[affected]) + } else { + fn_cost <- sum((DIST[affected, c] - match2Dist[affected]) * + counts[affected]) + } + } else { + fn_cost <- 0 + } + benefits[ci] <- TD[c] + fn_cost + } + benefits +} + + +.DoAdd <- function(branchIdx, st, DIST, sentDist) { + st$incl[branchIdx] <- TRUE + nSplits <- length(st$MATCH) + + curMatchDist <- .DiagDist(st$MATCH, DIST, sentDist) + newDist <- DIST[, branchIdx] + + better <- newDist < curMatchDist + if (any(better)) { + st$MATCH2[better] <- st$MATCH[better] + st$MATCH[better] <- branchIdx + } + + # Check if it becomes second match for others + notBetter <- !better + secDist <- .DiagDist(st$MATCH2, DIST, 
sentDist) + betterSec <- notBetter & (newDist < secDist) + if (any(betterSec)) { + st$MATCH2[betterSec] <- branchIdx + } +} + + +.DoRemove <- function(branchIdx, st, DIST, sentDist, lightSide, scale) { + st$incl[branchIdx] <- FALSE + nSplits <- length(st$MATCH) + curInc <- which(st$incl) + pMinus1 <- lightSide - 1L + + # Splits whose first match was branchIdx + affected1 <- which(st$MATCH == branchIdx & !is.na(st$MATCH)) + if (length(affected1)) { + st$MATCH[affected1] <- st$MATCH2[affected1] + for (b in affected1) { + if (is.na(st$MATCH[b])) { + # Promoted value was sentinel — rescan for actual closest + st$MATCH[b] <- .FindSecond(b, NA_integer_, curInc, DIST, + pMinus1, scale) + } + # Find new second match + st$MATCH2[b] <- .FindSecond(b, st$MATCH[b], curInc, DIST, + pMinus1, scale) + } + } + + # Splits whose second match was branchIdx + affected2 <- which(st$MATCH2 == branchIdx & !is.na(st$MATCH2)) + if (length(affected2)) { + for (b in affected2) { + st$MATCH2[b] <- .FindSecond(b, st$MATCH[b], curInc, DIST, + pMinus1, scale) + } + } +} + + +.FindSecond <- function(b, matchIdx, curInc, DIST, pMinus1, scale) { + cands <- if (is.na(matchIdx)) curInc else setdiff(curInc, matchIdx) + if (length(cands) == 0L) return(NA_integer_) + dists <- DIST[b, cands] + best <- cands[which.min(dists)] + bestDist <- DIST[b, best] + if (scale && bestDist / pMinus1[b] >= 1) return(NA_integer_) + if (!scale && bestDist >= pMinus1[b]) return(NA_integer_) + best +} + + +.IsCompat <- function(idx, incl, compat, nTip) { + incIdx <- which(incl) + if (length(incIdx) == 0L) return(TRUE) + if (length(incIdx) >= nTip - 3L) return(FALSE) + all(compat[idx, incIdx]) +} + + +.GreedyBest <- function(st, DIST, sentDist, TD, counts, lightSide, + compat, sortOrd, scale, nSplits, nTip) { + repeat { + # Evaluate all candidates + addCands <- integer(0) + remCands <- integer(0) + for (idx in sortOrd) { + if (st$incl[idx]) { + remCands <- c(remCands, idx) + } else if (.IsCompat(idx, st$incl, compat, nTip)) 
{ + addCands <- c(addCands, idx) + } + } + + bestBen <- 0 + bestIdx <- 0L + bestAction <- "" + + if (length(addCands)) { + addBen <- .AddBenefitVec(addCands, st, DIST, sentDist, TD, counts, + lightSide, scale) + mx <- max(addBen) + if (mx > bestBen) { + bestBen <- mx + bestIdx <- addCands[which.max(addBen)] + bestAction <- "add" + } + } + if (length(remCands)) { + remBen <- .RemoveBenefitVec(remCands, st, DIST, sentDist, TD, counts, + lightSide, scale) + mx <- max(remBen) + if (mx > bestBen) { + bestBen <- mx + bestIdx <- remCands[which.max(remBen)] + bestAction <- "remove" + } + } + + if (bestBen <= 0) break + if (bestAction == "add") { + .DoAdd(bestIdx, st, DIST, sentDist) + } else { + .DoRemove(bestIdx, st, DIST, sentDist, lightSide, scale) + } + } +} + + +.GreedyFirst <- function(st, DIST, sentDist, TD, counts, lightSide, + compat, sortOrd, scale, nSplits, nTip) { + improving <- TRUE + while (improving) { + improving <- FALSE + matchDist <- .DiagDist(st$MATCH, DIST, sentDist) + match2Dist <- .DiagDist(st$MATCH2, DIST, sentDist) + pMinus1 <- lightSide - 1L + + for (idx in sortOrd) { + if (st$incl[idx]) { + # Quick remove benefit + affected <- st$MATCH == idx & !is.na(st$MATCH) + if (any(affected)) { + if (scale) { + fn <- sum((DIST[affected, idx] - match2Dist[affected]) / + pMinus1[affected] * counts[affected]) + } else { + fn <- sum((DIST[affected, idx] - match2Dist[affected]) * + counts[affected]) + } + } else { + fn <- 0 + } + if (TD[idx] + fn > 0) { + .DoRemove(idx, st, DIST, sentDist, lightSide, scale) + improving <- TRUE + break + } + } else if (.IsCompat(idx, st$incl, compat, nTip)) { + newDist <- DIST[, idx] + if (scale) { + diff <- pmax((matchDist - newDist) / pMinus1, 0) + } else { + diff <- pmax(matchDist - newDist, 0) + } + if (sum(diff * counts) - TD[idx] > 0) { + .DoAdd(idx, st, DIST, sentDist) + improving <- TRUE + break + } + } + } + } +} + + +.SplitsToPhylo <- function(rawSplits, included, tipLabels, nTip) { + selectedIdx <- which(included) + if 
(length(selectedIdx) == 0L) { + return(TreeTools::StarTree(tipLabels)) + } + selectedRaw <- rawSplits[selectedIdx, , drop = FALSE] + sp <- structure(selectedRaw, nTip = nTip, tip.label = tipLabels, + class = "Splits") + as.phylo(sp) +} diff --git a/R/tree_distance.R b/R/tree_distance.R index 466de37e..11bf9e91 100644 --- a/R/tree_distance.R +++ b/R/tree_distance.R @@ -35,6 +35,7 @@ #' @export GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, maximize, reportMatching, ...) { + .ValidateSplitArgs(splits1, splits2, nTip) nSplits1 <- dim(splits1)[[1]] nSplits2 <- dim(splits2)[[1]] @@ -148,6 +149,7 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, } nTip <- length(tipLabels) if (nTip < 4) return(NULL) # nocov + if (nTip > 32767L) stop("This many tips are not (yet) supported.") splits_list <- as.Splits(tree1, tipLabels = tipLabels) n_threads <- as.integer(getOption("mc.cores", 1L)) @@ -201,6 +203,7 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, nTip <- length(tipLabels1) if (nTip < 4) return(NULL) + if (nTip > 32767L) stop("This many tips are not (yet) supported.") splits1 <- as.Splits(tree1, tipLabels = tipLabels1) splits2 <- as.Splits(tree2, tipLabels = tipLabels1) # Use tipLabels1 to ensure order consistency diff --git a/R/tree_distance_mast.R b/R/tree_distance_mast.R index 4cdd06e0..5c069841 100644 --- a/R/tree_distance_mast.R +++ b/R/tree_distance_mast.R @@ -93,6 +93,12 @@ MASTSize <- function(tree1, tree2 = tree1, rooted = TRUE) { #' @param nTip Integer specifying the number of leaves in each split. 
#' @keywords internal .MASTSizeEdges <- function(edge1, edge2, nTip) { + if (nrow(edge1) != nrow(edge2)) { + stop("Both trees must contain the same number of edges.") + } + if (nTip > 4096L) { + stop("Tree too large; please contact maintainer for advice.") + } cpp_mast(edge1 - 1L, Postorder(edge2) - 1L, nTip) } diff --git a/R/tree_distance_nni.R b/R/tree_distance_nni.R index 7cce0a83..45854764 100644 --- a/R/tree_distance_nni.R +++ b/R/tree_distance_nni.R @@ -73,6 +73,13 @@ NNIDist <- function(tree1, tree2 = tree1) { #' @importFrom TreeTools Postorder RenumberTips #' @importFrom ape Nnode.phylo .NNIDistSingle <- function(tree1, tree2, nTip, ...) { + if (nTip > 32768L) { + stop("Cannot calculate NNI distance for trees with so many tips.") + } + if (nrow(tree1[["edge"]]) != nrow(tree2[["edge"]])) { + stop("Both trees must have the same number of edges. ", + "Is one rooted and the other unrooted?") + } tree2 <- RenumberTips(tree2, as.character(tree1[["tip.label"]])) edge1 <- Postorder(tree1[["edge"]]) diff --git a/R/tree_distance_spr.R b/R/tree_distance_spr.R index 844d054b..85657a37 100644 --- a/R/tree_distance_spr.R +++ b/R/tree_distance_spr.R @@ -251,6 +251,9 @@ SPRDist.multiPhylo <- SPRDist.list } .SPRExact7 <- function(sp1, sp2) { + if (length(sp1) != 4L || length(sp2) != 4L) { + stop("Expected a length-4 raw vector of splits") # nocov + } spr_table_7(sp1, sp2) } @@ -407,7 +410,7 @@ SPRDist.multiPhylo <- SPRDist.list # \insertCite{deOliveira2008;textual}{TreeDist} # An exact match is unlikely due to the arbitrary breaking of ties when leaves # are selected for removal. -# SPR approximation via Oliveira Martins _et al._ (2008) +#' SPR approximation via Oliveira Martins _et al._ (2008) #' @examples #' # de Oliveira Martins et al 2008, fig. 
7 #' tree1 <- ape::read.tree(text = "((1, 2), ((a, b), (c, d)), (3, (4, (5, (6, 7)))));") @@ -421,6 +424,11 @@ SPRDist.multiPhylo <- SPRDist.list #' @importFrom TreeTools DropTip TipsInSplits KeepTipPostorder #' @importFrom TreeTools edge_to_splits .SPRPairDeO <- function(tree1, tree2, check = TRUE) { + if (check) { + if (length(tree1[["tip.label"]]) != length(tree2[["tip.label"]])) { + stop("`tree1` and `tree2` must have the same number of tips.") + } + } moves <- 0 # Reduce trees (Fig. 7A in deOliveira2008) @@ -481,3 +489,37 @@ SPRDist.multiPhylo <- SPRDist.list # Return: moves } + +# R-level validation wrappers for internal C++ functions. +# C++ uses ASSERT() (compiled out in release); these ensure user-visible errors. +mismatch_size <- function(x, y) { + if (!inherits(x, "Splits") || is.null(attr(x, "nTip"))) { + stop("`x` lacks nTip attribute") + } + if (!inherits(y, "Splits") || is.null(attr(y, "nTip"))) { + stop("`y` lacks nTip attribute") + } + if (attr(x, "nTip") != attr(y, "nTip")) { + stop("`x` and `y` differ in `nTip`") + } + if (nrow(x) != nrow(y)) { + stop("`x` and `y` differ in number of splits.") + } + .Call(`_TreeDist_mismatch_size`, x, y) +} + +confusion <- function(x, y) { + if (!inherits(x, "Splits") || is.null(attr(x, "nTip"))) { + stop("`x` lacks nTip attribute") + } + if (!inherits(y, "Splits") || is.null(attr(y, "nTip"))) { + stop("`y` lacks nTip attribute") + } + if (attr(x, "nTip") != attr(y, "nTip")) { + stop("`x` and `y` differ in `nTip`") + } + if (nrow(x) != nrow(y)) { + stop("Input splits must contain same number of splits.") + } + .Call(`_TreeDist_confusion`, x, y) +} diff --git a/R/tree_distance_transfer.R b/R/tree_distance_transfer.R new file mode 100644 index 00000000..cd0942fd --- /dev/null +++ b/R/tree_distance_transfer.R @@ -0,0 +1,260 @@ +#' Transfer dissimilarity between phylogenetic trees +#' +#' Compute the transfer dissimilarity between phylogenetic trees, as defined +#' by \insertCite{Takazawa2026;textual}{TreeDist}. 
+#' The transfer dissimilarity uses the transfer distance +#' \insertCite{Lemoine2018}{TreeDist} to compare bipartitions, providing a +#' finer-grained measure than the Robinson--Foulds distance. Each branch in +#' each tree is scored by how many taxa must be moved to match its closest +#' counterpart in the other tree, and these scores are summed. +#' +#' The `scaled` variant divides each branch's contribution by its depth minus +#' one, giving equal weight to all branches regardless of their depth (analogous +#' to the Robinson--Foulds distance). The `unscaled` variant uses raw transfer +#' distances, giving more weight to deep branches. Neither variant satisfies +#' the triangle inequality for trees with six or more tips. +#' +#' @inheritParams TreeDistance +#' @param scale Logical; if `TRUE` (default), use the scaled transfer +#' dissimilarity. If `FALSE`, use the unscaled transfer dissimilarity. +#' +#' @return `TransferDist()` returns an object of class `dist` (if `tree2` is +#' `NULL`), a numeric matrix (if both `tree1` and `tree2` are lists), or a +#' numeric value (for a single pair). If `reportMatching = TRUE`, the +#' return value carries `matching` and `pairScores` attributes. +#' +#' @section Normalization: +#' +#' When `normalize = TRUE`, the scaled transfer dissimilarity is divided by +#' `2 * (n - 3)`, placing it in the range \[0, 1\]. The unscaled version is +#' divided by the maximum possible unscaled dissimilarity +#' (following \insertCite{Takazawa2026;textual}{TreeDist}). 
+#' +#' @examples +#' library(TreeTools) +#' TransferDist(BalancedTree(8), PectinateTree(8)) +#' TransferDist(BalancedTree(8), PectinateTree(8), scale = FALSE) +#' +#' # All-pairs +#' TransferDist(as.phylo(0:5, 8)) +#' +#' @references +#' \insertAllCited{} +#' +#' @family tree distances +#' +#' @importFrom TreeTools as.Splits TipLabels NSplits +#' @export +TransferDist <- function(tree1, tree2 = NULL, scale = TRUE, + normalize = FALSE, reportMatching = FALSE) { + + # --- Fast path: all-pairs (tree2 = NULL) --- + if (is.null(tree2) && !reportMatching) { + fast <- .TransferDistAllPairs(tree1, scale) + if (!is.null(fast)) { + if (!isFALSE(normalize)) { + nTip <- length(TipLabels(tree1[[1]])) + denom <- .TransferNormDenom(nTip, scale) + fast <- fast / denom + } + return(fast) + } + } + + # --- Fast path: cross-pairs --- + if (!is.null(tree2) && !reportMatching) { + fast <- .TransferDistCrossPairs(tree1, tree2, scale) + if (!is.null(fast)) { + if (!isFALSE(normalize)) { + nTip <- length(TipLabels( + if (inherits(tree1, c("phylo", "Splits"))) tree1 else tree1[[1]])) + denom <- .TransferNormDenom(nTip, scale) + fast <- fast / denom + } + return(fast) + } + } + + # --- Generic fallback via CalculateTreeDistance --- + # Capture `scale` in the closure for the Splits-level function + Func <- function(splits1, splits2, nTip, reportMatching = FALSE) { + TransferDistSplits(splits1, splits2, nTip = nTip, + scale = scale, reportMatching = reportMatching) + } + unnormalized <- CalculateTreeDistance(Func, tree1, tree2, reportMatching) + + if (!isFALSE(normalize)) { + nTip <- length(TipLabels( + if (inherits(tree1, c("phylo", "Splits"))) tree1 else tree1[[1]])) + denom <- .TransferNormDenom(nTip, scale) + unnormalized <- unnormalized / denom + } + + unnormalized +} + +#' @rdname TransferDist +#' @export +TransferDistance <- TransferDist + +#' @rdname TransferDist +#' @inheritParams SharedPhylogeneticInfoSplits +#' @export +TransferDistSplits <- function(splits1, splits2, + nTip = 
attr(splits1, "nTip"), + scale = TRUE, + reportMatching = FALSE) { + .ValidateSplitArgs(splits1, splits2, nTip) + solution <- cpp_transfer_dist_scored(splits1, splits2, + nTip = as.integer(nTip), + scale = scale) + ret <- solution[["score"]] + + if (reportMatching) { + nSplits1 <- nrow(splits1) + nSplits2 <- nrow(splits2) + matching <- solution[["matching"]] + matching[matching > nSplits2 | matching == 0L] <- NA + if (nSplits1 < nSplits2) { + matching <- matching[seq_len(nSplits1)] + } + attr(ret, "matching") <- matching + + # Compute full pairwise score matrix for reportMatching + pairScores <- matrix(0, nSplits1, nSplits2) + for (i in seq_len(nSplits1)) { + for (j in seq_len(nSplits2)) { + # Per-pair: the transfer distance contribution + # This is the individual δ(b_i, b*_j) / (depth(b_i) - 1) for scaled + # or min(δ(b_i, b*_j), depth(b_i) - 1) for unscaled + pair_res <- cpp_transfer_dist_scored( + splits1[i, , drop = FALSE], + splits2[j, , drop = FALSE], + nTip = as.integer(nTip), + scale = scale + ) + pairScores[i, j] <- pair_res[["score"]] + } + } + + if (!is.null(attr(splits1, "tip.label"))) { + matched1 <- !is.na(matching) + matched2 <- matching[matched1] + matched1 <- which(matched1) + attr(ret, "matchedSplits") <- + ReportMatching(splits1[[matched1]], splits2[[matched2]], + realMatch = rep(TRUE, length(matched1))) + } + + attr(ret, "matchedScores") <- pairScores[ + matrix(c(seq_along(matching), matching), ncol = 2L)] + attr(ret, "pairScores") <- pairScores + } + + ret +} + + +# ============================================================================ +# Internal helpers +# ============================================================================ + +# All-pairs fast path +.TransferDistAllPairs <- function(tree1, scale) { + if (inherits(tree1, c("phylo", "Splits"))) return(NULL) + if (length(tree1) < 2L) return(NULL) + + tipLabels <- TipLabels(tree1[[1]]) + if (is.null(tipLabels)) return(NULL) + nTip <- length(tipLabels) + if (nTip < 4L) return(NULL) + 
if (nTip > 32767L) stop("This many tips are not (yet) supported.") + + # Check all trees share same tip set + allLabels <- TipLabels(tree1) + if (is.list(allLabels)) { + if (!all(vapply(allLabels[-1], setequal, logical(1), tipLabels))) { + return(NULL) + } + } + + splitsList <- lapply(tree1, function(tr) { + unclass(as.Splits(tr, tipLabels)) + }) + + nThreads <- max(1L, getOption("TreeDist.threads", + getOption("mc.cores", 1L))) + + dists <- cpp_transfer_dist_all_pairs(splitsList, nTip, scale, nThreads) + + N <- length(tree1) + attr(dists, "Size") <- N + attr(dists, "Diag") <- FALSE + attr(dists, "Upper") <- FALSE + nms <- names(tree1) + if (!is.null(nms)) attr(dists, "Labels") <- nms + class(dists) <- "dist" + dists +} + + +# Cross-pairs fast path +.TransferDistCrossPairs <- function(tree1, tree2, scale) { + single1 <- inherits(tree1, c("phylo", "Splits")) + single2 <- inherits(tree2, c("phylo", "Splits")) + if (single1 && single2) return(NULL) # use generic path for single pair + + trees1 <- if (single1) list(tree1) else tree1 + trees2 <- if (single2) list(tree2) else tree2 + + tipLabels <- TipLabels(trees1[[1]]) + if (is.null(tipLabels)) return(NULL) + nTip <- length(tipLabels) + if (nTip < 4L) return(NULL) + if (nTip > 32767L) stop("This many tips are not (yet) supported.") + + # Check all trees share same tip set + allLabels1 <- TipLabels(trees1) + allLabels2 <- TipLabels(trees2) + if (is.list(allLabels1)) { + if (!all(vapply(allLabels1, setequal, logical(1), tipLabels))) return(NULL) + } + if (is.list(allLabels2)) { + if (!all(vapply(allLabels2, setequal, logical(1), tipLabels))) return(NULL) + } else { + if (!setequal(allLabels2, tipLabels)) return(NULL) + } + + splits1 <- lapply(trees1, function(tr) unclass(as.Splits(tr, tipLabels))) + splits2 <- lapply(trees2, function(tr) unclass(as.Splits(tr, tipLabels))) + + nThreads <- max(1L, getOption("TreeDist.threads", + getOption("mc.cores", 1L))) + + mat <- cpp_transfer_dist_cross_pairs(splits1, splits2, nTip, 
scale, nThreads) + + rownames(mat) <- names(trees1) + colnames(mat) <- names(trees2) + + # If one input was a single tree, simplify to vector + if (single1) return(mat[1, ]) + if (single2) return(mat[, 1]) + mat +} + + +# Normalization denominator +.TransferNormDenom <- function(nTip, scale) { + if (scale) { + # Scaled: each tree contributes at most (n-3) branches × 1.0 + 2.0 * (nTip - 3L) + } else { + # Unscaled: maximum dissimilarity between two caterpillar trees + # Takazawa 2026: (n^2 - 2n + 4)/4 for even n, (n^2 - 2n + 5)/4 for odd n + if (nTip %% 2L == 0L) { + (nTip^2 - 2 * nTip + 4) / 4 + } else { + (nTip^2 - 2 * nTip + 5) / 4 + } + } +} diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 0ad954de..561174bf 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -132,6 +132,9 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, # Fast paths: use OpenMP batch functions when all trees share the same tip # set and no R-level cluster has been configured. Each branch mirrors the # generic path exactly but avoids per-pair R overhead. + if (!is.na(nTip) && nTip > 32767L) { + stop("This many tips are not (yet) supported.") + } if (!is.na(nTip) && is.null(cluster)) { .n_threads <- as.integer(getOption("mc.cores", 1L)) .batch_result <- if (identical(Func, MutualClusteringInfoSplits)) { @@ -232,7 +235,9 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, #' @importFrom stats setNames .SplitDistanceManyMany <- function(Func, splits1, splits2, tipLabels, nTip = length(tipLabels), ...) { - + if (!is.na(nTip) && nTip > 32767L) { + stop("This many tips are not (yet) supported.") + } if (is.na(nTip)) { tipLabels <- union(unlist(tipLabels, use.names = FALSE), unlist(TipLabels(splits2), use.names = FALSE)) @@ -396,6 +401,18 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, } } +# Validate split-matrix arguments before passing to C++. 
+# On ARM, Rcpp::stop() can call std::terminate() instead of returning +# a proper R error, so all expected-input validation must happen in R. +.ValidateSplitArgs <- function(x, y, nTip) { + if (ncol(x) != ncol(y)) { + stop("Input splits must address same number of tips.") + } + if (nTip > 32767L) { + stop("This many tips are not (yet) supported.") + } +} + .CheckLabelsSame <- function(labelList) { nTip <- unique(lengths(labelList)) if (length(nTip) != 1) { diff --git a/R/tree_information.R b/R/tree_information.R index 767c918d..580298c8 100644 --- a/R/tree_information.R +++ b/R/tree_information.R @@ -380,6 +380,23 @@ ConsensusInfo <- function(trees, info = "phylogenetic", p = 0.5, consensus_info(trees, mode == 1L, p = safeP) } +# R-level validation wrapper; shadows RcppExports version. +# C++ ASSERT() is compiled out in release builds. +consensus_info <- function(trees, phylo, p) { + if (p > 1 + 1e-15) { + stop("p must be <= 1.0 in consensus_info()") + } + if (p < 0.5) { + stop("p must be >= 0.5 in consensus_info()") + } + nTip <- NTip(trees[[1]]) + # CT_MAX_LEAVES = 16383 in information.h (lookup table size limit) + if (nTip > 16383L) { + stop("This many leaves are not yet supported") + } + .Call(`_TreeDist_consensus_info`, trees, phylo, p) +} + #' Maximum Clade Information Tree #' #' Analogous to the Maximum Clade Credibility tree: diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 6614f645..f2f400a3 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -1,3 +1,22 @@ +@article{Lemoine2018, + author = {Lemoine, Fr{\'e}d{\'e}ric and Domelevo Entfellner, Jean-Baka and Wilkinson, Eduan and Correia, Daniela and D{\'a}vila Felipe, Miraine and De Oliveira, Tulio and Gascuel, Olivier}, + title = {{Renewing Felsenstein's phylogenetic bootstrap in the era of big data}}, + journal = {Nature}, + year = {2018}, + volume = {556}, + number = {7702}, + pages = {452--456}, + doi = {10.1038/s41586-018-0043-0} +} + +@article{Takazawa2026, + author = {Takazawa, 
Yuuki and Takeda, Akiko and Hayamizu, Momoko and Gascuel, Olivier}, + title = {{Outperforming the majority-rule consensus tree using fine-grained dissimilarity measures}}, + journal = {bioRxiv}, + year = {2026}, + doi = {10.64898/2026.03.16.712085} +} + @article{Allen2001, author = {Allen, Benjamin L. and Steel, Mike A.}, doi = {10.1007/s00026-001-8006-8}, diff --git a/inst/WORDLIST b/inst/WORDLIST index d3c333c8..ca6a431f 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,6 +1,7 @@ ABCD Bocker Bogdanowicz +branchless Böcker Cai Colijn diff --git a/inst/include/TreeDist/cost_matrix.h b/inst/include/TreeDist/cost_matrix.h new file mode 100644 index 00000000..fe4dddcf --- /dev/null +++ b/inst/include/TreeDist/cost_matrix.h @@ -0,0 +1,276 @@ +#ifndef TREEDIST_COST_MATRIX_H_ +#define TREEDIST_COST_MATRIX_H_ + +// TreeDist CostMatrix — cache-aligned, block-padded cost matrix for the LAP. +// +// Rcpp-free public API. The Rcpp-dependent constructor +// (CostMatrix(Rcpp::NumericMatrix)) lives in TreeDist's own src/lap.h. + +#include "types.h" + +#include +#include +#include +#include + +namespace TreeDist { + +class CostMatrix { +private: + std::size_t dim_; + std::size_t dim8_; + alignas(64) std::vector data_; + alignas(64) std::vector t_data_; + bool transposed_; + + static std::size_t block_containing(const std::size_t x) { + return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; + } + +public: + // Default constructor for pooled instances (zero-size, no allocation). + CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} + + explicit CostMatrix(std::size_t dim) + : dim_(dim), + dim8_(block_containing(dim_)), + data_(std::vector(dim8_ * dim8_)), + t_data_(std::vector(dim8_ * dim8_)), + transposed_(false) {} + + // Resize for reuse. Only reallocates when the new dimension exceeds the + // current buffer capacity; otherwise just updates dim_/dim8_ and marks + // the transpose as stale. 
+ void resize(std::size_t new_dim) { + dim_ = new_dim; + dim8_ = block_containing(new_dim); + const std::size_t needed = dim8_ * dim8_; + if (data_.size() < needed) { + data_.resize(needed); + t_data_.resize(needed); + } + transposed_ = false; + } + + void reset() noexcept { transposed_ = false; } + + [[nodiscard]] std::size_t dim() const noexcept { return dim_; } + [[nodiscard]] std::size_t dim8() const noexcept { return dim8_; } + + // ---- Element access ---- + + cost& operator()(lap_row i, lap_col j) { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& row0(lap_col j) const { + return data_[j]; + } + + [[nodiscard]] const cost& entry0(lap_row i) const { + return data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* row(lap_row i) { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* row(lap_row i) const { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* col(lap_col i) { + return &t_data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* col(lap_col i) const { + return &t_data_[static_cast(i) * dim8_]; + } + + // Write a value to both the data and transposed-data arrays. + // After filling the entire matrix this way, call markTransposed() + // to skip the lazy transpose in makeTranspose(). 
+ void setWithTranspose(lap_row i, lap_col j, cost value) { + data_[static_cast(i) * dim8_ + j] = value; + t_data_[static_cast(j) * dim8_ + i] = value; + } + + void markTransposed() noexcept { transposed_ = true; } + + // ---- Transpose ---- + + void makeTranspose() noexcept { + if (transposed_) return; + + const cost* __restrict__ data_ptr = data_.data(); + cost* __restrict__ t_data_ptr = t_data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + transposed_ = true; + } + + void makeUntranspose() noexcept { + const cost* __restrict__ data_ptr = t_data_.data(); + cost* __restrict__ t_data_ptr = data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + } + + // ---- Padding ---- + + void padTrAfterCol(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + std::size_t end_index = dim_ * dim8_; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padAfterRow(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + 
std::size_t end_index = dim_ * dim8_; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + void padTrColAfterRow(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padRowAfterCol(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + // ---- Search ---- + + std::pair findColMin(lap_col j, + lap_row search_dim = -1) { + makeTranspose(); + const cost* col_data = col(j); + const lap_row n = + (search_dim < 0) ? static_cast(dim_) : search_dim; + const auto min_ptr = std::min_element(col_data, col_data + n); + return {*min_ptr, + static_cast(std::distance(col_data, min_ptr))}; + } + + std::tuple findRowSubmin( + const lap_row* i, const std::vector& v + ) const { + assert(dim_ > 1); + + const cost* __restrict__ row_i = row(*i); + const lap_col dim = static_cast(dim_); + const cost* __restrict__ v_ptr = v.data(); + + const cost h0 = row_i[0] - v_ptr[0]; + const cost h1 = row_i[1] - v_ptr[1]; + + cost min_val, submin_val; + lap_col min_idx, submin_idx; + + if (h0 > h1) { + min_val = h1; submin_val = h0; + min_idx = 1; submin_idx = 0; + } else { + min_val = h0; submin_val = h1; + min_idx = 0; submin_idx = 1; + } + + const lap_col j_limit = + (dim < 4 ? 
0 : static_cast(dim - 3)); + + for (lap_col j = 2; j < j_limit; j += 4) { + assert(BLOCK_SIZE >= 4); + const cost h0 = row_i[j] - v_ptr[j]; + const cost h1 = row_i[j + 1] - v_ptr[j + 1]; + const cost h2 = row_i[j + 2] - v_ptr[j + 2]; + const cost h3 = row_i[j + 3] - v_ptr[j + 3]; + if (h0 < submin_val) { + if (h0 < min_val) { + submin_val = min_val; min_val = h0; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h0; submin_idx = j; } + } + if (h1 < submin_val) { + if (h1 < min_val) { + submin_val = min_val; min_val = h1; + submin_idx = min_idx; min_idx = j + 1; + } else { submin_val = h1; submin_idx = j + 1; } + } + if (h2 < submin_val) { + if (h2 < min_val) { + submin_val = min_val; min_val = h2; + submin_idx = min_idx; min_idx = j + 2; + } else { submin_val = h2; submin_idx = j + 2; } + } + if (h3 < submin_val) { + if (h3 < min_val) { + submin_val = min_val; min_val = h3; + submin_idx = min_idx; min_idx = j + 3; + } else { submin_val = h3; submin_idx = j + 3; } + } + } + for (lap_col j = j_limit; j < dim; ++j) { + const cost h = row_i[j] - v_ptr[j]; + if (h < submin_val) { + if (h < min_val) { + submin_val = min_val; min_val = h; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h; submin_idx = j; } + } + } + return {min_val, submin_val, min_idx, submin_idx}; + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_COST_MATRIX_H_ diff --git a/inst/include/TreeDist/lap.h b/inst/include/TreeDist/lap.h new file mode 100644 index 00000000..673cfa22 --- /dev/null +++ b/inst/include/TreeDist/lap.h @@ -0,0 +1,35 @@ +#ifndef TREEDIST_LAP_H_ +#define TREEDIST_LAP_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant declarations. +// +// Provides the lap() function signature. The implementation lives in +// lap_impl.h, guarded by TREEDIST_LAP_IMPLEMENTATION. Downstream +// packages should include lap_impl.h in exactly one translation unit +// with the guard defined. 
+ +#include "lap_scratch.h" + +namespace TreeDist { + + // Primary overload: caller supplies pre-allocated scratch. + extern cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt, + LapScratch& scratch); + + // Convenience overload: creates a temporary scratch. + inline cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt = true) { + LapScratch scratch; + return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); + } + +} // namespace TreeDist + +#endif // TREEDIST_LAP_H_ diff --git a/inst/include/TreeDist/lap_impl.h b/inst/include/TreeDist/lap_impl.h new file mode 100644 index 00000000..196c37f6 --- /dev/null +++ b/inst/include/TreeDist/lap_impl.h @@ -0,0 +1,254 @@ +#ifndef TREEDIST_LAP_IMPL_H_ +#define TREEDIST_LAP_IMPL_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant implementation. +// +// Guard this with #define TREEDIST_LAP_IMPLEMENTATION before including. +// Include in exactly one translation unit per package. +// +// Interrupt handling: define TREEDIST_CHECK_INTERRUPT() before including +// to enable user-interrupt checks (e.g., Rcpp::checkUserInterrupt()). +// If not defined, interrupt checks are silently skipped. +// +// Original algorithm: +// R. Jonker and A. Volgenant, "A Shortest Augmenting Path Algorithm +// for Dense and Sparse Linear Assignment Problems," Computing 38, +// 325-340, 1987. + +#ifdef TREEDIST_LAP_IMPLEMENTATION + +#include "lap.h" + +#ifndef TREEDIST_CHECK_INTERRUPT +#define TREEDIST_CHECK_INTERRUPT() ((void)0) +#define TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +namespace TreeDist { + +namespace detail { + inline bool nontrivially_less_than(cost a, cost b) noexcept { + return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; + } +} // namespace detail + +// The LAP hot loops are sensitive to instruction alignment — even unrelated +// changes to other code in the same translation unit can shift layout enough +// to cause measurable regressions. Force 64-byte function alignment and +// 16-byte loop alignment to stabilise performance across builds. +#if defined(__GNUC__) && !defined(__clang__) +__attribute__((optimize("align-functions=64", "align-loops=16"))) +#endif +cost lap(const lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + const bool allow_interrupt, + LapScratch& scratch) +{ + lap_row num_free = 0; + scratch.ensure(dim); + auto& v = scratch.v; + auto& matches = scratch.matches; + std::fill(matches.begin(), matches.begin() + dim, 0); + const cost* __restrict__ v_ptr = v.data(); + + // COLUMN REDUCTION + for (lap_col j = dim; j--; ) { + const auto [min, imin] = input_cost.findColMin(j); + v[j] = min; + ++matches[imin]; + + if (matches[imin] == 1) { + rowsol[imin] = j; + colsol[j] = imin; + } else if (v_ptr[j] < v_ptr[rowsol[imin]]) { + const lap_col j1 = rowsol[imin]; + rowsol[imin] = j; + colsol[j] = imin; + colsol[j1] = -1; + } else { + colsol[j] = -1; + } + } + + // REDUCTION TRANSFER + auto& freeunassigned = scratch.freeunassigned; + + for (lap_row i = 0; i < dim; ++i) { + if (matches[i] == 0) { + freeunassigned[num_free++] = i; + } else if (matches[i] == 1) { + const lap_col j1 = rowsol[i]; + const cost* row_i = input_cost.row(i); + cost min_cost; + if (j1 == 0) { + min_cost = row_i[1] - v_ptr[1]; + for (lap_col j = 2; j < dim; ++j) { + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } else { + min_cost = row_i[0] - v_ptr[0]; + for (lap_col j = 1; j < dim; ++j) { + if (j == j1) continue; + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } + v[j1] -= min_cost; + } + } + + // AUGMENTING ROW 
REDUCTION + auto& col_list = scratch.col_list; + int loopcnt = 0; + + do { + ++loopcnt; + + lap_row previous_num_free = num_free; + num_free = 0; + lap_row k = 0; + while (k < previous_num_free) { + const lap_row i = freeunassigned[k++]; + const auto [umin, usubmin, min_idx, j2] = + input_cost.findRowSubmin(&i, v); + lap_col j1 = min_idx; + + lap_row i0 = colsol[j1]; + const bool strictly_less = + detail::nontrivially_less_than(umin, usubmin); + if (strictly_less) { + v[j1] -= (usubmin - umin); + } else if (i0 > -1) { + j1 = j2; + i0 = colsol[j2]; + } + + rowsol[i] = j1; + colsol[j1] = i; + + if (i0 > -1) { + if (strictly_less) { + freeunassigned[--k] = i0; + if (allow_interrupt) TREEDIST_CHECK_INTERRUPT(); + } else { + freeunassigned[num_free++] = i0; + } + } + } + } while (loopcnt < 2); + + // AUGMENT SOLUTION for each free row. + auto& d = scratch.d; + auto& predecessor = scratch.predecessor; + + for (lap_row f = 0; f < num_free; ++f) { + bool unassignedfound = false; + lap_row free_row = freeunassigned[f]; + const cost* free_row_cost = input_cost.row(free_row); + lap_col endofpath = 0; + lap_col last = 0; + lap_row i; + lap_col j1; + + for (lap_col j = 0; j < dim; ++j) { + d[j] = free_row_cost[j] - v_ptr[j]; + predecessor[j] = free_row; + col_list[j] = j; + } + + cost min = 0; + lap_col low = 0; + lap_col up = 0; + + do { + if (up == low) { + last = low - 1; + min = d[col_list[up++]]; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = col_list[k]; + const cost h = d[j]; + if (h <= min) { + if (h < min) { + up = low; + min = h; + } + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + for (lap_dim k = low; k < up; ++k) { + if (colsol[col_list[k]] < 0) { + endofpath = col_list[k]; + unassignedfound = true; + break; + } + } + } + + if (!unassignedfound) { + j1 = col_list[low++]; + i = colsol[j1]; + const cost* row_i = input_cost.row(i); + const cost h = row_i[j1] - v_ptr[j1] - min; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = 
col_list[k]; + cost v2 = row_i[j] - v_ptr[j] - h; + if (v2 < d[j]) { + predecessor[j] = i; + if (v2 == min) { + if (colsol[j] < 0) { + endofpath = j; + unassignedfound = true; + break; + } else { + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + d[j] = v2; + } + } + } + } while (!unassignedfound); + + for (lap_dim k = 0; k <= last; ++k) { + j1 = col_list[k]; + v[j1] += d[j1] - min; + } + + do { + i = predecessor[endofpath]; + colsol[endofpath] = i; + j1 = endofpath; + endofpath = rowsol[i]; + rowsol[i] = j1; + } while (i != free_row); + } + + // Calculate optimal cost. + cost lapcost = 0; + for (lap_dim i = 0; i < dim; ++i) { + lapcost += input_cost(i, rowsol[i]); + } + + return lapcost; +} + +} // namespace TreeDist + +#ifdef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#undef TREEDIST_CHECK_INTERRUPT +#undef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +#endif // TREEDIST_LAP_IMPLEMENTATION +#endif // TREEDIST_LAP_IMPL_H_ diff --git a/inst/include/TreeDist/lap_scratch.h b/inst/include/TreeDist/lap_scratch.h new file mode 100644 index 00000000..4a9ca43b --- /dev/null +++ b/inst/include/TreeDist/lap_scratch.h @@ -0,0 +1,44 @@ +#ifndef TREEDIST_LAP_SCRATCH_H_ +#define TREEDIST_LAP_SCRATCH_H_ + +// Reusable heap storage for one thread's LAPJV workspace. +// +// Pass to the scratch-taking overload of lap() to eliminate per-call +// std::vector allocations. In a serial context construct once before the +// loop; in an OpenMP context allocate one per thread and index by +// omp_get_thread_num(). All vectors are grown lazily; never shrunk. 
+ +#include "cost_matrix.h" + +namespace TreeDist { + +struct LapScratch { + std::vector v; + std::vector matches; + std::vector freeunassigned; + std::vector col_list; + std::vector d; + std::vector predecessor; + std::vector rowsol; + std::vector colsol; + CostMatrix score_pool; + CostMatrix small_pool; + + void ensure(int dim) noexcept { + const int padded = static_cast( + ((static_cast(dim) + BLOCK_SIZE - 1) + / BLOCK_SIZE) * BLOCK_SIZE); + if (static_cast(v.size()) < padded) v.resize(padded); + if (static_cast(matches.size()) < dim) matches.resize(dim); + if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); + if (static_cast(col_list.size()) < dim) col_list.resize(dim); + if (static_cast(d.size()) < dim) d.resize(dim); + if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); + if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); + if (static_cast(colsol.size()) < dim) colsol.resize(dim); + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_LAP_SCRATCH_H_ diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h new file mode 100644 index 00000000..437633f0 --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering.h @@ -0,0 +1,87 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_H_ +#define TREEDIST_MUTUAL_CLUSTERING_H_ + +// Mutual Clustering Information (MCI) — public API. +// +// Provides the core MCI score computation between two split sets, +// clustering entropy, and lookup-table initialization. +// +// Implementation lives in mutual_clustering_impl.h, guarded by +// TREEDIST_MCI_IMPLEMENTATION. + +#include "lap.h" +#include // splitbit, SL_MAX_TIPS, count_bits +#include + +namespace TreeDist { + + // ---- Lookup tables (populated by init_lg2_tables) ---- + // + // lg2[i] = log2(i) for 0 <= i <= SL_MAX_TIPS + // lg2_double_factorial[i] = log2(i!!) for 0 <= i < 2*SL_MAX_TIPS - 2 + // lg2_unrooted[i] = log2((2i-5)!!) 
for i >= 3 + // lg2_rooted = &lg2_unrooted[0] + 1 (so lg2_rooted[i] = lg2_unrooted[i+1]) + // + // These are defined in mutual_clustering_impl.h. + + extern double lg2[SL_MAX_TIPS + 1]; + extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; + extern double lg2_unrooted[SL_MAX_TIPS + 2]; + extern double* lg2_rooted; + + // Populate lookup tables. Must be called once before any MCI + // computation. max_tips should be >= the largest tree size used. + void init_lg2_tables(int max_tips); + + // ---- Inline helpers ---- + + // Information content of a perfectly-matching split pair. + // ic_matching(a, b, n) = (a + b) * lg2[n] - a * lg2[a] - b * lg2[b] + [[nodiscard]] inline double ic_matching(int16 a, int16 b, + int16 n) noexcept { + const double lg2a = lg2[a]; + const double lg2b = lg2[b]; + const double lg2n = lg2[n]; + return (a + b) * lg2n - a * lg2a - b * lg2b; + } + + // Clustering entropy of a split set. + // Computes H_clust = -sum_i [ p_i * log2(p_i) + q_i * log2(q_i) ] + // where p_i = in_split[i] / n_tips and q_i = 1 - p_i. + [[nodiscard]] inline double clustering_entropy( + const int* in_split, int n_splits, int n_tips) { + if (n_tips <= 1 || n_splits == 0) return 0.0; + const double lg2n = std::log2(static_cast(n_tips)); + double ce = 0.0; + for (int i = 0; i < n_splits; ++i) { + const int a = in_split[i]; + const int b = n_tips - a; + if (a > 1 && b > 1) { + ce += (a * std::log2(static_cast(a)) + + b * std::log2(static_cast(b))) + / n_tips - lg2n; + } + } + return -ce; + } + + // ---- MCI score (declaration only) ---- + // + // Computes the Mutual Clustering Information between two split sets. + // Uses sort+merge exact-match detection + LAP on the reduced matrix. + // + // a_state[i] points to n_bins splitbit words for split i of tree A. + // a_in[i] = popcount of split i (tips in the "1" partition). + // Returns the MCI score (higher = more similar). + // + // Implementation in mutual_clustering_impl.h. 
+ + double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch); + +} // namespace TreeDist + +#endif // TREEDIST_MUTUAL_CLUSTERING_H_ diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h new file mode 100644 index 00000000..7f98c8fb --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -0,0 +1,290 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ +#define TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ + +// Mutual Clustering Information — implementation. +// +// Guard with #define TREEDIST_MCI_IMPLEMENTATION before including. +// Include in exactly one translation unit per package (the same TU +// that defines TREEDIST_LAP_IMPLEMENTATION). +// +// Usage: +// #define TREEDIST_LAP_IMPLEMENTATION +// #define TREEDIST_MCI_IMPLEMENTATION +// #include +// #include + +#ifdef TREEDIST_MCI_IMPLEMENTATION + +#include "mutual_clustering.h" + +#include +#include +#include +#include + +namespace TreeDist { + +// ---- Table definitions ---- + +double lg2[SL_MAX_TIPS + 1]; +double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; +double lg2_unrooted[SL_MAX_TIPS + 2]; +double* lg2_rooted = &lg2_unrooted[0] + 1; + +void init_lg2_tables(int max_tips) { + lg2[0] = 0; + const int lg2_limit = std::min(max_tips + 1, + static_cast(SL_MAX_TIPS + 1)); + for (int i = 1; i < lg2_limit; ++i) { + lg2[i] = std::log2(static_cast(i)); + } + + for (int i = 0; i < 3; ++i) { + lg2_double_factorial[i] = 0; + lg2_unrooted[i] = 0; + } + + const int df_limit = std::min(2 * max_tips, + static_cast(SL_MAX_TIPS + SL_MAX_TIPS - 2)); + for (int i = 2; i < df_limit; ++i) { + lg2_double_factorial[i] = + lg2_double_factorial[i - 2] + std::log2(static_cast(i)); + } + + const int ur_limit = std::min(max_tips + 2, + static_cast(SL_MAX_TIPS + 2)); + for (int i = 3; i < ur_limit; ++i) { + 
lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; + } +} + +// ---- Sort+merge exact-match detection (internal) ---- +// +// Canonicalise each split so bit 0 is always set (flip complement if not), +// sort by canonical form, merge-scan. O(n log n). +// +// Writes a_match[ai] = bi+1 if matched, else 0. Likewise b_match. +// Returns number of exact matches. + +namespace detail { + +static int16 find_exact_matches_raw( + const splitbit* const* a_state, const int16* /*a_in*/, int16 a_n, + const splitbit* const* b_state, const int16* /*b_in*/, int16 b_n, + int16 n_bins, int32 n_tips, + int16* a_match, int16* b_match) +{ + std::fill(a_match, a_match + a_n, int16(0)); + std::fill(b_match, b_match + b_n, int16(0)); + if (a_n == 0 || b_n == 0) return 0; + + const int16 last_bin = n_bins - 1; + const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) + ? ~splitbit(0) + : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; + + // Flat buffers for canonical forms + std::vector a_canon(static_cast(a_n) * n_bins); + std::vector b_canon(static_cast(b_n) * n_bins); + + for (int16 i = 0; i < a_n; ++i) { + const bool flip = !(a_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? ~a_state[i][bin] : a_state[i][bin]; + if (bin == last_bin) val &= last_mask; + a_canon[i * n_bins + bin] = val; + } + } + for (int16 i = 0; i < b_n; ++i) { + const bool flip = !(b_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? 
~b_state[i][bin] : b_state[i][bin]; + if (bin == last_bin) val &= last_mask; + b_canon[i * n_bins + bin] = val; + } + } + + // Sort index arrays by canonical form + auto canon_less = [&](const splitbit* canon, int16 n_b, int16 i, int16 j) { + for (int16 bin = 0; bin < n_b; ++bin) { + const splitbit vi = canon[i * n_b + bin]; + const splitbit vj = canon[j * n_b + bin]; + if (vi < vj) return true; + if (vi > vj) return false; + } + return false; + }; + + std::vector a_order(a_n), b_order(b_n); + std::iota(a_order.begin(), a_order.end(), int16(0)); + std::iota(b_order.begin(), b_order.end(), int16(0)); + + std::sort(a_order.begin(), a_order.end(), + [&](int16 i, int16 j) { + return canon_less(a_canon.data(), n_bins, i, j); + }); + std::sort(b_order.begin(), b_order.end(), + [&](int16 i, int16 j) { + return canon_less(b_canon.data(), n_bins, i, j); + }); + + // Merge-scan + int16 exact_n = 0; + int16 ai_pos = 0, bi_pos = 0; + while (ai_pos < a_n && bi_pos < b_n) { + const int16 ai = a_order[ai_pos]; + const int16 bi = b_order[bi_pos]; + + int cmp = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + const splitbit va = a_canon[ai * n_bins + bin]; + const splitbit vb = b_canon[bi * n_bins + bin]; + if (va < vb) { cmp = -1; break; } + if (va > vb) { cmp = 1; break; } + } + + if (cmp < 0) { + ++ai_pos; + } else if (cmp > 0) { + ++bi_pos; + } else { + a_match[ai] = bi + 1; + b_match[bi] = ai + 1; + ++exact_n; + ++ai_pos; + ++bi_pos; + } + } + + return exact_n; +} + +} // namespace detail + + +// ---- MCI score implementation ---- + +double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch) +{ + if (a_n_splits == 0 || b_n_splits == 0 || n_tips == 0) return 0.0; + + const int16 most_splits = std::max(a_n_splits, b_n_splits); + const double n_tips_rcp = 1.0 / static_cast(n_tips); + + constexpr cost max_score = 
BIG; + constexpr double over_max = 1.0 / static_cast(BIG); + const double max_over_tips = static_cast(BIG) * n_tips_rcp; + const double lg2_n = lg2[n_tips]; + + // --- Phase 1: O(n log n) exact-match detection --- + std::vector a_match_buf(a_n_splits); + std::vector b_match_buf(b_n_splits); + + const int16 exact_n = detail::find_exact_matches_raw( + a_state, a_in, a_n_splits, + b_state, b_in, b_n_splits, + n_bins, n_tips, + a_match_buf.data(), b_match_buf.data()); + + const int16* a_match = a_match_buf.data(); + const int16* b_match = b_match_buf.data(); + + // Accumulate exact-match score + double exact_score = 0.0; + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (a_match[ai]) { + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + exact_score += ic_matching(na, nA, static_cast(n_tips)); + } + } + + // Early exit when everything matched exactly + if (exact_n == b_n_splits || exact_n == a_n_splits) { + return exact_score * n_tips_rcp; + } + + // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- + const int16 lap_n = most_splits - exact_n; + + std::vector a_unmatch, b_unmatch; + a_unmatch.reserve(lap_n); + b_unmatch.reserve(lap_n); + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (!a_match[ai]) a_unmatch.push_back(ai); + } + for (int16 bi = 0; bi < b_n_splits; ++bi) { + if (!b_match[bi]) b_unmatch.push_back(bi); + } + + scratch.score_pool.resize(lap_n); + CostMatrix& score = scratch.score_pool; + + const int16 a_unmatched_n = static_cast(a_unmatch.size()); + const int16 b_unmatched_n = static_cast(b_unmatch.size()); + + for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const int16 ai = a_unmatch[a_pos]; + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + const splitbit* a_row = a_state[ai]; + + const double offset_a = lg2_n - lg2[na]; + const double offset_A = lg2_n - lg2[nA]; + + for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const int16 bi = b_unmatch[b_pos]; + const splitbit* 
b_row = b_state[bi]; + int16 a_and_b = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + a_and_b += TreeTools::count_bits(a_row[bin] & b_row[bin]); + } + + const int16 nb = b_in[bi]; + const int16 nB = static_cast(n_tips - nb); + const int16 a_and_B = na - a_and_b; + const int16 A_and_b = nb - a_and_b; + const int16 A_and_B = nA - A_and_b; + + if (a_and_b == A_and_b && a_and_b == a_and_B + && a_and_b == A_and_B) { + score(a_pos, b_pos) = max_score; + } else { + const double lg2_nb = lg2[nb]; + const double lg2_nB = lg2[nB]; + const double ic_sum = + a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + + a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + + A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + + A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + score(a_pos, b_pos) = + max_score - static_cast(ic_sum * max_over_tips); + } + } + if (b_unmatched_n < lap_n) { + score.padRowAfterCol(a_pos, b_unmatched_n, max_score); + } + } + if (a_unmatched_n < lap_n) { + score.padAfterRow(a_unmatched_n, max_score); + } + + // --- Phase 3: solve LAP on the reduced matrix --- + scratch.ensure(lap_n); + auto& rowsol = scratch.rowsol; + auto& colsol = scratch.colsol; + + const double lap_score = + static_cast((max_score * lap_n) - + lap(lap_n, score, rowsol, colsol, false, scratch)) * over_max; + return lap_score + exact_score * n_tips_rcp; +} + +} // namespace TreeDist + +#endif // TREEDIST_MCI_IMPLEMENTATION +#endif // TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ diff --git a/inst/include/TreeDist/types.h b/inst/include/TreeDist/types.h new file mode 100644 index 00000000..c953989b --- /dev/null +++ b/inst/include/TreeDist/types.h @@ -0,0 +1,32 @@ +#ifndef TREEDIST_TYPES_H_ +#define TREEDIST_TYPES_H_ + +// TreeDist public type definitions. +// +// Rcpp-free: uses only standard headers plus TreeTools (for SL_MAX_SPLITS). 
+// Downstream packages include via: #include + +#include +#include +#include // SL_MAX_SPLITS, splitbit + +namespace TreeDist { + + using int16 = int_fast16_t; + using int32 = int_fast32_t; + using cost = int_fast64_t; + + using lap_dim = int; + using lap_row = lap_dim; + using lap_col = lap_dim; + + constexpr cost BIG = + (std::numeric_limits::max)() / SL_MAX_SPLITS; + + constexpr cost ROUND_PRECISION = 2048 * 2048; + + constexpr std::size_t BLOCK_SIZE = 16; + +} // namespace TreeDist + +#endif // TREEDIST_TYPES_H_ diff --git a/man/HierarchicalMutualInfo.Rd b/man/HierarchicalMutualInfo.Rd index 9e4788d7..193b18ba 100644 --- a/man/HierarchicalMutualInfo.Rd +++ b/man/HierarchicalMutualInfo.Rd @@ -134,6 +134,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \concept{tree distances} diff --git a/man/JaccardRobinsonFoulds.Rd b/man/JaccardRobinsonFoulds.Rd index 5df32914..7e515be7 100644 --- a/man/JaccardRobinsonFoulds.Rd +++ b/man/JaccardRobinsonFoulds.Rd @@ -136,6 +136,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/KendallColijn.Rd b/man/KendallColijn.Rd index cfd506b5..57d01772 100644 --- a/man/KendallColijn.Rd +++ b/man/KendallColijn.Rd @@ -135,6 +135,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/MASTSize.Rd b/man/MASTSize.Rd index 5ecdd271..db028b0a 100644 --- a/man/MASTSize.Rd +++ b/man/MASTSize.Rd @@ -72,6 +72,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/MatchingSplitDistance.Rd 
b/man/MatchingSplitDistance.Rd index 83af446f..b5d64b27 100644 --- a/man/MatchingSplitDistance.Rd +++ b/man/MatchingSplitDistance.Rd @@ -94,6 +94,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/NNIDist.Rd b/man/NNIDist.Rd index 70895e04..5e24301b 100644 --- a/man/NNIDist.Rd +++ b/man/NNIDist.Rd @@ -111,6 +111,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/NyeSimilarity.Rd b/man/NyeSimilarity.Rd index 892ed185..d984b129 100644 --- a/man/NyeSimilarity.Rd +++ b/man/NyeSimilarity.Rd @@ -132,6 +132,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/PathDist.Rd b/man/PathDist.Rd index 5cbfe90c..1192d4ce 100644 --- a/man/PathDist.Rd +++ b/man/PathDist.Rd @@ -78,6 +78,7 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/Robinson-Foulds.Rd b/man/Robinson-Foulds.Rd index a425e8d1..7510e094 100644 --- a/man/Robinson-Foulds.Rd +++ b/man/Robinson-Foulds.Rd @@ -154,6 +154,7 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{PathDist}()}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/SPRDist.Rd b/man/SPRDist.Rd index 42fc9b2c..5075abda 100644 --- a/man/SPRDist.Rd +++ b/man/SPRDist.Rd @@ -84,6 +84,7 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git 
a/man/TransferConsensus.Rd b/man/TransferConsensus.Rd new file mode 100644 index 00000000..afd610b7 --- /dev/null +++ b/man/TransferConsensus.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transfer_consensus.R +\name{TransferConsensus} +\alias{TransferConsensus} +\title{Consensus tree minimizing transfer distance} +\usage{ +TransferConsensus( + trees, + scale = TRUE, + greedy = c("best", "first"), + init = c("empty", "majority") +) +} +\arguments{ +\item{trees}{An object of class \code{multiPhylo}: the input trees. +All trees must share the same tip labels.} + +\item{scale}{Logical; if \code{TRUE} (default), use the scaled transfer +distance (normalized by light-side size minus one). If \code{FALSE}, use +the unscaled (raw Hamming) transfer distance.} + +\item{greedy}{Character string specifying the greedy strategy: +\code{"best"} (default) picks the single highest-benefit action at each step; +\code{"first"} picks the first improving action encountered (faster but +potentially lower quality).} + +\item{init}{Character string specifying the initial consensus: +\code{"empty"} (default) starts with no splits (purely additive); +\code{"majority"} starts with the majority-rule consensus and refines.} +} +\value{ +A tree of class \code{phylo}. +} +\description{ +Construct a consensus tree that minimizes the sum of transfer distances +to a set of input trees, using a greedy add-and-prune heuristic. +} +\details{ +Unlike the majority-rule consensus, which minimizes Robinson-Foulds +distance and can be highly unresolved when phylogenetic signal is low, +\code{TransferConsensus()} uses the finer-grained transfer distance +\insertCite{Lemoine2018}{TreeDist} to construct a more resolved consensus +tree. + +The algorithm pools all splits observed across input trees, computes +pairwise transfer distances between them, and greedily adds or removes +splits to minimize total transfer dissimilarity cost. 
The approach +follows \insertCite{Takazawa2026;textual}{TreeDist}, reimplemented for +'TreeDist' infrastructure. +} +\examples{ +library(TreeTools) +# Generate bootstrap-like trees +trees <- as.phylo(1:20, nTip = 12) + +# Transfer consensus (more resolved than majority rule) +tc <- TransferConsensus(trees) +plot(tc) + +# Compare resolution +mr <- Consensus(trees, p = 0.5) +cat("Majority-rule splits:", NSplits(mr), "\n") +cat("Transfer consensus splits:", NSplits(tc), "\n") + +} +\references{ +\insertAllCited{} +} diff --git a/man/TransferDist.Rd b/man/TransferDist.Rd new file mode 100644 index 00000000..2439130f --- /dev/null +++ b/man/TransferDist.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_distance_transfer.R +\name{TransferDist} +\alias{TransferDist} +\alias{TransferDistance} +\alias{TransferDistSplits} +\title{Transfer dissimilarity between phylogenetic trees} +\usage{ +TransferDist( + tree1, + tree2 = NULL, + scale = TRUE, + normalize = FALSE, + reportMatching = FALSE +) + +TransferDistance( + tree1, + tree2 = NULL, + scale = TRUE, + normalize = FALSE, + reportMatching = FALSE +) + +TransferDistSplits( + splits1, + splits2, + nTip = attr(splits1, "nTip"), + scale = TRUE, + reportMatching = FALSE +) +} +\arguments{ +\item{tree1, tree2}{Trees of class \code{phylo}, with leaves labelled identically, +or lists of such trees to undergo pairwise comparison. Where implemented, +\code{tree2 = NULL} will compute distances between each pair of trees in the list +\code{tree1} using a fast algorithm based on +\insertCite{Day1985;textual}{TreeDist}.} + +\item{scale}{Logical; if \code{TRUE} (default), use the scaled transfer +dissimilarity. If \code{FALSE}, use the unscaled transfer dissimilarity.} + +\item{normalize}{If a numeric value is provided, this will be used as a +maximum value against which to rescale results. 
+If \code{TRUE}, results will be rescaled against a maximum value calculated from +the specified tree sizes and topology, as specified in the "Normalization" +section below. +If \code{FALSE}, results will not be rescaled.} + +\item{reportMatching}{Logical specifying whether to return the clade +matchings as an attribute of the score.} + +\item{splits1, splits2}{Logical matrices where each row corresponds to a leaf, +either listed in the same order or bearing identical names (in any sequence), +and each column corresponds to a split, such that each leaf is identified as +a member of the ingroup (\code{TRUE}) or outgroup (\code{FALSE}) of the respective +split.} + +\item{nTip}{(Optional) Integer specifying the number of leaves in each split.} +} +\value{ +\code{TransferDist()} returns an object of class \code{dist} (if \code{tree2} is +\code{NULL}), a numeric matrix (if both \code{tree1} and \code{tree2} are lists), or a +numeric value (for a single pair). If \code{reportMatching = TRUE}, the +return value carries \code{matching} and \code{pairScores} attributes. +} +\description{ +Compute the transfer dissimilarity between phylogenetic trees, as defined +by \insertCite{Takazawa2026;textual}{TreeDist}. +The transfer dissimilarity uses the transfer distance +\insertCite{Lemoine2018}{TreeDist} to compare bipartitions, providing a +finer-grained measure than the Robinson--Foulds distance. Each branch in +each tree is scored by how many taxa must be moved to match its closest +counterpart in the other tree, and these scores are summed. +} +\details{ +The \code{scaled} variant divides each branch's contribution by its depth minus +one, giving equal weight to all branches regardless of their depth (analogous +to the Robinson--Foulds distance). The \code{unscaled} variant uses raw transfer +distances, giving more weight to deep branches. Neither variant satisfies +the triangle inequality for trees with six or more tips. 
+} +\section{Normalization}{ + + +When \code{normalize = TRUE}, the scaled transfer dissimilarity is divided by +\code{2 * (n - 3)}, placing it in the range [0, 1]. The unscaled version is +divided by the maximum possible unscaled dissimilarity +(following \insertCite{Takazawa2026;textual}{TreeDist}). +} + +\examples{ +library(TreeTools) +TransferDist(BalancedTree(8), PectinateTree(8)) +TransferDist(BalancedTree(8), PectinateTree(8), scale = FALSE) + +# All-pairs +TransferDist(as.phylo(0:5, 8)) + +} +\references{ +\insertAllCited{} +} +\seealso{ +Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, +\code{\link{JaccardRobinsonFoulds}()}, +\code{\link{KendallColijn}()}, +\code{\link{MASTSize}()}, +\code{\link{MatchingSplitDistance}()}, +\code{\link{NNIDist}()}, +\code{\link{NyeSimilarity}()}, +\code{\link{PathDist}()}, +\code{\link{Robinson-Foulds}}, +\code{\link{SPRDist}()}, +\code{\link{TreeDistance}()} +} +\concept{tree distances} diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd index 352d169c..f081b6d3 100644 --- a/man/TreeDistance.Rd +++ b/man/TreeDistance.Rd @@ -328,7 +328,8 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()} +\code{\link{SPRDist}()}, +\code{\link{TransferDist}()} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. 
Smith} diff --git a/man/cpp_transfer_consensus.Rd b/man/cpp_transfer_consensus.Rd new file mode 100644 index 00000000..a8ebc915 --- /dev/null +++ b/man/cpp_transfer_consensus.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_consensus} +\alias{cpp_transfer_consensus} +\title{Transfer consensus (C++ implementation)} +\usage{ +cpp_transfer_consensus( + splits_list, + n_tip, + scale, + greedy_best_flag, + init_majority, + n_threads = 1L +) +} +\arguments{ +\item{splits_list}{List of raw matrices (one per tree), each from as.Splits().} + +\item{n_tip}{Number of tips.} + +\item{scale}{Logical: use scaled transfer distance?} + +\item{greedy_best_flag}{Logical: TRUE for "best", FALSE for "first".} + +\item{init_majority}{Logical: TRUE to start from majority-rule splits.} +} +\value{ +A \code{LogicalVector} of length n_splits indicating which pooled splits +are included in the consensus, plus attributes "raw_splits" (a raw matrix +of all unique splits) and "light_side" (integer vector). 
+} +\description{ +Transfer consensus (C++ implementation) +} +\keyword{internal} diff --git a/man/cpp_transfer_dist.Rd b/man/cpp_transfer_dist.Rd new file mode 100644 index 00000000..204f1a01 --- /dev/null +++ b/man/cpp_transfer_dist.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_dist} +\alias{cpp_transfer_dist} +\title{Per-pair transfer dissimilarity} +\usage{ +cpp_transfer_dist(x, y, nTip) +} +\arguments{ +\item{x, y}{Raw matrices representing splits (from as.Splits()).} + +\item{nTip}{Integer: number of tips.} +} +\value{ +A list with components: +\itemize{ +\item score_scaled: scaled transfer dissimilarity (double) +\item score_unscaled: unscaled transfer dissimilarity (double) +\item \code{matching_xy}: integer vector, best match in y for each split in x (1-based, NA if sentinel) +\item \code{matching_yx}: integer vector, best match in x for each split in y (1-based, NA if sentinel) +} +} +\description{ +Per-pair transfer dissimilarity +} +\keyword{internal} diff --git a/man/cpp_transfer_dist_all_pairs.Rd b/man/cpp_transfer_dist_all_pairs.Rd new file mode 100644 index 00000000..92af73d4 --- /dev/null +++ b/man/cpp_transfer_dist_all_pairs.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_dist_all_pairs} +\alias{cpp_transfer_dist_all_pairs} +\title{All-pairs transfer dissimilarity (OpenMP)} +\usage{ +cpp_transfer_dist_all_pairs(splits_list, n_tip, scale, n_threads) +} +\arguments{ +\item{splits_list}{List of raw matrices (one per tree).} + +\item{n_tip}{Number of tips.} + +\item{scale}{Logical: use scaled transfer dissimilarity?} + +\item{n_threads}{Number of OpenMP threads.} +} +\value{ +Numeric vector of length choose(N,2) in dist order. 
+} +\description{ +All-pairs transfer dissimilarity (OpenMP) +} +\keyword{internal} diff --git a/man/cpp_transfer_dist_cross_pairs.Rd b/man/cpp_transfer_dist_cross_pairs.Rd new file mode 100644 index 00000000..60c55106 --- /dev/null +++ b/man/cpp_transfer_dist_cross_pairs.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_dist_cross_pairs} +\alias{cpp_transfer_dist_cross_pairs} +\title{Cross-pairs transfer dissimilarity (OpenMP)} +\usage{ +cpp_transfer_dist_cross_pairs(splits_a, splits_b, n_tip, scale, n_threads) +} +\arguments{ +\item{splits_a, splits_b}{Lists of raw matrices.} + +\item{n_tip}{Number of tips.} + +\item{scale}{Logical: use scaled transfer dissimilarity?} + +\item{n_threads}{Number of OpenMP threads.} +} +\value{ +Numeric matrix of dimension \code{nA} x \code{nB}. +} +\description{ +Cross-pairs transfer dissimilarity (OpenMP) +} +\keyword{internal} diff --git a/man/dot-SPRPairDeO.Rd b/man/dot-SPRPairDeO.Rd new file mode 100644 index 00000000..c846b761 --- /dev/null +++ b/man/dot-SPRPairDeO.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_distance_spr.R +\name{.SPRPairDeO} +\alias{.SPRPairDeO} +\title{SPR approximation via Oliveira Martins \emph{et al.} (2008)} +\usage{ +.SPRPairDeO(tree1, tree2, check = TRUE) +} +\description{ +SPR approximation via Oliveira Martins \emph{et al.} (2008) +} +\examples{ +# de Oliveira Martins et al 2008, fig. 
7 +tree1 <- ape::read.tree(text = "((1, 2), ((a, b), (c, d)), (3, (4, (5, (6, 7)))));") +tree2 <- ape::read.tree(text = "((1, 2), 3, (4, (5, (((a, b), (c, d)), (6, 7)))));") +oPar <- par(mfrow =c(2, 1), mar = rep(0, 4)) +plot(tree1) +plot(tree2) +par(oPar) +SPRDist(tree1, tree2, method = "deO") +} +\keyword{internal} diff --git a/src/Makevars b/src/Makevars index 139b1d17..d3df534e 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/Makevars.win b/src/Makevars.win index 139b1d17..d3df534e 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 443df8c1..72309c5b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -522,6 +522,108 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cpp_transfer_consensus +List cpp_transfer_consensus(const List& splits_list, const int n_tip, const bool scale, const bool greedy_best_flag, const bool init_majority, const int n_threads); +RcppExport SEXP _TreeDist_cpp_transfer_consensus(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP greedy_best_flagSEXP, SEXP init_majoritySEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP); + Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< const bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< const bool >::type greedy_best_flag(greedy_best_flagSEXP); + Rcpp::traits::input_parameter< const bool >::type init_majority(init_majoritySEXP); + Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = 
Rcpp::wrap(cpp_transfer_consensus(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_tc_profile +Rcpp::NumericVector cpp_tc_profile(const List& splits_list, const int n_tip, const bool scale, const bool greedy_best_flag, const bool init_majority, const int n_iter, const int n_threads); +RcppExport SEXP _TreeDist_cpp_tc_profile(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP greedy_best_flagSEXP, SEXP init_majoritySEXP, SEXP n_iterSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP); + Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< const bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< const bool >::type greedy_best_flag(greedy_best_flagSEXP); + Rcpp::traits::input_parameter< const bool >::type init_majority(init_majoritySEXP); + Rcpp::traits::input_parameter< const int >::type n_iter(n_iterSEXP); + Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_tc_profile(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_iter, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist +List cpp_transfer_dist(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); +RcppExport SEXP _TreeDist_cpp_transfer_dist(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nTip(nTipSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist(x, y, nTip)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist_scored +List cpp_transfer_dist_scored(const 
RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip, bool scale); +RcppExport SEXP _TreeDist_cpp_transfer_dist_scored(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP, SEXP scaleSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nTip(nTipSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist_scored(x, y, nTip, scale)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist_all_pairs +NumericVector cpp_transfer_dist_all_pairs(const List& splits_list, int n_tip, bool scale, int n_threads); +RcppExport SEXP _TreeDist_cpp_transfer_dist_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP); + Rcpp::traits::input_parameter< int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist_all_pairs(splits_list, n_tip, scale, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist_cross_pairs +NumericMatrix cpp_transfer_dist_cross_pairs(const List& splits_a, const List& splits_b, int n_tip, bool scale, int n_threads); +RcppExport SEXP _TreeDist_cpp_transfer_dist_cross_pairs(SEXP splits_aSEXP, SEXP splits_bSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_a(splits_aSEXP); + Rcpp::traits::input_parameter< const List& >::type splits_b(splits_bSEXP); + 
Rcpp::traits::input_parameter< int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist_cross_pairs(splits_a, splits_b, n_tip, scale, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_mci_impl_score +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, const Rcpp::RawMatrix& y, int n_tips); +RcppExport SEXP _TreeDist_cpp_mci_impl_score(SEXP xSEXP, SEXP ySEXP, SEXP n_tipsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< int >::type n_tips(n_tipsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_mci_impl_score(x, y, n_tips)); + return rcpp_result_gen; +END_RCPP +} // cpp_robinson_foulds_distance List cpp_robinson_foulds_distance(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); RcppExport SEXP _TreeDist_cpp_robinson_foulds_distance(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { @@ -658,6 +760,13 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_keep_and_reroot", (DL_FUNC) &_TreeDist_keep_and_reroot, 3}, {"_TreeDist_keep_and_reduce", (DL_FUNC) &_TreeDist_keep_and_reduce, 3}, {"_TreeDist_spr_table_7", (DL_FUNC) &_TreeDist_spr_table_7, 2}, + {"_TreeDist_cpp_transfer_consensus", (DL_FUNC) &_TreeDist_cpp_transfer_consensus, 6}, + {"_TreeDist_cpp_tc_profile", (DL_FUNC) &_TreeDist_cpp_tc_profile, 7}, + {"_TreeDist_cpp_transfer_dist", (DL_FUNC) &_TreeDist_cpp_transfer_dist, 3}, + {"_TreeDist_cpp_transfer_dist_scored", (DL_FUNC) &_TreeDist_cpp_transfer_dist_scored, 4}, + {"_TreeDist_cpp_transfer_dist_all_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_all_pairs, 4}, + {"_TreeDist_cpp_transfer_dist_cross_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_cross_pairs, 5}, + 
{"_TreeDist_cpp_mci_impl_score", (DL_FUNC) &_TreeDist_cpp_mci_impl_score, 3}, {"_TreeDist_cpp_robinson_foulds_distance", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_distance, 3}, {"_TreeDist_cpp_robinson_foulds_info", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_info, 3}, {"_TreeDist_cpp_matching_split_distance", (DL_FUNC) &_TreeDist_cpp_matching_split_distance, 3}, diff --git a/src/day_1985.cpp b/src/day_1985.cpp index 2f449aae..6773fdea 100644 --- a/src/day_1985.cpp +++ b/src/day_1985.cpp @@ -3,6 +3,8 @@ using namespace Rcpp; #include "tree_distances.h" /* includes */ #include "information.h" +#include +#include #include /* for root_on_node() */ #include /* for root_on_node() */ @@ -96,7 +98,7 @@ double calc_consensus_info(const List &trees, const LogicalVector &phylo, std::vector tables; if (std::size_t(n_trees) > tables.max_size()) { - Rcpp::stop("Not enough memory available to compute consensus of so many trees"); // LCOV_EXCL_LINE + ASSERT(false && "Not enough memory for consensus of so many trees"); // LCOV_EXCL_LINE } tables.reserve(n_trees); @@ -321,31 +323,19 @@ IntegerVector robinson_foulds_all_pairs(const List& tables) { // [[Rcpp::export]] double consensus_info(const List trees, const LogicalVector phylo, const NumericVector p) { - if (p[0] > 1 + 1e-15) { // epsilon catches floating point error - Rcpp::stop("p must be <= 1.0 in consensus_info()"); - } else if (p[0] < 0.5) { - Rcpp::stop("p must be >= 0.5 in consensus_info()"); - } + // Validated by R caller (ConsensusInfo checks p range) + ASSERT(p[0] <= 1 + 1e-15 && "p must be <= 1.0 in consensus_info()"); + ASSERT(p[0] >= 0.5 && "p must be >= 0.5 in consensus_info()"); - // First, peek at the tree size to determine allocation strategy - // We'll create a temporary ClusterTable just to check the size - try { - ClusterTable temp_table(Rcpp::List(trees(0))); - const int32 n_tip = temp_table.N(); - - if (n_tip <= ct_stack_threshold) { - // Small tree: use stack-allocated array - std::array S; - return 
calc_consensus_info(trees, phylo, p, S); - } else { - // Large tree: use heap-allocated vector - std::vector S(n_tip); - return calc_consensus_info(trees, phylo, p, S); - } - } catch(const std::exception& e) { - Rcpp::stop(e.what()); + // Peek at tree size to choose stack vs heap allocation for the work buffer + ClusterTable temp_table(Rcpp::List(trees(0))); + const int32 n_tip = temp_table.N(); + + if (n_tip <= ct_stack_threshold) { + std::array S; + return calc_consensus_info(trees, phylo, p, S); + } else { + std::vector S(n_tip); + return calc_consensus_info(trees, phylo, p, S); } - - ASSERT(false && "Unreachable code in consensus_tree"); - return 0.0; } diff --git a/src/hmi.cpp b/src/hmi.cpp index bb594a7f..bcbe7c6a 100644 --- a/src/hmi.cpp +++ b/src/hmi.cpp @@ -139,9 +139,8 @@ double hierarchical_self_info(const std::vector& nodes, size_t double HMI_xptr(SEXP ptr1, SEXP ptr2) { Rcpp::XPtr hp1(ptr1); Rcpp::XPtr hp2(ptr2); - if (hp1->nodes[hp1->root].n_tip != hp2->nodes[hp2->root].n_tip) { - Rcpp::stop("Trees must have the same number of leaves"); - } + ASSERT(hp1->nodes[hp1->root].n_tip == hp2->nodes[hp2->root].n_tip + && "Trees must have the same number of leaves"); return TreeDist::hierarchical_mutual_info(hp1->nodes, hp1->root, hp2->nodes, hp2->root).second; } @@ -170,12 +169,8 @@ Rcpp::NumericVector EHMI_xptr(SEXP hp1_ptr, SEXP hp2_ptr, double tolerance = 0.01, int minResample = 36) { - if (minResample < 2) { - Rcpp::stop("Must perform at least one resampling"); - } - if (tolerance < 1e-8) { - Rcpp::stop("Tolerance too low"); - } + ASSERT(minResample >= 2 && "Must perform at least one resampling"); + ASSERT(tolerance >= 1e-8 && "Tolerance too low"); Rcpp::XPtr hp1(hp1_ptr); Rcpp::XPtr hp2(hp2_ptr); diff --git a/src/hpart.cpp b/src/hpart.cpp index ffbfcaea..22bab059 100644 --- a/src/hpart.cpp +++ b/src/hpart.cpp @@ -126,7 +126,7 @@ size_t build_node_from_list(const RObject& node, if (Rf_isInteger(node) || Rf_isReal(node)) { const IntegerVector 
leaf_vec(node); if (leaf_vec.size() != 1) { - Rcpp::stop("List must only contain integers, not vectors of integers"); // #nocov + ASSERT(false && "List must only contain integers, not vectors of integers"); // #nocov } const int leaf_label = leaf_vec[0]; // 1-based R leaf label const size_t leaf_idx = leaf_label - 1; // 0-based label for HNode @@ -171,8 +171,9 @@ size_t build_node_from_list(const RObject& node, return my_idx; } - // Invalid node type - Rcpp::stop("Invalid node type"); // #nocov + // Invalid node type — unreachable when R caller validates input + ASSERT(false && "Invalid node type"); // #nocov + return 0; // #nocov } diff --git a/src/hpart_relabel.cpp b/src/hpart_relabel.cpp index 6f7906ef..1337c91d 100644 --- a/src/hpart_relabel.cpp +++ b/src/hpart_relabel.cpp @@ -13,7 +13,7 @@ void recompute_bitsets_postorder(TreeDist::HPart &hpart, const size_t node_idx, if (node.children.empty()) { // Leaf node if (node.leaf_count != 1) { - Rcpp::stop("Leaf node has leaf_count != 1"); // #nocov + ASSERT(node.leaf_count == 1 && "Leaf node has leaf_count != 1"); // #nocov } int new_index = mapping[node.label]; // mapping is 0-based node.label = new_index; diff --git a/src/information.h b/src/information.h index 2db2340b..cb2ce58b 100644 --- a/src/information.h +++ b/src/information.h @@ -2,7 +2,7 @@ #define _TREEDIST_INFO_H #include -#include /* for log2() */ +#include #include /* for log2() */ #include "ints.h" /* for int16 */ @@ -48,14 +48,12 @@ __attribute__((constructor)) inline double split_phylo_info(const int32 n_in, const int32 *n_tip, const double p) { - if (*n_tip > CT_MAX_LEAVES) { - Rcpp::stop("This many leaves are not yet supported."); - } + ASSERT(*n_tip <= CT_MAX_LEAVES && "This many leaves are not yet supported."); const int32 n_out = *n_tip - n_in; - assert(p > 0); - assert(p <= 1); - assert(n_in > 1); - assert(n_out > 1); + ASSERT(p > 0); + ASSERT(p <= 1); + ASSERT(n_in > 1); + ASSERT(n_out > 1); if (p == 1) { return (l2unrooted[*n_tip] - 
l2rooted[n_in] - l2rooted[n_out]); } else { @@ -74,14 +72,12 @@ inline double split_phylo_info(const int32 n_in, const int32 *n_tip, inline double split_clust_info(const int32 n_in, const int32 *n_tip, const double p) { - if (*n_tip > CT_MAX_LEAVES) { - Rcpp::stop("This many leaves are not yet supported."); - } + ASSERT(*n_tip <= CT_MAX_LEAVES && "This many leaves are not yet supported."); const int32 n_out = *n_tip - n_in; - assert(p > 0); - assert(p <= 1); - assert(n_in > 1); - assert(n_out > 1); + ASSERT(p > 0); + ASSERT(p <= 1); + ASSERT(n_in > 1); + ASSERT(n_out > 1); return -p * ( (((n_in * l2[n_in]) + (n_out * l2[n_out])) / *n_tip) - l2[*n_tip] ); diff --git a/src/ints.h b/src/ints.h index a5af8774..50fd3cc7 100644 --- a/src/ints.h +++ b/src/ints.h @@ -1,10 +1,13 @@ #ifndef _TREEDIST_INT_H #define _TREEDIST_INT_H -#include +#include -using int16 = int_fast16_t; -using int32 = int_fast32_t; +// Re-export shared types to global scope for backward compatibility. +using TreeDist::int16; +using TreeDist::int32; + +// Types used only within TreeDist's own source. using uint32 = uint_fast32_t; using grf_match = std::vector; diff --git a/src/lap.cpp b/src/lap.cpp index 4963fa9e..e77c712f 100644 --- a/src/lap.cpp +++ b/src/lap.cpp @@ -26,6 +26,15 @@ https://www.mathworks.com/matlabcentral/fileexchange/26836-lapjv-jonker-volgenant-algorithm-for-linear-assignment-problem-v3-0: * *************************************************************************/ + +// NOTE: The LAP hot loops are highly sensitive to instruction alignment and +// register allocation, which are affected by the TU's full include graph. +// Do NOT include lap_impl.h here — that header is for downstream LinkingTo +// consumers only. TreeDist's own lap() is compiled directly in this file +// to preserve the codegen context that was profiled and tuned. +// +// If the algorithm changes, update BOTH this file and lap_impl.h. 
+ #include "lap.h" #include @@ -37,11 +46,42 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { const lap_dim spare_rows = n_row - n_col; const cost max_score = cost(BIG / max_dim); const double x_max = maxX[0]; + const double scale_factor = max_score / x_max; std::vector rowsol(max_dim); std::vector colsol(max_dim); - cost_matrix input(x, x_max); + // Build cost matrix. Fill the transposed buffer first (matching R's + // column-major storage for sequential reads) then untranspose. + cost_matrix input(max_dim); + const double* __restrict__ src_data = REAL(x); + cost* __restrict__ t_ptr = input.col(0); + const std::size_t dim8 = input.dim8(); + + for (lap_col c = 0; c < n_col; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + const std::size_t s_off = static_cast(c) * n_row; + for (lap_row r = 0; r < n_row; ++r) { + t_ptr[t_off + r] = static_cast(src_data[s_off + r] * scale_factor); + } + // Pad remaining rows in this transposed column + for (lap_row r = n_row; r < max_dim; ++r) { + t_ptr[t_off + r] = max_score; + } + for (std::size_t r = max_dim; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + // Pad remaining transposed columns + for (lap_col c = n_col; c < max_dim; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + for (std::size_t r = 0; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + + // Untranspose: t_data_ -> data_ + input.makeUntranspose(); cost score = lap(max_dim, input, rowsol, colsol); @@ -60,27 +100,22 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { ); } +namespace { inline bool nontrivially_less_than(cost a, cost b) noexcept { return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; } +} // anonymous namespace /* This function is the jv shortest augmenting path algorithm to solve the assignment problem */ +namespace TreeDist { + cost lap(const lap_row dim, - cost_matrix &input_cost, + CostMatrix &input_cost, std::vector &rowsol, std::vector &colsol, const bool allow_interrupt, LapScratch &scratch) - - // input: - // dim - problem size - // input_cost - cost matrix - - // output: - // rowsol - column assigned to row in solution - // colsol - row assigned to column in solution - { lap_row num_free = 0; scratch.ensure(dim); @@ -316,3 +351,4 @@ cost lap(const lap_row dim, return lapcost; } +} // namespace TreeDist diff --git a/src/lap.h b/src/lap.h index b1a22508..592ce9b2 100644 --- a/src/lap.h +++ b/src/lap.h @@ -9,456 +9,27 @@ // because the LAP hot loops are sensitive to instruction alignment — even // unrelated inline functions in the same translation unit can shift code // layout enough to cause measurable regressions. - -#include -#include - -#include /* for numeric_limits */ -#include /* for vector */ - -#include "ints.h" - -/*************** TYPES *******************/ - -using cost = int_fast64_t; -using lap_dim = int; -using lap_row = lap_dim; -using lap_col = lap_dim; - -/* For a reason I've not determined, shrinking BIG is necessary to avoid - * an infinite loop in lap. */ -constexpr cost BIG = (std::numeric_limits::max)() / SL_MAX_SPLITS; -constexpr cost ROUND_PRECISION = 2048 * 2048; -constexpr size_t BLOCK_SIZE = 16; - -class CostMatrix { -private: - size_t dim_; // Important not to use int16, which will overflow on * - size_t dim8_; - alignas(64) std::vector data_; - alignas(64) std::vector t_data_; - bool transposed_; - - static size_t block_containing(const size_t x) { - return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; - } - -public: - // Default constructor for pooled instances (zero-size, no allocation). 
- CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} - - CostMatrix(size_t dim) - : dim_(dim), - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim8_)), - t_data_(std::vector(dim8_ * dim8_)), - transposed_(false) {} - - // Resize for reuse. Only reallocates when the new dimension exceeds the - // current buffer capacity; otherwise just updates dim_/dim8_ and marks - // the transpose as stale. New elements (if any) are value-initialised. - void resize(size_t new_dim) { - dim_ = new_dim; - dim8_ = block_containing(new_dim); - const size_t needed = dim8_ * dim8_; - if (data_.size() < needed) { - data_.resize(needed); - t_data_.resize(needed); - } - transposed_ = false; - } - - CostMatrix(const Rcpp::NumericMatrix& src, const double x_max) - : dim_((std::max(src.nrow(), src.ncol()))), // or pad here as needed - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim_)), - t_data_(std::vector(dim8_ * dim_)), - transposed_(true) - { - // Compute scale factor - const cost max_score = cost(BIG / dim_); - double scale_factor = max_score / x_max; - - const lap_row n_row = src.nrow(); - const lap_col n_col = src.ncol(); - - const double* __restrict src_data = REAL(src); - cost* __restrict dest_data = t_data_.data(); - - for (lap_col c = 0; c < n_col; ++c) { - const size_t data_c = c * dim8_; - const size_t src_c = c * n_row; - for (lap_row r = 0; r < n_row; ++r) { - // Marginally faster than std::transform - dest_data[data_c + r] = static_cast(src_data[src_c + r] * scale_factor); - } - - padTrColAfterRow(c, n_row, max_score); // padding now goes after row `c` - } - - padTrAfterCol(n_col, max_score); - - makeUntranspose(); - } - - // Reset for reuse at the same dimension — clears the transpose cache so the - // next findColMin() re-builds it from fresh data. Call between pair - // computations when reusing a thread-local CostMatrix instance. 
- void reset() noexcept { transposed_ = false; } - - [[nodiscard]] size_t dim() const noexcept { return dim_; } - - void makeTranspose() noexcept { - if (transposed_) return; - - const cost* __restrict data_ptr = data_.data(); - cost* __restrict t_data_ptr = t_data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - - transposed_ = true; - } - - void makeUntranspose() noexcept { - const cost* __restrict data_ptr = t_data_.data(); - cost* __restrict t_data_ptr = data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - } - - - // Access operator for read/write - cost& operator()(lap_row i, lap_col j) { - return data_[static_cast(i) * dim8_ + j]; - } - - // Const version for read-only access - [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { - return data_[static_cast(i) * dim8_ + j]; - } - - [[nodiscard]] const cost& row0(lap_col j) const { - return data_[j]; - } - - [[nodiscard]] const cost& entry0(lap_row i) const { - return data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* row(lap_row i) { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const 
cost* row(lap_row i) const { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* col(lap_col i) { - return &t_data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const cost* col(lap_col i) const { - return &t_data_[static_cast(i) * dim8_]; - } - - void padTrAfterCol(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padAfterRow(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - void padTrColAfterRow(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padRowAfterCol(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - std::pair findColMin(lap_col j, - lap_row search_dim = -1) { - makeTranspose(); - const cost* col_data = col(j); - const lap_row n = (search_dim < 0) ? static_cast(dim_) : search_dim; - const auto min_ptr = std::min_element(col_data, col_data + n); - return {*min_ptr, - static_cast(std::distance(col_data, min_ptr))}; - } - - // test2000 = 119 ms (!) - // Find minimum and second minimum reduced cost over columns. 
- std::tuple findRowSubmin( - const lap_row* i, const std::vector& v - ) const { - assert(dim_ > 1); - - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - const cost* __restrict v_ptr = v.data(); - - const cost h0 = row_i[0] - v_ptr[0]; - const cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 > h1) { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } else { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } - - const lap_col j_limit = (dim < 4 ? 0 : static_cast(dim - 3)); - - for (lap_col j = 2; j < j_limit; j += 4) { - assert(BLOCK_SIZE >= 4); // Unrolling loop x4 gives ~20% speedup - const cost h0 = row_i[j] - v_ptr[j]; - const cost h1 = row_i[j + 1] - v_ptr[j + 1]; - const cost h2 = row_i[j + 2] - v_ptr[j + 2]; - const cost h3 = row_i[j + 3] - v_ptr[j + 3]; - if (h0 < submin_val) { - if (h0 < min_val) { - submin_val = min_val; - min_val = h0; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h0; - submin_idx = j; - } - } - if (h1 < submin_val) { - if (h1 < min_val) { - submin_val = min_val; - min_val = h1; - submin_idx = min_idx; - min_idx = j + 1; - } else { - submin_val = h1; - submin_idx = j + 1; - } - } - if (h2 < submin_val) { - if (h2 < min_val) { - submin_val = min_val; - min_val = h2; - submin_idx = min_idx; - min_idx = j + 2; - } else { - submin_val = h2; - submin_idx = j + 2; - } - } - if (h3 < submin_val) { - if (h3 < min_val) { - submin_val = min_val; - min_val = h3; - submin_idx = min_idx; - min_idx = j + 3; - } else { - submin_val = h3; - submin_idx = j + 3; - } - } - } - for (lap_col j = j_limit; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - if (h < min_val) { - submin_val = min_val; - min_val = h; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h; - submin_idx = j; - } - } - } - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 260 ms - 
std::tuple findRowSubminNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - - if (dim < 2) { - return {row_i[0] - v_ptr[0], std::numeric_limits::max(), 0, 0}; - } - - // Initialize with first two elements - cost h0 = row_i[0] - v_ptr[0]; - cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 <= h1) { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } else { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } - - // Process remaining elements - for (lap_col j = 2; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - - if (h < min_val) { - submin_val = min_val; - submin_idx = min_idx; - min_val = h; - min_idx = j; - } else if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 370 ms (!) - std::tuple findRowSubminTwoPassNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - cost min_val = std::numeric_limits::max(); - lap_col min_idx = 0; - const lap_col dim = static_cast(dim_); - - for (lap_col j = 0; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < min_val) { - min_val = h; - min_idx = j; - } - } - - // Second pass: find subminimum - cost submin_val = std::numeric_limits::max(); - lap_col submin_idx = (min_idx == 0) ? 1 : 0; - - for (lap_col j = 0; j < dim; ++j) { - if (j != min_idx) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } -}; - -using cost_matrix = CostMatrix; - -// --------------------------------------------------------------------------- -// LapScratch — reusable heap storage for one thread's LAPJV workspace. 
// -// Pass to the scratch-taking overload of lap() to eliminate the ~6 per-call -// std::vector allocations. In a serial context construct once before the -// loop; in an OpenMP context allocate one per thread and index by -// omp_get_thread_num(). All vectors are grown lazily; never shrunk. -// --------------------------------------------------------------------------- -struct LapScratch { - std::vector v; // column dual variables (padded) - std::vector matches; // assignment-count per row - std::vector freeunassigned; // list of unassigned rows - std::vector col_list; // column scan list - std::vector d; // Dijkstra distances - std::vector predecessor; // augmenting-path predecessors - // rowsol / colsol are included so score functions that don't need the - // solution afterwards can avoid a separate allocation. - std::vector rowsol; - std::vector colsol; - // Pooled CostMatrix instances — reused across pairs to avoid per-call - // heap allocation. Grown lazily on first use, never shrunk. - CostMatrix score_pool; // main cost matrix - CostMatrix small_pool; // reduced matrix (exact-match path) - - void ensure(int dim) noexcept { - const int padded = static_cast( - ((static_cast(dim) + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); - if (static_cast(v.size()) < padded) v.resize(padded); - if (static_cast(matches.size()) < dim) matches.resize(dim); - if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); - if (static_cast(col_list.size()) < dim) col_list.resize(dim); - if (static_cast(d.size()) < dim) d.resize(dim); - if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); - if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); - if (static_cast(colsol.size()) < dim) colsol.resize(dim); - } -}; - -/*************** FUNCTIONS *******************/ - -// Primary overload: caller supplies pre-allocated scratch (zero alloc in hot loop). 
-extern cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt, - LapScratch &scratch); - -// Convenience overload: creates a temporary scratch (for one-off calls). -inline cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt = true) { - LapScratch scratch; - return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); -} +// Public types, CostMatrix, LapScratch, and lap() declarations live in +// inst/include/TreeDist/ headers (shared with downstream LinkingTo users). +// This wrapper re-exports them to global scope for backward compatibility. + +#include // types, CostMatrix, LapScratch, lap() + +#include "ints.h" // grf_match, uint32 (TreeDist-internal types) + +// Re-export shared types and classes to global scope. +using TreeDist::cost; +using TreeDist::lap_dim; +using TreeDist::lap_row; +using TreeDist::lap_col; +using TreeDist::BIG; +using TreeDist::ROUND_PRECISION; +using TreeDist::BLOCK_SIZE; +using TreeDist::CostMatrix; +using TreeDist::LapScratch; +using TreeDist::lap; + +using cost_matrix = TreeDist::CostMatrix; #endif diff --git a/src/mast.cpp b/src/mast.cpp index 23fb961e..c26af5ff 100644 --- a/src/mast.cpp +++ b/src/mast.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "ints.h" using namespace Rcpp; @@ -63,12 +64,8 @@ int cpp_mast (IntegerMatrix edge1, IntegerMatrix edge2, IntegerVector nTip) { n_internal = n_tip - 1, n_all_nodes = n_tip + n_internal, n_edge = edge1.nrow(); - if (edge2.nrow() != n_edge) { - Rcpp::stop("Both trees must contain the same number of edges."); - } - if (n_tip > MAST_MAX_TIP) { - Rcpp::stop("Tree too large; please contact maintainer for advice."); - } + ASSERT(edge2.nrow() == n_edge && "Both trees must contain the same number of edges."); + ASSERT(n_tip <= MAST_MAX_TIP && "Tree too large; please contact maintainer for advice."); int16 t1_left[MAST_MAX_NODE] = {}, 
t1_right[MAST_MAX_NODE] = {}, diff --git a/src/nni_distance.cpp b/src/nni_distance.cpp index ac5c17dc..db3a5b3c 100644 --- a/src/nni_distance.cpp +++ b/src/nni_distance.cpp @@ -226,16 +226,7 @@ grf_match nni_rf_matching ( ASSERT(n_splits > 0); ASSERT(n_tips > 3); - if (n_tips > NNI_MAX_TIPS) { - Rcpp::stop("Cannot calculate NNI distance for trees with so many tips."); - } - - // #nocov start - if (static_cast(n_splits) * static_cast(n_bins) > - static_cast(std::numeric_limits::max())) { - Rcpp::stop("Cannot calculate NNI distance for trees with so many splits."); - } - // #nocov end + ASSERT(n_tips <= NNI_MAX_TIPS); // Validated by exported wrapper const int32_t last_bin = n_bins - 1; const int32_t unset_tips = (n_tips % SL_BIN_SIZE) ? @@ -298,10 +289,7 @@ IntegerVector cpp_nni_distance(const IntegerMatrix& edge1, const IntegerMatrix& edge2, const IntegerVector& nTip) { - if (nTip[0] > NNI_MAX_TIPS) { - Rcpp::stop("Cannot calculate NNI distance for trees with " - "so many tips."); - } + ASSERT(nTip[0] <= NNI_MAX_TIPS && "Cannot calculate NNI distance for trees with so many tips."); const int32_t n_tip = static_cast(nTip[0]); const int32_t node_0 = n_tip; const int32_t node_0_r = n_tip + 1; @@ -316,10 +304,8 @@ IntegerVector cpp_nni_distance(const IntegerMatrix& edge1, int32_t fack_score_bound = 0; int32_t li_score_bound = 0; - if (n_edge != int32_t(edge2.nrow())) { - Rcpp::stop("Both trees must have the same number of edges. 
" - "Is one rooted and the other unrooted?"); - } + ASSERT(n_edge == int32_t(edge2.nrow()) + && "Both trees must have the same number of edges."); if (n_tip < 4) { return(IntegerVector::create(Named("lower") = 0, @@ -349,7 +335,7 @@ IntegerVector cpp_nni_distance(const IntegerMatrix& edge1, const int32_t n_splits = n_distinct_edge - n_tip; if (n_splits < 1) { - Rcpp::stop("NNI distance is undefined for trees with no splits"); // #nocov + ASSERT(false && "NNI distance is undefined for trees with no splits"); // #nocov } std::unique_ptr splits1(new uint64_t[n_splits * n_bin]); diff --git a/src/spr.cpp b/src/spr.cpp index 7eee66d6..c9dd456c 100644 --- a/src/spr.cpp +++ b/src/spr.cpp @@ -47,18 +47,11 @@ IntegerVector calc_mismatch_size(const RawMatrix& x, const RawMatrix& y) { // [[Rcpp::export]] IntegerVector mismatch_size (const RawMatrix& x, const RawMatrix& y) { - if (x.rows() != y.rows()) { - throw std::invalid_argument("`x` and `y` differ in number of splits."); - } - if (!x.hasAttribute("nTip")) { - Rcpp::stop("`x` lacks nTip attribute"); - } - if (!y.hasAttribute("nTip")) { - Rcpp::stop("`y` lacks nTip attribute"); - } - if (static_cast(x.attr("nTip")) != static_cast(y.attr("nTip"))) { - Rcpp::stop("`x` and `y` differ in `nTip`"); - } + ASSERT(x.rows() == y.rows() && "`x` and `y` differ in number of splits."); + ASSERT(x.hasAttribute("nTip") && "`x` lacks nTip attribute"); + ASSERT(y.hasAttribute("nTip") && "`y` lacks nTip attribute"); + ASSERT(static_cast(x.attr("nTip")) == static_cast(y.attr("nTip")) + && "`x` and `y` differ in `nTip`"); return calc_mismatch_size(x, y); } @@ -102,25 +95,17 @@ IntegerVector calc_confusion(const RawMatrix &x, const RawMatrix &y) { // [[Rcpp::export]] IntegerVector confusion(const RawMatrix& x, const RawMatrix& y) { - if (x.rows() != y.rows()) { - throw std::invalid_argument("Input splits contain same number of splits."); - } - if (!x.hasAttribute("nTip")) { - Rcpp::stop("`x` lacks nTip attribute"); - } - if 
(!y.hasAttribute("nTip")) { - Rcpp::stop("`y` lacks nTip attribute"); - } - if (static_cast(x.attr("nTip")) != static_cast(y.attr("nTip"))) { - Rcpp::stop("`x` and `y` differ in `nTip`"); - } + ASSERT(x.rows() == y.rows() && "Input splits must contain same number of splits."); + ASSERT(x.hasAttribute("nTip") && "`x` lacks nTip attribute"); + ASSERT(y.hasAttribute("nTip") && "`y` lacks nTip attribute"); + ASSERT(static_cast(x.attr("nTip")) == static_cast(y.attr("nTip")) + && "`x` and `y` differ in `nTip`"); return calc_confusion(x, y); } IntegerMatrix reverse (const IntegerMatrix x) { - if (double(x.nrow()) > double(std::numeric_limits::max())) { - Rcpp::stop("This many edges are not (yet) supported."); - } + ASSERT(double(x.nrow()) <= double(std::numeric_limits::max()) + && "This many edges are not (yet) supported."); const intx n_edge = intx(x.nrow()); ASSERT(n_edge % 2 == 0); // Tree is binary IntegerMatrix ret(n_edge, 2); diff --git a/src/spr_lookup.cpp b/src/spr_lookup.cpp index 9794f248..1ac2281c 100644 --- a/src/spr_lookup.cpp +++ b/src/spr_lookup.cpp @@ -223,7 +223,7 @@ int lookup_7(const SplitSet7& sp1, const SplitSet7& sp2) { inline SplitSet7 read_splits(const Rcpp::RawVector& r) { if (r.size() != 4) - Rcpp::stop("Expected a length-4 raw vector of splits"); + ASSERT(false && "Expected a length-4 raw vector of splits"); SplitSet7 sp{}; for (int i = 0; i < 4; ++i) { diff --git a/src/transfer_consensus.cpp b/src/transfer_consensus.cpp new file mode 100644 index 00000000..514c6430 --- /dev/null +++ b/src/transfer_consensus.cpp @@ -0,0 +1,879 @@ +/* transfer_consensus.cpp + * + * C++ implementation of the Transfer Consensus algorithm (Takazawa 2025). + * Replaces the R-level .PoolSplits, .TransferDistMat, .ComputeTD, + * .CompatMat, and .GreedyBest/.GreedyFirst helpers. 
+ * + * Exported function: cpp_transfer_consensus() + */ + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Rcpp::IntegerVector; +using Rcpp::List; +using Rcpp::LogicalVector; +using Rcpp::RawMatrix; +using TreeTools::count_bits; +using TreeTools::SplitList; + + +// ============================================================================ +// Types +// ============================================================================ + +using int16 = int_fast16_t; +using int32 = int_fast32_t; + +// ============================================================================ +// Pooled split representation +// ============================================================================ + +struct PooledSplits { + int n_splits; + int n_bins; + int n_tips; + splitbit last_mask; + + // Flat array of canonical split data: n_splits * n_bins elements. + // Split i occupies [i*n_bins .. (i+1)*n_bins). + std::vector data; + + // Per-split metadata + std::vector count; // how many trees contain this split + std::vector light_side; // min(popcount, n_tips - popcount) + + // Tree membership: tree_members[t] = list of unique-split indices in tree t + std::vector> tree_members; + + // Access helpers + const splitbit* split(int i) const { return &data[i * n_bins]; } + splitbit* split(int i) { return &data[i * n_bins]; } +}; + + +// ============================================================================ +// FNV-1a hash for canonical split arrays +// ============================================================================ + +struct SplitHash { + int n_bins; + explicit SplitHash(int nb) : n_bins(nb) {} + SplitHash() : n_bins(0) {} + + std::size_t operator()(const splitbit* sp) const { + std::size_t h = 14695981039346656037ULL; + const unsigned char* p = reinterpret_cast(sp); + const int bytes = n_bins * static_cast(sizeof(splitbit)); + for (int i = 0; i < bytes; ++i) { + h ^= 
static_cast(p[i]); + h *= 1099511628211ULL; + } + return h; + } +}; + +struct SplitEqual { + int n_bins; + explicit SplitEqual(int nb) : n_bins(nb) {} + SplitEqual() : n_bins(0) {} + + bool operator()(const splitbit* a, const splitbit* b) const { + for (int bin = 0; bin < n_bins; ++bin) { + if (a[bin] != b[bin]) return false; + } + return true; + } +}; + +// ============================================================================ +// pool_splits: deduplicate and canonicalise all splits from all trees +// ============================================================================ + +static PooledSplits pool_splits(const List& splits_list, int n_tips) { + const int n_tree = splits_list.size(); + + // Determine n_bins from first tree + const RawMatrix first_mat = Rcpp::as(splits_list[0]); + const int n_bytes = first_mat.ncol(); + const int n_bins = (n_bytes + static_cast(sizeof(splitbit)) - 1) + / static_cast(sizeof(splitbit)); + + const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) + ? ~splitbit(0) + : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; + + // Count total splits across all trees to pre-allocate pool.data. + // This avoids reallocation, which would invalidate the pointers stored + // as keys in split_map. + int total_splits = 0; + for (int t = 0; t < n_tree; ++t) { + const RawMatrix mat = Rcpp::as(splits_list[t]); + total_splits += mat.nrow(); + } + + // Hash map from canonical split → unique index + SplitHash hasher(n_bins); + SplitEqual eq(n_bins); + std::unordered_map + split_map(64, hasher, eq); + + // Temp buffer for canonicalisation + std::vector canon_buf(n_bins); + + PooledSplits pool; + pool.n_tips = n_tips; + pool.n_bins = n_bins; + pool.last_mask = last_mask; + pool.n_splits = 0; + pool.tree_members.resize(n_tree); + + // Reserve capacity for the maximum possible unique splits (= total splits). + // This ensures pool.data never reallocates, keeping split_map keys valid. 
+ pool.data.reserve(static_cast(total_splits) * n_bins); + + for (int t = 0; t < n_tree; ++t) { + SplitList sl(Rcpp::as(splits_list[t])); + std::vector& members = pool.tree_members[t]; + members.reserve(sl.n_splits); + + for (int16 s = 0; s < sl.n_splits; ++s) { + // Copy split bits into canon_buf + for (int bin = 0; bin < n_bins; ++bin) { + canon_buf[bin] = sl.state[s][bin]; + } + // Mask last bin + canon_buf[n_bins - 1] &= last_mask; + + // Canonicalise: if bit 0 is set, flip + if (canon_buf[0] & splitbit(1)) { + for (int bin = 0; bin < n_bins; ++bin) { + canon_buf[bin] = ~canon_buf[bin]; + } + canon_buf[n_bins - 1] &= last_mask; + } + + // Look up in map + auto it = split_map.find(canon_buf.data()); + int idx; + if (it != split_map.end()) { + idx = it->second; + pool.count[idx]++; + } else { + idx = pool.n_splits++; + // Append canonical data (no reallocation due to reserve above) + const size_t old_sz = pool.data.size(); + pool.data.resize(old_sz + n_bins); + std::copy(canon_buf.begin(), canon_buf.end(), + pool.data.begin() + old_sz); + pool.count.push_back(1); + + // Compute light_side = min(popcount, n_tips - popcount) + int pc = 0; + for (int bin = 0; bin < n_bins; ++bin) { + pc += count_bits(canon_buf[bin]); + } + pool.light_side.push_back(std::min(pc, n_tips - pc)); + + // Insert pointer into map (points into pool.data; stable due to reserve) + split_map[pool.split(idx)] = idx; + } + + // Only record unique splits per tree + if (members.empty() || members.back() != idx) { + // Check if already present (small set, linear scan ok) + bool found = false; + for (int m : members) { + if (m == idx) { found = true; break; } + } + if (!found) members.push_back(idx); + } + } + } + + return pool; +} + + +// ============================================================================ +// transfer_dist_mat: pairwise transfer distance between all unique splits +// ============================================================================ + +// Returns a flat n_splits 
x n_splits matrix (row-major). +// DIST[i * n + j] = transfer distance between split i and split j. +static std::vector transfer_dist_mat(const PooledSplits& pool, + int n_threads) { + const int M = pool.n_splits; + const int nb = pool.n_bins; + const int nt = pool.n_tips; + + std::vector dist(M * M, 0); + + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int i = 0; i < M; ++i) { + const splitbit* a = pool.split(i); + for (int j = i + 1; j < M; ++j) { + const splitbit* b = pool.split(j); + int hamming = 0; + for (int bin = 0; bin < nb; ++bin) { + hamming += count_bits(a[bin] ^ b[bin]); + } + int td = std::min(hamming, nt - hamming); + dist[i * M + j] = td; + dist[j * M + i] = td; + } + } + return dist; +} + + +// ============================================================================ +// compute_td: transfer dissimilarity cost for each split +// ============================================================================ + +static std::vector compute_td( + const std::vector& dist, + const PooledSplits& pool, + bool scale, + int n_threads +) { + const int M = pool.n_splits; + const int n_tree = static_cast(pool.tree_members.size()); + + std::vector td(M, 0.0); + + // Parallelise over splits (each td[b] is independent) + #ifdef _OPENMP + #pragma omp parallel for schedule(static) num_threads(n_threads) + #endif + for (int b = 0; b < M; ++b) { + const int p_minus_1 = pool.light_side[b] - 1; + if (p_minus_1 <= 0) continue; + + double acc = 0.0; + for (int t = 0; t < n_tree; ++t) { + const auto& members = pool.tree_members[t]; + const int n_mem = static_cast(members.size()); + + int min_d = p_minus_1; // sentinel distance + for (int k = 0; k < n_mem; ++k) { + int d = dist[b * M + members[k]]; + if (d < min_d) min_d = d; + } + if (scale) { + double contrib = static_cast(min_d) / p_minus_1; + acc += (contrib < 1.0) ? contrib : 1.0; + } else { + acc += (min_d < p_minus_1) ? 
min_d : p_minus_1; + } + } + td[b] = acc; + } + return td; +} + + +// ============================================================================ +// compat_mat: pairwise compatibility between all unique splits +// ============================================================================ + +// Returns flat M x M bool matrix. +// compat[i * M + j] = true iff splits i and j are compatible. +static std::vector compat_mat(const PooledSplits& pool, + int n_threads) { + const int M = pool.n_splits; + const int nb = pool.n_bins; + const int last_bin = nb - 1; + const splitbit lm = pool.last_mask; + + std::vector compat(M * M, 1); + + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int i = 0; i < M; ++i) { + const splitbit* a = pool.split(i); + for (int j = i + 1; j < M; ++j) { + const splitbit* b = pool.split(j); + bool ab = false, anb = false, nab = false, nanb = false; + for (int bin = 0; bin < nb; ++bin) { + splitbit mask = (bin == last_bin) ? lm : ~splitbit(0); + splitbit a_bin = a[bin] & mask; + splitbit b_bin = b[bin] & mask; + if (!ab) ab = (a_bin & b_bin) != 0; + if (!anb) anb = (a_bin & ~b_bin & mask) != 0; + if (!nab) nab = (~a_bin & b_bin & mask) != 0; + if (!nanb) nanb = (~a_bin & ~b_bin & mask) != 0; + if (ab && anb && nab && nanb) break; + } + uint8_t comp = (!ab || !anb || !nab || !nanb) ? 
1 : 0; + compat[i * M + j] = comp; + compat[j * M + i] = comp; + } + } + return compat; +} + + +// ============================================================================ +// Greedy loop helpers +// ============================================================================ + +// Sentinel distance for split b (distance to a leaf bipartition) +inline int sent_dist(int b, const PooledSplits& pool) { + return pool.light_side[b] - 1; +} + +// "Effective distance" from split b to its match (or sentinel if match == -1) +inline int eff_dist(int b, int match_idx, const std::vector& dist, + int M, const PooledSplits& pool) { + return (match_idx < 0) ? sent_dist(b, pool) : dist[b * M + match_idx]; +} + +// Find second-closest included split to split b (excluding matchIdx). +// Returns {index, distance} as a pair; index = -1 means sentinel. +static std::pair find_second( + int b, int matchIdx, + const std::vector& incl, + const std::vector& dist, + int M, const PooledSplits& pool, + bool scale +) { + int p_minus_1 = sent_dist(b, pool); + int best = -1; + int best_d = (p_minus_1 > 0) ? p_minus_1 : 0; + + for (int c = 0; c < M; ++c) { + if (!incl[c] || c == matchIdx) continue; + int d = dist[b * M + c]; + if (d < best_d) { + best_d = d; + best = c; + } + } + return {best, (best >= 0) ? best_d : p_minus_1}; +} + + +// ============================================================================ +// GreedyState: persistent state for the greedy loop. +// +// Maintains match_dist[], match2_dist[], n_incompat[], weight[] to avoid +// redundant O(M) scans in benefit calculations and compat checks. 
+// ============================================================================
+
+struct GreedyState {
+  int M;        // number of pooled splits
+  int n_tip;    // number of tips (caps consensus at n_tip - 3 splits)
+  int n_incl;   // current number of included splits
+  bool scale;
+
+  // External state, owned by the caller and mutated in place.
+  std::vector<int>& match;
+  std::vector<int>& match2;
+  std::vector<uint8_t>& incl;
+  const std::vector<int>& dist;
+  const std::vector<double>& td;
+  const PooledSplits& pool;
+  const std::vector<uint8_t>& compat;
+
+  // Cached per-split arrays
+  std::vector<int> match_dist;    // dist to match[b], or sentinel
+  std::vector<int> match2_dist;   // dist to match2[b], or sentinel
+  std::vector<int> n_incompat;    // # included splits incompatible with b
+  std::vector<double> weight;     // count[b] / (light_side[b] - 1) or count[b]
+
+  GreedyState(
+      std::vector<int>& match_, std::vector<int>& match2_,
+      std::vector<uint8_t>& incl_,
+      const std::vector<int>& dist_, const std::vector<double>& td_,
+      const PooledSplits& pool_, const std::vector<uint8_t>& compat_,
+      bool scale_, int M_, int n_tip_
+  ) : M(M_), n_tip(n_tip_), n_incl(0), scale(scale_),
+      match(match_), match2(match2_), incl(incl_),
+      dist(dist_), td(td_), pool(pool_), compat(compat_),
+      match_dist(M_), match2_dist(M_), n_incompat(M_, 0), weight(M_)
+  {
+    // Precompute weight = count[b] / (light_side[b] - 1) for scaled,
+    // count[b] for unscaled
+    for (int b = 0; b < M; ++b) {
+      int p1 = sent_dist(b, pool);
+      if (p1 <= 0) { weight[b] = 0.0; continue; }
+      weight[b] = scale
+        ? static_cast<double>(pool.count[b]) / p1
+        : static_cast<double>(pool.count[b]);
+    }
+
+    // Initialize match_dist / match2_dist from current match/match2
+    for (int b = 0; b < M; ++b) {
+      match_dist[b] = eff_dist(b, match[b], dist, M, pool);
+      match2_dist[b] = eff_dist(b, match2[b], dist, M, pool);
+    }
+
+    // Initialize n_incompat from current incl
+    for (int i = 0; i < M; ++i) {
+      if (!incl[i]) continue;
+      n_incl++;
+      for (int j = 0; j < M; ++j) {
+        if (!compat[j * M + i]) n_incompat[j]++;
+      }
+    }
+  }
+
+  // O(1) compat check
+  bool is_compatible(int idx) const {
+    return n_incompat[idx] == 0 && n_incl < n_tip - 3;
+  }
+
+  // Add-benefit using precomputed match_dist[] and weight[]
+  double add_benefit(int c) const {
+    double ben = -td[c];
+    for (int b = 0; b < M; ++b) {
+      int diff = match_dist[b] - dist[b * M + c];
+      if (diff <= 0) continue;
+      ben += diff * weight[b];
+    }
+    return ben;
+  }
+
+  // Remove-benefit using precomputed match2_dist[] and weight[]
+  double remove_benefit(int c) const {
+    double ben = td[c];
+    for (int b = 0; b < M; ++b) {
+      if (match[b] != c) continue;
+      int diff = dist[b * M + c] - match2_dist[b];
+      if (diff >= 0) continue;
+      ben += diff * weight[b];
+    }
+    return ben;
+  }
+
+  // Execute add: update incl, match, match2, match_dist, match2_dist,
+  // n_incompat, n_incl
+  void do_add(int idx) {
+    incl[idx] = 1;
+    n_incl++;
+
+    // Update n_incompat for all splits
+    for (int j = 0; j < M; ++j) {
+      if (!compat[j * M + idx]) n_incompat[j]++;
+    }
+
+    // Update match/match2 and their cached distances
+    for (int b = 0; b < M; ++b) {
+      int new_d = dist[b * M + idx];
+      if (new_d < match_dist[b]) {
+        match2[b] = match[b];
+        match2_dist[b] = match_dist[b];
+        match[b] = idx;
+        match_dist[b] = new_d;
+      } else if (new_d < match2_dist[b]) {
+        match2[b] = idx;
+        match2_dist[b] = new_d;
+      }
+    }
+  }
+
+  // Execute remove: update incl, match, match2, match_dist, match2_dist,
+  // n_incompat, n_incl
+  void do_remove(int idx) {
+    incl[idx] = 0;
+    n_incl--;
+
+    // Update n_incompat
+    for (int j = 0; j < M; ++j) {
+      if (!compat[j * M + idx]) n_incompat[j]--;
+    }
+
+    // Update match/match2 for affected splits
+    for (int b = 0; b < M; ++b) {
+      if (match[b] == idx) {
+        // Promote match2 -> match
+        match[b] = match2[b];
+        match_dist[b] = match2_dist[b];
+
+        if (match[b] < 0) {
+          // Defensive guard: promoted match2 was the sentinel (no real
+          // second-closest split was known). Rescan from scratch to find
+          // the actual closest included split. This requires a topology
+          // where match2[b] stayed at -1 while another split's removal
+          // makes b's greedy match the removed split — theoretically
+          // reachable but extremely unlikely in practice.
+          auto [first, first_d] = find_second(b, -1, incl, dist, M, pool, scale); // #nocov
+          match[b] = first; // #nocov
+          match_dist[b] = first_d; // #nocov
+        }
+
+        // Find new second-closest
+        auto [sec, sec_d] = find_second(b, match[b], incl, dist, M, pool, scale);
+        match2[b] = sec;
+        match2_dist[b] = sec_d;
+      } else if (match2[b] == idx) {
+        auto [sec, sec_d] = find_second(b, match[b], incl, dist, M, pool, scale);
+        match2[b] = sec;
+        match2_dist[b] = sec_d;
+      }
+    }
+  }
+};
+
+
+// ============================================================================
+// Greedy "best" strategy
+// ============================================================================
+
+// Repeatedly applies the single highest-benefit add or remove until no
+// operation has positive benefit.
+static void greedy_best(
+    std::vector<int>& match,
+    std::vector<int>& match2,
+    std::vector<uint8_t>& incl,
+    const std::vector<int>& dist,
+    const std::vector<double>& td,
+    const PooledSplits& pool,
+    const std::vector<uint8_t>& compat,
+    const std::vector<int>& sort_ord,
+    bool scale,
+    int M, int n_tip
+) {
+  GreedyState st(match, match2, incl, dist, td, pool, compat, scale, M, n_tip);
+
+  while (true) {
+    double best_ben = 0.0;
+    int best_idx = -1;
+    bool best_is_add = false;
+
+    for (int si = 0; si < M; ++si) {
+      int idx = sort_ord[si];
+      if (incl[idx]) {
+        double ben = st.remove_benefit(idx);
+        if (ben > best_ben) {
+          best_ben = ben;
+          best_idx = idx;
+          best_is_add = false;
+        }
+      } else {
+        if (!st.is_compatible(idx)) continue;
+        double ben = st.add_benefit(idx);
+        if (ben > best_ben) {
+          best_ben = ben;
+          best_idx = idx;
+          best_is_add = true;
+        }
+      }
+    }
+
+    if (best_ben <= 0.0 || best_idx < 0) break;
+
+    if (best_is_add) {
+      st.do_add(best_idx);
+    } else {
+      st.do_remove(best_idx);
+    }
+  }
+}
+
+
+// ============================================================================
+// Greedy "first" strategy
+// ============================================================================
+
+// Applies the first positive-benefit add or remove found (in sort order),
+// restarting the scan after each change, until a full pass finds none.
+static void greedy_first(
+    std::vector<int>& match,
+    std::vector<int>& match2,
+    std::vector<uint8_t>& incl,
+    const std::vector<int>& dist,
+    const std::vector<double>& td,
+    const PooledSplits& pool,
+    const std::vector<uint8_t>& compat,
+    const std::vector<int>& sort_ord,
+    bool scale,
+    int M, int n_tip
+) {
+  GreedyState st(match, match2, incl, dist, td, pool, compat, scale, M, n_tip);
+
+  bool improving = true;
+  while (improving) {
+    improving = false;
+    for (int si = 0; si < M; ++si) {
+      int idx = sort_ord[si];
+      if (incl[idx]) {
+        if (st.remove_benefit(idx) > 0.0) {
+          st.do_remove(idx);
+          improving = true;
+          break;
+        }
+      } else {
+        if (!st.is_compatible(idx)) continue;
+        if (st.add_benefit(idx) > 0.0) {
+          st.do_add(idx);
+          improving = true;
+          break;
+        }
+      }
+    }
+  }
+}
+
+
+// ============================================================================
+// Init matches (for init = "majority")
+// ============================================================================
+
+// Seeds match[] / match2[] with the closest and second-closest included
+// splits for every pooled split, given an initial incl[] assignment.
+static void init_matches(
+    std::vector<int>& match,
+    std::vector<int>& match2,
+    const std::vector<uint8_t>& incl,
+    const std::vector<int>& dist,
+    int M, const PooledSplits& pool,
+    bool scale
+) {
+  // Collect included indices
+  std::vector<int> inc_idx;
+  for (int i = 0; i < M; ++i) {
+    if (incl[i]) inc_idx.push_back(i);
+  }
+  if (inc_idx.empty()) return;
+
+  for (int b = 0; b < M; ++b) {
+    int p_minus_1 = pool.light_side[b] - 1;
+    if (p_minus_1 <= 0) continue;
+
+    // Find closest and second-closest among included
+    int best = -1, second = -1;
+    int best_d = p_minus_1, second_d = p_minus_1;  // sentinel threshold
+
+    for (int c : inc_idx) {
+      int d = dist[b * M + c];
+      if (d < best_d) {
+        second = best;
+        second_d = best_d;
+        best = c;
+        best_d = d;
+      } else if (d < second_d) {
+        second = c;
+        second_d = d;
+      }
+    }
+
+    if (best >= 0) {
+      if (scale) {
+        if (static_cast<double>(best_d) / p_minus_1 >= 1.0) continue;
+      } else {
+        if (best_d >= p_minus_1) continue;
+      }
+      match[b] = best;
+
+      if (second >= 0) {
+        if (scale) {
+          if (static_cast<double>(second_d) / p_minus_1 < 1.0)
+            match2[b] = second;
+        } else {
+          if (second_d < p_minus_1)
+            match2[b] = second;
+        }
+      }
+    }
+  }
+}
+
+
+// ============================================================================
+// Main exported function
+// ============================================================================
+
+//' Transfer consensus (C++ implementation)
+//'
+//' @param splits_list List of raw matrices (one per tree), each from as.Splits().
+//' @param n_tip Number of tips.
+//' @param scale Logical: use scaled transfer distance?
+//' @param greedy_best_flag Logical: TRUE for "best", FALSE for "first".
+//' @param init_majority Logical: TRUE to start from majority-rule splits.
+//'
+//' @return A `LogicalVector` of length n_splits indicating which pooled splits
+//' are included in the consensus, plus attributes "raw_splits" (a raw matrix
+//' of all unique splits) and "light_side" (integer vector).
+//' @keywords internal +// [[Rcpp::export]] +List cpp_transfer_consensus( + const List& splits_list, + const int n_tip, + const bool scale, + const bool greedy_best_flag, + const bool init_majority, + const int n_threads = 1 +) { + const int n_tree = splits_list.size(); + + // ---- Pool unique splits ---- + PooledSplits pool = pool_splits(splits_list, n_tip); + const int M = pool.n_splits; + + if (M == 0) { + return List::create( + Rcpp::Named("included") = LogicalVector(0), + Rcpp::Named("raw_splits") = RawMatrix(0, 0), + Rcpp::Named("light_side") = IntegerVector(0) + ); + } + + // ---- Pairwise transfer distance matrix ---- + std::vector dist = transfer_dist_mat(pool, n_threads); + + // ---- Transfer dissimilarity cost ---- + std::vector td = compute_td(dist, pool, scale, n_threads); + + // ---- Compatibility matrix ---- + std::vector compat = compat_mat(pool, n_threads); + + // ---- Sort order (by count descending, then index ascending for ties) ---- + // Must match R's order(-counts, seq_along(counts)) for reproducibility. 
+ std::vector sort_ord(M); + std::iota(sort_ord.begin(), sort_ord.end(), 0); + std::stable_sort(sort_ord.begin(), sort_ord.end(), + [&](int a, int b) { return pool.count[a] > pool.count[b]; }); + + // ---- Mutable state ---- + std::vector match(M, -1); // -1 = sentinel + std::vector match2(M, -1); + std::vector incl(M, 0); + + // ---- Init from majority if requested ---- + if (init_majority) { + double half = n_tree / 2.0; + for (int i = 0; i < M; ++i) { + if (pool.count[i] > half) { + incl[i] = 1; + } + } + init_matches(match, match2, incl, dist, M, pool, scale); + } + + // ---- Greedy loop ---- + if (greedy_best_flag) { + greedy_best(match, match2, incl, dist, td, pool, compat, sort_ord, + scale, M, n_tip); + } else { + greedy_first(match, match2, incl, dist, td, pool, compat, sort_ord, + scale, M, n_tip); + } + + // ---- Build output ---- + // Return raw split data for included splits so R can convert to phylo + LogicalVector incl_r(M); + for (int i = 0; i < M; ++i) incl_r[i] = incl[i] != 0; + + // Build raw matrix of ALL unique splits (n_splits x n_bytes) + // The canonical form may differ from the original raw form: we need to + // return the canonical splitbit data as raw bytes for as.phylo(). 
+ const int n_bytes = pool.n_bins * static_cast(sizeof(splitbit)); + RawMatrix raw_splits(M, n_bytes); + for (int i = 0; i < M; ++i) { + const unsigned char* src = + reinterpret_cast(pool.split(i)); + for (int j = 0; j < n_bytes; ++j) { + raw_splits(i, j) = Rbyte(src[j]); + } + } + + IntegerVector light_side(M); + for (int i = 0; i < M; ++i) light_side[i] = pool.light_side[i]; + + return List::create( + Rcpp::Named("included") = incl_r, + Rcpp::Named("raw_splits") = raw_splits, + Rcpp::Named("light_side") = light_side + ); +} + + +// ============================================================================ +// Diagnostic: per-stage timing (for profiling without VTune) +// ============================================================================ + +//' @keywords internal +// [[Rcpp::export]] +Rcpp::NumericVector cpp_tc_profile( + const List& splits_list, + const int n_tip, + const bool scale, + const bool greedy_best_flag, + const bool init_majority, + const int n_iter, + const int n_threads = 1 +) { + using clk = std::chrono::high_resolution_clock; + const int n_tree = splits_list.size(); + + double t_pool = 0, t_dist = 0, t_td = 0, t_compat = 0, t_greedy = 0; + + for (int iter = 0; iter < n_iter; ++iter) { + auto t0 = clk::now(); + PooledSplits pool = pool_splits(splits_list, n_tip); + auto t1 = clk::now(); + const int M = pool.n_splits; + + std::vector dist = transfer_dist_mat(pool, n_threads); + auto t2 = clk::now(); + + std::vector td = compute_td(dist, pool, scale, n_threads); + auto t3 = clk::now(); + + std::vector compat_v = compat_mat(pool, n_threads); + auto t4 = clk::now(); + + std::vector sort_ord(M); + std::iota(sort_ord.begin(), sort_ord.end(), 0); + std::stable_sort(sort_ord.begin(), sort_ord.end(), + [&](int a, int b) { return pool.count[a] > pool.count[b]; }); + + std::vector match(M, -1); + std::vector match2(M, -1); + std::vector incl(M, 0); + + if (init_majority) { + double half = n_tree / 2.0; + for (int i = 0; i < M; ++i) { + if 
(pool.count[i] > half) incl[i] = 1; + } + init_matches(match, match2, incl, dist, M, pool, scale); + } + + if (greedy_best_flag) { + greedy_best(match, match2, incl, dist, td, pool, compat_v, sort_ord, + scale, M, n_tip); + } else { + greedy_first(match, match2, incl, dist, td, pool, compat_v, sort_ord, + scale, M, n_tip); + } + auto t5 = clk::now(); + + auto us = [](auto a, auto b) { + return std::chrono::duration(b - a).count(); + }; + t_pool += us(t0, t1); + t_dist += us(t1, t2); + t_td += us(t2, t3); + t_compat += us(t3, t4); + t_greedy += us(t4, t5); + } + + double inv = 1.0 / n_iter; + Rcpp::NumericVector result = { + t_pool * inv, t_dist * inv, t_td * inv, t_compat * inv, t_greedy * inv + }; + result.attr("names") = Rcpp::CharacterVector( + {"pool_splits", "transfer_dist_mat", "compute_td", "compat_mat", "greedy"} + ); + return result; +} diff --git a/src/transfer_distance.cpp b/src/transfer_distance.cpp new file mode 100644 index 00000000..a2ff23e8 --- /dev/null +++ b/src/transfer_distance.cpp @@ -0,0 +1,362 @@ +/* transfer_distance.cpp + * + * Per-pair and batch computation of the transfer dissimilarity between + * phylogenetic trees (Takazawa et al. 2026; Lemoine et al. 2018). + * + * The transfer distance between two bipartitions is the minimum number of + * taxa that must be moved to transform one bipartition into the other: + * delta(b, b*) = min(Hamming(b, b*), n - Hamming(b, b*)) + * + * The transfer dissimilarity between two trees sums, for each branch in + * each tree, the minimum transfer distance to any branch in the other tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using Rcpp::IntegerVector; +using Rcpp::List; +using Rcpp::LogicalVector; +using Rcpp::Named; +using Rcpp::NumericMatrix; +using Rcpp::NumericVector; +using Rcpp::RawMatrix; +using TreeTools::count_bits; +using TreeTools::SplitList; + +using int16 = int_fast16_t; +using int32 = int_fast32_t; + + +// ============================================================================ +// Per-pair transfer distance computation +// ============================================================================ + +// Compute transfer distance between two bipartitions (split bitsets). +// Returns min(hamming, n_tips - hamming). +static inline int transfer_dist_splits( + const splitbit* a, const splitbit* b, int n_bins, int n_tips +) { + int hamming = 0; + for (int bin = 0; bin < n_bins; ++bin) { + hamming += count_bits(a[bin] ^ b[bin]); + } + return std::min(hamming, n_tips - hamming); +} + + +// For each split in `from`, find the minimum transfer distance to any split +// in `to`. Returns a vector of length from.n_splits with the min distances, +// and a matching vector with the index of the best match in `to` (-1 if no +// improvement over sentinel). 
+static void find_min_transfer( + const SplitList& from, + const SplitList& to, + int n_tips, + int n_bins, + std::vector& min_dist, // out: [from.n_splits] + std::vector& best_match // out: [from.n_splits] +) { + for (int16 i = 0; i < from.n_splits; ++i) { + // Sentinel distance: depth(b) - 1 + int pc = 0; + for (int bin = 0; bin < n_bins; ++bin) { + pc += count_bits(from.state[i][bin]); + } + int depth = std::min(pc, n_tips - pc); + int sentinel = depth - 1; + if (sentinel <= 0) { + min_dist[i] = 0; + best_match[i] = -1; + continue; + } + + int best_d = sentinel; + int best_j = -1; + + for (int16 j = 0; j < to.n_splits; ++j) { + int d = transfer_dist_splits(from.state[i], to.state[j], n_bins, n_tips); + if (d < best_d) { + best_d = d; + best_j = j; + if (d == 0) break; // can't do better + } + } + + min_dist[i] = best_d; + best_match[i] = best_j; + } +} + + +// Accumulate the transfer dissimilarity contribution from one direction. +// For scaled: sum of min(min_dist[i] / (depth[i] - 1), 1) +// For unscaled: sum of min(min_dist[i], depth[i] - 1) +static double accumulate_transfer( + const SplitList& sl, + const std::vector& min_dist, + int n_tips, int n_bins, + bool scale +) { + double total = 0.0; + for (int16 i = 0; i < sl.n_splits; ++i) { + int pc = 0; + for (int bin = 0; bin < n_bins; ++bin) { + pc += count_bits(sl.state[i][bin]); + } + int depth = std::min(pc, n_tips - pc); + int p_minus_1 = depth - 1; + if (p_minus_1 <= 0) continue; + + int d = min_dist[i]; + if (scale) { + double contrib = static_cast(d) / p_minus_1; + total += (contrib < 1.0) ? contrib : 1.0; + } else { + total += (d < p_minus_1) ? 
d : p_minus_1; + } + } + return total; +} + + +// ============================================================================ +// Exported: per-pair transfer distance (returns both scaled and unscaled) +// ============================================================================ + +//' Per-pair transfer dissimilarity +//' +//' @param x,y Raw matrices representing splits (from as.Splits()). +//' @param nTip Integer: number of tips. +//' +//' @return A list with components: +//' - score_scaled: scaled transfer dissimilarity (double) +//' - score_unscaled: unscaled transfer dissimilarity (double) +//' - `matching_xy`: integer vector, best match in y for each split in x (1-based, NA if sentinel) +//' - `matching_yx`: integer vector, best match in x for each split in y (1-based, NA if sentinel) +//' @keywords internal +// [[Rcpp::export]] +List cpp_transfer_dist( + const RawMatrix& x, const RawMatrix& y, + const IntegerVector& nTip +) { + // Validated by R caller (.ValidateSplitArgs or as.Splits with shared tipLabels) + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); + + const int n_tip = nTip[0]; + SplitList sl_x(x); + SplitList sl_y(y); + const int n_bins = sl_x.n_bins; + + // Direction x → y + std::vector min_dist_xy(sl_x.n_splits); + std::vector match_xy(sl_x.n_splits); + find_min_transfer(sl_x, sl_y, n_tip, n_bins, min_dist_xy, match_xy); + + // Direction y → x + std::vector min_dist_yx(sl_y.n_splits); + std::vector match_yx(sl_y.n_splits); + find_min_transfer(sl_y, sl_x, n_tip, n_bins, min_dist_yx, match_yx); + + // Accumulate both directions for scaled and unscaled + double score_scaled = + accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, true) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, true); + double score_unscaled = + accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, false) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, false); + + // Build R matching vectors (1-based, NA for sentinel) + 
IntegerVector r_match_xy(sl_x.n_splits); + for (int16 i = 0; i < sl_x.n_splits; ++i) { + r_match_xy[i] = (match_xy[i] >= 0) ? match_xy[i] + 1 : NA_INTEGER; + } + IntegerVector r_match_yx(sl_y.n_splits); + for (int16 i = 0; i < sl_y.n_splits; ++i) { + r_match_yx[i] = (match_yx[i] >= 0) ? match_yx[i] + 1 : NA_INTEGER; + } + + return List::create( + Named("score_scaled") = score_scaled, + Named("score_unscaled") = score_unscaled, + Named("matching_xy") = r_match_xy, + Named("matching_yx") = r_match_yx + ); +} + + +// ============================================================================ +// Exported: per-pair score-only (for GeneralizedRF compatibility) +// ============================================================================ + +// Returns a list with "score" and "matching" to be compatible with +// GeneralizedRF / CalculateTreeDistance dispatch. + +//' @keywords internal +// [[Rcpp::export]] +List cpp_transfer_dist_scored( + const RawMatrix& x, const RawMatrix& y, + const IntegerVector& nTip, bool scale +) { + // Validated by R caller (.ValidateSplitArgs or as.Splits with shared tipLabels) + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); + + const int n_tip = nTip[0]; + SplitList sl_x(x); + SplitList sl_y(y); + const int n_bins = sl_x.n_bins; + + std::vector min_dist_xy(sl_x.n_splits); + std::vector match_xy(sl_x.n_splits); + find_min_transfer(sl_x, sl_y, n_tip, n_bins, min_dist_xy, match_xy); + + std::vector min_dist_yx(sl_y.n_splits); + std::vector match_yx(sl_y.n_splits); + find_min_transfer(sl_y, sl_x, n_tip, n_bins, min_dist_yx, match_yx); + + double score = scale + ? 
(accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, true) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, true)) + : (accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, false) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, false)); + + // Build matching vector (1-based) — just the x→y direction + const int n_out = std::max(static_cast(sl_x.n_splits), + static_cast(sl_y.n_splits)); + IntegerVector matching(n_out, NA_INTEGER); + for (int16 i = 0; i < sl_x.n_splits; ++i) { + matching[i] = (match_xy[i] >= 0) ? match_xy[i] + 1 : NA_INTEGER; + } + + return List::create(Named("score") = score, Named("matching") = matching); +} + + +// ============================================================================ +// Batch: all-pairs transfer dissimilarity (OpenMP parallel) +// ============================================================================ + +// Static helper for score-only per-pair computation. +static double transfer_dissimilarity_score( + const SplitList& sl_x, const SplitList& sl_y, + int n_tip, int n_bins, bool scale +) { + // Avoid allocation when trees have 0 splits (star trees) + if (sl_x.n_splits == 0 && sl_y.n_splits == 0) return 0.0; + + std::vector min_dist_xy(sl_x.n_splits); + std::vector dummy_match_xy(sl_x.n_splits); + find_min_transfer(sl_x, sl_y, n_tip, n_bins, min_dist_xy, dummy_match_xy); + + std::vector min_dist_yx(sl_y.n_splits); + std::vector dummy_match_yx(sl_y.n_splits); + find_min_transfer(sl_y, sl_x, n_tip, n_bins, min_dist_yx, dummy_match_yx); + + return accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, scale) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, scale); +} + + +//' All-pairs transfer dissimilarity (OpenMP) +//' +//' @param splits_list List of raw matrices (one per tree). +//' @param n_tip Number of tips. +//' @param scale Logical: use scaled transfer dissimilarity? +//' @param n_threads Number of OpenMP threads. +//' +//' @return Numeric vector of length choose(N,2) in dist order. 
+//' @keywords internal +// [[Rcpp::export]] +NumericVector cpp_transfer_dist_all_pairs( + const List& splits_list, + int n_tip, bool scale, int n_threads +) { + const int N = splits_list.size(); + const int n_pairs = N * (N - 1) / 2; + + // Pre-convert all trees to SplitLists + std::vector> sls(N); + int n_bins = 0; + for (int i = 0; i < N; ++i) { + sls[i] = std::make_unique(Rcpp::as(splits_list[i])); + if (i == 0) n_bins = sls[0]->n_bins; + } + + NumericVector result(n_pairs); + + // R dist format: column-major lower triangle + // pos k corresponds to (row, col) where row > col, iterating col first: + // (1,0), (2,0), ..., (N-1,0), (2,1), (3,1), ..., (N-1,1), ... + // k = col * (N - 1) - col * (col - 1) / 2 + (row - col - 1) (0-based) + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int k = 0; k < n_pairs; ++k) { + // Invert k to (col, row) in R dist column-major order + // col = largest c such that c*(2N-c-1)/2 <= k + int col = 0; + int remaining = k; + while (remaining >= N - 1 - col) { + remaining -= (N - 1 - col); + ++col; + } + int row = col + 1 + remaining; + + result[k] = transfer_dissimilarity_score( + *sls[col], *sls[row], n_tip, n_bins, scale); + } + + return result; +} + + +//' Cross-pairs transfer dissimilarity (OpenMP) +//' +//' @param splits_a,splits_b Lists of raw matrices. +//' @param n_tip Number of tips. +//' @param scale Logical: use scaled transfer dissimilarity? +//' @param n_threads Number of OpenMP threads. +//' +//' @return Numeric matrix of dimension `nA` x `nB`. 
+//' @keywords internal +// [[Rcpp::export]] +NumericMatrix cpp_transfer_dist_cross_pairs( + const List& splits_a, const List& splits_b, + int n_tip, bool scale, int n_threads +) { + const int nA = splits_a.size(); + const int nB = splits_b.size(); + + std::vector> sls_a(nA); + std::vector> sls_b(nB); + int n_bins = 0; + + for (int i = 0; i < nA; ++i) { + sls_a[i] = std::make_unique(Rcpp::as(splits_a[i])); + if (i == 0) n_bins = sls_a[0]->n_bins; + } + for (int j = 0; j < nB; ++j) { + sls_b[j] = std::make_unique(Rcpp::as(splits_b[j])); + } + + NumericMatrix result(nA, nB); + const int total = nA * nB; + + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int k = 0; k < total; ++k) { + int i = k / nB; + int j = k % nB; + result(i, j) = transfer_dissimilarity_score( + *sls_a[i], *sls_b[j], n_tip, n_bins, scale); + } + + return result; +} diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp index 4a3b77e1..99347300 100644 --- a/src/tree_distance_functions.cpp +++ b/src/tree_distance_functions.cpp @@ -1,35 +1,47 @@ +#include #include -#include /* for SL_MAX_TIPS */ -#include /* for log2() */ +// Provide the MCI table definitions and implementation in this TU. +#define TREEDIST_MCI_IMPLEMENTATION +#include #include "tree_distances.h" -using namespace Rcpp; - -constexpr int32 LG2_SIZE = SL_MAX_TIPS + 1; - -double lg2[LG2_SIZE]; -double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -double lg2_unrooted[SL_MAX_TIPS + 2]; -double *lg2_rooted = &lg2_unrooted[0] + 1; +// Populate lookup tables at library load time. 
__attribute__((constructor)) void initialize_ldf() { - lg2[0] = 0; - for (int32 i = 1; i != LG2_SIZE; ++i) { - lg2[i] = log2(i); - } - for (int16 i = 0; i != 3; ++i) { - lg2_double_factorial[i] = 0; - lg2_unrooted[i] = 0; - } - assert(lg2_rooted[0] == 0); - assert(lg2_rooted[1] == 0); - for (int32 i = 2; i != SL_MAX_TIPS + SL_MAX_TIPS - 2; ++i) { - lg2_double_factorial[i] = lg2_double_factorial[i - 2] + log2(i); - } - for (int32 i = 3; i != SL_MAX_TIPS + 2; ++i) { - lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; - assert(lg2_rooted[i - 1] == lg2_double_factorial[i + i - 5]); - } + TreeDist::init_lg2_tables(SL_MAX_TIPS); + } + +// Thin wrapper that exercises the installable-header version of +// mutual_clustering_score(), for test coverage and downstream validation. +// [[Rcpp::export]] +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, + const Rcpp::RawMatrix& y, + int n_tips) { + using TreeTools::SplitList; + + const SplitList a(x); + const SplitList b(y); + TreeDist::LapScratch scratch; + + // Build arrays matching the header's raw-pointer API types. 
+ std::vector a_ptrs(a.n_splits); + std::vector b_ptrs(b.n_splits); + std::vector a_in(a.n_splits); + std::vector b_in(b.n_splits); + for (TreeDist::int16 i = 0; i < a.n_splits; ++i) { + a_ptrs[i] = a.state[i]; + a_in[i] = static_cast(a.in_split[i]); } + for (TreeDist::int16 i = 0; i < b.n_splits; ++i) { + b_ptrs[i] = b.state[i]; + b_in[i] = static_cast(b.in_split[i]); + } + + return TreeDist::mutual_clustering_score( + a_ptrs.data(), a_in.data(), a.n_splits, + b_ptrs.data(), b_in.data(), b.n_splits, + a.n_bins, static_cast(n_tips), + scratch); +} diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 6aaae498..f4b4ce85 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -1,4 +1,5 @@ #include +#include #include #include /* for unique_ptr, make_unique */ #include @@ -28,9 +29,9 @@ namespace TreeDist { } void check_ntip(const double n) { - if (n > static_cast(std::numeric_limits::max())) { - Rcpp::stop("This many tips are not (yet) supported."); - } + // Validated by R caller (nTip > 32767 guard in CalculateTreeDistance et al.) 
+ ASSERT(n <= static_cast(std::numeric_limits::max()) + && "This many tips are not (yet) supported."); } @@ -605,18 +606,14 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); return robinson_foulds_distance(x, y, static_cast(nTip[0])); } // [[Rcpp::export]] List cpp_robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return robinson_foulds_info(x, y, n_tip); @@ -625,9 +622,7 @@ List cpp_robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_matching_split_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return matching_split_distance(x, y, n_tip); @@ -637,9 +632,7 @@ List cpp_matching_split_distance(const RawMatrix &x, const RawMatrix &y, List cpp_jaccard_similarity(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip, const NumericVector &k, const LogicalVector &allowConflict) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return 
jaccard_similarity(x, y, n_tip, k, allowConflict); @@ -648,9 +641,7 @@ List cpp_jaccard_similarity(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_msi_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return msi_distance(x, y, n_tip); @@ -659,9 +650,7 @@ List cpp_msi_distance(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_mutual_clustering(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return mutual_clustering(x, y, n_tip); @@ -670,9 +659,7 @@ List cpp_mutual_clustering(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_shared_phylo(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return shared_phylo(x, y, n_tip); diff --git a/src/tree_distances.h b/src/tree_distances.h index f5ba93e7..8d4f7b4f 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -2,18 +2,25 @@ #define _TREEDIST_TREE_DISTANCES_H #include "lap.h" +#include #include -/***** Constants requiring initialization *****/ +/***** Re-export shared lookup tables to global scope *****/ + +using TreeDist::lg2; +using TreeDist::lg2_double_factorial; +using TreeDist::lg2_unrooted; +using TreeDist::lg2_rooted; + +/***** Constants 
*****/ constexpr splitbit ALL_ONES = (std::numeric_limits::max)(); -extern double lg2[SL_MAX_TIPS + 1]; -extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -extern double lg2_unrooted[SL_MAX_TIPS + 2]; -extern double *lg2_rooted; namespace TreeDist { + // Re-exported from mutual_clustering.h: + // ic_matching(int16 a, int16 b, int16 n) + // See equation 16 in Meila 2007 (k' denoted K). // nkK is converted to pkK in the calling function when divided by n. inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk, @@ -52,14 +59,6 @@ namespace TreeDist { } -[[nodiscard]] inline double ic_matching(const int16 a, const int16 b, const int16 n) noexcept { - const double lg2a = lg2[a]; - const double lg2b = lg2[b]; - const double lg2n = lg2[n]; - return (a + b) * lg2n - a * lg2a - b * lg2b; - // (a * (lg2n - lg2a)) + (b * (lg2n - lg2b)); is substantially slower - } - [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok if (a == b) { diff --git a/tests/testthat/test-hmi.cpp.R b/tests/testthat/test-hmi.cpp.R index 78ad0f22..cc883e4a 100644 --- a/tests/testthat/test-hmi.cpp.R +++ b/tests/testthat/test-hmi.cpp.R @@ -6,9 +6,10 @@ test_that("HMI fails nicely", { expect_error(HierarchicalMutualInfo(bal5, pec5, normalize = "Error"), "`normalize` must be logical, or a function") - expect_error(EHMI_xptr(bal5, pec5, tolerance = 1e-16), - ".olerance too low") - expect_error(EHMI_xptr(bal5, pec5, minResample = 1), + # R-level guards in EHMI(); C++ uses ASSERT only + expect_error(EHMI(BalancedTree(5), PectinateTree(5), precision = 1e-16), + "Tolerance too low") + expect_error(EHMI(BalancedTree(5), PectinateTree(5), minResample = 1), "Must perform at least one resampl") }) diff --git a/tests/testthat/test-mast.R b/tests/testthat/test-mast.R index 8a6235ce..7d3e9289 100644 --- a/tests/testthat/test-mast.R +++ b/tests/testthat/test-mast.R @@ 
-1,7 +1,9 @@ library("TreeTools") test_that("MAST fails gracefully", { - expect_error(cpp_mast(BalancedTree(7)$edge, BalancedTree(8)$edge, 7)) # Different sizes + # R-level guard catches edge count mismatch + expect_error(.MASTSizeEdges(BalancedTree(7)$edge, BalancedTree(8)$edge, 7), + "same number of edges") expect_error(MASTSize(BalancedTree(8), UnrootTree(BalancedTree(8)))) expect_error(MASTSize(BalancedTree(10000), PectinateTree(10000))) # Too large diff --git a/tests/testthat/test-mci_impl.R b/tests/testthat/test-mci_impl.R new file mode 100644 index 00000000..58d72da2 --- /dev/null +++ b/tests/testthat/test-mci_impl.R @@ -0,0 +1,53 @@ +test_that("Header mutual_clustering_score matches MutualClusteringInfo", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + + bal8 <- BalancedTree(8) + pec8 <- PectinateTree(8) + star8 <- StarTree(8) + + tips <- TipLabels(bal8) + n_tip <- length(tips) + splits_bal <- as.Splits(bal8, tips) + splits_pec <- as.Splits(pec8, tips) + splits_star <- as.Splits(star8, tips) + + # Score-only from the installable-header implementation + impl_score <- TreeDist:::cpp_mci_impl_score + + impl_bal_pec <- impl_score(splits_bal, splits_pec, n_tip) + impl_bal_bal <- impl_score(splits_bal, splits_bal, n_tip) + impl_star <- impl_score(splits_bal, splits_star, n_tip) + + # Reference from MutualClusteringInfo (unnormalized score) + ref_bal_pec <- MutualClusteringInfo(bal8, pec8) + ref_bal_bal <- MutualClusteringInfo(bal8, bal8) + + expect_equal(impl_bal_pec, ref_bal_pec, tolerance = 1e-10) + expect_equal(impl_bal_bal, ref_bal_bal, tolerance = 1e-10) + expect_equal(impl_star, 0) +}) + +test_that("Header MCI covers exact-match early exit and partial LAP", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + impl_score <- TreeDist:::cpp_mci_impl_score + + # Two identical trees → all splits match exactly (early exit path) + bal20 <- BalancedTree(20) + tips <- TipLabels(bal20) + n_tip <- 
length(tips) + splits20 <- as.Splits(bal20, tips) + + result <- impl_score(splits20, splits20, n_tip) + expect_equal(result, MutualClusteringInfo(bal20, bal20), tolerance = 1e-10) + + # Trees that share some but not all splits → partial match + LAP + pec20 <- PectinateTree(20) + splits_pec20 <- as.Splits(pec20, tips) + + result2 <- impl_score(splits20, splits_pec20, n_tip) + expect_equal(result2, MutualClusteringInfo(bal20, pec20), tolerance = 1e-10) +}) diff --git a/tests/testthat/test-transfer_consensus.R b/tests/testthat/test-transfer_consensus.R new file mode 100644 index 00000000..e038e095 --- /dev/null +++ b/tests/testthat/test-transfer_consensus.R @@ -0,0 +1,700 @@ +test_that("Identical trees return fully resolved consensus", { + tree <- as.phylo(0, nTip = 10) + trees <- structure(rep(list(tree), 10), class = "multiPhylo") + + tc <- TransferConsensus(trees, greedy = "first") + expect_equal(NSplits(tc), NSplits(tree)) + + tc_best <- TransferConsensus(trees, greedy = "best") + expect_equal(NSplits(tc_best), NSplits(tree)) +}) + +test_that("Star tree returned when no signal", { + set.seed(6129) + # Fully random trees with many tips — negligible split overlap + trees <- structure(lapply(1:30, function(i) TreeTools::RandomTree(50, root = TRUE)), + class = "multiPhylo") + tc <- TransferConsensus(trees, greedy = "first") + + # Should be very unresolved (0 or near-0 splits) + expect_lte(NSplits(tc), 5) +}) + +test_that("Transfer consensus is at least as resolved as majority-rule for structured trees", +{ + # Trees from as.phylo with moderate overlap + trees <- as.phylo(0:29, nTip = 20) + tc <- TransferConsensus(trees, greedy = "best") + mr <- Consensus(trees, p = 0.5) + + # Transfer consensus should be at least as resolved + expect_gte(NSplits(tc), NSplits(mr)) +}) + +test_that("Both greedy strategies produce valid trees", { + trees <- as.phylo(1:15, nTip = 12) + + tc_first <- TransferConsensus(trees, greedy = "first") + tc_best <- TransferConsensus(trees, greedy = 
"best") + + expect_s3_class(tc_first, "phylo") + expect_s3_class(tc_best, "phylo") + expect_equal(sort(TipLabels(tc_first)), sort(TipLabels(trees[[1]]))) + expect_equal(sort(TipLabels(tc_best)), sort(TipLabels(trees[[1]]))) +}) + +test_that("scale = FALSE (unscaled) works", { + trees <- as.phylo(1:10, nTip = 10) + + tc_scaled <- TransferConsensus(trees, scale = TRUE) + tc_unscaled <- TransferConsensus(trees, scale = FALSE) + + expect_s3_class(tc_scaled, "phylo") + expect_s3_class(tc_unscaled, "phylo") +}) + +test_that("init = 'majority' works", { + trees <- as.phylo(0:19, nTip = 15) + + tc_empty <- TransferConsensus(trees, init = "empty") + tc_maj <- TransferConsensus(trees, init = "majority") + + expect_s3_class(tc_empty, "phylo") + expect_s3_class(tc_maj, "phylo") + # Both should produce reasonable trees + expect_gte(NSplits(tc_empty), 1) + expect_gte(NSplits(tc_maj), 1) +}) + +test_that("Error on bad input", { + expect_error(TransferConsensus(list(TreeTools::BalancedTree(5))), "multiPhylo") + expect_error(TransferConsensus( + structure(list(TreeTools::BalancedTree(5)), class = "multiPhylo")), + "at least 2") +}) + +test_that("Two-tree consensus returns a valid tree", { + trees <- as.phylo(1:2, nTip = 8) + tc <- TransferConsensus(trees) + expect_s3_class(tc, "phylo") + expect_equal(length(TipLabels(tc)), 8) +}) + + +# ========================================================================= +# C++ edge cases (transfer_consensus.cpp coverage) +# ========================================================================= + +test_that("TransferConsensus returns star tree for all-star input", { + star <- StarTree(8) + trees <- structure(rep(list(star), 3), class = "multiPhylo") + tc <- TransferConsensus(trees) + expect_s3_class(tc, "phylo") + expect_equal(NSplits(tc), 0) +}) + +test_that("TransferConsensus covers all parameter combinations", { + trees <- as.phylo(0:9, nTip = 10) + + # greedy="first", init="majority", scale=FALSE + tc <- TransferConsensus(trees, greedy 
= "first", init = "majority", + scale = FALSE) + expect_s3_class(tc, "phylo") + + # greedy="best", init="majority", scale=FALSE + tc2 <- TransferConsensus(trees, greedy = "best", init = "majority", + scale = FALSE) + expect_s3_class(tc2, "phylo") + + # greedy="first", init="empty", scale=FALSE + tc3 <- TransferConsensus(trees, greedy = "first", init = "empty", + scale = FALSE) + expect_s3_class(tc3, "phylo") +}) + +test_that("Greedy remove path fires with conflicting majority splits", { + # Two groups of trees with very different topologies: majority-init seeds + # splits from the larger group, some of which the greedy then removes. + t1 <- as.phylo(0, nTip = 12) + t2 <- as.phylo(100000, nTip = 12) + trees <- structure(c(rep(list(t1), 6), rep(list(t2), 5)), + class = "multiPhylo") + + # greedy="best" exercises do_remove + find_second in GreedyState + tc_best_s <- TransferConsensus(trees, greedy = "best", init = "majority", + scale = TRUE) + expect_s3_class(tc_best_s, "phylo") + + tc_best_u <- TransferConsensus(trees, greedy = "best", init = "majority", + scale = FALSE) + expect_s3_class(tc_best_u, "phylo") + + # greedy="first" exercises the same remove path via greedy_first + + tc_first_s <- TransferConsensus(trees, greedy = "first", init = "majority", + scale = TRUE) + expect_s3_class(tc_first_s, "phylo") + + tc_first_u <- TransferConsensus(trees, greedy = "first", init = "majority", + scale = FALSE) + expect_s3_class(tc_first_u, "phylo") +}) + +test_that("Greedy exercises diverse topologies (extra C++ path coverage)", { + # Random trees with many tips — sparse split overlap + set.seed(5123) + rand_trees <- structure( + lapply(1:30, function(i) TreeTools::RandomTree(10, root = TRUE)), + class = "multiPhylo") + # Diverse indexed trees + diverse_trees <- as.phylo(seq(0, 500, by = 25), nTip = 20) + # Three conflicting groups + mixed_trees <- structure(c( + rep(list(as.phylo(0, 8)), 5), + rep(list(as.phylo(300, 8)), 5), + rep(list(as.phylo(9999, 8)), 3) + ), class = 
"multiPhylo") + + for (trees in list(rand_trees, diverse_trees, mixed_trees)) { + for (gr in c("best", "first")) { + for (init in c("empty", "majority")) { + for (sc in c(TRUE, FALSE)) { + tc <- TransferConsensus(trees, greedy = gr, init = init, scale = sc) + expect_s3_class(tc, "phylo") + } + } + } + } +}) + +test_that("cpp_tc_profile() runs without error", { + trees <- as.phylo(0:4, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + splitsList <- lapply(trees, function(tr) unclass(as.Splits(tr, tipLabels))) + + # Greedy best, empty init + res <- TreeDist:::cpp_tc_profile(splitsList, 8L, TRUE, TRUE, FALSE, 1L, 1L) + expect_length(res, 5) + expect_true(all(res >= 0)) + expect_equal(names(res), + c("pool_splits", "transfer_dist_mat", "compute_td", + "compat_mat", "greedy")) + + # Greedy first, majority init, unscaled + res2 <- TreeDist:::cpp_tc_profile(splitsList, 8L, FALSE, FALSE, TRUE, 1L, 1L) + expect_length(res2, 5) + expect_true(all(res2 >= 0)) + + # Multiple iterations + res3 <- TreeDist:::cpp_tc_profile(splitsList, 8L, TRUE, TRUE, FALSE, 3L, 1L) + expect_length(res3, 5) + + # Greedy best, majority init + res4 <- TreeDist:::cpp_tc_profile(splitsList, 8L, TRUE, TRUE, TRUE, 1L, 1L) + expect_length(res4, 5) +}) + + +# ========================================================================= +# R-level internal helper functions (transfer_consensus.R lines 94-551) +# ========================================================================= + +test_that(".FlipRaw() flips bits and masks correctly", { + # 8 tips (1 byte, all bits used, mask = 0xFF) + expect_equal(TreeDist:::.FlipRaw(as.raw(0x07), 8), as.raw(0xf8)) + + # 5 tips (1 byte, 5 bits, mask = 0x1F) + expect_equal(TreeDist:::.FlipRaw(as.raw(0x07), 5), as.raw(0x18)) + + # Multi-byte: 10 tips (2 bytes, last byte 2 bits, mask = 0x03) + result <- TreeDist:::.FlipRaw(as.raw(c(0x00, 0x01)), 10) + expect_equal(result, as.raw(c(0xff, 0x02))) +}) + +test_that(".TransferDistMat() computes pairwise transfer distances", { + 
splitMat <- matrix(c( + TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, + TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, TRUE, TRUE, TRUE + ), nrow = 3, byrow = TRUE) + + DIST <- TreeDist:::.TransferDistMat(splitMat, 6) + expect_equal(dim(DIST), c(3, 3)) + expect_equal(diag(DIST), c(0, 0, 0)) + expect_true(isSymmetric(DIST)) + # {1,2} vs {1,2,3}: hamming = 1 + expect_equal(DIST[1, 2], 1) + # {1,2,3} vs {4,5,6}: complements → transfer = 0 + expect_equal(DIST[2, 3], 0) + + # Single split + DIST1 <- TreeDist:::.TransferDistMat( + matrix(c(TRUE, TRUE, FALSE, FALSE), nrow = 1), 4) + expect_equal(dim(DIST1), c(1, 1)) + expect_equal(DIST1[1, 1], 0) +}) + +test_that(".CompatMat() checks bipartition compatibility", { + splitMat <- matrix(c( + TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, + TRUE, FALSE, TRUE, FALSE, FALSE, FALSE + ), nrow = 3, byrow = TRUE) + + compat <- TreeDist:::.CompatMat(splitMat, 6) + expect_equal(dim(compat), c(3, 3)) + expect_true(compat[1, 2]) # {1,2} and {3,4}: disjoint → compatible + expect_false(compat[1, 3]) # {1,2} and {1,3}: all 4 intersections → incompatible + expect_true(isSymmetric(compat)) +}) + +test_that(".ComputeTD() works for both scale modes", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + lightSide <- c(3, 2, 3) + sentDist <- lightSide - 1L + treeMembers <- list(c(1, 2), c(2, 3)) + + td_s <- TreeDist:::.ComputeTD(DIST, sentDist, treeMembers, lightSide, 2, TRUE) + td_u <- TreeDist:::.ComputeTD(DIST, sentDist, treeMembers, lightSide, 2, FALSE) + expect_length(td_s, 3) + expect_length(td_u, 3) + expect_true(all(td_s >= 0)) + expect_true(all(td_u >= 0)) + + # Single member per tree + td_single <- TreeDist:::.ComputeTD( + DIST, sentDist, list(c(1), c(3)), lightSide, 2, TRUE) + expect_length(td_single, 3) +}) + +test_that(".IsCompat() handles edge cases", { + compat <- matrix(TRUE, 3, 3) + compat[1, 3] <- compat[3, 1] <- FALSE + + # Empty included → always compatible + 
expect_true(TreeDist:::.IsCompat(1, c(FALSE, FALSE, FALSE), compat, 10)) + # Too many included (>= nTip - 3) → never compatible + expect_false(TreeDist:::.IsCompat(1, c(TRUE, TRUE, TRUE), compat, 6)) + # Compatible with included set + expect_true(TreeDist:::.IsCompat(1, c(FALSE, TRUE, FALSE), compat, 10)) + # Incompatible with an included split + expect_false(TreeDist:::.IsCompat(1, c(FALSE, FALSE, TRUE), compat, 10)) +}) + +test_that(".Dist() returns correct distance", { + DIST <- matrix(c(0, 2, 3, 2, 0, 4, 3, 4, 0), 3, 3) + sentDist <- c(5, 6, 7) + + expect_equal(TreeDist:::.Dist(1, 2, DIST, sentDist), 2) + expect_equal(TreeDist:::.Dist(1, NA, DIST, sentDist), 5) + expect_equal(TreeDist:::.Dist(3, NA, DIST, sentDist), 7) +}) + +test_that(".DiagDist() vectorizes correctly", { + DIST <- matrix(c(0, 2, 3, 2, 0, 4, 3, 4, 0), 3, 3) + sentDist <- c(5, 6, 7) + + expect_equal( + TreeDist:::.DiagDist(c(2, NA, 1), DIST, sentDist), + c(DIST[1, 2], 6, DIST[3, 1]) + ) + expect_equal(TreeDist:::.DiagDist(c(NA, NA, NA), DIST, sentDist), sentDist) + expect_equal( + TreeDist:::.DiagDist(c(2, 3, 1), DIST, sentDist), + c(DIST[1, 2], DIST[2, 3], DIST[3, 1]) + ) +}) + +test_that(".FindSecond() works for both scale modes and edge cases", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + pMinus1 <- c(4, 4, 4) + + # Unscaled: exclude matchIdx, find best + expect_equal( + TreeDist:::.FindSecond(1, 2, c(1, 2, 3), DIST, pMinus1, FALSE), 1) + + # matchIdx = NA → search all + expect_equal( + TreeDist:::.FindSecond(1, NA_integer_, c(2, 3), DIST, pMinus1, FALSE), 2) + + # No candidates + expect_true(is.na( + TreeDist:::.FindSecond(1, 2, c(2), DIST, pMinus1, FALSE))) + + # All exceed threshold (unscaled) + DIST_far <- matrix(c(0, 5, 5, 5, 0, 5, 5, 5, 0), 3, 3) + expect_true(is.na( + TreeDist:::.FindSecond(1, NA_integer_, c(2, 3), DIST_far, c(2, 2, 2), FALSE) + )) + + # Scaled: normal case + expect_equal( + TreeDist:::.FindSecond(1, NA_integer_, c(2, 3), DIST, pMinus1, TRUE), 2) + + # 
Scaled: exceeds threshold + expect_true(is.na( + TreeDist:::.FindSecond(1, NA_integer_, c(2), DIST_far, c(2, 2, 2), TRUE) + )) +}) + +test_that(".DoAdd() updates match state correctly", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + sentDist <- c(4, 4, 4) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, 3) + st$MATCH2 <- rep(NA_integer_, 3) + st$incl <- rep(FALSE, 3) + + # First add: all splits match the newly added split + TreeDist:::.DoAdd(2, st, DIST, sentDist) + expect_true(st$incl[2]) + expect_equal(st$MATCH[1], 2) + expect_equal(st$MATCH[2], 2) + expect_equal(st$MATCH[3], 2) + + # Second add: closer splits update, old match demotes to second + TreeDist:::.DoAdd(1, st, DIST, sentDist) + expect_equal(st$MATCH[1], 1) # self-match closer + expect_equal(st$MATCH2[1], 2) # old match becomes second + expect_equal(st$MATCH[2], 2) # split 2 still closest to itself + expect_equal(st$MATCH2[2], 1) # split 1 becomes second +}) + +test_that(".DoRemove() handles sentinel promotion and affected2", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + sentDist <- c(4, 4, 4) + lightSide <- c(5, 5, 5) + + # Case 1: sentinel promotion (MATCH2 = NA, remove MATCH, no other included) + st <- new.env(parent = emptyenv()) + st$MATCH <- c(2L, NA_integer_, 2L) + st$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st$incl <- c(FALSE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st, DIST, sentDist, lightSide, FALSE) + expect_true(is.na(st$MATCH[1])) + expect_true(is.na(st$MATCH[3])) + + # Case 2: sentinel promotion with rescan finding another match + st2 <- new.env(parent = emptyenv()) + st2$MATCH <- c(2L, 2L, NA_integer_) + st2$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st2$incl <- c(TRUE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st2, DIST, sentDist, lightSide, FALSE) + expect_equal(st2$MATCH[1], 1L) # rescanned, found split 1 + + # Case 3: affected2 path (MATCH2 == removed, MATCH != removed) + st3 <- new.env(parent = emptyenv()) + st3$MATCH 
<- c(1L, 1L, 1L) + st3$MATCH2 <- c(2L, 2L, NA_integer_) + st3$incl <- c(TRUE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st3, DIST, sentDist, lightSide, FALSE) + expect_equal(st3$MATCH, c(1L, 1L, 1L)) # unchanged + expect_true(is.na(st3$MATCH2[1])) + expect_true(is.na(st3$MATCH2[2])) + + # Case 4: scaled mode + st4 <- new.env(parent = emptyenv()) + st4$MATCH <- c(2L, NA_integer_, NA_integer_) + st4$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st4$incl <- c(TRUE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st4, DIST, sentDist, lightSide, TRUE) + expect_equal(st4$MATCH[1], 1L) +}) + +test_that(".AddBenefitVec() computes add benefits", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + lightSide <- c(3, 3, 3) + sentDist <- lightSide - 1L + counts <- c(2, 3, 1) + TD <- c(0.5, 0.3, 0.8) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, 3) + st$MATCH2 <- rep(NA_integer_, 3) + st$incl <- rep(FALSE, 3) + + ben_s <- TreeDist:::.AddBenefitVec( + c(1, 2, 3), st, DIST, sentDist, TD, counts, lightSide, scale = TRUE) + expect_length(ben_s, 3) + + ben_u <- TreeDist:::.AddBenefitVec( + c(1, 2, 3), st, DIST, sentDist, TD, counts, lightSide, scale = FALSE) + expect_length(ben_u, 3) +}) + +test_that(".RemoveBenefitVec() computes remove benefits", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + lightSide <- c(3, 3, 3) + sentDist <- lightSide - 1L + counts <- c(2, 3, 1) + TD <- c(0.5, 0.3, 0.8) + + st <- new.env(parent = emptyenv()) + st$MATCH <- c(2L, 1L, 2L) + st$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st$incl <- c(TRUE, TRUE, FALSE) + + ben_s <- TreeDist:::.RemoveBenefitVec( + c(1, 2), st, DIST, sentDist, TD, counts, lightSide, scale = TRUE) + expect_length(ben_s, 2) + + ben_u <- TreeDist:::.RemoveBenefitVec( + c(1, 2), st, DIST, sentDist, TD, counts, lightSide, scale = FALSE) + expect_length(ben_u, 2) + + # No affected splits (MATCH doesn't reference candidate) + st2 <- new.env(parent = emptyenv()) + st2$MATCH <- c(1L, 1L, 1L) + 
st2$MATCH2 <- rep(NA_integer_, 3) + st2$incl <- c(TRUE, TRUE, FALSE) + + ben_none <- TreeDist:::.RemoveBenefitVec( + c(2), st2, DIST, sentDist, TD, counts, lightSide, scale = TRUE) + # Only TD contribution, no fn_cost (no affected splits) + expect_equal(ben_none, TD[2]) +}) + +test_that(".PoolSplits() pools and deduplicates splits", { + trees <- as.phylo(0:4, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + + expect_true(is.matrix(pool$splits)) + expect_true(is.matrix(pool$rawSplits)) + expect_equal(ncol(pool$splits), length(tipLabels)) + expect_equal(length(pool$counts), nrow(pool$splits)) + expect_equal(length(pool$lightSide), nrow(pool$splits)) + expect_equal(length(pool$treeMembers), length(trees)) + expect_true(all(pool$counts >= 1)) + expect_true(all(pool$lightSide >= 1)) +}) + +test_that(".SplitsToPhylo() converts splits to tree or star", { + trees <- as.phylo(0:2, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + + # Some splits included + included <- rep(FALSE, nrow(pool$rawSplits)) + included[1:min(2, nrow(pool$rawSplits))] <- TRUE + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, included, tipLabels, 8) + expect_s3_class(tr, "phylo") + expect_equal(sort(TipLabels(tr)), sort(tipLabels)) + + # No splits → star tree + tr_star <- TreeDist:::.SplitsToPhylo( + pool$rawSplits, rep(FALSE, nrow(pool$rawSplits)), tipLabels, 8) + expect_s3_class(tr_star, "phylo") + expect_equal(NSplits(tr_star), 0) +}) + +test_that(".InitMatches() initializes match state from included splits", { + # DIST: included = {1,2}; split 3 close to split 2 (dist 1 < thresh 2) + DIST <- matrix(c( + 0, 1, 3, 5, + 1, 0, 1, 4, + 3, 1, 0, 1, + 5, 4, 1, 0 + ), 4, 4, byrow = TRUE) + lightSide <- c(3, 3, 3, 3) + sentDist <- lightSide - 1L # all 2 + + # Scaled mode + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, 4) + st$MATCH2 <- rep(NA_integer_, 4) + st$incl <- c(TRUE, TRUE, 
FALSE, FALSE) + + TreeDist:::.InitMatches(st, DIST, sentDist, lightSide, scale = TRUE) + # Split 1: self-match (dist 0), second = split 2 (dist 1) + expect_equal(st$MATCH[1], 1) + expect_equal(st$MATCH2[1], 2) + # Split 3: best among included is split 2 (dist 1), 1/2 < 1 → matched + expect_equal(st$MATCH[3], 2) + # Split 4: best among included is split 2 (dist 4), 4/2 >= 1 → skipped + expect_true(is.na(st$MATCH[4])) + + # Unscaled mode + st2 <- new.env(parent = emptyenv()) + st2$MATCH <- rep(NA_integer_, 4) + st2$MATCH2 <- rep(NA_integer_, 4) + st2$incl <- c(TRUE, TRUE, FALSE, FALSE) + + TreeDist:::.InitMatches(st2, DIST, sentDist, lightSide, scale = FALSE) + expect_equal(st2$MATCH[1], 1) + expect_equal(st2$MATCH[3], 2) + + # Empty included set → no-op + st3 <- new.env(parent = emptyenv()) + st3$MATCH <- rep(NA_integer_, 4) + st3$MATCH2 <- rep(NA_integer_, 4) + st3$incl <- rep(FALSE, 4) + + TreeDist:::.InitMatches(st3, DIST, sentDist, lightSide, scale = TRUE) + expect_true(all(is.na(st3$MATCH))) +}) + + +# ========================================================================= +# Full R-level greedy pipeline integration tests +# ========================================================================= + +test_that("R-level GreedyBest pipeline produces valid consensus (scaled)", { + trees <- as.phylo(0:9, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = TRUE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- rep(FALSE, nSplits) + + TreeDist:::.GreedyBest(st, DIST, 
sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = TRUE, nSplits, nTip) + expect_true(any(st$incl)) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R-level GreedyFirst pipeline produces valid consensus (unscaled)", { + trees <- as.phylo(0:9, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = FALSE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- rep(FALSE, nSplits) + + TreeDist:::.GreedyFirst(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = FALSE, nSplits, nTip) + expect_true(any(st$incl)) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R-level GreedyBest with majority init exercises InitMatches", { + trees <- as.phylo(0:14, nTip = 10) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = TRUE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + + # Majority init: include splits present 
in > half of trees + st$incl <- pool$counts > (nTree / 2) + if (any(st$incl)) { + TreeDist:::.InitMatches(st, DIST, sentDist, pool$lightSide, scale = TRUE) + } + + TreeDist:::.GreedyBest(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = TRUE, nSplits, nTip) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R-level GreedyFirst with majority init (unscaled)", { + trees <- as.phylo(0:14, nTip = 10) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = FALSE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- pool$counts > (nTree / 2) + if (any(st$incl)) { + TreeDist:::.InitMatches(st, DIST, sentDist, pool$lightSide, scale = FALSE) + } + + TreeDist:::.GreedyFirst(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = FALSE, nSplits, nTip) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R greedy pipeline matches C++ consensus", { + trees <- as.phylo(0:9, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + # C++ result + tc_cpp <- TransferConsensus(trees, greedy = "best", scale = TRUE) + + # R-level result + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, 
pool$treeMembers, pool$lightSide, + nTree, scale = TRUE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- rep(FALSE, nSplits) + + TreeDist:::.GreedyBest(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = TRUE, nSplits, nTip) + + tc_r <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + + # Both should produce resolved trees with same tip labels + expect_s3_class(tc_r, "phylo") + expect_equal(sort(TipLabels(tc_r)), sort(TipLabels(tc_cpp))) + # Resolution should be similar (not necessarily identical due to + # possible differences in canonicalization/sort-order between R and C++) + expect_gte(NSplits(tc_r), 1) +}) diff --git a/tests/testthat/test-tree_distance.R b/tests/testthat/test-tree_distance.R index cd96e3e9..67319da5 100644 --- a/tests/testthat/test-tree_distance.R +++ b/tests/testthat/test-tree_distance.R @@ -76,16 +76,10 @@ test_that("Size mismatch causes error", { expect_error(MeilaVariationOfInformation(splits7, splits8), "Split lengths differ") - Test <- function(Func) { - expect_error(Func(splits8, as.Splits(BalancedTree(9)), 8)) - } - Test(cpp_robinson_foulds_distance) - Test(cpp_robinson_foulds_info) - Test(cpp_matching_split_distance) - Test(cpp_jaccard_similarity) - Test(cpp_msi_distance) - Test(cpp_mutual_clustering) - Test(cpp_shared_phylo) + # Validation is now in R (.ValidateSplitArgs), not C++ + splits9 <- as.Splits(BalancedTree(9)) + expect_error(.ValidateSplitArgs(splits8, splits9, 8L), + "same number of tips") }) test_that("Metrics handle polytomies", { diff --git a/tests/testthat/test-tree_distance_nni.R b/tests/testthat/test-tree_distance_nni.R index 8569231f..20bcda23 100644 --- a/tests/testthat/test-tree_distance_nni.R +++ b/tests/testthat/test-tree_distance_nni.R @@ -8,9 +8,8 @@ test_that("NNIDist() 
handles exceptions", { expect_error(NNIDist(list(PectinateTree(1:8), PectinateTree(as.character(1:8)))), "trees must bear identical labels") - expect_error(cpp_nni_distance( - PectinateTree(40000)$edge, # Will fail before not being postorder is problem - BalancedTree(40000)$edge, 40000), "so many tips") + # R-level guard catches too-many-tips + expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "so many tips") expect_error(NNIDist(BalancedTree(5), RootOnNode(BalancedTree(5), 1))) diff --git a/tests/testthat/test-tree_distance_transfer.R b/tests/testthat/test-tree_distance_transfer.R new file mode 100644 index 00000000..f3882a73 --- /dev/null +++ b/tests/testthat/test-tree_distance_transfer.R @@ -0,0 +1,494 @@ +test_that("TransferDist() returns 0 for identical trees", { + tr <- TreeTools::BalancedTree(8) + expect_equal(0, TransferDist(tr, tr)) + expect_equal(0, TransferDist(tr, tr, scale = FALSE)) +}) + + +test_that("TransferDist() is symmetric", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + expect_equal(TransferDist(bal8, pec8), TransferDist(pec8, bal8)) + expect_equal(TransferDist(bal8, pec8, scale = FALSE), + TransferDist(pec8, bal8, scale = FALSE)) +}) + + +test_that("TransferDist() scaled is bounded by RF", { + # The scaled transfer dissimilarity is bounded above by the RF distance + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + td_scaled <- TransferDist(bal8, pec8) + rf <- RobinsonFoulds(bal8, pec8) + expect_lte(td_scaled, rf) +}) + + +test_that("TransferDist() normalization", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + # Scaled normalization denominator = 2 * (n - 3) = 10 + td_raw <- TransferDist(bal8, pec8) + td_norm <- TransferDist(bal8, pec8, normalize = TRUE) + expect_equal(td_norm, td_raw / (2 * (8 - 3))) + expect_true(td_norm >= 0) + expect_true(td_norm <= 1) + + # Unscaled normalization + td_raw_u <- TransferDist(bal8, pec8, scale = FALSE) + 
td_norm_u <- TransferDist(bal8, pec8, scale = FALSE, normalize = TRUE) + # Even n: denom = (n^2 - 2n + 4) / 4 = (64 - 16 + 4) / 4 = 13 + expect_equal(td_norm_u, td_raw_u / 13) +}) + + +test_that("TransferDist() all-pairs", { + trees <- as.phylo(0:5, 8) + + # All-pairs + d <- TransferDist(trees) + expect_s3_class(d, "dist") + expect_equal(attr(d, "Size"), 6L) + + # Check diagonal = 0 implicitly (dist objects don't store diagonal) + # Check that batch matches pairwise + dm <- as.matrix(d) + for (i in 1:5) { + for (j in (i + 1):6) { + expected <- TransferDist(trees[[i]], trees[[j]]) + expect_equal(dm[i, j], expected, tolerance = 1e-12, + info = paste("pair", i, j)) + } + } +}) + + +test_that("TransferDist() cross-pairs", { + trees1 <- as.phylo(0:2, 8) + trees2 <- as.phylo(3:5, 8) + + mat <- TransferDist(trees1, trees2) + expect_equal(dim(mat), c(3, 3)) + + # Check matches pairwise + for (i in 1:3) { + for (j in 1:3) { + expected <- TransferDist(trees1[[i]], trees2[[j]]) + expect_equal(mat[i, j], expected, tolerance = 1e-12, + info = paste("cross", i, j)) + } + } +}) + + +test_that("TransferDist() single tree vs list", { + tr <- TreeTools::BalancedTree(8) + trees <- as.phylo(0:3, 8) + + d1 <- TransferDist(tr, trees) + d2 <- TransferDist(trees, tr) + + expect_length(d1, 4) + expect_length(d2, 4) + + for (i in seq_along(trees)) { + expected <- TransferDist(tr, trees[[i]]) + expect_equal(d1[i], expected, tolerance = 1e-12) + expect_equal(d2[i], expected, tolerance = 1e-12) + } +}) + + +test_that("TransferDist() scaled vs unscaled relationship", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + # Both should be non-negative + td_s <- TransferDist(bal8, pec8, scale = TRUE) + td_u <- TransferDist(bal8, pec8, scale = FALSE) + expect_true(td_s >= 0) + expect_true(td_u >= 0) + + # For identical trees, both = 0 + expect_equal(0, TransferDist(bal8, bal8, scale = TRUE)) + expect_equal(0, TransferDist(bal8, bal8, scale = FALSE)) +}) + + 
+test_that("TransferDist() reportMatching", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + res <- TransferDist(bal8, pec8, reportMatching = TRUE) + expect_equal(res[[1]], TransferDist(bal8, pec8)) + + matching <- attr(res, "matching") + expect_true(!is.null(matching)) + + pairScores <- attr(res, "pairScores") + expect_true(!is.null(pairScores)) + expect_equal(nrow(pairScores), TreeTools::NSplits(bal8)) + expect_equal(ncol(pairScores), TreeTools::NSplits(pec8)) +}) + + +test_that("TransferDist() handles star trees", { + star <- TreeTools::StarTree(8) + bal8 <- TreeTools::BalancedTree(8) + + # Star tree has no internal splits; dissimilarity = sum over binary tree only + td <- TransferDist(bal8, star) + expect_true(td >= 0) + + # Star vs star = 0 + expect_equal(0, TransferDist(star, star)) +}) + + +test_that("TransferDist() handles small trees", { + # 4 tips: minimal non-trivial case + tr1 <- TreeTools::BalancedTree(4) + tr2 <- TreeTools::PectinateTree(4) + + td <- TransferDist(tr1, tr2) + expect_true(is.finite(td)) + expect_true(td >= 0) +}) + + +test_that("TransferDist() consistent with TransferConsensus internals", { + skip_if_not_installed("TreeTools") + + # The transfer dissimilarity for a single pair should be consistent with + # how TransferConsensus computes distances internally + set.seed(6419) + trees <- structure(lapply(1:10, function(i) ape::rtree(12)), + class = "multiPhylo") + + # All-pairs: check symmetry and non-negativity + d <- TransferDist(trees) + dm <- as.matrix(d) + expect_true(all(dm >= -1e-12)) + expect_true(isSymmetric(dm, tol = 1e-12)) +}) + + +# ========================================================================= +# Additional coverage: TransferDistance alias +# ========================================================================= + +test_that("TransferDistance() is an alias for TransferDist()", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + 
expect_equal(TransferDistance(bal8, pec8), TransferDist(bal8, pec8)) + expect_equal(TransferDistance(bal8, pec8, scale = FALSE), + TransferDist(bal8, pec8, scale = FALSE)) +}) + + +# ========================================================================= +# cpp_transfer_dist() direct (both scaled + unscaled + matching) +# ========================================================================= + +test_that("cpp_transfer_dist() returns both scores and matchings", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp1 <- unclass(as.Splits(bal8, tipLabels)) + sp2 <- unclass(as.Splits(pec8, tipLabels)) + + res <- TreeDist:::cpp_transfer_dist(sp1, sp2, 8L) + + expect_true(res$score_scaled >= 0) + expect_true(res$score_unscaled >= 0) + expect_length(res$matching_xy, nrow(sp1)) + expect_length(res$matching_yx, nrow(sp2)) + + # Consistent with TransferDist + expect_equal(res$score_scaled, + TransferDist(bal8, pec8, scale = TRUE), tolerance = 1e-12) + expect_equal(res$score_unscaled, + TransferDist(bal8, pec8, scale = FALSE), tolerance = 1e-12) + + # Identical trees → distance 0, all matchings defined + res_id <- TreeDist:::cpp_transfer_dist(sp1, sp1, 8L) + expect_equal(res_id$score_scaled, 0) + expect_equal(res_id$score_unscaled, 0) +}) + +test_that("cpp_transfer_dist() handles star trees", { + star <- TreeTools::StarTree(8) + bal8 <- TreeTools::BalancedTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_star <- unclass(as.Splits(star, tipLabels)) + sp_bal <- unclass(as.Splits(bal8, tipLabels)) + + # Star vs binary: only one direction contributes + res <- TreeDist:::cpp_transfer_dist(sp_bal, sp_star, 8L) + expect_true(res$score_scaled >= 0) + expect_true(res$score_unscaled >= 0) + expect_length(res$matching_xy, nrow(sp_bal)) + # matching_yx has 0 elements for star tree + expect_length(res$matching_yx, 0) + + # Star vs star: both scores = 0 + res_ss <- TreeDist:::cpp_transfer_dist(sp_star, 
sp_star, 8L) + expect_equal(res_ss$score_scaled, 0) + expect_equal(res_ss$score_unscaled, 0) +}) + +test_that("column mismatch caught by R-level guard", { + sp1 <- matrix(as.raw(0), nrow = 1, ncol = 1) + sp2 <- matrix(as.raw(0), nrow = 1, ncol = 2) + expect_error( + TreeDist:::.ValidateSplitArgs(sp1, sp2, 8L), + "same number of tips" + ) +}) + + +# ========================================================================= +# cpp_transfer_dist_scored() edge cases +# ========================================================================= + +test_that("column mismatch caught by R-level guard (scored)", { + sp1 <- matrix(as.raw(0), nrow = 1, ncol = 1) + sp2 <- matrix(as.raw(0), nrow = 1, ncol = 2) + expect_error( + TreeDist:::.ValidateSplitArgs(sp1, sp2, 8L), + "same number of tips" + ) +}) + +test_that("cpp_transfer_dist_scored() with asymmetric split counts", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_bal <- unclass(as.Splits(bal8, tipLabels)) + sp_pec <- unclass(as.Splits(pec8, tipLabels)) + + # BalancedTree(8) has fewer splits than PectinateTree(8) + res <- TreeDist:::cpp_transfer_dist_scored(sp_bal, sp_pec, 8L, TRUE) + expect_true(res$score >= 0) + # matching length = max(nSplits1, nSplits2) + expect_equal(length(res$matching), + max(nrow(sp_bal), nrow(sp_pec))) +}) + + +# ========================================================================= +# Normalization edge cases +# ========================================================================= + +test_that(".TransferNormDenom() for odd and even nTip", { + # Even nTip = 8: (64 - 16 + 4) / 4 = 13 + expect_equal(TreeDist:::.TransferNormDenom(8, FALSE), 13) + # Odd nTip = 9: (81 - 18 + 5) / 4 = 17 + expect_equal(TreeDist:::.TransferNormDenom(9, FALSE), 17) + # Scaled: 2 * (n - 3) + expect_equal(TreeDist:::.TransferNormDenom(8, TRUE), 10) + expect_equal(TreeDist:::.TransferNormDenom(9, TRUE), 12) +}) + +test_that("TransferDist() 
normalize with odd nTip", { + trees <- as.phylo(0:3, nTip = 9) + # Scaled normalization + d <- TransferDist(trees, normalize = TRUE) + d_raw <- TransferDist(trees, normalize = FALSE) + denom <- 2 * (9 - 3) + expect_equal(as.numeric(d), as.numeric(d_raw) / denom, tolerance = 1e-12) + + # Unscaled normalization with odd nTip + d_u <- TransferDist(trees, scale = FALSE, normalize = TRUE) + d_u_raw <- TransferDist(trees, scale = FALSE, normalize = FALSE) + denom_u <- (81 - 18 + 5) / 4 + expect_equal(as.numeric(d_u), as.numeric(d_u_raw) / denom_u, tolerance = 1e-12) +}) + +test_that("TransferDist() normalize with cross-pairs", { + trees1 <- as.phylo(0:2, 8) + trees2 <- as.phylo(3:5, 8) + + mat_raw <- TransferDist(trees1, trees2) + mat_norm <- TransferDist(trees1, trees2, normalize = TRUE) + denom <- 2 * (8 - 3) + expect_equal(mat_norm, mat_raw / denom, tolerance = 1e-12) + + # Unscaled cross-pairs normalize + mat_raw_u <- TransferDist(trees1, trees2, scale = FALSE) + mat_norm_u <- TransferDist(trees1, trees2, scale = FALSE, normalize = TRUE) + denom_u <- 13 + expect_equal(mat_norm_u, mat_raw_u / denom_u, tolerance = 1e-12) +}) + +test_that("TransferDist() normalize via generic fallback (reportMatching)", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + # reportMatching forces generic path; normalize divides after + res <- TransferDist(bal8, pec8, normalize = TRUE, reportMatching = TRUE) + raw <- TransferDist(bal8, pec8, reportMatching = TRUE) + denom <- 2 * (8 - 3) + expect_equal(res[[1]], raw[[1]] / denom, tolerance = 1e-12) + + # Unscaled + normalize + reportMatching + res_u <- TransferDist(bal8, pec8, scale = FALSE, + normalize = TRUE, reportMatching = TRUE) + raw_u <- TransferDist(bal8, pec8, scale = FALSE, reportMatching = TRUE) + expect_equal(res_u[[1]], raw_u[[1]] / 13, tolerance = 1e-12) +}) + + +# ========================================================================= +# Fast path edge cases +# 
========================================================================= + +test_that(".TransferDistAllPairs() returns NULL for edge cases", { + # Single phylo (not multiPhylo) + tr <- TreeTools::BalancedTree(8) + expect_null(TreeDist:::.TransferDistAllPairs(tr, TRUE)) + + # < 2 trees + expect_null(TreeDist:::.TransferDistAllPairs( + structure(list(tr), class = "multiPhylo"), TRUE)) + + # < 4 tips + small <- as.phylo(0:2, nTip = 3) + expect_null(TreeDist:::.TransferDistAllPairs(small, TRUE)) + + # Mismatched tip labels + t1 <- TreeTools::BalancedTree(paste0("a", 1:6)) + t2 <- TreeTools::BalancedTree(paste0("b", 1:6)) + mixed <- structure(list(t1, t2), class = "multiPhylo") + expect_null(TreeDist:::.TransferDistAllPairs(mixed, TRUE)) +}) + +test_that(".TransferDistCrossPairs() returns NULL for edge cases", { + tr1 <- TreeTools::BalancedTree(8) + tr2 <- TreeTools::PectinateTree(8) + + # Both single → delegate to generic + expect_null(TreeDist:::.TransferDistCrossPairs(tr1, tr2, TRUE)) + + # < 4 tips + small1 <- as.phylo(0:1, nTip = 3) + small2 <- as.phylo(2:3, nTip = 3) + expect_null(TreeDist:::.TransferDistCrossPairs(small1, small2, TRUE)) + + # Mismatched tip labels + t1 <- structure(list(TreeTools::BalancedTree(paste0("a", 1:6))), + class = "multiPhylo") + t2 <- structure(list(TreeTools::BalancedTree(paste0("b", 1:6))), + class = "multiPhylo") + expect_null(TreeDist:::.TransferDistCrossPairs(t1, t2, TRUE)) +}) + +test_that("TransferDist() all-pairs with star trees in mix", { + star <- TreeTools::StarTree(8) + bal <- TreeTools::BalancedTree(8) + pec <- TreeTools::PectinateTree(8) + trees <- structure(list(star, bal, pec), class = "multiPhylo") + + d <- TransferDist(trees) + dm <- as.matrix(d) + expect_equal(dm[1, 1], 0) # star self + expect_true(dm[1, 2] >= 0) # star vs binary + expect_equal(dm[2, 3], TransferDist(bal, pec), tolerance = 1e-12) +}) + +test_that("TransferDist() all-pairs with exactly 2 trees", { + trees <- as.phylo(0:1, nTip = 8) + d <- 
TransferDist(trees) + expect_s3_class(d, "dist") + expect_equal(attr(d, "Size"), 2L) + expect_equal(as.numeric(d), + TransferDist(trees[[1]], trees[[2]]), tolerance = 1e-12) +}) + +test_that("TransferDist() all-pairs with exactly 4 tips", { + trees <- as.phylo(0:3, nTip = 4) + d <- TransferDist(trees) + expect_s3_class(d, "dist") + dm <- as.matrix(d) + expect_true(all(dm >= -1e-12)) +}) + + +# ========================================================================= +# TransferDistSplits edge cases +# ========================================================================= + +test_that("TransferDistSplits() reportMatching with various configurations", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_bal <- as.Splits(bal8, tipLabels) + sp_pec <- as.Splits(pec8, tipLabels) + + # Unscaled reportMatching + res <- TransferDistSplits(sp_bal, sp_pec, scale = FALSE, + reportMatching = TRUE) + expect_true(is.numeric(res)) + expect_true(!is.null(attr(res, "matching"))) + expect_true(!is.null(attr(res, "pairScores"))) + expect_true(!is.null(attr(res, "matchedScores"))) + + # Scaled reportMatching (already tested above, but confirm matchedSplits) + res_s <- TransferDistSplits(sp_bal, sp_pec, scale = TRUE, + reportMatching = TRUE) + expect_true(!is.null(attr(res_s, "matchedSplits"))) +}) + +test_that("TransferDistSplits() with asymmetric nSplits (nSplits1 < nSplits2)", { + # Use majority-rule consensus (fewer splits) vs fully resolved tree + trees <- as.phylo(0:9, 8) + mr <- Consensus(trees, p = 0.5) + bal8 <- TreeTools::BalancedTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_mr <- as.Splits(mr, tipLabels) + sp_bal <- as.Splits(bal8, tipLabels) + + # Confirm asymmetry + expect_lt(nrow(sp_mr), nrow(sp_bal)) + + res <- TransferDistSplits(sp_mr, sp_bal, reportMatching = TRUE) + # Matching should be truncated to nSplits1 + matching <- attr(res, "matching") + expect_equal(length(matching), 
nrow(sp_mr)) + expect_true(!is.null(attr(res, "pairScores"))) +}) + +test_that("TransferDist() for 3-tip trees uses fallback path", { + # 3 tips: fast path returns NULL, falls through to generic + tr1 <- TreeTools::BalancedTree(3) + tr2 <- TreeTools::PectinateTree(3) + td <- TransferDist(tr1, tr2) + expect_true(is.finite(td)) + expect_true(td >= 0) +}) + +test_that("TransferDist() with named tree list preserves labels", { + trees <- as.phylo(0:3, nTip = 8) + names(trees) <- paste0("tree_", 1:4) + + d <- TransferDist(trees) + expect_equal(attr(d, "Labels"), paste0("tree_", 1:4)) +}) + +test_that("TransferDist() cross-pairs with single Splits input", { + bal8 <- TreeTools::BalancedTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + sp <- as.Splits(bal8, tipLabels) + trees <- as.phylo(0:3, nTip = 8) + + # Single Splits object vs list of trees + result <- TransferDist(sp, trees) + expect_length(result, 4) + expect_true(all(result >= 0)) +})