diff --git a/.github/workflows/ASan.yml b/.github/workflows/ASan.yml index 90a74a66..7c0b0a95 100644 --- a/.github/workflows/ASan.yml +++ b/.github/workflows/ASan.yml @@ -55,8 +55,9 @@ jobs: run: | export LD_PRELOAD=$(gcc -print-file-name=libasan.so) - echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" > src/Makevars - echo "PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars + # Replace PKG_CXXFLAGS in-place; preserve PKG_CPPFLAGS and PKG_LIBS + sed -i 's/^PKG_CXXFLAGS.*/PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer/' src/Makevars + echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars mkdir ~/.R echo "LDFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> ~/.R/Makevars diff --git a/.positai/settings.json b/.positai/settings.json index 7aba4f95..acec4998 100644 --- a/.positai/settings.json +++ b/.positai/settings.json @@ -28,6 +28,17 @@ "rm -f src/*.o src/*.dll": "allow", "tail *": "allow", "wc *": "allow" + }, + "external_directory": { + "C:/Users/pjjg18/GitHub/TreeSearch-a/*": "allow", + "C:/Users/pjjg18/.positai/skills/r-package-profiling/references/*": "allow", + "../Quartet/*": "allow", + "../TreeTools/*": "allow" + }, + "webfetch": { + "https://github.com/*": "allow", + "https://api.github.com/*": "allow", + "https://raw.githubusercontent.com/*": "allow" } }, "model": { diff --git a/AGENTS.md b/AGENTS.md index 7ed0028b..6d81e883 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -257,6 +257,20 @@ source("benchmark/bench-tree-distances.R") C++ compilation flags are controlled by `src/Makevars.win` (Windows) / `src/Makevars`. The package requires **C++17** (`CXX_STD = CXX17`). 
+### Documentation and spelling checks + +After any work that adds or modifies roxygen comments, Rd files, NEWS.md, or +vignettes, run: + +```r +devtools::check_man() # regenerates Rd files and checks for issues +spelling::spell_check_package() # flags potential misspellings +``` + +Legitimate technical terms, proper nouns, and code identifiers flagged by the +spell checker should be added to `inst/WORDLIST` (one word per line, +alphabetically sorted). Only fix actual typos in the source. + ### Code coverage Check that existing tests cover all new code. The GHA test suite uses codecov. @@ -271,6 +285,26 @@ covr::file_coverage(cov, "src/pairwise_distances.cpp") # per-file summary Aim for full line coverage of new C++ and R code. If a new code path is not exercised by the existing test suite, add targeted tests in `tests/testthat/`. +### ⚠ Exploratory / risky R code — use a subprocess + +When running experimental R code that may be slow, allocate large objects, +or involve complex loops (e.g., hill-climbing over tree space, brute-force +evaluation of many candidate trees), **run it in a subprocess** rather than +the interactive RStudio session. Long-running or memory-heavy computations +in the main session can freeze or crash RStudio. + +```r +# Write the experiment to a temp script, then run via Rscript: +writeLines(code, tmp <- tempfile(fileext = ".R")) +system2("Rscript", tmp, stdout = TRUE, stderr = TRUE) + +# Or use callr for structured subprocess execution: +callr::r(function() { ... }, timeout = 120) +``` + +This applies especially to prototype algorithm exploration (e.g., CID +hill-climbing over split space) where per-iteration cost is uncertain. + --- ## Completed Optimizations (this dev cycle) @@ -1143,6 +1177,52 @@ for the cross-pairs case. 
--- +## LinkingTo Header Exposure (`expose-lapjv` branch) + +Extracted LAP and MCI C++ APIs into `inst/include/TreeDist/` headers so that +downstream packages (e.g., TreeSearch) can use `LinkingTo: TreeDist`: + +| Header | Content | +|--------|---------| +| `types.h` | `cost`, `lap_dim`, `lap_row`, `lap_col`, constants | +| `cost_matrix.h` | `CostMatrix` class (Rcpp-free) | +| `lap_scratch.h` | `LapScratch` struct | +| `lap.h` | `lap()` declarations | +| `lap_impl.h` | `lap()` implementation (include in exactly one TU) | +| `mutual_clustering.h` | MCI declarations | +| `mutual_clustering_impl.h` | MCI implementation (include in exactly one TU) | + +`src/lap.h` is now a thin wrapper that includes `` and +re-exports types to global scope. + +### LAPJV codegen regression (diagnosed, partially mitigated) + +Including `lap_impl.h` in `lap.cpp` changed the TU context enough for GCC 14's +register allocator to produce ~8% more instructions in the Dijkstra hot loop, +causing a consistent 20–25% regression on standalone LAPJV (n ≥ 400). + +**Root cause:** the installed-header version of `CostMatrix` (in +`inst/include/TreeDist/cost_matrix.h`) has a different method set than main's +monolithic `src/lap.h` (extra methods like `setWithTranspose()`, `dim8()`; +missing test variants like `findRowSubminNaive`). This changes GCC's +optimization heuristics for the entire TU, even though `lap()` never calls +the differing methods. + +**Fix:** define `lap()` directly in `lap.cpp` (not via `#include `) +with `__attribute__((optimize("align-functions=64", "align-loops=16")))`. +The `lapjv()` wrapper fills the transposed buffer first (matching R's +column-major storage) then untransposes — restoring the cache-friendly pattern. + +**Residual:** ~5–9% on LAPJV 1999×1999 vs main (from the CostMatrix class +definition difference). Tree distance metrics are unaffected — they call +`lap()` from `pairwise_distances.cpp`, whose TU context is unchanged. 
+ +**Maintenance note:** if the `lap()` algorithm changes, update BOTH `src/lap.cpp` +and `inst/include/TreeDist/lap_impl.h`. The duplication is intentional — it +preserves the TU context that was profiled and tuned. + +--- + ## Remaining Optimization Opportunities - Sort+merge pre-scan for `rf_info_score`: **DONE** — replaced O(n²) loop with diff --git a/NAMESPACE b/NAMESPACE index 60386873..9516b984 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -141,6 +141,10 @@ export(StrainCol) export(SumOfRanges) export(SumOfVariances) export(SumOfVars) +export(TransferConsensus) +export(TransferDist) +export(TransferDistSplits) +export(TransferDistance) export(TreeDistPlot) export(TreeDistance) export(TreesConsistentWithTwoSplits) @@ -152,6 +156,7 @@ export(entropy_int) export(is.HPart) importFrom(Rdpack,reprompt) importFrom(TreeTools,AllAncestors) +importFrom(TreeTools,Consensus) importFrom(TreeTools,DropTip) importFrom(TreeTools,FirstMatchingSplit) importFrom(TreeTools,KeepTip) @@ -179,6 +184,7 @@ importFrom(TreeTools,RootTree) importFrom(TreeTools,SplitFrequency) importFrom(TreeTools,SplitInformation) importFrom(TreeTools,SplitsInBinaryTree) +importFrom(TreeTools,StarTree) importFrom(TreeTools,TipLabels) importFrom(TreeTools,TipsInSplits) importFrom(TreeTools,TreeIsRooted) diff --git a/NEWS.md b/NEWS.md index 8fc11b03..082fec34 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,23 @@ + +# Branch: `expose-lapjv` → merge target: `master` + +This branch exposes the LAPJV (Jonker–Volgenant) linear assignment solver +via C++ headers so that downstream packages can link to it (`LinkingTo`), +and implements a transfer consensus method (`TransferConsensus()`, +`TransferDist()`). It also tightens error handling by moving all +`Rcpp::stop()` / `Rf_error()` calls out of C++ implementation code +into R-level input validation. 
+ +# TreeDist 2.13.0.9000 + +This branch implements transfer consensus trees (`TransferConsensus()`) and +a corresponding transfer distance metric (`TransferDist()`), providing a +consensus method based on the transfer distance rather than the majority-rule +or strict approaches. Error handling in C++ code is tightened to validate +inputs at the R level rather than using `Rcpp::stop()` or `Rf_error()` in +implementation code. + + # TreeDist 2.13.0 (2026-03-17) ## New features @@ -68,7 +88,7 @@ Typical speedups over v2.12.0 for tree sets where most splits are shared - Cross-pairs computations (`tree1` vs `tree2` where both are lists) now use the same optimized batch path as all-pairs computations. -### KendallColijn distance +### Kendall & Colijn distance - `KCVector()` reimplemented in C++, giving ~220× speedup per tree. diff --git a/R/RcppExports.R b/R/RcppExports.R index 140dd0e9..9e4d02b2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -200,6 +200,77 @@ spr_table_7 <- function(sp1, sp2) { .Call(`_TreeDist_spr_table_7`, sp1, sp2) } +#' Transfer consensus (C++ implementation) +#' +#' @param splits_list List of raw matrices (one per tree), each from as.Splits(). +#' @param n_tip Number of tips. +#' @param scale Logical: use scaled transfer distance? +#' @param greedy_best_flag Logical: TRUE for "best", FALSE for "first". +#' @param init_majority Logical: TRUE to start from majority-rule splits. +#' +#' @return A `LogicalVector` of length n_splits indicating which pooled splits +#' are included in the consensus, plus attributes "raw_splits" (a raw matrix +#' of all unique splits) and "light_side" (integer vector). 
+#' @keywords internal +cpp_transfer_consensus <- function(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_threads = 1L) { + .Call(`_TreeDist_cpp_transfer_consensus`, splits_list, n_tip, scale, greedy_best_flag, init_majority, n_threads) +} + +#' @keywords internal +cpp_tc_profile <- function(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_iter, n_threads = 1L) { + .Call(`_TreeDist_cpp_tc_profile`, splits_list, n_tip, scale, greedy_best_flag, init_majority, n_iter, n_threads) +} + +#' Per-pair transfer dissimilarity +#' +#' @param x,y Raw matrices representing splits (from as.Splits()). +#' @param nTip Integer: number of tips. +#' +#' @return A list with components: +#' - score_scaled: scaled transfer dissimilarity (double) +#' - score_unscaled: unscaled transfer dissimilarity (double) +#' - `matching_xy`: integer vector, best match in y for each split in x (1-based, NA if sentinel) +#' - `matching_yx`: integer vector, best match in x for each split in y (1-based, NA if sentinel) +#' @keywords internal +cpp_transfer_dist <- function(x, y, nTip) { + .Call(`_TreeDist_cpp_transfer_dist`, x, y, nTip) +} + +#' @keywords internal +cpp_transfer_dist_scored <- function(x, y, nTip, scale) { + .Call(`_TreeDist_cpp_transfer_dist_scored`, x, y, nTip, scale) +} + +#' All-pairs transfer dissimilarity (OpenMP) +#' +#' @param splits_list List of raw matrices (one per tree). +#' @param n_tip Number of tips. +#' @param scale Logical: use scaled transfer dissimilarity? +#' @param n_threads Number of OpenMP threads. +#' +#' @return Numeric vector of length choose(N,2) in dist order. +#' @keywords internal +cpp_transfer_dist_all_pairs <- function(splits_list, n_tip, scale, n_threads) { + .Call(`_TreeDist_cpp_transfer_dist_all_pairs`, splits_list, n_tip, scale, n_threads) +} + +#' Cross-pairs transfer dissimilarity (OpenMP) +#' +#' @param splits_a,splits_b Lists of raw matrices. +#' @param n_tip Number of tips. 
+#' @param scale Logical: use scaled transfer dissimilarity? +#' @param n_threads Number of OpenMP threads. +#' +#' @return Numeric matrix of dimension `nA` x `nB`. +#' @keywords internal +cpp_transfer_dist_cross_pairs <- function(splits_a, splits_b, n_tip, scale, n_threads) { + .Call(`_TreeDist_cpp_transfer_dist_cross_pairs`, splits_a, splits_b, n_tip, scale, n_threads) +} + +cpp_mci_impl_score <- function(x, y, n_tips) { + .Call(`_TreeDist_cpp_mci_impl_score`, x, y, n_tips) +} + cpp_robinson_foulds_distance <- function(x, y, nTip) { .Call(`_TreeDist_cpp_robinson_foulds_distance`, x, y, nTip) } diff --git a/R/hierarchical_mutual_information.R b/R/hierarchical_mutual_information.R index 06948acd..862d5c93 100644 --- a/R/hierarchical_mutual_information.R +++ b/R/hierarchical_mutual_information.R @@ -66,6 +66,10 @@ HierarchicalMutualInfo <- function(tree1, tree2 = NULL, normalize = FALSE) { 1 } } else { + if (inherits(tree1, "phylo") && inherits(tree2, "phylo") && + NTip(tree1) != NTip(tree2)) { + stop("Trees must have the same number of leaves") + } hp2 <- as.HPart(tree2, tree1) hmi <- HMI_xptr(hp1, hp2) if (isFALSE(normalize)) { @@ -132,6 +136,8 @@ HH <- SelfHMI #' @rdname HierarchicalMutualInfo #' @export EHMI <- function(tree1, tree2, precision = 0.01, minResample = 36) { + if (minResample < 2L) stop("Must perform at least one resampling") + if (precision < 1e-8) stop("Tolerance too low") EHMI_xptr(as.HPart(tree1), as.HPart(tree2), as.numeric(precision), as.integer(minResample)) / log(2) } @@ -162,6 +168,8 @@ EHMI <- function(tree1, tree2, precision = 0.01, minResample = 36) { #' @rdname HierarchicalMutualInfo #' @export AHMI <- function(tree1, tree2, Mean = max, precision = 0.01, minResample = 36) { + if (minResample < 2L) stop("Must perform at least one resampling") + if (precision < 1e-8) stop("Tolerance too low") hp1 <- as.HPart(tree1) hp2 <- as.HPart(tree2, hp1) diff --git a/R/transfer_consensus.R b/R/transfer_consensus.R new file mode 100644 index 
00000000..384cff0d --- /dev/null +++ b/R/transfer_consensus.R @@ -0,0 +1,550 @@ +#' Consensus tree minimizing transfer distance +#' +#' Construct a consensus tree that minimizes the sum of transfer distances +#' to a set of input trees, using a greedy add-and-prune heuristic. +#' +#' Unlike the majority-rule consensus, which minimizes Robinson-Foulds +#' distance and can be highly unresolved when phylogenetic signal is low, +#' `TransferConsensus()` uses the finer-grained transfer distance +#' \insertCite{Lemoine2018}{TreeDist} to construct a more resolved consensus +#' tree. +#' +#' The algorithm pools all splits observed across input trees, computes +#' pairwise transfer distances between them, and greedily adds or removes +#' splits to minimize total transfer dissimilarity cost. The approach +#' follows \insertCite{Takazawa2026;textual}{TreeDist}, reimplemented for +#' 'TreeDist' infrastructure. +#' +#' @param trees An object of class `multiPhylo`: the input trees. +#' All trees must share the same tip labels. +#' @param scale Logical; if `TRUE` (default), use the scaled transfer +#' distance (normalized by light-side size minus one). If `FALSE`, use +#' the unscaled (raw Hamming) transfer distance. +#' @param greedy Character string specifying the greedy strategy: +#' `"best"` (default) picks the single highest-benefit action at each step; +#' `"first"` picks the first improving action encountered (faster but +#' potentially lower quality). +#' @param init Character string specifying the initial consensus: +#' `"empty"` (default) starts with no splits (purely additive); +#' `"majority"` starts with the majority-rule consensus and refines. +#' +#' @return A tree of class `phylo`. 
+#' +#' @references +#' \insertAllCited{} +#' +#' @examples +#' library(TreeTools) +#' # Generate bootstrap-like trees +#' trees <- as.phylo(1:20, nTip = 12) +#' +#' # Transfer consensus (more resolved than majority rule) +#' tc <- TransferConsensus(trees) +#' plot(tc) +#' +#' # Compare resolution +#' mr <- Consensus(trees, p = 0.5) +#' cat("Majority-rule splits:", NSplits(mr), "\n") +#' cat("Transfer consensus splits:", NSplits(tc), "\n") +#' +#' @importFrom TreeTools as.Splits TipLabels NSplits Consensus StarTree +#' @export +TransferConsensus <- function(trees, + scale = TRUE, + greedy = c("best", "first"), + init = c("empty", "majority")) { + greedy <- match.arg(greedy) + init <- match.arg(init) + + if (!inherits(trees, "multiPhylo")) { + stop("`trees` must be an object of class 'multiPhylo'.") + } + nTree <- length(trees) + if (nTree < 2L) stop("Need at least 2 trees.") + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + + # Convert each tree to a raw split matrix (TreeTools C++ internally) + splitsList <- lapply(trees, function(tr) { + sp <- as.Splits(tr, tipLabels) + unclass(sp) + }) + + # Delegate all work to C++ + nThreads <- max(1L, getOption("TreeDist.threads", + getOption("mc.cores", 1L))) + res <- cpp_transfer_consensus( + splitsList, nTip, scale, + greedy_best_flag = (greedy == "best"), + init_majority = (init == "majority"), + n_threads = nThreads + ) + + included <- res$included + if (!any(included)) { + return(StarTree(tipLabels)) + } + + rawSplits <- res$raw_splits[included, , drop = FALSE] + sp <- structure(rawSplits, nTip = nTip, tip.label = tipLabels, + class = "Splits") + as.phylo(sp) +} + + +# =========================================================================== +# Internal helpers +# =========================================================================== + +# popcount lookup for 0:255 +.POPCOUNT <- as.integer(sapply(0:255, function(x) sum(as.integer(intToBits(x))))) + +#' Pool unique splits, returning an integer 
(logical) matrix +#' @noRd +.PoolSplits <- function(trees, tipLabels) { + nTip <- length(tipLabels) + nTree <- length(trees) + + # Collect all splits as logical matrices and hash to unique set + env <- new.env(hash = TRUE, parent = emptyenv()) + splitsList <- list() + rawSplitsList <- list() + countVec <- integer(0) + treeMembers <- vector("list", nTree) + nextIdx <- 1L + + for (i in seq_len(nTree)) { + sp <- as.Splits(trees[[i]], tipLabels) + logMat <- as.logical(sp) # matrix: nSplits x nTip + # The Splits object is a raw matrix internally; preserve structure + rawMat <- unclass(sp) + if (is.null(dim(logMat))) { + logMat <- matrix(logMat, nrow = 1) + rawMat <- matrix(rawMat, nrow = 1) + } + nSp <- nrow(logMat) + # Canonicalise: ensure tip 1 is FALSE + nSp <- nrow(logMat) + members <- integer(nSp) + for (j in seq_len(nSp)) { + row <- logMat[j, ] + rawRow <- rawMat[j, ] + if (row[1]) { + row <- !row + rawRow <- .FlipRaw(rawRow, nTip) + } + key <- paste0(which(row), collapse = ",") + idx <- env[[key]] + if (is.null(idx)) { + env[[key]] <- nextIdx + splitsList[[nextIdx]] <- row + rawSplitsList[[nextIdx]] <- rawRow + countVec[nextIdx] <- 1L + members[j] <- nextIdx + nextIdx <- nextIdx + 1L + } else { + countVec[idx] <- countVec[idx] + 1L + members[j] <- idx + } + } + treeMembers[[i]] <- unique(members) + } + + nSplits <- length(splitsList) + splitMat <- matrix(FALSE, nrow = nSplits, ncol = nTip) + nBytes <- length(rawSplitsList[[1]]) + rawMat <- matrix(as.raw(0), nrow = nSplits, ncol = nBytes) + for (k in seq_len(nSplits)) { + splitMat[k, ] <- splitsList[[k]] + rawMat[k, ] <- rawSplitsList[[k]] + } + + lightSide <- pmin(rowSums(splitMat), nTip - rowSums(splitMat)) + + list( + splits = splitMat, # logical matrix nSplits x nTip + rawSplits = rawMat, # raw matrix nSplits x nBytes + counts = countVec, + lightSide = as.integer(lightSide), + treeMembers = treeMembers + ) +} + + +.FlipRaw <- function(rawVec, nTip) { + nBytes <- length(rawVec) + usedBits <- ((nTip - 1L) %% 8L) 
+ 1L + lastMask <- as.raw(sum(2^(0:(usedBits - 1L)))) + out <- xor(rawVec, as.raw(0xff)) + out[nBytes] <- out[nBytes] & lastMask + out +} + + +#' Pairwise transfer distance matrix using logical split matrix +#' Transfer distance = min(hamming(a, b), n - hamming(a, b)) +#' hamming(a,b) when both are logical = sum(xor(a,b)) +#' @noRd +.TransferDistMat <- function(splitMat, nTip) { + # splitMat is logical: nSplits x nTip + # hamming = number of differing positions + # Use tcrossprod trick: hamming(a,b) = sum(a) + sum(b) - 2*sum(a&b) + # = nAgreeing... actually let's just compute XOR directly. + # Faster: hamming = nTip - 2 * (a %*% t(b) + (1-a) %*% t(1-b)) + # = nTip - 2 * (a %*% t(b) + (nTip - rowSums(a) - colSums(b) + a %*% t(b))) + # Simpler: agreement = a %*% t(b) + (1-a) %*% t(1-b) + # = 2 * (a %*% t(b)) - rowSums(a) - rep(rowSums(b)) + nTip + # hamming = nTip - agreement + sm <- splitMat + 0L # convert to integer + ab <- tcrossprod(sm) # sm %*% t(sm) + rs <- rowSums(sm) + hamming <- nTip - 2L * ab + outer(rs, rs, "+") - nTip + # Simplifies to: hamming = outer(rs, rs, "+") - 2 * ab + # Wait, let me re-derive: + # agreement_ij = sum(a_i == b_j) = sum(a&b) + sum(!a & !b) + # = ab[i,j] + (nTip - rs[i] - rs[j] + ab[i,j]) + # = 2*ab[i,j] + nTip - rs[i] - rs[j] + # hamming_ij = nTip - agreement_ij = rs[i] + rs[j] - 2*ab[i,j] + hamming <- outer(rs, rs, "+") - 2L * ab + + # Transfer distance = min(hamming, nTip - hamming) + pmin(hamming, nTip - hamming) +} + + +#' Compute TD (transfer dissimilarity cost) for each split +#' @noRd +.ComputeTD <- function(DIST, sentDist, treeMembers, lightSide, nTree, scale) { + nSplits <- nrow(DIST) + TD <- numeric(nSplits) + pMinus1 <- lightSide - 1L + + for (i in seq_len(nTree)) { + idx <- treeMembers[[i]] + # For each split b, min distance to any split in tree i + if (length(idx) == 1L) { + minD <- DIST[, idx] + } else { + minD <- apply(DIST[, idx, drop = FALSE], 1, min) + } + # Also consider sentinel distance + minD <- pmin(minD, 
sentDist) + if (scale) { + TD <- TD + pmin(minD / pMinus1, 1) + } else { + TD <- TD + pmin(minD, pMinus1) + } + } + TD +} + + +#' Compatibility matrix via vectorized bipartition check +#' @noRd +.CompatMat <- function(splitMat, nTip) { + # Two splits compatible iff one of the 4 intersections (A&B, A&~B, ~A&B, ~A&~B) is empty + sm <- splitMat + 0L # integer 0/1 matrix + notSm <- 1L - sm + + # A&B: a[i,] & b[j,] -> any nonzero -> tcrossprod(sm, sm) > 0 + ab <- tcrossprod(sm, sm) > 0L + anb <- tcrossprod(sm, notSm) > 0L + nab <- tcrossprod(notSm, sm) > 0L + nanb <- tcrossprod(notSm, notSm) > 0L + + # Compatible if at least one intersection is empty + !ab | !anb | !nab | !nanb +} + + +#' Initialize MATCH/MATCH2 from currently included splits +#' @noRd +.InitMatches <- function(st, DIST, sentDist, lightSide, scale) { + nSplits <- length(st$MATCH) + incIdx <- which(st$incl) + pMinus1 <- lightSide - 1L + + if (length(incIdx) == 0L) return(invisible()) + + # For each split b, find 1st and 2nd closest among included + subDIST <- DIST[, incIdx, drop = FALSE] + for (b in seq_len(nSplits)) { + dists <- subDIST[b, ] + ord <- order(dists) + bestDist <- dists[ord[1]] + thresh <- pMinus1[b] + if (scale && bestDist / thresh >= 1) next + if (!scale && bestDist >= thresh) next + st$MATCH[b] <- incIdx[ord[1]] + if (length(ord) > 1L) { + secDist <- dists[ord[2]] + if (scale && secDist / thresh < 1) { + st$MATCH2[b] <- incIdx[ord[2]] + } else if (!scale && secDist < thresh) { + st$MATCH2[b] <- incIdx[ord[2]] + } + } + } +} + + +#' Get distance from split b to its match (NA = sentinel) +#' @noRd +.Dist <- function(b, idx, DIST, sentDist) { + if (is.na(idx)) sentDist[b] else DIST[b, idx] +} + + +#' Vectorized add-benefit: returns benefit for each candidate +#' @noRd +.AddBenefitVec <- function(candidates, st, DIST, sentDist, TD, counts, + lightSide, scale) { + nSplits <- length(st$MATCH) + pMinus1 <- lightSide - 1L + + # Current match distances for all splits + matchDist <- 
ifelse(is.na(st$MATCH), sentDist, .DiagDist(st$MATCH, DIST, sentDist)) + + benefits <- numeric(length(candidates)) + for (ci in seq_along(candidates)) { + c <- candidates[ci] + newDist <- DIST[, c] + if (scale) { + diff <- (matchDist - newDist) / pMinus1 + } else { + diff <- matchDist - newDist + } + diff[diff < 0] <- 0 + benefits[ci] <- sum(diff * counts) - TD[c] + } + benefits +} + +#' Helper: get DIST\[b, MATCH\[b\]\] for all b, vectorized +#' @noRd +.DiagDist <- function(matchVec, DIST, sentDist) { + n <- length(matchVec) + out <- numeric(n) + notNA <- !is.na(matchVec) + if (any(notNA)) { + out[notNA] <- DIST[cbind(which(notNA), matchVec[notNA])] + } + out[!notNA] <- sentDist[!notNA] + out +} + + +#' Vectorized remove-benefit +#' @noRd +.RemoveBenefitVec <- function(candidates, st, DIST, sentDist, TD, counts, + lightSide, scale) { + nSplits <- length(st$MATCH) + pMinus1 <- lightSide - 1L + + # For remove, only splits whose MATCH == candidate are affected + benefits <- numeric(length(candidates)) + matchDist <- .DiagDist(st$MATCH, DIST, sentDist) + match2Dist <- .DiagDist(st$MATCH2, DIST, sentDist) + + for (ci in seq_along(candidates)) { + c <- candidates[ci] + affected <- st$MATCH == c & !is.na(st$MATCH) + if (any(affected)) { + if (scale) { + fn_cost <- sum((DIST[affected, c] - match2Dist[affected]) / + pMinus1[affected] * counts[affected]) + } else { + fn_cost <- sum((DIST[affected, c] - match2Dist[affected]) * + counts[affected]) + } + } else { + fn_cost <- 0 + } + benefits[ci] <- TD[c] + fn_cost + } + benefits +} + + +.DoAdd <- function(branchIdx, st, DIST, sentDist) { + st$incl[branchIdx] <- TRUE + nSplits <- length(st$MATCH) + + curMatchDist <- .DiagDist(st$MATCH, DIST, sentDist) + newDist <- DIST[, branchIdx] + + better <- newDist < curMatchDist + if (any(better)) { + st$MATCH2[better] <- st$MATCH[better] + st$MATCH[better] <- branchIdx + } + + # Check if it becomes second match for others + notBetter <- !better + secDist <- .DiagDist(st$MATCH2, DIST, 
sentDist) + betterSec <- notBetter & (newDist < secDist) + if (any(betterSec)) { + st$MATCH2[betterSec] <- branchIdx + } +} + + +.DoRemove <- function(branchIdx, st, DIST, sentDist, lightSide, scale) { + st$incl[branchIdx] <- FALSE + nSplits <- length(st$MATCH) + curInc <- which(st$incl) + pMinus1 <- lightSide - 1L + + # Splits whose first match was branchIdx + affected1 <- which(st$MATCH == branchIdx & !is.na(st$MATCH)) + if (length(affected1)) { + st$MATCH[affected1] <- st$MATCH2[affected1] + for (b in affected1) { + if (is.na(st$MATCH[b])) { + # Promoted value was sentinel — rescan for actual closest + st$MATCH[b] <- .FindSecond(b, NA_integer_, curInc, DIST, + pMinus1, scale) + } + # Find new second match + st$MATCH2[b] <- .FindSecond(b, st$MATCH[b], curInc, DIST, + pMinus1, scale) + } + } + + # Splits whose second match was branchIdx + affected2 <- which(st$MATCH2 == branchIdx & !is.na(st$MATCH2)) + if (length(affected2)) { + for (b in affected2) { + st$MATCH2[b] <- .FindSecond(b, st$MATCH[b], curInc, DIST, + pMinus1, scale) + } + } +} + + +.FindSecond <- function(b, matchIdx, curInc, DIST, pMinus1, scale) { + cands <- if (is.na(matchIdx)) curInc else setdiff(curInc, matchIdx) + if (length(cands) == 0L) return(NA_integer_) + dists <- DIST[b, cands] + best <- cands[which.min(dists)] + bestDist <- DIST[b, best] + if (scale && bestDist / pMinus1[b] >= 1) return(NA_integer_) + if (!scale && bestDist >= pMinus1[b]) return(NA_integer_) + best +} + + +.IsCompat <- function(idx, incl, compat, nTip) { + incIdx <- which(incl) + if (length(incIdx) == 0L) return(TRUE) + if (length(incIdx) >= nTip - 3L) return(FALSE) + all(compat[idx, incIdx]) +} + + +.GreedyBest <- function(st, DIST, sentDist, TD, counts, lightSide, + compat, sortOrd, scale, nSplits, nTip) { + repeat { + # Evaluate all candidates + addCands <- integer(0) + remCands <- integer(0) + for (idx in sortOrd) { + if (st$incl[idx]) { + remCands <- c(remCands, idx) + } else if (.IsCompat(idx, st$incl, compat, nTip)) 
{ + addCands <- c(addCands, idx) + } + } + + bestBen <- 0 + bestIdx <- 0L + bestAction <- "" + + if (length(addCands)) { + addBen <- .AddBenefitVec(addCands, st, DIST, sentDist, TD, counts, + lightSide, scale) + mx <- max(addBen) + if (mx > bestBen) { + bestBen <- mx + bestIdx <- addCands[which.max(addBen)] + bestAction <- "add" + } + } + if (length(remCands)) { + remBen <- .RemoveBenefitVec(remCands, st, DIST, sentDist, TD, counts, + lightSide, scale) + mx <- max(remBen) + if (mx > bestBen) { + bestBen <- mx + bestIdx <- remCands[which.max(remBen)] + bestAction <- "remove" + } + } + + if (bestBen <= 0) break + if (bestAction == "add") { + .DoAdd(bestIdx, st, DIST, sentDist) + } else { + .DoRemove(bestIdx, st, DIST, sentDist, lightSide, scale) + } + } +} + + +.GreedyFirst <- function(st, DIST, sentDist, TD, counts, lightSide, + compat, sortOrd, scale, nSplits, nTip) { + improving <- TRUE + while (improving) { + improving <- FALSE + matchDist <- .DiagDist(st$MATCH, DIST, sentDist) + match2Dist <- .DiagDist(st$MATCH2, DIST, sentDist) + pMinus1 <- lightSide - 1L + + for (idx in sortOrd) { + if (st$incl[idx]) { + # Quick remove benefit + affected <- st$MATCH == idx & !is.na(st$MATCH) + if (any(affected)) { + if (scale) { + fn <- sum((DIST[affected, idx] - match2Dist[affected]) / + pMinus1[affected] * counts[affected]) + } else { + fn <- sum((DIST[affected, idx] - match2Dist[affected]) * + counts[affected]) + } + } else { + fn <- 0 + } + if (TD[idx] + fn > 0) { + .DoRemove(idx, st, DIST, sentDist, lightSide, scale) + improving <- TRUE + break + } + } else if (.IsCompat(idx, st$incl, compat, nTip)) { + newDist <- DIST[, idx] + if (scale) { + diff <- pmax((matchDist - newDist) / pMinus1, 0) + } else { + diff <- pmax(matchDist - newDist, 0) + } + if (sum(diff * counts) - TD[idx] > 0) { + .DoAdd(idx, st, DIST, sentDist) + improving <- TRUE + break + } + } + } + } +} + + +.SplitsToPhylo <- function(rawSplits, included, tipLabels, nTip) { + selectedIdx <- which(included) + if 
(length(selectedIdx) == 0L) { + return(TreeTools::StarTree(tipLabels)) + } + selectedRaw <- rawSplits[selectedIdx, , drop = FALSE] + sp <- structure(selectedRaw, nTip = nTip, tip.label = tipLabels, + class = "Splits") + as.phylo(sp) +} diff --git a/R/tree_distance.R b/R/tree_distance.R index 466de37e..11bf9e91 100644 --- a/R/tree_distance.R +++ b/R/tree_distance.R @@ -35,6 +35,7 @@ #' @export GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, maximize, reportMatching, ...) { + .ValidateSplitArgs(splits1, splits2, nTip) nSplits1 <- dim(splits1)[[1]] nSplits2 <- dim(splits2)[[1]] @@ -148,6 +149,7 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, } nTip <- length(tipLabels) if (nTip < 4) return(NULL) # nocov + if (nTip > 32767L) stop("This many tips are not (yet) supported.") splits_list <- as.Splits(tree1, tipLabels = tipLabels) n_threads <- as.integer(getOption("mc.cores", 1L)) @@ -201,6 +203,7 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, nTip <- length(tipLabels1) if (nTip < 4) return(NULL) + if (nTip > 32767L) stop("This many tips are not (yet) supported.") splits1 <- as.Splits(tree1, tipLabels = tipLabels1) splits2 <- as.Splits(tree2, tipLabels = tipLabels1) # Use tipLabels1 to ensure order consistency diff --git a/R/tree_distance_mast.R b/R/tree_distance_mast.R index 4cdd06e0..5c069841 100644 --- a/R/tree_distance_mast.R +++ b/R/tree_distance_mast.R @@ -93,6 +93,12 @@ MASTSize <- function(tree1, tree2 = tree1, rooted = TRUE) { #' @param nTip Integer specifying the number of leaves in each split. 
#' @keywords internal .MASTSizeEdges <- function(edge1, edge2, nTip) { + if (nrow(edge1) != nrow(edge2)) { + stop("Both trees must contain the same number of edges.") + } + if (nTip > 4096L) { + stop("Tree too large; please contact maintainer for advice.") + } cpp_mast(edge1 - 1L, Postorder(edge2) - 1L, nTip) } diff --git a/R/tree_distance_nni.R b/R/tree_distance_nni.R index 7cce0a83..45854764 100644 --- a/R/tree_distance_nni.R +++ b/R/tree_distance_nni.R @@ -73,6 +73,13 @@ NNIDist <- function(tree1, tree2 = tree1) { #' @importFrom TreeTools Postorder RenumberTips #' @importFrom ape Nnode.phylo .NNIDistSingle <- function(tree1, tree2, nTip, ...) { + if (nTip > 32768L) { + stop("Cannot calculate NNI distance for trees with so many tips.") + } + if (nrow(tree1[["edge"]]) != nrow(tree2[["edge"]])) { + stop("Both trees must have the same number of edges. ", + "Is one rooted and the other unrooted?") + } tree2 <- RenumberTips(tree2, as.character(tree1[["tip.label"]])) edge1 <- Postorder(tree1[["edge"]]) diff --git a/R/tree_distance_spr.R b/R/tree_distance_spr.R index 844d054b..85657a37 100644 --- a/R/tree_distance_spr.R +++ b/R/tree_distance_spr.R @@ -251,6 +251,9 @@ SPRDist.multiPhylo <- SPRDist.list } .SPRExact7 <- function(sp1, sp2) { + if (length(sp1) != 4L || length(sp2) != 4L) { + stop("Expected a length-4 raw vector of splits") # nocov + } spr_table_7(sp1, sp2) } @@ -407,7 +410,7 @@ SPRDist.multiPhylo <- SPRDist.list # \insertCite{deOliveira2008;textual}{TreeDist} # An exact match is unlikely due to the arbitrary breaking of ties when leaves # are selected for removal. -# SPR approximation via Oliveira Martins _et al._ (2008) +#' SPR approximation via Oliveira Martins _et al._ (2008) #' @examples #' # de Oliveira Martins et al 2008, fig. 
7 #' tree1 <- ape::read.tree(text = "((1, 2), ((a, b), (c, d)), (3, (4, (5, (6, 7)))));") @@ -421,6 +424,11 @@ SPRDist.multiPhylo <- SPRDist.list #' @importFrom TreeTools DropTip TipsInSplits KeepTipPostorder #' @importFrom TreeTools edge_to_splits .SPRPairDeO <- function(tree1, tree2, check = TRUE) { + if (check) { + if (length(tree1[["tip.label"]]) != length(tree2[["tip.label"]])) { + stop("`tree1` and `tree2` must have the same number of tips.") + } + } moves <- 0 # Reduce trees (Fig. 7A in deOliveira2008) @@ -481,3 +489,37 @@ SPRDist.multiPhylo <- SPRDist.list # Return: moves } + +# R-level validation wrappers for internal C++ functions. +# C++ uses ASSERT() (compiled out in release); these ensure user-visible errors. +mismatch_size <- function(x, y) { + if (!inherits(x, "Splits") || is.null(attr(x, "nTip"))) { + stop("`x` lacks nTip attribute") + } + if (!inherits(y, "Splits") || is.null(attr(y, "nTip"))) { + stop("`y` lacks nTip attribute") + } + if (attr(x, "nTip") != attr(y, "nTip")) { + stop("`x` and `y` differ in `nTip`") + } + if (nrow(x) != nrow(y)) { + stop("`x` and `y` differ in number of splits.") + } + .Call(`_TreeDist_mismatch_size`, x, y) +} + +confusion <- function(x, y) { + if (!inherits(x, "Splits") || is.null(attr(x, "nTip"))) { + stop("`x` lacks nTip attribute") + } + if (!inherits(y, "Splits") || is.null(attr(y, "nTip"))) { + stop("`y` lacks nTip attribute") + } + if (attr(x, "nTip") != attr(y, "nTip")) { + stop("`x` and `y` differ in `nTip`") + } + if (nrow(x) != nrow(y)) { + stop("Input splits must contain same number of splits.") + } + .Call(`_TreeDist_confusion`, x, y) +} diff --git a/R/tree_distance_transfer.R b/R/tree_distance_transfer.R new file mode 100644 index 00000000..cd0942fd --- /dev/null +++ b/R/tree_distance_transfer.R @@ -0,0 +1,260 @@ +#' Transfer dissimilarity between phylogenetic trees +#' +#' Compute the transfer dissimilarity between phylogenetic trees, as defined +#' by \insertCite{Takazawa2026;textual}{TreeDist}. 
+#' The transfer dissimilarity uses the transfer distance +#' \insertCite{Lemoine2018}{TreeDist} to compare bipartitions, providing a +#' finer-grained measure than the Robinson--Foulds distance. Each branch in +#' each tree is scored by how many taxa must be moved to match its closest +#' counterpart in the other tree, and these scores are summed. +#' +#' The `scaled` variant divides each branch's contribution by its depth minus +#' one, giving equal weight to all branches regardless of their depth (analogous +#' to the Robinson--Foulds distance). The `unscaled` variant uses raw transfer +#' distances, giving more weight to deep branches. Neither variant satisfies +#' the triangle inequality for trees with six or more tips. +#' +#' @inheritParams TreeDistance +#' @param scale Logical; if `TRUE` (default), use the scaled transfer +#' dissimilarity. If `FALSE`, use the unscaled transfer dissimilarity. +#' +#' @return `TransferDist()` returns an object of class `dist` (if `tree2` is +#' `NULL`), a numeric matrix (if both `tree1` and `tree2` are lists), or a +#' numeric value (for a single pair). If `reportMatching = TRUE`, the +#' return value carries `matching` and `pairScores` attributes. +#' +#' @section Normalization: +#' +#' When `normalize = TRUE`, the scaled transfer dissimilarity is divided by +#' `2 * (n - 3)`, placing it in the range \[0, 1\]. The unscaled version is +#' divided by the maximum possible unscaled dissimilarity +#' (following \insertCite{Takazawa2026;textual}{TreeDist}). 
+#' +#' @examples +#' library(TreeTools) +#' TransferDist(BalancedTree(8), PectinateTree(8)) +#' TransferDist(BalancedTree(8), PectinateTree(8), scale = FALSE) +#' +#' # All-pairs +#' TransferDist(as.phylo(0:5, 8)) +#' +#' @references +#' \insertAllCited{} +#' +#' @family tree distances +#' +#' @importFrom TreeTools as.Splits TipLabels NSplits +#' @export +TransferDist <- function(tree1, tree2 = NULL, scale = TRUE, + normalize = FALSE, reportMatching = FALSE) { + + # --- Fast path: all-pairs (tree2 = NULL) --- + if (is.null(tree2) && !reportMatching) { + fast <- .TransferDistAllPairs(tree1, scale) + if (!is.null(fast)) { + if (!isFALSE(normalize)) { + nTip <- length(TipLabels(tree1[[1]])) + denom <- .TransferNormDenom(nTip, scale) + fast <- fast / denom + } + return(fast) + } + } + + # --- Fast path: cross-pairs --- + if (!is.null(tree2) && !reportMatching) { + fast <- .TransferDistCrossPairs(tree1, tree2, scale) + if (!is.null(fast)) { + if (!isFALSE(normalize)) { + nTip <- length(TipLabels( + if (inherits(tree1, c("phylo", "Splits"))) tree1 else tree1[[1]])) + denom <- .TransferNormDenom(nTip, scale) + fast <- fast / denom + } + return(fast) + } + } + + # --- Generic fallback via CalculateTreeDistance --- + # Capture `scale` in the closure for the Splits-level function + Func <- function(splits1, splits2, nTip, reportMatching = FALSE) { + TransferDistSplits(splits1, splits2, nTip = nTip, + scale = scale, reportMatching = reportMatching) + } + unnormalized <- CalculateTreeDistance(Func, tree1, tree2, reportMatching) + + if (!isFALSE(normalize)) { + nTip <- length(TipLabels( + if (inherits(tree1, c("phylo", "Splits"))) tree1 else tree1[[1]])) + denom <- .TransferNormDenom(nTip, scale) + unnormalized <- unnormalized / denom + } + + unnormalized +} + +#' @rdname TransferDist +#' @export +TransferDistance <- TransferDist + +#' @rdname TransferDist +#' @inheritParams SharedPhylogeneticInfoSplits +#' @export +TransferDistSplits <- function(splits1, splits2, + nTip = 
attr(splits1, "nTip"), + scale = TRUE, + reportMatching = FALSE) { + .ValidateSplitArgs(splits1, splits2, nTip) + solution <- cpp_transfer_dist_scored(splits1, splits2, + nTip = as.integer(nTip), + scale = scale) + ret <- solution[["score"]] + + if (reportMatching) { + nSplits1 <- nrow(splits1) + nSplits2 <- nrow(splits2) + matching <- solution[["matching"]] + matching[matching > nSplits2 | matching == 0L] <- NA + if (nSplits1 < nSplits2) { + matching <- matching[seq_len(nSplits1)] + } + attr(ret, "matching") <- matching + + # Compute full pairwise score matrix for reportMatching + pairScores <- matrix(0, nSplits1, nSplits2) + for (i in seq_len(nSplits1)) { + for (j in seq_len(nSplits2)) { + # Per-pair: the transfer distance contribution + # This is the individual δ(b_i, b*_j) / (depth(b_i) - 1) for scaled + # or min(δ(b_i, b*_j), depth(b_i) - 1) for unscaled + pair_res <- cpp_transfer_dist_scored( + splits1[i, , drop = FALSE], + splits2[j, , drop = FALSE], + nTip = as.integer(nTip), + scale = scale + ) + pairScores[i, j] <- pair_res[["score"]] + } + } + + if (!is.null(attr(splits1, "tip.label"))) { + matched1 <- !is.na(matching) + matched2 <- matching[matched1] + matched1 <- which(matched1) + attr(ret, "matchedSplits") <- + ReportMatching(splits1[[matched1]], splits2[[matched2]], + realMatch = rep(TRUE, length(matched1))) + } + + attr(ret, "matchedScores") <- pairScores[ + matrix(c(seq_along(matching), matching), ncol = 2L)] + attr(ret, "pairScores") <- pairScores + } + + ret +} + + +# ============================================================================ +# Internal helpers +# ============================================================================ + +# All-pairs fast path +.TransferDistAllPairs <- function(tree1, scale) { + if (inherits(tree1, c("phylo", "Splits"))) return(NULL) + if (length(tree1) < 2L) return(NULL) + + tipLabels <- TipLabels(tree1[[1]]) + if (is.null(tipLabels)) return(NULL) + nTip <- length(tipLabels) + if (nTip < 4L) return(NULL) + 
if (nTip > 32767L) stop("This many tips are not (yet) supported.") + + # Check all trees share same tip set + allLabels <- TipLabels(tree1) + if (is.list(allLabels)) { + if (!all(vapply(allLabels[-1], setequal, logical(1), tipLabels))) { + return(NULL) + } + } + + splitsList <- lapply(tree1, function(tr) { + unclass(as.Splits(tr, tipLabels)) + }) + + nThreads <- max(1L, getOption("TreeDist.threads", + getOption("mc.cores", 1L))) + + dists <- cpp_transfer_dist_all_pairs(splitsList, nTip, scale, nThreads) + + N <- length(tree1) + attr(dists, "Size") <- N + attr(dists, "Diag") <- FALSE + attr(dists, "Upper") <- FALSE + nms <- names(tree1) + if (!is.null(nms)) attr(dists, "Labels") <- nms + class(dists) <- "dist" + dists +} + + +# Cross-pairs fast path +.TransferDistCrossPairs <- function(tree1, tree2, scale) { + single1 <- inherits(tree1, c("phylo", "Splits")) + single2 <- inherits(tree2, c("phylo", "Splits")) + if (single1 && single2) return(NULL) # use generic path for single pair + + trees1 <- if (single1) list(tree1) else tree1 + trees2 <- if (single2) list(tree2) else tree2 + + tipLabels <- TipLabels(trees1[[1]]) + if (is.null(tipLabels)) return(NULL) + nTip <- length(tipLabels) + if (nTip < 4L) return(NULL) + if (nTip > 32767L) stop("This many tips are not (yet) supported.") + + # Check all trees share same tip set + allLabels1 <- TipLabels(trees1) + allLabels2 <- TipLabels(trees2) + if (is.list(allLabels1)) { + if (!all(vapply(allLabels1, setequal, logical(1), tipLabels))) return(NULL) + } + if (is.list(allLabels2)) { + if (!all(vapply(allLabels2, setequal, logical(1), tipLabels))) return(NULL) + } else { + if (!setequal(allLabels2, tipLabels)) return(NULL) + } + + splits1 <- lapply(trees1, function(tr) unclass(as.Splits(tr, tipLabels))) + splits2 <- lapply(trees2, function(tr) unclass(as.Splits(tr, tipLabels))) + + nThreads <- max(1L, getOption("TreeDist.threads", + getOption("mc.cores", 1L))) + + mat <- cpp_transfer_dist_cross_pairs(splits1, splits2, nTip, 
scale, nThreads) + + rownames(mat) <- names(trees1) + colnames(mat) <- names(trees2) + + # If one input was a single tree, simplify to vector + if (single1) return(mat[1, ]) + if (single2) return(mat[, 1]) + mat +} + + +# Normalization denominator +.TransferNormDenom <- function(nTip, scale) { + if (scale) { + # Scaled: each tree contributes at most (n-3) branches × 1.0 + 2.0 * (nTip - 3L) + } else { + # Unscaled: maximum dissimilarity between two caterpillar trees + # Takazawa 2026: (n^2 - 2n + 4)/4 for even n, (n^2 - 2n + 5)/4 for odd n + if (nTip %% 2L == 0L) { + (nTip^2 - 2 * nTip + 4) / 4 + } else { + (nTip^2 - 2 * nTip + 5) / 4 + } + } +} diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 0ad954de..561174bf 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -132,6 +132,9 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, # Fast paths: use OpenMP batch functions when all trees share the same tip # set and no R-level cluster has been configured. Each branch mirrors the # generic path exactly but avoids per-pair R overhead. + if (!is.na(nTip) && nTip > 32767L) { + stop("This many tips are not (yet) supported.") + } if (!is.na(nTip) && is.null(cluster)) { .n_threads <- as.integer(getOption("mc.cores", 1L)) .batch_result <- if (identical(Func, MutualClusteringInfoSplits)) { @@ -232,7 +235,9 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, #' @importFrom stats setNames .SplitDistanceManyMany <- function(Func, splits1, splits2, tipLabels, nTip = length(tipLabels), ...) { - + if (!is.na(nTip) && nTip > 32767L) { + stop("This many tips are not (yet) supported.") + } if (is.na(nTip)) { tipLabels <- union(unlist(tipLabels, use.names = FALSE), unlist(TipLabels(splits2), use.names = FALSE)) @@ -396,6 +401,18 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, } } +# Validate split-matrix arguments before passing to C++. 
+# On ARM, Rcpp::stop() can call std::terminate() instead of returning +# a proper R error, so all expected-input validation must happen in R. +.ValidateSplitArgs <- function(x, y, nTip) { + if (ncol(x) != ncol(y)) { + stop("Input splits must address same number of tips.") + } + if (nTip > 32767L) { + stop("This many tips are not (yet) supported.") + } +} + .CheckLabelsSame <- function(labelList) { nTip <- unique(lengths(labelList)) if (length(nTip) != 1) { diff --git a/R/tree_information.R b/R/tree_information.R index 767c918d..580298c8 100644 --- a/R/tree_information.R +++ b/R/tree_information.R @@ -380,6 +380,23 @@ ConsensusInfo <- function(trees, info = "phylogenetic", p = 0.5, consensus_info(trees, mode == 1L, p = safeP) } +# R-level validation wrapper; shadows RcppExports version. +# C++ ASSERT() is compiled out in release builds. +consensus_info <- function(trees, phylo, p) { + if (p > 1 + 1e-15) { + stop("p must be <= 1.0 in consensus_info()") + } + if (p < 0.5) { + stop("p must be >= 0.5 in consensus_info()") + } + nTip <- NTip(trees[[1]]) + # CT_MAX_LEAVES = 16383 in information.h (lookup table size limit) + if (nTip > 16383L) { + stop("This many leaves are not yet supported") + } + .Call(`_TreeDist_consensus_info`, trees, phylo, p) +} + #' Maximum Clade Information Tree #' #' Analogous to the Maximum Clade Credibility tree: diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 6614f645..f2f400a3 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -1,3 +1,22 @@ +@article{Lemoine2018, + author = {Lemoine, Fr{\'e}d{\'e}ric and Domelevo Entfellner, Jean-Baka and Wilkinson, Eduan and Correia, Daniela and D{\'a}vila Felipe, Miraine and De Oliveira, Tulio and Gascuel, Olivier}, + title = {{Renewing Felsenstein's phylogenetic bootstrap in the era of big data}}, + journal = {Nature}, + year = {2018}, + volume = {556}, + number = {7702}, + pages = {452--456}, + doi = {10.1038/s41586-018-0043-0} +} + +@article{Takazawa2026, + author = {Takazawa, 
Yuuki and Takeda, Akiko and Hayamizu, Momoko and Gascuel, Olivier}, + title = {{Outperforming the majority-rule consensus tree using fine-grained dissimilarity measures}}, + journal = {bioRxiv}, + year = {2026}, + doi = {10.64898/2026.03.16.712085} +} + @article{Allen2001, author = {Allen, Benjamin L. and Steel, Mike A.}, doi = {10.1007/s00026-001-8006-8}, diff --git a/inst/WORDLIST b/inst/WORDLIST index d3c333c8..ca6a431f 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -1,6 +1,7 @@ ABCD Bocker Bogdanowicz +branchless Böcker Cai Colijn diff --git a/inst/include/TreeDist/cost_matrix.h b/inst/include/TreeDist/cost_matrix.h new file mode 100644 index 00000000..fe4dddcf --- /dev/null +++ b/inst/include/TreeDist/cost_matrix.h @@ -0,0 +1,276 @@ +#ifndef TREEDIST_COST_MATRIX_H_ +#define TREEDIST_COST_MATRIX_H_ + +// TreeDist CostMatrix — cache-aligned, block-padded cost matrix for the LAP. +// +// Rcpp-free public API. The Rcpp-dependent constructor +// (CostMatrix(Rcpp::NumericMatrix)) lives in TreeDist's own src/lap.h. + +#include "types.h" + +#include +#include +#include +#include + +namespace TreeDist { + +class CostMatrix { +private: + std::size_t dim_; + std::size_t dim8_; + alignas(64) std::vector data_; + alignas(64) std::vector t_data_; + bool transposed_; + + static std::size_t block_containing(const std::size_t x) { + return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; + } + +public: + // Default constructor for pooled instances (zero-size, no allocation). + CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} + + explicit CostMatrix(std::size_t dim) + : dim_(dim), + dim8_(block_containing(dim_)), + data_(std::vector(dim8_ * dim8_)), + t_data_(std::vector(dim8_ * dim8_)), + transposed_(false) {} + + // Resize for reuse. Only reallocates when the new dimension exceeds the + // current buffer capacity; otherwise just updates dim_/dim8_ and marks + // the transpose as stale. 
+ void resize(std::size_t new_dim) { + dim_ = new_dim; + dim8_ = block_containing(new_dim); + const std::size_t needed = dim8_ * dim8_; + if (data_.size() < needed) { + data_.resize(needed); + t_data_.resize(needed); + } + transposed_ = false; + } + + void reset() noexcept { transposed_ = false; } + + [[nodiscard]] std::size_t dim() const noexcept { return dim_; } + [[nodiscard]] std::size_t dim8() const noexcept { return dim8_; } + + // ---- Element access ---- + + cost& operator()(lap_row i, lap_col j) { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& row0(lap_col j) const { + return data_[j]; + } + + [[nodiscard]] const cost& entry0(lap_row i) const { + return data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* row(lap_row i) { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* row(lap_row i) const { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* col(lap_col i) { + return &t_data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* col(lap_col i) const { + return &t_data_[static_cast(i) * dim8_]; + } + + // Write a value to both the data and transposed-data arrays. + // After filling the entire matrix this way, call markTransposed() + // to skip the lazy transpose in makeTranspose(). 
+ void setWithTranspose(lap_row i, lap_col j, cost value) { + data_[static_cast(i) * dim8_ + j] = value; + t_data_[static_cast(j) * dim8_ + i] = value; + } + + void markTransposed() noexcept { transposed_ = true; } + + // ---- Transpose ---- + + void makeTranspose() noexcept { + if (transposed_) return; + + const cost* __restrict__ data_ptr = data_.data(); + cost* __restrict__ t_data_ptr = t_data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + transposed_ = true; + } + + void makeUntranspose() noexcept { + const cost* __restrict__ data_ptr = t_data_.data(); + cost* __restrict__ t_data_ptr = data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + } + + // ---- Padding ---- + + void padTrAfterCol(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + std::size_t end_index = dim_ * dim8_; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padAfterRow(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + 
std::size_t end_index = dim_ * dim8_; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + void padTrColAfterRow(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padRowAfterCol(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + // ---- Search ---- + + std::pair findColMin(lap_col j, + lap_row search_dim = -1) { + makeTranspose(); + const cost* col_data = col(j); + const lap_row n = + (search_dim < 0) ? static_cast(dim_) : search_dim; + const auto min_ptr = std::min_element(col_data, col_data + n); + return {*min_ptr, + static_cast(std::distance(col_data, min_ptr))}; + } + + std::tuple findRowSubmin( + const lap_row* i, const std::vector& v + ) const { + assert(dim_ > 1); + + const cost* __restrict__ row_i = row(*i); + const lap_col dim = static_cast(dim_); + const cost* __restrict__ v_ptr = v.data(); + + const cost h0 = row_i[0] - v_ptr[0]; + const cost h1 = row_i[1] - v_ptr[1]; + + cost min_val, submin_val; + lap_col min_idx, submin_idx; + + if (h0 > h1) { + min_val = h1; submin_val = h0; + min_idx = 1; submin_idx = 0; + } else { + min_val = h0; submin_val = h1; + min_idx = 0; submin_idx = 1; + } + + const lap_col j_limit = + (dim < 4 ? 
0 : static_cast(dim - 3)); + + for (lap_col j = 2; j < j_limit; j += 4) { + assert(BLOCK_SIZE >= 4); + const cost h0 = row_i[j] - v_ptr[j]; + const cost h1 = row_i[j + 1] - v_ptr[j + 1]; + const cost h2 = row_i[j + 2] - v_ptr[j + 2]; + const cost h3 = row_i[j + 3] - v_ptr[j + 3]; + if (h0 < submin_val) { + if (h0 < min_val) { + submin_val = min_val; min_val = h0; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h0; submin_idx = j; } + } + if (h1 < submin_val) { + if (h1 < min_val) { + submin_val = min_val; min_val = h1; + submin_idx = min_idx; min_idx = j + 1; + } else { submin_val = h1; submin_idx = j + 1; } + } + if (h2 < submin_val) { + if (h2 < min_val) { + submin_val = min_val; min_val = h2; + submin_idx = min_idx; min_idx = j + 2; + } else { submin_val = h2; submin_idx = j + 2; } + } + if (h3 < submin_val) { + if (h3 < min_val) { + submin_val = min_val; min_val = h3; + submin_idx = min_idx; min_idx = j + 3; + } else { submin_val = h3; submin_idx = j + 3; } + } + } + for (lap_col j = j_limit; j < dim; ++j) { + const cost h = row_i[j] - v_ptr[j]; + if (h < submin_val) { + if (h < min_val) { + submin_val = min_val; min_val = h; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h; submin_idx = j; } + } + } + return {min_val, submin_val, min_idx, submin_idx}; + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_COST_MATRIX_H_ diff --git a/inst/include/TreeDist/lap.h b/inst/include/TreeDist/lap.h new file mode 100644 index 00000000..673cfa22 --- /dev/null +++ b/inst/include/TreeDist/lap.h @@ -0,0 +1,35 @@ +#ifndef TREEDIST_LAP_H_ +#define TREEDIST_LAP_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant declarations. +// +// Provides the lap() function signature. The implementation lives in +// lap_impl.h, guarded by TREEDIST_LAP_IMPLEMENTATION. Downstream +// packages should include lap_impl.h in exactly one translation unit +// with the guard defined. 
+ +#include "lap_scratch.h" + +namespace TreeDist { + + // Primary overload: caller supplies pre-allocated scratch. + extern cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt, + LapScratch& scratch); + + // Convenience overload: creates a temporary scratch. + inline cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt = true) { + LapScratch scratch; + return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); + } + +} // namespace TreeDist + +#endif // TREEDIST_LAP_H_ diff --git a/inst/include/TreeDist/lap_impl.h b/inst/include/TreeDist/lap_impl.h new file mode 100644 index 00000000..196c37f6 --- /dev/null +++ b/inst/include/TreeDist/lap_impl.h @@ -0,0 +1,254 @@ +#ifndef TREEDIST_LAP_IMPL_H_ +#define TREEDIST_LAP_IMPL_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant implementation. +// +// Guard this with #define TREEDIST_LAP_IMPLEMENTATION before including. +// Include in exactly one translation unit per package. +// +// Interrupt handling: define TREEDIST_CHECK_INTERRUPT() before including +// to enable user-interrupt checks (e.g., Rcpp::checkUserInterrupt()). +// If not defined, interrupt checks are silently skipped. +// +// Original algorithm: +// R. Jonker and A. Volgenant, "A Shortest Augmenting Path Algorithm +// for Dense and Sparse Linear Assignment Problems," Computing 38, +// 325-340, 1987. + +#ifdef TREEDIST_LAP_IMPLEMENTATION + +#include "lap.h" + +#ifndef TREEDIST_CHECK_INTERRUPT +#define TREEDIST_CHECK_INTERRUPT() ((void)0) +#define TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +namespace TreeDist { + +namespace detail { + inline bool nontrivially_less_than(cost a, cost b) noexcept { + return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; + } +} // namespace detail + +// The LAP hot loops are sensitive to instruction alignment — even unrelated +// changes to other code in the same translation unit can shift layout enough +// to cause measurable regressions. Force 64-byte function alignment and +// 16-byte loop alignment to stabilise performance across builds. +#if defined(__GNUC__) && !defined(__clang__) +__attribute__((optimize("align-functions=64", "align-loops=16"))) +#endif +cost lap(const lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + const bool allow_interrupt, + LapScratch& scratch) +{ + lap_row num_free = 0; + scratch.ensure(dim); + auto& v = scratch.v; + auto& matches = scratch.matches; + std::fill(matches.begin(), matches.begin() + dim, 0); + const cost* __restrict__ v_ptr = v.data(); + + // COLUMN REDUCTION + for (lap_col j = dim; j--; ) { + const auto [min, imin] = input_cost.findColMin(j); + v[j] = min; + ++matches[imin]; + + if (matches[imin] == 1) { + rowsol[imin] = j; + colsol[j] = imin; + } else if (v_ptr[j] < v_ptr[rowsol[imin]]) { + const lap_col j1 = rowsol[imin]; + rowsol[imin] = j; + colsol[j] = imin; + colsol[j1] = -1; + } else { + colsol[j] = -1; + } + } + + // REDUCTION TRANSFER + auto& freeunassigned = scratch.freeunassigned; + + for (lap_row i = 0; i < dim; ++i) { + if (matches[i] == 0) { + freeunassigned[num_free++] = i; + } else if (matches[i] == 1) { + const lap_col j1 = rowsol[i]; + const cost* row_i = input_cost.row(i); + cost min_cost; + if (j1 == 0) { + min_cost = row_i[1] - v_ptr[1]; + for (lap_col j = 2; j < dim; ++j) { + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } else { + min_cost = row_i[0] - v_ptr[0]; + for (lap_col j = 1; j < dim; ++j) { + if (j == j1) continue; + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } + v[j1] -= min_cost; + } + } + + // AUGMENTING ROW 
REDUCTION + auto& col_list = scratch.col_list; + int loopcnt = 0; + + do { + ++loopcnt; + + lap_row previous_num_free = num_free; + num_free = 0; + lap_row k = 0; + while (k < previous_num_free) { + const lap_row i = freeunassigned[k++]; + const auto [umin, usubmin, min_idx, j2] = + input_cost.findRowSubmin(&i, v); + lap_col j1 = min_idx; + + lap_row i0 = colsol[j1]; + const bool strictly_less = + detail::nontrivially_less_than(umin, usubmin); + if (strictly_less) { + v[j1] -= (usubmin - umin); + } else if (i0 > -1) { + j1 = j2; + i0 = colsol[j2]; + } + + rowsol[i] = j1; + colsol[j1] = i; + + if (i0 > -1) { + if (strictly_less) { + freeunassigned[--k] = i0; + if (allow_interrupt) TREEDIST_CHECK_INTERRUPT(); + } else { + freeunassigned[num_free++] = i0; + } + } + } + } while (loopcnt < 2); + + // AUGMENT SOLUTION for each free row. + auto& d = scratch.d; + auto& predecessor = scratch.predecessor; + + for (lap_row f = 0; f < num_free; ++f) { + bool unassignedfound = false; + lap_row free_row = freeunassigned[f]; + const cost* free_row_cost = input_cost.row(free_row); + lap_col endofpath = 0; + lap_col last = 0; + lap_row i; + lap_col j1; + + for (lap_col j = 0; j < dim; ++j) { + d[j] = free_row_cost[j] - v_ptr[j]; + predecessor[j] = free_row; + col_list[j] = j; + } + + cost min = 0; + lap_col low = 0; + lap_col up = 0; + + do { + if (up == low) { + last = low - 1; + min = d[col_list[up++]]; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = col_list[k]; + const cost h = d[j]; + if (h <= min) { + if (h < min) { + up = low; + min = h; + } + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + for (lap_dim k = low; k < up; ++k) { + if (colsol[col_list[k]] < 0) { + endofpath = col_list[k]; + unassignedfound = true; + break; + } + } + } + + if (!unassignedfound) { + j1 = col_list[low++]; + i = colsol[j1]; + const cost* row_i = input_cost.row(i); + const cost h = row_i[j1] - v_ptr[j1] - min; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = 
col_list[k]; + cost v2 = row_i[j] - v_ptr[j] - h; + if (v2 < d[j]) { + predecessor[j] = i; + if (v2 == min) { + if (colsol[j] < 0) { + endofpath = j; + unassignedfound = true; + break; + } else { + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + d[j] = v2; + } + } + } + } while (!unassignedfound); + + for (lap_dim k = 0; k <= last; ++k) { + j1 = col_list[k]; + v[j1] += d[j1] - min; + } + + do { + i = predecessor[endofpath]; + colsol[endofpath] = i; + j1 = endofpath; + endofpath = rowsol[i]; + rowsol[i] = j1; + } while (i != free_row); + } + + // Calculate optimal cost. + cost lapcost = 0; + for (lap_dim i = 0; i < dim; ++i) { + lapcost += input_cost(i, rowsol[i]); + } + + return lapcost; +} + +} // namespace TreeDist + +#ifdef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#undef TREEDIST_CHECK_INTERRUPT +#undef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +#endif // TREEDIST_LAP_IMPLEMENTATION +#endif // TREEDIST_LAP_IMPL_H_ diff --git a/inst/include/TreeDist/lap_scratch.h b/inst/include/TreeDist/lap_scratch.h new file mode 100644 index 00000000..4a9ca43b --- /dev/null +++ b/inst/include/TreeDist/lap_scratch.h @@ -0,0 +1,44 @@ +#ifndef TREEDIST_LAP_SCRATCH_H_ +#define TREEDIST_LAP_SCRATCH_H_ + +// Reusable heap storage for one thread's LAPJV workspace. +// +// Pass to the scratch-taking overload of lap() to eliminate per-call +// std::vector allocations. In a serial context construct once before the +// loop; in an OpenMP context allocate one per thread and index by +// omp_get_thread_num(). All vectors are grown lazily; never shrunk. 
+ +#include "cost_matrix.h" + +namespace TreeDist { + +struct LapScratch { + std::vector v; + std::vector matches; + std::vector freeunassigned; + std::vector col_list; + std::vector d; + std::vector predecessor; + std::vector rowsol; + std::vector colsol; + CostMatrix score_pool; + CostMatrix small_pool; + + void ensure(int dim) noexcept { + const int padded = static_cast( + ((static_cast(dim) + BLOCK_SIZE - 1) + / BLOCK_SIZE) * BLOCK_SIZE); + if (static_cast(v.size()) < padded) v.resize(padded); + if (static_cast(matches.size()) < dim) matches.resize(dim); + if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); + if (static_cast(col_list.size()) < dim) col_list.resize(dim); + if (static_cast(d.size()) < dim) d.resize(dim); + if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); + if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); + if (static_cast(colsol.size()) < dim) colsol.resize(dim); + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_LAP_SCRATCH_H_ diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h new file mode 100644 index 00000000..437633f0 --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering.h @@ -0,0 +1,87 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_H_ +#define TREEDIST_MUTUAL_CLUSTERING_H_ + +// Mutual Clustering Information (MCI) — public API. +// +// Provides the core MCI score computation between two split sets, +// clustering entropy, and lookup-table initialization. +// +// Implementation lives in mutual_clustering_impl.h, guarded by +// TREEDIST_MCI_IMPLEMENTATION. + +#include "lap.h" +#include // splitbit, SL_MAX_TIPS, count_bits +#include + +namespace TreeDist { + + // ---- Lookup tables (populated by init_lg2_tables) ---- + // + // lg2[i] = log2(i) for 0 <= i <= SL_MAX_TIPS + // lg2_double_factorial[i] = log2(i!!) for 0 <= i < 2*SL_MAX_TIPS - 2 + // lg2_unrooted[i] = log2((2i-5)!!) 
for i >= 3 + // lg2_rooted = &lg2_unrooted[0] + 1 (so lg2_rooted[i] = lg2_unrooted[i+1]) + // + // These are defined in mutual_clustering_impl.h. + + extern double lg2[SL_MAX_TIPS + 1]; + extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; + extern double lg2_unrooted[SL_MAX_TIPS + 2]; + extern double* lg2_rooted; + + // Populate lookup tables. Must be called once before any MCI + // computation. max_tips should be >= the largest tree size used. + void init_lg2_tables(int max_tips); + + // ---- Inline helpers ---- + + // Information content of a perfectly-matching split pair. + // ic_matching(a, b, n) = (a + b) * lg2[n] - a * lg2[a] - b * lg2[b] + [[nodiscard]] inline double ic_matching(int16 a, int16 b, + int16 n) noexcept { + const double lg2a = lg2[a]; + const double lg2b = lg2[b]; + const double lg2n = lg2[n]; + return (a + b) * lg2n - a * lg2a - b * lg2b; + } + + // Clustering entropy of a split set. + // Computes H_clust = -sum_i [ p_i * log2(p_i) + q_i * log2(q_i) ] + // where p_i = in_split[i] / n_tips and q_i = 1 - p_i. + [[nodiscard]] inline double clustering_entropy( + const int* in_split, int n_splits, int n_tips) { + if (n_tips <= 1 || n_splits == 0) return 0.0; + const double lg2n = std::log2(static_cast(n_tips)); + double ce = 0.0; + for (int i = 0; i < n_splits; ++i) { + const int a = in_split[i]; + const int b = n_tips - a; + if (a > 1 && b > 1) { + ce += (a * std::log2(static_cast(a)) + + b * std::log2(static_cast(b))) + / n_tips - lg2n; + } + } + return -ce; + } + + // ---- MCI score (declaration only) ---- + // + // Computes the Mutual Clustering Information between two split sets. + // Uses sort+merge exact-match detection + LAP on the reduced matrix. + // + // a_state[i] points to n_bins splitbit words for split i of tree A. + // a_in[i] = popcount of split i (tips in the "1" partition). + // Returns the MCI score (higher = more similar). + // + // Implementation in mutual_clustering_impl.h. 
+ + double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch); + +} // namespace TreeDist + +#endif // TREEDIST_MUTUAL_CLUSTERING_H_ diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h new file mode 100644 index 00000000..7f98c8fb --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -0,0 +1,290 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ +#define TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ + +// Mutual Clustering Information — implementation. +// +// Guard with #define TREEDIST_MCI_IMPLEMENTATION before including. +// Include in exactly one translation unit per package (the same TU +// that defines TREEDIST_LAP_IMPLEMENTATION). +// +// Usage: +// #define TREEDIST_LAP_IMPLEMENTATION +// #define TREEDIST_MCI_IMPLEMENTATION +// #include +// #include + +#ifdef TREEDIST_MCI_IMPLEMENTATION + +#include "mutual_clustering.h" + +#include +#include +#include +#include + +namespace TreeDist { + +// ---- Table definitions ---- + +double lg2[SL_MAX_TIPS + 1]; +double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; +double lg2_unrooted[SL_MAX_TIPS + 2]; +double* lg2_rooted = &lg2_unrooted[0] + 1; + +void init_lg2_tables(int max_tips) { + lg2[0] = 0; + const int lg2_limit = std::min(max_tips + 1, + static_cast(SL_MAX_TIPS + 1)); + for (int i = 1; i < lg2_limit; ++i) { + lg2[i] = std::log2(static_cast(i)); + } + + for (int i = 0; i < 3; ++i) { + lg2_double_factorial[i] = 0; + lg2_unrooted[i] = 0; + } + + const int df_limit = std::min(2 * max_tips, + static_cast(SL_MAX_TIPS + SL_MAX_TIPS - 2)); + for (int i = 2; i < df_limit; ++i) { + lg2_double_factorial[i] = + lg2_double_factorial[i - 2] + std::log2(static_cast(i)); + } + + const int ur_limit = std::min(max_tips + 2, + static_cast(SL_MAX_TIPS + 2)); + for (int i = 3; i < ur_limit; ++i) { + 
lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; + } +} + +// ---- Sort+merge exact-match detection (internal) ---- +// +// Canonicalise each split so bit 0 is always set (flip complement if not), +// sort by canonical form, merge-scan. O(n log n). +// +// Writes a_match[ai] = bi+1 if matched, else 0. Likewise b_match. +// Returns number of exact matches. + +namespace detail { + +static int16 find_exact_matches_raw( + const splitbit* const* a_state, const int16* /*a_in*/, int16 a_n, + const splitbit* const* b_state, const int16* /*b_in*/, int16 b_n, + int16 n_bins, int32 n_tips, + int16* a_match, int16* b_match) +{ + std::fill(a_match, a_match + a_n, int16(0)); + std::fill(b_match, b_match + b_n, int16(0)); + if (a_n == 0 || b_n == 0) return 0; + + const int16 last_bin = n_bins - 1; + const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) + ? ~splitbit(0) + : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; + + // Flat buffers for canonical forms + std::vector a_canon(static_cast(a_n) * n_bins); + std::vector b_canon(static_cast(b_n) * n_bins); + + for (int16 i = 0; i < a_n; ++i) { + const bool flip = !(a_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? ~a_state[i][bin] : a_state[i][bin]; + if (bin == last_bin) val &= last_mask; + a_canon[i * n_bins + bin] = val; + } + } + for (int16 i = 0; i < b_n; ++i) { + const bool flip = !(b_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? 
~b_state[i][bin] : b_state[i][bin]; + if (bin == last_bin) val &= last_mask; + b_canon[i * n_bins + bin] = val; + } + } + + // Sort index arrays by canonical form + auto canon_less = [&](const splitbit* canon, int16 n_b, int16 i, int16 j) { + for (int16 bin = 0; bin < n_b; ++bin) { + const splitbit vi = canon[i * n_b + bin]; + const splitbit vj = canon[j * n_b + bin]; + if (vi < vj) return true; + if (vi > vj) return false; + } + return false; + }; + + std::vector a_order(a_n), b_order(b_n); + std::iota(a_order.begin(), a_order.end(), int16(0)); + std::iota(b_order.begin(), b_order.end(), int16(0)); + + std::sort(a_order.begin(), a_order.end(), + [&](int16 i, int16 j) { + return canon_less(a_canon.data(), n_bins, i, j); + }); + std::sort(b_order.begin(), b_order.end(), + [&](int16 i, int16 j) { + return canon_less(b_canon.data(), n_bins, i, j); + }); + + // Merge-scan + int16 exact_n = 0; + int16 ai_pos = 0, bi_pos = 0; + while (ai_pos < a_n && bi_pos < b_n) { + const int16 ai = a_order[ai_pos]; + const int16 bi = b_order[bi_pos]; + + int cmp = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + const splitbit va = a_canon[ai * n_bins + bin]; + const splitbit vb = b_canon[bi * n_bins + bin]; + if (va < vb) { cmp = -1; break; } + if (va > vb) { cmp = 1; break; } + } + + if (cmp < 0) { + ++ai_pos; + } else if (cmp > 0) { + ++bi_pos; + } else { + a_match[ai] = bi + 1; + b_match[bi] = ai + 1; + ++exact_n; + ++ai_pos; + ++bi_pos; + } + } + + return exact_n; +} + +} // namespace detail + + +// ---- MCI score implementation ---- + +double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch) +{ + if (a_n_splits == 0 || b_n_splits == 0 || n_tips == 0) return 0.0; + + const int16 most_splits = std::max(a_n_splits, b_n_splits); + const double n_tips_rcp = 1.0 / static_cast(n_tips); + + constexpr cost max_score = 
BIG; + constexpr double over_max = 1.0 / static_cast(BIG); + const double max_over_tips = static_cast(BIG) * n_tips_rcp; + const double lg2_n = lg2[n_tips]; + + // --- Phase 1: O(n log n) exact-match detection --- + std::vector a_match_buf(a_n_splits); + std::vector b_match_buf(b_n_splits); + + const int16 exact_n = detail::find_exact_matches_raw( + a_state, a_in, a_n_splits, + b_state, b_in, b_n_splits, + n_bins, n_tips, + a_match_buf.data(), b_match_buf.data()); + + const int16* a_match = a_match_buf.data(); + const int16* b_match = b_match_buf.data(); + + // Accumulate exact-match score + double exact_score = 0.0; + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (a_match[ai]) { + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + exact_score += ic_matching(na, nA, static_cast(n_tips)); + } + } + + // Early exit when everything matched exactly + if (exact_n == b_n_splits || exact_n == a_n_splits) { + return exact_score * n_tips_rcp; + } + + // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- + const int16 lap_n = most_splits - exact_n; + + std::vector a_unmatch, b_unmatch; + a_unmatch.reserve(lap_n); + b_unmatch.reserve(lap_n); + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (!a_match[ai]) a_unmatch.push_back(ai); + } + for (int16 bi = 0; bi < b_n_splits; ++bi) { + if (!b_match[bi]) b_unmatch.push_back(bi); + } + + scratch.score_pool.resize(lap_n); + CostMatrix& score = scratch.score_pool; + + const int16 a_unmatched_n = static_cast(a_unmatch.size()); + const int16 b_unmatched_n = static_cast(b_unmatch.size()); + + for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const int16 ai = a_unmatch[a_pos]; + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + const splitbit* a_row = a_state[ai]; + + const double offset_a = lg2_n - lg2[na]; + const double offset_A = lg2_n - lg2[nA]; + + for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const int16 bi = b_unmatch[b_pos]; + const splitbit* 
b_row = b_state[bi]; + int16 a_and_b = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + a_and_b += TreeTools::count_bits(a_row[bin] & b_row[bin]); + } + + const int16 nb = b_in[bi]; + const int16 nB = static_cast(n_tips - nb); + const int16 a_and_B = na - a_and_b; + const int16 A_and_b = nb - a_and_b; + const int16 A_and_B = nA - A_and_b; + + if (a_and_b == A_and_b && a_and_b == a_and_B + && a_and_b == A_and_B) { + score(a_pos, b_pos) = max_score; + } else { + const double lg2_nb = lg2[nb]; + const double lg2_nB = lg2[nB]; + const double ic_sum = + a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + + a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + + A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + + A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + score(a_pos, b_pos) = + max_score - static_cast(ic_sum * max_over_tips); + } + } + if (b_unmatched_n < lap_n) { + score.padRowAfterCol(a_pos, b_unmatched_n, max_score); + } + } + if (a_unmatched_n < lap_n) { + score.padAfterRow(a_unmatched_n, max_score); + } + + // --- Phase 3: solve LAP on the reduced matrix --- + scratch.ensure(lap_n); + auto& rowsol = scratch.rowsol; + auto& colsol = scratch.colsol; + + const double lap_score = + static_cast((max_score * lap_n) - + lap(lap_n, score, rowsol, colsol, false, scratch)) * over_max; + return lap_score + exact_score * n_tips_rcp; +} + +} // namespace TreeDist + +#endif // TREEDIST_MCI_IMPLEMENTATION +#endif // TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ diff --git a/inst/include/TreeDist/types.h b/inst/include/TreeDist/types.h new file mode 100644 index 00000000..c953989b --- /dev/null +++ b/inst/include/TreeDist/types.h @@ -0,0 +1,32 @@ +#ifndef TREEDIST_TYPES_H_ +#define TREEDIST_TYPES_H_ + +// TreeDist public type definitions. +// +// Rcpp-free: uses only standard headers plus TreeTools (for SL_MAX_SPLITS). 
+// Downstream packages include via: #include + +#include +#include +#include // SL_MAX_SPLITS, splitbit + +namespace TreeDist { + + using int16 = int_fast16_t; + using int32 = int_fast32_t; + using cost = int_fast64_t; + + using lap_dim = int; + using lap_row = lap_dim; + using lap_col = lap_dim; + + constexpr cost BIG = + (std::numeric_limits::max)() / SL_MAX_SPLITS; + + constexpr cost ROUND_PRECISION = 2048 * 2048; + + constexpr std::size_t BLOCK_SIZE = 16; + +} // namespace TreeDist + +#endif // TREEDIST_TYPES_H_ diff --git a/man/HierarchicalMutualInfo.Rd b/man/HierarchicalMutualInfo.Rd index 9e4788d7..193b18ba 100644 --- a/man/HierarchicalMutualInfo.Rd +++ b/man/HierarchicalMutualInfo.Rd @@ -134,6 +134,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \concept{tree distances} diff --git a/man/JaccardRobinsonFoulds.Rd b/man/JaccardRobinsonFoulds.Rd index 5df32914..7e515be7 100644 --- a/man/JaccardRobinsonFoulds.Rd +++ b/man/JaccardRobinsonFoulds.Rd @@ -136,6 +136,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/KendallColijn.Rd b/man/KendallColijn.Rd index cfd506b5..57d01772 100644 --- a/man/KendallColijn.Rd +++ b/man/KendallColijn.Rd @@ -135,6 +135,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/MASTSize.Rd b/man/MASTSize.Rd index 5ecdd271..db028b0a 100644 --- a/man/MASTSize.Rd +++ b/man/MASTSize.Rd @@ -72,6 +72,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/MatchingSplitDistance.Rd 
b/man/MatchingSplitDistance.Rd index 83af446f..b5d64b27 100644 --- a/man/MatchingSplitDistance.Rd +++ b/man/MatchingSplitDistance.Rd @@ -94,6 +94,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/NNIDist.Rd b/man/NNIDist.Rd index 70895e04..5e24301b 100644 --- a/man/NNIDist.Rd +++ b/man/NNIDist.Rd @@ -111,6 +111,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/NyeSimilarity.Rd b/man/NyeSimilarity.Rd index 892ed185..d984b129 100644 --- a/man/NyeSimilarity.Rd +++ b/man/NyeSimilarity.Rd @@ -132,6 +132,7 @@ Other tree distances: \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/PathDist.Rd b/man/PathDist.Rd index 5cbfe90c..1192d4ce 100644 --- a/man/PathDist.Rd +++ b/man/PathDist.Rd @@ -78,6 +78,7 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{Robinson-Foulds}}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/Robinson-Foulds.Rd b/man/Robinson-Foulds.Rd index a425e8d1..7510e094 100644 --- a/man/Robinson-Foulds.Rd +++ b/man/Robinson-Foulds.Rd @@ -154,6 +154,7 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{PathDist}()}, \code{\link{SPRDist}()}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git a/man/SPRDist.Rd b/man/SPRDist.Rd index 42fc9b2c..5075abda 100644 --- a/man/SPRDist.Rd +++ b/man/SPRDist.Rd @@ -84,6 +84,7 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, +\code{\link{TransferDist}()}, \code{\link{TreeDistance}()} } \author{ diff --git 
a/man/TransferConsensus.Rd b/man/TransferConsensus.Rd new file mode 100644 index 00000000..afd610b7 --- /dev/null +++ b/man/TransferConsensus.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/transfer_consensus.R +\name{TransferConsensus} +\alias{TransferConsensus} +\title{Consensus tree minimizing transfer distance} +\usage{ +TransferConsensus( + trees, + scale = TRUE, + greedy = c("best", "first"), + init = c("empty", "majority") +) +} +\arguments{ +\item{trees}{An object of class \code{multiPhylo}: the input trees. +All trees must share the same tip labels.} + +\item{scale}{Logical; if \code{TRUE} (default), use the scaled transfer +distance (normalized by light-side size minus one). If \code{FALSE}, use +the unscaled (raw Hamming) transfer distance.} + +\item{greedy}{Character string specifying the greedy strategy: +\code{"best"} (default) picks the single highest-benefit action at each step; +\code{"first"} picks the first improving action encountered (faster but +potentially lower quality).} + +\item{init}{Character string specifying the initial consensus: +\code{"empty"} (default) starts with no splits (purely additive); +\code{"majority"} starts with the majority-rule consensus and refines.} +} +\value{ +A tree of class \code{phylo}. +} +\description{ +Construct a consensus tree that minimizes the sum of transfer distances +to a set of input trees, using a greedy add-and-prune heuristic. +} +\details{ +Unlike the majority-rule consensus, which minimizes Robinson-Foulds +distance and can be highly unresolved when phylogenetic signal is low, +\code{TransferConsensus()} uses the finer-grained transfer distance +\insertCite{Lemoine2018}{TreeDist} to construct a more resolved consensus +tree. + +The algorithm pools all splits observed across input trees, computes +pairwise transfer distances between them, and greedily adds or removes +splits to minimize total transfer dissimilarity cost. 
The approach +follows \insertCite{Takazawa2026;textual}{TreeDist}, reimplemented for +'TreeDist' infrastructure. +} +\examples{ +library(TreeTools) +# Generate bootstrap-like trees +trees <- as.phylo(1:20, nTip = 12) + +# Transfer consensus (more resolved than majority rule) +tc <- TransferConsensus(trees) +plot(tc) + +# Compare resolution +mr <- Consensus(trees, p = 0.5) +cat("Majority-rule splits:", NSplits(mr), "\n") +cat("Transfer consensus splits:", NSplits(tc), "\n") + +} +\references{ +\insertAllCited{} +} diff --git a/man/TransferDist.Rd b/man/TransferDist.Rd new file mode 100644 index 00000000..2439130f --- /dev/null +++ b/man/TransferDist.Rd @@ -0,0 +1,118 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_distance_transfer.R +\name{TransferDist} +\alias{TransferDist} +\alias{TransferDistance} +\alias{TransferDistSplits} +\title{Transfer dissimilarity between phylogenetic trees} +\usage{ +TransferDist( + tree1, + tree2 = NULL, + scale = TRUE, + normalize = FALSE, + reportMatching = FALSE +) + +TransferDistance( + tree1, + tree2 = NULL, + scale = TRUE, + normalize = FALSE, + reportMatching = FALSE +) + +TransferDistSplits( + splits1, + splits2, + nTip = attr(splits1, "nTip"), + scale = TRUE, + reportMatching = FALSE +) +} +\arguments{ +\item{tree1, tree2}{Trees of class \code{phylo}, with leaves labelled identically, +or lists of such trees to undergo pairwise comparison. Where implemented, +\code{tree2 = NULL} will compute distances between each pair of trees in the list +\code{tree1} using a fast algorithm based on +\insertCite{Day1985;textual}{TreeDist}.} + +\item{scale}{Logical; if \code{TRUE} (default), use the scaled transfer +dissimilarity. If \code{FALSE}, use the unscaled transfer dissimilarity.} + +\item{normalize}{If a numeric value is provided, this will be used as a +maximum value against which to rescale results. 
+If \code{TRUE}, results will be rescaled against a maximum value calculated from +the specified tree sizes and topology, as specified in the "Normalization" +section below. +If \code{FALSE}, results will not be rescaled.} + +\item{reportMatching}{Logical specifying whether to return the clade +matchings as an attribute of the score.} + +\item{splits1, splits2}{Logical matrices where each row corresponds to a leaf, +either listed in the same order or bearing identical names (in any sequence), +and each column corresponds to a split, such that each leaf is identified as +a member of the ingroup (\code{TRUE}) or outgroup (\code{FALSE}) of the respective +split.} + +\item{nTip}{(Optional) Integer specifying the number of leaves in each split.} +} +\value{ +\code{TransferDist()} returns an object of class \code{dist} (if \code{tree2} is +\code{NULL}), a numeric matrix (if both \code{tree1} and \code{tree2} are lists), or a +numeric value (for a single pair). If \code{reportMatching = TRUE}, the +return value carries \code{matching} and \code{pairScores} attributes. +} +\description{ +Compute the transfer dissimilarity between phylogenetic trees, as defined +by \insertCite{Takazawa2026;textual}{TreeDist}. +The transfer dissimilarity uses the transfer distance +\insertCite{Lemoine2018}{TreeDist} to compare bipartitions, providing a +finer-grained measure than the Robinson--Foulds distance. Each branch in +each tree is scored by how many taxa must be moved to match its closest +counterpart in the other tree, and these scores are summed. +} +\details{ +The \code{scaled} variant divides each branch's contribution by its depth minus +one, giving equal weight to all branches regardless of their depth (analogous +to the Robinson--Foulds distance). The \code{unscaled} variant uses raw transfer +distances, giving more weight to deep branches. Neither variant satisfies +the triangle inequality for trees with six or more tips. 
+} +\section{Normalization}{ + + +When \code{normalize = TRUE}, the scaled transfer dissimilarity is divided by +\code{2 * (n - 3)}, placing it in the range [0, 1]. The unscaled version is +divided by the maximum possible unscaled dissimilarity +(following \insertCite{Takazawa2026;textual}{TreeDist}). +} + +\examples{ +library(TreeTools) +TransferDist(BalancedTree(8), PectinateTree(8)) +TransferDist(BalancedTree(8), PectinateTree(8), scale = FALSE) + +# All-pairs +TransferDist(as.phylo(0:5, 8)) + +} +\references{ +\insertAllCited{} +} +\seealso{ +Other tree distances: +\code{\link{HierarchicalMutualInfo}()}, +\code{\link{JaccardRobinsonFoulds}()}, +\code{\link{KendallColijn}()}, +\code{\link{MASTSize}()}, +\code{\link{MatchingSplitDistance}()}, +\code{\link{NNIDist}()}, +\code{\link{NyeSimilarity}()}, +\code{\link{PathDist}()}, +\code{\link{Robinson-Foulds}}, +\code{\link{SPRDist}()}, +\code{\link{TreeDistance}()} +} +\concept{tree distances} diff --git a/man/TreeDistance.Rd b/man/TreeDistance.Rd index 352d169c..f081b6d3 100644 --- a/man/TreeDistance.Rd +++ b/man/TreeDistance.Rd @@ -328,7 +328,8 @@ Other tree distances: \code{\link{NyeSimilarity}()}, \code{\link{PathDist}()}, \code{\link{Robinson-Foulds}}, -\code{\link{SPRDist}()} +\code{\link{SPRDist}()}, +\code{\link{TransferDist}()} } \author{ \href{https://orcid.org/0000-0001-5660-1727}{Martin R. 
Smith} diff --git a/man/cpp_transfer_consensus.Rd b/man/cpp_transfer_consensus.Rd new file mode 100644 index 00000000..a8ebc915 --- /dev/null +++ b/man/cpp_transfer_consensus.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_consensus} +\alias{cpp_transfer_consensus} +\title{Transfer consensus (C++ implementation)} +\usage{ +cpp_transfer_consensus( + splits_list, + n_tip, + scale, + greedy_best_flag, + init_majority, + n_threads = 1L +) +} +\arguments{ +\item{splits_list}{List of raw matrices (one per tree), each from as.Splits().} + +\item{n_tip}{Number of tips.} + +\item{scale}{Logical: use scaled transfer distance?} + +\item{greedy_best_flag}{Logical: TRUE for "best", FALSE for "first".} + +\item{init_majority}{Logical: TRUE to start from majority-rule splits.} +} +\value{ +A \code{LogicalVector} of length n_splits indicating which pooled splits +are included in the consensus, plus attributes "raw_splits" (a raw matrix +of all unique splits) and "light_side" (integer vector). 
+} +\description{ +Transfer consensus (C++ implementation) +} +\keyword{internal} diff --git a/man/cpp_transfer_dist.Rd b/man/cpp_transfer_dist.Rd new file mode 100644 index 00000000..204f1a01 --- /dev/null +++ b/man/cpp_transfer_dist.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_dist} +\alias{cpp_transfer_dist} +\title{Per-pair transfer dissimilarity} +\usage{ +cpp_transfer_dist(x, y, nTip) +} +\arguments{ +\item{x, y}{Raw matrices representing splits (from as.Splits()).} + +\item{nTip}{Integer: number of tips.} +} +\value{ +A list with components: +\itemize{ +\item score_scaled: scaled transfer dissimilarity (double) +\item score_unscaled: unscaled transfer dissimilarity (double) +\item \code{matching_xy}: integer vector, best match in y for each split in x (1-based, NA if sentinel) +\item \code{matching_yx}: integer vector, best match in x for each split in y (1-based, NA if sentinel) +} +} +\description{ +Per-pair transfer dissimilarity +} +\keyword{internal} diff --git a/man/cpp_transfer_dist_all_pairs.Rd b/man/cpp_transfer_dist_all_pairs.Rd new file mode 100644 index 00000000..92af73d4 --- /dev/null +++ b/man/cpp_transfer_dist_all_pairs.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_dist_all_pairs} +\alias{cpp_transfer_dist_all_pairs} +\title{All-pairs transfer dissimilarity (OpenMP)} +\usage{ +cpp_transfer_dist_all_pairs(splits_list, n_tip, scale, n_threads) +} +\arguments{ +\item{splits_list}{List of raw matrices (one per tree).} + +\item{n_tip}{Number of tips.} + +\item{scale}{Logical: use scaled transfer dissimilarity?} + +\item{n_threads}{Number of OpenMP threads.} +} +\value{ +Numeric vector of length choose(N,2) in dist order. 
+} +\description{ +All-pairs transfer dissimilarity (OpenMP) +} +\keyword{internal} diff --git a/man/cpp_transfer_dist_cross_pairs.Rd b/man/cpp_transfer_dist_cross_pairs.Rd new file mode 100644 index 00000000..60c55106 --- /dev/null +++ b/man/cpp_transfer_dist_cross_pairs.Rd @@ -0,0 +1,24 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/RcppExports.R +\name{cpp_transfer_dist_cross_pairs} +\alias{cpp_transfer_dist_cross_pairs} +\title{Cross-pairs transfer dissimilarity (OpenMP)} +\usage{ +cpp_transfer_dist_cross_pairs(splits_a, splits_b, n_tip, scale, n_threads) +} +\arguments{ +\item{splits_a, splits_b}{Lists of raw matrices.} + +\item{n_tip}{Number of tips.} + +\item{scale}{Logical: use scaled transfer dissimilarity?} + +\item{n_threads}{Number of OpenMP threads.} +} +\value{ +Numeric matrix of dimension \code{nA} x \code{nB}. +} +\description{ +Cross-pairs transfer dissimilarity (OpenMP) +} +\keyword{internal} diff --git a/man/dot-SPRPairDeO.Rd b/man/dot-SPRPairDeO.Rd new file mode 100644 index 00000000..c846b761 --- /dev/null +++ b/man/dot-SPRPairDeO.Rd @@ -0,0 +1,22 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/tree_distance_spr.R +\name{.SPRPairDeO} +\alias{.SPRPairDeO} +\title{SPR approximation via Oliveira Martins \emph{et al.} (2008)} +\usage{ +.SPRPairDeO(tree1, tree2, check = TRUE) +} +\description{ +SPR approximation via Oliveira Martins \emph{et al.} (2008) +} +\examples{ +# de Oliveira Martins et al 2008, fig. 
7 +tree1 <- ape::read.tree(text = "((1, 2), ((a, b), (c, d)), (3, (4, (5, (6, 7)))));") +tree2 <- ape::read.tree(text = "((1, 2), 3, (4, (5, (((a, b), (c, d)), (6, 7)))));") +oPar <- par(mfrow =c(2, 1), mar = rep(0, 4)) +plot(tree1) +plot(tree2) +par(oPar) +SPRDist(tree1, tree2, method = "deO") +} +\keyword{internal} diff --git a/src/Makevars b/src/Makevars index 139b1d17..d3df534e 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/Makevars.win b/src/Makevars.win index 139b1d17..d3df534e 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index 443df8c1..72309c5b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -522,6 +522,108 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cpp_transfer_consensus +List cpp_transfer_consensus(const List& splits_list, const int n_tip, const bool scale, const bool greedy_best_flag, const bool init_majority, const int n_threads); +RcppExport SEXP _TreeDist_cpp_transfer_consensus(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP greedy_best_flagSEXP, SEXP init_majoritySEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP); + Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< const bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< const bool >::type greedy_best_flag(greedy_best_flagSEXP); + Rcpp::traits::input_parameter< const bool >::type init_majority(init_majoritySEXP); + Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = 
Rcpp::wrap(cpp_transfer_consensus(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_tc_profile +Rcpp::NumericVector cpp_tc_profile(const List& splits_list, const int n_tip, const bool scale, const bool greedy_best_flag, const bool init_majority, const int n_iter, const int n_threads); +RcppExport SEXP _TreeDist_cpp_tc_profile(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP greedy_best_flagSEXP, SEXP init_majoritySEXP, SEXP n_iterSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP); + Rcpp::traits::input_parameter< const int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< const bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< const bool >::type greedy_best_flag(greedy_best_flagSEXP); + Rcpp::traits::input_parameter< const bool >::type init_majority(init_majoritySEXP); + Rcpp::traits::input_parameter< const int >::type n_iter(n_iterSEXP); + Rcpp::traits::input_parameter< const int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_tc_profile(splits_list, n_tip, scale, greedy_best_flag, init_majority, n_iter, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist +List cpp_transfer_dist(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); +RcppExport SEXP _TreeDist_cpp_transfer_dist(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nTip(nTipSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist(x, y, nTip)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist_scored +List cpp_transfer_dist_scored(const 
RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip, bool scale); +RcppExport SEXP _TreeDist_cpp_transfer_dist_scored(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP, SEXP scaleSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< const IntegerVector& >::type nTip(nTipSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist_scored(x, y, nTip, scale)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist_all_pairs +NumericVector cpp_transfer_dist_all_pairs(const List& splits_list, int n_tip, bool scale, int n_threads); +RcppExport SEXP _TreeDist_cpp_transfer_dist_all_pairs(SEXP splits_listSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_list(splits_listSEXP); + Rcpp::traits::input_parameter< int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist_all_pairs(splits_list, n_tip, scale, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_transfer_dist_cross_pairs +NumericMatrix cpp_transfer_dist_cross_pairs(const List& splits_a, const List& splits_b, int n_tip, bool scale, int n_threads); +RcppExport SEXP _TreeDist_cpp_transfer_dist_cross_pairs(SEXP splits_aSEXP, SEXP splits_bSEXP, SEXP n_tipSEXP, SEXP scaleSEXP, SEXP n_threadsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const List& >::type splits_a(splits_aSEXP); + Rcpp::traits::input_parameter< const List& >::type splits_b(splits_bSEXP); + 
Rcpp::traits::input_parameter< int >::type n_tip(n_tipSEXP); + Rcpp::traits::input_parameter< bool >::type scale(scaleSEXP); + Rcpp::traits::input_parameter< int >::type n_threads(n_threadsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_transfer_dist_cross_pairs(splits_a, splits_b, n_tip, scale, n_threads)); + return rcpp_result_gen; +END_RCPP +} +// cpp_mci_impl_score +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, const Rcpp::RawMatrix& y, int n_tips); +RcppExport SEXP _TreeDist_cpp_mci_impl_score(SEXP xSEXP, SEXP ySEXP, SEXP n_tipsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< int >::type n_tips(n_tipsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_mci_impl_score(x, y, n_tips)); + return rcpp_result_gen; +END_RCPP +} // cpp_robinson_foulds_distance List cpp_robinson_foulds_distance(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); RcppExport SEXP _TreeDist_cpp_robinson_foulds_distance(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { @@ -658,6 +760,13 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_keep_and_reroot", (DL_FUNC) &_TreeDist_keep_and_reroot, 3}, {"_TreeDist_keep_and_reduce", (DL_FUNC) &_TreeDist_keep_and_reduce, 3}, {"_TreeDist_spr_table_7", (DL_FUNC) &_TreeDist_spr_table_7, 2}, + {"_TreeDist_cpp_transfer_consensus", (DL_FUNC) &_TreeDist_cpp_transfer_consensus, 6}, + {"_TreeDist_cpp_tc_profile", (DL_FUNC) &_TreeDist_cpp_tc_profile, 7}, + {"_TreeDist_cpp_transfer_dist", (DL_FUNC) &_TreeDist_cpp_transfer_dist, 3}, + {"_TreeDist_cpp_transfer_dist_scored", (DL_FUNC) &_TreeDist_cpp_transfer_dist_scored, 4}, + {"_TreeDist_cpp_transfer_dist_all_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_all_pairs, 4}, + {"_TreeDist_cpp_transfer_dist_cross_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_cross_pairs, 5}, + 
{"_TreeDist_cpp_mci_impl_score", (DL_FUNC) &_TreeDist_cpp_mci_impl_score, 3}, {"_TreeDist_cpp_robinson_foulds_distance", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_distance, 3}, {"_TreeDist_cpp_robinson_foulds_info", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_info, 3}, {"_TreeDist_cpp_matching_split_distance", (DL_FUNC) &_TreeDist_cpp_matching_split_distance, 3}, diff --git a/src/day_1985.cpp b/src/day_1985.cpp index 2f449aae..6773fdea 100644 --- a/src/day_1985.cpp +++ b/src/day_1985.cpp @@ -3,6 +3,8 @@ using namespace Rcpp; #include "tree_distances.h" /* includes */ #include "information.h" +#include +#include #include /* for root_on_node() */ #include /* for root_on_node() */ @@ -96,7 +98,7 @@ double calc_consensus_info(const List &trees, const LogicalVector &phylo, std::vector tables; if (std::size_t(n_trees) > tables.max_size()) { - Rcpp::stop("Not enough memory available to compute consensus of so many trees"); // LCOV_EXCL_LINE + ASSERT(false && "Not enough memory for consensus of so many trees"); // LCOV_EXCL_LINE } tables.reserve(n_trees); @@ -321,31 +323,19 @@ IntegerVector robinson_foulds_all_pairs(const List& tables) { // [[Rcpp::export]] double consensus_info(const List trees, const LogicalVector phylo, const NumericVector p) { - if (p[0] > 1 + 1e-15) { // epsilon catches floating point error - Rcpp::stop("p must be <= 1.0 in consensus_info()"); - } else if (p[0] < 0.5) { - Rcpp::stop("p must be >= 0.5 in consensus_info()"); - } + // Validated by R caller (ConsensusInfo checks p range) + ASSERT(p[0] <= 1 + 1e-15 && "p must be <= 1.0 in consensus_info()"); + ASSERT(p[0] >= 0.5 && "p must be >= 0.5 in consensus_info()"); - // First, peek at the tree size to determine allocation strategy - // We'll create a temporary ClusterTable just to check the size - try { - ClusterTable temp_table(Rcpp::List(trees(0))); - const int32 n_tip = temp_table.N(); - - if (n_tip <= ct_stack_threshold) { - // Small tree: use stack-allocated array - std::array S; - return 
calc_consensus_info(trees, phylo, p, S); - } else { - // Large tree: use heap-allocated vector - std::vector S(n_tip); - return calc_consensus_info(trees, phylo, p, S); - } - } catch(const std::exception& e) { - Rcpp::stop(e.what()); + // Peek at tree size to choose stack vs heap allocation for the work buffer + ClusterTable temp_table(Rcpp::List(trees(0))); + const int32 n_tip = temp_table.N(); + + if (n_tip <= ct_stack_threshold) { + std::array S; + return calc_consensus_info(trees, phylo, p, S); + } else { + std::vector S(n_tip); + return calc_consensus_info(trees, phylo, p, S); } - - ASSERT(false && "Unreachable code in consensus_tree"); - return 0.0; } diff --git a/src/hmi.cpp b/src/hmi.cpp index bb594a7f..bcbe7c6a 100644 --- a/src/hmi.cpp +++ b/src/hmi.cpp @@ -139,9 +139,8 @@ double hierarchical_self_info(const std::vector& nodes, size_t double HMI_xptr(SEXP ptr1, SEXP ptr2) { Rcpp::XPtr hp1(ptr1); Rcpp::XPtr hp2(ptr2); - if (hp1->nodes[hp1->root].n_tip != hp2->nodes[hp2->root].n_tip) { - Rcpp::stop("Trees must have the same number of leaves"); - } + ASSERT(hp1->nodes[hp1->root].n_tip == hp2->nodes[hp2->root].n_tip + && "Trees must have the same number of leaves"); return TreeDist::hierarchical_mutual_info(hp1->nodes, hp1->root, hp2->nodes, hp2->root).second; } @@ -170,12 +169,8 @@ Rcpp::NumericVector EHMI_xptr(SEXP hp1_ptr, SEXP hp2_ptr, double tolerance = 0.01, int minResample = 36) { - if (minResample < 2) { - Rcpp::stop("Must perform at least one resampling"); - } - if (tolerance < 1e-8) { - Rcpp::stop("Tolerance too low"); - } + ASSERT(minResample >= 2 && "Must perform at least one resampling"); + ASSERT(tolerance >= 1e-8 && "Tolerance too low"); Rcpp::XPtr hp1(hp1_ptr); Rcpp::XPtr hp2(hp2_ptr); diff --git a/src/hpart.cpp b/src/hpart.cpp index ffbfcaea..22bab059 100644 --- a/src/hpart.cpp +++ b/src/hpart.cpp @@ -126,7 +126,7 @@ size_t build_node_from_list(const RObject& node, if (Rf_isInteger(node) || Rf_isReal(node)) { const IntegerVector 
leaf_vec(node); if (leaf_vec.size() != 1) { - Rcpp::stop("List must only contain integers, not vectors of integers"); // #nocov + ASSERT(false && "List must only contain integers, not vectors of integers"); // #nocov } const int leaf_label = leaf_vec[0]; // 1-based R leaf label const size_t leaf_idx = leaf_label - 1; // 0-based label for HNode @@ -171,8 +171,9 @@ size_t build_node_from_list(const RObject& node, return my_idx; } - // Invalid node type - Rcpp::stop("Invalid node type"); // #nocov + // Invalid node type — unreachable when R caller validates input + ASSERT(false && "Invalid node type"); // #nocov + return 0; // #nocov } diff --git a/src/hpart_relabel.cpp b/src/hpart_relabel.cpp index 6f7906ef..1337c91d 100644 --- a/src/hpart_relabel.cpp +++ b/src/hpart_relabel.cpp @@ -13,7 +13,7 @@ void recompute_bitsets_postorder(TreeDist::HPart &hpart, const size_t node_idx, if (node.children.empty()) { // Leaf node if (node.leaf_count != 1) { - Rcpp::stop("Leaf node has leaf_count != 1"); // #nocov + ASSERT(node.leaf_count == 1 && "Leaf node has leaf_count != 1"); // #nocov } int new_index = mapping[node.label]; // mapping is 0-based node.label = new_index; diff --git a/src/information.h b/src/information.h index 2db2340b..cb2ce58b 100644 --- a/src/information.h +++ b/src/information.h @@ -2,7 +2,7 @@ #define _TREEDIST_INFO_H #include -#include /* for log2() */ +#include #include /* for log2() */ #include "ints.h" /* for int16 */ @@ -48,14 +48,12 @@ __attribute__((constructor)) inline double split_phylo_info(const int32 n_in, const int32 *n_tip, const double p) { - if (*n_tip > CT_MAX_LEAVES) { - Rcpp::stop("This many leaves are not yet supported."); - } + ASSERT(*n_tip <= CT_MAX_LEAVES && "This many leaves are not yet supported."); const int32 n_out = *n_tip - n_in; - assert(p > 0); - assert(p <= 1); - assert(n_in > 1); - assert(n_out > 1); + ASSERT(p > 0); + ASSERT(p <= 1); + ASSERT(n_in > 1); + ASSERT(n_out > 1); if (p == 1) { return (l2unrooted[*n_tip] - 
l2rooted[n_in] - l2rooted[n_out]); } else { @@ -74,14 +72,12 @@ inline double split_phylo_info(const int32 n_in, const int32 *n_tip, inline double split_clust_info(const int32 n_in, const int32 *n_tip, const double p) { - if (*n_tip > CT_MAX_LEAVES) { - Rcpp::stop("This many leaves are not yet supported."); - } + ASSERT(*n_tip <= CT_MAX_LEAVES && "This many leaves are not yet supported."); const int32 n_out = *n_tip - n_in; - assert(p > 0); - assert(p <= 1); - assert(n_in > 1); - assert(n_out > 1); + ASSERT(p > 0); + ASSERT(p <= 1); + ASSERT(n_in > 1); + ASSERT(n_out > 1); return -p * ( (((n_in * l2[n_in]) + (n_out * l2[n_out])) / *n_tip) - l2[*n_tip] ); diff --git a/src/ints.h b/src/ints.h index a5af8774..50fd3cc7 100644 --- a/src/ints.h +++ b/src/ints.h @@ -1,10 +1,13 @@ #ifndef _TREEDIST_INT_H #define _TREEDIST_INT_H -#include +#include -using int16 = int_fast16_t; -using int32 = int_fast32_t; +// Re-export shared types to global scope for backward compatibility. +using TreeDist::int16; +using TreeDist::int32; + +// Types used only within TreeDist's own source. using uint32 = uint_fast32_t; using grf_match = std::vector; diff --git a/src/lap.cpp b/src/lap.cpp index 4963fa9e..e77c712f 100644 --- a/src/lap.cpp +++ b/src/lap.cpp @@ -26,6 +26,15 @@ https://www.mathworks.com/matlabcentral/fileexchange/26836-lapjv-jonker-volgenant-algorithm-for-linear-assignment-problem-v3-0: * *************************************************************************/ + +// NOTE: The LAP hot loops are highly sensitive to instruction alignment and +// register allocation, which are affected by the TU's full include graph. +// Do NOT include lap_impl.h here — that header is for downstream LinkingTo +// consumers only. TreeDist's own lap() is compiled directly in this file +// to preserve the codegen context that was profiled and tuned. +// +// If the algorithm changes, update BOTH this file and lap_impl.h. 
+ #include "lap.h" #include @@ -37,11 +46,42 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { const lap_dim spare_rows = n_row - n_col; const cost max_score = cost(BIG / max_dim); const double x_max = maxX[0]; + const double scale_factor = max_score / x_max; std::vector rowsol(max_dim); std::vector colsol(max_dim); - cost_matrix input(x, x_max); + // Build cost matrix. Fill the transposed buffer first (matching R's + // column-major storage for sequential reads) then untranspose. + cost_matrix input(max_dim); + const double* __restrict__ src_data = REAL(x); + cost* __restrict__ t_ptr = input.col(0); + const std::size_t dim8 = input.dim8(); + + for (lap_col c = 0; c < n_col; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + const std::size_t s_off = static_cast(c) * n_row; + for (lap_row r = 0; r < n_row; ++r) { + t_ptr[t_off + r] = static_cast(src_data[s_off + r] * scale_factor); + } + // Pad remaining rows in this transposed column + for (lap_row r = n_row; r < max_dim; ++r) { + t_ptr[t_off + r] = max_score; + } + for (std::size_t r = max_dim; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + // Pad remaining transposed columns + for (lap_col c = n_col; c < max_dim; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + for (std::size_t r = 0; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + + // Untranspose: t_data_ -> data_ + input.makeUntranspose(); cost score = lap(max_dim, input, rowsol, colsol); @@ -60,27 +100,22 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { ); } +namespace { inline bool nontrivially_less_than(cost a, cost b) noexcept { return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; } +} // anonymous namespace /* This function is the jv shortest augmenting path algorithm to solve the assignment problem */ +namespace TreeDist { + cost lap(const lap_row dim, - cost_matrix &input_cost, + CostMatrix &input_cost, std::vector &rowsol, std::vector &colsol, const bool allow_interrupt, LapScratch &scratch) - - // input: - // dim - problem size - // input_cost - cost matrix - - // output: - // rowsol - column assigned to row in solution - // colsol - row assigned to column in solution - { lap_row num_free = 0; scratch.ensure(dim); @@ -316,3 +351,4 @@ cost lap(const lap_row dim, return lapcost; } +} // namespace TreeDist diff --git a/src/lap.h b/src/lap.h index b1a22508..592ce9b2 100644 --- a/src/lap.h +++ b/src/lap.h @@ -9,456 +9,27 @@ // because the LAP hot loops are sensitive to instruction alignment — even // unrelated inline functions in the same translation unit can shift code // layout enough to cause measurable regressions. - -#include -#include - -#include /* for numeric_limits */ -#include /* for vector */ - -#include "ints.h" - -/*************** TYPES *******************/ - -using cost = int_fast64_t; -using lap_dim = int; -using lap_row = lap_dim; -using lap_col = lap_dim; - -/* For a reason I've not determined, shrinking BIG is necessary to avoid - * an infinite loop in lap. */ -constexpr cost BIG = (std::numeric_limits::max)() / SL_MAX_SPLITS; -constexpr cost ROUND_PRECISION = 2048 * 2048; -constexpr size_t BLOCK_SIZE = 16; - -class CostMatrix { -private: - size_t dim_; // Important not to use int16, which will overflow on * - size_t dim8_; - alignas(64) std::vector data_; - alignas(64) std::vector t_data_; - bool transposed_; - - static size_t block_containing(const size_t x) { - return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; - } - -public: - // Default constructor for pooled instances (zero-size, no allocation). 
- CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} - - CostMatrix(size_t dim) - : dim_(dim), - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim8_)), - t_data_(std::vector(dim8_ * dim8_)), - transposed_(false) {} - - // Resize for reuse. Only reallocates when the new dimension exceeds the - // current buffer capacity; otherwise just updates dim_/dim8_ and marks - // the transpose as stale. New elements (if any) are value-initialised. - void resize(size_t new_dim) { - dim_ = new_dim; - dim8_ = block_containing(new_dim); - const size_t needed = dim8_ * dim8_; - if (data_.size() < needed) { - data_.resize(needed); - t_data_.resize(needed); - } - transposed_ = false; - } - - CostMatrix(const Rcpp::NumericMatrix& src, const double x_max) - : dim_((std::max(src.nrow(), src.ncol()))), // or pad here as needed - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim_)), - t_data_(std::vector(dim8_ * dim_)), - transposed_(true) - { - // Compute scale factor - const cost max_score = cost(BIG / dim_); - double scale_factor = max_score / x_max; - - const lap_row n_row = src.nrow(); - const lap_col n_col = src.ncol(); - - const double* __restrict src_data = REAL(src); - cost* __restrict dest_data = t_data_.data(); - - for (lap_col c = 0; c < n_col; ++c) { - const size_t data_c = c * dim8_; - const size_t src_c = c * n_row; - for (lap_row r = 0; r < n_row; ++r) { - // Marginally faster than std::transform - dest_data[data_c + r] = static_cast(src_data[src_c + r] * scale_factor); - } - - padTrColAfterRow(c, n_row, max_score); // padding now goes after row `c` - } - - padTrAfterCol(n_col, max_score); - - makeUntranspose(); - } - - // Reset for reuse at the same dimension — clears the transpose cache so the - // next findColMin() re-builds it from fresh data. Call between pair - // computations when reusing a thread-local CostMatrix instance. 
- void reset() noexcept { transposed_ = false; } - - [[nodiscard]] size_t dim() const noexcept { return dim_; } - - void makeTranspose() noexcept { - if (transposed_) return; - - const cost* __restrict data_ptr = data_.data(); - cost* __restrict t_data_ptr = t_data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - - transposed_ = true; - } - - void makeUntranspose() noexcept { - const cost* __restrict data_ptr = t_data_.data(); - cost* __restrict t_data_ptr = data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - } - - - // Access operator for read/write - cost& operator()(lap_row i, lap_col j) { - return data_[static_cast(i) * dim8_ + j]; - } - - // Const version for read-only access - [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { - return data_[static_cast(i) * dim8_ + j]; - } - - [[nodiscard]] const cost& row0(lap_col j) const { - return data_[j]; - } - - [[nodiscard]] const cost& entry0(lap_row i) const { - return data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* row(lap_row i) { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const 
cost* row(lap_row i) const { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* col(lap_col i) { - return &t_data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const cost* col(lap_col i) const { - return &t_data_[static_cast(i) * dim8_]; - } - - void padTrAfterCol(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padAfterRow(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - void padTrColAfterRow(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padRowAfterCol(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - std::pair findColMin(lap_col j, - lap_row search_dim = -1) { - makeTranspose(); - const cost* col_data = col(j); - const lap_row n = (search_dim < 0) ? static_cast(dim_) : search_dim; - const auto min_ptr = std::min_element(col_data, col_data + n); - return {*min_ptr, - static_cast(std::distance(col_data, min_ptr))}; - } - - // test2000 = 119 ms (!) - // Find minimum and second minimum reduced cost over columns. 
- std::tuple findRowSubmin( - const lap_row* i, const std::vector& v - ) const { - assert(dim_ > 1); - - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - const cost* __restrict v_ptr = v.data(); - - const cost h0 = row_i[0] - v_ptr[0]; - const cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 > h1) { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } else { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } - - const lap_col j_limit = (dim < 4 ? 0 : static_cast(dim - 3)); - - for (lap_col j = 2; j < j_limit; j += 4) { - assert(BLOCK_SIZE >= 4); // Unrolling loop x4 gives ~20% speedup - const cost h0 = row_i[j] - v_ptr[j]; - const cost h1 = row_i[j + 1] - v_ptr[j + 1]; - const cost h2 = row_i[j + 2] - v_ptr[j + 2]; - const cost h3 = row_i[j + 3] - v_ptr[j + 3]; - if (h0 < submin_val) { - if (h0 < min_val) { - submin_val = min_val; - min_val = h0; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h0; - submin_idx = j; - } - } - if (h1 < submin_val) { - if (h1 < min_val) { - submin_val = min_val; - min_val = h1; - submin_idx = min_idx; - min_idx = j + 1; - } else { - submin_val = h1; - submin_idx = j + 1; - } - } - if (h2 < submin_val) { - if (h2 < min_val) { - submin_val = min_val; - min_val = h2; - submin_idx = min_idx; - min_idx = j + 2; - } else { - submin_val = h2; - submin_idx = j + 2; - } - } - if (h3 < submin_val) { - if (h3 < min_val) { - submin_val = min_val; - min_val = h3; - submin_idx = min_idx; - min_idx = j + 3; - } else { - submin_val = h3; - submin_idx = j + 3; - } - } - } - for (lap_col j = j_limit; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - if (h < min_val) { - submin_val = min_val; - min_val = h; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h; - submin_idx = j; - } - } - } - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 260 ms - 
std::tuple findRowSubminNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - - if (dim < 2) { - return {row_i[0] - v_ptr[0], std::numeric_limits::max(), 0, 0}; - } - - // Initialize with first two elements - cost h0 = row_i[0] - v_ptr[0]; - cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 <= h1) { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } else { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } - - // Process remaining elements - for (lap_col j = 2; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - - if (h < min_val) { - submin_val = min_val; - submin_idx = min_idx; - min_val = h; - min_idx = j; - } else if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 370 ms (!) - std::tuple findRowSubminTwoPassNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - cost min_val = std::numeric_limits::max(); - lap_col min_idx = 0; - const lap_col dim = static_cast(dim_); - - for (lap_col j = 0; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < min_val) { - min_val = h; - min_idx = j; - } - } - - // Second pass: find subminimum - cost submin_val = std::numeric_limits::max(); - lap_col submin_idx = (min_idx == 0) ? 1 : 0; - - for (lap_col j = 0; j < dim; ++j) { - if (j != min_idx) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } -}; - -using cost_matrix = CostMatrix; - -// --------------------------------------------------------------------------- -// LapScratch — reusable heap storage for one thread's LAPJV workspace. 
// -// Pass to the scratch-taking overload of lap() to eliminate the ~6 per-call -// std::vector allocations. In a serial context construct once before the -// loop; in an OpenMP context allocate one per thread and index by -// omp_get_thread_num(). All vectors are grown lazily; never shrunk. -// --------------------------------------------------------------------------- -struct LapScratch { - std::vector v; // column dual variables (padded) - std::vector matches; // assignment-count per row - std::vector freeunassigned; // list of unassigned rows - std::vector col_list; // column scan list - std::vector d; // Dijkstra distances - std::vector predecessor; // augmenting-path predecessors - // rowsol / colsol are included so score functions that don't need the - // solution afterwards can avoid a separate allocation. - std::vector rowsol; - std::vector colsol; - // Pooled CostMatrix instances — reused across pairs to avoid per-call - // heap allocation. Grown lazily on first use, never shrunk. - CostMatrix score_pool; // main cost matrix - CostMatrix small_pool; // reduced matrix (exact-match path) - - void ensure(int dim) noexcept { - const int padded = static_cast( - ((static_cast(dim) + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); - if (static_cast(v.size()) < padded) v.resize(padded); - if (static_cast(matches.size()) < dim) matches.resize(dim); - if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); - if (static_cast(col_list.size()) < dim) col_list.resize(dim); - if (static_cast(d.size()) < dim) d.resize(dim); - if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); - if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); - if (static_cast(colsol.size()) < dim) colsol.resize(dim); - } -}; - -/*************** FUNCTIONS *******************/ - -// Primary overload: caller supplies pre-allocated scratch (zero alloc in hot loop). 
-extern cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt, - LapScratch &scratch); - -// Convenience overload: creates a temporary scratch (for one-off calls). -inline cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt = true) { - LapScratch scratch; - return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); -} +// Public types, CostMatrix, LapScratch, and lap() declarations live in +// inst/include/TreeDist/ headers (shared with downstream LinkingTo users). +// This wrapper re-exports them to global scope for backward compatibility. + +#include // types, CostMatrix, LapScratch, lap() + +#include "ints.h" // grf_match, uint32 (TreeDist-internal types) + +// Re-export shared types and classes to global scope. +using TreeDist::cost; +using TreeDist::lap_dim; +using TreeDist::lap_row; +using TreeDist::lap_col; +using TreeDist::BIG; +using TreeDist::ROUND_PRECISION; +using TreeDist::BLOCK_SIZE; +using TreeDist::CostMatrix; +using TreeDist::LapScratch; +using TreeDist::lap; + +using cost_matrix = TreeDist::CostMatrix; #endif diff --git a/src/mast.cpp b/src/mast.cpp index 23fb961e..c26af5ff 100644 --- a/src/mast.cpp +++ b/src/mast.cpp @@ -1,5 +1,6 @@ #include #include +#include #include "ints.h" using namespace Rcpp; @@ -63,12 +64,8 @@ int cpp_mast (IntegerMatrix edge1, IntegerMatrix edge2, IntegerVector nTip) { n_internal = n_tip - 1, n_all_nodes = n_tip + n_internal, n_edge = edge1.nrow(); - if (edge2.nrow() != n_edge) { - Rcpp::stop("Both trees must contain the same number of edges."); - } - if (n_tip > MAST_MAX_TIP) { - Rcpp::stop("Tree too large; please contact maintainer for advice."); - } + ASSERT(edge2.nrow() == n_edge && "Both trees must contain the same number of edges."); + ASSERT(n_tip <= MAST_MAX_TIP && "Tree too large; please contact maintainer for advice."); int16 t1_left[MAST_MAX_NODE] = {}, 
t1_right[MAST_MAX_NODE] = {}, diff --git a/src/nni_distance.cpp b/src/nni_distance.cpp index ac5c17dc..db3a5b3c 100644 --- a/src/nni_distance.cpp +++ b/src/nni_distance.cpp @@ -226,16 +226,7 @@ grf_match nni_rf_matching ( ASSERT(n_splits > 0); ASSERT(n_tips > 3); - if (n_tips > NNI_MAX_TIPS) { - Rcpp::stop("Cannot calculate NNI distance for trees with so many tips."); - } - - // #nocov start - if (static_cast(n_splits) * static_cast(n_bins) > - static_cast(std::numeric_limits::max())) { - Rcpp::stop("Cannot calculate NNI distance for trees with so many splits."); - } - // #nocov end + ASSERT(n_tips <= NNI_MAX_TIPS); // Validated by exported wrapper const int32_t last_bin = n_bins - 1; const int32_t unset_tips = (n_tips % SL_BIN_SIZE) ? @@ -298,10 +289,7 @@ IntegerVector cpp_nni_distance(const IntegerMatrix& edge1, const IntegerMatrix& edge2, const IntegerVector& nTip) { - if (nTip[0] > NNI_MAX_TIPS) { - Rcpp::stop("Cannot calculate NNI distance for trees with " - "so many tips."); - } + ASSERT(nTip[0] <= NNI_MAX_TIPS && "Cannot calculate NNI distance for trees with so many tips."); const int32_t n_tip = static_cast(nTip[0]); const int32_t node_0 = n_tip; const int32_t node_0_r = n_tip + 1; @@ -316,10 +304,8 @@ IntegerVector cpp_nni_distance(const IntegerMatrix& edge1, int32_t fack_score_bound = 0; int32_t li_score_bound = 0; - if (n_edge != int32_t(edge2.nrow())) { - Rcpp::stop("Both trees must have the same number of edges. 
" - "Is one rooted and the other unrooted?"); - } + ASSERT(n_edge == int32_t(edge2.nrow()) + && "Both trees must have the same number of edges."); if (n_tip < 4) { return(IntegerVector::create(Named("lower") = 0, @@ -349,7 +335,7 @@ IntegerVector cpp_nni_distance(const IntegerMatrix& edge1, const int32_t n_splits = n_distinct_edge - n_tip; if (n_splits < 1) { - Rcpp::stop("NNI distance is undefined for trees with no splits"); // #nocov + ASSERT(false && "NNI distance is undefined for trees with no splits"); // #nocov } std::unique_ptr splits1(new uint64_t[n_splits * n_bin]); diff --git a/src/spr.cpp b/src/spr.cpp index 7eee66d6..c9dd456c 100644 --- a/src/spr.cpp +++ b/src/spr.cpp @@ -47,18 +47,11 @@ IntegerVector calc_mismatch_size(const RawMatrix& x, const RawMatrix& y) { // [[Rcpp::export]] IntegerVector mismatch_size (const RawMatrix& x, const RawMatrix& y) { - if (x.rows() != y.rows()) { - throw std::invalid_argument("`x` and `y` differ in number of splits."); - } - if (!x.hasAttribute("nTip")) { - Rcpp::stop("`x` lacks nTip attribute"); - } - if (!y.hasAttribute("nTip")) { - Rcpp::stop("`y` lacks nTip attribute"); - } - if (static_cast(x.attr("nTip")) != static_cast(y.attr("nTip"))) { - Rcpp::stop("`x` and `y` differ in `nTip`"); - } + ASSERT(x.rows() == y.rows() && "`x` and `y` differ in number of splits."); + ASSERT(x.hasAttribute("nTip") && "`x` lacks nTip attribute"); + ASSERT(y.hasAttribute("nTip") && "`y` lacks nTip attribute"); + ASSERT(static_cast(x.attr("nTip")) == static_cast(y.attr("nTip")) + && "`x` and `y` differ in `nTip`"); return calc_mismatch_size(x, y); } @@ -102,25 +95,17 @@ IntegerVector calc_confusion(const RawMatrix &x, const RawMatrix &y) { // [[Rcpp::export]] IntegerVector confusion(const RawMatrix& x, const RawMatrix& y) { - if (x.rows() != y.rows()) { - throw std::invalid_argument("Input splits contain same number of splits."); - } - if (!x.hasAttribute("nTip")) { - Rcpp::stop("`x` lacks nTip attribute"); - } - if 
(!y.hasAttribute("nTip")) { - Rcpp::stop("`y` lacks nTip attribute"); - } - if (static_cast(x.attr("nTip")) != static_cast(y.attr("nTip"))) { - Rcpp::stop("`x` and `y` differ in `nTip`"); - } + ASSERT(x.rows() == y.rows() && "Input splits must contain same number of splits."); + ASSERT(x.hasAttribute("nTip") && "`x` lacks nTip attribute"); + ASSERT(y.hasAttribute("nTip") && "`y` lacks nTip attribute"); + ASSERT(static_cast(x.attr("nTip")) == static_cast(y.attr("nTip")) + && "`x` and `y` differ in `nTip`"); return calc_confusion(x, y); } IntegerMatrix reverse (const IntegerMatrix x) { - if (double(x.nrow()) > double(std::numeric_limits::max())) { - Rcpp::stop("This many edges are not (yet) supported."); - } + ASSERT(double(x.nrow()) <= double(std::numeric_limits::max()) + && "This many edges are not (yet) supported."); const intx n_edge = intx(x.nrow()); ASSERT(n_edge % 2 == 0); // Tree is binary IntegerMatrix ret(n_edge, 2); diff --git a/src/spr_lookup.cpp b/src/spr_lookup.cpp index 9794f248..1ac2281c 100644 --- a/src/spr_lookup.cpp +++ b/src/spr_lookup.cpp @@ -223,7 +223,7 @@ int lookup_7(const SplitSet7& sp1, const SplitSet7& sp2) { inline SplitSet7 read_splits(const Rcpp::RawVector& r) { if (r.size() != 4) - Rcpp::stop("Expected a length-4 raw vector of splits"); + ASSERT(false && "Expected a length-4 raw vector of splits"); SplitSet7 sp{}; for (int i = 0; i < 4; ++i) { diff --git a/src/transfer_consensus.cpp b/src/transfer_consensus.cpp new file mode 100644 index 00000000..514c6430 --- /dev/null +++ b/src/transfer_consensus.cpp @@ -0,0 +1,879 @@ +/* transfer_consensus.cpp + * + * C++ implementation of the Transfer Consensus algorithm (Takazawa 2025). + * Replaces the R-level .PoolSplits, .TransferDistMat, .ComputeTD, + * .CompatMat, and .GreedyBest/.GreedyFirst helpers. 
+ * + * Exported function: cpp_transfer_consensus() + */ + +#ifdef _OPENMP +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using Rcpp::IntegerVector; +using Rcpp::List; +using Rcpp::LogicalVector; +using Rcpp::RawMatrix; +using TreeTools::count_bits; +using TreeTools::SplitList; + + +// ============================================================================ +// Types +// ============================================================================ + +using int16 = int_fast16_t; +using int32 = int_fast32_t; + +// ============================================================================ +// Pooled split representation +// ============================================================================ + +struct PooledSplits { + int n_splits; + int n_bins; + int n_tips; + splitbit last_mask; + + // Flat array of canonical split data: n_splits * n_bins elements. + // Split i occupies [i*n_bins .. (i+1)*n_bins). + std::vector data; + + // Per-split metadata + std::vector count; // how many trees contain this split + std::vector light_side; // min(popcount, n_tips - popcount) + + // Tree membership: tree_members[t] = list of unique-split indices in tree t + std::vector> tree_members; + + // Access helpers + const splitbit* split(int i) const { return &data[i * n_bins]; } + splitbit* split(int i) { return &data[i * n_bins]; } +}; + + +// ============================================================================ +// FNV-1a hash for canonical split arrays +// ============================================================================ + +struct SplitHash { + int n_bins; + explicit SplitHash(int nb) : n_bins(nb) {} + SplitHash() : n_bins(0) {} + + std::size_t operator()(const splitbit* sp) const { + std::size_t h = 14695981039346656037ULL; + const unsigned char* p = reinterpret_cast(sp); + const int bytes = n_bins * static_cast(sizeof(splitbit)); + for (int i = 0; i < bytes; ++i) { + h ^= 
static_cast(p[i]); + h *= 1099511628211ULL; + } + return h; + } +}; + +struct SplitEqual { + int n_bins; + explicit SplitEqual(int nb) : n_bins(nb) {} + SplitEqual() : n_bins(0) {} + + bool operator()(const splitbit* a, const splitbit* b) const { + for (int bin = 0; bin < n_bins; ++bin) { + if (a[bin] != b[bin]) return false; + } + return true; + } +}; + +// ============================================================================ +// pool_splits: deduplicate and canonicalise all splits from all trees +// ============================================================================ + +static PooledSplits pool_splits(const List& splits_list, int n_tips) { + const int n_tree = splits_list.size(); + + // Determine n_bins from first tree + const RawMatrix first_mat = Rcpp::as(splits_list[0]); + const int n_bytes = first_mat.ncol(); + const int n_bins = (n_bytes + static_cast(sizeof(splitbit)) - 1) + / static_cast(sizeof(splitbit)); + + const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) + ? ~splitbit(0) + : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; + + // Count total splits across all trees to pre-allocate pool.data. + // This avoids reallocation, which would invalidate the pointers stored + // as keys in split_map. + int total_splits = 0; + for (int t = 0; t < n_tree; ++t) { + const RawMatrix mat = Rcpp::as(splits_list[t]); + total_splits += mat.nrow(); + } + + // Hash map from canonical split → unique index + SplitHash hasher(n_bins); + SplitEqual eq(n_bins); + std::unordered_map + split_map(64, hasher, eq); + + // Temp buffer for canonicalisation + std::vector canon_buf(n_bins); + + PooledSplits pool; + pool.n_tips = n_tips; + pool.n_bins = n_bins; + pool.last_mask = last_mask; + pool.n_splits = 0; + pool.tree_members.resize(n_tree); + + // Reserve capacity for the maximum possible unique splits (= total splits). + // This ensures pool.data never reallocates, keeping split_map keys valid. 
+ pool.data.reserve(static_cast(total_splits) * n_bins); + + for (int t = 0; t < n_tree; ++t) { + SplitList sl(Rcpp::as(splits_list[t])); + std::vector& members = pool.tree_members[t]; + members.reserve(sl.n_splits); + + for (int16 s = 0; s < sl.n_splits; ++s) { + // Copy split bits into canon_buf + for (int bin = 0; bin < n_bins; ++bin) { + canon_buf[bin] = sl.state[s][bin]; + } + // Mask last bin + canon_buf[n_bins - 1] &= last_mask; + + // Canonicalise: if bit 0 is set, flip + if (canon_buf[0] & splitbit(1)) { + for (int bin = 0; bin < n_bins; ++bin) { + canon_buf[bin] = ~canon_buf[bin]; + } + canon_buf[n_bins - 1] &= last_mask; + } + + // Look up in map + auto it = split_map.find(canon_buf.data()); + int idx; + if (it != split_map.end()) { + idx = it->second; + pool.count[idx]++; + } else { + idx = pool.n_splits++; + // Append canonical data (no reallocation due to reserve above) + const size_t old_sz = pool.data.size(); + pool.data.resize(old_sz + n_bins); + std::copy(canon_buf.begin(), canon_buf.end(), + pool.data.begin() + old_sz); + pool.count.push_back(1); + + // Compute light_side = min(popcount, n_tips - popcount) + int pc = 0; + for (int bin = 0; bin < n_bins; ++bin) { + pc += count_bits(canon_buf[bin]); + } + pool.light_side.push_back(std::min(pc, n_tips - pc)); + + // Insert pointer into map (points into pool.data; stable due to reserve) + split_map[pool.split(idx)] = idx; + } + + // Only record unique splits per tree + if (members.empty() || members.back() != idx) { + // Check if already present (small set, linear scan ok) + bool found = false; + for (int m : members) { + if (m == idx) { found = true; break; } + } + if (!found) members.push_back(idx); + } + } + } + + return pool; +} + + +// ============================================================================ +// transfer_dist_mat: pairwise transfer distance between all unique splits +// ============================================================================ + +// Returns a flat n_splits 
x n_splits matrix (row-major). +// DIST[i * n + j] = transfer distance between split i and split j. +static std::vector transfer_dist_mat(const PooledSplits& pool, + int n_threads) { + const int M = pool.n_splits; + const int nb = pool.n_bins; + const int nt = pool.n_tips; + + std::vector dist(M * M, 0); + + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int i = 0; i < M; ++i) { + const splitbit* a = pool.split(i); + for (int j = i + 1; j < M; ++j) { + const splitbit* b = pool.split(j); + int hamming = 0; + for (int bin = 0; bin < nb; ++bin) { + hamming += count_bits(a[bin] ^ b[bin]); + } + int td = std::min(hamming, nt - hamming); + dist[i * M + j] = td; + dist[j * M + i] = td; + } + } + return dist; +} + + +// ============================================================================ +// compute_td: transfer dissimilarity cost for each split +// ============================================================================ + +static std::vector compute_td( + const std::vector& dist, + const PooledSplits& pool, + bool scale, + int n_threads +) { + const int M = pool.n_splits; + const int n_tree = static_cast(pool.tree_members.size()); + + std::vector td(M, 0.0); + + // Parallelise over splits (each td[b] is independent) + #ifdef _OPENMP + #pragma omp parallel for schedule(static) num_threads(n_threads) + #endif + for (int b = 0; b < M; ++b) { + const int p_minus_1 = pool.light_side[b] - 1; + if (p_minus_1 <= 0) continue; + + double acc = 0.0; + for (int t = 0; t < n_tree; ++t) { + const auto& members = pool.tree_members[t]; + const int n_mem = static_cast(members.size()); + + int min_d = p_minus_1; // sentinel distance + for (int k = 0; k < n_mem; ++k) { + int d = dist[b * M + members[k]]; + if (d < min_d) min_d = d; + } + if (scale) { + double contrib = static_cast(min_d) / p_minus_1; + acc += (contrib < 1.0) ? contrib : 1.0; + } else { + acc += (min_d < p_minus_1) ? 
min_d : p_minus_1; + } + } + td[b] = acc; + } + return td; +} + + +// ============================================================================ +// compat_mat: pairwise compatibility between all unique splits +// ============================================================================ + +// Returns flat M x M bool matrix. +// compat[i * M + j] = true iff splits i and j are compatible. +static std::vector compat_mat(const PooledSplits& pool, + int n_threads) { + const int M = pool.n_splits; + const int nb = pool.n_bins; + const int last_bin = nb - 1; + const splitbit lm = pool.last_mask; + + std::vector compat(M * M, 1); + + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int i = 0; i < M; ++i) { + const splitbit* a = pool.split(i); + for (int j = i + 1; j < M; ++j) { + const splitbit* b = pool.split(j); + bool ab = false, anb = false, nab = false, nanb = false; + for (int bin = 0; bin < nb; ++bin) { + splitbit mask = (bin == last_bin) ? lm : ~splitbit(0); + splitbit a_bin = a[bin] & mask; + splitbit b_bin = b[bin] & mask; + if (!ab) ab = (a_bin & b_bin) != 0; + if (!anb) anb = (a_bin & ~b_bin & mask) != 0; + if (!nab) nab = (~a_bin & b_bin & mask) != 0; + if (!nanb) nanb = (~a_bin & ~b_bin & mask) != 0; + if (ab && anb && nab && nanb) break; + } + uint8_t comp = (!ab || !anb || !nab || !nanb) ? 
1 : 0; + compat[i * M + j] = comp; + compat[j * M + i] = comp; + } + } + return compat; +} + + +// ============================================================================ +// Greedy loop helpers +// ============================================================================ + +// Sentinel distance for split b (distance to a leaf bipartition) +inline int sent_dist(int b, const PooledSplits& pool) { + return pool.light_side[b] - 1; +} + +// "Effective distance" from split b to its match (or sentinel if match == -1) +inline int eff_dist(int b, int match_idx, const std::vector& dist, + int M, const PooledSplits& pool) { + return (match_idx < 0) ? sent_dist(b, pool) : dist[b * M + match_idx]; +} + +// Find second-closest included split to split b (excluding matchIdx). +// Returns {index, distance} as a pair; index = -1 means sentinel. +static std::pair find_second( + int b, int matchIdx, + const std::vector& incl, + const std::vector& dist, + int M, const PooledSplits& pool, + bool scale +) { + int p_minus_1 = sent_dist(b, pool); + int best = -1; + int best_d = (p_minus_1 > 0) ? p_minus_1 : 0; + + for (int c = 0; c < M; ++c) { + if (!incl[c] || c == matchIdx) continue; + int d = dist[b * M + c]; + if (d < best_d) { + best_d = d; + best = c; + } + } + return {best, (best >= 0) ? best_d : p_minus_1}; +} + + +// ============================================================================ +// GreedyState: persistent state for the greedy loop. +// +// Maintains match_dist[], match2_dist[], n_incompat[], weight[] to avoid +// redundant O(M) scans in benefit calculations and compat checks. 
+// ============================================================================
+
+struct GreedyState {
+  int M;        // number of pooled splits
+  int n_tip;    // number of tips (caps consensus at n_tip - 3 splits)
+  int n_incl;   // current number of included splits
+  bool scale;
+
+  // External state, owned by the caller and mutated in place.
+  std::vector<int>& match;
+  std::vector<int>& match2;
+  std::vector<uint8_t>& incl;
+  const std::vector<int>& dist;
+  const std::vector<double>& td;
+  const PooledSplits& pool;
+  const std::vector<uint8_t>& compat;
+
+  // Cached per-split arrays
+  std::vector<int> match_dist;    // dist to match[b], or sentinel
+  std::vector<int> match2_dist;   // dist to match2[b], or sentinel
+  std::vector<int> n_incompat;    // # included splits incompatible with b
+  std::vector<double> weight;     // count[b] / (light_side[b] - 1) or count[b]
+
+  GreedyState(
+      std::vector<int>& match_, std::vector<int>& match2_,
+      std::vector<uint8_t>& incl_,
+      const std::vector<int>& dist_, const std::vector<double>& td_,
+      const PooledSplits& pool_, const std::vector<uint8_t>& compat_,
+      bool scale_, int M_, int n_tip_
+  ) : M(M_), n_tip(n_tip_), n_incl(0), scale(scale_),
+      match(match_), match2(match2_), incl(incl_),
+      dist(dist_), td(td_), pool(pool_), compat(compat_),
+      match_dist(M_), match2_dist(M_), n_incompat(M_, 0), weight(M_)
+  {
+    // Precompute weight = count[b] / (light_side[b] - 1) for scaled,
+    // count[b] for unscaled
+    for (int b = 0; b < M; ++b) {
+      int p1 = sent_dist(b, pool);
+      if (p1 <= 0) { weight[b] = 0.0; continue; }
+      weight[b] = scale
+        ? static_cast<double>(pool.count[b]) / p1
+        : static_cast<double>(pool.count[b]);
+    }
+
+    // Initialize match_dist / match2_dist from current match/match2
+    for (int b = 0; b < M; ++b) {
+      match_dist[b] = eff_dist(b, match[b], dist, M, pool);
+      match2_dist[b] = eff_dist(b, match2[b], dist, M, pool);
+    }
+
+    // Initialize n_incompat from current incl
+    for (int i = 0; i < M; ++i) {
+      if (!incl[i]) continue;
+      n_incl++;
+      for (int j = 0; j < M; ++j) {
+        if (!compat[j * M + i]) n_incompat[j]++;
+      }
+    }
+  }
+
+  // O(1) compat check
+  bool is_compatible(int idx) const {
+    return n_incompat[idx] == 0 && n_incl < n_tip - 3;
+  }
+
+  // Add-benefit using precomputed match_dist[] and weight[]
+  double add_benefit(int c) const {
+    double ben = -td[c];
+    for (int b = 0; b < M; ++b) {
+      int diff = match_dist[b] - dist[b * M + c];
+      if (diff <= 0) continue;
+      ben += diff * weight[b];
+    }
+    return ben;
+  }
+
+  // Remove-benefit using precomputed match2_dist[] and weight[]
+  double remove_benefit(int c) const {
+    double ben = td[c];
+    for (int b = 0; b < M; ++b) {
+      if (match[b] != c) continue;
+      int diff = dist[b * M + c] - match2_dist[b];
+      if (diff >= 0) continue;
+      ben += diff * weight[b];
+    }
+    return ben;
+  }
+
+  // Execute add: update incl, match, match2, match_dist, match2_dist,
+  // n_incompat, n_incl
+  void do_add(int idx) {
+    incl[idx] = 1;
+    n_incl++;
+
+    // Update n_incompat for all splits
+    for (int j = 0; j < M; ++j) {
+      if (!compat[j * M + idx]) n_incompat[j]++;
+    }
+
+    // Update match/match2 and their cached distances
+    for (int b = 0; b < M; ++b) {
+      int new_d = dist[b * M + idx];
+      if (new_d < match_dist[b]) {
+        match2[b] = match[b];
+        match2_dist[b] = match_dist[b];
+        match[b] = idx;
+        match_dist[b] = new_d;
+      } else if (new_d < match2_dist[b]) {
+        match2[b] = idx;
+        match2_dist[b] = new_d;
+      }
+    }
+  }
+
+  // Execute remove: update incl, match, match2, match_dist, match2_dist,
+  // n_incompat, n_incl
+  void do_remove(int idx) {
+    incl[idx] = 0;
+    n_incl--;
+
+    // Update n_incompat
+    for (int j = 0; j < M; ++j) {
+      if (!compat[j * M + idx]) n_incompat[j]--;
+    }
+
+    // Update match/match2 for affected splits
+    for (int b = 0; b < M; ++b) {
+      if (match[b] == idx) {
+        // Promote match2 -> match
+        match[b] = match2[b];
+        match_dist[b] = match2_dist[b];
+
+        if (match[b] < 0) {
+          // Defensive guard: promoted match2 was the sentinel (no real
+          // second-closest split was known). Rescan from scratch to find
+          // the actual closest included split. This requires a topology
+          // where match2[b] stayed at -1 while another split's removal
+          // makes b's greedy match the removed split — theoretically
+          // reachable but extremely unlikely in practice.
+          auto [first, first_d] = find_second(b, -1, incl, dist, M, pool, scale); // #nocov
+          match[b] = first; // #nocov
+          match_dist[b] = first_d; // #nocov
+        }
+
+        // Find new second-closest
+        auto [sec, sec_d] = find_second(b, match[b], incl, dist, M, pool, scale);
+        match2[b] = sec;
+        match2_dist[b] = sec_d;
+      } else if (match2[b] == idx) {
+        auto [sec, sec_d] = find_second(b, match[b], incl, dist, M, pool, scale);
+        match2[b] = sec;
+        match2_dist[b] = sec_d;
+      }
+    }
+  }
+};
+
+
+// ============================================================================
+// Greedy "best" strategy
+// ============================================================================
+
+// Repeatedly applies the single highest-benefit add or remove until no
+// operation has positive benefit.
+static void greedy_best(
+    std::vector<int>& match,
+    std::vector<int>& match2,
+    std::vector<uint8_t>& incl,
+    const std::vector<int>& dist,
+    const std::vector<double>& td,
+    const PooledSplits& pool,
+    const std::vector<uint8_t>& compat,
+    const std::vector<int>& sort_ord,
+    bool scale,
+    int M, int n_tip
+) {
+  GreedyState st(match, match2, incl, dist, td, pool, compat, scale, M, n_tip);
+
+  while (true) {
+    double best_ben = 0.0;
+    int best_idx = -1;
+    bool best_is_add = false;
+
+    for (int si = 0; si < M; ++si) {
+      int idx = sort_ord[si];
+      if (incl[idx]) {
+        double ben = st.remove_benefit(idx);
+        if (ben > best_ben) {
+          best_ben = ben;
+          best_idx = idx;
+          best_is_add = false;
+        }
+      } else {
+        if (!st.is_compatible(idx)) continue;
+        double ben = st.add_benefit(idx);
+        if (ben > best_ben) {
+          best_ben = ben;
+          best_idx = idx;
+          best_is_add = true;
+        }
+      }
+    }
+
+    if (best_ben <= 0.0 || best_idx < 0) break;
+
+    if (best_is_add) {
+      st.do_add(best_idx);
+    } else {
+      st.do_remove(best_idx);
+    }
+  }
+}
+
+
+// ============================================================================
+// Greedy "first" strategy
+// ============================================================================
+
+// Applies the first positive-benefit add or remove found (in sort order),
+// restarting the scan after each change, until a full pass finds none.
+static void greedy_first(
+    std::vector<int>& match,
+    std::vector<int>& match2,
+    std::vector<uint8_t>& incl,
+    const std::vector<int>& dist,
+    const std::vector<double>& td,
+    const PooledSplits& pool,
+    const std::vector<uint8_t>& compat,
+    const std::vector<int>& sort_ord,
+    bool scale,
+    int M, int n_tip
+) {
+  GreedyState st(match, match2, incl, dist, td, pool, compat, scale, M, n_tip);
+
+  bool improving = true;
+  while (improving) {
+    improving = false;
+    for (int si = 0; si < M; ++si) {
+      int idx = sort_ord[si];
+      if (incl[idx]) {
+        if (st.remove_benefit(idx) > 0.0) {
+          st.do_remove(idx);
+          improving = true;
+          break;
+        }
+      } else {
+        if (!st.is_compatible(idx)) continue;
+        if (st.add_benefit(idx) > 0.0) {
+          st.do_add(idx);
+          improving = true;
+          break;
+        }
+      }
+    }
+  }
+}
+
+
+// ============================================================================
+// Init matches (for init = "majority")
+// ============================================================================
+
+// Seeds match[] / match2[] with the closest and second-closest included
+// splits for every pooled split, given an initial incl[] assignment.
+static void init_matches(
+    std::vector<int>& match,
+    std::vector<int>& match2,
+    const std::vector<uint8_t>& incl,
+    const std::vector<int>& dist,
+    int M, const PooledSplits& pool,
+    bool scale
+) {
+  // Collect included indices
+  std::vector<int> inc_idx;
+  for (int i = 0; i < M; ++i) {
+    if (incl[i]) inc_idx.push_back(i);
+  }
+  if (inc_idx.empty()) return;
+
+  for (int b = 0; b < M; ++b) {
+    int p_minus_1 = pool.light_side[b] - 1;
+    if (p_minus_1 <= 0) continue;
+
+    // Find closest and second-closest among included
+    int best = -1, second = -1;
+    int best_d = p_minus_1, second_d = p_minus_1;  // sentinel threshold
+
+    for (int c : inc_idx) {
+      int d = dist[b * M + c];
+      if (d < best_d) {
+        second = best;
+        second_d = best_d;
+        best = c;
+        best_d = d;
+      } else if (d < second_d) {
+        second = c;
+        second_d = d;
+      }
+    }
+
+    if (best >= 0) {
+      if (scale) {
+        if (static_cast<double>(best_d) / p_minus_1 >= 1.0) continue;
+      } else {
+        if (best_d >= p_minus_1) continue;
+      }
+      match[b] = best;
+
+      if (second >= 0) {
+        if (scale) {
+          if (static_cast<double>(second_d) / p_minus_1 < 1.0)
+            match2[b] = second;
+        } else {
+          if (second_d < p_minus_1)
+            match2[b] = second;
+        }
+      }
+    }
+  }
+}
+
+
+// ============================================================================
+// Main exported function
+// ============================================================================
+
+//' Transfer consensus (C++ implementation)
+//'
+//' @param splits_list List of raw matrices (one per tree), each from as.Splits().
+//' @param n_tip Number of tips.
+//' @param scale Logical: use scaled transfer distance?
+//' @param greedy_best_flag Logical: TRUE for "best", FALSE for "first".
+//' @param init_majority Logical: TRUE to start from majority-rule splits.
+//'
+//' @return A `LogicalVector` of length n_splits indicating which pooled splits
+//' are included in the consensus, plus attributes "raw_splits" (a raw matrix
+//' of all unique splits) and "light_side" (integer vector).
+//' @keywords internal +// [[Rcpp::export]] +List cpp_transfer_consensus( + const List& splits_list, + const int n_tip, + const bool scale, + const bool greedy_best_flag, + const bool init_majority, + const int n_threads = 1 +) { + const int n_tree = splits_list.size(); + + // ---- Pool unique splits ---- + PooledSplits pool = pool_splits(splits_list, n_tip); + const int M = pool.n_splits; + + if (M == 0) { + return List::create( + Rcpp::Named("included") = LogicalVector(0), + Rcpp::Named("raw_splits") = RawMatrix(0, 0), + Rcpp::Named("light_side") = IntegerVector(0) + ); + } + + // ---- Pairwise transfer distance matrix ---- + std::vector dist = transfer_dist_mat(pool, n_threads); + + // ---- Transfer dissimilarity cost ---- + std::vector td = compute_td(dist, pool, scale, n_threads); + + // ---- Compatibility matrix ---- + std::vector compat = compat_mat(pool, n_threads); + + // ---- Sort order (by count descending, then index ascending for ties) ---- + // Must match R's order(-counts, seq_along(counts)) for reproducibility. 
+ std::vector sort_ord(M); + std::iota(sort_ord.begin(), sort_ord.end(), 0); + std::stable_sort(sort_ord.begin(), sort_ord.end(), + [&](int a, int b) { return pool.count[a] > pool.count[b]; }); + + // ---- Mutable state ---- + std::vector match(M, -1); // -1 = sentinel + std::vector match2(M, -1); + std::vector incl(M, 0); + + // ---- Init from majority if requested ---- + if (init_majority) { + double half = n_tree / 2.0; + for (int i = 0; i < M; ++i) { + if (pool.count[i] > half) { + incl[i] = 1; + } + } + init_matches(match, match2, incl, dist, M, pool, scale); + } + + // ---- Greedy loop ---- + if (greedy_best_flag) { + greedy_best(match, match2, incl, dist, td, pool, compat, sort_ord, + scale, M, n_tip); + } else { + greedy_first(match, match2, incl, dist, td, pool, compat, sort_ord, + scale, M, n_tip); + } + + // ---- Build output ---- + // Return raw split data for included splits so R can convert to phylo + LogicalVector incl_r(M); + for (int i = 0; i < M; ++i) incl_r[i] = incl[i] != 0; + + // Build raw matrix of ALL unique splits (n_splits x n_bytes) + // The canonical form may differ from the original raw form: we need to + // return the canonical splitbit data as raw bytes for as.phylo(). 
+ const int n_bytes = pool.n_bins * static_cast(sizeof(splitbit)); + RawMatrix raw_splits(M, n_bytes); + for (int i = 0; i < M; ++i) { + const unsigned char* src = + reinterpret_cast(pool.split(i)); + for (int j = 0; j < n_bytes; ++j) { + raw_splits(i, j) = Rbyte(src[j]); + } + } + + IntegerVector light_side(M); + for (int i = 0; i < M; ++i) light_side[i] = pool.light_side[i]; + + return List::create( + Rcpp::Named("included") = incl_r, + Rcpp::Named("raw_splits") = raw_splits, + Rcpp::Named("light_side") = light_side + ); +} + + +// ============================================================================ +// Diagnostic: per-stage timing (for profiling without VTune) +// ============================================================================ + +//' @keywords internal +// [[Rcpp::export]] +Rcpp::NumericVector cpp_tc_profile( + const List& splits_list, + const int n_tip, + const bool scale, + const bool greedy_best_flag, + const bool init_majority, + const int n_iter, + const int n_threads = 1 +) { + using clk = std::chrono::high_resolution_clock; + const int n_tree = splits_list.size(); + + double t_pool = 0, t_dist = 0, t_td = 0, t_compat = 0, t_greedy = 0; + + for (int iter = 0; iter < n_iter; ++iter) { + auto t0 = clk::now(); + PooledSplits pool = pool_splits(splits_list, n_tip); + auto t1 = clk::now(); + const int M = pool.n_splits; + + std::vector dist = transfer_dist_mat(pool, n_threads); + auto t2 = clk::now(); + + std::vector td = compute_td(dist, pool, scale, n_threads); + auto t3 = clk::now(); + + std::vector compat_v = compat_mat(pool, n_threads); + auto t4 = clk::now(); + + std::vector sort_ord(M); + std::iota(sort_ord.begin(), sort_ord.end(), 0); + std::stable_sort(sort_ord.begin(), sort_ord.end(), + [&](int a, int b) { return pool.count[a] > pool.count[b]; }); + + std::vector match(M, -1); + std::vector match2(M, -1); + std::vector incl(M, 0); + + if (init_majority) { + double half = n_tree / 2.0; + for (int i = 0; i < M; ++i) { + if 
(pool.count[i] > half) incl[i] = 1; + } + init_matches(match, match2, incl, dist, M, pool, scale); + } + + if (greedy_best_flag) { + greedy_best(match, match2, incl, dist, td, pool, compat_v, sort_ord, + scale, M, n_tip); + } else { + greedy_first(match, match2, incl, dist, td, pool, compat_v, sort_ord, + scale, M, n_tip); + } + auto t5 = clk::now(); + + auto us = [](auto a, auto b) { + return std::chrono::duration(b - a).count(); + }; + t_pool += us(t0, t1); + t_dist += us(t1, t2); + t_td += us(t2, t3); + t_compat += us(t3, t4); + t_greedy += us(t4, t5); + } + + double inv = 1.0 / n_iter; + Rcpp::NumericVector result = { + t_pool * inv, t_dist * inv, t_td * inv, t_compat * inv, t_greedy * inv + }; + result.attr("names") = Rcpp::CharacterVector( + {"pool_splits", "transfer_dist_mat", "compute_td", "compat_mat", "greedy"} + ); + return result; +} diff --git a/src/transfer_distance.cpp b/src/transfer_distance.cpp new file mode 100644 index 00000000..a2ff23e8 --- /dev/null +++ b/src/transfer_distance.cpp @@ -0,0 +1,362 @@ +/* transfer_distance.cpp + * + * Per-pair and batch computation of the transfer dissimilarity between + * phylogenetic trees (Takazawa et al. 2026; Lemoine et al. 2018). + * + * The transfer distance between two bipartitions is the minimum number of + * taxa that must be moved to transform one bipartition into the other: + * delta(b, b*) = min(Hamming(b, b*), n - Hamming(b, b*)) + * + * The transfer dissimilarity between two trees sums, for each branch in + * each tree, the minimum transfer distance to any branch in the other tree. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +using Rcpp::IntegerVector; +using Rcpp::List; +using Rcpp::LogicalVector; +using Rcpp::Named; +using Rcpp::NumericMatrix; +using Rcpp::NumericVector; +using Rcpp::RawMatrix; +using TreeTools::count_bits; +using TreeTools::SplitList; + +using int16 = int_fast16_t; +using int32 = int_fast32_t; + + +// ============================================================================ +// Per-pair transfer distance computation +// ============================================================================ + +// Compute transfer distance between two bipartitions (split bitsets). +// Returns min(hamming, n_tips - hamming). +static inline int transfer_dist_splits( + const splitbit* a, const splitbit* b, int n_bins, int n_tips +) { + int hamming = 0; + for (int bin = 0; bin < n_bins; ++bin) { + hamming += count_bits(a[bin] ^ b[bin]); + } + return std::min(hamming, n_tips - hamming); +} + + +// For each split in `from`, find the minimum transfer distance to any split +// in `to`. Returns a vector of length from.n_splits with the min distances, +// and a matching vector with the index of the best match in `to` (-1 if no +// improvement over sentinel). 
+static void find_min_transfer( + const SplitList& from, + const SplitList& to, + int n_tips, + int n_bins, + std::vector& min_dist, // out: [from.n_splits] + std::vector& best_match // out: [from.n_splits] +) { + for (int16 i = 0; i < from.n_splits; ++i) { + // Sentinel distance: depth(b) - 1 + int pc = 0; + for (int bin = 0; bin < n_bins; ++bin) { + pc += count_bits(from.state[i][bin]); + } + int depth = std::min(pc, n_tips - pc); + int sentinel = depth - 1; + if (sentinel <= 0) { + min_dist[i] = 0; + best_match[i] = -1; + continue; + } + + int best_d = sentinel; + int best_j = -1; + + for (int16 j = 0; j < to.n_splits; ++j) { + int d = transfer_dist_splits(from.state[i], to.state[j], n_bins, n_tips); + if (d < best_d) { + best_d = d; + best_j = j; + if (d == 0) break; // can't do better + } + } + + min_dist[i] = best_d; + best_match[i] = best_j; + } +} + + +// Accumulate the transfer dissimilarity contribution from one direction. +// For scaled: sum of min(min_dist[i] / (depth[i] - 1), 1) +// For unscaled: sum of min(min_dist[i], depth[i] - 1) +static double accumulate_transfer( + const SplitList& sl, + const std::vector& min_dist, + int n_tips, int n_bins, + bool scale +) { + double total = 0.0; + for (int16 i = 0; i < sl.n_splits; ++i) { + int pc = 0; + for (int bin = 0; bin < n_bins; ++bin) { + pc += count_bits(sl.state[i][bin]); + } + int depth = std::min(pc, n_tips - pc); + int p_minus_1 = depth - 1; + if (p_minus_1 <= 0) continue; + + int d = min_dist[i]; + if (scale) { + double contrib = static_cast(d) / p_minus_1; + total += (contrib < 1.0) ? contrib : 1.0; + } else { + total += (d < p_minus_1) ? 
d : p_minus_1; + } + } + return total; +} + + +// ============================================================================ +// Exported: per-pair transfer distance (returns both scaled and unscaled) +// ============================================================================ + +//' Per-pair transfer dissimilarity +//' +//' @param x,y Raw matrices representing splits (from as.Splits()). +//' @param nTip Integer: number of tips. +//' +//' @return A list with components: +//' - score_scaled: scaled transfer dissimilarity (double) +//' - score_unscaled: unscaled transfer dissimilarity (double) +//' - `matching_xy`: integer vector, best match in y for each split in x (1-based, NA if sentinel) +//' - `matching_yx`: integer vector, best match in x for each split in y (1-based, NA if sentinel) +//' @keywords internal +// [[Rcpp::export]] +List cpp_transfer_dist( + const RawMatrix& x, const RawMatrix& y, + const IntegerVector& nTip +) { + // Validated by R caller (.ValidateSplitArgs or as.Splits with shared tipLabels) + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); + + const int n_tip = nTip[0]; + SplitList sl_x(x); + SplitList sl_y(y); + const int n_bins = sl_x.n_bins; + + // Direction x → y + std::vector min_dist_xy(sl_x.n_splits); + std::vector match_xy(sl_x.n_splits); + find_min_transfer(sl_x, sl_y, n_tip, n_bins, min_dist_xy, match_xy); + + // Direction y → x + std::vector min_dist_yx(sl_y.n_splits); + std::vector match_yx(sl_y.n_splits); + find_min_transfer(sl_y, sl_x, n_tip, n_bins, min_dist_yx, match_yx); + + // Accumulate both directions for scaled and unscaled + double score_scaled = + accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, true) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, true); + double score_unscaled = + accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, false) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, false); + + // Build R matching vectors (1-based, NA for sentinel) + 
IntegerVector r_match_xy(sl_x.n_splits); + for (int16 i = 0; i < sl_x.n_splits; ++i) { + r_match_xy[i] = (match_xy[i] >= 0) ? match_xy[i] + 1 : NA_INTEGER; + } + IntegerVector r_match_yx(sl_y.n_splits); + for (int16 i = 0; i < sl_y.n_splits; ++i) { + r_match_yx[i] = (match_yx[i] >= 0) ? match_yx[i] + 1 : NA_INTEGER; + } + + return List::create( + Named("score_scaled") = score_scaled, + Named("score_unscaled") = score_unscaled, + Named("matching_xy") = r_match_xy, + Named("matching_yx") = r_match_yx + ); +} + + +// ============================================================================ +// Exported: per-pair score-only (for GeneralizedRF compatibility) +// ============================================================================ + +// Returns a list with "score" and "matching" to be compatible with +// GeneralizedRF / CalculateTreeDistance dispatch. + +//' @keywords internal +// [[Rcpp::export]] +List cpp_transfer_dist_scored( + const RawMatrix& x, const RawMatrix& y, + const IntegerVector& nTip, bool scale +) { + // Validated by R caller (.ValidateSplitArgs or as.Splits with shared tipLabels) + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); + + const int n_tip = nTip[0]; + SplitList sl_x(x); + SplitList sl_y(y); + const int n_bins = sl_x.n_bins; + + std::vector min_dist_xy(sl_x.n_splits); + std::vector match_xy(sl_x.n_splits); + find_min_transfer(sl_x, sl_y, n_tip, n_bins, min_dist_xy, match_xy); + + std::vector min_dist_yx(sl_y.n_splits); + std::vector match_yx(sl_y.n_splits); + find_min_transfer(sl_y, sl_x, n_tip, n_bins, min_dist_yx, match_yx); + + double score = scale + ? 
(accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, true) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, true)) + : (accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, false) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, false)); + + // Build matching vector (1-based) — just the x→y direction + const int n_out = std::max(static_cast(sl_x.n_splits), + static_cast(sl_y.n_splits)); + IntegerVector matching(n_out, NA_INTEGER); + for (int16 i = 0; i < sl_x.n_splits; ++i) { + matching[i] = (match_xy[i] >= 0) ? match_xy[i] + 1 : NA_INTEGER; + } + + return List::create(Named("score") = score, Named("matching") = matching); +} + + +// ============================================================================ +// Batch: all-pairs transfer dissimilarity (OpenMP parallel) +// ============================================================================ + +// Static helper for score-only per-pair computation. +static double transfer_dissimilarity_score( + const SplitList& sl_x, const SplitList& sl_y, + int n_tip, int n_bins, bool scale +) { + // Avoid allocation when trees have 0 splits (star trees) + if (sl_x.n_splits == 0 && sl_y.n_splits == 0) return 0.0; + + std::vector min_dist_xy(sl_x.n_splits); + std::vector dummy_match_xy(sl_x.n_splits); + find_min_transfer(sl_x, sl_y, n_tip, n_bins, min_dist_xy, dummy_match_xy); + + std::vector min_dist_yx(sl_y.n_splits); + std::vector dummy_match_yx(sl_y.n_splits); + find_min_transfer(sl_y, sl_x, n_tip, n_bins, min_dist_yx, dummy_match_yx); + + return accumulate_transfer(sl_x, min_dist_xy, n_tip, n_bins, scale) + + accumulate_transfer(sl_y, min_dist_yx, n_tip, n_bins, scale); +} + + +//' All-pairs transfer dissimilarity (OpenMP) +//' +//' @param splits_list List of raw matrices (one per tree). +//' @param n_tip Number of tips. +//' @param scale Logical: use scaled transfer dissimilarity? +//' @param n_threads Number of OpenMP threads. +//' +//' @return Numeric vector of length choose(N,2) in dist order. 
+//' @keywords internal +// [[Rcpp::export]] +NumericVector cpp_transfer_dist_all_pairs( + const List& splits_list, + int n_tip, bool scale, int n_threads +) { + const int N = splits_list.size(); + const int n_pairs = N * (N - 1) / 2; + + // Pre-convert all trees to SplitLists + std::vector> sls(N); + int n_bins = 0; + for (int i = 0; i < N; ++i) { + sls[i] = std::make_unique(Rcpp::as(splits_list[i])); + if (i == 0) n_bins = sls[0]->n_bins; + } + + NumericVector result(n_pairs); + + // R dist format: column-major lower triangle + // pos k corresponds to (row, col) where row > col, iterating col first: + // (1,0), (2,0), ..., (N-1,0), (2,1), (3,1), ..., (N-1,1), ... + // k = col * (N - 1) - col * (col - 1) / 2 + (row - col - 1) (0-based) + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int k = 0; k < n_pairs; ++k) { + // Invert k to (col, row) in R dist column-major order + // col = largest c such that c*(2N-c-1)/2 <= k + int col = 0; + int remaining = k; + while (remaining >= N - 1 - col) { + remaining -= (N - 1 - col); + ++col; + } + int row = col + 1 + remaining; + + result[k] = transfer_dissimilarity_score( + *sls[col], *sls[row], n_tip, n_bins, scale); + } + + return result; +} + + +//' Cross-pairs transfer dissimilarity (OpenMP) +//' +//' @param splits_a,splits_b Lists of raw matrices. +//' @param n_tip Number of tips. +//' @param scale Logical: use scaled transfer dissimilarity? +//' @param n_threads Number of OpenMP threads. +//' +//' @return Numeric matrix of dimension `nA` x `nB`. 
+//' @keywords internal +// [[Rcpp::export]] +NumericMatrix cpp_transfer_dist_cross_pairs( + const List& splits_a, const List& splits_b, + int n_tip, bool scale, int n_threads +) { + const int nA = splits_a.size(); + const int nB = splits_b.size(); + + std::vector> sls_a(nA); + std::vector> sls_b(nB); + int n_bins = 0; + + for (int i = 0; i < nA; ++i) { + sls_a[i] = std::make_unique(Rcpp::as(splits_a[i])); + if (i == 0) n_bins = sls_a[0]->n_bins; + } + for (int j = 0; j < nB; ++j) { + sls_b[j] = std::make_unique(Rcpp::as(splits_b[j])); + } + + NumericMatrix result(nA, nB); + const int total = nA * nB; + + #ifdef _OPENMP + #pragma omp parallel for schedule(dynamic) num_threads(n_threads) + #endif + for (int k = 0; k < total; ++k) { + int i = k / nB; + int j = k % nB; + result(i, j) = transfer_dissimilarity_score( + *sls_a[i], *sls_b[j], n_tip, n_bins, scale); + } + + return result; +} diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp index 4a3b77e1..99347300 100644 --- a/src/tree_distance_functions.cpp +++ b/src/tree_distance_functions.cpp @@ -1,35 +1,47 @@ +#include #include -#include /* for SL_MAX_TIPS */ -#include /* for log2() */ +// Provide the MCI table definitions and implementation in this TU. +#define TREEDIST_MCI_IMPLEMENTATION +#include #include "tree_distances.h" -using namespace Rcpp; - -constexpr int32 LG2_SIZE = SL_MAX_TIPS + 1; - -double lg2[LG2_SIZE]; -double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -double lg2_unrooted[SL_MAX_TIPS + 2]; -double *lg2_rooted = &lg2_unrooted[0] + 1; +// Populate lookup tables at library load time. 
__attribute__((constructor)) void initialize_ldf() { - lg2[0] = 0; - for (int32 i = 1; i != LG2_SIZE; ++i) { - lg2[i] = log2(i); - } - for (int16 i = 0; i != 3; ++i) { - lg2_double_factorial[i] = 0; - lg2_unrooted[i] = 0; - } - assert(lg2_rooted[0] == 0); - assert(lg2_rooted[1] == 0); - for (int32 i = 2; i != SL_MAX_TIPS + SL_MAX_TIPS - 2; ++i) { - lg2_double_factorial[i] = lg2_double_factorial[i - 2] + log2(i); - } - for (int32 i = 3; i != SL_MAX_TIPS + 2; ++i) { - lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; - assert(lg2_rooted[i - 1] == lg2_double_factorial[i + i - 5]); - } + TreeDist::init_lg2_tables(SL_MAX_TIPS); + } + +// Thin wrapper that exercises the installable-header version of +// mutual_clustering_score(), for test coverage and downstream validation. +// [[Rcpp::export]] +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, + const Rcpp::RawMatrix& y, + int n_tips) { + using TreeTools::SplitList; + + const SplitList a(x); + const SplitList b(y); + TreeDist::LapScratch scratch; + + // Build arrays matching the header's raw-pointer API types. 
+ std::vector a_ptrs(a.n_splits); + std::vector b_ptrs(b.n_splits); + std::vector a_in(a.n_splits); + std::vector b_in(b.n_splits); + for (TreeDist::int16 i = 0; i < a.n_splits; ++i) { + a_ptrs[i] = a.state[i]; + a_in[i] = static_cast(a.in_split[i]); } + for (TreeDist::int16 i = 0; i < b.n_splits; ++i) { + b_ptrs[i] = b.state[i]; + b_in[i] = static_cast(b.in_split[i]); + } + + return TreeDist::mutual_clustering_score( + a_ptrs.data(), a_in.data(), a.n_splits, + b_ptrs.data(), b_in.data(), b.n_splits, + a.n_bins, static_cast(n_tips), + scratch); +} diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 6aaae498..f4b4ce85 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -1,4 +1,5 @@ #include +#include #include #include /* for unique_ptr, make_unique */ #include @@ -28,9 +29,9 @@ namespace TreeDist { } void check_ntip(const double n) { - if (n > static_cast(std::numeric_limits::max())) { - Rcpp::stop("This many tips are not (yet) supported."); - } + // Validated by R caller (nTip > 32767 guard in CalculateTreeDistance et al.) 
+ ASSERT(n <= static_cast(std::numeric_limits::max()) + && "This many tips are not (yet) supported."); } @@ -605,18 +606,14 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); return robinson_foulds_distance(x, y, static_cast(nTip[0])); } // [[Rcpp::export]] List cpp_robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return robinson_foulds_info(x, y, n_tip); @@ -625,9 +622,7 @@ List cpp_robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_matching_split_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return matching_split_distance(x, y, n_tip); @@ -637,9 +632,7 @@ List cpp_matching_split_distance(const RawMatrix &x, const RawMatrix &y, List cpp_jaccard_similarity(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip, const NumericVector &k, const LogicalVector &allowConflict) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return 
jaccard_similarity(x, y, n_tip, k, allowConflict); @@ -648,9 +641,7 @@ List cpp_jaccard_similarity(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_msi_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return msi_distance(x, y, n_tip); @@ -659,9 +650,7 @@ List cpp_msi_distance(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_mutual_clustering(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return mutual_clustering(x, y, n_tip); @@ -670,9 +659,7 @@ List cpp_mutual_clustering(const RawMatrix &x, const RawMatrix &y, // [[Rcpp::export]] List cpp_shared_phylo(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { - if (x.cols() != y.cols()) { - Rcpp::stop("Input splits must address same number of tips."); - } + ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); const int32 n_tip = static_cast(nTip[0]); TreeDist::check_ntip(n_tip); return shared_phylo(x, y, n_tip); diff --git a/src/tree_distances.h b/src/tree_distances.h index f5ba93e7..8d4f7b4f 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -2,18 +2,25 @@ #define _TREEDIST_TREE_DISTANCES_H #include "lap.h" +#include #include -/***** Constants requiring initialization *****/ +/***** Re-export shared lookup tables to global scope *****/ + +using TreeDist::lg2; +using TreeDist::lg2_double_factorial; +using TreeDist::lg2_unrooted; +using TreeDist::lg2_rooted; + +/***** Constants 
*****/ constexpr splitbit ALL_ONES = (std::numeric_limits::max)(); -extern double lg2[SL_MAX_TIPS + 1]; -extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -extern double lg2_unrooted[SL_MAX_TIPS + 2]; -extern double *lg2_rooted; namespace TreeDist { + // Re-exported from mutual_clustering.h: + // ic_matching(int16 a, int16 b, int16 n) + // See equation 16 in Meila 2007 (k' denoted K). // nkK is converted to pkK in the calling function when divided by n. inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk, @@ -52,14 +59,6 @@ namespace TreeDist { } -[[nodiscard]] inline double ic_matching(const int16 a, const int16 b, const int16 n) noexcept { - const double lg2a = lg2[a]; - const double lg2b = lg2[b]; - const double lg2n = lg2[n]; - return (a + b) * lg2n - a * lg2a - b * lg2b; - // (a * (lg2n - lg2a)) + (b * (lg2n - lg2b)); is substantially slower - } - [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok if (a == b) { diff --git a/tests/testthat/test-hmi.cpp.R b/tests/testthat/test-hmi.cpp.R index 78ad0f22..cc883e4a 100644 --- a/tests/testthat/test-hmi.cpp.R +++ b/tests/testthat/test-hmi.cpp.R @@ -6,9 +6,10 @@ test_that("HMI fails nicely", { expect_error(HierarchicalMutualInfo(bal5, pec5, normalize = "Error"), "`normalize` must be logical, or a function") - expect_error(EHMI_xptr(bal5, pec5, tolerance = 1e-16), - ".olerance too low") - expect_error(EHMI_xptr(bal5, pec5, minResample = 1), + # R-level guards in EHMI(); C++ uses ASSERT only + expect_error(EHMI(BalancedTree(5), PectinateTree(5), precision = 1e-16), + "Tolerance too low") + expect_error(EHMI(BalancedTree(5), PectinateTree(5), minResample = 1), "Must perform at least one resampl") }) diff --git a/tests/testthat/test-mast.R b/tests/testthat/test-mast.R index 8a6235ce..7d3e9289 100644 --- a/tests/testthat/test-mast.R +++ b/tests/testthat/test-mast.R @@ 
-1,7 +1,9 @@ library("TreeTools") test_that("MAST fails gracefully", { - expect_error(cpp_mast(BalancedTree(7)$edge, BalancedTree(8)$edge, 7)) # Different sizes + # R-level guard catches edge count mismatch + expect_error(.MASTSizeEdges(BalancedTree(7)$edge, BalancedTree(8)$edge, 7), + "same number of edges") expect_error(MASTSize(BalancedTree(8), UnrootTree(BalancedTree(8)))) expect_error(MASTSize(BalancedTree(10000), PectinateTree(10000))) # Too large diff --git a/tests/testthat/test-mci_impl.R b/tests/testthat/test-mci_impl.R new file mode 100644 index 00000000..58d72da2 --- /dev/null +++ b/tests/testthat/test-mci_impl.R @@ -0,0 +1,53 @@ +test_that("Header mutual_clustering_score matches MutualClusteringInfo", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + + bal8 <- BalancedTree(8) + pec8 <- PectinateTree(8) + star8 <- StarTree(8) + + tips <- TipLabels(bal8) + n_tip <- length(tips) + splits_bal <- as.Splits(bal8, tips) + splits_pec <- as.Splits(pec8, tips) + splits_star <- as.Splits(star8, tips) + + # Score-only from the installable-header implementation + impl_score <- TreeDist:::cpp_mci_impl_score + + impl_bal_pec <- impl_score(splits_bal, splits_pec, n_tip) + impl_bal_bal <- impl_score(splits_bal, splits_bal, n_tip) + impl_star <- impl_score(splits_bal, splits_star, n_tip) + + # Reference from MutualClusteringInfo (unnormalized score) + ref_bal_pec <- MutualClusteringInfo(bal8, pec8) + ref_bal_bal <- MutualClusteringInfo(bal8, bal8) + + expect_equal(impl_bal_pec, ref_bal_pec, tolerance = 1e-10) + expect_equal(impl_bal_bal, ref_bal_bal, tolerance = 1e-10) + expect_equal(impl_star, 0) +}) + +test_that("Header MCI covers exact-match early exit and partial LAP", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + impl_score <- TreeDist:::cpp_mci_impl_score + + # Two identical trees → all splits match exactly (early exit path) + bal20 <- BalancedTree(20) + tips <- TipLabels(bal20) + n_tip <- 
length(tips) + splits20 <- as.Splits(bal20, tips) + + result <- impl_score(splits20, splits20, n_tip) + expect_equal(result, MutualClusteringInfo(bal20, bal20), tolerance = 1e-10) + + # Trees that share some but not all splits → partial match + LAP + pec20 <- PectinateTree(20) + splits_pec20 <- as.Splits(pec20, tips) + + result2 <- impl_score(splits20, splits_pec20, n_tip) + expect_equal(result2, MutualClusteringInfo(bal20, pec20), tolerance = 1e-10) +}) diff --git a/tests/testthat/test-transfer_consensus.R b/tests/testthat/test-transfer_consensus.R new file mode 100644 index 00000000..e038e095 --- /dev/null +++ b/tests/testthat/test-transfer_consensus.R @@ -0,0 +1,700 @@ +test_that("Identical trees return fully resolved consensus", { + tree <- as.phylo(0, nTip = 10) + trees <- structure(rep(list(tree), 10), class = "multiPhylo") + + tc <- TransferConsensus(trees, greedy = "first") + expect_equal(NSplits(tc), NSplits(tree)) + + tc_best <- TransferConsensus(trees, greedy = "best") + expect_equal(NSplits(tc_best), NSplits(tree)) +}) + +test_that("Star tree returned when no signal", { + set.seed(6129) + # Fully random trees with many tips — negligible split overlap + trees <- structure(lapply(1:30, function(i) TreeTools::RandomTree(50, root = TRUE)), + class = "multiPhylo") + tc <- TransferConsensus(trees, greedy = "first") + + # Should be very unresolved (0 or near-0 splits) + expect_lte(NSplits(tc), 5) +}) + +test_that("Transfer consensus is at least as resolved as majority-rule for structured trees", +{ + # Trees from as.phylo with moderate overlap + trees <- as.phylo(0:29, nTip = 20) + tc <- TransferConsensus(trees, greedy = "best") + mr <- Consensus(trees, p = 0.5) + + # Transfer consensus should be at least as resolved + expect_gte(NSplits(tc), NSplits(mr)) +}) + +test_that("Both greedy strategies produce valid trees", { + trees <- as.phylo(1:15, nTip = 12) + + tc_first <- TransferConsensus(trees, greedy = "first") + tc_best <- TransferConsensus(trees, greedy = 
"best") + + expect_s3_class(tc_first, "phylo") + expect_s3_class(tc_best, "phylo") + expect_equal(sort(TipLabels(tc_first)), sort(TipLabels(trees[[1]]))) + expect_equal(sort(TipLabels(tc_best)), sort(TipLabels(trees[[1]]))) +}) + +test_that("scale = FALSE (unscaled) works", { + trees <- as.phylo(1:10, nTip = 10) + + tc_scaled <- TransferConsensus(trees, scale = TRUE) + tc_unscaled <- TransferConsensus(trees, scale = FALSE) + + expect_s3_class(tc_scaled, "phylo") + expect_s3_class(tc_unscaled, "phylo") +}) + +test_that("init = 'majority' works", { + trees <- as.phylo(0:19, nTip = 15) + + tc_empty <- TransferConsensus(trees, init = "empty") + tc_maj <- TransferConsensus(trees, init = "majority") + + expect_s3_class(tc_empty, "phylo") + expect_s3_class(tc_maj, "phylo") + # Both should produce reasonable trees + expect_gte(NSplits(tc_empty), 1) + expect_gte(NSplits(tc_maj), 1) +}) + +test_that("Error on bad input", { + expect_error(TransferConsensus(list(TreeTools::BalancedTree(5))), "multiPhylo") + expect_error(TransferConsensus( + structure(list(TreeTools::BalancedTree(5)), class = "multiPhylo")), + "at least 2") +}) + +test_that("Two-tree consensus returns a valid tree", { + trees <- as.phylo(1:2, nTip = 8) + tc <- TransferConsensus(trees) + expect_s3_class(tc, "phylo") + expect_equal(length(TipLabels(tc)), 8) +}) + + +# ========================================================================= +# C++ edge cases (transfer_consensus.cpp coverage) +# ========================================================================= + +test_that("TransferConsensus returns star tree for all-star input", { + star <- StarTree(8) + trees <- structure(rep(list(star), 3), class = "multiPhylo") + tc <- TransferConsensus(trees) + expect_s3_class(tc, "phylo") + expect_equal(NSplits(tc), 0) +}) + +test_that("TransferConsensus covers all parameter combinations", { + trees <- as.phylo(0:9, nTip = 10) + + # greedy="first", init="majority", scale=FALSE + tc <- TransferConsensus(trees, greedy 
= "first", init = "majority", + scale = FALSE) + expect_s3_class(tc, "phylo") + + # greedy="best", init="majority", scale=FALSE + tc2 <- TransferConsensus(trees, greedy = "best", init = "majority", + scale = FALSE) + expect_s3_class(tc2, "phylo") + + # greedy="first", init="empty", scale=FALSE + tc3 <- TransferConsensus(trees, greedy = "first", init = "empty", + scale = FALSE) + expect_s3_class(tc3, "phylo") +}) + +test_that("Greedy remove path fires with conflicting majority splits", { + # Two groups of trees with very different topologies: majority-init seeds + # splits from the larger group, some of which the greedy then removes. + t1 <- as.phylo(0, nTip = 12) + t2 <- as.phylo(100000, nTip = 12) + trees <- structure(c(rep(list(t1), 6), rep(list(t2), 5)), + class = "multiPhylo") + + # greedy="best" exercises do_remove + find_second in GreedyState + tc_best_s <- TransferConsensus(trees, greedy = "best", init = "majority", + scale = TRUE) + expect_s3_class(tc_best_s, "phylo") + + tc_best_u <- TransferConsensus(trees, greedy = "best", init = "majority", + scale = FALSE) + expect_s3_class(tc_best_u, "phylo") + + # greedy="first" exercises the same remove path via greedy_first + + tc_first_s <- TransferConsensus(trees, greedy = "first", init = "majority", + scale = TRUE) + expect_s3_class(tc_first_s, "phylo") + + tc_first_u <- TransferConsensus(trees, greedy = "first", init = "majority", + scale = FALSE) + expect_s3_class(tc_first_u, "phylo") +}) + +test_that("Greedy exercises diverse topologies (extra C++ path coverage)", { + # Random trees with many tips — sparse split overlap + set.seed(5123) + rand_trees <- structure( + lapply(1:30, function(i) TreeTools::RandomTree(10, root = TRUE)), + class = "multiPhylo") + # Diverse indexed trees + diverse_trees <- as.phylo(seq(0, 500, by = 25), nTip = 20) + # Three conflicting groups + mixed_trees <- structure(c( + rep(list(as.phylo(0, 8)), 5), + rep(list(as.phylo(300, 8)), 5), + rep(list(as.phylo(9999, 8)), 3) + ), class = 
"multiPhylo") + + for (trees in list(rand_trees, diverse_trees, mixed_trees)) { + for (gr in c("best", "first")) { + for (init in c("empty", "majority")) { + for (sc in c(TRUE, FALSE)) { + tc <- TransferConsensus(trees, greedy = gr, init = init, scale = sc) + expect_s3_class(tc, "phylo") + } + } + } + } +}) + +test_that("cpp_tc_profile() runs without error", { + trees <- as.phylo(0:4, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + splitsList <- lapply(trees, function(tr) unclass(as.Splits(tr, tipLabels))) + + # Greedy best, empty init + res <- TreeDist:::cpp_tc_profile(splitsList, 8L, TRUE, TRUE, FALSE, 1L, 1L) + expect_length(res, 5) + expect_true(all(res >= 0)) + expect_equal(names(res), + c("pool_splits", "transfer_dist_mat", "compute_td", + "compat_mat", "greedy")) + + # Greedy first, majority init, unscaled + res2 <- TreeDist:::cpp_tc_profile(splitsList, 8L, FALSE, FALSE, TRUE, 1L, 1L) + expect_length(res2, 5) + expect_true(all(res2 >= 0)) + + # Multiple iterations + res3 <- TreeDist:::cpp_tc_profile(splitsList, 8L, TRUE, TRUE, FALSE, 3L, 1L) + expect_length(res3, 5) + + # Greedy best, majority init + res4 <- TreeDist:::cpp_tc_profile(splitsList, 8L, TRUE, TRUE, TRUE, 1L, 1L) + expect_length(res4, 5) +}) + + +# ========================================================================= +# R-level internal helper functions (transfer_consensus.R lines 94-551) +# ========================================================================= + +test_that(".FlipRaw() flips bits and masks correctly", { + # 8 tips (1 byte, all bits used, mask = 0xFF) + expect_equal(TreeDist:::.FlipRaw(as.raw(0x07), 8), as.raw(0xf8)) + + # 5 tips (1 byte, 5 bits, mask = 0x1F) + expect_equal(TreeDist:::.FlipRaw(as.raw(0x07), 5), as.raw(0x18)) + + # Multi-byte: 10 tips (2 bytes, last byte 2 bits, mask = 0x03) + result <- TreeDist:::.FlipRaw(as.raw(c(0x00, 0x01)), 10) + expect_equal(result, as.raw(c(0xff, 0x02))) +}) + +test_that(".TransferDistMat() computes pairwise transfer distances", { + 
splitMat <- matrix(c( + TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, + TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, + FALSE, FALSE, FALSE, TRUE, TRUE, TRUE + ), nrow = 3, byrow = TRUE) + + DIST <- TreeDist:::.TransferDistMat(splitMat, 6) + expect_equal(dim(DIST), c(3, 3)) + expect_equal(diag(DIST), c(0, 0, 0)) + expect_true(isSymmetric(DIST)) + # {1,2} vs {1,2,3}: hamming = 1 + expect_equal(DIST[1, 2], 1) + # {1,2,3} vs {4,5,6}: complements → transfer = 0 + expect_equal(DIST[2, 3], 0) + + # Single split + DIST1 <- TreeDist:::.TransferDistMat( + matrix(c(TRUE, TRUE, FALSE, FALSE), nrow = 1), 4) + expect_equal(dim(DIST1), c(1, 1)) + expect_equal(DIST1[1, 1], 0) +}) + +test_that(".CompatMat() checks bipartition compatibility", { + splitMat <- matrix(c( + TRUE, TRUE, FALSE, FALSE, FALSE, FALSE, + FALSE, FALSE, TRUE, TRUE, FALSE, FALSE, + TRUE, FALSE, TRUE, FALSE, FALSE, FALSE + ), nrow = 3, byrow = TRUE) + + compat <- TreeDist:::.CompatMat(splitMat, 6) + expect_equal(dim(compat), c(3, 3)) + expect_true(compat[1, 2]) # {1,2} and {3,4}: disjoint → compatible + expect_false(compat[1, 3]) # {1,2} and {1,3}: all 4 intersections → incompatible + expect_true(isSymmetric(compat)) +}) + +test_that(".ComputeTD() works for both scale modes", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + lightSide <- c(3, 2, 3) + sentDist <- lightSide - 1L + treeMembers <- list(c(1, 2), c(2, 3)) + + td_s <- TreeDist:::.ComputeTD(DIST, sentDist, treeMembers, lightSide, 2, TRUE) + td_u <- TreeDist:::.ComputeTD(DIST, sentDist, treeMembers, lightSide, 2, FALSE) + expect_length(td_s, 3) + expect_length(td_u, 3) + expect_true(all(td_s >= 0)) + expect_true(all(td_u >= 0)) + + # Single member per tree + td_single <- TreeDist:::.ComputeTD( + DIST, sentDist, list(c(1), c(3)), lightSide, 2, TRUE) + expect_length(td_single, 3) +}) + +test_that(".IsCompat() handles edge cases", { + compat <- matrix(TRUE, 3, 3) + compat[1, 3] <- compat[3, 1] <- FALSE + + # Empty included → always compatible + 
expect_true(TreeDist:::.IsCompat(1, c(FALSE, FALSE, FALSE), compat, 10)) + # Too many included (>= nTip - 3) → never compatible + expect_false(TreeDist:::.IsCompat(1, c(TRUE, TRUE, TRUE), compat, 6)) + # Compatible with included set + expect_true(TreeDist:::.IsCompat(1, c(FALSE, TRUE, FALSE), compat, 10)) + # Incompatible with an included split + expect_false(TreeDist:::.IsCompat(1, c(FALSE, FALSE, TRUE), compat, 10)) +}) + +test_that(".Dist() returns correct distance", { + DIST <- matrix(c(0, 2, 3, 2, 0, 4, 3, 4, 0), 3, 3) + sentDist <- c(5, 6, 7) + + expect_equal(TreeDist:::.Dist(1, 2, DIST, sentDist), 2) + expect_equal(TreeDist:::.Dist(1, NA, DIST, sentDist), 5) + expect_equal(TreeDist:::.Dist(3, NA, DIST, sentDist), 7) +}) + +test_that(".DiagDist() vectorizes correctly", { + DIST <- matrix(c(0, 2, 3, 2, 0, 4, 3, 4, 0), 3, 3) + sentDist <- c(5, 6, 7) + + expect_equal( + TreeDist:::.DiagDist(c(2, NA, 1), DIST, sentDist), + c(DIST[1, 2], 6, DIST[3, 1]) + ) + expect_equal(TreeDist:::.DiagDist(c(NA, NA, NA), DIST, sentDist), sentDist) + expect_equal( + TreeDist:::.DiagDist(c(2, 3, 1), DIST, sentDist), + c(DIST[1, 2], DIST[2, 3], DIST[3, 1]) + ) +}) + +test_that(".FindSecond() works for both scale modes and edge cases", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + pMinus1 <- c(4, 4, 4) + + # Unscaled: exclude matchIdx, find best + expect_equal( + TreeDist:::.FindSecond(1, 2, c(1, 2, 3), DIST, pMinus1, FALSE), 1) + + # matchIdx = NA → search all + expect_equal( + TreeDist:::.FindSecond(1, NA_integer_, c(2, 3), DIST, pMinus1, FALSE), 2) + + # No candidates + expect_true(is.na( + TreeDist:::.FindSecond(1, 2, c(2), DIST, pMinus1, FALSE))) + + # All exceed threshold (unscaled) + DIST_far <- matrix(c(0, 5, 5, 5, 0, 5, 5, 5, 0), 3, 3) + expect_true(is.na( + TreeDist:::.FindSecond(1, NA_integer_, c(2, 3), DIST_far, c(2, 2, 2), FALSE) + )) + + # Scaled: normal case + expect_equal( + TreeDist:::.FindSecond(1, NA_integer_, c(2, 3), DIST, pMinus1, TRUE), 2) + + # 
Scaled: exceeds threshold + expect_true(is.na( + TreeDist:::.FindSecond(1, NA_integer_, c(2), DIST_far, c(2, 2, 2), TRUE) + )) +}) + +test_that(".DoAdd() updates match state correctly", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + sentDist <- c(4, 4, 4) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, 3) + st$MATCH2 <- rep(NA_integer_, 3) + st$incl <- rep(FALSE, 3) + + # First add: all splits match the newly added split + TreeDist:::.DoAdd(2, st, DIST, sentDist) + expect_true(st$incl[2]) + expect_equal(st$MATCH[1], 2) + expect_equal(st$MATCH[2], 2) + expect_equal(st$MATCH[3], 2) + + # Second add: closer splits update, old match demotes to second + TreeDist:::.DoAdd(1, st, DIST, sentDist) + expect_equal(st$MATCH[1], 1) # self-match closer + expect_equal(st$MATCH2[1], 2) # old match becomes second + expect_equal(st$MATCH[2], 2) # split 2 still closest to itself + expect_equal(st$MATCH2[2], 1) # split 1 becomes second +}) + +test_that(".DoRemove() handles sentinel promotion and affected2", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + sentDist <- c(4, 4, 4) + lightSide <- c(5, 5, 5) + + # Case 1: sentinel promotion (MATCH2 = NA, remove MATCH, no other included) + st <- new.env(parent = emptyenv()) + st$MATCH <- c(2L, NA_integer_, 2L) + st$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st$incl <- c(FALSE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st, DIST, sentDist, lightSide, FALSE) + expect_true(is.na(st$MATCH[1])) + expect_true(is.na(st$MATCH[3])) + + # Case 2: sentinel promotion with rescan finding another match + st2 <- new.env(parent = emptyenv()) + st2$MATCH <- c(2L, 2L, NA_integer_) + st2$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st2$incl <- c(TRUE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st2, DIST, sentDist, lightSide, FALSE) + expect_equal(st2$MATCH[1], 1L) # rescanned, found split 1 + + # Case 3: affected2 path (MATCH2 == removed, MATCH != removed) + st3 <- new.env(parent = emptyenv()) + st3$MATCH 
<- c(1L, 1L, 1L) + st3$MATCH2 <- c(2L, 2L, NA_integer_) + st3$incl <- c(TRUE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st3, DIST, sentDist, lightSide, FALSE) + expect_equal(st3$MATCH, c(1L, 1L, 1L)) # unchanged + expect_true(is.na(st3$MATCH2[1])) + expect_true(is.na(st3$MATCH2[2])) + + # Case 4: scaled mode + st4 <- new.env(parent = emptyenv()) + st4$MATCH <- c(2L, NA_integer_, NA_integer_) + st4$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st4$incl <- c(TRUE, TRUE, FALSE) + + TreeDist:::.DoRemove(2, st4, DIST, sentDist, lightSide, TRUE) + expect_equal(st4$MATCH[1], 1L) +}) + +test_that(".AddBenefitVec() computes add benefits", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + lightSide <- c(3, 3, 3) + sentDist <- lightSide - 1L + counts <- c(2, 3, 1) + TD <- c(0.5, 0.3, 0.8) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, 3) + st$MATCH2 <- rep(NA_integer_, 3) + st$incl <- rep(FALSE, 3) + + ben_s <- TreeDist:::.AddBenefitVec( + c(1, 2, 3), st, DIST, sentDist, TD, counts, lightSide, scale = TRUE) + expect_length(ben_s, 3) + + ben_u <- TreeDist:::.AddBenefitVec( + c(1, 2, 3), st, DIST, sentDist, TD, counts, lightSide, scale = FALSE) + expect_length(ben_u, 3) +}) + +test_that(".RemoveBenefitVec() computes remove benefits", { + DIST <- matrix(c(0, 1, 3, 1, 0, 2, 3, 2, 0), 3, 3) + lightSide <- c(3, 3, 3) + sentDist <- lightSide - 1L + counts <- c(2, 3, 1) + TD <- c(0.5, 0.3, 0.8) + + st <- new.env(parent = emptyenv()) + st$MATCH <- c(2L, 1L, 2L) + st$MATCH2 <- c(NA_integer_, NA_integer_, NA_integer_) + st$incl <- c(TRUE, TRUE, FALSE) + + ben_s <- TreeDist:::.RemoveBenefitVec( + c(1, 2), st, DIST, sentDist, TD, counts, lightSide, scale = TRUE) + expect_length(ben_s, 2) + + ben_u <- TreeDist:::.RemoveBenefitVec( + c(1, 2), st, DIST, sentDist, TD, counts, lightSide, scale = FALSE) + expect_length(ben_u, 2) + + # No affected splits (MATCH doesn't reference candidate) + st2 <- new.env(parent = emptyenv()) + st2$MATCH <- c(1L, 1L, 1L) + 
st2$MATCH2 <- rep(NA_integer_, 3) + st2$incl <- c(TRUE, TRUE, FALSE) + + ben_none <- TreeDist:::.RemoveBenefitVec( + c(2), st2, DIST, sentDist, TD, counts, lightSide, scale = TRUE) + # Only TD contribution, no fn_cost (no affected splits) + expect_equal(ben_none, TD[2]) +}) + +test_that(".PoolSplits() pools and deduplicates splits", { + trees <- as.phylo(0:4, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + + expect_true(is.matrix(pool$splits)) + expect_true(is.matrix(pool$rawSplits)) + expect_equal(ncol(pool$splits), length(tipLabels)) + expect_equal(length(pool$counts), nrow(pool$splits)) + expect_equal(length(pool$lightSide), nrow(pool$splits)) + expect_equal(length(pool$treeMembers), length(trees)) + expect_true(all(pool$counts >= 1)) + expect_true(all(pool$lightSide >= 1)) +}) + +test_that(".SplitsToPhylo() converts splits to tree or star", { + trees <- as.phylo(0:2, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + + # Some splits included + included <- rep(FALSE, nrow(pool$rawSplits)) + included[1:min(2, nrow(pool$rawSplits))] <- TRUE + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, included, tipLabels, 8) + expect_s3_class(tr, "phylo") + expect_equal(sort(TipLabels(tr)), sort(tipLabels)) + + # No splits → star tree + tr_star <- TreeDist:::.SplitsToPhylo( + pool$rawSplits, rep(FALSE, nrow(pool$rawSplits)), tipLabels, 8) + expect_s3_class(tr_star, "phylo") + expect_equal(NSplits(tr_star), 0) +}) + +test_that(".InitMatches() initializes match state from included splits", { + # DIST: included = {1,2}; split 3 close to split 2 (dist 1 < thresh 2) + DIST <- matrix(c( + 0, 1, 3, 5, + 1, 0, 1, 4, + 3, 1, 0, 1, + 5, 4, 1, 0 + ), 4, 4, byrow = TRUE) + lightSide <- c(3, 3, 3, 3) + sentDist <- lightSide - 1L # all 2 + + # Scaled mode + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, 4) + st$MATCH2 <- rep(NA_integer_, 4) + st$incl <- c(TRUE, TRUE, 
FALSE, FALSE) + + TreeDist:::.InitMatches(st, DIST, sentDist, lightSide, scale = TRUE) + # Split 1: self-match (dist 0), second = split 2 (dist 1) + expect_equal(st$MATCH[1], 1) + expect_equal(st$MATCH2[1], 2) + # Split 3: best among included is split 2 (dist 1), 1/2 < 1 → matched + expect_equal(st$MATCH[3], 2) + # Split 4: best among included is split 2 (dist 4), 4/2 >= 1 → skipped + expect_true(is.na(st$MATCH[4])) + + # Unscaled mode + st2 <- new.env(parent = emptyenv()) + st2$MATCH <- rep(NA_integer_, 4) + st2$MATCH2 <- rep(NA_integer_, 4) + st2$incl <- c(TRUE, TRUE, FALSE, FALSE) + + TreeDist:::.InitMatches(st2, DIST, sentDist, lightSide, scale = FALSE) + expect_equal(st2$MATCH[1], 1) + expect_equal(st2$MATCH[3], 2) + + # Empty included set → no-op + st3 <- new.env(parent = emptyenv()) + st3$MATCH <- rep(NA_integer_, 4) + st3$MATCH2 <- rep(NA_integer_, 4) + st3$incl <- rep(FALSE, 4) + + TreeDist:::.InitMatches(st3, DIST, sentDist, lightSide, scale = TRUE) + expect_true(all(is.na(st3$MATCH))) +}) + + +# ========================================================================= +# Full R-level greedy pipeline integration tests +# ========================================================================= + +test_that("R-level GreedyBest pipeline produces valid consensus (scaled)", { + trees <- as.phylo(0:9, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = TRUE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- rep(FALSE, nSplits) + + TreeDist:::.GreedyBest(st, DIST, 
sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = TRUE, nSplits, nTip) + expect_true(any(st$incl)) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R-level GreedyFirst pipeline produces valid consensus (unscaled)", { + trees <- as.phylo(0:9, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = FALSE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- rep(FALSE, nSplits) + + TreeDist:::.GreedyFirst(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = FALSE, nSplits, nTip) + expect_true(any(st$incl)) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R-level GreedyBest with majority init exercises InitMatches", { + trees <- as.phylo(0:14, nTip = 10) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = TRUE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + + # Majority init: include splits present 
in > half of trees + st$incl <- pool$counts > (nTree / 2) + if (any(st$incl)) { + TreeDist:::.InitMatches(st, DIST, sentDist, pool$lightSide, scale = TRUE) + } + + TreeDist:::.GreedyBest(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = TRUE, nSplits, nTip) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R-level GreedyFirst with majority init (unscaled)", { + trees <- as.phylo(0:14, nTip = 10) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, pool$treeMembers, pool$lightSide, + nTree, scale = FALSE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- pool$counts > (nTree / 2) + if (any(st$incl)) { + TreeDist:::.InitMatches(st, DIST, sentDist, pool$lightSide, scale = FALSE) + } + + TreeDist:::.GreedyFirst(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = FALSE, nSplits, nTip) + + tr <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + expect_s3_class(tr, "phylo") +}) + +test_that("R greedy pipeline matches C++ consensus", { + trees <- as.phylo(0:9, nTip = 8) + tipLabels <- TipLabels(trees[[1]]) + nTip <- length(tipLabels) + nTree <- length(trees) + + # C++ result + tc_cpp <- TransferConsensus(trees, greedy = "best", scale = TRUE) + + # R-level result + pool <- TreeDist:::.PoolSplits(trees, tipLabels) + nSplits <- nrow(pool$splits) + DIST <- TreeDist:::.TransferDistMat(pool$splits, nTip) + sentDist <- pool$lightSide - 1L + TD <- TreeDist:::.ComputeTD(DIST, sentDist, 
pool$treeMembers, pool$lightSide, + nTree, scale = TRUE) + compat <- TreeDist:::.CompatMat(pool$splits, nTip) + sortOrd <- order(-pool$counts, seq_len(nSplits)) + + st <- new.env(parent = emptyenv()) + st$MATCH <- rep(NA_integer_, nSplits) + st$MATCH2 <- rep(NA_integer_, nSplits) + st$incl <- rep(FALSE, nSplits) + + TreeDist:::.GreedyBest(st, DIST, sentDist, TD, pool$counts, pool$lightSide, + compat, sortOrd, scale = TRUE, nSplits, nTip) + + tc_r <- TreeDist:::.SplitsToPhylo(pool$rawSplits, st$incl, tipLabels, nTip) + + # Both should produce resolved trees with same tip labels + expect_s3_class(tc_r, "phylo") + expect_equal(sort(TipLabels(tc_r)), sort(TipLabels(tc_cpp))) + # Resolution should be similar (not necessarily identical due to + # possible differences in canonicalization/sort-order between R and C++) + expect_gte(NSplits(tc_r), 1) +}) diff --git a/tests/testthat/test-tree_distance.R b/tests/testthat/test-tree_distance.R index cd96e3e9..67319da5 100644 --- a/tests/testthat/test-tree_distance.R +++ b/tests/testthat/test-tree_distance.R @@ -76,16 +76,10 @@ test_that("Size mismatch causes error", { expect_error(MeilaVariationOfInformation(splits7, splits8), "Split lengths differ") - Test <- function(Func) { - expect_error(Func(splits8, as.Splits(BalancedTree(9)), 8)) - } - Test(cpp_robinson_foulds_distance) - Test(cpp_robinson_foulds_info) - Test(cpp_matching_split_distance) - Test(cpp_jaccard_similarity) - Test(cpp_msi_distance) - Test(cpp_mutual_clustering) - Test(cpp_shared_phylo) + # Validation is now in R (.ValidateSplitArgs), not C++ + splits9 <- as.Splits(BalancedTree(9)) + expect_error(.ValidateSplitArgs(splits8, splits9, 8L), + "same number of tips") }) test_that("Metrics handle polytomies", { diff --git a/tests/testthat/test-tree_distance_nni.R b/tests/testthat/test-tree_distance_nni.R index 8569231f..20bcda23 100644 --- a/tests/testthat/test-tree_distance_nni.R +++ b/tests/testthat/test-tree_distance_nni.R @@ -8,9 +8,8 @@ test_that("NNIDist() 
handles exceptions", { expect_error(NNIDist(list(PectinateTree(1:8), PectinateTree(as.character(1:8)))), "trees must bear identical labels") - expect_error(cpp_nni_distance( - PectinateTree(40000)$edge, # Will fail before not being postorder is problem - BalancedTree(40000)$edge, 40000), "so many tips") + # R-level guard catches too-many-tips + expect_error(NNIDist(PectinateTree(40000), BalancedTree(40000)), "so many tips") expect_error(NNIDist(BalancedTree(5), RootOnNode(BalancedTree(5), 1))) diff --git a/tests/testthat/test-tree_distance_transfer.R b/tests/testthat/test-tree_distance_transfer.R new file mode 100644 index 00000000..f3882a73 --- /dev/null +++ b/tests/testthat/test-tree_distance_transfer.R @@ -0,0 +1,494 @@ +test_that("TransferDist() returns 0 for identical trees", { + tr <- TreeTools::BalancedTree(8) + expect_equal(0, TransferDist(tr, tr)) + expect_equal(0, TransferDist(tr, tr, scale = FALSE)) +}) + + +test_that("TransferDist() is symmetric", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + expect_equal(TransferDist(bal8, pec8), TransferDist(pec8, bal8)) + expect_equal(TransferDist(bal8, pec8, scale = FALSE), + TransferDist(pec8, bal8, scale = FALSE)) +}) + + +test_that("TransferDist() scaled is bounded by RF", { + # The scaled transfer dissimilarity is bounded above by the RF distance + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + td_scaled <- TransferDist(bal8, pec8) + rf <- RobinsonFoulds(bal8, pec8) + expect_lte(td_scaled, rf) +}) + + +test_that("TransferDist() normalization", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + # Scaled normalization denominator = 2 * (n - 3) = 10 + td_raw <- TransferDist(bal8, pec8) + td_norm <- TransferDist(bal8, pec8, normalize = TRUE) + expect_equal(td_norm, td_raw / (2 * (8 - 3))) + expect_true(td_norm >= 0) + expect_true(td_norm <= 1) + + # Unscaled normalization + td_raw_u <- TransferDist(bal8, pec8, scale = FALSE) + 
td_norm_u <- TransferDist(bal8, pec8, scale = FALSE, normalize = TRUE) + # Even n: denom = (n^2 - 2n + 4) / 4 = (64 - 16 + 4) / 4 = 13 + expect_equal(td_norm_u, td_raw_u / 13) +}) + + +test_that("TransferDist() all-pairs", { + trees <- as.phylo(0:5, 8) + + # All-pairs + d <- TransferDist(trees) + expect_s3_class(d, "dist") + expect_equal(attr(d, "Size"), 6L) + + # Check diagonal = 0 implicitly (dist objects don't store diagonal) + # Check that batch matches pairwise + dm <- as.matrix(d) + for (i in 1:5) { + for (j in (i + 1):6) { + expected <- TransferDist(trees[[i]], trees[[j]]) + expect_equal(dm[i, j], expected, tolerance = 1e-12, + info = paste("pair", i, j)) + } + } +}) + + +test_that("TransferDist() cross-pairs", { + trees1 <- as.phylo(0:2, 8) + trees2 <- as.phylo(3:5, 8) + + mat <- TransferDist(trees1, trees2) + expect_equal(dim(mat), c(3, 3)) + + # Check matches pairwise + for (i in 1:3) { + for (j in 1:3) { + expected <- TransferDist(trees1[[i]], trees2[[j]]) + expect_equal(mat[i, j], expected, tolerance = 1e-12, + info = paste("cross", i, j)) + } + } +}) + + +test_that("TransferDist() single tree vs list", { + tr <- TreeTools::BalancedTree(8) + trees <- as.phylo(0:3, 8) + + d1 <- TransferDist(tr, trees) + d2 <- TransferDist(trees, tr) + + expect_length(d1, 4) + expect_length(d2, 4) + + for (i in seq_along(trees)) { + expected <- TransferDist(tr, trees[[i]]) + expect_equal(d1[i], expected, tolerance = 1e-12) + expect_equal(d2[i], expected, tolerance = 1e-12) + } +}) + + +test_that("TransferDist() scaled vs unscaled relationship", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + # Both should be non-negative + td_s <- TransferDist(bal8, pec8, scale = TRUE) + td_u <- TransferDist(bal8, pec8, scale = FALSE) + expect_true(td_s >= 0) + expect_true(td_u >= 0) + + # For identical trees, both = 0 + expect_equal(0, TransferDist(bal8, bal8, scale = TRUE)) + expect_equal(0, TransferDist(bal8, bal8, scale = FALSE)) +}) + + 
+test_that("TransferDist() reportMatching", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + res <- TransferDist(bal8, pec8, reportMatching = TRUE) + expect_equal(res[[1]], TransferDist(bal8, pec8)) + + matching <- attr(res, "matching") + expect_true(!is.null(matching)) + + pairScores <- attr(res, "pairScores") + expect_true(!is.null(pairScores)) + expect_equal(nrow(pairScores), TreeTools::NSplits(bal8)) + expect_equal(ncol(pairScores), TreeTools::NSplits(pec8)) +}) + + +test_that("TransferDist() handles star trees", { + star <- TreeTools::StarTree(8) + bal8 <- TreeTools::BalancedTree(8) + + # Star tree has no internal splits; dissimilarity = sum over binary tree only + td <- TransferDist(bal8, star) + expect_true(td >= 0) + + # Star vs star = 0 + expect_equal(0, TransferDist(star, star)) +}) + + +test_that("TransferDist() handles small trees", { + # 4 tips: minimal non-trivial case + tr1 <- TreeTools::BalancedTree(4) + tr2 <- TreeTools::PectinateTree(4) + + td <- TransferDist(tr1, tr2) + expect_true(is.finite(td)) + expect_true(td >= 0) +}) + + +test_that("TransferDist() consistent with TransferConsensus internals", { + skip_if_not_installed("TreeTools") + + # The transfer dissimilarity for a single pair should be consistent with + # how TransferConsensus computes distances internally + set.seed(6419) + trees <- structure(lapply(1:10, function(i) ape::rtree(12)), + class = "multiPhylo") + + # All-pairs: check symmetry and non-negativity + d <- TransferDist(trees) + dm <- as.matrix(d) + expect_true(all(dm >= -1e-12)) + expect_true(isSymmetric(dm, tol = 1e-12)) +}) + + +# ========================================================================= +# Additional coverage: TransferDistance alias +# ========================================================================= + +test_that("TransferDistance() is an alias for TransferDist()", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + 
expect_equal(TransferDistance(bal8, pec8), TransferDist(bal8, pec8)) + expect_equal(TransferDistance(bal8, pec8, scale = FALSE), + TransferDist(bal8, pec8, scale = FALSE)) +}) + + +# ========================================================================= +# cpp_transfer_dist() direct (both scaled + unscaled + matching) +# ========================================================================= + +test_that("cpp_transfer_dist() returns both scores and matchings", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp1 <- unclass(as.Splits(bal8, tipLabels)) + sp2 <- unclass(as.Splits(pec8, tipLabels)) + + res <- TreeDist:::cpp_transfer_dist(sp1, sp2, 8L) + + expect_true(res$score_scaled >= 0) + expect_true(res$score_unscaled >= 0) + expect_length(res$matching_xy, nrow(sp1)) + expect_length(res$matching_yx, nrow(sp2)) + + # Consistent with TransferDist + expect_equal(res$score_scaled, + TransferDist(bal8, pec8, scale = TRUE), tolerance = 1e-12) + expect_equal(res$score_unscaled, + TransferDist(bal8, pec8, scale = FALSE), tolerance = 1e-12) + + # Identical trees → distance 0, all matchings defined + res_id <- TreeDist:::cpp_transfer_dist(sp1, sp1, 8L) + expect_equal(res_id$score_scaled, 0) + expect_equal(res_id$score_unscaled, 0) +}) + +test_that("cpp_transfer_dist() handles star trees", { + star <- TreeTools::StarTree(8) + bal8 <- TreeTools::BalancedTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_star <- unclass(as.Splits(star, tipLabels)) + sp_bal <- unclass(as.Splits(bal8, tipLabels)) + + # Star vs binary: only one direction contributes + res <- TreeDist:::cpp_transfer_dist(sp_bal, sp_star, 8L) + expect_true(res$score_scaled >= 0) + expect_true(res$score_unscaled >= 0) + expect_length(res$matching_xy, nrow(sp_bal)) + # matching_yx has 0 elements for star tree + expect_length(res$matching_yx, 0) + + # Star vs star: both scores = 0 + res_ss <- TreeDist:::cpp_transfer_dist(sp_star, 
sp_star, 8L) + expect_equal(res_ss$score_scaled, 0) + expect_equal(res_ss$score_unscaled, 0) +}) + +test_that("column mismatch caught by R-level guard", { + sp1 <- matrix(as.raw(0), nrow = 1, ncol = 1) + sp2 <- matrix(as.raw(0), nrow = 1, ncol = 2) + expect_error( + TreeDist:::.ValidateSplitArgs(sp1, sp2, 8L), + "same number of tips" + ) +}) + + +# ========================================================================= +# cpp_transfer_dist_scored() edge cases +# ========================================================================= + +test_that("column mismatch caught by R-level guard (scored)", { + sp1 <- matrix(as.raw(0), nrow = 1, ncol = 1) + sp2 <- matrix(as.raw(0), nrow = 1, ncol = 2) + expect_error( + TreeDist:::.ValidateSplitArgs(sp1, sp2, 8L), + "same number of tips" + ) +}) + +test_that("cpp_transfer_dist_scored() with asymmetric split counts", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_bal <- unclass(as.Splits(bal8, tipLabels)) + sp_pec <- unclass(as.Splits(pec8, tipLabels)) + + # BalancedTree(8) has fewer splits than PectinateTree(8) + res <- TreeDist:::cpp_transfer_dist_scored(sp_bal, sp_pec, 8L, TRUE) + expect_true(res$score >= 0) + # matching length = max(nSplits1, nSplits2) + expect_equal(length(res$matching), + max(nrow(sp_bal), nrow(sp_pec))) +}) + + +# ========================================================================= +# Normalization edge cases +# ========================================================================= + +test_that(".TransferNormDenom() for odd and even nTip", { + # Even nTip = 8: (64 - 16 + 4) / 4 = 13 + expect_equal(TreeDist:::.TransferNormDenom(8, FALSE), 13) + # Odd nTip = 9: (81 - 18 + 5) / 4 = 17 + expect_equal(TreeDist:::.TransferNormDenom(9, FALSE), 17) + # Scaled: 2 * (n - 3) + expect_equal(TreeDist:::.TransferNormDenom(8, TRUE), 10) + expect_equal(TreeDist:::.TransferNormDenom(9, TRUE), 12) +}) + +test_that("TransferDist() 
normalize with odd nTip", { + trees <- as.phylo(0:3, nTip = 9) + # Scaled normalization + d <- TransferDist(trees, normalize = TRUE) + d_raw <- TransferDist(trees, normalize = FALSE) + denom <- 2 * (9 - 3) + expect_equal(as.numeric(d), as.numeric(d_raw) / denom, tolerance = 1e-12) + + # Unscaled normalization with odd nTip + d_u <- TransferDist(trees, scale = FALSE, normalize = TRUE) + d_u_raw <- TransferDist(trees, scale = FALSE, normalize = FALSE) + denom_u <- (81 - 18 + 5) / 4 + expect_equal(as.numeric(d_u), as.numeric(d_u_raw) / denom_u, tolerance = 1e-12) +}) + +test_that("TransferDist() normalize with cross-pairs", { + trees1 <- as.phylo(0:2, 8) + trees2 <- as.phylo(3:5, 8) + + mat_raw <- TransferDist(trees1, trees2) + mat_norm <- TransferDist(trees1, trees2, normalize = TRUE) + denom <- 2 * (8 - 3) + expect_equal(mat_norm, mat_raw / denom, tolerance = 1e-12) + + # Unscaled cross-pairs normalize + mat_raw_u <- TransferDist(trees1, trees2, scale = FALSE) + mat_norm_u <- TransferDist(trees1, trees2, scale = FALSE, normalize = TRUE) + denom_u <- 13 + expect_equal(mat_norm_u, mat_raw_u / denom_u, tolerance = 1e-12) +}) + +test_that("TransferDist() normalize via generic fallback (reportMatching)", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + + # reportMatching forces generic path; normalize divides after + res <- TransferDist(bal8, pec8, normalize = TRUE, reportMatching = TRUE) + raw <- TransferDist(bal8, pec8, reportMatching = TRUE) + denom <- 2 * (8 - 3) + expect_equal(res[[1]], raw[[1]] / denom, tolerance = 1e-12) + + # Unscaled + normalize + reportMatching + res_u <- TransferDist(bal8, pec8, scale = FALSE, + normalize = TRUE, reportMatching = TRUE) + raw_u <- TransferDist(bal8, pec8, scale = FALSE, reportMatching = TRUE) + expect_equal(res_u[[1]], raw_u[[1]] / 13, tolerance = 1e-12) +}) + + +# ========================================================================= +# Fast path edge cases +# 
========================================================================= + +test_that(".TransferDistAllPairs() returns NULL for edge cases", { + # Single phylo (not multiPhylo) + tr <- TreeTools::BalancedTree(8) + expect_null(TreeDist:::.TransferDistAllPairs(tr, TRUE)) + + # < 2 trees + expect_null(TreeDist:::.TransferDistAllPairs( + structure(list(tr), class = "multiPhylo"), TRUE)) + + # < 4 tips + small <- as.phylo(0:2, nTip = 3) + expect_null(TreeDist:::.TransferDistAllPairs(small, TRUE)) + + # Mismatched tip labels + t1 <- TreeTools::BalancedTree(paste0("a", 1:6)) + t2 <- TreeTools::BalancedTree(paste0("b", 1:6)) + mixed <- structure(list(t1, t2), class = "multiPhylo") + expect_null(TreeDist:::.TransferDistAllPairs(mixed, TRUE)) +}) + +test_that(".TransferDistCrossPairs() returns NULL for edge cases", { + tr1 <- TreeTools::BalancedTree(8) + tr2 <- TreeTools::PectinateTree(8) + + # Both single → delegate to generic + expect_null(TreeDist:::.TransferDistCrossPairs(tr1, tr2, TRUE)) + + # < 4 tips + small1 <- as.phylo(0:1, nTip = 3) + small2 <- as.phylo(2:3, nTip = 3) + expect_null(TreeDist:::.TransferDistCrossPairs(small1, small2, TRUE)) + + # Mismatched tip labels + t1 <- structure(list(TreeTools::BalancedTree(paste0("a", 1:6))), + class = "multiPhylo") + t2 <- structure(list(TreeTools::BalancedTree(paste0("b", 1:6))), + class = "multiPhylo") + expect_null(TreeDist:::.TransferDistCrossPairs(t1, t2, TRUE)) +}) + +test_that("TransferDist() all-pairs with star trees in mix", { + star <- TreeTools::StarTree(8) + bal <- TreeTools::BalancedTree(8) + pec <- TreeTools::PectinateTree(8) + trees <- structure(list(star, bal, pec), class = "multiPhylo") + + d <- TransferDist(trees) + dm <- as.matrix(d) + expect_equal(dm[1, 1], 0) # star self + expect_true(dm[1, 2] >= 0) # star vs binary + expect_equal(dm[2, 3], TransferDist(bal, pec), tolerance = 1e-12) +}) + +test_that("TransferDist() all-pairs with exactly 2 trees", { + trees <- as.phylo(0:1, nTip = 8) + d <- 
TransferDist(trees) + expect_s3_class(d, "dist") + expect_equal(attr(d, "Size"), 2L) + expect_equal(as.numeric(d), + TransferDist(trees[[1]], trees[[2]]), tolerance = 1e-12) +}) + +test_that("TransferDist() all-pairs with exactly 4 tips", { + trees <- as.phylo(0:3, nTip = 4) + d <- TransferDist(trees) + expect_s3_class(d, "dist") + dm <- as.matrix(d) + expect_true(all(dm >= -1e-12)) +}) + + +# ========================================================================= +# TransferDistSplits edge cases +# ========================================================================= + +test_that("TransferDistSplits() reportMatching with various configurations", { + bal8 <- TreeTools::BalancedTree(8) + pec8 <- TreeTools::PectinateTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_bal <- as.Splits(bal8, tipLabels) + sp_pec <- as.Splits(pec8, tipLabels) + + # Unscaled reportMatching + res <- TransferDistSplits(sp_bal, sp_pec, scale = FALSE, + reportMatching = TRUE) + expect_true(is.numeric(res)) + expect_true(!is.null(attr(res, "matching"))) + expect_true(!is.null(attr(res, "pairScores"))) + expect_true(!is.null(attr(res, "matchedScores"))) + + # Scaled reportMatching (already tested above, but confirm matchedSplits) + res_s <- TransferDistSplits(sp_bal, sp_pec, scale = TRUE, + reportMatching = TRUE) + expect_true(!is.null(attr(res_s, "matchedSplits"))) +}) + +test_that("TransferDistSplits() with asymmetric nSplits (nSplits1 < nSplits2)", { + # Use majority-rule consensus (fewer splits) vs fully resolved tree + trees <- as.phylo(0:9, 8) + mr <- Consensus(trees, p = 0.5) + bal8 <- TreeTools::BalancedTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + + sp_mr <- as.Splits(mr, tipLabels) + sp_bal <- as.Splits(bal8, tipLabels) + + # Confirm asymmetry + expect_lt(nrow(sp_mr), nrow(sp_bal)) + + res <- TransferDistSplits(sp_mr, sp_bal, reportMatching = TRUE) + # Matching should be truncated to nSplits1 + matching <- attr(res, "matching") + expect_equal(length(matching), 
nrow(sp_mr)) + expect_true(!is.null(attr(res, "pairScores"))) +}) + +test_that("TransferDist() for 3-tip trees uses fallback path", { + # 3 tips: fast path returns NULL, falls through to generic + tr1 <- TreeTools::BalancedTree(3) + tr2 <- TreeTools::PectinateTree(3) + td <- TransferDist(tr1, tr2) + expect_true(is.finite(td)) + expect_true(td >= 0) +}) + +test_that("TransferDist() with named tree list preserves labels", { + trees <- as.phylo(0:3, nTip = 8) + names(trees) <- paste0("tree_", 1:4) + + d <- TransferDist(trees) + expect_equal(attr(d, "Labels"), paste0("tree_", 1:4)) +}) + +test_that("TransferDist() cross-pairs with single Splits input", { + bal8 <- TreeTools::BalancedTree(8) + tipLabels <- TreeTools::TipLabels(bal8) + sp <- as.Splits(bal8, tipLabels) + trees <- as.phylo(0:3, nTip = 8) + + # Single Splits object vs list of trees + result <- TransferDist(sp, trees) + expect_length(result, 4) + expect_true(all(result >= 0)) +})