diff --git a/.github/workflows/ASan.yml b/.github/workflows/ASan.yml index 90a74a66..7c0b0a95 100644 --- a/.github/workflows/ASan.yml +++ b/.github/workflows/ASan.yml @@ -55,8 +55,9 @@ jobs: run: | export LD_PRELOAD=$(gcc -print-file-name=libasan.so) - echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" > src/Makevars - echo "PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars + # Replace PKG_CXXFLAGS in-place; preserve PKG_CPPFLAGS and PKG_LIBS + sed -i 's/^PKG_CXXFLAGS.*/PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer/' src/Makevars + echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars mkdir ~/.R echo "LDFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> ~/.R/Makevars diff --git a/.positai/settings.json b/.positai/settings.json index 627e8f7f..acec4998 100644 --- a/.positai/settings.json +++ b/.positai/settings.json @@ -30,8 +30,15 @@ "wc *": "allow" }, "external_directory": { + "C:/Users/pjjg18/GitHub/TreeSearch-a/*": "allow", + "C:/Users/pjjg18/.positai/skills/r-package-profiling/references/*": "allow", "../Quartet/*": "allow", "../TreeTools/*": "allow" + }, + "webfetch": { + "https://github.com/*": "allow", + "https://api.github.com/*": "allow", + "https://raw.githubusercontent.com/*": "allow" } }, "model": { diff --git a/AGENTS.md b/AGENTS.md index 237aa42b..6d81e883 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1177,6 +1177,52 @@ for the cross-pairs case. 
--- +## LinkingTo Header Exposure (`expose-lapjv` branch) + +Extracted LAP and MCI C++ APIs into `inst/include/TreeDist/` headers so that +downstream packages (e.g., TreeSearch) can use `LinkingTo: TreeDist`: + +| Header | Content | +|--------|---------| +| `types.h` | `cost`, `lap_dim`, `lap_row`, `lap_col`, constants | +| `cost_matrix.h` | `CostMatrix` class (Rcpp-free) | +| `lap_scratch.h` | `LapScratch` struct | +| `lap.h` | `lap()` declarations | +| `lap_impl.h` | `lap()` implementation (include in exactly one TU) | +| `mutual_clustering.h` | MCI declarations | +| `mutual_clustering_impl.h` | MCI implementation (include in exactly one TU) | + +`src/lap.h` is now a thin wrapper that includes `<TreeDist/lap.h>` and +re-exports types to global scope. + +### LAPJV codegen regression (diagnosed, partially mitigated) + +Including `lap_impl.h` in `lap.cpp` changed the TU context enough for GCC 14's +register allocator to produce ~8% more instructions in the Dijkstra hot loop, +causing a consistent 20–25% regression on standalone LAPJV (n ≥ 400). + +**Root cause:** the installed-header version of `CostMatrix` (in +`inst/include/TreeDist/cost_matrix.h`) has a different method set than main's +monolithic `src/lap.h` (extra methods like `setWithTranspose()`, `dim8()`; +missing test variants like `findRowSubminNaive`). This changes GCC's +optimization heuristics for the entire TU, even though `lap()` never calls +the differing methods. + +**Fix:** define `lap()` directly in `lap.cpp` (not via `#include <TreeDist/lap_impl.h>`) +with `__attribute__((optimize("align-functions=64", "align-loops=16")))`. +The `lapjv()` wrapper fills the transposed buffer first (matching R's +column-major storage) then untransposes — restoring the cache-friendly pattern. + +**Residual:** ~5–9% on LAPJV 1999×1999 vs main (from the CostMatrix class +definition difference). Tree distance metrics are unaffected — they call +`lap()` from `pairwise_distances.cpp`, whose TU context is unchanged. 
+ +**Maintenance note:** if the `lap()` algorithm changes, update BOTH `src/lap.cpp` +and `inst/include/TreeDist/lap_impl.h`. The duplication is intentional — it +preserves the TU context that was profiled and tuned. + +--- + ## Remaining Optimization Opportunities - Sort+merge pre-scan for `rf_info_score`: **DONE** — replaced O(n²) loop with diff --git a/NEWS.md b/NEWS.md index 17785cba..082fec34 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,14 @@ -# Branch: `transfer-consensus` → merge target: `master` +# Branch: `expose-lapjv` → merge target: `master` + +This branch exposes the LAPJV (Jonker–Volgenant) linear assignment solver +via C++ headers so that downstream packages can link to it (`LinkingTo`), +and implements a transfer consensus method (`TransferConsensus()`, +`TransferDist()`). It also tightens error handling by moving all +`Rcpp::stop()` / `Rf_error()` calls out of C++ implementation code +into R-level input validation. + +# TreeDist 2.13.0.9000 This branch implements transfer consensus trees (`TransferConsensus()`) and a corresponding transfer distance metric (`TransferDist()`), providing a diff --git a/R/RcppExports.R b/R/RcppExports.R index a907e27f..9e4d02b2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -267,6 +267,10 @@ cpp_transfer_dist_cross_pairs <- function(splits_a, splits_b, n_tip, scale, n_th .Call(`_TreeDist_cpp_transfer_dist_cross_pairs`, splits_a, splits_b, n_tip, scale, n_threads) } +cpp_mci_impl_score <- function(x, y, n_tips) { + .Call(`_TreeDist_cpp_mci_impl_score`, x, y, n_tips) +} + cpp_robinson_foulds_distance <- function(x, y, nTip) { .Call(`_TreeDist_cpp_robinson_foulds_distance`, x, y, nTip) } diff --git a/inst/include/TreeDist/cost_matrix.h b/inst/include/TreeDist/cost_matrix.h new file mode 100644 index 00000000..fe4dddcf --- /dev/null +++ b/inst/include/TreeDist/cost_matrix.h @@ -0,0 +1,276 @@ +#ifndef TREEDIST_COST_MATRIX_H_ +#define TREEDIST_COST_MATRIX_H_ + +// TreeDist CostMatrix — cache-aligned, block-padded 
cost matrix for the LAP. +// +// Rcpp-free public API. The Rcpp-dependent constructor +// (CostMatrix(Rcpp::NumericMatrix)) lives in TreeDist's own src/lap.h. + +#include "types.h" + +#include +#include +#include +#include + +namespace TreeDist { + +class CostMatrix { +private: + std::size_t dim_; + std::size_t dim8_; + alignas(64) std::vector data_; + alignas(64) std::vector t_data_; + bool transposed_; + + static std::size_t block_containing(const std::size_t x) { + return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; + } + +public: + // Default constructor for pooled instances (zero-size, no allocation). + CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} + + explicit CostMatrix(std::size_t dim) + : dim_(dim), + dim8_(block_containing(dim_)), + data_(std::vector(dim8_ * dim8_)), + t_data_(std::vector(dim8_ * dim8_)), + transposed_(false) {} + + // Resize for reuse. Only reallocates when the new dimension exceeds the + // current buffer capacity; otherwise just updates dim_/dim8_ and marks + // the transpose as stale. 
+ void resize(std::size_t new_dim) { + dim_ = new_dim; + dim8_ = block_containing(new_dim); + const std::size_t needed = dim8_ * dim8_; + if (data_.size() < needed) { + data_.resize(needed); + t_data_.resize(needed); + } + transposed_ = false; + } + + void reset() noexcept { transposed_ = false; } + + [[nodiscard]] std::size_t dim() const noexcept { return dim_; } + [[nodiscard]] std::size_t dim8() const noexcept { return dim8_; } + + // ---- Element access ---- + + cost& operator()(lap_row i, lap_col j) { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& row0(lap_col j) const { + return data_[j]; + } + + [[nodiscard]] const cost& entry0(lap_row i) const { + return data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* row(lap_row i) { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* row(lap_row i) const { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* col(lap_col i) { + return &t_data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* col(lap_col i) const { + return &t_data_[static_cast(i) * dim8_]; + } + + // Write a value to both the data and transposed-data arrays. + // After filling the entire matrix this way, call markTransposed() + // to skip the lazy transpose in makeTranspose(). 
+ void setWithTranspose(lap_row i, lap_col j, cost value) { + data_[static_cast(i) * dim8_ + j] = value; + t_data_[static_cast(j) * dim8_ + i] = value; + } + + void markTransposed() noexcept { transposed_ = true; } + + // ---- Transpose ---- + + void makeTranspose() noexcept { + if (transposed_) return; + + const cost* __restrict__ data_ptr = data_.data(); + cost* __restrict__ t_data_ptr = t_data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + transposed_ = true; + } + + void makeUntranspose() noexcept { + const cost* __restrict__ data_ptr = t_data_.data(); + cost* __restrict__ t_data_ptr = data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + } + + // ---- Padding ---- + + void padTrAfterCol(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + std::size_t end_index = dim_ * dim8_; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padAfterRow(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + 
std::size_t end_index = dim_ * dim8_; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + void padTrColAfterRow(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padRowAfterCol(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + // ---- Search ---- + + std::pair findColMin(lap_col j, + lap_row search_dim = -1) { + makeTranspose(); + const cost* col_data = col(j); + const lap_row n = + (search_dim < 0) ? static_cast(dim_) : search_dim; + const auto min_ptr = std::min_element(col_data, col_data + n); + return {*min_ptr, + static_cast(std::distance(col_data, min_ptr))}; + } + + std::tuple findRowSubmin( + const lap_row* i, const std::vector& v + ) const { + assert(dim_ > 1); + + const cost* __restrict__ row_i = row(*i); + const lap_col dim = static_cast(dim_); + const cost* __restrict__ v_ptr = v.data(); + + const cost h0 = row_i[0] - v_ptr[0]; + const cost h1 = row_i[1] - v_ptr[1]; + + cost min_val, submin_val; + lap_col min_idx, submin_idx; + + if (h0 > h1) { + min_val = h1; submin_val = h0; + min_idx = 1; submin_idx = 0; + } else { + min_val = h0; submin_val = h1; + min_idx = 0; submin_idx = 1; + } + + const lap_col j_limit = + (dim < 4 ? 
0 : static_cast(dim - 3)); + + for (lap_col j = 2; j < j_limit; j += 4) { + assert(BLOCK_SIZE >= 4); + const cost h0 = row_i[j] - v_ptr[j]; + const cost h1 = row_i[j + 1] - v_ptr[j + 1]; + const cost h2 = row_i[j + 2] - v_ptr[j + 2]; + const cost h3 = row_i[j + 3] - v_ptr[j + 3]; + if (h0 < submin_val) { + if (h0 < min_val) { + submin_val = min_val; min_val = h0; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h0; submin_idx = j; } + } + if (h1 < submin_val) { + if (h1 < min_val) { + submin_val = min_val; min_val = h1; + submin_idx = min_idx; min_idx = j + 1; + } else { submin_val = h1; submin_idx = j + 1; } + } + if (h2 < submin_val) { + if (h2 < min_val) { + submin_val = min_val; min_val = h2; + submin_idx = min_idx; min_idx = j + 2; + } else { submin_val = h2; submin_idx = j + 2; } + } + if (h3 < submin_val) { + if (h3 < min_val) { + submin_val = min_val; min_val = h3; + submin_idx = min_idx; min_idx = j + 3; + } else { submin_val = h3; submin_idx = j + 3; } + } + } + for (lap_col j = j_limit; j < dim; ++j) { + const cost h = row_i[j] - v_ptr[j]; + if (h < submin_val) { + if (h < min_val) { + submin_val = min_val; min_val = h; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h; submin_idx = j; } + } + } + return {min_val, submin_val, min_idx, submin_idx}; + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_COST_MATRIX_H_ diff --git a/inst/include/TreeDist/lap.h b/inst/include/TreeDist/lap.h new file mode 100644 index 00000000..673cfa22 --- /dev/null +++ b/inst/include/TreeDist/lap.h @@ -0,0 +1,35 @@ +#ifndef TREEDIST_LAP_H_ +#define TREEDIST_LAP_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant declarations. +// +// Provides the lap() function signature. The implementation lives in +// lap_impl.h, guarded by TREEDIST_LAP_IMPLEMENTATION. Downstream +// packages should include lap_impl.h in exactly one translation unit +// with the guard defined. 
+ +#include "lap_scratch.h" + +namespace TreeDist { + + // Primary overload: caller supplies pre-allocated scratch. + extern cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt, + LapScratch& scratch); + + // Convenience overload: creates a temporary scratch. + inline cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt = true) { + LapScratch scratch; + return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); + } + +} // namespace TreeDist + +#endif // TREEDIST_LAP_H_ diff --git a/inst/include/TreeDist/lap_impl.h b/inst/include/TreeDist/lap_impl.h new file mode 100644 index 00000000..196c37f6 --- /dev/null +++ b/inst/include/TreeDist/lap_impl.h @@ -0,0 +1,254 @@ +#ifndef TREEDIST_LAP_IMPL_H_ +#define TREEDIST_LAP_IMPL_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant implementation. +// +// Guard this with #define TREEDIST_LAP_IMPLEMENTATION before including. +// Include in exactly one translation unit per package. +// +// Interrupt handling: define TREEDIST_CHECK_INTERRUPT() before including +// to enable user-interrupt checks (e.g., Rcpp::checkUserInterrupt()). +// If not defined, interrupt checks are silently skipped. +// +// Original algorithm: +// R. Jonker and A. Volgenant, "A Shortest Augmenting Path Algorithm +// for Dense and Sparse Linear Assignment Problems," Computing 38, +// 325-340, 1987. + +#ifdef TREEDIST_LAP_IMPLEMENTATION + +#include "lap.h" + +#ifndef TREEDIST_CHECK_INTERRUPT +#define TREEDIST_CHECK_INTERRUPT() ((void)0) +#define TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +namespace TreeDist { + +namespace detail { + inline bool nontrivially_less_than(cost a, cost b) noexcept { + return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; + } +} // namespace detail + +// The LAP hot loops are sensitive to instruction alignment — even unrelated +// changes to other code in the same translation unit can shift layout enough +// to cause measurable regressions. Force 64-byte function alignment and +// 16-byte loop alignment to stabilise performance across builds. +#if defined(__GNUC__) && !defined(__clang__) +__attribute__((optimize("align-functions=64", "align-loops=16"))) +#endif +cost lap(const lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + const bool allow_interrupt, + LapScratch& scratch) +{ + lap_row num_free = 0; + scratch.ensure(dim); + auto& v = scratch.v; + auto& matches = scratch.matches; + std::fill(matches.begin(), matches.begin() + dim, 0); + const cost* __restrict__ v_ptr = v.data(); + + // COLUMN REDUCTION + for (lap_col j = dim; j--; ) { + const auto [min, imin] = input_cost.findColMin(j); + v[j] = min; + ++matches[imin]; + + if (matches[imin] == 1) { + rowsol[imin] = j; + colsol[j] = imin; + } else if (v_ptr[j] < v_ptr[rowsol[imin]]) { + const lap_col j1 = rowsol[imin]; + rowsol[imin] = j; + colsol[j] = imin; + colsol[j1] = -1; + } else { + colsol[j] = -1; + } + } + + // REDUCTION TRANSFER + auto& freeunassigned = scratch.freeunassigned; + + for (lap_row i = 0; i < dim; ++i) { + if (matches[i] == 0) { + freeunassigned[num_free++] = i; + } else if (matches[i] == 1) { + const lap_col j1 = rowsol[i]; + const cost* row_i = input_cost.row(i); + cost min_cost; + if (j1 == 0) { + min_cost = row_i[1] - v_ptr[1]; + for (lap_col j = 2; j < dim; ++j) { + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } else { + min_cost = row_i[0] - v_ptr[0]; + for (lap_col j = 1; j < dim; ++j) { + if (j == j1) continue; + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } + v[j1] -= min_cost; + } + } + + // AUGMENTING ROW 
REDUCTION + auto& col_list = scratch.col_list; + int loopcnt = 0; + + do { + ++loopcnt; + + lap_row previous_num_free = num_free; + num_free = 0; + lap_row k = 0; + while (k < previous_num_free) { + const lap_row i = freeunassigned[k++]; + const auto [umin, usubmin, min_idx, j2] = + input_cost.findRowSubmin(&i, v); + lap_col j1 = min_idx; + + lap_row i0 = colsol[j1]; + const bool strictly_less = + detail::nontrivially_less_than(umin, usubmin); + if (strictly_less) { + v[j1] -= (usubmin - umin); + } else if (i0 > -1) { + j1 = j2; + i0 = colsol[j2]; + } + + rowsol[i] = j1; + colsol[j1] = i; + + if (i0 > -1) { + if (strictly_less) { + freeunassigned[--k] = i0; + if (allow_interrupt) TREEDIST_CHECK_INTERRUPT(); + } else { + freeunassigned[num_free++] = i0; + } + } + } + } while (loopcnt < 2); + + // AUGMENT SOLUTION for each free row. + auto& d = scratch.d; + auto& predecessor = scratch.predecessor; + + for (lap_row f = 0; f < num_free; ++f) { + bool unassignedfound = false; + lap_row free_row = freeunassigned[f]; + const cost* free_row_cost = input_cost.row(free_row); + lap_col endofpath = 0; + lap_col last = 0; + lap_row i; + lap_col j1; + + for (lap_col j = 0; j < dim; ++j) { + d[j] = free_row_cost[j] - v_ptr[j]; + predecessor[j] = free_row; + col_list[j] = j; + } + + cost min = 0; + lap_col low = 0; + lap_col up = 0; + + do { + if (up == low) { + last = low - 1; + min = d[col_list[up++]]; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = col_list[k]; + const cost h = d[j]; + if (h <= min) { + if (h < min) { + up = low; + min = h; + } + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + for (lap_dim k = low; k < up; ++k) { + if (colsol[col_list[k]] < 0) { + endofpath = col_list[k]; + unassignedfound = true; + break; + } + } + } + + if (!unassignedfound) { + j1 = col_list[low++]; + i = colsol[j1]; + const cost* row_i = input_cost.row(i); + const cost h = row_i[j1] - v_ptr[j1] - min; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = 
col_list[k]; + cost v2 = row_i[j] - v_ptr[j] - h; + if (v2 < d[j]) { + predecessor[j] = i; + if (v2 == min) { + if (colsol[j] < 0) { + endofpath = j; + unassignedfound = true; + break; + } else { + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + d[j] = v2; + } + } + } + } while (!unassignedfound); + + for (lap_dim k = 0; k <= last; ++k) { + j1 = col_list[k]; + v[j1] += d[j1] - min; + } + + do { + i = predecessor[endofpath]; + colsol[endofpath] = i; + j1 = endofpath; + endofpath = rowsol[i]; + rowsol[i] = j1; + } while (i != free_row); + } + + // Calculate optimal cost. + cost lapcost = 0; + for (lap_dim i = 0; i < dim; ++i) { + lapcost += input_cost(i, rowsol[i]); + } + + return lapcost; +} + +} // namespace TreeDist + +#ifdef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#undef TREEDIST_CHECK_INTERRUPT +#undef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +#endif // TREEDIST_LAP_IMPLEMENTATION +#endif // TREEDIST_LAP_IMPL_H_ diff --git a/inst/include/TreeDist/lap_scratch.h b/inst/include/TreeDist/lap_scratch.h new file mode 100644 index 00000000..4a9ca43b --- /dev/null +++ b/inst/include/TreeDist/lap_scratch.h @@ -0,0 +1,44 @@ +#ifndef TREEDIST_LAP_SCRATCH_H_ +#define TREEDIST_LAP_SCRATCH_H_ + +// Reusable heap storage for one thread's LAPJV workspace. +// +// Pass to the scratch-taking overload of lap() to eliminate per-call +// std::vector allocations. In a serial context construct once before the +// loop; in an OpenMP context allocate one per thread and index by +// omp_get_thread_num(). All vectors are grown lazily; never shrunk. 
+ +#include "cost_matrix.h" + +namespace TreeDist { + +struct LapScratch { + std::vector v; + std::vector matches; + std::vector freeunassigned; + std::vector col_list; + std::vector d; + std::vector predecessor; + std::vector rowsol; + std::vector colsol; + CostMatrix score_pool; + CostMatrix small_pool; + + void ensure(int dim) noexcept { + const int padded = static_cast( + ((static_cast(dim) + BLOCK_SIZE - 1) + / BLOCK_SIZE) * BLOCK_SIZE); + if (static_cast(v.size()) < padded) v.resize(padded); + if (static_cast(matches.size()) < dim) matches.resize(dim); + if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); + if (static_cast(col_list.size()) < dim) col_list.resize(dim); + if (static_cast(d.size()) < dim) d.resize(dim); + if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); + if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); + if (static_cast(colsol.size()) < dim) colsol.resize(dim); + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_LAP_SCRATCH_H_ diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h new file mode 100644 index 00000000..437633f0 --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering.h @@ -0,0 +1,87 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_H_ +#define TREEDIST_MUTUAL_CLUSTERING_H_ + +// Mutual Clustering Information (MCI) — public API. +// +// Provides the core MCI score computation between two split sets, +// clustering entropy, and lookup-table initialization. +// +// Implementation lives in mutual_clustering_impl.h, guarded by +// TREEDIST_MCI_IMPLEMENTATION. + +#include "lap.h" +#include // splitbit, SL_MAX_TIPS, count_bits +#include + +namespace TreeDist { + + // ---- Lookup tables (populated by init_lg2_tables) ---- + // + // lg2[i] = log2(i) for 0 <= i <= SL_MAX_TIPS + // lg2_double_factorial[i] = log2(i!!) for 0 <= i < 2*SL_MAX_TIPS - 2 + // lg2_unrooted[i] = log2((2i-5)!!) 
for i >= 3 + // lg2_rooted = &lg2_unrooted[0] + 1 (so lg2_rooted[i] = lg2_unrooted[i+1]) + // + // These are defined in mutual_clustering_impl.h. + + extern double lg2[SL_MAX_TIPS + 1]; + extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; + extern double lg2_unrooted[SL_MAX_TIPS + 2]; + extern double* lg2_rooted; + + // Populate lookup tables. Must be called once before any MCI + // computation. max_tips should be >= the largest tree size used. + void init_lg2_tables(int max_tips); + + // ---- Inline helpers ---- + + // Information content of a perfectly-matching split pair. + // ic_matching(a, b, n) = (a + b) * lg2[n] - a * lg2[a] - b * lg2[b] + [[nodiscard]] inline double ic_matching(int16 a, int16 b, + int16 n) noexcept { + const double lg2a = lg2[a]; + const double lg2b = lg2[b]; + const double lg2n = lg2[n]; + return (a + b) * lg2n - a * lg2a - b * lg2b; + } + + // Clustering entropy of a split set. + // Computes H_clust = -sum_i [ p_i * log2(p_i) + q_i * log2(q_i) ] + // where p_i = in_split[i] / n_tips and q_i = 1 - p_i. + [[nodiscard]] inline double clustering_entropy( + const int* in_split, int n_splits, int n_tips) { + if (n_tips <= 1 || n_splits == 0) return 0.0; + const double lg2n = std::log2(static_cast(n_tips)); + double ce = 0.0; + for (int i = 0; i < n_splits; ++i) { + const int a = in_split[i]; + const int b = n_tips - a; + if (a > 1 && b > 1) { + ce += (a * std::log2(static_cast(a)) + + b * std::log2(static_cast(b))) + / n_tips - lg2n; + } + } + return -ce; + } + + // ---- MCI score (declaration only) ---- + // + // Computes the Mutual Clustering Information between two split sets. + // Uses sort+merge exact-match detection + LAP on the reduced matrix. + // + // a_state[i] points to n_bins splitbit words for split i of tree A. + // a_in[i] = popcount of split i (tips in the "1" partition). + // Returns the MCI score (higher = more similar). + // + // Implementation in mutual_clustering_impl.h. 
+ + double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch); + +} // namespace TreeDist + +#endif // TREEDIST_MUTUAL_CLUSTERING_H_ diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h new file mode 100644 index 00000000..7f98c8fb --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -0,0 +1,290 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ +#define TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ + +// Mutual Clustering Information — implementation. +// +// Guard with #define TREEDIST_MCI_IMPLEMENTATION before including. +// Include in exactly one translation unit per package (the same TU +// that defines TREEDIST_LAP_IMPLEMENTATION). +// +// Usage: +// #define TREEDIST_LAP_IMPLEMENTATION +// #define TREEDIST_MCI_IMPLEMENTATION +// #include +// #include + +#ifdef TREEDIST_MCI_IMPLEMENTATION + +#include "mutual_clustering.h" + +#include +#include +#include +#include + +namespace TreeDist { + +// ---- Table definitions ---- + +double lg2[SL_MAX_TIPS + 1]; +double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; +double lg2_unrooted[SL_MAX_TIPS + 2]; +double* lg2_rooted = &lg2_unrooted[0] + 1; + +void init_lg2_tables(int max_tips) { + lg2[0] = 0; + const int lg2_limit = std::min(max_tips + 1, + static_cast(SL_MAX_TIPS + 1)); + for (int i = 1; i < lg2_limit; ++i) { + lg2[i] = std::log2(static_cast(i)); + } + + for (int i = 0; i < 3; ++i) { + lg2_double_factorial[i] = 0; + lg2_unrooted[i] = 0; + } + + const int df_limit = std::min(2 * max_tips, + static_cast(SL_MAX_TIPS + SL_MAX_TIPS - 2)); + for (int i = 2; i < df_limit; ++i) { + lg2_double_factorial[i] = + lg2_double_factorial[i - 2] + std::log2(static_cast(i)); + } + + const int ur_limit = std::min(max_tips + 2, + static_cast(SL_MAX_TIPS + 2)); + for (int i = 3; i < ur_limit; ++i) { + 
lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; + } +} + +// ---- Sort+merge exact-match detection (internal) ---- +// +// Canonicalise each split so bit 0 is always set (flip complement if not), +// sort by canonical form, merge-scan. O(n log n). +// +// Writes a_match[ai] = bi+1 if matched, else 0. Likewise b_match. +// Returns number of exact matches. + +namespace detail { + +static int16 find_exact_matches_raw( + const splitbit* const* a_state, const int16* /*a_in*/, int16 a_n, + const splitbit* const* b_state, const int16* /*b_in*/, int16 b_n, + int16 n_bins, int32 n_tips, + int16* a_match, int16* b_match) +{ + std::fill(a_match, a_match + a_n, int16(0)); + std::fill(b_match, b_match + b_n, int16(0)); + if (a_n == 0 || b_n == 0) return 0; + + const int16 last_bin = n_bins - 1; + const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) + ? ~splitbit(0) + : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; + + // Flat buffers for canonical forms + std::vector a_canon(static_cast(a_n) * n_bins); + std::vector b_canon(static_cast(b_n) * n_bins); + + for (int16 i = 0; i < a_n; ++i) { + const bool flip = !(a_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? ~a_state[i][bin] : a_state[i][bin]; + if (bin == last_bin) val &= last_mask; + a_canon[i * n_bins + bin] = val; + } + } + for (int16 i = 0; i < b_n; ++i) { + const bool flip = !(b_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? 
~b_state[i][bin] : b_state[i][bin]; + if (bin == last_bin) val &= last_mask; + b_canon[i * n_bins + bin] = val; + } + } + + // Sort index arrays by canonical form + auto canon_less = [&](const splitbit* canon, int16 n_b, int16 i, int16 j) { + for (int16 bin = 0; bin < n_b; ++bin) { + const splitbit vi = canon[i * n_b + bin]; + const splitbit vj = canon[j * n_b + bin]; + if (vi < vj) return true; + if (vi > vj) return false; + } + return false; + }; + + std::vector a_order(a_n), b_order(b_n); + std::iota(a_order.begin(), a_order.end(), int16(0)); + std::iota(b_order.begin(), b_order.end(), int16(0)); + + std::sort(a_order.begin(), a_order.end(), + [&](int16 i, int16 j) { + return canon_less(a_canon.data(), n_bins, i, j); + }); + std::sort(b_order.begin(), b_order.end(), + [&](int16 i, int16 j) { + return canon_less(b_canon.data(), n_bins, i, j); + }); + + // Merge-scan + int16 exact_n = 0; + int16 ai_pos = 0, bi_pos = 0; + while (ai_pos < a_n && bi_pos < b_n) { + const int16 ai = a_order[ai_pos]; + const int16 bi = b_order[bi_pos]; + + int cmp = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + const splitbit va = a_canon[ai * n_bins + bin]; + const splitbit vb = b_canon[bi * n_bins + bin]; + if (va < vb) { cmp = -1; break; } + if (va > vb) { cmp = 1; break; } + } + + if (cmp < 0) { + ++ai_pos; + } else if (cmp > 0) { + ++bi_pos; + } else { + a_match[ai] = bi + 1; + b_match[bi] = ai + 1; + ++exact_n; + ++ai_pos; + ++bi_pos; + } + } + + return exact_n; +} + +} // namespace detail + + +// ---- MCI score implementation ---- + +double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch) +{ + if (a_n_splits == 0 || b_n_splits == 0 || n_tips == 0) return 0.0; + + const int16 most_splits = std::max(a_n_splits, b_n_splits); + const double n_tips_rcp = 1.0 / static_cast(n_tips); + + constexpr cost max_score = 
BIG; + constexpr double over_max = 1.0 / static_cast(BIG); + const double max_over_tips = static_cast(BIG) * n_tips_rcp; + const double lg2_n = lg2[n_tips]; + + // --- Phase 1: O(n log n) exact-match detection --- + std::vector a_match_buf(a_n_splits); + std::vector b_match_buf(b_n_splits); + + const int16 exact_n = detail::find_exact_matches_raw( + a_state, a_in, a_n_splits, + b_state, b_in, b_n_splits, + n_bins, n_tips, + a_match_buf.data(), b_match_buf.data()); + + const int16* a_match = a_match_buf.data(); + const int16* b_match = b_match_buf.data(); + + // Accumulate exact-match score + double exact_score = 0.0; + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (a_match[ai]) { + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + exact_score += ic_matching(na, nA, static_cast(n_tips)); + } + } + + // Early exit when everything matched exactly + if (exact_n == b_n_splits || exact_n == a_n_splits) { + return exact_score * n_tips_rcp; + } + + // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- + const int16 lap_n = most_splits - exact_n; + + std::vector a_unmatch, b_unmatch; + a_unmatch.reserve(lap_n); + b_unmatch.reserve(lap_n); + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (!a_match[ai]) a_unmatch.push_back(ai); + } + for (int16 bi = 0; bi < b_n_splits; ++bi) { + if (!b_match[bi]) b_unmatch.push_back(bi); + } + + scratch.score_pool.resize(lap_n); + CostMatrix& score = scratch.score_pool; + + const int16 a_unmatched_n = static_cast(a_unmatch.size()); + const int16 b_unmatched_n = static_cast(b_unmatch.size()); + + for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const int16 ai = a_unmatch[a_pos]; + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + const splitbit* a_row = a_state[ai]; + + const double offset_a = lg2_n - lg2[na]; + const double offset_A = lg2_n - lg2[nA]; + + for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const int16 bi = b_unmatch[b_pos]; + const splitbit* 
b_row = b_state[bi]; + int16 a_and_b = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + a_and_b += TreeTools::count_bits(a_row[bin] & b_row[bin]); + } + + const int16 nb = b_in[bi]; + const int16 nB = static_cast(n_tips - nb); + const int16 a_and_B = na - a_and_b; + const int16 A_and_b = nb - a_and_b; + const int16 A_and_B = nA - A_and_b; + + if (a_and_b == A_and_b && a_and_b == a_and_B + && a_and_b == A_and_B) { + score(a_pos, b_pos) = max_score; + } else { + const double lg2_nb = lg2[nb]; + const double lg2_nB = lg2[nB]; + const double ic_sum = + a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + + a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + + A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + + A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + score(a_pos, b_pos) = + max_score - static_cast(ic_sum * max_over_tips); + } + } + if (b_unmatched_n < lap_n) { + score.padRowAfterCol(a_pos, b_unmatched_n, max_score); + } + } + if (a_unmatched_n < lap_n) { + score.padAfterRow(a_unmatched_n, max_score); + } + + // --- Phase 3: solve LAP on the reduced matrix --- + scratch.ensure(lap_n); + auto& rowsol = scratch.rowsol; + auto& colsol = scratch.colsol; + + const double lap_score = + static_cast((max_score * lap_n) - + lap(lap_n, score, rowsol, colsol, false, scratch)) * over_max; + return lap_score + exact_score * n_tips_rcp; +} + +} // namespace TreeDist + +#endif // TREEDIST_MCI_IMPLEMENTATION +#endif // TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ diff --git a/inst/include/TreeDist/types.h b/inst/include/TreeDist/types.h new file mode 100644 index 00000000..c953989b --- /dev/null +++ b/inst/include/TreeDist/types.h @@ -0,0 +1,32 @@ +#ifndef TREEDIST_TYPES_H_ +#define TREEDIST_TYPES_H_ + +// TreeDist public type definitions. +// +// Rcpp-free: uses only standard headers plus TreeTools (for SL_MAX_SPLITS). 
+// Downstream packages include via: #include + +#include +#include +#include // SL_MAX_SPLITS, splitbit + +namespace TreeDist { + + using int16 = int_fast16_t; + using int32 = int_fast32_t; + using cost = int_fast64_t; + + using lap_dim = int; + using lap_row = lap_dim; + using lap_col = lap_dim; + + constexpr cost BIG = + (std::numeric_limits::max)() / SL_MAX_SPLITS; + + constexpr cost ROUND_PRECISION = 2048 * 2048; + + constexpr std::size_t BLOCK_SIZE = 16; + +} // namespace TreeDist + +#endif // TREEDIST_TYPES_H_ diff --git a/src/Makevars b/src/Makevars index 139b1d17..d3df534e 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/Makevars.win b/src/Makevars.win index 139b1d17..d3df534e 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b3d661be..72309c5b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -611,6 +611,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cpp_mci_impl_score +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, const Rcpp::RawMatrix& y, int n_tips); +RcppExport SEXP _TreeDist_cpp_mci_impl_score(SEXP xSEXP, SEXP ySEXP, SEXP n_tipsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< int >::type n_tips(n_tipsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_mci_impl_score(x, y, n_tips)); + return rcpp_result_gen; +END_RCPP +} // cpp_robinson_foulds_distance List cpp_robinson_foulds_distance(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); RcppExport SEXP 
_TreeDist_cpp_robinson_foulds_distance(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { @@ -753,6 +766,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_cpp_transfer_dist_scored", (DL_FUNC) &_TreeDist_cpp_transfer_dist_scored, 4}, {"_TreeDist_cpp_transfer_dist_all_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_all_pairs, 4}, {"_TreeDist_cpp_transfer_dist_cross_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_cross_pairs, 5}, + {"_TreeDist_cpp_mci_impl_score", (DL_FUNC) &_TreeDist_cpp_mci_impl_score, 3}, {"_TreeDist_cpp_robinson_foulds_distance", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_distance, 3}, {"_TreeDist_cpp_robinson_foulds_info", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_info, 3}, {"_TreeDist_cpp_matching_split_distance", (DL_FUNC) &_TreeDist_cpp_matching_split_distance, 3}, diff --git a/src/ints.h b/src/ints.h index a5af8774..50fd3cc7 100644 --- a/src/ints.h +++ b/src/ints.h @@ -1,10 +1,13 @@ #ifndef _TREEDIST_INT_H #define _TREEDIST_INT_H -#include +#include -using int16 = int_fast16_t; -using int32 = int_fast32_t; +// Re-export shared types to global scope for backward compatibility. +using TreeDist::int16; +using TreeDist::int32; + +// Types used only within TreeDist's own source. using uint32 = uint_fast32_t; using grf_match = std::vector; diff --git a/src/lap.cpp b/src/lap.cpp index 4963fa9e..e77c712f 100644 --- a/src/lap.cpp +++ b/src/lap.cpp @@ -26,6 +26,15 @@ https://www.mathworks.com/matlabcentral/fileexchange/26836-lapjv-jonker-volgenant-algorithm-for-linear-assignment-problem-v3-0: * *************************************************************************/ + +// NOTE: The LAP hot loops are highly sensitive to instruction alignment and +// register allocation, which are affected by the TU's full include graph. +// Do NOT include lap_impl.h here — that header is for downstream LinkingTo +// consumers only. TreeDist's own lap() is compiled directly in this file +// to preserve the codegen context that was profiled and tuned. 
+// +// If the algorithm changes, update BOTH this file and lap_impl.h. + #include "lap.h" #include @@ -37,11 +46,42 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { const lap_dim spare_rows = n_row - n_col; const cost max_score = cost(BIG / max_dim); const double x_max = maxX[0]; + const double scale_factor = max_score / x_max; std::vector rowsol(max_dim); std::vector colsol(max_dim); - cost_matrix input(x, x_max); + // Build cost matrix. Fill the transposed buffer first (matching R's + // column-major storage for sequential reads) then untranspose. + cost_matrix input(max_dim); + const double* __restrict__ src_data = REAL(x); + cost* __restrict__ t_ptr = input.col(0); + const std::size_t dim8 = input.dim8(); + + for (lap_col c = 0; c < n_col; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + const std::size_t s_off = static_cast(c) * n_row; + for (lap_row r = 0; r < n_row; ++r) { + t_ptr[t_off + r] = static_cast(src_data[s_off + r] * scale_factor); + } + // Pad remaining rows in this transposed column + for (lap_row r = n_row; r < max_dim; ++r) { + t_ptr[t_off + r] = max_score; + } + for (std::size_t r = max_dim; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + // Pad remaining transposed columns + for (lap_col c = n_col; c < max_dim; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + for (std::size_t r = 0; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + + // Untranspose: t_data_ -> data_ + input.makeUntranspose(); cost score = lap(max_dim, input, rowsol, colsol); @@ -60,27 +100,22 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { ); } +namespace { inline bool nontrivially_less_than(cost a, cost b) noexcept { return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; } +} // anonymous namespace /* This function is the jv shortest augmenting path algorithm to solve the assignment problem */ +namespace TreeDist { + cost lap(const lap_row dim, - cost_matrix &input_cost, + CostMatrix &input_cost, std::vector &rowsol, std::vector &colsol, const bool allow_interrupt, LapScratch &scratch) - - // input: - // dim - problem size - // input_cost - cost matrix - - // output: - // rowsol - column assigned to row in solution - // colsol - row assigned to column in solution - { lap_row num_free = 0; scratch.ensure(dim); @@ -316,3 +351,4 @@ cost lap(const lap_row dim, return lapcost; } +} // namespace TreeDist diff --git a/src/lap.h b/src/lap.h index b1a22508..592ce9b2 100644 --- a/src/lap.h +++ b/src/lap.h @@ -9,456 +9,27 @@ // because the LAP hot loops are sensitive to instruction alignment — even // unrelated inline functions in the same translation unit can shift code // layout enough to cause measurable regressions. - -#include -#include - -#include /* for numeric_limits */ -#include /* for vector */ - -#include "ints.h" - -/*************** TYPES *******************/ - -using cost = int_fast64_t; -using lap_dim = int; -using lap_row = lap_dim; -using lap_col = lap_dim; - -/* For a reason I've not determined, shrinking BIG is necessary to avoid - * an infinite loop in lap. */ -constexpr cost BIG = (std::numeric_limits::max)() / SL_MAX_SPLITS; -constexpr cost ROUND_PRECISION = 2048 * 2048; -constexpr size_t BLOCK_SIZE = 16; - -class CostMatrix { -private: - size_t dim_; // Important not to use int16, which will overflow on * - size_t dim8_; - alignas(64) std::vector data_; - alignas(64) std::vector t_data_; - bool transposed_; - - static size_t block_containing(const size_t x) { - return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; - } - -public: - // Default constructor for pooled instances (zero-size, no allocation). 
- CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} - - CostMatrix(size_t dim) - : dim_(dim), - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim8_)), - t_data_(std::vector(dim8_ * dim8_)), - transposed_(false) {} - - // Resize for reuse. Only reallocates when the new dimension exceeds the - // current buffer capacity; otherwise just updates dim_/dim8_ and marks - // the transpose as stale. New elements (if any) are value-initialised. - void resize(size_t new_dim) { - dim_ = new_dim; - dim8_ = block_containing(new_dim); - const size_t needed = dim8_ * dim8_; - if (data_.size() < needed) { - data_.resize(needed); - t_data_.resize(needed); - } - transposed_ = false; - } - - CostMatrix(const Rcpp::NumericMatrix& src, const double x_max) - : dim_((std::max(src.nrow(), src.ncol()))), // or pad here as needed - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim_)), - t_data_(std::vector(dim8_ * dim_)), - transposed_(true) - { - // Compute scale factor - const cost max_score = cost(BIG / dim_); - double scale_factor = max_score / x_max; - - const lap_row n_row = src.nrow(); - const lap_col n_col = src.ncol(); - - const double* __restrict src_data = REAL(src); - cost* __restrict dest_data = t_data_.data(); - - for (lap_col c = 0; c < n_col; ++c) { - const size_t data_c = c * dim8_; - const size_t src_c = c * n_row; - for (lap_row r = 0; r < n_row; ++r) { - // Marginally faster than std::transform - dest_data[data_c + r] = static_cast(src_data[src_c + r] * scale_factor); - } - - padTrColAfterRow(c, n_row, max_score); // padding now goes after row `c` - } - - padTrAfterCol(n_col, max_score); - - makeUntranspose(); - } - - // Reset for reuse at the same dimension — clears the transpose cache so the - // next findColMin() re-builds it from fresh data. Call between pair - // computations when reusing a thread-local CostMatrix instance. 
- void reset() noexcept { transposed_ = false; } - - [[nodiscard]] size_t dim() const noexcept { return dim_; } - - void makeTranspose() noexcept { - if (transposed_) return; - - const cost* __restrict data_ptr = data_.data(); - cost* __restrict t_data_ptr = t_data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - - transposed_ = true; - } - - void makeUntranspose() noexcept { - const cost* __restrict data_ptr = t_data_.data(); - cost* __restrict t_data_ptr = data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - } - - - // Access operator for read/write - cost& operator()(lap_row i, lap_col j) { - return data_[static_cast(i) * dim8_ + j]; - } - - // Const version for read-only access - [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { - return data_[static_cast(i) * dim8_ + j]; - } - - [[nodiscard]] const cost& row0(lap_col j) const { - return data_[j]; - } - - [[nodiscard]] const cost& entry0(lap_row i) const { - return data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* row(lap_row i) { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const 
cost* row(lap_row i) const { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* col(lap_col i) { - return &t_data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const cost* col(lap_col i) const { - return &t_data_[static_cast(i) * dim8_]; - } - - void padTrAfterCol(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padAfterRow(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - void padTrColAfterRow(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padRowAfterCol(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - std::pair findColMin(lap_col j, - lap_row search_dim = -1) { - makeTranspose(); - const cost* col_data = col(j); - const lap_row n = (search_dim < 0) ? static_cast(dim_) : search_dim; - const auto min_ptr = std::min_element(col_data, col_data + n); - return {*min_ptr, - static_cast(std::distance(col_data, min_ptr))}; - } - - // test2000 = 119 ms (!) - // Find minimum and second minimum reduced cost over columns. 
- std::tuple findRowSubmin( - const lap_row* i, const std::vector& v - ) const { - assert(dim_ > 1); - - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - const cost* __restrict v_ptr = v.data(); - - const cost h0 = row_i[0] - v_ptr[0]; - const cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 > h1) { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } else { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } - - const lap_col j_limit = (dim < 4 ? 0 : static_cast(dim - 3)); - - for (lap_col j = 2; j < j_limit; j += 4) { - assert(BLOCK_SIZE >= 4); // Unrolling loop x4 gives ~20% speedup - const cost h0 = row_i[j] - v_ptr[j]; - const cost h1 = row_i[j + 1] - v_ptr[j + 1]; - const cost h2 = row_i[j + 2] - v_ptr[j + 2]; - const cost h3 = row_i[j + 3] - v_ptr[j + 3]; - if (h0 < submin_val) { - if (h0 < min_val) { - submin_val = min_val; - min_val = h0; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h0; - submin_idx = j; - } - } - if (h1 < submin_val) { - if (h1 < min_val) { - submin_val = min_val; - min_val = h1; - submin_idx = min_idx; - min_idx = j + 1; - } else { - submin_val = h1; - submin_idx = j + 1; - } - } - if (h2 < submin_val) { - if (h2 < min_val) { - submin_val = min_val; - min_val = h2; - submin_idx = min_idx; - min_idx = j + 2; - } else { - submin_val = h2; - submin_idx = j + 2; - } - } - if (h3 < submin_val) { - if (h3 < min_val) { - submin_val = min_val; - min_val = h3; - submin_idx = min_idx; - min_idx = j + 3; - } else { - submin_val = h3; - submin_idx = j + 3; - } - } - } - for (lap_col j = j_limit; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - if (h < min_val) { - submin_val = min_val; - min_val = h; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h; - submin_idx = j; - } - } - } - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 260 ms - 
std::tuple findRowSubminNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - - if (dim < 2) { - return {row_i[0] - v_ptr[0], std::numeric_limits::max(), 0, 0}; - } - - // Initialize with first two elements - cost h0 = row_i[0] - v_ptr[0]; - cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 <= h1) { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } else { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } - - // Process remaining elements - for (lap_col j = 2; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - - if (h < min_val) { - submin_val = min_val; - submin_idx = min_idx; - min_val = h; - min_idx = j; - } else if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 370 ms (!) - std::tuple findRowSubminTwoPassNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - cost min_val = std::numeric_limits::max(); - lap_col min_idx = 0; - const lap_col dim = static_cast(dim_); - - for (lap_col j = 0; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < min_val) { - min_val = h; - min_idx = j; - } - } - - // Second pass: find subminimum - cost submin_val = std::numeric_limits::max(); - lap_col submin_idx = (min_idx == 0) ? 1 : 0; - - for (lap_col j = 0; j < dim; ++j) { - if (j != min_idx) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } -}; - -using cost_matrix = CostMatrix; - -// --------------------------------------------------------------------------- -// LapScratch — reusable heap storage for one thread's LAPJV workspace. 
// -// Pass to the scratch-taking overload of lap() to eliminate the ~6 per-call -// std::vector allocations. In a serial context construct once before the -// loop; in an OpenMP context allocate one per thread and index by -// omp_get_thread_num(). All vectors are grown lazily; never shrunk. -// --------------------------------------------------------------------------- -struct LapScratch { - std::vector v; // column dual variables (padded) - std::vector matches; // assignment-count per row - std::vector freeunassigned; // list of unassigned rows - std::vector col_list; // column scan list - std::vector d; // Dijkstra distances - std::vector predecessor; // augmenting-path predecessors - // rowsol / colsol are included so score functions that don't need the - // solution afterwards can avoid a separate allocation. - std::vector rowsol; - std::vector colsol; - // Pooled CostMatrix instances — reused across pairs to avoid per-call - // heap allocation. Grown lazily on first use, never shrunk. - CostMatrix score_pool; // main cost matrix - CostMatrix small_pool; // reduced matrix (exact-match path) - - void ensure(int dim) noexcept { - const int padded = static_cast( - ((static_cast(dim) + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); - if (static_cast(v.size()) < padded) v.resize(padded); - if (static_cast(matches.size()) < dim) matches.resize(dim); - if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); - if (static_cast(col_list.size()) < dim) col_list.resize(dim); - if (static_cast(d.size()) < dim) d.resize(dim); - if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); - if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); - if (static_cast(colsol.size()) < dim) colsol.resize(dim); - } -}; - -/*************** FUNCTIONS *******************/ - -// Primary overload: caller supplies pre-allocated scratch (zero alloc in hot loop). 
-extern cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt, - LapScratch &scratch); - -// Convenience overload: creates a temporary scratch (for one-off calls). -inline cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt = true) { - LapScratch scratch; - return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); -} +// Public types, CostMatrix, LapScratch, and lap() declarations live in +// inst/include/TreeDist/ headers (shared with downstream LinkingTo users). +// This wrapper re-exports them to global scope for backward compatibility. + +#include // types, CostMatrix, LapScratch, lap() + +#include "ints.h" // grf_match, uint32 (TreeDist-internal types) + +// Re-export shared types and classes to global scope. +using TreeDist::cost; +using TreeDist::lap_dim; +using TreeDist::lap_row; +using TreeDist::lap_col; +using TreeDist::BIG; +using TreeDist::ROUND_PRECISION; +using TreeDist::BLOCK_SIZE; +using TreeDist::CostMatrix; +using TreeDist::LapScratch; +using TreeDist::lap; + +using cost_matrix = TreeDist::CostMatrix; #endif diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp index 4a3b77e1..99347300 100644 --- a/src/tree_distance_functions.cpp +++ b/src/tree_distance_functions.cpp @@ -1,35 +1,47 @@ +#include #include -#include /* for SL_MAX_TIPS */ -#include /* for log2() */ +// Provide the MCI table definitions and implementation in this TU. +#define TREEDIST_MCI_IMPLEMENTATION +#include #include "tree_distances.h" -using namespace Rcpp; - -constexpr int32 LG2_SIZE = SL_MAX_TIPS + 1; - -double lg2[LG2_SIZE]; -double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -double lg2_unrooted[SL_MAX_TIPS + 2]; -double *lg2_rooted = &lg2_unrooted[0] + 1; +// Populate lookup tables at library load time. 
__attribute__((constructor)) void initialize_ldf() { - lg2[0] = 0; - for (int32 i = 1; i != LG2_SIZE; ++i) { - lg2[i] = log2(i); - } - for (int16 i = 0; i != 3; ++i) { - lg2_double_factorial[i] = 0; - lg2_unrooted[i] = 0; - } - assert(lg2_rooted[0] == 0); - assert(lg2_rooted[1] == 0); - for (int32 i = 2; i != SL_MAX_TIPS + SL_MAX_TIPS - 2; ++i) { - lg2_double_factorial[i] = lg2_double_factorial[i - 2] + log2(i); - } - for (int32 i = 3; i != SL_MAX_TIPS + 2; ++i) { - lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; - assert(lg2_rooted[i - 1] == lg2_double_factorial[i + i - 5]); - } + TreeDist::init_lg2_tables(SL_MAX_TIPS); + } + +// Thin wrapper that exercises the installable-header version of +// mutual_clustering_score(), for test coverage and downstream validation. +// [[Rcpp::export]] +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, + const Rcpp::RawMatrix& y, + int n_tips) { + using TreeTools::SplitList; + + const SplitList a(x); + const SplitList b(y); + TreeDist::LapScratch scratch; + + // Build arrays matching the header's raw-pointer API types. 
+ std::vector a_ptrs(a.n_splits); + std::vector b_ptrs(b.n_splits); + std::vector a_in(a.n_splits); + std::vector b_in(b.n_splits); + for (TreeDist::int16 i = 0; i < a.n_splits; ++i) { + a_ptrs[i] = a.state[i]; + a_in[i] = static_cast(a.in_split[i]); } + for (TreeDist::int16 i = 0; i < b.n_splits; ++i) { + b_ptrs[i] = b.state[i]; + b_in[i] = static_cast(b.in_split[i]); + } + + return TreeDist::mutual_clustering_score( + a_ptrs.data(), a_in.data(), a.n_splits, + b_ptrs.data(), b_in.data(), b.n_splits, + a.n_bins, static_cast(n_tips), + scratch); +} diff --git a/src/tree_distances.h b/src/tree_distances.h index f5ba93e7..8d4f7b4f 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -2,18 +2,25 @@ #define _TREEDIST_TREE_DISTANCES_H #include "lap.h" +#include #include -/***** Constants requiring initialization *****/ +/***** Re-export shared lookup tables to global scope *****/ + +using TreeDist::lg2; +using TreeDist::lg2_double_factorial; +using TreeDist::lg2_unrooted; +using TreeDist::lg2_rooted; + +/***** Constants *****/ constexpr splitbit ALL_ONES = (std::numeric_limits::max)(); -extern double lg2[SL_MAX_TIPS + 1]; -extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -extern double lg2_unrooted[SL_MAX_TIPS + 2]; -extern double *lg2_rooted; namespace TreeDist { + // Re-exported from mutual_clustering.h: + // ic_matching(int16 a, int16 b, int16 n) + // See equation 16 in Meila 2007 (k' denoted K). // nkK is converted to pkK in the calling function when divided by n. 
inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk, @@ -52,14 +59,6 @@ namespace TreeDist { } -[[nodiscard]] inline double ic_matching(const int16 a, const int16 b, const int16 n) noexcept { - const double lg2a = lg2[a]; - const double lg2b = lg2[b]; - const double lg2n = lg2[n]; - return (a + b) * lg2n - a * lg2a - b * lg2b; - // (a * (lg2n - lg2a)) + (b * (lg2n - lg2b)); is substantially slower - } - [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok if (a == b) { diff --git a/tests/testthat/test-mci_impl.R b/tests/testthat/test-mci_impl.R new file mode 100644 index 00000000..58d72da2 --- /dev/null +++ b/tests/testthat/test-mci_impl.R @@ -0,0 +1,53 @@ +test_that("Header mutual_clustering_score matches MutualClusteringInfo", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + + bal8 <- BalancedTree(8) + pec8 <- PectinateTree(8) + star8 <- StarTree(8) + + tips <- TipLabels(bal8) + n_tip <- length(tips) + splits_bal <- as.Splits(bal8, tips) + splits_pec <- as.Splits(pec8, tips) + splits_star <- as.Splits(star8, tips) + + # Score-only from the installable-header implementation + impl_score <- TreeDist:::cpp_mci_impl_score + + impl_bal_pec <- impl_score(splits_bal, splits_pec, n_tip) + impl_bal_bal <- impl_score(splits_bal, splits_bal, n_tip) + impl_star <- impl_score(splits_bal, splits_star, n_tip) + + # Reference from MutualClusteringInfo (unnormalized score) + ref_bal_pec <- MutualClusteringInfo(bal8, pec8) + ref_bal_bal <- MutualClusteringInfo(bal8, bal8) + + expect_equal(impl_bal_pec, ref_bal_pec, tolerance = 1e-10) + expect_equal(impl_bal_bal, ref_bal_bal, tolerance = 1e-10) + expect_equal(impl_star, 0) +}) + +test_that("Header MCI covers exact-match early exit and partial LAP", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + impl_score <- 
TreeDist:::cpp_mci_impl_score + + # Two identical trees → all splits match exactly (early exit path) + bal20 <- BalancedTree(20) + tips <- TipLabels(bal20) + n_tip <- length(tips) + splits20 <- as.Splits(bal20, tips) + + result <- impl_score(splits20, splits20, n_tip) + expect_equal(result, MutualClusteringInfo(bal20, bal20), tolerance = 1e-10) + + # Trees that share some but not all splits → partial match + LAP + pec20 <- PectinateTree(20) + splits_pec20 <- as.Splits(pec20, tips) + + result2 <- impl_score(splits20, splits_pec20, n_tip) + expect_equal(result2, MutualClusteringInfo(bal20, pec20), tolerance = 1e-10) +})