diff --git a/.github/workflows/ASan.yml b/.github/workflows/ASan.yml index 90a74a66..7c0b0a95 100644 --- a/.github/workflows/ASan.yml +++ b/.github/workflows/ASan.yml @@ -55,8 +55,9 @@ jobs: run: | export LD_PRELOAD=$(gcc -print-file-name=libasan.so) - echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" > src/Makevars - echo "PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars + # Replace PKG_CXXFLAGS in-place; preserve PKG_CPPFLAGS and PKG_LIBS + sed -i 's/^PKG_CXXFLAGS.*/PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer/' src/Makevars + echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars mkdir ~/.R echo "LDFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> ~/.R/Makevars diff --git a/.positai/settings.json b/.positai/settings.json index 627e8f7f..acec4998 100644 --- a/.positai/settings.json +++ b/.positai/settings.json @@ -30,8 +30,15 @@ "wc *": "allow" }, "external_directory": { + "C:/Users/pjjg18/GitHub/TreeSearch-a/*": "allow", + "C:/Users/pjjg18/.positai/skills/r-package-profiling/references/*": "allow", "../Quartet/*": "allow", "../TreeTools/*": "allow" + }, + "webfetch": { + "https://github.com/*": "allow", + "https://api.github.com/*": "allow", + "https://raw.githubusercontent.com/*": "allow" } }, "model": { diff --git a/AGENTS.md b/AGENTS.md index 237aa42b..6d81e883 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -1177,6 +1177,52 @@ for the cross-pairs case. 
--- +## LinkingTo Header Exposure (`expose-lapjv` branch) + +Extracted LAP and MCI C++ APIs into `inst/include/TreeDist/` headers so that +downstream packages (e.g., TreeSearch) can use `LinkingTo: TreeDist`: + +| Header | Content | +|--------|---------| +| `types.h` | `cost`, `lap_dim`, `lap_row`, `lap_col`, constants | +| `cost_matrix.h` | `CostMatrix` class (Rcpp-free) | +| `lap_scratch.h` | `LapScratch` struct | +| `lap.h` | `lap()` declarations | +| `lap_impl.h` | `lap()` implementation (include in exactly one TU) | +| `mutual_clustering.h` | MCI declarations | +| `mutual_clustering_impl.h` | MCI implementation (include in exactly one TU) | + +`src/lap.h` is now a thin wrapper that includes `<TreeDist/lap.h>` and +re-exports types to global scope. + +### LAPJV codegen regression (diagnosed, partially mitigated) + +Including `lap_impl.h` in `lap.cpp` changed the TU context enough for GCC 14's +register allocator to produce ~8% more instructions in the Dijkstra hot loop, +causing a consistent 20–25% regression on standalone LAPJV (n ≥ 400). + +**Root cause:** the installed-header version of `CostMatrix` (in +`inst/include/TreeDist/cost_matrix.h`) has a different method set than main's +monolithic `src/lap.h` (extra methods like `setWithTranspose()`, `dim8()`; +missing test variants like `findRowSubminNaive`). This changes GCC's +optimization heuristics for the entire TU, even though `lap()` never calls +the differing methods. + +**Fix:** define `lap()` directly in `lap.cpp` (not via `#include <TreeDist/lap_impl.h>`) +with `__attribute__((optimize("align-functions=64", "align-loops=16")))`. +The `lapjv()` wrapper fills the transposed buffer first (matching R's +column-major storage) then untransposes — restoring the cache-friendly pattern. + +**Residual:** ~5–9% on LAPJV 1999×1999 vs main (from the CostMatrix class +definition difference). Tree distance metrics are unaffected — they call +`lap()` from `pairwise_distances.cpp`, whose TU context is unchanged. 
+ +**Maintenance note:** if the `lap()` algorithm changes, update BOTH `src/lap.cpp` +and `inst/include/TreeDist/lap_impl.h`. The duplication is intentional — it +preserves the TU context that was profiled and tuned. + +--- + ## Remaining Optimization Opportunities - Sort+merge pre-scan for `rf_info_score`: **DONE** — replaced O(n²) loop with diff --git a/NEWS.md b/NEWS.md index 17785cba..082fec34 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,14 @@ -# Branch: `transfer-consensus` → merge target: `master` +# Branch: `expose-lapjv` → merge target: `master` + +This branch exposes the LAPJV (Jonker–Volgenant) linear assignment solver +via C++ headers so that downstream packages can link to it (`LinkingTo`), +and implements a transfer consensus method (`TransferConsensus()`, +`TransferDist()`). It also tightens error handling by moving all +`Rcpp::stop()` / `Rf_error()` calls out of C++ implementation code +into R-level input validation. + +# TreeDist 2.13.0.9000 This branch implements transfer consensus trees (`TransferConsensus()`) and a corresponding transfer distance metric (`TransferDist()`), providing a diff --git a/R/RcppExports.R b/R/RcppExports.R index a907e27f..9e4d02b2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -267,6 +267,10 @@ cpp_transfer_dist_cross_pairs <- function(splits_a, splits_b, n_tip, scale, n_th .Call(`_TreeDist_cpp_transfer_dist_cross_pairs`, splits_a, splits_b, n_tip, scale, n_threads) } +cpp_mci_impl_score <- function(x, y, n_tips) { + .Call(`_TreeDist_cpp_mci_impl_score`, x, y, n_tips) +} + cpp_robinson_foulds_distance <- function(x, y, nTip) { .Call(`_TreeDist_cpp_robinson_foulds_distance`, x, y, nTip) } diff --git a/inst/include/TreeDist/cost_matrix.h b/inst/include/TreeDist/cost_matrix.h new file mode 100644 index 00000000..fe4dddcf --- /dev/null +++ b/inst/include/TreeDist/cost_matrix.h @@ -0,0 +1,276 @@ +#ifndef TREEDIST_COST_MATRIX_H_ +#define TREEDIST_COST_MATRIX_H_ + +// TreeDist CostMatrix — cache-aligned, block-padded 
cost matrix for the LAP. +// +// Rcpp-free public API. The Rcpp-dependent constructor +// (CostMatrix(Rcpp::NumericMatrix)) lives in TreeDist's own src/lap.h. + +#include "types.h" + +#include +#include +#include +#include + +namespace TreeDist { + +class CostMatrix { +private: + std::size_t dim_; + std::size_t dim8_; + alignas(64) std::vector data_; + alignas(64) std::vector t_data_; + bool transposed_; + + static std::size_t block_containing(const std::size_t x) { + return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; + } + +public: + // Default constructor for pooled instances (zero-size, no allocation). + CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} + + explicit CostMatrix(std::size_t dim) + : dim_(dim), + dim8_(block_containing(dim_)), + data_(std::vector(dim8_ * dim8_)), + t_data_(std::vector(dim8_ * dim8_)), + transposed_(false) {} + + // Resize for reuse. Only reallocates when the new dimension exceeds the + // current buffer capacity; otherwise just updates dim_/dim8_ and marks + // the transpose as stale. 
+ void resize(std::size_t new_dim) { + dim_ = new_dim; + dim8_ = block_containing(new_dim); + const std::size_t needed = dim8_ * dim8_; + if (data_.size() < needed) { + data_.resize(needed); + t_data_.resize(needed); + } + transposed_ = false; + } + + void reset() noexcept { transposed_ = false; } + + [[nodiscard]] std::size_t dim() const noexcept { return dim_; } + [[nodiscard]] std::size_t dim8() const noexcept { return dim8_; } + + // ---- Element access ---- + + cost& operator()(lap_row i, lap_col j) { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { + return data_[static_cast(i) * dim8_ + j]; + } + + [[nodiscard]] const cost& row0(lap_col j) const { + return data_[j]; + } + + [[nodiscard]] const cost& entry0(lap_row i) const { + return data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* row(lap_row i) { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* row(lap_row i) const { + return &data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] cost* col(lap_col i) { + return &t_data_[static_cast(i) * dim8_]; + } + + [[nodiscard]] const cost* col(lap_col i) const { + return &t_data_[static_cast(i) * dim8_]; + } + + // Write a value to both the data and transposed-data arrays. + // After filling the entire matrix this way, call markTransposed() + // to skip the lazy transpose in makeTranspose(). 
+ void setWithTranspose(lap_row i, lap_col j, cost value) { + data_[static_cast(i) * dim8_ + j] = value; + t_data_[static_cast(j) * dim8_ + i] = value; + } + + void markTransposed() noexcept { transposed_ = true; } + + // ---- Transpose ---- + + void makeTranspose() noexcept { + if (transposed_) return; + + const cost* __restrict__ data_ptr = data_.data(); + cost* __restrict__ t_data_ptr = t_data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + transposed_ = true; + } + + void makeUntranspose() noexcept { + const cost* __restrict__ data_ptr = t_data_.data(); + cost* __restrict__ t_data_ptr = data_.data(); + +#if defined(__GNUC__) || defined(__clang__) + data_ptr = + static_cast(__builtin_assume_aligned(data_ptr, 64)); + t_data_ptr = + static_cast(__builtin_assume_aligned(t_data_ptr, 64)); +#endif + for (std::size_t i = 0; i < dim_; i += BLOCK_SIZE) { + for (std::size_t j = 0; j < dim_; j += BLOCK_SIZE) { + for (std::size_t r = i; + r < std::min(i + BLOCK_SIZE, dim_); ++r) { + for (std::size_t c = j; + c < std::min(j + BLOCK_SIZE, dim_); ++c) { + t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; + } + } + } + } + } + + // ---- Padding ---- + + void padTrAfterCol(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + std::size_t end_index = dim_ * dim8_; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padAfterRow(lap_row start_row, cost value) { + std::size_t start_index = static_cast(start_row) * dim8_; + 
std::size_t end_index = dim_ * dim8_; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + void padTrColAfterRow(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(t_data_.begin() + start_index, + t_data_.begin() + end_index, value); + } + + void padRowAfterCol(const lap_row r, const lap_col start_col, + const cost value) { + std::size_t r_offset = r * dim8_; + std::size_t actual_start_col = static_cast(start_col); + std::size_t start_index = r_offset + actual_start_col; + std::size_t end_index = start_index + dim_ - actual_start_col; + std::fill(data_.begin() + start_index, + data_.begin() + end_index, value); + } + + // ---- Search ---- + + std::pair findColMin(lap_col j, + lap_row search_dim = -1) { + makeTranspose(); + const cost* col_data = col(j); + const lap_row n = + (search_dim < 0) ? static_cast(dim_) : search_dim; + const auto min_ptr = std::min_element(col_data, col_data + n); + return {*min_ptr, + static_cast(std::distance(col_data, min_ptr))}; + } + + std::tuple findRowSubmin( + const lap_row* i, const std::vector& v + ) const { + assert(dim_ > 1); + + const cost* __restrict__ row_i = row(*i); + const lap_col dim = static_cast(dim_); + const cost* __restrict__ v_ptr = v.data(); + + const cost h0 = row_i[0] - v_ptr[0]; + const cost h1 = row_i[1] - v_ptr[1]; + + cost min_val, submin_val; + lap_col min_idx, submin_idx; + + if (h0 > h1) { + min_val = h1; submin_val = h0; + min_idx = 1; submin_idx = 0; + } else { + min_val = h0; submin_val = h1; + min_idx = 0; submin_idx = 1; + } + + const lap_col j_limit = + (dim < 4 ? 
0 : static_cast(dim - 3)); + + for (lap_col j = 2; j < j_limit; j += 4) { + assert(BLOCK_SIZE >= 4); + const cost h0 = row_i[j] - v_ptr[j]; + const cost h1 = row_i[j + 1] - v_ptr[j + 1]; + const cost h2 = row_i[j + 2] - v_ptr[j + 2]; + const cost h3 = row_i[j + 3] - v_ptr[j + 3]; + if (h0 < submin_val) { + if (h0 < min_val) { + submin_val = min_val; min_val = h0; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h0; submin_idx = j; } + } + if (h1 < submin_val) { + if (h1 < min_val) { + submin_val = min_val; min_val = h1; + submin_idx = min_idx; min_idx = j + 1; + } else { submin_val = h1; submin_idx = j + 1; } + } + if (h2 < submin_val) { + if (h2 < min_val) { + submin_val = min_val; min_val = h2; + submin_idx = min_idx; min_idx = j + 2; + } else { submin_val = h2; submin_idx = j + 2; } + } + if (h3 < submin_val) { + if (h3 < min_val) { + submin_val = min_val; min_val = h3; + submin_idx = min_idx; min_idx = j + 3; + } else { submin_val = h3; submin_idx = j + 3; } + } + } + for (lap_col j = j_limit; j < dim; ++j) { + const cost h = row_i[j] - v_ptr[j]; + if (h < submin_val) { + if (h < min_val) { + submin_val = min_val; min_val = h; + submin_idx = min_idx; min_idx = j; + } else { submin_val = h; submin_idx = j; } + } + } + return {min_val, submin_val, min_idx, submin_idx}; + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_COST_MATRIX_H_ diff --git a/inst/include/TreeDist/lap.h b/inst/include/TreeDist/lap.h new file mode 100644 index 00000000..673cfa22 --- /dev/null +++ b/inst/include/TreeDist/lap.h @@ -0,0 +1,35 @@ +#ifndef TREEDIST_LAP_H_ +#define TREEDIST_LAP_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant declarations. +// +// Provides the lap() function signature. The implementation lives in +// lap_impl.h, guarded by TREEDIST_LAP_IMPLEMENTATION. Downstream +// packages should include lap_impl.h in exactly one translation unit +// with the guard defined. 
+ +#include "lap_scratch.h" + +namespace TreeDist { + + // Primary overload: caller supplies pre-allocated scratch. + extern cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt, + LapScratch& scratch); + + // Convenience overload: creates a temporary scratch. + inline cost lap(lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + bool allow_interrupt = true) { + LapScratch scratch; + return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); + } + +} // namespace TreeDist + +#endif // TREEDIST_LAP_H_ diff --git a/inst/include/TreeDist/lap_impl.h b/inst/include/TreeDist/lap_impl.h new file mode 100644 index 00000000..196c37f6 --- /dev/null +++ b/inst/include/TreeDist/lap_impl.h @@ -0,0 +1,254 @@ +#ifndef TREEDIST_LAP_IMPL_H_ +#define TREEDIST_LAP_IMPL_H_ + +// LAP (Linear Assignment Problem) — Jonker–Volgenant implementation. +// +// Guard this with #define TREEDIST_LAP_IMPLEMENTATION before including. +// Include in exactly one translation unit per package. +// +// Interrupt handling: define TREEDIST_CHECK_INTERRUPT() before including +// to enable user-interrupt checks (e.g., Rcpp::checkUserInterrupt()). +// If not defined, interrupt checks are silently skipped. +// +// Original algorithm: +// R. Jonker and A. Volgenant, "A Shortest Augmenting Path Algorithm +// for Dense and Sparse Linear Assignment Problems," Computing 38, +// 325-340, 1987. + +#ifdef TREEDIST_LAP_IMPLEMENTATION + +#include "lap.h" + +#ifndef TREEDIST_CHECK_INTERRUPT +#define TREEDIST_CHECK_INTERRUPT() ((void)0) +#define TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +namespace TreeDist { + +namespace detail { + inline bool nontrivially_less_than(cost a, cost b) noexcept { + return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; + } +} // namespace detail + +// The LAP hot loops are sensitive to instruction alignment — even unrelated +// changes to other code in the same translation unit can shift layout enough +// to cause measurable regressions. Force 64-byte function alignment and +// 16-byte loop alignment to stabilise performance across builds. +#if defined(__GNUC__) && !defined(__clang__) +__attribute__((optimize("align-functions=64", "align-loops=16"))) +#endif +cost lap(const lap_row dim, + CostMatrix& input_cost, + std::vector& rowsol, + std::vector& colsol, + const bool allow_interrupt, + LapScratch& scratch) +{ + lap_row num_free = 0; + scratch.ensure(dim); + auto& v = scratch.v; + auto& matches = scratch.matches; + std::fill(matches.begin(), matches.begin() + dim, 0); + const cost* __restrict__ v_ptr = v.data(); + + // COLUMN REDUCTION + for (lap_col j = dim; j--; ) { + const auto [min, imin] = input_cost.findColMin(j); + v[j] = min; + ++matches[imin]; + + if (matches[imin] == 1) { + rowsol[imin] = j; + colsol[j] = imin; + } else if (v_ptr[j] < v_ptr[rowsol[imin]]) { + const lap_col j1 = rowsol[imin]; + rowsol[imin] = j; + colsol[j] = imin; + colsol[j1] = -1; + } else { + colsol[j] = -1; + } + } + + // REDUCTION TRANSFER + auto& freeunassigned = scratch.freeunassigned; + + for (lap_row i = 0; i < dim; ++i) { + if (matches[i] == 0) { + freeunassigned[num_free++] = i; + } else if (matches[i] == 1) { + const lap_col j1 = rowsol[i]; + const cost* row_i = input_cost.row(i); + cost min_cost; + if (j1 == 0) { + min_cost = row_i[1] - v_ptr[1]; + for (lap_col j = 2; j < dim; ++j) { + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } else { + min_cost = row_i[0] - v_ptr[0]; + for (lap_col j = 1; j < dim; ++j) { + if (j == j1) continue; + const cost reduced_cost = row_i[j] - v_ptr[j]; + if (reduced_cost < min_cost) { + min_cost = reduced_cost; + } + } + } + v[j1] -= min_cost; + } + } + + // AUGMENTING ROW 
REDUCTION + auto& col_list = scratch.col_list; + int loopcnt = 0; + + do { + ++loopcnt; + + lap_row previous_num_free = num_free; + num_free = 0; + lap_row k = 0; + while (k < previous_num_free) { + const lap_row i = freeunassigned[k++]; + const auto [umin, usubmin, min_idx, j2] = + input_cost.findRowSubmin(&i, v); + lap_col j1 = min_idx; + + lap_row i0 = colsol[j1]; + const bool strictly_less = + detail::nontrivially_less_than(umin, usubmin); + if (strictly_less) { + v[j1] -= (usubmin - umin); + } else if (i0 > -1) { + j1 = j2; + i0 = colsol[j2]; + } + + rowsol[i] = j1; + colsol[j1] = i; + + if (i0 > -1) { + if (strictly_less) { + freeunassigned[--k] = i0; + if (allow_interrupt) TREEDIST_CHECK_INTERRUPT(); + } else { + freeunassigned[num_free++] = i0; + } + } + } + } while (loopcnt < 2); + + // AUGMENT SOLUTION for each free row. + auto& d = scratch.d; + auto& predecessor = scratch.predecessor; + + for (lap_row f = 0; f < num_free; ++f) { + bool unassignedfound = false; + lap_row free_row = freeunassigned[f]; + const cost* free_row_cost = input_cost.row(free_row); + lap_col endofpath = 0; + lap_col last = 0; + lap_row i; + lap_col j1; + + for (lap_col j = 0; j < dim; ++j) { + d[j] = free_row_cost[j] - v_ptr[j]; + predecessor[j] = free_row; + col_list[j] = j; + } + + cost min = 0; + lap_col low = 0; + lap_col up = 0; + + do { + if (up == low) { + last = low - 1; + min = d[col_list[up++]]; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = col_list[k]; + const cost h = d[j]; + if (h <= min) { + if (h < min) { + up = low; + min = h; + } + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + for (lap_dim k = low; k < up; ++k) { + if (colsol[col_list[k]] < 0) { + endofpath = col_list[k]; + unassignedfound = true; + break; + } + } + } + + if (!unassignedfound) { + j1 = col_list[low++]; + i = colsol[j1]; + const cost* row_i = input_cost.row(i); + const cost h = row_i[j1] - v_ptr[j1] - min; + + for (lap_dim k = up; k < dim; ++k) { + const lap_col j = 
col_list[k]; + cost v2 = row_i[j] - v_ptr[j] - h; + if (v2 < d[j]) { + predecessor[j] = i; + if (v2 == min) { + if (colsol[j] < 0) { + endofpath = j; + unassignedfound = true; + break; + } else { + col_list[k] = col_list[up]; + col_list[up++] = j; + } + } + d[j] = v2; + } + } + } + } while (!unassignedfound); + + for (lap_dim k = 0; k <= last; ++k) { + j1 = col_list[k]; + v[j1] += d[j1] - min; + } + + do { + i = predecessor[endofpath]; + colsol[endofpath] = i; + j1 = endofpath; + endofpath = rowsol[i]; + rowsol[i] = j1; + } while (i != free_row); + } + + // Calculate optimal cost. + cost lapcost = 0; + for (lap_dim i = 0; i < dim; ++i) { + lapcost += input_cost(i, rowsol[i]); + } + + return lapcost; +} + +} // namespace TreeDist + +#ifdef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#undef TREEDIST_CHECK_INTERRUPT +#undef TREEDIST_CHECK_INTERRUPT_DEFINED_HERE_ +#endif + +#endif // TREEDIST_LAP_IMPLEMENTATION +#endif // TREEDIST_LAP_IMPL_H_ diff --git a/inst/include/TreeDist/lap_scratch.h b/inst/include/TreeDist/lap_scratch.h new file mode 100644 index 00000000..4a9ca43b --- /dev/null +++ b/inst/include/TreeDist/lap_scratch.h @@ -0,0 +1,44 @@ +#ifndef TREEDIST_LAP_SCRATCH_H_ +#define TREEDIST_LAP_SCRATCH_H_ + +// Reusable heap storage for one thread's LAPJV workspace. +// +// Pass to the scratch-taking overload of lap() to eliminate per-call +// std::vector allocations. In a serial context construct once before the +// loop; in an OpenMP context allocate one per thread and index by +// omp_get_thread_num(). All vectors are grown lazily; never shrunk. 
+ +#include "cost_matrix.h" + +namespace TreeDist { + +struct LapScratch { + std::vector v; + std::vector matches; + std::vector freeunassigned; + std::vector col_list; + std::vector d; + std::vector predecessor; + std::vector rowsol; + std::vector colsol; + CostMatrix score_pool; + CostMatrix small_pool; + + void ensure(int dim) noexcept { + const int padded = static_cast( + ((static_cast(dim) + BLOCK_SIZE - 1) + / BLOCK_SIZE) * BLOCK_SIZE); + if (static_cast(v.size()) < padded) v.resize(padded); + if (static_cast(matches.size()) < dim) matches.resize(dim); + if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); + if (static_cast(col_list.size()) < dim) col_list.resize(dim); + if (static_cast(d.size()) < dim) d.resize(dim); + if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); + if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); + if (static_cast(colsol.size()) < dim) colsol.resize(dim); + } +}; + +} // namespace TreeDist + +#endif // TREEDIST_LAP_SCRATCH_H_ diff --git a/inst/include/TreeDist/mutual_clustering.h b/inst/include/TreeDist/mutual_clustering.h new file mode 100644 index 00000000..437633f0 --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering.h @@ -0,0 +1,87 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_H_ +#define TREEDIST_MUTUAL_CLUSTERING_H_ + +// Mutual Clustering Information (MCI) — public API. +// +// Provides the core MCI score computation between two split sets, +// clustering entropy, and lookup-table initialization. +// +// Implementation lives in mutual_clustering_impl.h, guarded by +// TREEDIST_MCI_IMPLEMENTATION. + +#include "lap.h" +#include // splitbit, SL_MAX_TIPS, count_bits +#include + +namespace TreeDist { + + // ---- Lookup tables (populated by init_lg2_tables) ---- + // + // lg2[i] = log2(i) for 0 <= i <= SL_MAX_TIPS + // lg2_double_factorial[i] = log2(i!!) for 0 <= i < 2*SL_MAX_TIPS - 2 + // lg2_unrooted[i] = log2((2i-5)!!) 
for i >= 3 + // lg2_rooted = &lg2_unrooted[0] + 1 (so lg2_rooted[i] = lg2_unrooted[i+1]) + // + // These are defined in mutual_clustering_impl.h. + + extern double lg2[SL_MAX_TIPS + 1]; + extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; + extern double lg2_unrooted[SL_MAX_TIPS + 2]; + extern double* lg2_rooted; + + // Populate lookup tables. Must be called once before any MCI + // computation. max_tips should be >= the largest tree size used. + void init_lg2_tables(int max_tips); + + // ---- Inline helpers ---- + + // Information content of a perfectly-matching split pair. + // ic_matching(a, b, n) = (a + b) * lg2[n] - a * lg2[a] - b * lg2[b] + [[nodiscard]] inline double ic_matching(int16 a, int16 b, + int16 n) noexcept { + const double lg2a = lg2[a]; + const double lg2b = lg2[b]; + const double lg2n = lg2[n]; + return (a + b) * lg2n - a * lg2a - b * lg2b; + } + + // Clustering entropy of a split set. + // Computes H_clust = -sum_i [ p_i * log2(p_i) + q_i * log2(q_i) ] + // where p_i = in_split[i] / n_tips and q_i = 1 - p_i. + [[nodiscard]] inline double clustering_entropy( + const int* in_split, int n_splits, int n_tips) { + if (n_tips <= 1 || n_splits == 0) return 0.0; + const double lg2n = std::log2(static_cast(n_tips)); + double ce = 0.0; + for (int i = 0; i < n_splits; ++i) { + const int a = in_split[i]; + const int b = n_tips - a; + if (a > 1 && b > 1) { + ce += (a * std::log2(static_cast(a)) + + b * std::log2(static_cast(b))) + / n_tips - lg2n; + } + } + return -ce; + } + + // ---- MCI score (declaration only) ---- + // + // Computes the Mutual Clustering Information between two split sets. + // Uses sort+merge exact-match detection + LAP on the reduced matrix. + // + // a_state[i] points to n_bins splitbit words for split i of tree A. + // a_in[i] = popcount of split i (tips in the "1" partition). + // Returns the MCI score (higher = more similar). + // + // Implementation in mutual_clustering_impl.h. 
+ + double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch); + +} // namespace TreeDist + +#endif // TREEDIST_MUTUAL_CLUSTERING_H_ diff --git a/inst/include/TreeDist/mutual_clustering_impl.h b/inst/include/TreeDist/mutual_clustering_impl.h new file mode 100644 index 00000000..7f98c8fb --- /dev/null +++ b/inst/include/TreeDist/mutual_clustering_impl.h @@ -0,0 +1,290 @@ +#ifndef TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ +#define TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ + +// Mutual Clustering Information — implementation. +// +// Guard with #define TREEDIST_MCI_IMPLEMENTATION before including. +// Include in exactly one translation unit per package (the same TU +// that defines TREEDIST_LAP_IMPLEMENTATION). +// +// Usage: +// #define TREEDIST_LAP_IMPLEMENTATION +// #define TREEDIST_MCI_IMPLEMENTATION +// #include +// #include + +#ifdef TREEDIST_MCI_IMPLEMENTATION + +#include "mutual_clustering.h" + +#include +#include +#include +#include + +namespace TreeDist { + +// ---- Table definitions ---- + +double lg2[SL_MAX_TIPS + 1]; +double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; +double lg2_unrooted[SL_MAX_TIPS + 2]; +double* lg2_rooted = &lg2_unrooted[0] + 1; + +void init_lg2_tables(int max_tips) { + lg2[0] = 0; + const int lg2_limit = std::min(max_tips + 1, + static_cast(SL_MAX_TIPS + 1)); + for (int i = 1; i < lg2_limit; ++i) { + lg2[i] = std::log2(static_cast(i)); + } + + for (int i = 0; i < 3; ++i) { + lg2_double_factorial[i] = 0; + lg2_unrooted[i] = 0; + } + + const int df_limit = std::min(2 * max_tips, + static_cast(SL_MAX_TIPS + SL_MAX_TIPS - 2)); + for (int i = 2; i < df_limit; ++i) { + lg2_double_factorial[i] = + lg2_double_factorial[i - 2] + std::log2(static_cast(i)); + } + + const int ur_limit = std::min(max_tips + 2, + static_cast(SL_MAX_TIPS + 2)); + for (int i = 3; i < ur_limit; ++i) { + 
lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; + } +} + +// ---- Sort+merge exact-match detection (internal) ---- +// +// Canonicalise each split so bit 0 is always set (flip complement if not), +// sort by canonical form, merge-scan. O(n log n). +// +// Writes a_match[ai] = bi+1 if matched, else 0. Likewise b_match. +// Returns number of exact matches. + +namespace detail { + +static int16 find_exact_matches_raw( + const splitbit* const* a_state, const int16* /*a_in*/, int16 a_n, + const splitbit* const* b_state, const int16* /*b_in*/, int16 b_n, + int16 n_bins, int32 n_tips, + int16* a_match, int16* b_match) +{ + std::fill(a_match, a_match + a_n, int16(0)); + std::fill(b_match, b_match + b_n, int16(0)); + if (a_n == 0 || b_n == 0) return 0; + + const int16 last_bin = n_bins - 1; + const splitbit last_mask = (n_tips % SL_BIN_SIZE == 0) + ? ~splitbit(0) + : (splitbit(1) << (n_tips % SL_BIN_SIZE)) - 1; + + // Flat buffers for canonical forms + std::vector a_canon(static_cast(a_n) * n_bins); + std::vector b_canon(static_cast(b_n) * n_bins); + + for (int16 i = 0; i < a_n; ++i) { + const bool flip = !(a_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? ~a_state[i][bin] : a_state[i][bin]; + if (bin == last_bin) val &= last_mask; + a_canon[i * n_bins + bin] = val; + } + } + for (int16 i = 0; i < b_n; ++i) { + const bool flip = !(b_state[i][0] & 1); + for (int16 bin = 0; bin < n_bins; ++bin) { + splitbit val = flip ? 
~b_state[i][bin] : b_state[i][bin]; + if (bin == last_bin) val &= last_mask; + b_canon[i * n_bins + bin] = val; + } + } + + // Sort index arrays by canonical form + auto canon_less = [&](const splitbit* canon, int16 n_b, int16 i, int16 j) { + for (int16 bin = 0; bin < n_b; ++bin) { + const splitbit vi = canon[i * n_b + bin]; + const splitbit vj = canon[j * n_b + bin]; + if (vi < vj) return true; + if (vi > vj) return false; + } + return false; + }; + + std::vector a_order(a_n), b_order(b_n); + std::iota(a_order.begin(), a_order.end(), int16(0)); + std::iota(b_order.begin(), b_order.end(), int16(0)); + + std::sort(a_order.begin(), a_order.end(), + [&](int16 i, int16 j) { + return canon_less(a_canon.data(), n_bins, i, j); + }); + std::sort(b_order.begin(), b_order.end(), + [&](int16 i, int16 j) { + return canon_less(b_canon.data(), n_bins, i, j); + }); + + // Merge-scan + int16 exact_n = 0; + int16 ai_pos = 0, bi_pos = 0; + while (ai_pos < a_n && bi_pos < b_n) { + const int16 ai = a_order[ai_pos]; + const int16 bi = b_order[bi_pos]; + + int cmp = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + const splitbit va = a_canon[ai * n_bins + bin]; + const splitbit vb = b_canon[bi * n_bins + bin]; + if (va < vb) { cmp = -1; break; } + if (va > vb) { cmp = 1; break; } + } + + if (cmp < 0) { + ++ai_pos; + } else if (cmp > 0) { + ++bi_pos; + } else { + a_match[ai] = bi + 1; + b_match[bi] = ai + 1; + ++exact_n; + ++ai_pos; + ++bi_pos; + } + } + + return exact_n; +} + +} // namespace detail + + +// ---- MCI score implementation ---- + +double mutual_clustering_score( + const splitbit* const* a_state, const int16* a_in, int16 a_n_splits, + const splitbit* const* b_state, const int16* b_in, int16 b_n_splits, + int16 n_bins, int32 n_tips, + LapScratch& scratch) +{ + if (a_n_splits == 0 || b_n_splits == 0 || n_tips == 0) return 0.0; + + const int16 most_splits = std::max(a_n_splits, b_n_splits); + const double n_tips_rcp = 1.0 / static_cast(n_tips); + + constexpr cost max_score = 
BIG; + constexpr double over_max = 1.0 / static_cast(BIG); + const double max_over_tips = static_cast(BIG) * n_tips_rcp; + const double lg2_n = lg2[n_tips]; + + // --- Phase 1: O(n log n) exact-match detection --- + std::vector a_match_buf(a_n_splits); + std::vector b_match_buf(b_n_splits); + + const int16 exact_n = detail::find_exact_matches_raw( + a_state, a_in, a_n_splits, + b_state, b_in, b_n_splits, + n_bins, n_tips, + a_match_buf.data(), b_match_buf.data()); + + const int16* a_match = a_match_buf.data(); + const int16* b_match = b_match_buf.data(); + + // Accumulate exact-match score + double exact_score = 0.0; + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (a_match[ai]) { + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + exact_score += ic_matching(na, nA, static_cast(n_tips)); + } + } + + // Early exit when everything matched exactly + if (exact_n == b_n_splits || exact_n == a_n_splits) { + return exact_score * n_tips_rcp; + } + + // --- Phase 2: fill cost matrix for unmatched splits only (O(k²)) --- + const int16 lap_n = most_splits - exact_n; + + std::vector a_unmatch, b_unmatch; + a_unmatch.reserve(lap_n); + b_unmatch.reserve(lap_n); + for (int16 ai = 0; ai < a_n_splits; ++ai) { + if (!a_match[ai]) a_unmatch.push_back(ai); + } + for (int16 bi = 0; bi < b_n_splits; ++bi) { + if (!b_match[bi]) b_unmatch.push_back(bi); + } + + scratch.score_pool.resize(lap_n); + CostMatrix& score = scratch.score_pool; + + const int16 a_unmatched_n = static_cast(a_unmatch.size()); + const int16 b_unmatched_n = static_cast(b_unmatch.size()); + + for (int16 a_pos = 0; a_pos < a_unmatched_n; ++a_pos) { + const int16 ai = a_unmatch[a_pos]; + const int16 na = a_in[ai]; + const int16 nA = static_cast(n_tips - na); + const splitbit* a_row = a_state[ai]; + + const double offset_a = lg2_n - lg2[na]; + const double offset_A = lg2_n - lg2[nA]; + + for (int16 b_pos = 0; b_pos < b_unmatched_n; ++b_pos) { + const int16 bi = b_unmatch[b_pos]; + const splitbit* 
b_row = b_state[bi]; + int16 a_and_b = 0; + for (int16 bin = 0; bin < n_bins; ++bin) { + a_and_b += TreeTools::count_bits(a_row[bin] & b_row[bin]); + } + + const int16 nb = b_in[bi]; + const int16 nB = static_cast(n_tips - nb); + const int16 a_and_B = na - a_and_b; + const int16 A_and_b = nb - a_and_b; + const int16 A_and_B = nA - A_and_b; + + if (a_and_b == A_and_b && a_and_b == a_and_B + && a_and_b == A_and_B) { + score(a_pos, b_pos) = max_score; + } else { + const double lg2_nb = lg2[nb]; + const double lg2_nB = lg2[nB]; + const double ic_sum = + a_and_b * (lg2[a_and_b] + offset_a - lg2_nb) + + a_and_B * (lg2[a_and_B] + offset_a - lg2_nB) + + A_and_b * (lg2[A_and_b] + offset_A - lg2_nb) + + A_and_B * (lg2[A_and_B] + offset_A - lg2_nB); + score(a_pos, b_pos) = + max_score - static_cast(ic_sum * max_over_tips); + } + } + if (b_unmatched_n < lap_n) { + score.padRowAfterCol(a_pos, b_unmatched_n, max_score); + } + } + if (a_unmatched_n < lap_n) { + score.padAfterRow(a_unmatched_n, max_score); + } + + // --- Phase 3: solve LAP on the reduced matrix --- + scratch.ensure(lap_n); + auto& rowsol = scratch.rowsol; + auto& colsol = scratch.colsol; + + const double lap_score = + static_cast((max_score * lap_n) - + lap(lap_n, score, rowsol, colsol, false, scratch)) * over_max; + return lap_score + exact_score * n_tips_rcp; +} + +} // namespace TreeDist + +#endif // TREEDIST_MCI_IMPLEMENTATION +#endif // TREEDIST_MUTUAL_CLUSTERING_IMPL_H_ diff --git a/inst/include/TreeDist/types.h b/inst/include/TreeDist/types.h new file mode 100644 index 00000000..c953989b --- /dev/null +++ b/inst/include/TreeDist/types.h @@ -0,0 +1,32 @@ +#ifndef TREEDIST_TYPES_H_ +#define TREEDIST_TYPES_H_ + +// TreeDist public type definitions. +// +// Rcpp-free: uses only standard headers plus TreeTools (for SL_MAX_SPLITS). 
+// Downstream packages include via: #include + +#include +#include +#include // SL_MAX_SPLITS, splitbit + +namespace TreeDist { + + using int16 = int_fast16_t; + using int32 = int_fast32_t; + using cost = int_fast64_t; + + using lap_dim = int; + using lap_row = lap_dim; + using lap_col = lap_dim; + + constexpr cost BIG = + (std::numeric_limits::max)() / SL_MAX_SPLITS; + + constexpr cost ROUND_PRECISION = 2048 * 2048; + + constexpr std::size_t BLOCK_SIZE = 16; + +} // namespace TreeDist + +#endif // TREEDIST_TYPES_H_ diff --git a/src/Makevars b/src/Makevars index 139b1d17..d3df534e 100644 --- a/src/Makevars +++ b/src/Makevars @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/Makevars.win b/src/Makevars.win index 139b1d17..d3df534e 100644 --- a/src/Makevars.win +++ b/src/Makevars.win @@ -1,2 +1,3 @@ +PKG_CPPFLAGS = -I../inst/include PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index b3d661be..72309c5b 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -611,6 +611,19 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cpp_mci_impl_score +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, const Rcpp::RawMatrix& y, int n_tips); +RcppExport SEXP _TreeDist_cpp_mci_impl_score(SEXP xSEXP, SEXP ySEXP, SEXP n_tipsSEXP) { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type x(xSEXP); + Rcpp::traits::input_parameter< const Rcpp::RawMatrix& >::type y(ySEXP); + Rcpp::traits::input_parameter< int >::type n_tips(n_tipsSEXP); + rcpp_result_gen = Rcpp::wrap(cpp_mci_impl_score(x, y, n_tips)); + return rcpp_result_gen; +END_RCPP +} // cpp_robinson_foulds_distance List cpp_robinson_foulds_distance(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); RcppExport SEXP 
_TreeDist_cpp_robinson_foulds_distance(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { @@ -753,6 +766,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_cpp_transfer_dist_scored", (DL_FUNC) &_TreeDist_cpp_transfer_dist_scored, 4}, {"_TreeDist_cpp_transfer_dist_all_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_all_pairs, 4}, {"_TreeDist_cpp_transfer_dist_cross_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_cross_pairs, 5}, + {"_TreeDist_cpp_mci_impl_score", (DL_FUNC) &_TreeDist_cpp_mci_impl_score, 3}, {"_TreeDist_cpp_robinson_foulds_distance", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_distance, 3}, {"_TreeDist_cpp_robinson_foulds_info", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_info, 3}, {"_TreeDist_cpp_matching_split_distance", (DL_FUNC) &_TreeDist_cpp_matching_split_distance, 3}, diff --git a/src/ints.h b/src/ints.h index a5af8774..50fd3cc7 100644 --- a/src/ints.h +++ b/src/ints.h @@ -1,10 +1,13 @@ #ifndef _TREEDIST_INT_H #define _TREEDIST_INT_H -#include +#include -using int16 = int_fast16_t; -using int32 = int_fast32_t; +// Re-export shared types to global scope for backward compatibility. +using TreeDist::int16; +using TreeDist::int32; + +// Types used only within TreeDist's own source. using uint32 = uint_fast32_t; using grf_match = std::vector; diff --git a/src/lap.cpp b/src/lap.cpp index 4963fa9e..e77c712f 100644 --- a/src/lap.cpp +++ b/src/lap.cpp @@ -26,6 +26,15 @@ https://www.mathworks.com/matlabcentral/fileexchange/26836-lapjv-jonker-volgenant-algorithm-for-linear-assignment-problem-v3-0: * *************************************************************************/ + +// NOTE: The LAP hot loops are highly sensitive to instruction alignment and +// register allocation, which are affected by the TU's full include graph. +// Do NOT include lap_impl.h here — that header is for downstream LinkingTo +// consumers only. TreeDist's own lap() is compiled directly in this file +// to preserve the codegen context that was profiled and tuned. 
+// +// If the algorithm changes, update BOTH this file and lap_impl.h. + #include "lap.h" #include @@ -37,11 +46,42 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { const lap_dim spare_rows = n_row - n_col; const cost max_score = cost(BIG / max_dim); const double x_max = maxX[0]; + const double scale_factor = max_score / x_max; std::vector rowsol(max_dim); std::vector colsol(max_dim); - cost_matrix input(x, x_max); + // Build cost matrix. Fill the transposed buffer first (matching R's + // column-major storage for sequential reads) then untranspose. + cost_matrix input(max_dim); + const double* __restrict__ src_data = REAL(x); + cost* __restrict__ t_ptr = input.col(0); + const std::size_t dim8 = input.dim8(); + + for (lap_col c = 0; c < n_col; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + const std::size_t s_off = static_cast(c) * n_row; + for (lap_row r = 0; r < n_row; ++r) { + t_ptr[t_off + r] = static_cast(src_data[s_off + r] * scale_factor); + } + // Pad remaining rows in this transposed column + for (lap_row r = n_row; r < max_dim; ++r) { + t_ptr[t_off + r] = max_score; + } + for (std::size_t r = max_dim; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + // Pad remaining transposed columns + for (lap_col c = n_col; c < max_dim; ++c) { + const std::size_t t_off = static_cast(c) * dim8; + for (std::size_t r = 0; r < dim8; ++r) { + t_ptr[t_off + r] = max_score; + } + } + + // Untranspose: t_data_ -> data_ + input.makeUntranspose(); cost score = lap(max_dim, input, rowsol, colsol); @@ -60,27 +100,22 @@ Rcpp::List lapjv(Rcpp::NumericMatrix &x, Rcpp::NumericVector &maxX) { ); } +namespace { inline bool nontrivially_less_than(cost a, cost b) noexcept { return a + ((a > ROUND_PRECISION) ? 
8 : 0) < b; } +} // anonymous namespace /* This function is the jv shortest augmenting path algorithm to solve the assignment problem */ +namespace TreeDist { + cost lap(const lap_row dim, - cost_matrix &input_cost, + CostMatrix &input_cost, std::vector &rowsol, std::vector &colsol, const bool allow_interrupt, LapScratch &scratch) - - // input: - // dim - problem size - // input_cost - cost matrix - - // output: - // rowsol - column assigned to row in solution - // colsol - row assigned to column in solution - { lap_row num_free = 0; scratch.ensure(dim); @@ -316,3 +351,4 @@ cost lap(const lap_row dim, return lapcost; } +} // namespace TreeDist diff --git a/src/lap.h b/src/lap.h index b1a22508..592ce9b2 100644 --- a/src/lap.h +++ b/src/lap.h @@ -9,456 +9,27 @@ // because the LAP hot loops are sensitive to instruction alignment — even // unrelated inline functions in the same translation unit can shift code // layout enough to cause measurable regressions. - -#include -#include - -#include /* for numeric_limits */ -#include /* for vector */ - -#include "ints.h" - -/*************** TYPES *******************/ - -using cost = int_fast64_t; -using lap_dim = int; -using lap_row = lap_dim; -using lap_col = lap_dim; - -/* For a reason I've not determined, shrinking BIG is necessary to avoid - * an infinite loop in lap. */ -constexpr cost BIG = (std::numeric_limits::max)() / SL_MAX_SPLITS; -constexpr cost ROUND_PRECISION = 2048 * 2048; -constexpr size_t BLOCK_SIZE = 16; - -class CostMatrix { -private: - size_t dim_; // Important not to use int16, which will overflow on * - size_t dim8_; - alignas(64) std::vector data_; - alignas(64) std::vector t_data_; - bool transposed_; - - static size_t block_containing(const size_t x) { - return ((x + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE; - } - -public: - // Default constructor for pooled instances (zero-size, no allocation). 
- CostMatrix() : dim_(0), dim8_(0), transposed_(false) {} - - CostMatrix(size_t dim) - : dim_(dim), - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim8_)), - t_data_(std::vector(dim8_ * dim8_)), - transposed_(false) {} - - // Resize for reuse. Only reallocates when the new dimension exceeds the - // current buffer capacity; otherwise just updates dim_/dim8_ and marks - // the transpose as stale. New elements (if any) are value-initialised. - void resize(size_t new_dim) { - dim_ = new_dim; - dim8_ = block_containing(new_dim); - const size_t needed = dim8_ * dim8_; - if (data_.size() < needed) { - data_.resize(needed); - t_data_.resize(needed); - } - transposed_ = false; - } - - CostMatrix(const Rcpp::NumericMatrix& src, const double x_max) - : dim_((std::max(src.nrow(), src.ncol()))), // or pad here as needed - dim8_(block_containing(dim_)), - data_(std::vector(dim8_ * dim_)), - t_data_(std::vector(dim8_ * dim_)), - transposed_(true) - { - // Compute scale factor - const cost max_score = cost(BIG / dim_); - double scale_factor = max_score / x_max; - - const lap_row n_row = src.nrow(); - const lap_col n_col = src.ncol(); - - const double* __restrict src_data = REAL(src); - cost* __restrict dest_data = t_data_.data(); - - for (lap_col c = 0; c < n_col; ++c) { - const size_t data_c = c * dim8_; - const size_t src_c = c * n_row; - for (lap_row r = 0; r < n_row; ++r) { - // Marginally faster than std::transform - dest_data[data_c + r] = static_cast(src_data[src_c + r] * scale_factor); - } - - padTrColAfterRow(c, n_row, max_score); // padding now goes after row `c` - } - - padTrAfterCol(n_col, max_score); - - makeUntranspose(); - } - - // Reset for reuse at the same dimension — clears the transpose cache so the - // next findColMin() re-builds it from fresh data. Call between pair - // computations when reusing a thread-local CostMatrix instance. 
- void reset() noexcept { transposed_ = false; } - - [[nodiscard]] size_t dim() const noexcept { return dim_; } - - void makeTranspose() noexcept { - if (transposed_) return; - - const cost* __restrict data_ptr = data_.data(); - cost* __restrict t_data_ptr = t_data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - - transposed_ = true; - } - - void makeUntranspose() noexcept { - const cost* __restrict data_ptr = t_data_.data(); - cost* __restrict t_data_ptr = data_.data(); - -#if defined(__GNUC__) || defined(__clang__) - data_ptr = static_cast(__builtin_assume_aligned(data_ptr, 64)); - t_data_ptr = static_cast(__builtin_assume_aligned(t_data_ptr, 64)); -#endif - for (size_t i = 0; i < dim_; i += BLOCK_SIZE) { - for (size_t j = 0; j < dim_; j += BLOCK_SIZE) { - for (size_t r = i; r < std::min(i + BLOCK_SIZE, dim_); ++r) { - for (size_t c = j; c < std::min(j + BLOCK_SIZE, dim_); ++c) { - t_data_ptr[c * dim8_ + r] = data_ptr[r * dim8_ + c]; - } - } - } - } - } - - - // Access operator for read/write - cost& operator()(lap_row i, lap_col j) { - return data_[static_cast(i) * dim8_ + j]; - } - - // Const version for read-only access - [[nodiscard]] const cost& operator()(lap_row i, lap_col j) const { - return data_[static_cast(i) * dim8_ + j]; - } - - [[nodiscard]] const cost& row0(lap_col j) const { - return data_[j]; - } - - [[nodiscard]] const cost& entry0(lap_row i) const { - return data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* row(lap_row i) { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const 
cost* row(lap_row i) const { - return &data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] cost* col(lap_col i) { - return &t_data_[static_cast(i) * dim8_]; - } - - [[nodiscard]] const cost* col(lap_col i) const { - return &t_data_[static_cast(i) * dim8_]; - } - - void padTrAfterCol(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padAfterRow(lap_row start_row, cost value) { - size_t start_index = static_cast(start_row) * dim8_; - size_t end_index = dim_ * dim8_; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - void padTrColAfterRow(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(t_data_.begin() + start_index, t_data_.begin() + end_index, value); - } - - void padRowAfterCol(const lap_row r, const lap_col start_col, - const cost value) { - size_t r_offset = r * dim8_; - size_t actual_start_col = static_cast(start_col); - size_t start_index = r_offset + actual_start_col; - size_t end_index = start_index + dim_ - actual_start_col; - std::fill(data_.begin() + start_index, data_.begin() + end_index, value); - } - - std::pair findColMin(lap_col j, - lap_row search_dim = -1) { - makeTranspose(); - const cost* col_data = col(j); - const lap_row n = (search_dim < 0) ? static_cast(dim_) : search_dim; - const auto min_ptr = std::min_element(col_data, col_data + n); - return {*min_ptr, - static_cast(std::distance(col_data, min_ptr))}; - } - - // test2000 = 119 ms (!) - // Find minimum and second minimum reduced cost over columns. 
- std::tuple findRowSubmin( - const lap_row* i, const std::vector& v - ) const { - assert(dim_ > 1); - - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - const cost* __restrict v_ptr = v.data(); - - const cost h0 = row_i[0] - v_ptr[0]; - const cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 > h1) { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } else { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } - - const lap_col j_limit = (dim < 4 ? 0 : static_cast(dim - 3)); - - for (lap_col j = 2; j < j_limit; j += 4) { - assert(BLOCK_SIZE >= 4); // Unrolling loop x4 gives ~20% speedup - const cost h0 = row_i[j] - v_ptr[j]; - const cost h1 = row_i[j + 1] - v_ptr[j + 1]; - const cost h2 = row_i[j + 2] - v_ptr[j + 2]; - const cost h3 = row_i[j + 3] - v_ptr[j + 3]; - if (h0 < submin_val) { - if (h0 < min_val) { - submin_val = min_val; - min_val = h0; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h0; - submin_idx = j; - } - } - if (h1 < submin_val) { - if (h1 < min_val) { - submin_val = min_val; - min_val = h1; - submin_idx = min_idx; - min_idx = j + 1; - } else { - submin_val = h1; - submin_idx = j + 1; - } - } - if (h2 < submin_val) { - if (h2 < min_val) { - submin_val = min_val; - min_val = h2; - submin_idx = min_idx; - min_idx = j + 2; - } else { - submin_val = h2; - submin_idx = j + 2; - } - } - if (h3 < submin_val) { - if (h3 < min_val) { - submin_val = min_val; - min_val = h3; - submin_idx = min_idx; - min_idx = j + 3; - } else { - submin_val = h3; - submin_idx = j + 3; - } - } - } - for (lap_col j = j_limit; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - if (h < min_val) { - submin_val = min_val; - min_val = h; - submin_idx = min_idx; - min_idx = j; - } else { - submin_val = h; - submin_idx = j; - } - } - } - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 260 ms - 
std::tuple findRowSubminNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - const lap_col dim = static_cast(dim_); - - if (dim < 2) { - return {row_i[0] - v_ptr[0], std::numeric_limits::max(), 0, 0}; - } - - // Initialize with first two elements - cost h0 = row_i[0] - v_ptr[0]; - cost h1 = row_i[1] - v_ptr[1]; - - cost min_val, submin_val; - lap_col min_idx, submin_idx; - - if (h0 <= h1) { - min_val = h0; submin_val = h1; - min_idx = 0; submin_idx = 1; - } else { - min_val = h1; submin_val = h0; - min_idx = 1; submin_idx = 0; - } - - // Process remaining elements - for (lap_col j = 2; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - - if (h < min_val) { - submin_val = min_val; - submin_idx = min_idx; - min_val = h; - min_idx = j; - } else if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } - - // test2000 = 370 ms (!) - std::tuple findRowSubminTwoPassNaive( - const lap_row* i, - const std::vector& v_ptr - ) { - const cost* __restrict row_i = row(*i); - cost min_val = std::numeric_limits::max(); - lap_col min_idx = 0; - const lap_col dim = static_cast(dim_); - - for (lap_col j = 0; j < dim; ++j) { - const cost h = row_i[j] - v_ptr[j]; - if (h < min_val) { - min_val = h; - min_idx = j; - } - } - - // Second pass: find subminimum - cost submin_val = std::numeric_limits::max(); - lap_col submin_idx = (min_idx == 0) ? 1 : 0; - - for (lap_col j = 0; j < dim; ++j) { - if (j != min_idx) { - const cost h = row_i[j] - v_ptr[j]; - if (h < submin_val) { - submin_val = h; - submin_idx = j; - } - } - } - - return {min_val, submin_val, min_idx, submin_idx}; - } -}; - -using cost_matrix = CostMatrix; - -// --------------------------------------------------------------------------- -// LapScratch — reusable heap storage for one thread's LAPJV workspace. 
// -// Pass to the scratch-taking overload of lap() to eliminate the ~6 per-call -// std::vector allocations. In a serial context construct once before the -// loop; in an OpenMP context allocate one per thread and index by -// omp_get_thread_num(). All vectors are grown lazily; never shrunk. -// --------------------------------------------------------------------------- -struct LapScratch { - std::vector v; // column dual variables (padded) - std::vector matches; // assignment-count per row - std::vector freeunassigned; // list of unassigned rows - std::vector col_list; // column scan list - std::vector d; // Dijkstra distances - std::vector predecessor; // augmenting-path predecessors - // rowsol / colsol are included so score functions that don't need the - // solution afterwards can avoid a separate allocation. - std::vector rowsol; - std::vector colsol; - // Pooled CostMatrix instances — reused across pairs to avoid per-call - // heap allocation. Grown lazily on first use, never shrunk. - CostMatrix score_pool; // main cost matrix - CostMatrix small_pool; // reduced matrix (exact-match path) - - void ensure(int dim) noexcept { - const int padded = static_cast( - ((static_cast(dim) + BLOCK_SIZE - 1) / BLOCK_SIZE) * BLOCK_SIZE); - if (static_cast(v.size()) < padded) v.resize(padded); - if (static_cast(matches.size()) < dim) matches.resize(dim); - if (static_cast(freeunassigned.size()) < dim) freeunassigned.resize(dim); - if (static_cast(col_list.size()) < dim) col_list.resize(dim); - if (static_cast(d.size()) < dim) d.resize(dim); - if (static_cast(predecessor.size()) < dim) predecessor.resize(dim); - if (static_cast(rowsol.size()) < dim) rowsol.resize(dim); - if (static_cast(colsol.size()) < dim) colsol.resize(dim); - } -}; - -/*************** FUNCTIONS *******************/ - -// Primary overload: caller supplies pre-allocated scratch (zero alloc in hot loop). 
-extern cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt, - LapScratch &scratch); - -// Convenience overload: creates a temporary scratch (for one-off calls). -inline cost lap(lap_row dim, - cost_matrix &input_cost, - std::vector &rowsol, - std::vector &colsol, - bool allow_interrupt = true) { - LapScratch scratch; - return lap(dim, input_cost, rowsol, colsol, allow_interrupt, scratch); -} +// Public types, CostMatrix, LapScratch, and lap() declarations live in +// inst/include/TreeDist/ headers (shared with downstream LinkingTo users). +// This wrapper re-exports them to global scope for backward compatibility. + +#include // types, CostMatrix, LapScratch, lap() + +#include "ints.h" // grf_match, uint32 (TreeDist-internal types) + +// Re-export shared types and classes to global scope. +using TreeDist::cost; +using TreeDist::lap_dim; +using TreeDist::lap_row; +using TreeDist::lap_col; +using TreeDist::BIG; +using TreeDist::ROUND_PRECISION; +using TreeDist::BLOCK_SIZE; +using TreeDist::CostMatrix; +using TreeDist::LapScratch; +using TreeDist::lap; + +using cost_matrix = TreeDist::CostMatrix; #endif diff --git a/src/tree_distance_functions.cpp b/src/tree_distance_functions.cpp index 4a3b77e1..99347300 100644 --- a/src/tree_distance_functions.cpp +++ b/src/tree_distance_functions.cpp @@ -1,35 +1,47 @@ +#include #include -#include /* for SL_MAX_TIPS */ -#include /* for log2() */ +// Provide the MCI table definitions and implementation in this TU. +#define TREEDIST_MCI_IMPLEMENTATION +#include #include "tree_distances.h" -using namespace Rcpp; - -constexpr int32 LG2_SIZE = SL_MAX_TIPS + 1; - -double lg2[LG2_SIZE]; -double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -double lg2_unrooted[SL_MAX_TIPS + 2]; -double *lg2_rooted = &lg2_unrooted[0] + 1; +// Populate lookup tables at library load time. 
__attribute__((constructor)) void initialize_ldf() { - lg2[0] = 0; - for (int32 i = 1; i != LG2_SIZE; ++i) { - lg2[i] = log2(i); - } - for (int16 i = 0; i != 3; ++i) { - lg2_double_factorial[i] = 0; - lg2_unrooted[i] = 0; - } - assert(lg2_rooted[0] == 0); - assert(lg2_rooted[1] == 0); - for (int32 i = 2; i != SL_MAX_TIPS + SL_MAX_TIPS - 2; ++i) { - lg2_double_factorial[i] = lg2_double_factorial[i - 2] + log2(i); - } - for (int32 i = 3; i != SL_MAX_TIPS + 2; ++i) { - lg2_unrooted[i] = lg2_double_factorial[i + i - 5]; - assert(lg2_rooted[i - 1] == lg2_double_factorial[i + i - 5]); - } + TreeDist::init_lg2_tables(SL_MAX_TIPS); + } + +// Thin wrapper that exercises the installable-header version of +// mutual_clustering_score(), for test coverage and downstream validation. +// [[Rcpp::export]] +double cpp_mci_impl_score(const Rcpp::RawMatrix& x, + const Rcpp::RawMatrix& y, + int n_tips) { + using TreeTools::SplitList; + + const SplitList a(x); + const SplitList b(y); + TreeDist::LapScratch scratch; + + // Build arrays matching the header's raw-pointer API types. 
+ std::vector a_ptrs(a.n_splits); + std::vector b_ptrs(b.n_splits); + std::vector a_in(a.n_splits); + std::vector b_in(b.n_splits); + for (TreeDist::int16 i = 0; i < a.n_splits; ++i) { + a_ptrs[i] = a.state[i]; + a_in[i] = static_cast(a.in_split[i]); } + for (TreeDist::int16 i = 0; i < b.n_splits; ++i) { + b_ptrs[i] = b.state[i]; + b_in[i] = static_cast(b.in_split[i]); + } + + return TreeDist::mutual_clustering_score( + a_ptrs.data(), a_in.data(), a.n_splits, + b_ptrs.data(), b_in.data(), b.n_splits, + a.n_bins, static_cast(n_tips), + scratch); +} diff --git a/src/tree_distances.h b/src/tree_distances.h index f5ba93e7..8d4f7b4f 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -2,18 +2,25 @@ #define _TREEDIST_TREE_DISTANCES_H #include "lap.h" +#include #include -/***** Constants requiring initialization *****/ +/***** Re-export shared lookup tables to global scope *****/ + +using TreeDist::lg2; +using TreeDist::lg2_double_factorial; +using TreeDist::lg2_unrooted; +using TreeDist::lg2_rooted; + +/***** Constants *****/ constexpr splitbit ALL_ONES = (std::numeric_limits::max)(); -extern double lg2[SL_MAX_TIPS + 1]; -extern double lg2_double_factorial[SL_MAX_TIPS + SL_MAX_TIPS - 2]; -extern double lg2_unrooted[SL_MAX_TIPS + 2]; -extern double *lg2_rooted; namespace TreeDist { + // Re-exported from mutual_clustering.h: + // ic_matching(int16 a, int16 b, int16 n) + // See equation 16 in Meila 2007 (k' denoted K). // nkK is converted to pkK in the calling function when divided by n. 
inline void add_ic_element(double& ic_sum, const int16 nkK, const int16 nk, @@ -52,14 +59,6 @@ namespace TreeDist { } -[[nodiscard]] inline double ic_matching(const int16 a, const int16 b, const int16 n) noexcept { - const double lg2a = lg2[a]; - const double lg2b = lg2[b]; - const double lg2n = lg2[n]; - return (a + b) * lg2n - a * lg2a - b * lg2b; - // (a * (lg2n - lg2a)) + (b * (lg2n - lg2b)); is substantially slower - } - [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok if (a == b) { diff --git a/tests/testthat/test-mci_impl.R b/tests/testthat/test-mci_impl.R new file mode 100644 index 00000000..58d72da2 --- /dev/null +++ b/tests/testthat/test-mci_impl.R @@ -0,0 +1,53 @@ +test_that("Header mutual_clustering_score matches MutualClusteringInfo", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + + bal8 <- BalancedTree(8) + pec8 <- PectinateTree(8) + star8 <- StarTree(8) + + tips <- TipLabels(bal8) + n_tip <- length(tips) + splits_bal <- as.Splits(bal8, tips) + splits_pec <- as.Splits(pec8, tips) + splits_star <- as.Splits(star8, tips) + + # Score-only from the installable-header implementation + impl_score <- TreeDist:::cpp_mci_impl_score + + impl_bal_pec <- impl_score(splits_bal, splits_pec, n_tip) + impl_bal_bal <- impl_score(splits_bal, splits_bal, n_tip) + impl_star <- impl_score(splits_bal, splits_star, n_tip) + + # Reference from MutualClusteringInfo (unnormalized score) + ref_bal_pec <- MutualClusteringInfo(bal8, pec8) + ref_bal_bal <- MutualClusteringInfo(bal8, bal8) + + expect_equal(impl_bal_pec, ref_bal_pec, tolerance = 1e-10) + expect_equal(impl_bal_bal, ref_bal_bal, tolerance = 1e-10) + expect_equal(impl_star, 0) +}) + +test_that("Header MCI covers exact-match early exit and partial LAP", { + skip_if_not_installed("TreeDist") + library(TreeTools) + library(TreeDist) + impl_score <- 
TreeDist:::cpp_mci_impl_score + + # Two identical trees → all splits match exactly (early exit path) + bal20 <- BalancedTree(20) + tips <- TipLabels(bal20) + n_tip <- length(tips) + splits20 <- as.Splits(bal20, tips) + + result <- impl_score(splits20, splits20, n_tip) + expect_equal(result, MutualClusteringInfo(bal20, bal20), tolerance = 1e-10) + + # Trees that share some but not all splits → partial match + LAP + pec20 <- PectinateTree(20) + splits_pec20 <- as.Splits(pec20, tips) + + result2 <- impl_score(splits20, splits_pec20, n_tip) + expect_equal(result2, MutualClusteringInfo(bal20, pec20), tolerance = 1e-10) +})