From 18e3a52a27e9c175eaec2c7101e357ecbf53e8d0 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 1 Apr 2026 14:08:49 +0100 Subject: [PATCH 1/6] Replace b_complement stack arrays with std::vector for large-tree support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The two splitbit b_complement[SL_MAX_SPLITS][SL_MAX_BINS] stack arrays in robinson_foulds_distance() and robinson_foulds_info() would overflow the stack when compiled against TreeTools with SL_MAX_TIPS = 32768 (128 GB). Replace with std::vector sized to actual dimensions (b.n_splits * n_bins). These are serial per-pair paths (reportMatching = TRUE), so heap allocation cost is negligible. Also upgrade assert() to static_assert() in tree_distances.h for the int16 width checks — these now fire at compile time rather than silently passing in release builds. --- src/tree_distances.cpp | 31 ++++++++++++++++++------------- src/tree_distances.h | 12 ++++++++---- 2 files changed, 26 insertions(+), 17 deletions(-) diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index f4b4ce85..67094e64 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -51,12 +51,14 @@ inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, grf_match matching(a.n_splits, NA_INTEGER); - splitbit b_complement[SL_MAX_SPLITS][SL_MAX_BINS]; + const int32 n_bins = a.n_bins; + std::vector b_complement(b.n_splits * n_bins); for (int32 i = b.n_splits; i--; ) { + splitbit* bc_i = &b_complement[i * n_bins]; for (int32 bin = last_bin; bin--; ) { - b_complement[i][bin] = ~b.state[i][bin]; + bc_i[bin] = ~b.state[i][bin]; } - b_complement[i][last_bin] = b.state[i][last_bin] ^ unset_mask; + bc_i[last_bin] = b.state[i][last_bin] ^ unset_mask; } for (int32 ai = a.n_splits; ai--; ) { @@ -64,16 +66,17 @@ inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, bool all_match = true; bool all_complement = true; + const splitbit* bc_bi = &b_complement[bi * n_bins]; - for (int32 bin = 0; bin < a.n_bins; ++bin) { + for (int32 bin = 0; bin < n_bins; ++bin) { if ((a.state[ai][bin] != b.state[bi][bin])) { all_match = false; break; } } if (!all_match) { - for (int32 bin = 0; bin < a.n_bins; ++bin) { - if (a.state[ai][bin] != b_complement[bi][bin]) { + for (int32 bin = 0; bin < n_bins; ++bin) { + if (a.state[ai][bin] != bc_bi[bin]) { all_complement = false; break; } @@ -105,29 +108,31 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, grf_match matching(a.n_splits, NA_INTEGER); - /* Dynamic allocation 20% faster for 105 tips, but VLA not permitted in C11 */ - splitbit b_complement[SL_MAX_SPLITS][SL_MAX_BINS]; + const int16 n_bins = a.n_bins; + std::vector b_complement(b.n_splits * n_bins); for (int16 i = 0; i < b.n_splits; i++) { + splitbit* bc_i = &b_complement[i * n_bins]; for (int16 bin = 0; bin < last_bin; ++bin) { - b_complement[i][bin] = ~b.state[i][bin]; + bc_i[bin] = ~b.state[i][bin]; } - b_complement[i][last_bin] = b.state[i][last_bin] ^ unset_mask; + bc_i[last_bin] = b.state[i][last_bin] ^ unset_mask; } for (int16 ai = 0; ai < a.n_splits; ++ai) { for (int16 bi = 0; bi < b.n_splits; ++bi) { bool all_match = true, all_complement = true; + const splitbit* bc_bi = &b_complement[bi * n_bins]; - for (int16 bin = 0; bin < a.n_bins; ++bin) { + for (int16 bin = 0; bin < n_bins; ++bin) { if ((a.state[ai][bin] != b.state[bi][bin])) { all_match = false; break; } } if (!all_match) { - for (int16 bin = 0; bin < a.n_bins; ++bin) { - if ((a.state[ai][bin] != b_complement[bi][bin])) { + for (int16 bin = 0; bin < n_bins; ++bin) { + if ((a.state[ai][bin] != bc_bi[bin])) { all_complement = false; break; } diff --git a/src/tree_distances.h b/src/tree_distances.h index 8d4f7b4f..12bcaee2 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -39,7 +39,8 @@ namespace TreeDist { // Returns lg2_unrooted[x] - lg2_trees_matching_split(y, x - y) [[nodiscard]] inline double mmsi_pair_score(const int16 x, const int16 y) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), + "int16 too narrow for SL_MAX_TIPS"); return lg2_unrooted[x] - (lg2_rooted[y] + lg2_rooted[x - y]); } @@ -60,7 +61,8 @@ namespace TreeDist { [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), + "int16 too narrow for SL_MAX_TIPS"); if (a == b) { return lg2_rooted[a] + lg2_rooted[n - a]; } @@ -71,7 +73,8 @@ namespace TreeDist { } [[nodiscard]] inline double one_overlap_notb(const int16 a, const int16 n_minus_b, const int16 n) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + static_assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max(), + "int16 too narrow for SL_MAX_TIPS"); const int16 b = n - n_minus_b; if (a == b) { return lg2_rooted[b] + lg2_rooted[n_minus_b]; @@ -90,7 +93,8 @@ namespace TreeDist { const int16 n_tips, const int16 in_a, const int16 in_b, const int16 n_bins) noexcept { - assert(SL_MAX_BINS <= INT16_MAX); + static_assert(SL_MAX_BINS <= INT16_MAX, + "int16 too narrow for SL_MAX_BINS"); int16 n_ab = 0; for (int16 bin = 0; bin < n_bins; ++bin) { From 2e83910e2189bd804c5a396f843b88346dab16cf Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:22:36 +0100 Subject: [PATCH 2/6] Bump version to 2.13.0.9003; NEWS.md: document large-tree support --- DESCRIPTION | 2 +- NEWS.md | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 18290d73..fcca11a5 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeDist Title: Calculate and Map Distances Between Phylogenetic Trees -Version: 2.13.0.9002 +Version: 2.13.0.9003 Authors@R: c(person("Martin R.", "Smith", email = "martin.smith@durham.ac.uk", role = c("aut", "cre", "cph", "prg"), diff --git a/NEWS.md b/NEWS.md index b3829196..feedfa22 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,4 @@ -# TreeDist 2.13.0.9002 +# TreeDist 2.13.0.9003 ## New features @@ -17,6 +17,13 @@ Information) C++ implementations are now exposed via `inst/include/TreeDist/` headers, allowing downstream packages to use `LinkingTo: TreeDist`. +## Internals + +- Stack-allocated split buffers replaced with dynamically-sized vectors, + removing a hard dependency on the compile-time `SL_MAX_SPLITS` constant. + TreeDist now supports trees of any size permitted by TreeTools, including + the forthcoming increase to 32 768 tips. + ## Performance - `RobinsonFoulds()` now uses a fast C++ batch path for cross-distance From 111629c6ff9a66e8c743d02da662806c92fed5fa Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:33:00 +0100 Subject: [PATCH 3/6] Add Rcpp::checkUserInterrupt() to serial cost-matrix-fill loops All 7 serial per-pair distance functions in tree_distances.cpp now check for user interrupt every 1024 iterations of the outer split loop. This allows Ctrl+C to break long-running single-pair computations on large trees (e.g. 25 000 tips) that previously ran uninterruptibly. The LAP solver already had interrupt support (allow_interrupt = true). --- src/tree_distances.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 67094e64..ced22fcf 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -62,6 +62,7 @@ inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, } for (int32 ai = a.n_splits; ai--; ) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); for (int32 bi = b.n_splits; bi--; ) { bool all_match = true; @@ -119,6 +120,7 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, } for (int16 ai = 0; ai < a.n_splits; ++ai) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); for (int16 bi = 0; bi < b.n_splits; ++bi) { bool all_match = true, all_complement = true; @@ -173,6 +175,7 @@ inline List matching_split_distance(const RawMatrix &x, const RawMatrix &y, cost_matrix score(most_splits); for (int16 ai = 0; ai < a.n_splits; ++ai) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); for (int16 bi = 0; bi < b.n_splits; ++bi) { splitbit total = 0; for (int16 bin = 0; bin < a.n_bins; ++bin) { @@ -232,6 +235,7 @@ inline List jaccard_similarity(const RawMatrix &x, const RawMatrix &y, cost_matrix score(most_splits); for (int16 ai = 0; ai < a.n_splits; ++ai) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); const int16 na = a.in_split[ai]; const int16 nA = n_tips - na; @@ -338,6 +342,7 @@ List msi_distance(const RawMatrix &x, const RawMatrix &y, const int32 n_tips) { splitbit different[SL_MAX_BINS]; for (int16 ai = 0; ai < a.n_splits; ++ai) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); for (int16 bi = 0; bi < b.n_splits; ++bi) { int16 n_different = 0, @@ -415,6 +420,7 @@ List mutual_clustering(const RawMatrix &x, const RawMatrix &y, std::unique_ptr b_match = std::make_unique(b.n_splits); for (int16 ai = 0; ai < a.n_splits; ++ai) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); if (a_match[ai]) continue; const int16 na = a.in_split[ai]; @@ -571,6 +577,7 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, cost_matrix score(most_splits); for (int16 ai = a.n_splits; ai--; ) { + if ((ai & 1023) == 0) Rcpp::checkUserInterrupt(); for (int16 bi = b.n_splits; bi--; ) { const double spi_over = TreeDist::spi_overlap( a.state[ai], b.state[bi], n_tips, a.in_split[ai], b.in_split[bi], From dfc2c1c83c431b26558cce1b85f946f912d552fa Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:34:51 +0100 Subject: [PATCH 4/6] Update NEWS.md --- NEWS.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index feedfa22..a051898a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -21,8 +21,7 @@ - Stack-allocated split buffers replaced with dynamically-sized vectors, removing a hard dependency on the compile-time `SL_MAX_SPLITS` constant. - TreeDist now supports trees of any size permitted by TreeTools, including - the forthcoming increase to 32 768 tips. + TreeDist now supports trees of any size permitted by TreeTools. ## Performance From 1f5dfdf3c710efb561013e3175ef5429193cff27 Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 1 Apr 2026 15:41:55 +0100 Subject: [PATCH 5/6] Update ASAN tests --- .github/workflows/memcheck.yml | 7 +++++-- memcheck/all.R | 4 ---- memcheck/tests.R | 5 ++--- memcheck/vignettes.R | 8 +++----- 4 files changed, 10 insertions(+), 14 deletions(-) delete mode 100644 memcheck/all.R diff --git a/.github/workflows/memcheck.yml b/.github/workflows/memcheck.yml index 2b9a3c8f..dbf58e22 100644 --- a/.github/workflows/memcheck.yml +++ b/.github/workflows/memcheck.yml @@ -6,6 +6,7 @@ on: branches: - main - master + - '**valgrind**' paths: - '.github/workflows/memcheck.yml' - 'src/**' @@ -30,7 +31,8 @@ name: mem-check jobs: mem-check: runs-on: ubuntu-24.04 - name: valgrind ${{ matrix.config.test }}, ubuntu, R release + + name: valgrind ${{ matrix.config.test }} strategy: fail-fast: false @@ -39,12 +41,13 @@ jobs: - {test: 'tests'} - {test: 'examples'} - {test: 'vignettes'} - + env: R_REMOTES_NO_ERRORS_FROM_WARNINGS: true _R_CHECK_FORCE_SUGGESTS_: false RSPM: https://packagemanager.rstudio.com/cran/__linux__/noble/latest GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + ASAN_OPTIONS: verify_asan_link_order=0 steps: - uses: ms609/actions/memcheck@main diff --git a/memcheck/all.R b/memcheck/all.R deleted file mode 100644 index 622f618f..00000000 --- a/memcheck/all.R +++ /dev/null @@ -1,4 +0,0 @@ -devtools::load_all() -devtools::run_examples() -devtools::build_vignettes() -devtools::test() \ No newline at end of file diff --git a/memcheck/tests.R b/memcheck/tests.R index 4456a9d4..e85bebd2 100644 --- a/memcheck/tests.R +++ b/memcheck/tests.R @@ -1,5 +1,4 @@ -# Code to be run with -# R -d "valgrind --tool=memcheck --leak-check=full" --vanilla < tests/thisfile.R -# First build and install the package. +# Code to be run with +# R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R library("TreeDist") devtools::test() diff --git a/memcheck/vignettes.R b/memcheck/vignettes.R index 74430cd1..0c02f716 100644 --- a/memcheck/vignettes.R +++ b/memcheck/vignettes.R @@ -1,5 +1,3 @@ -# Code to be run with -# R -d "valgrind --tool=memcheck --leak-check=full" --vanilla < tests/thisfile.R -# First build and install the package. -library("TreeDist") -devtools::build_vignettes() +# Code to be run with +# R -d "valgrind --tool=memcheck --leak-check=full --error-exitcode=1" --vanilla < memcheck/thisfile.R +devtools::build_vignettes(install = FALSE) From 3d9e1d18ef5dd926202d7345fa86735ad2d7b59e Mon Sep 17 00:00:00 2001 From: "Martin R. Smith" <1695515+ms609@users.noreply.github.com> Date: Wed, 1 Apr 2026 16:37:13 +0100 Subject: [PATCH 6/6] nTip guard --- R/RcppExports.R | 4 ++++ R/transfer_consensus.R | 4 ++-- R/tree_distance.R | 4 ++-- R/tree_distance_transfer.R | 4 ++-- R/tree_distance_utilities.R | 27 ++++++++++++++++++--------- R/zzz.R | 6 ++++++ src/RcppExports.cpp | 11 +++++++++++ src/pairwise_distances.cpp | 14 ++++++++++++++ src/tree_distances.cpp | 20 +++++++++++++++----- src/tree_distances.h | 4 ++++ tests/testthat/test-ntip-limit.R | 24 ++++++++++++++++++++++++ 11 files changed, 102 insertions(+), 20 deletions(-) create mode 100644 tests/testthat/test-ntip-limit.R diff --git a/R/RcppExports.R b/R/RcppExports.R index 7cd2fcd0..8dd5b1a2 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -275,6 +275,10 @@ cpp_mci_impl_score <- function(x, y, n_tips) { .Call(`_TreeDist_cpp_mci_impl_score`, x, y, n_tips) } +cpp_sl_max_tips <- function() { + .Call(`_TreeDist_cpp_sl_max_tips`) +} + cpp_robinson_foulds_distance <- function(x, y, nTip) { .Call(`_TreeDist_cpp_robinson_foulds_distance`, x, y, nTip) } diff --git a/R/transfer_consensus.R b/R/transfer_consensus.R index 95edbffb..45750fc8 100644 --- a/R/transfer_consensus.R +++ b/R/transfer_consensus.R @@ -65,7 +65,7 @@ TransferConsensus <- function(trees, if (nTip < 4L) { return(StarTree(tipLabels)) } - if (nTip > 32767L) stop("This many tips are not (yet) supported.") + .CheckMaxTips(nTip) # Convert each tree to a raw split matrix (TreeTools C++ internally). # as.Splits() will error if a tree's tips don't match tipLabels. @@ -114,7 +114,7 @@ tc_profile <- function(trees, scale = TRUE, greedy = "best", tipLabels <- TipLabels(trees[[1]]) nTip <- length(tipLabels) if (nTip < 4L) stop("Need at least 4 tips for profiling.") - if (nTip > 32767L) stop("This many tips are not (yet) supported.") + .CheckMaxTips(nTip) splitsList <- lapply(trees, function(tr) unclass(as.Splits(tr, tipLabels))) diff --git a/R/tree_distance.R b/R/tree_distance.R index 11bf9e91..5ea3a9cb 100644 --- a/R/tree_distance.R +++ b/R/tree_distance.R @@ -149,7 +149,7 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, } nTip <- length(tipLabels) if (nTip < 4) return(NULL) # nocov - if (nTip > 32767L) stop("This many tips are not (yet) supported.") + .CheckMaxTips(nTip) splits_list <- as.Splits(tree1, tipLabels = tipLabels) n_threads <- as.integer(getOption("mc.cores", 1L)) @@ -203,7 +203,7 @@ GeneralizedRF <- function(splits1, splits2, nTip, PairScorer, nTip <- length(tipLabels1) if (nTip < 4) return(NULL) - if (nTip > 32767L) stop("This many tips are not (yet) supported.") + .CheckMaxTips(nTip) splits1 <- as.Splits(tree1, tipLabels = tipLabels1) splits2 <- as.Splits(tree2, tipLabels = tipLabels1) # Use tipLabels1 to ensure order consistency diff --git a/R/tree_distance_transfer.R b/R/tree_distance_transfer.R index cd0942fd..55dce2e2 100644 --- a/R/tree_distance_transfer.R +++ b/R/tree_distance_transfer.R @@ -168,7 +168,7 @@ TransferDistSplits <- function(splits1, splits2, if (is.null(tipLabels)) return(NULL) nTip <- length(tipLabels) if (nTip < 4L) return(NULL) - if (nTip > 32767L) stop("This many tips are not (yet) supported.") + .CheckMaxTips(nTip) # Check all trees share same tip set allLabels <- TipLabels(tree1) @@ -211,7 +211,7 @@ TransferDistSplits <- function(splits1, splits2, if (is.null(tipLabels)) return(NULL) nTip <- length(tipLabels) if (nTip < 4L) return(NULL) - if (nTip > 32767L) stop("This many tips are not (yet) supported.") + .CheckMaxTips(nTip) # Check all trees share same tip set allLabels1 <- TipLabels(trees1) diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 561174bf..f47a4e54 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -1,3 +1,18 @@ +# Validate that nTip does not exceed the compiled SL_MAX_TIPS limit. +# Called from every distance entry point before any C++ work. +.CheckMaxTips <- function(nTip) { + if (!is.na(nTip) && nTip > .SL_MAX_TIPS) { + stop( + "Trees with ", nTip, " tips exceed the compiled limit of ", + .SL_MAX_TIPS, " tips.", + if (.SL_MAX_TIPS < 32768L) + "\nUpdate TreeTools and reinstall TreeDist to support more tips." + else "", + call. = FALSE + ) + } +} + #' Wrapper for tree distance calculations #' #' Calls tree distance functions from trees or lists of trees @@ -132,9 +147,7 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, # Fast paths: use OpenMP batch functions when all trees share the same tip # set and no R-level cluster has been configured. Each branch mirrors the # generic path exactly but avoids per-pair R overhead. - if (!is.na(nTip) && nTip > 32767L) { - stop("This many tips are not (yet) supported.") - } + .CheckMaxTips(nTip) if (!is.na(nTip) && is.null(cluster)) { .n_threads <- as.integer(getOption("mc.cores", 1L)) .batch_result <- if (identical(Func, MutualClusteringInfoSplits)) { @@ -235,9 +248,7 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, #' @importFrom stats setNames .SplitDistanceManyMany <- function(Func, splits1, splits2, tipLabels, nTip = length(tipLabels), ...) { - if (!is.na(nTip) && nTip > 32767L) { - stop("This many tips are not (yet) supported.") - } + .CheckMaxTips(nTip) if (is.na(nTip)) { tipLabels <- union(unlist(tipLabels, use.names = FALSE), unlist(TipLabels(splits2), use.names = FALSE)) @@ -408,9 +419,7 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, if (ncol(x) != ncol(y)) { stop("Input splits must address same number of tips.") } - if (nTip > 32767L) { - stop("This many tips are not (yet) supported.") - } + .CheckMaxTips(nTip) } .CheckLabelsSame <- function(labelList) { diff --git a/R/zzz.R b/R/zzz.R index 826528ea..614ebbda 100644 --- a/R/zzz.R +++ b/R/zzz.R @@ -1,3 +1,9 @@ +.SL_MAX_TIPS <- NULL # populated in .onLoad + +.onLoad <- function(libname, pkgname) { + .SL_MAX_TIPS <<- cpp_sl_max_tips() +} + .onUnload <- function(libpath) { StopParallel(quietly = TRUE) library.dynam.unload("TreeDist", libpath) diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp index be5b8963..c78c3efc 100644 --- a/src/RcppExports.cpp +++ b/src/RcppExports.cpp @@ -636,6 +636,16 @@ BEGIN_RCPP return rcpp_result_gen; END_RCPP } +// cpp_sl_max_tips +int cpp_sl_max_tips(); +RcppExport SEXP _TreeDist_cpp_sl_max_tips() { +BEGIN_RCPP + Rcpp::RObject rcpp_result_gen; + Rcpp::RNGScope rcpp_rngScope_gen; + rcpp_result_gen = Rcpp::wrap(cpp_sl_max_tips()); + return rcpp_result_gen; +END_RCPP +} // cpp_robinson_foulds_distance List cpp_robinson_foulds_distance(const RawMatrix& x, const RawMatrix& y, const IntegerVector& nTip); RcppExport SEXP _TreeDist_cpp_robinson_foulds_distance(SEXP xSEXP, SEXP ySEXP, SEXP nTipSEXP) { @@ -780,6 +790,7 @@ static const R_CallMethodDef CallEntries[] = { {"_TreeDist_cpp_transfer_dist_all_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_all_pairs, 4}, {"_TreeDist_cpp_transfer_dist_cross_pairs", (DL_FUNC) &_TreeDist_cpp_transfer_dist_cross_pairs, 5}, {"_TreeDist_cpp_mci_impl_score", (DL_FUNC) &_TreeDist_cpp_mci_impl_score, 3}, + {"_TreeDist_cpp_sl_max_tips", (DL_FUNC) &_TreeDist_cpp_sl_max_tips, 0}, {"_TreeDist_cpp_robinson_foulds_distance", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_distance, 3}, {"_TreeDist_cpp_robinson_foulds_info", (DL_FUNC) &_TreeDist_cpp_robinson_foulds_info, 3}, {"_TreeDist_cpp_matching_split_distance", (DL_FUNC) &_TreeDist_cpp_matching_split_distance, 3}, diff --git a/src/pairwise_distances.cpp b/src/pairwise_distances.cpp index c66b770f..67db48ab 100644 --- a/src/pairwise_distances.cpp +++ b/src/pairwise_distances.cpp @@ -305,6 +305,7 @@ NumericVector cpp_mutual_clustering_all_pairs( const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); if (N < 2) return NumericVector(0); @@ -397,6 +398,7 @@ NumericVector cpp_rf_info_all_pairs( const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); if (N < 2) return NumericVector(0); const int n_pairs = N * (N - 1) / 2; @@ -516,6 +518,7 @@ NumericVector cpp_msd_all_pairs( const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); if (N < 2) return NumericVector(0); const int n_pairs = N * (N - 1) / 2; @@ -618,6 +621,7 @@ NumericVector cpp_msi_all_pairs( const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); if (N < 2) return NumericVector(0); const int n_pairs = N * (N - 1) / 2; @@ -710,6 +714,7 @@ NumericVector cpp_shared_phylo_all_pairs( const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); if (N < 2) return NumericVector(0); const int n_pairs = N * (N - 1) / 2; @@ -875,6 +880,7 @@ NumericVector cpp_jaccard_all_pairs( const bool allow_conflict = true, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); if (N < 2) return NumericVector(0); const int n_pairs = N * (N - 1) / 2; @@ -944,6 +950,7 @@ NumericMatrix cpp_mutual_clustering_cross_pairs( const List& splits_a, const List& splits_b, const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int nA = splits_a.size(); const int nB = splits_b.size(); if (nA == 0 || nB == 0) return NumericMatrix(nA, nB); @@ -987,6 +994,7 @@ NumericMatrix cpp_rf_info_cross_pairs( const List& splits_a, const List& splits_b, const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int nA = splits_a.size(); const int nB = splits_b.size(); if (nA == 0 || nB == 0) return NumericMatrix(nA, nB); @@ -1027,6 +1035,7 @@ NumericMatrix cpp_msd_cross_pairs( const List& splits_a, const List& splits_b, const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int nA = splits_a.size(); const int nB = splits_b.size(); if (nA == 0 || nB == 0) return NumericMatrix(nA, nB); @@ -1070,6 +1079,7 @@ NumericMatrix cpp_msi_cross_pairs( const List& splits_a, const List& splits_b, const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int nA = splits_a.size(); const int nB = splits_b.size(); if (nA == 0 || nB == 0) return NumericMatrix(nA, nB); @@ -1110,6 +1120,7 @@ NumericMatrix cpp_shared_phylo_cross_pairs( const List& splits_a, const List& splits_b, const int n_tip, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int nA = splits_a.size(); const int nB = splits_b.size(); if (nA == 0 || nB == 0) return NumericMatrix(nA, nB); @@ -1153,6 +1164,7 @@ NumericMatrix cpp_jaccard_cross_pairs( const bool allow_conflict = true, const int n_threads = 1 ) { + TreeDist::check_ntip(n_tip); const int nA = splits_a.size(); const int nB = splits_b.size(); if (nA == 0 || nB == 0) return NumericMatrix(nA, nB); @@ -1206,6 +1218,7 @@ NumericVector cpp_clustering_entropy_batch( const List& splits_list, const int n_tip ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); NumericVector result(N); if (N == 0 || n_tip <= 0) return result; @@ -1239,6 +1252,7 @@ NumericVector cpp_splitwise_info_batch( const List& splits_list, const int n_tip ) { + TreeDist::check_ntip(n_tip); const int N = splits_list.size(); NumericVector result(N); if (N == 0 || n_tip < 4) return result; diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index ced22fcf..fc2d8ff4 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -28,15 +28,23 @@ namespace TreeDist { } } - void check_ntip(const double n) { - // Validated by R caller (nTip > 32767 guard in CalculateTreeDistance et al.) - ASSERT(n <= static_cast(std::numeric_limits::max()) - && "This many tips are not (yet) supported."); + void check_ntip(const int32 n) { + if (n > SL_MAX_TIPS) { + Rcpp::stop("Trees with %d tips exceed the compiled limit of %d. " + "Update TreeTools to support more tips, then reinstall " + "TreeDist.", static_cast(n), + static_cast(SL_MAX_TIPS)); + } } } +// [[Rcpp::export]] +int cpp_sl_max_tips() { + return static_cast(SL_MAX_TIPS); +} + using TreeDist::resize_uninitialized; inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, @@ -619,7 +627,9 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, List cpp_robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); - return robinson_foulds_distance(x, y, static_cast(nTip[0])); + const int32 n_tip = static_cast(nTip[0]); + TreeDist::check_ntip(n_tip); + return robinson_foulds_distance(x, y, n_tip); } // [[Rcpp::export]] diff --git a/src/tree_distances.h b/src/tree_distances.h index 12bcaee2..1b24c2b3 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -18,6 +18,10 @@ constexpr splitbit ALL_ONES = (std::numeric_limits::max)(); namespace TreeDist { + // Validate that n_tips does not exceed the compiled SL_MAX_TIPS limit. + // Defined in tree_distances.cpp; calls Rcpp::stop() on failure. + void check_ntip(int32 n); + // Re-exported from mutual_clustering.h: // ic_matching(int16 a, int16 b, int16 n) diff --git a/tests/testthat/test-ntip-limit.R b/tests/testthat/test-ntip-limit.R new file mode 100644 index 00000000..6bee634e --- /dev/null +++ b/tests/testthat/test-ntip-limit.R @@ -0,0 +1,24 @@ +test_that(".SL_MAX_TIPS is populated", { + expect_true(is.integer(TreeDist:::.SL_MAX_TIPS)) + expect_true(TreeDist:::.SL_MAX_TIPS >= 2048L) +}) + +test_that("Trees exceeding SL_MAX_TIPS are rejected", { + limit <- TreeDist:::.SL_MAX_TIPS + # Build a mock Splits raw matrix with limit + 1 tips. + # The matrix needs: nrow = limit - 2 splits, ncol = ceil((limit + 1) / 8) + bad_nTip <- limit + 1L + n_splits <- bad_nTip - 3L + n_cols <- ceiling(bad_nTip / 8) + mock <- matrix(as.raw(0), nrow = n_splits, ncol = n_cols) + class(mock) <- c("Splits", class(mock)) + attr(mock, "nTip") <- bad_nTip + attr(mock, "tip.label") <- paste0("t", seq_len(bad_nTip)) + + expect_error( + TreeDist:::GeneralizedRF(mock, mock, bad_nTip, + TreeDist:::cpp_robinson_foulds_distance, + maximize = FALSE, reportMatching = FALSE), + "exceed" + ) +})