From 4c75cda79da6f650b0311490c4f7408bfc5a2057 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:08:37 +0000 Subject: [PATCH 1/4] Initial plan From 6fb34a673dca29bbe5abd78ba4904a1c47177835 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:37:21 +0000 Subject: [PATCH 2/4] Harden tip-count guards and remove fixed-size RF complement buffers Agent-Logs-Url: https://github.com/ms609/TreeDist/sessions/bed5ef86-4ca3-4535-979c-531436fd6bb8 Co-authored-by: ms609 <1695515+ms609@users.noreply.github.com> --- R/tree_distance_utilities.R | 20 ++++++------ src/tree_distances.cpp | 31 +++++++++++-------- src/tree_distances.h | 6 ++-- tests/testthat/test-tree_distance_utilities.R | 12 +++++++ 4 files changed, 44 insertions(+), 25 deletions(-) diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index 561174bf..de9598eb 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -11,6 +11,14 @@ #' @importFrom TreeTools as.Splits TipLabels #' @importFrom utils combn #' @export +.MaxSupportedTips <- 32767L + +.AssertNtipSupported <- function(nTip) { + if (!is.na(nTip) && nTip > .MaxSupportedTips) { + stop("This many tips are not (yet) supported.") + } +} + CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, reportMatching = FALSE, ...) { supportedClasses <- c("phylo", "Splits") @@ -132,9 +140,7 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, # Fast paths: use OpenMP batch functions when all trees share the same tip # set and no R-level cluster has been configured. Each branch mirrors the # generic path exactly but avoids per-pair R overhead. - if (!is.na(nTip) && nTip > 32767L) { - stop("This many tips are not (yet) supported.") - } + .AssertNtipSupported(nTip) if (!is.na(nTip) && is.null(cluster)) { .n_threads <- as.integer(getOption("mc.cores", 1L)) .batch_result <- if (identical(Func, MutualClusteringInfoSplits)) { @@ -235,9 +241,7 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, #' @importFrom stats setNames .SplitDistanceManyMany <- function(Func, splits1, splits2, tipLabels, nTip = length(tipLabels), ...) { - if (!is.na(nTip) && nTip > 32767L) { - stop("This many tips are not (yet) supported.") - } + .AssertNtipSupported(nTip) if (is.na(nTip)) { tipLabels <- union(unlist(tipLabels, use.names = FALSE), unlist(TipLabels(splits2), use.names = FALSE)) @@ -408,9 +412,7 @@ CalculateTreeDistance <- function(Func, tree1, tree2 = NULL, if (ncol(x) != ncol(y)) { stop("Input splits must address same number of tips.") } - if (nTip > 32767L) { - stop("This many tips are not (yet) supported.") - } + .AssertNtipSupported(nTip) } .CheckLabelsSame <- function(labelList) { diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index f4b4ce85..74183fec 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include /* for unique_ptr, make_unique */ #include @@ -29,9 +30,12 @@ namespace TreeDist { } void check_ntip(const double n) { - // Validated by R caller (nTip > 32767 guard in CalculateTreeDistance et al.) - ASSERT(n <= static_cast(std::numeric_limits::max()) - && "This many tips are not (yet) supported."); + constexpr int32 max_supported_tips = std::min( + int32(SL_MAX_TIPS), int32(std::numeric_limits::max()) + ); + if (n > max_supported_tips) { + Rcpp::stop("This many tips are not (yet) supported."); + } } @@ -51,12 +55,12 @@ inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, grf_match matching(a.n_splits, NA_INTEGER); - splitbit b_complement[SL_MAX_SPLITS][SL_MAX_BINS]; + std::vector b_complement(size_t(b.n_splits) * size_t(a.n_bins)); for (int32 i = b.n_splits; i--; ) { for (int32 bin = last_bin; bin--; ) { - b_complement[i][bin] = ~b.state[i][bin]; + b_complement[size_t(i) * a.n_bins + bin] = ~b.state[i][bin]; } - b_complement[i][last_bin] = b.state[i][last_bin] ^ unset_mask; + b_complement[size_t(i) * a.n_bins + last_bin] = b.state[i][last_bin] ^ unset_mask; } for (int32 ai = a.n_splits; ai--; ) { @@ -73,7 +77,7 @@ inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, } if (!all_match) { for (int32 bin = 0; bin < a.n_bins; ++bin) { - if (a.state[ai][bin] != b_complement[bi][bin]) { + if (a.state[ai][bin] != b_complement[size_t(bi) * a.n_bins + bin]) { all_complement = false; break; } @@ -105,13 +109,12 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, grf_match matching(a.n_splits, NA_INTEGER); - /* Dynamic allocation 20% faster for 105 tips, but VLA not permitted in C11 */ - splitbit b_complement[SL_MAX_SPLITS][SL_MAX_BINS]; + std::vector b_complement(size_t(b.n_splits) * size_t(a.n_bins)); for (int16 i = 0; i < b.n_splits; i++) { for (int16 bin = 0; bin < last_bin; ++bin) { - b_complement[i][bin] = ~b.state[i][bin]; + b_complement[size_t(i) * a.n_bins + bin] = ~b.state[i][bin]; } - b_complement[i][last_bin] = b.state[i][last_bin] ^ unset_mask; + b_complement[size_t(i) * a.n_bins + last_bin] = b.state[i][last_bin] ^ unset_mask; } for (int16 ai = 0; ai < a.n_splits; ++ai) { @@ -127,7 +130,7 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, } if (!all_match) { for (int16 bin = 0; bin < a.n_bins; ++bin) { - if ((a.state[ai][bin] != b_complement[bi][bin])) { + if ((a.state[ai][bin] != b_complement[size_t(bi) * a.n_bins + bin])) { all_complement = false; break; } @@ -607,7 +610,9 @@ inline List shared_phylo (const RawMatrix &x, const RawMatrix &y, List cpp_robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, const IntegerVector &nTip) { ASSERT(x.cols() == y.cols() && "Input splits must address same number of tips."); - return robinson_foulds_distance(x, y, static_cast(nTip[0])); + const int32 n_tip = static_cast(nTip[0]); + TreeDist::check_ntip(n_tip); + return robinson_foulds_distance(x, y, n_tip); } // [[Rcpp::export]] diff --git a/src/tree_distances.h b/src/tree_distances.h index 8d4f7b4f..8e0b7c20 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -39,7 +39,7 @@ namespace TreeDist { // Returns lg2_unrooted[x] - lg2_trees_matching_split(y, x - y) [[nodiscard]] inline double mmsi_pair_score(const int16 x, const int16 y) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok return lg2_unrooted[x] - (lg2_rooted[y] + lg2_rooted[x - y]); } @@ -60,7 +60,7 @@ namespace TreeDist { [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok if (a == b) { return lg2_rooted[a] + lg2_rooted[n - a]; } @@ -71,7 +71,7 @@ namespace TreeDist { } [[nodiscard]] inline double one_overlap_notb(const int16 a, const int16 n_minus_b, const int16 n) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok const int16 b = n - n_minus_b; if (a == b) { return lg2_rooted[b] + lg2_rooted[n_minus_b]; diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 85164096..806464dc 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -31,6 +31,18 @@ test_that("CalculateTreeDistance() errs appropriately", { expect_error(CalculateTreeDistance(RobinsonFouldsSplits, BalancedTree(8), "Not a tree")) }) +test_that("Tip-count guard is applied consistently", { + expect_no_error(.AssertNtipSupported(32767L)) + expect_error(.AssertNtipSupported(32768L), + "This many tips are not \\(yet\\) supported\\.") + + splits8 <- unclass(as.Splits(BalancedTree(8))) + expect_error(cpp_robinson_foulds_distance(splits8, splits8, 32768L), + "This many tips are not \\(yet\\) supported\\.") + expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), + "This many tips are not \\(yet\\) supported\\.") +}) + test_that("CalculateTreeDistance() handles splits appropriately", { set.seed(101) tree10 <- ape::rtree(10) From 603dcb91d130ddcff96fabd686a055e2c8afb69b Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 15 Apr 2026 18:42:44 +0000 Subject: [PATCH 3/4] Refine SL_MAX_TIPS guard coverage and RF heap allocation safeguards Agent-Logs-Url: https://github.com/ms609/TreeDist/sessions/bed5ef86-4ca3-4535-979c-531436fd6bb8 Co-authored-by: ms609 <1695515+ms609@users.noreply.github.com> --- R/tree_distance_utilities.R | 1 + src/tree_distances.cpp | 6 ++++++ tests/testthat/test-tree_distance_utilities.R | 9 +++++++++ 3 files changed, 16 insertions(+) diff --git a/R/tree_distance_utilities.R b/R/tree_distance_utilities.R index de9598eb..01b0ddca 100644 --- a/R/tree_distance_utilities.R +++ b/R/tree_distance_utilities.R @@ -11,6 +11,7 @@ #' @importFrom TreeTools as.Splits TipLabels #' @importFrom utils combn #' @export +# Keep in sync with C++ guard: min(SL_MAX_TIPS, int16_t::max()). .MaxSupportedTips <- 32767L .AssertNtipSupported <- function(nTip) { diff --git a/src/tree_distances.cpp b/src/tree_distances.cpp index 74183fec..f79b48da 100644 --- a/src/tree_distances.cpp +++ b/src/tree_distances.cpp @@ -30,6 +30,10 @@ namespace TreeDist { } void check_ntip(const double n) { + // SplitList dimensions are bounded by SL_MAX_TIPS, and current scoring + // paths use int16-sized counts internally. + static_assert(SL_MAX_TIPS <= std::numeric_limits::max(), + "SL_MAX_TIPS must fit in int32"); constexpr int32 max_supported_tips = std::min( int32(SL_MAX_TIPS), int32(std::numeric_limits::max()) ); @@ -55,6 +59,7 @@ inline List robinson_foulds_distance(const RawMatrix &x, const RawMatrix &y, grf_match matching(a.n_splits, NA_INTEGER); + // Heap-backed scratch avoids large fixed-size stack allocation. std::vector b_complement(size_t(b.n_splits) * size_t(a.n_bins)); for (int32 i = b.n_splits; i--; ) { for (int32 bin = last_bin; bin--; ) { @@ -109,6 +114,7 @@ inline List robinson_foulds_info(const RawMatrix &x, const RawMatrix &y, grf_match matching(a.n_splits, NA_INTEGER); + // Heap-backed scratch avoids large fixed-size stack allocation. std::vector b_complement(size_t(b.n_splits) * size_t(a.n_bins)); for (int16 i = 0; i < b.n_splits; i++) { for (int16 bin = 0; bin < last_bin; ++bin) { diff --git a/tests/testthat/test-tree_distance_utilities.R b/tests/testthat/test-tree_distance_utilities.R index 806464dc..61f513d8 100644 --- a/tests/testthat/test-tree_distance_utilities.R +++ b/tests/testthat/test-tree_distance_utilities.R @@ -32,6 +32,8 @@ test_that("CalculateTreeDistance() errs appropriately", { }) test_that("Tip-count guard is applied consistently", { + expect_no_error(.AssertNtipSupported(1000L)) + expect_no_error(.AssertNtipSupported(32766L)) expect_no_error(.AssertNtipSupported(32767L)) expect_error(.AssertNtipSupported(32768L), "This many tips are not \\(yet\\) supported\\.") @@ -41,6 +43,13 @@ test_that("Tip-count guard is applied consistently", { "This many tips are not \\(yet\\) supported\\.") expect_error(cpp_robinson_foulds_info(splits8, splits8, 32768L), "This many tips are not \\(yet\\) supported\\.") + + trees <- list(BalancedTree(8), PectinateTree(8)) + class(trees) <- "multiPhylo" + expect_error( + .SplitDistanceAllPairs(RobinsonFouldsSplits, trees, letters[1:8], 32768L), + "This many tips are not \\(yet\\) supported\\." + ) }) test_that("CalculateTreeDistance() handles splits appropriately", { From 03d157376759e7a1a6b0d08af154d1537e528023 Mon Sep 17 00:00:00 2001 From: R script <1695515+ms609@users.noreply.github.com> Date: Thu, 16 Apr 2026 08:51:15 +0100 Subject: [PATCH 4/4] int16 global --- src/tree_distances.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/tree_distances.h b/src/tree_distances.h index 8e0b7c20..8d4f7b4f 100644 --- a/src/tree_distances.h +++ b/src/tree_distances.h @@ -39,7 +39,7 @@ namespace TreeDist { // Returns lg2_unrooted[x] - lg2_trees_matching_split(y, x - y) [[nodiscard]] inline double mmsi_pair_score(const int16 x, const int16 y) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok return lg2_unrooted[x] - (lg2_rooted[y] + lg2_rooted[x - y]); } @@ -60,7 +60,7 @@ namespace TreeDist { [[nodiscard]] inline double one_overlap(const int16 a, const int16 b, const int16 n) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok if (a == b) { return lg2_rooted[a] + lg2_rooted[n - a]; } @@ -71,7 +71,7 @@ namespace TreeDist { } [[nodiscard]] inline double one_overlap_notb(const int16 a, const int16 n_minus_b, const int16 n) noexcept { - assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok + assert(SL_MAX_TIPS + 2 <= std::numeric_limits::max()); // verify int16 ok const int16 b = n - n_minus_b; if (a == b) { return lg2_rooted[b] + lg2_rooted[n_minus_b];