diff --git a/DESCRIPTION b/DESCRIPTION index f3c54e4d..55995a7f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeTools Title: Create, Modify and Analyse Phylogenetic Trees -Version: 2.1.0.9006 +Version: 2.1.0.9007 Authors@R: c( person("Martin R.", 'Smith', role = c("aut", "cre", "cph"), email = "martin.smith@durham.ac.uk", diff --git a/NEWS.md b/NEWS.md index 4f6ce9cc..cd6b20bb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,10 @@ -# TreeTools 2.1.0.9006 (2026-03-13) # +# TreeTools 2.1.0.9007 (2026-03-13) # + +- `duplicated.Splits()` uses hash-based O(n) de-duplication, replacing + O(n²) pairwise comparison. +# TreeTools 2.1.0.9006 (2026-03-13) # + - `NodeDepth()` for unrooted trees rewritten as O(n) two-pass C++ algorithm, replacing iterative R while-loop. diff --git a/src/splits.cpp b/src/splits.cpp index 2024caa3..0e698bf9 100644 --- a/src/splits.cpp +++ b/src/splits.cpp @@ -1,6 +1,8 @@ #include #include // for make_unique #include /* for errors */ +#include /* for string (hash key) */ +#include /* for unordered_set */ #include "../inst/include/TreeTools/assert.h" /* for ASSERT */ #include "../inst/include/TreeTools.h" @@ -197,44 +199,30 @@ LogicalVector duplicated_splits(const RawMatrix splits, } } + // Hash-based O(n) deduplication LogicalVector ret(n_split); + std::unordered_set seen; + seen.reserve(n_split * 2); + std::string key(check_bins, '\0'); + if (fromLast[0]) { - for (intx it = n_split - 1; it--; ) { - const intx i = it + 1; // nothing to duplicate split(0, _) - if (ret[i]) { - continue; + // Scan from end; first seen (from end) is kept, earlier dupes are marked + for (intx i = n_split; i--; ) { + for (intx b = 0; b < check_bins; ++b) { + key[b] = static_cast(compare(i, b)); } - for (intx j = i; j--; ) { - // Rcout << " check split " << i << " (" << uintx(compare(i, 0)) << - // ") vs " << j << " (" << uintx(compare(j, 0)) << "): "; - for(intx bin = 0; compare(i, bin) == compare(j, bin); ) { - // Rcout << " [bin " << bin << "] "; - ++bin; - if (bin == check_bins) { - // Rcout << "Duplicate!"; - ret[j] = true; - break; - } - } - // Rcout << "\n"; - + if (!seen.insert(key).second) { + ret[i] = true; } } } else { - for (intx i = 0; i != n_split - 1; ++i) { - if (ret[i]) { - continue; + // Scan from start; first seen is kept, later dupes are marked + for (intx i = 0; i < n_split; ++i) { + for (intx b = 0; b < check_bins; ++b) { + key[b] = static_cast(compare(i, b)); } - for (intx j = i + 1; j != n_split; ++j) { - - for(intx bin = 0; compare(i, bin) == compare(j, bin); ) { - ++bin; - if (bin == check_bins) { - ret[j] = true; - break; - } - } - + if (!seen.insert(key).second) { + ret[i] = true; } } }