diff --git a/.Rbuildignore b/.Rbuildignore index a0ff5612e..682fce456 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -11,15 +11,17 @@ ^inst/Parsimony/tests ^man-roxygen$ ^memcheck$ +^pdf$ ^pkgdown$ ^revdep$ ^split-support$ /^src\-/ CONTRIBUTING\.md +papers\.md README\.md cran\-comments\.md vignettes/\.RData -/^\.git/ +^\.git ^.*\.o$ ^.*\.dll$ ^.*\.yml$ @@ -32,3 +34,24 @@ vignettes/\.RData ^_pkgdown\.yml$ ^codemeta\.json$ ^LICENSE$ +^\.positai$ +^\.claude$ +^\.agent- +^AGENTS\.md$ +^agent-.*\.md$ +^check_init\.R$ +^coordination\.md$ +^to-do\.md$ +^completed-tasks\.md$ +^issues\.md$ +Makevars\.win\..*-bak$ +^.*\.Rcheck$ + +# Test artifacts +^test.*\.txt$ +^vtune +^dev$ + +# Agent note files +^remote-jobs\.md$ +^papers\.md$ diff --git a/.github/workflows/ASan.yml b/.github/workflows/ASan.yml index 8f4c21296..8b3ad1dbe 100644 --- a/.github/workflows/ASan.yml +++ b/.github/workflows/ASan.yml @@ -1,4 +1,5 @@ # Address Sanitizer: Replicate CRAN's gcc-ASAN 'Additional Test' +# Uses the r-hub gcc-asan container (R-devel built with ASAN/UBSAN). on: workflow_dispatch: push: @@ -26,7 +27,18 @@ name: gcc-ASAN jobs: mem-check: - runs-on: ubuntu-24.04 # Update RSPM when increasing + runs-on: ubuntu-latest + container: + image: ghcr.io/r-hub/containers/gcc-asan:latest + + # rlang ≤ 1.1.7 uses PREXPR() in BOTH src/capture.c and the vendored + # src/rlang/rlang-types.h. PREXPR was removed from the R public API in + # R-devel (>= r87506, 2025-01), so rlang cannot be compiled from source + # in this container. A header-shim approach fixes rlang-types.h but not + # the direct call sites in capture.c. + # Setting continue-on-error so that this upstream blocker does not prevent + # PR merges. Remove this once rlang ≥ 1.1.8 reaches CRAN. 
+ continue-on-error: true name: AddressSanitizer ${{ matrix.config.test }} @@ -39,46 +51,9 @@ jobs: - {test: 'vignettes'} env: - R_REMOTES_NO_ERRORS_FROM_WARNINGS: true - _R_CHECK_FORCE_SUGGESTS_: false - RSPM: https://packagemanager.rstudio.com/cran/__linux__/noble/latest - USING_ASAN: true - STRINGI_DISABLE_PKG_CONFIG: true - BIOCONDUCTOR_USE_CONTAINER_REPOSITORY: FALSE # For stringi GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} - ASAN_OPTIONS: verify_asan_link_order=0 steps: - - uses: actions/checkout@v5 - - - name: Initialize ASan configuration - run: | - export LD_PRELOAD=$(gcc -print-file-name=libasan.so) - - echo "PKG_CFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" > src/Makevars - echo "PKG_CXXFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> src/Makevars - - mkdir ~/.R - echo "LDFLAGS = -g -O0 -fsanitize=address -fno-omit-frame-pointer" >> ~/.R/Makevars - - - uses: r-lib/actions/setup-r@v2 + - uses: ms609/actions/asan@main with: - r-version: release # CRAN uses devel, but takes ages to load deps. - - - name: Set up R dependencies - uses: r-lib/actions/setup-r-dependencies@v2 - with: - dependencies: "'soft'" - needs: | - memcheck - - - name: Install package - run: | - cd .. 
- R CMD build --no-build-vignettes --no-manual --no-resave-data TreeSearch - R CMD INSTALL TreeSearch*.tar.gz - cd TreeSearch - - - name: ASAN - memcheck ${{ matrix.config.test }} - run: | - Rscript memcheck/${{ matrix.config.test }}.R + test: ${{ matrix.config.test }} diff --git a/.github/workflows/R-CMD-check.yml b/.github/workflows/R-CMD-check.yml index de0a2f900..5d145684e 100644 --- a/.github/workflows/R-CMD-check.yml +++ b/.github/workflows/R-CMD-check.yml @@ -20,9 +20,7 @@ on: - "**.R[dD]ata" - "**.Rpro*" pull_request: - branches: - - main - - master + branches: ["**"] paths-ignore: - "Meta**" - "memcheck**" diff --git a/.github/workflows/extended-tests.yml b/.github/workflows/extended-tests.yml new file mode 100644 index 000000000..1dd0d7f92 --- /dev/null +++ b/.github/workflows/extended-tests.yml @@ -0,0 +1,65 @@ +# Extended test suite — Tier 3 stress / bench / timing tests. +# Runs weekly (Sundays, 03:00 UTC) and on-demand. +# Sets TREESEARCH_EXTENDED_TESTS=true so skip_extended() guards are lifted. +# See tests/testing-strategy.md for full tiering documentation. + +on: + workflow_dispatch: + schedule: + - cron: '0 3 * * 0' # Sundays, 03:00 UTC + +name: extended-tests + +jobs: + extended-tests: + runs-on: ubuntu-24.04 + + env: + NOT_CRAN: 'true' + TREESEARCH_EXTENDED_TESTS: 'true' + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + RSPM: "https://packagemanager.posit.co/cran/__linux__/noble/latest" + + steps: + - name: Checkout git repo + uses: actions/checkout@v5 + + - name: Set up R + uses: r-lib/actions/setup-r@v2 + with: + r-version: 'release' + + - name: Install apt packages + run: sudo apt-get update && sudo apt-get install -y texlive-latex-base texlive-fonts-recommended + + - name: Set up R dependencies + uses: r-lib/actions/setup-r-dependencies@v2 + with: + needs: check + + - name: Build and install package + run: R CMD INSTALL . 
+ shell: bash + + - name: Run extended test suite + run: | + Rscript -e " + library(testthat) + library(TreeSearch) + test_dir('tests/testthat', filter = 'ts-', + reporter = 'progress', stop_on_failure = TRUE) + " + shell: bash + + - name: Notify on failure + if: failure() && github.event_name == 'schedule' + uses: actions/github-script@v7 + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + script: | + github.rest.issues.createComment({ + owner: context.repo.owner, + repo: context.repo.repo, + issue_number: 186, + body: 'Extended tests workflow has failed: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}' + }); diff --git a/.gitignore b/.gitignore index a712c7888..9fe0f1395 100644 --- a/.gitignore +++ b/.gitignore @@ -10,6 +10,7 @@ src/*.dll .Rdata .Rd2pdf** .Rprofile +.Renviron *.gcno *.knit.md *.log @@ -34,3 +35,25 @@ gen-*tip/* results-* inst/DELETE* *.bak +/.agent* +/agent-*.md +/to-do.md +/issues.md +/coordination.md +/.*-data +/TreeSearch.Rcheck +TreeSearch_*.tar.gz + +# Local build config — never commit +src/Makevars.win + +# Test artifacts +test*.txt +test_output.txt +/pdf +/.builds +.positai +/.gha-results +/.tnt-bench +/.vtune* +/vtune* diff --git a/.positai/briefing-multistate-profile.md b/.positai/briefing-multistate-profile.md new file mode 100644 index 000000000..71141492b --- /dev/null +++ b/.positai/briefing-multistate-profile.md @@ -0,0 +1,354 @@ +# Briefing: Extending Profile Parsimony to >2 States + +## Status: T-101 DONE, T-102–T-107 OPEN + +## Goal +Extend profile parsimony scoring from 2-state characters to multi-state (3+). + +## What exists already + +### Current 2-state implementation (on main branch) +- `R/pp_info_extra_step.r`: `StepInformation()` — computes information content + per character for all possible step counts. Uses `LogCarter1()` for 2 states. + Lines 51-56 explicitly warn and drop states beyond 2 informative tokens. 
+- `R/data_manipulation.R`: `PrepareDataProfile()` — decomposes multi-state + characters into pairs (keeping top-2 informative states), compresses to + binary, builds `info.amounts` matrix. Lines 89-115: `.RemoveExtraTokens()` + keeps only 2 most informative states; line 138 asserts exactly 2 non-ambig. + Lines 196-201 hardcode `levels = c("0", "1")` and a 3×2 contrast matrix. +- `R/MaximizeParsimony.R`: Lines 424-428 call `PrepareDataProfile()`, then + lines 528-533 extract `info.amounts` attribute and pass to C++. +- C++ engine: `ts_data.cpp` copies `info_amounts` table; `ts_fitch.cpp` + looks up `info_amounts[(step-1) + info_max_steps * pattern]` for each + pattern. The C++ scoring pipeline is generic — it handles multi-state Fitch + natively. Only the R-level data prep is restricted to 2 states. + +### Prior multi-state work (on `concordance-FitchInfo` branch, NOT on main) +- `src/MaddisonSlatkin.cpp`: Full C++ implementation of the Maddison & Slatkin + (1991) recursive algorithm for counting trees with exactly s steps for a + multi-state unordered character. Supports up to 5 states. +- `R/FitchInfo.R`: Uses `MaddisonSlatkin()` for concordance scoring with + multi-state characters. Not profile parsimony weighting, but the + mathematical core is exactly what's needed. +- Key commits: `ab5f80be` "Support 2-5 states", `23963c07` "Embed MadSlat to FI", + `9336c066` "FitchInfo" (latest on concordance-FitchInfo). +- The FitchInfo code already converts MaddisonSlatkin output to cumulative + information content (bits), which is the same transformation profile + parsimony needs. + +### Key mathematical insight +The existing `MaddisonSlatkin()` computes exactly what `StepInformation()` +needs: `log P(s steps | n_0, n_1, ..., n_k leaves)` for each possible step +count s, averaged over all unrooted binary trees. This is the multi-state +generalization of Carter et al. (1990)'s theorem 1. 
+ +Profile parsimony's information content at s steps = `log2(N_total_trees) - log2(N_trees_with_≤s_steps)`, +where `N_trees_with_≤s_steps = cumsum(N_trees_with_exactly_s_steps)` and `N_trees_with_exactly_s_steps = exp(MaddisonSlatkin(s, states)) * N_total_trees`. + +So `MaddisonSlatkin()` output feeds directly into `StepInformation()`. + +## Architecture: what needs to change + +### Layer 1: Mathematics (already done on branch) +`MaddisonSlatkin()` computes `log(fraction of trees with exactly s steps)`. +This is the multi-state analog of `LogCarter1()`. + +### Layer 2: `StepInformation()` (R/pp_info_extra_step.r) +Currently calls `LogCarter1()` for 2 states, rejects >2. +Needs: dispatch to `MaddisonSlatkin()` when >2 informative states. +The transformation from log-probabilities to information content is identical. + +### Layer 3: `PrepareDataProfile()` (R/data_manipulation.R) +Currently decomposes to pairs and hardcodes binary contrast matrix. +Needs: pass multi-state characters through directly (no decomposition). +Must build a proper contrast matrix for k states + ambiguous token. +The `info.amounts` matrix dimensions change (more rows = more possible steps). + +### Layer 4: C++ engine +**No changes needed.** The `info_amounts` lookup table is already generic — +indexed by `(step, pattern)`. The Fitch scoring engine already handles +multi-state characters. We just need to feed it the right contrast matrix +and info_amounts. + +## Literature + +| Reference | Role | +|-----------|------| +| Carter et al. (1990) | Exact formula for 2-state trees — current basis | +| Steel (1993) | Distribution theory for bicolored trees | +| Steel & Charleston (1995) | Properties of parsimoniously colored trees | +| Steel, Goldstein & Waterman (1996) | CLT for parsimony length | +| Maddison & Slatkin (1991) | Recursive algorithm for multi-state — the key | +| Faith & Trueman (2001) | Original profile parsimony justification | + +No known closed-form generalization of Carter for >2 states exists. 
+Maddison & Slatkin's recursive algorithm is the standard approach. + +## Performance concerns +- MaddisonSlatkin is exponential in number of states (2^k bitmask states) +- Current C++ implementation handles up to 5 states +- For 2 states: `LogCarter1()` is O(1) per step count +- For 3-5 states: memoized recursion, feasible for typical morphological data +- For >5 states: may need approximation or capping +- `info.amounts` computation is a one-time precomputation cost (not in search loop) + +## Risks +1. MaddisonSlatkin.cpp is on a different branch — needs careful merge/cherry-pick +2. May need to handle the interaction with character simplification (currently + characters with many states get collapsed) +3. Performance for characters with many taxa AND many states +4. Need to handle edge cases: all-ambiguous, singleton states, etc. +5. Test coverage: existing profile parsimony tests assume binary data + +--- + +## T-106: Approximation for >5 State Profile Parsimony — Research Analysis + +### 1. Scaling of exact MaddisonSlatkin + +Benchmarked on Windows, R 4.5.2, single-threaded. All timings are for +computing the full step-count range (s_min to n-1). + +| k (tokens) | n (tips) | tips/state | Time | +|:-----------:|:--------:|:----------:|-----:| +| 2 | 4 | 2 | <1 ms | +| 2 | 10 | 5 | 10 ms | +| 3 | 9 | 3 | 100 ms | +| 3 | 15 | 5 | 3.7 s | +| 4 | 8 | 2 | 320 ms | +| 4 | 12 | 3 | 12.6 s | +| 4 | 20 | 5 | timeout (>30 s) | +| 5 | 10 | 2 | timeout (>30 s) | + +**Root cause:** The recursion partitions n tips into two subtrees in +all valid ways, for each of 2^k−1 root states, for each step count s +in 0..n−k. The memoization table grows as +O(#unique_leaf_configs × max_steps × #states), and the number of +unique leaf configurations grows combinatorially with n and k. + +**Conclusion:** Exact computation is infeasible for k≥5 with n≥15, or +k≥6 at any practical n. The current code's `k ≤ 5` limit is well-placed. + +### 2. 
Approximation approaches evaluated + +#### (a) Plain Monte Carlo + +Sample N random unrooted binary trees, score each with Fitch, tally the +step-count distribution. + +**Test:** k=6, n=30, split=(8,7,5,4,3,3), N=10,000 random trees. +Rate: ~1,700 trees/second. Distribution: observed range 13–22, peak at 19. + +**Problem:** The exact P(s_min=5) = exp(−38.6) ≈ 1.7×10⁻¹⁷, while the +smallest observable MC probability at N=10⁴ is 10⁻⁴. The "gap" between +the minimum step count and the MC-observable range spans 8 step counts and +13 orders of magnitude. Even at N=10⁶, the gap persists. + +**Verdict:** Cannot estimate the information-rich left tail. Only useful +for the body/right tail of the distribution. + +#### (b) Normal (CLT) approximation + +Steel, Goldstein & Waterman (1996) proved asymptotic normality of +parsimony length for binary characters. Multi-state CLT should hold by +similar arguments (sum of nearly independent subtree contributions). + +**Test:** Fitted normal(μ=18.9, σ=1.5) from MC data, extrapolated to s_min. + +| Metric | Normal | Exact | +|--------|-------:|------:| +| log P(s_min=5) | −44.3 | −38.6 | +| IC(s_min) bits | 62.1 | ~55.7 | + +**The normal overestimates IC at the minimum by ~6 bits** (the true +distribution has heavier left tails than Gaussian). However, this error +is at step counts that never occur on real trees during a search. + +In the MC-observable range (13–22 steps), the normal approximation agrees +well with empirical data. This is the range that actually affects search +decisions. + +**Verdict:** Accurate in the practical range. Left-tail error is +large but irrelevant for search quality. 
+ +#### (c) Hybrid: exact anchor + MC body + +The key insight enabling this approach: + +> **P(s_min) has an exact O(k) formula for any k:** +> `P(s_min) = NUnrootedMult(split) / NUnrooted(n)` +> +> This uses the product-of-double-factorials counting formula for labeled +> trees consistent with k non-overlapping groups, and requires no recursion. + +The hybrid approach: +1. Exact P(s_min) via `NUnrootedMult` (instant for any k) +2. MC sample of N=50,000 random trees → empirical distribution for the body +3. Normal fit to MC data → parametric extrapolation for the sub-MC left tail +4. Blend: use exact at s_min, normal extrapolation for s_min+1 to MC left + edge, empirical distribution for MC-observable range + +**Verdict:** This is the recommended approach (see §3 below). + +#### (d) "Keep top 5" (current fallback) + +The existing `StepInformation()` already handles k>5 by keeping the 5 most +frequent tokens and dropping the rest. This discards real information +(the dropped tokens contribute genuine parsimony signal) but is safe. + +For characters where the dropped tokens each have only 2–3 leaves, the +information loss is modest. For characters with 6+ well-represented tokens, +the loss is significant but hard to quantify without exact values. + +#### Other approaches considered + +- **Importance sampling** (bias toward low-step trees): Could solve the + left-tail problem but requires a carefully designed proposal distribution. + Engineering effort disproportionate to the niche use case. +- **WithOneExtraStep() extension** to k>2: Currently unimplemented. The + combinatorics are substantially harder for k>2 (multiple ways to place + the extra step among k groups). Could provide exact P(s_min+1) but would + not solve the general left-tail problem. +- **Extending MaddisonSlatkin to k=6:** Structural changes to support + 2^6−1=63 states are modest (add `StateKeyT<6>` template), but the + computational blowup still makes it infeasible for n>10–12. + +### 3. 
Recommendation + +**Primary approach: MC-calibrated normal approximation with exact anchor** + +This approach requires minimal new code, has well-understood error +properties, and covers the only practical use case (characters with 6+ +states in morphological datasets). + +**Why this is sufficient:** Profile parsimony's search engine only compares +info_amounts values at step counts that actually occur on candidate trees. +For a k=6 character on 30 tips, candidate trees typically score 13–22 steps +(based on MC data). The information content in this range is well-estimated +by the normal approximation calibrated to MC samples. The extreme left +tail (5–12 steps) has enormous IC values that serve as theoretical upper +bounds but never affect search decisions, because no reasonable tree +achieves those step counts. + +**Performance:** The MC sampling adds ~30 seconds per character at +N=50,000 trees (for n=30 tips). This is a one-time precomputation cost. +For datasets with few >5-state characters, this is acceptable. For +datasets with many such characters, the MC could be parallelized or the +sample size reduced. + +**Accuracy:** In the practical range (within ~3σ of the MC mean), the +normal approximation's IC values match empirical estimates to within +~0.1 bits. This is smaller than the character's own noise and does not +materially affect search quality. + +**Fallback:** If MC is too slow or the user needs a quick result, retain +the existing "keep top 5" heuristic as an option. + +### 4. Prototype R code + +```r +#' Approximate StepInformation for >5 state characters +#' +#' Uses exact P(min_steps) + MC-calibrated normal approximation. +#' +#' @param split Integer vector of token frequencies (sorted decreasing, +#' singletons removed). +#' @param n_mc Number of Monte Carlo trees to sample (default 50000). +#' @return Named numeric vector of information content (bits) per step count. 
+#' @keywords internal +.ApproxStepInformation <- function(split, n_mc = 50000L) { + k <- length(split) + n <- sum(split) + s_min <- k - 1L + s_max <- n - 1L + + # 1. Exact log P(minimum steps) — works for any k; kept on the log scale so large n cannot overflow + log_p_min <- TreeTools::LnUnrootedMult(split) - TreeTools::LnUnrooted(n) + + # 2. Monte Carlo: sample random trees, tally step counts + labels <- paste0("t", seq_len(n)) + char_vec <- rep(seq_along(split) - 1L, split) + names(char_vec) <- labels + dat <- TreeTools::MatrixToPhyDat( + matrix(char_vec, ncol = 1, dimnames = list(labels, "c1")) + ) + mc_scores <- vapply( + seq_len(n_mc), + function(i) RandomTreeScore(dat), + double(1) + ) + + # 3. Fit normal to MC data + mu_hat <- mean(mc_scores) + sd_hat <- sd(mc_scores) + + # 4. Build log-probability vector for all step counts + steps <- s_min:s_max + n_steps <- length(steps) + log_p <- numeric(n_steps) + + for (i in seq_along(steps)) { + s <- steps[i] + if (s == s_min) { + # Exact value + log_p[i] <- log_p_min + } else { + # MC estimate: empirical frequency if observed, else normal density (no continuity correction) + mc_count <- sum(mc_scores == s) + if (mc_count > 0) { + # Direct empirical estimate + log_p[i] <- log(mc_count / n_mc) + } else { + # Normal extrapolation for unobserved step counts + log_p[i] <- dnorm(s, mu_hat, sd_hat, log = TRUE) + } + } + } + + # 5. Cumulative IC — NOTE(review): log_p mixes exact, empirical and normal terms and is not renormalised; confirm acceptable + ret <- -.LogCumSumExp(log_p) / log(2) + ret[ret < sqrt(.Machine[["double.eps"]])] <- 0 + names(ret) <- steps + + ret +} +``` + +### 5. 
Implementation plan + +| Step | Description | Effort | +|------|-------------|--------| +| 1 | Add `.ApproxStepInformation()` to `R/pp_info_extra_step.r` | 1 hr | +| 2 | Modify `StepInformation()`: dispatch to `.Approx...` when k>5 (instead of current top-5 truncation) | 30 min | +| 3 | Add `approx` parameter to `StepInformation()` with options `"exact"` (current), `"mc"` (new), `"auto"` (default: exact for k≤5, MC for k>5) | 30 min | +| 4 | Tests: verify MC approximation agrees with exact for k=3 within ~10% relative IC at practical step counts | 1 hr | +| 5 | Documentation: update `StepInformation()` docs to describe approximation | 30 min | + +Total: ~3.5 hours. No C++ changes needed. + +### 6. Comparison with existing "keep top 5" approach + +| Criterion | Keep top 5 | MC approximation | +|-----------|-----------|------------------| +| Speed | Instant (delegates to exact) | ~30s per character | +| Accuracy at practical range | Unknown (drops signal) | ~0.1 bit error | +| Left tail | Exact for reduced char | Exact P(min) + normal extrapolation | +| Handles any k | Yes (truncates) | Yes | +| New code | 0 lines | ~60 lines R | +| C++ changes | None | None | + +For datasets where >5-state characters are rare (typical morphology), the +MC overhead is negligible relative to the search time. For datasets with +many such characters, the top-5 fallback remains available. + +### 7. Future improvements (deferred) + +- **Exact P(s_min + 1):** Extending `WithOneExtraStep()` to k>2 would + give a second exact anchor point, improving the left-tail interpolation. + The combinatorics are non-trivial but tractable. +- **Importance sampling:** For characters where the search regularly reaches + near-minimum step counts (small n, few states per token), importance + sampling could improve accuracy. Not worth implementing unless a specific + dataset demonstrates the need. 
+- **Cached MC tables:** For common state-frequency patterns, pre-computed + MC tables could eliminate the per-character sampling cost. diff --git a/.positai/briefing-progressive-results.md b/.positai/briefing-progressive-results.md new file mode 100644 index 000000000..04fe67dac --- /dev/null +++ b/.positai/briefing-progressive-results.md @@ -0,0 +1,200 @@ +# Briefing: Progressive Search Result Display + +**Task:** T-129 +**Author:** Agent A +**Date:** 2026-03-19 + +--- + +## Summary + +**Recommendation: implement progress file polling using the existing C++ callback infrastructure.** + +This approach reuses the cancel-file pattern already in place, requires minimal new +code on both the C++ and Shiny sides, and gives users real-time per-replicate feedback +without architectural changes or streaming intermediates. + +Do **not** stream partial tree results to the UI mid-search. The benefits are marginal +and the implementation cost is high (see below). + +--- + +## Context + +Searches are invoked via `ExtendedTask` wrapping `future::future({MaximizeParsimony(...)})`. +The future runs in a separate R process — no reactive communication until it resolves. +The only currently-visible progress signal is a static "Searching…" notification and +a frozen output panel. + +For short searches (<10 s), this is not a problem. For long searches (large datasets, +many replicates, long timeouts), users have no feedback that the search is alive or +making progress. + +--- + +## What Already Exists + +### C++ side: `progress_callback` (ts_driven.h / ts_rcpp.cpp) + +`DrivenParams::progress_callback` is an `std::function` +already called after every phase and every replicate. 
The `ProgressInfo` struct carries: + +``` +replicate — current replicate (1-based) +max_replicates — configured max +best_score — pool best so far +hits_to_best — independent discoveries of best +target_hits — convergence target +pool_size — trees currently in pool +phase — "replicate", "done", "tbr", "ratchet", etc. +elapsed_seconds — wall time since search start +phase_score — score after this phase +``` + +The Rcpp bridge (`ts_rcpp.cpp` lines 1360–1375) already accepts an optional R function +and wraps it into this callback. **The infrastructure is complete — it just isn't used +by MaximizeParsimony() yet.** + +### Shiny side: file-polling pattern (mod_search.R profile prep) + +`profilePrepTask` already uses: +1. `tempfile()` progress path passed to the background task +2. `invalidateLater(500)` observer polling the file every 500ms +3. Notification update on each poll + +This is exactly the right pattern for search progress too. + +--- + +## Recommended Approach: Progress File Polling + +### How it works + +1. Before invoking `searchTask`, create a `progressPath` temp file. +2. In the background future, alongside the existing `TREESEARCH_CANCEL_FILE` environment variable, + also set a `TREESEARCH_PROGRESS_FILE` environment variable before calling `MaximizeParsimony()`. +3. In `MaximizeParsimony()` (R level), if `TREESEARCH_PROGRESS_FILE` is set, pass an R + function as `progressCallback` to `ts_driven_search()`. This callback writes + a single line to the file after each replicate: `{rep} {max_rep} {best_score} {hits}`. +4. The main Shiny process polls `progressPath` every 500ms. On each poll, update the + notification text. 
+ +### What the user sees + +Currently: +> `Searching (50 runs, k=6, 2 threads)…` + +With progress polling: +> `Searching… Rep 15/50 | Best: 42 | 3 hits` + +Or with elapsed time: +> `Searching… Rep 15/50 | Best: 42 | 3 hits | 8.2s elapsed` + +When targetHits is reached before maxReplicates, this naturally shows convergence: +> `Searching… Rep 23/50 | Best: 42 | 5/5 hits ✔ (wrapping up…)` + +### Implementation path + +**R package changes (MaximizeParsimony.R):** +- Check `Sys.getenv("TREESEARCH_PROGRESS_FILE")` before calling ts_driven_search +- If set, construct a `progressCallback` function that `writeLines()` to the file + on `phase == "replicate"` events only (skip phase-level noise) +- ~20 lines R + +**Shiny changes (mod_search.R):** +- Add `progressPath <- tempfile()` and pass it to the future alongside `cancelPath` +- Set `Sys.setenv(TREESEARCH_PROGRESS_FILE = progressPath)` in the future alongside + the cancel env var +- Add `invalidateLater(500)` observer (mirroring the profile prep observer) that + reads and parses the progress file +- On read, update the `r$searchNotification` message text +- ~30 lines R, no new UI elements + +**C++ changes:** None required. The existing `progress_callback` / `progressCallback` +infrastructure handles everything. + +**Total estimated effort:** ~2–3 hours. + +--- + +## What NOT to Build: Partial Tree Streaming + +A common wish is to "show best trees so far" during a search. This sounds appealing +but has significant problems: + +### The pool is not intermediate-result-safe + +The internal `TreePool` accumulates trees across replicates. At any mid-search point, +the pool contains a **subset of replicates' local optima** — not the final MPT set. +Trees in the pool at rep 15/50 may be suboptimal relative to the final result; the +tree topology at the "current best score" may not even survive MPT enumeration. 
+ +Displaying these trees as search results would be misleading: users might interpret +them as MPTs, save them, or make decisions based on incomplete evidence. + +### R-level chunking doesn't help + +Splitting the search into multiple short `MaximizeParsimony()` calls (each returning +a partial result) is tempting but: +- The pool and search state don't persist across calls (each call starts fresh) +- The quality/time tradeoff from very short searches is poor (no ratchet convergence) +- This is exactly what the "Continue search" button already provides at the user level + +If users want intermediate trees, "Continue search" with small `maxReplicates` already +achieves this. + +### The right display is convergence status + +What users actually need to know mid-search is not *which trees* are in the pool, but: +- Is the search still running? (alive check) +- Is the best score improving? (convergence progress) +- How many hits to the best score so far? (convergence confidence) + +All three are available from `ProgressInfo` at the replicate level with no new C++ +work. + +--- + +## Secondary Improvement: Elapsed Timer (Trivially Easy) + +Even without the C++ callback, a simple elapsed-time counter can be added with zero +package changes: + +```r +# In mod_search.R, near the searchInProgress observer: +observe({ + req(r$searchInProgress) + invalidateLater(1000) # fire every second + elapsed <- as.integer(difftime(Sys.time(), r$searchStartTime, units = "secs")) + # Update notification text: "Searching… (42s elapsed)" +}) +``` + +This costs ~10 lines and prevents "is the app frozen?" uncertainty. However, it +provides no information about progress — a 5-minute search with a frozen score +gives the user no convergence signal. The file-polling approach is clearly superior. 
+ +--- + +## Decision Matrix + +| Approach | Effort | Value | Verdict | +|----------|--------|-------|---------| +| Elapsed timer only | ~10 lines | Low — no convergence info | Not worth it alone | +| **Progress file polling** | **~50 lines** | **High — reps + score + hits** | **✅ Recommended** | +| Partial tree streaming | ~200+ lines + arch changes | Low — misleads user | ✗ Do not build | +| R-level chunking | ~150+ lines + pool state | Medium — duplicates "Continue" | ✗ Redundant | + +--- + +## Concrete Task Proposal + +File as **T-141** (P3): + +> **Shiny: Per-replicate search progress display** +> Use existing `progress_callback` / `TREESEARCH_PROGRESS_FILE` env var pattern +> (mirrors cancel file + profile prep). MaximizeParsimony() writes rep/score/hits +> to file on each replicate. mod_search.R polls every 500ms during search. +> Result: notification updates from static "Searching…" to live "Rep 15/50 | Best: 42 | 3 hits". +> No C++ changes needed. +> Estimate: ~2–3 hours. diff --git a/.positai/expertise/coordination.md b/.positai/expertise/coordination.md new file mode 100644 index 000000000..466f072d4 --- /dev/null +++ b/.positai/expertise/coordination.md @@ -0,0 +1,74 @@ +# Coordination Expertise — TreeSearch + +## Purpose + +Review the overall state of multi-agent work. Update `coordination.md`, +propose new tasks, resolve blockers. This is the "project manager" role. + +## Workflow + +1. **Read all agent files** (`agent-a.md` through `agent-f.md`): + - Who is working on what? + - Is anyone stuck or blocked? + - Has anyone finished a task without updating to-do.md? + +2. **Read `to-do.md`**: + - Are completed tasks moved to the Completed section? + - Are task statuses accurate? + - Are priorities still correct given current project state? + - Are there enough OPEN tasks to keep all agents busy? + - Adjust standing task priorities per the dynamic priority rule. + +3. **Read `coordination.md`**: + - Update the Agent Status table from agent files. 
+ - Update Known Issues if any have been resolved. + - Add new Architecture Decisions if agents have made significant choices. + +4. **Read `AGENTS.md`** (bottom sections): + - Check for newly documented completed work. + - Verify that documentation matches what agents report. + +5. **Propose new tasks** if needed: + - If <6 OPEN specific tasks, look at `coordination.md` strategic + objectives and break the next one into concrete, assignable tasks. + - If agents have reported findings (from red-team or profiling), + ensure those are captured in to-do.md. + +6. **Update all files**: + - `coordination.md` — agent status, any new issues or decisions + - `to-do.md` — new tasks, priority adjustments, status corrections + - `agent-X.md` — mark your own task as complete + +## Task Creation Guidelines + +Good tasks are: +- **Specific**: "Profile ratchet inner loop for Zhu2013 dataset" not + "Investigate performance" +- **Scoped**: Completable by one agent in one session (~1-2 hours) +- **Independent**: Minimal overlap with other tasks (check Blocks column) +- **Testable**: Clear success criteria (tests pass, benchmark improves, etc.) + +When deriving tasks from strategic objectives: +- Break Phase 6 steps into individual tasks (T-001 through T-005 already done) +- For code quality work, group related TODOs into one task per file/module +- For documentation, one task per major section (vignettes, function docs, etc.) 
+ +## Priority Guidelines + +| Priority | Criteria | +|----------|----------| +| P0 | Blocks multiple agents or causes incorrect results | +| P1 | Blocks the next strategic objective or is a correctness bug | +| P2 | Important but not blocking; performance improvements | +| P3 | Nice to have; cleanup; future-looking | + +## Cross-Agent Conflict Detection + +Watch for: +- Two agents modifying the same file (especially `ts_rcpp.cpp`, + `TreeSearch-init.c`, `R/RcppExports.R`) +- Incompatible parameter changes to the same Rcpp bridge function +- One agent's optimization breaking another's assumptions + +If conflicts are detected, flag them in `to-do.md` as P0 and note +which agents are affected. diff --git a/.positai/expertise/fitch-scoring.md b/.positai/expertise/fitch-scoring.md new file mode 100644 index 000000000..4f74afa9f --- /dev/null +++ b/.positai/expertise/fitch-scoring.md @@ -0,0 +1,136 @@ +# Fitch Scoring — Design Notes & Proven Invariants + +Reference for agents working on `ts_fitch.h/.cpp`, `ts_fitch_na.h`, +`ts_fitch_na_incr.h`, or the search modules that call them. + +## Incremental uppass correctness (standard Fitch) + +The incremental uppass (`fitch_incremental_uppass`) uses a dirty-flag +propagation scheme that does **not** explicitly revisit every node whose +prelim changed during the incremental downpass. Only nodes whose +*ancestor's final* changed are recomputed. + +This looks like it could miss updates when the downpass stops before +root (prelim stabilises at some intermediate node N). Nodes between +`clip_ancestor` and N have changed prelims but their ancestors' finals +are unchanged, so the dirty-flag scheme skips them. + +**This is provably correct for standard (non-NA) Fitch blocks.** + +### Proof sketch + +When the downpass stops at node N, `fitch(M_new, S) = fitch(M_old, S)` +where M is N's child on the downpass path and S is the sibling. + +**Case 1 — both intersection-type:** `M_old ∩ S = M_new ∩ S = P`. 
+Then N_final ⊆ P ⊆ M_old and N_final ⊆ P ⊆ M_new. So +`uppass(N_final, M_old) = N_final ∩ M_old = N_final` and likewise for +M_new. Finals are identical. + +**Case 2 — both union-type:** `M_old ∪ S = M_new ∪ S` with +`M_old ∩ S = ∅` and `M_new ∩ S = ∅`. Since the unions are equal and +both M sets are disjoint from S, `M_old = M_new`. No change. + +**Case 3 — mixed types:** `M_a ∩ S = M_b ∪ S` with `M_b ∩ S = ∅`. +Since `M_a ∩ S ⊆ S ⊆ M_b ∪ S`, both sides equal S, so `M_b ⊆ S` and hence +`M_b = ∅`. Not reachable in practice (would require an empty state set). + +The argument applies per-character (per bit position), so it holds +for packed 64-bit representations. + +### Consequence + +No code change needed. The dirty-flag scheme is an optimisation that +happens to be exact for standard Fitch, not just a heuristic. + +--- + +## NA uppass `children_app` staleness + +The NA-aware incremental uppass (`fitch_na_incremental_uppass`) has a +**theoretical staleness issue** that does NOT affect standard blocks. + +The NA uppass formula at internal nodes uses: + +```cpp +uint64_t children_app = 0; +for (int s = 1; s < k; ++s) + children_app |= (tree.prelim[left + s] | tree.prelim[right + s]); +``` + +This `children_app` can change even when the node's own prelim is +stable, because the NA downpass aggregates children differently (using +intersection/union/strip cases) from the raw OR of children's states. + +If the downpass stops at node N because N's NA-aware prelim didn't +change, but N's child M *did* change prelim, then `children_app` at N +is different from before. The dirty-flag scheme won't revisit N, so +N's `final_` for NA blocks may be stale. + +### Impact + +- `fitch_na_pass3_score()` uses `final_` for `ss_app` (applicability). + A stale `ss_app` can make `divided_length` slightly wrong. +- Indirect length calculations use `final_` for virtual-root + computation, so candidate scores can be slightly wrong.
+- **Conservative**: `full_rescore()` always runs before accepting a + move, so final results are never affected. +- Same design class as the documented `extract_divided_steps` heuristic + (ts_tbr.cpp:39-41) which uses stale `local_cost` for NA blocks. + +### If this ever needs fixing + +Mark the entire rootward path from `clip_ancestor` as dirty: + +```cpp +int node = clip_ancestor; +while (node != root) { + dirty[node] = true; + node = tree.parent[node]; +} +``` + +This is O(depth) extra work per clip, acceptable for correctness. +Currently not worth doing because full_rescore is authoritative. + +--- + +## upweight_mask coverage + +During ratchet perturbation, `upweight_mask` doubles the contribution +of selected characters. Every function that computes EW step counts +must account for it. The pattern: + +```cpp +int ns = popcount64(needs_step); +if (blk.upweight_mask) ns += popcount64(needs_step & blk.upweight_mask); +extra_steps += blk.weight * ns; +``` + +**Sites that must have this** (all verified correct as of 2026-03-19): + +| Function | File | Status | +|----------|------|--------| +| `fitch_downpass` | ts_fitch.cpp | ✓ | +| `fitch_incremental_downpass` | ts_fitch.cpp | ✓ | +| `fitch_indirect_length` | ts_fitch.cpp | ✓ | +| `fitch_indirect_length_bounded` | ts_fitch.cpp | ✓ (fixed T-096) | +| `fitch_indirect_length_cached` | ts_fitch.cpp | ✓ (fixed T-096) | +| `fitch_na_indirect_length` | ts_fitch_na_incr.h | ✓ | +| `fitch_na_indirect_length_bounded` | ts_fitch_na_incr.h | ✓ | +| `fitch_na_indirect_length_cached` | ts_fitch_na_incr.h | ✓ | +| `fitch_na_score` Pass 1 (standard blocks) | ts_fitch_na.h | ✓ | +| `fitch_na_score` Pass 3 | ts_fitch_na.h | ✓ | +| `fitch_na_pass3_score` | ts_fitch_na_incr.h | ✓ | +| `fitch_na_incremental_downpass` (standard blocks) | ts_fitch_na_incr.h | ✓ | +| `nx_cost` in TBR | ts_tbr.cpp | ✓ (fixed T-096) | +| `nx_cost` in SPR | ts_search.cpp | ✓ (fixed T-096) | +| `nx_cost` in drift | ts_drift.cpp | ✓ (fixed T-096) | +| drift 
RFD computation | ts_drift.cpp | ✓ (fixed T-096) | + +**Does NOT need upweight_mask:** +- `extract_char_steps` / `extract_divided_steps` — these extract raw + per-pattern step counts for IW/profile scoring, which uses + `pattern_freq` doubling instead of `upweight_mask`. +- `fitch_downpass_node` (standalone) — callers handle weighting. +- IW indirect variants — weighting baked into `iw_delta`. diff --git a/.positai/expertise/profiling.md b/.positai/expertise/profiling.md new file mode 100644 index 000000000..879ec7505 --- /dev/null +++ b/.positai/expertise/profiling.md @@ -0,0 +1,506 @@ +# Profiling Expertise — TreeSearch + +## Purpose + +Profile the C++ search engine to identify bottlenecks. Produce specific, +actionable optimization tasks for `to-do.md`. + +## Tools + +### 1. Built-in Phase Timing (Quick) + +The driven search already has `std::chrono` phase timing at `verbosity >= 2`. +Use the R-level interface: + +```r +library(TreeSearch) +library(TreeTools) +dataset <- TreeSearch::inapplicable.datasets[["Vinther2008"]] +result <- MaximizeParsimony(dataset, maxReplicates = 3, verbosity = 2L) +``` + +This prints per-phase timing. For programmatic access, use the +`ts_bench_tbr_phases` diagnostic function (7 args, registered in +TreeSearch-init.c). + +### 2. std::chrono Micro-Benchmarks (Medium) + +For fine-grained timing of specific functions, add `steady_clock` timing +around the code path of interest. See `inst/benchmarks/bench_memory.R` +and `inst/benchmarks/bench_simd.R` for examples. + +Key metrics to measure: +- Per-candidate indirect scoring cost (ns) +- Clip+incremental phase time (μs per TBR pass) +- Full rescore time (μs) +- Snapshot save/restore time (μs) + +### 3. VTune (Thorough) + +For instruction-level hotspot analysis, use the `r-package-profiling` +skill (load via the skill tool). Key steps: + +1. Build with debug symbols: set `DLLFLAGS` via `MAKEFLAGS` env var +2. Run a representative workload under VTune +3. 
Analyze hotspots in the VTune GUI + +See `.positai/skills/r-package-profiling/references/` for detailed +VTune workflow on Windows. + +**Current version: VTune 2025.10** (updated 2026-03-19). Requires Ice Lake +or newer CPU (10th gen Intel Core / 3rd gen Xeon Scalable+). VS 2019 +integration and Eclipse integration are removed in 2025.x. Command-line +workflow (`vtune -collect hotspots`) is unchanged. + +### 4. R-Level Profiling + +For R overhead identification: + +```r +Rprof("profile.out") +result <- MaximizeParsimony(dataset, maxReplicates = 5) +Rprof(NULL) +summaryRprof("profile.out") +``` + +## Known Baselines + +### Latest run: 2026-03-27 by Agent A (round 6: post-T-261/T-262/T-263 phase distribution) + +See "Phase distribution: current thorough preset" section below for updated numbers. +The 2026-03-18 baselines used strategy='none' (TBR-only); the thorough preset +now dominates medium-scale search, making direct comparison impractical. + +### Previous run: 2026-03-18 16:00 by Agent A (v2.0.0, single-agent, quiet machine) + +Previous baselines (2026-03-17) were inflated ~30–40% by multi-agent machine +contention. Scores are identical. Timings below are authoritative. + +### End-to-end benchmarks (3-run medians, 5 reps, strategy='none', EW): + +| Dataset | Tips | Chars | Median (s) | Score | +|---------|------|-------|------------|-------| +| Vinther2008 | 23 | 57 | 0.390 | 79 | +| Agnarsson2004 | 62 | 242 | 1.860 | 778 | +| Zhu2013 | 75 | 253 | 2.720 | 655 | +| Dikow2009 | 88 | 220 | 3.860 | 1614 | + +### Per-phase breakdown (Zhu2013, 5 reps, two runs averaged): + +| Phase | % of time | Avg ms/rep | +|-------|-----------|------------| +| Wagner | <0.1% | <1 | +| TBR | 24–37% | 110–160 | +| XSS | 10% | 35–55 | +| RSS | 2% | 9–13 | +| Ratchet | 24–28% | 90–155 | +| Drift | 25–33% | 90–200 | +| Final TBR | 2% | 7–10 | + +Ratchet (24-28%) and drift (25-33%) dominate. TBR (24-37%) varies +substantially by run. XSS ~10%, RSS ~2%, both stable. 
+ +### Wagner tree construction: Negligible (≤0.3% of replicate time) + +| Dataset | Tips | µs/tree | % of replicate | +|---------|------|---------|----------------| +| Vinther2008 | 23 | 300 | <0.1% | +| Agnarsson2004 | 62 | 1000 | 0.3% | +| Zhu2013 | 75 | 600 | 0.1% | +| Dikow2009 | 88 | 1400 | 0.2% | + +Not a bottleneck at any dataset size. No optimization needed. + +### Parallel scaling (2 threads) + +| Dataset | Reps | 1T (s) | 2T (s) | Speedup | Efficiency | +|---------|------|--------|--------|---------|------------| +| Zhu2013 | 5 | 2.53 | 1.59 | 1.59× | 80% | +| Zhu2013 | 10 | 5.16 | 3.29 | 1.57× | 78% | +| Zhu2013 | 20 | 10.70 | 5.20 | 2.06× | 103%* | +| Zhu2013 | 40 | 18.63 | 11.35 | 1.64× | 82% | +| Dikow2009 | 10 | 7.76 | 5.11 | 1.52× | 76% | + +*Superlinear at 20 reps is stochastic noise (different search paths). + +**Finding:** Typical 2-thread efficiency is 78–82%. The old 1.24× measurement +was a multi-agent machine contention artifact. The implementation (dynamic +work-stealing via `atomic::fetch_add`, mutex-guarded pool) is sound. +Main loss is stochastic load imbalance between replicate times. + +### XSS/RSS effectiveness (5 reps per dataset) + +| Dataset | Tips | XSS hits | XSS avg Δ | XSS avg ms | RSS hits | RSS avg Δ | RSS avg ms | +|---------|------|----------|-----------|------------|----------|-----------|------------| +| Agnarsson2004 | 62 | 3/5 | 3.8 steps | 59 | 0/5 | 0 | 14 | +| Zhu2013 | 75 | 5/5 | 26.6 steps | 43 | 2/5 | 1.0 | 11 | +| Dikow2009 | 88 | 0/5 | 0 | 93 | 1/5 | 3.2 | 29 | + +**Finding:** XSS effectiveness is highly dataset-dependent — from zero +improvement (Dikow2009) to 27-step average improvement (Zhu2013). No obvious +predictor from simple nTip/nChar statistics. XSS cost is ~10% of replicate +time; acceptable when effective but wasted when not. + +RSS is marginal across all datasets (0–3 steps, 2% of time).
One exception: +Dikow2009, where one RSS hit gained 16 steps (3.2 avg over 5 reps) while XSS +found 0 — suggests they explore different neighbourhoods. + +### Auto strategy (reference — unchanged from T-066/T-068 study) + +Threshold: ≥75 tips AND nChar < 100 triggers "thorough". Signal-density gate +prevents unnecessary thorough runs on character-rich datasets. + +### R overhead: <0.5% of wall time (confirmed via Rprof, unchanged) + +### Scaling exponent: ~2.82 (TBR pass time vs tips, unchanged) + +### Drift/ratchet cycle tuning (reference — unchanged from T-029 study) + +| Config | Med score | Min score | Med time | Speedup | +|--------|-----------|-----------|----------|---------| +| d5_r5 (former default) | 656 | 648 | 5.7s | — | +| d2_r5 | 660 | 646 | 4.1s | 28% | +| d2_r2 | 662 | 656 | 3.8s | 33% | +| d0_r5 | 658 | 650 | 2.8s | 51% | +| d5_r0 | 662 | 660 | 4.8s | 16% | + +Lower score = better. Current defaults: d2_r5. + +### CSS effectiveness: Marginal (adds 2-6% time, no consistent improvement) +Disabled by default (cssRounds=0). + +### Latest EW regression check: 2026-03-19 by Agent A (v2.0.0, post T-115–T-124) + +All datasets pass regression benchmark. EW baselines updated with 7-run medians: + +| Dataset | Tips | Chars | Median (s) | Score (range) | Notes | +|---------|------|-------|------------|---------------|-------| +| Vinther2008 | 23 | 57 | 0.420 | 79 | stable | +| Agnarsson2004 | 62 | 242 | 1.790 | 778 | stable | +| Zhu2013 | 75 | 253 | 3.170 | 648–666 | high variance (2.5–7.6s range) | +| Dikow2009 | 88 | 220 | 4.900 | 1612–1614 | high variance (4.0–12.4s range) | + +Zhu2013/Dikow2009 appear slightly slower than 2026-03-18 baselines (~17–27%) but +within stochastic noise. Phase breakdown unchanged. No regression in C++ engine. +The recent DataSet changes (inapp_state field, HSJ/XFORM modes) have no measurable +effect on EW search paths.
+ +### HSJ and XFORM scoring baselines: 2026-03-19 by Agent A + +Synthetic hierarchical datasets (valid hierarchy structure: primary + secondary chars, +secondaries are inapplicable when primary absent). 3-run medians, 5 reps per run. + +| Config | Tips | Chars | Blocks | EW (s) | HSJ (s) | XFORM (s) | HSJ/EW | XFORM/EW | +|--------|------|-------|--------|--------|---------|-----------|--------|----------| +| small | 20 | 19 | 3 | 0.020 | 0.010 | 0.020 | 0.5× | 1.0× | +| medium | 40 | 50 | 5 | 0.170 | 0.100 | 0.280 | 0.6× | 1.6× | +| large | 60 | 82 | 8 | 0.610 | 0.360 | 1.330 | 0.6× | 2.2× | +| xlarge | 80 | 120 | 10 | 5.920 | 3.560 | 9.460 | 0.6× | 1.6× | + +**HSJ is faster than EW** (~0.6× at medium/large sizes) because: +1. Fitch candidate screening guards expensive full HSJ rescore — most candidates + are rejected by Fitch before HSJ is called. +2. Hierarchy datasets have a simpler parsimony landscape (secondaries add signal + only when primary is present), leading to faster search convergence. + +**XFORM is slower than EW** (~1.6–2.2× at medium/large sizes) due to Sankoff +cost per candidate. Phase breakdown (large config, 5 reps): + +| Phase | EW avg ms/rep | HSJ avg ms/rep | XFORM avg ms/rep | +|-------|---------------|----------------|------------------| +| TBR | 25 | 23 | 29 | +| XSS | 14 | 7 | 14 | +| RSS | 4 | 2 | 5 | +| Ratchet | 51 | 28 | 86 | +| Drift | 22 | 13 | 36 | +| Final TBR | 2 | 1 | 4 | +| **Total** | **117** | **74** | **174** | + +XFORM overhead concentrated in Ratchet (+69%) and Drift (+64%), which perform +more scoring iterations than TBR. XSS/RSS overhead is negligible. + +**Conclusion:** Both modes are acceptable. XFORM at ~1.7× overhead for real +workflows is reasonable given the algorithmic complexity (Sankoff vs Fitch). +No optimization tasks raised — XFORM at this cost is expected behavior. 
+ +### Hierarchical resampling: 2026-03-19 by Agent A + +Medium config (40 tips, 50 chars, 5 blocks), jackknife, 20 reps: + +| Mode | 1 thread (s) | 2 threads (s) | Speedup | +|------|-------------|--------------|---------| +| Brazeau (C++ parallel) | 5.19 | 2.05 | 2.5× | +| HSJ hierarchical (serial R loop) | 1.76 | 1.64 | 1.1× | +| XFORM hierarchical (serial R loop) | measured via 10-rep: ~1.58 | — | — | + +**Finding 1 (positive):** HSJ/XFORM hierarchical resampling is faster than Brazeau +per-replicate because the block-level resampling units (35 vs 50 units) produce +simpler per-replicate datasets. No performance concern here. + +**Finding 2 (known limitation):** Hierarchical resampling uses a serial R loop +across replicates — `nThreads` only applies within each replicate's internal search. +Brazeau gets full 2.5× at 2 threads; HSJ/XFORM get only ~1.1×. For users running +50–100 jackknife replicates with large HSJ/XFORM datasets, wall time will be ~2× +longer than equivalent Brazeau. This is documented in AGENTS.md as a known future +optimization (C++-level inter-replicate parallelism for hierarchical resampling). +No new task filed — already on the roadmap. + +### Preset tuning benchmark: 2026-03-22 by Agent A + +Compared updated presets (wagnerStarts=3, sprFirst=TRUE, adaptiveLevel=TRUE +for default; wagnerStarts=3, sprFirst=TRUE for thorough) against old presets +(wagnerStarts=1, sprFirst=FALSE, adaptiveLevel=FALSE). 7-run medians via +`MaximizeParsimony()`, strategy=auto, 10 reps, 1 thread. 
+ +| Dataset | Tips | Preset | Old time (s) | New time (s) | Δ time | Old score | New score | +|---------|------|--------|-------------|-------------|--------|-----------|-----------| +| Vinther2008 | 23 | sprint | 0.76 | 0.65 | –14% (noise) | 79 | 79 | +| Agnarsson2004 | 62 | default | 3.59 | 2.41 | **–33%** | 778 | 778 | +| Zhu2013 | 75 | thorough | 23.65 | 24.83 | +5% (noise) | 647 | 648 | +| Dikow2009 | 88 | thorough | 49.19 | 39.24 | **–20%** | 1611 | 1612 | + +**Findings:** +- `adaptiveLevel` in `default` preset: consensus-stability triggers early exit + on easy landscapes (Agnarsson2004), saving 33%. No score regression. +- `sprFirst + wagnerStarts=3` in `thorough`: 20% faster on Dikow2009 (better + starting tree reduces initial TBR descent). Neutral on Zhu2013. +- **Do not enable `adaptiveLevel` in `thorough`**: with 20 ratchet + 12 drift + base, 1.5× scaling creates 30 ratchet + 18 drift per hard replicate, + causing 3–4× slowdowns for only 2–3 step improvement (benchmarked separately). + +### 180-tip large-preset baselines: 2026-03-26 by Agent E (Hamilton HPC, EPYC 7702) + +Dataset: mbank_X30754 (180 taxa, 425 chars, 418 patterns, 40% missing, 20% inapplicable). +Strategy: auto → "large" preset. 5 seeds per budget, single-threaded. + +**Score quality by budget (median, 5 seeds):** + +| Budget | Median score | Range | Reps/seed | +|--------|:-----------:|:-----:|:---------:| +| 30s | 1202 | 1189–1214 | ~1.5 | +| 60s | 1190 | 1190–1202 | ~3 | +| 120s | 1185 | 1171–1189 | ~6 | + +Per-replicate time: median 17.3s (range 13.7–21.2s). MPT enumeration adds +0–2 steps beyond best single-replicate score. 
+ +**Phase distribution (rep 1, 30s budget, 5-seed averages):** + +| Phase | % time | Mean ms | Steps/s | Hit rate | +|-------|:------:|--------:|:-------:|:--------:| +| TBR | 43.6% | 7313 | 91.4 | 5/5 (661 steps avg) | +| Ratchet | 32.2% | 5390 | 4.5 | 5/5 (26.6 steps avg) | +| SA (anneal) | 7.4% | 1241 | 0.8 | 7/50 (14%, 1.3 steps) | +| XSS | 5.4% | 897 | 13.8 | 4/5 | +| Wagner+NNI | 4.7% | 790 | — | starting point | +| RSS | 3.2% | 530 | 4.8 | 3/5 | +| CSS | 2.5% | 424 | 11.2 | 2/5 | +| Final TBR | 1.0% | 174 | 5.2 | 1/5 | + +**SA (simulated annealing) phase is the least productive:** 7.4% of time, +14% hit rate (7/50 reps improved by 1.3 steps on average). Efficiency = +0.8 steps/s, far below ratchet (4.5) or XSS (13.8). annealCycles=3, +annealPhases=5 may be overtuned. Reducing them could save up to ~1.2s of each +~17s replicate, i.e. about one extra replicate for every ~14 run. + +**Comparison with earlier Intel desktop baselines (T-179, pre-T-206):** + +| Budget | Intel (pre-T-206) | EPYC (post-T-206) | Delta | +|--------|:-:|:-:|:-:| +| 30s | 1276 | 1202 | −74 | +| 60s | 1255 | 1190 | −65 | +| 120s | 1250 | 1185 | −65 | + +The 65–74 step gap is **primarily due to T-206** (outer cycle reset cap), +not hardware. T-206 was merged 2026-03-24 19:27; the Intel baselines were +recorded at 12:56 the same day (pre-T-206). Without the reset cap, each +replicate performed 3–5 pipeline cycles (~51–85s) vs ~17s with cap=0. +At 120s budget: ~2 replicates pre-T-206 vs ~6 post-T-206. Hardware +differences (Intel desktop vs EPYC 7702) are a secondary factor. + +### Phase distribution: current thorough preset (2026-03-27, Agent A, round 6) + +Dataset: Zhu2013 (75t, 253 chars). Strategy: auto → thorough. +3 reps, single-threaded, post-T-261+T-262+T-263. Total: 33.7 s = ~11.2 s/rep.
+ +| Phase | Calls | Total ms | Mean ms | % | +|-------|:-----:|:--------:|:-------:|:---:| +| Ratchet | 14 | 15617 | 1116 | 46.3% | +| NNI-perturb | 14 | 11565 | 826 | **34.3%** | +| RSS | 14 | 2488 | 178 | 7.4% | +| CSS | 14 | 1477 | 106 | 4.4% | +| XSS | 14 | 1079 | 77 | 3.2% | +| TBR (post-phase) | 14 | 622 | 44 | 1.8% | +| Initial TBR | 3 | 468 | 156 | 1.4% | +| wag+NNI | 2 | 427 | 214 | 1.3% | + +**Key findings vs 2026-03-18 baselines:** + +1. **TBR is no longer a bottleneck** (1.4% + 1.8% = 3.2%). T-261+T-262+T-263 + combined are working — TBR has become fast enough that other phases dominate. + Drift was 25–33% before T-255; its removal freed that budget for more ratchet. + +2. **NNI-perturb at 34.3% with poor efficiency:** + - Hit rate: 14% (2/14 calls improved score) + - Mean improvement when hit: 1 step + - Efficiency: 0.17 steps/s vs ratchet's ~4–8 steps/call at comparable cost + - Cost grows within a replicate (early calls ~300ms, late calls ~1300ms) + - This phase is likely over-tuned for 75-tip datasets. Filed **T-274** (P2). + +3. **RSS at 7.4%** — higher than old 2% baseline. Conflict-guided RSS plus the + outerCycles/reset mechanism yield ~4.7 RSS calls per replicate at ~178ms each + (~837ms/rep). Old uniform RSS: ~11ms/rep, i.e. ~16× per call, ~76× per replicate. Most of this + is the actual sector TBR cost (more calls × similar per-sector time), not conflict + computation overhead. The reset mechanism is the multiplier. + +4. **wag+NNI at 1.3%**: biased Wagner + 3 starts + NNI warmup adds ~214ms per + replicate start. Negligible at this scale; confirms T-246/NNI-warmup tuning is fine. + +## What to Profile + +Status key: ✅ resolved, ⚠ partially explored, ❌ not yet investigated + +1. ✅ **Drift + ratchet inner loops** (50–60% of C++ time combined). Both use + TBR internally. Per-candidate indirect evaluation at memory-throughput + limit (~23 ns at 75 tips per T-075). Cycle counts tuned (d2_r5).
+ **Drift threshold sensitivity (2026-03-18 Agent E):** AFD={1,3,5,8} × + RFD={0.05,0.1,0.2} on Zhu2013 (75 tips, 15 runs each): no significant + score difference between any config (Wilcoxon p=0.60–1.00). Permissive + thresholds (AFD=8, RFD=0.2) waste time; tight vs default indistinguishable. + On Dikow2009 (88 tips), d2 drift provides no benefit over ratchet alone + (p=0.54); d6 gives 2-step improvement (p=0.006) at 2× time cost. + **Conclusion:** Current defaults (AFD=3, RFD=0.1) are fine. Cycle count + matters more than threshold values. No optimization task raised. + +2. ✅ **Sectorial search effectiveness** (12% of time). XSS effectiveness is + dataset-dependent (0–27 steps). RSS is marginal (0–3 steps). No clear + predictor from simple dataset statistics. Could make XSS adaptive (skip + after N unproductive reps) but time savings would be <10%. + +3. ✅ **Wagner tree construction**: <0.1% of search time. Not a bottleneck. + +4. ✅ **R overhead**: <0.5% of wall time. Not a bottleneck. + +5. ✅ **Parallel scaling**: 78–82% efficiency at 2 threads. Implementation is + sound (dynamic work-stealing, low-contention pool). Main loss is stochastic + load imbalance. No obvious improvement without algorithmic changes. + +6. ✅ **IW scoring overhead** (2026-03-18 Agent E). Compared EW vs IW (k=10, + k=3) on three datasets (5 runs each, d2_r5, 5 reps, serial): + - Vinther2008 (23 tips): IW 64% *faster* (landscape converges quicker) + - Agnarsson2004 (62 tips): IW 26–39% slower + - Zhu2013 (75 tips): IW 40–57% slower + IW overhead scales with dataset size due to per-character weighted delta + computation in indirect scoring. No optimization opportunity — the delta + lookup is already O(n_blocks) per candidate, same as EW Fitch. + +7. ✅ **Fuse effectiveness** (2026-03-18 Agent E). 
Compared fuseInterval=0 vs + 3 on three datasets (8 runs each, 10 reps): + - Agnarsson2004: identical scores/time (pool deduplicates to 1 tree) + - Zhu2013: identical scores/time + - Dikow2009: negligible overhead (13.65s vs 13.78s with poolSuboptimal=5) + Fuse is cheap when pool is small, free when pool=1. Current default + (fuseInterval=3) is appropriate. No optimization task raised. + +## Comparing Search Strategies: Time-Adjusted Expected Best + +When comparing strategies that differ in per-replicate cost (e.g. NNI→TBR +vs TBR alone), the **median per-replicate score is the wrong metric**. +Multi-start search keeps the best tree across all replicates, so what +matters is the expected minimum from k independent draws, where +k = budget / time_per_replicate. + +A strategy with high variance but occasional excellent scores can dominate +a consistent-but-mediocre one — if it's fast enough to get more draws. + +**Bootstrap estimation:** +```r +expected_best <- function(scores, k, n_boot = 5000) { + mean(replicate(n_boot, min(sample(scores, k, replace = TRUE)))) +} + +# k = budget / median_time_per_rep for each strategy +k <- floor(budget / median_time) +exp_best <- expected_best(observed_scores, k) +``` + +Compare `exp_best` across strategies at fixed budget (e.g. 20s, 60s, 120s). +This naturally trades off per-replicate quality against replicate throughput. + +**When median IS acceptable:** comparing parameter changes on a fixed pipeline +(same time-per-rep), e.g. ratchet perturbation probability. All runs take +roughly the same time, so k is constant and the median is a reasonable proxy. + +See AGENTS.md "NNI in the driven pipeline" for the reference application of +this metric (NNI→TBR vs TBR at 88 and 180 tips). + +## Reporting Format + +For each finding, add to `to-do.md`: + +``` +| T-NNN | P2 | OPEN | — | [Profile] Brief description | X% of time. Potential Y% improvement via Z approach. 
| +``` + +Include the measurement methodology and baseline numbers so the implementer +can verify the improvement. + +8. ✅ **HSJ scoring overhead** (2026-03-19 Agent A). HSJ is ~0.6× EW wall time + (faster) on synthetic hierarchical data. Fitch screening gates full HSJ rescore + effectively. No optimization needed. + +9. ✅ **XFORM (Sankoff) scoring overhead** (2026-03-19 Agent A). XFORM is ~1.6–2.2× + EW wall time. Overhead concentrated in Ratchet (+69%) and Drift (+64%). This + is expected Sankoff vs Fitch arithmetic cost — no obvious optimization target. + +10. ✅ **Hierarchical resampling parallelism** (2026-03-19 Agent A). Serial R loop + means `nThreads` only applies within each replicate. Brazeau 2T = 2.5× speedup; + HSJ/XFORM hierarchical 2T = 1.1× only. Known limitation, future optimization + (C++-level inter-replicate parallelism for hierarchical resampling). + +11. ✅ **MaddisonSlatkin internal bottlenecks** (2026-03-19 Agent A, T-149). + VTune hotspot collection (software sampling, `-g -fno-omit-frame-pointer` + symbols build) on 57 calls at boundary cases: k=3/n=20–25, k=4/n=14–18, + k=5/n=9–12. Total ~23 s CPU time; 63% in `TreeSearch.dll`. + + **CPU time breakdown within TreeSearch.dll (14.1 s):** + + | Category | CPU (s) | % DLL | + |----------|---------|-------| + | `logB_cache::find` (k=3,4,5) | 2.72 | 19% | + | `SolverT::LogB` compute | 1.88 | 13% | + | `logPVec_cache::find` (k=3,4,5) | 1.91 | 14% | + | `SolverT::LogPVec` compute | 1.24 | 9% | + | `LogPVecKey::operator==` | 1.11 | 8% | + | `StateKeyT::operator==` | 1.01 | 7% | + | `expl`/`_expl_internal` (LogB LSE) | 0.91 | 6% | + | `logRD_cache::find` | 0.74 | 5% | + | `std::isfinite` (all sites) | 0.70 | 5% | + | `vector::~vector` (eviction) | 0.60 | 4% | + | `logconv` actual convolution | 0.20 | 1% | + + **Key findings:** + - `logconv` is only **1%** of DLL time — the Phase 2 vectorization worked + perfectly; the algorithm itself is no longer the bottleneck. 
+ - **Hash map infrastructure dominates** (53% of DLL time): `unordered_map::find` + + key equality checks across the three caches (logB, logPVec, logRD). + Switching to a flat/open-addressing map would help but adds complexity. + - **`expl()` in `LSEAccumulator`** (6%) uses long-double arithmetic. Switching + to `double`/`exp()` would save ~0.7s at negligible precision cost. → **T-151** + - **`std::isfinite`** (5%) routes through `_fpclassify` on MinGW/Windows. + Replacing with `x != NEG_INF` saves the function-call overhead. → **T-152** + - `memcmp` in ucrtbase.dll (1.6 s / 7% of total) is the `StateKeyT::operator==` + fall-through when `cached_hash` and `cached_sum` both match — unavoidable + with the current key design. + + **Estimated combined T-151 + T-152 saving: ~1.4 s (6%) per cold-cache run.** + +## Build and Test (Reminder) + +Always use isolated library: +```bash +R CMD build --no-build-vignettes --no-manual . && R CMD INSTALL --library=.agent-X TreeSearch_*.tar.gz && rm -f TreeSearch_*.tar.gz +Rscript -e "library(TreeSearch, lib.loc='.agent-X'); testthat::test_dir('tests/testthat', filter='ts-')" +``` + +Max 2 CPU cores. Use `nThreads = 2L` at most in benchmarks. diff --git a/.positai/expertise/red-team.md b/.positai/expertise/red-team.md new file mode 100644 index 000000000..979035e34 --- /dev/null +++ b/.positai/expertise/red-team.md @@ -0,0 +1,174 @@ +# Red-Team Expertise — TreeSearch + +## Purpose + +Red-teaming reviews code for (i) bugs and (ii) performance issues. +Fix trivial issues directly; add non-trivial ones to `to-do.md`. + +## Focused rotation system + +Each S-RED invocation targets **one focus area** from the rotation below. +The agent reads `last_focus` (bottom of this file) to determine which area +was reviewed last, then picks the **next** area in sequence. After +completing the review, update `last_focus`. 
+ +### Focus areas + +| # | Area | Scope | Key questions | +|---|------|-------|---------------| +| 1 | **Fitch scoring correctness** | `ts_fitch.h/.cpp`, `ts_fitch_na.h`, `ts_fitch_na_incr.h` | Does incremental scoring match full `score_tree()`? Bounded variants bail correctly? NA three-pass edge cases? Write a targeted test if you find a gap. | +| 2 | **Search topology invariants** | `ts_tbr.cpp`, `ts_drift.cpp`, `ts_search.cpp` | After every rejected move, is topology fully restored? Undo stack correct? No stale `postorder`? Symmetry-breaking hash collisions? | +| 3 | **Ratchet & perturbation** | `ts_ratchet.cpp`, `ts_sector.cpp`, `ts_fuse.cpp` | `active_mask`/`upweight_mask` fully restored after perturbation? Sectorial reinsertion reverts on worse score? Fuse exchange handles tied scores? | +| 4 | **Parallelism & RNG** | `ts_parallel.cpp`, `ts_rng.h/.cpp`, `ts_driven.cpp` | Thread-local RNG set before any search call? No R API calls from worker threads? Pool mutex correct? Atomic stop flag races? Seeds generated from R RNG before spawning? | +| 5 | **Data pipeline & simplification** | `ts_data.h/.cpp`, `ts_simplify.h/.cpp`, `ts_constraint.h/.cpp` | `build_dataset` handles edge cases (all-ambiguous, single-state, zero-weight)? `build_reduced_dataset` copies all fields? Constraint column-major indexing correct? | +| 6 | **R ↔ C++ interface** | `ts_rcpp.cpp`, `TreeSearch-init.c`, `R/RcppExports.R`, `R/MaximizeParsimony.R` | Arg counts match? Concavity sentinel translated correctly? Edge matrix conventions? Return value attributes set? Parameter validation in R layer? | +| 7 | **Shiny module wiring** | `inst/Parsimony/server.R`, `server/mod_*.R`, `server/events.R` | Forward-ref callbacks resolve? Cross-module `updateXxxInput` targets correct namespace? Reactive graph has no orphaned observers? `isolate()` used correctly in result observers? | +| 8 | **Test suite health** | `tests/testthat/test-ts-*.R`, `tests/testthat/helper-ts.R` | Tier guards correct? 
Any tests that always pass (vacuous)? Missing `TreeSearch:::` prefixes? Edge-case coverage gaps (3-tip, single-char, all-NA)? Flaky tests? | +| 9 | **Wagner & addition trees** | `ts_wagner.h/.cpp` | NA-incremental scoring staleness acceptable? Constraint mapping (LCA-based) correct? Retry loop fires when needed? 3-taxon base case handles all orderings? | +| 10 | **Profile & IW scoring** | `ts_fitch.cpp` (IW/profile paths), `ts_data.cpp` (precompute) | `e/(k+e)` delta correct? Profile `info_amounts` lookup matches? `concavity = 1.0` sentinel activates weighted path? `precompute_profile_delta` includes `precomputed_steps` offset? | + +### Workflow for a focused review + +1. Read `last_focus` below → pick next area in rotation. +2. **Read the target files** thoroughly (not just skimming). Understand the + logic before looking for bugs. +3. **Construct a specific adversarial scenario** — e.g. "what happens if + the ratchet upweights a character that's already at max weight?" — and + trace the code path. +4. **Write or run a targeted test** that exercises the scenario. If the + test passes, note it. If it fails, file a bug. +5. Check for any **recent commits** touching the focus files (`git log`). +6. **Build and run** at minimum the test files most relevant to the focus + area (not necessarily the entire suite). +7. Update `last_focus`, report findings. + +Total time budget: aim for depth over breadth. A focused review that finds +one real bug is worth more than a broad sweep that confirms "all green." + +## Bug patterns (reference — from past rounds) + +| Pattern | Where to check | +|---------|---------------| +| Missing `GetRNGstate()`/`PutRNGstate()` around `unif_rand()` | Any .cpp using randomness | +| `std::random_device{}()` ignoring `set.seed()` | Seeding of `std::mt19937` | +| GCC-only builtins (`__builtin_popcountll`, etc.) 
| All .cpp/.h files | +| `.inc` file changes not triggering recompilation | `ts_fitch_na.inc`, `ts_fitch_na_incr.inc` | +| Missing `TreeSearch:::` prefix in tests | `tests/testthat/test-ts-*.R` | +| Arg count mismatch in `TreeSearch-init.c` | After adding/removing Rcpp params | +| `R_PosInf` in Rcpp defaults | `R/RcppExports.R` after `compileAttributes()` | +| No revert on worsening move | Sectorial reinsertion, fuse exchange | +| `active_mask`/`upweight_mask` not cleaned up | Ratchet perturbation restore paths | + +## Performance patterns (reference) + +| Pattern | Where to check | +|---------|---------------| +| Unbounded indirect scoring (missing `_bounded` variant) | Search inner loops | +| Full `score_tree()` where incremental would suffice | After clip/regraft | +| `build_postorder()` called unnecessarily | After unclip or snapshot restore | +| Full-tree copy where save/restore suffices | Fuse, sectorial | +| Missing early termination in loops | Block iteration in scoring | + +## Known fragile areas (reference) + +1. `ts_rcpp.cpp` + `TreeSearch-init.c`: append-only, check arg counts. +2. `RcppExports.R/.cpp`: concavity `Inf` → `-1.0` sentinel after regen. +3. `.inc` files: `touch src/ts_fitch.cpp` after changes. +4. Parallel: `ts_rng.h` thread_local must be set before search. +5. `init_from_edge`: first child → left convention. + +## Reporting format + +``` +| T-NNN | P1/P2 | OPEN | — | [Bug/Perf] Brief description | Found by S-RED focus #N. File:line. Details. | +``` + +--- + +## last_focus + +area: 4 +reviewed_by: E +date: 2026-03-27 +notes: Parallelism & RNG — ts_rng.h/.cpp (57+53 lines), ts_parallel.cpp (591 lines), ts_parallel.h (141 lines). ts_driven.cpp covered in E-003/S-RED focus ~21 (2026-03-27; see agent-E.md). **Findings:** (1) ts_rng.h/.cpp CLEAN — thread_local pointers init to nullptr and cleared at worker exit. make_rng(): serial wraps unif_rand() in Get/PutRNGstate; parallel seeds new mt19937 from *thread_rng (clean RNG hierarchy). 
check_interrupt(): serial longjmps via R_CheckUserInterrupt(); parallel relaxed atomic load. thread_safe_unif(): serial calls unif_rand() (caller manages state); parallel uses std::uniform_real_distribution on *thread_rng. rng_state_begin/end correctly no-ops in parallel. (2) ThreadSafePool CLEAN — all methods under lock_guard. fuse_round holds mutex across tree_fuse() (performance concern, not correctness). Passes pool_ (underlying TreePool) to tree_fuse() directly — no nested lock deadlock risk. hits_to_best save/restore logic correct. extract_into correctly called post-join; uses non-collapsed add() for output, overrides hits_to_best with accumulated count. (3) worker_thread CLEAN — ctx copied by value at emplace_back with thread_id already set; each worker gets distinct slot in pre-sized thread_timings/thread_scores arrays (no race). thread_local ts::thread_rng and ts::thread_stop_flag set at entry, cleared at exit. No R API calls from workers. (4) Seeds pre-generated on main thread with Get/PutRNGstate before thread spawn; workers seed local_rng from seeds[rep] read-only. (5) strategies vector declared in parallel_driven_search frame, lives until join() — no dangling pointer. (6) Consensus stability check race: done_now relaxed-read before pool mutex; pool updated before replicates_done incremented in workers → main thread may see done_now slightly ahead of pool, triggering consensus check with fewer trees than expected. Conservative direction — safe. Pool never shrinks so pool_size >= 2 guard from status() call remains valid. (7) MPT enumeration: tbr_search is deterministic (no RNG). thread_rng == nullptr on main thread post-join is correct. (8) parallel_resample: same seed pattern, correct. results[rep] writes all complete before join(). (9) Lines 323-325 in main polling loop: empty if-block (dead code, no action taken). Harmless. 
(10) Verbosity Rprintf at line 403 acquires pool mutex via status() — could delay interrupt polling if worker holds mutex in fuse_round. Performance concern only. **No bugs found.** Next area: 5 (Data pipeline & simplification: ts_data.h/.cpp, ts_simplify.h/.cpp, ts_constraint.h/.cpp). + +Previous area: 3 +reviewed_by: F +date: 2026-03-27 +notes: Ratchet & perturbation, sectorial search, prune-reinsert (ts_prune_reinsert.cpp 511 lines NEW, ts_ratchet.cpp post-T-273, ts_sector.cpp 7189b02c+33c57518+5ffe2bfa changes). Focus on new/modified code since B's area-3 review (2026-03-19). **Findings:** (1) ts_prune_reinsert.cpp FULLY REVIEWED (new file, never reviewed). T-275 GUARD CORRECT — `prune_reinsert_search()` returns early for PROFILE/HSJ/XFORM modes (build_reduced_dataset copies scoring_mode but not info_amounts/hierarchy_blocks/sankoff_* fields). EW and IW modes proceed correctly. (2) `final_[tip]` INITIALIZATION SAFE — `expand_and_reinsert` calls `init_wagner_state → load_tip_states` which bulk-memcpys ds.tip_states into both `prelim` AND `final_` for ALL tips (including dropped ones). wagner_incremental_rescore uppass skips tips correctly ("tip: final = prelim, already correct"). No uninitialized-memory issue. (3) EW HEURISTIC FOR IW INSERTION — `fitch_indirect_length_bounded` (EW-only) used for greedy tip reinsertion in IW mode. Same approximation as standard `wagner_tree`. Corrected by TBR polish at step 6. Acceptable. (4) ACCEPT THRESHOLD -1e-10: guards against floating-point noise; consistent with TBR convention. (5) TOPOLOGY RENAMING IN `extract_pruned_topology`: swap of root_node↔m in all edges correct (root_node and m are always internal nodes ≥ m; swap is purely rename, no logical structure change). 
(6) ts_sector.cpp `compute_from_above_for_sector`: CORRECT — total_words sizing consistent (rd.data.total_words = ds.total_words explicitly set; from_above sized to tw=tree.total_words=ds.total_words); path walk from sector_root to root correct; from_above[root]= all-states initialization correct; fitch_join per-step correct. Edge case sector_root==root: empty loop body, from_above = all-states. (7) ts_sector.cpp `compute_node_conflict`: CORRECT — wps/trail_mask correct, trivial split guard (count≤1 || count≥n_tip-1), canonicalization (flip if bit 0 set), FNV-1a lookup consistent with pool hash. `use_weighted` gate (max_w > 1.5) and pick_weights recompute after accepted sector: correct. (8) ts_sector.cpp XSS/CSS adaptive early-exit: CORRECT — `if (result.best_score >= score_before_round) break` fires when score didn't decrease (parsimony minimized → no improvement). (9) ts_ratchet.cpp `check_timeout` forwarding (fafd5d0e): CORRECT — all three tbr_search() calls pass nullptr,nullptr,check_timeout for sector_mask,collect_pool,timeout. (10) T-273 flat_blocks sync: CORRECT — flat_blocks.resize(n_blocks) in build_dataset; loops `for (int b = 0; b < ds.n_blocks; ++b)` safe. **No new bugs found beyond T-275 (already filed and guarded).** + +Previous area: 2 +reviewed_by: F +date: 2026-03-27 +notes: Search topology invariants (ts_tbr.cpp, ts_drift.cpp, ts_search.cpp). Focus on post-C-review commits: T-263 (0edad6e3, snapshot hoisting), T-235 (aafeed21, SPR rescore fix), T-196 (6c531be8, NA+IW screening), 1572fd70 (FlatBlock). **Findings:** (1) T-263 SNAPSHOT HOISTING VERIFIED CORRECT — moving state_snap.save()/save_topology() to once per pass (top of while loop) is safe. 
Invariant: restore_prealloc_undo()+spr_unclip() returns the clipped tree to EXACTLY the committed state; apply_tbr_move is called on the unclipped committed tree; all rejection paths (validate_topology failure, constraint violation, tabu, worse full_rescore) call restore_topology(snap)+state_snap.restore() which restores from the committed state. StateSnapshot::save/restore does full memcpy of ALL node state arrays (prelim, final_, local_cost, down2, subtree_actives, postorder). No scenario where a candidate modifies the committed state before save. (2) T-235 SPR FIX VERIFIED CORRECT — aafeed21 adds full_rescore(tree, ds) after spr_unclip() on the rejection path in spr_search(). This fixes the stale-state arrays issue identified by C (2026-03-25) and A (2026-03-19). spr_search() now correctly rescores the restored topology before the next iteration. (3) LATENT: flat_blocks.active_mask not updated by ratchet — FlatBlock.active_mask is populated at build_dataset() and NOT updated when ts_ratchet.cpp modifies blocks[b].active_mask (perturb_zero_only line 53, perturb_mixed line 110). If flat indirect functions (fitch_indirect_bounded_flat etc.) are ever wired into dispatch during ratchet TBR, they will use stale active_mask, causing incorrect screening. Currently SAFE — confirmed zero call sites for all flat variants (grep shows declarations+definitions only). Pre-wiring fix required: add flat_blocks sync in restore_perturbation_snapshot(). (4) T-196 NA+IW SCREENING IMPROVEMENT — replacing extract_divided_steps (standard Fitch local_cost for NA) with extract_char_steps (three-pass down2 for NA) gives more accurate per-character step counts for IW delta precomputation on NA datasets. Both operate on partially-updated clipped tree arrays (same approximation class); extract_char_steps is more faithful to the actual NA three-pass score. (5) No bugs filed. 1424+ TBR/driven/SPR/constraint tests pass (from GHA record on T-263 PR). 
+ +Previous area: 1 +reviewed_by: A +date: 2026-03-27 +notes: Fitch scoring correctness (ts_fitch.h/.cpp, ts_fitch_na.h, ts_fitch_na_incr.h, ts_simd.h). Focus on commits since previous focus-1 review (2026-03-19): e7b9b4cf (AVX2), 1572fd70 (FlatBlock + flat indirect), XPIWE, T-229 XFORM fix, 7cff7870 (profile delta). **Findings:** (1) AVX2 dispatch correct: `any_hit_reduce_avx2`, `any_hit_reduce3_avx2`, `fitch_combine_avx2` bit-identical to scalar. `cpu_has_avx2()` thread-safe (C++11 static-local-in-inline + COMDAT ODR = single flag). `fitch_combine_avx2` correctly broadcasts any_intersect/needs_union to 4 SIMD lanes; `storeu256` safe for 8-byte-aligned vectors; no aliasing (left/right/out always distinct tree nodes). (2) Flat indirect functions: infrastructure only (not wired into dispatch, confirmed by grep). Logic matches non-flat exactly — same NA suppression, same `below_actives[b]` scalar indexing. (3) XFORM double-counting: NOT a bug. `non_hierarchy_weights()` zeroes hierarchy weights; `build_dataset()` removes weight=0 patterns (lines 104-109). Fitch blocks contain only non-hierarchy chars; Sankoff handles hierarchy. (4) `fitch_incremental_downpass` stopping condition correct: root always processed before break; self-parent guard prevents infinite loops. (5) `fitch_incremental_uppass` dirty propagation correct: root dirty iff prelim changed; clip_ancestor always dirty. `std::vector` alloc per clip (~45 bytes at 180 tips) is negligible overhead. **No bugs found.** + +Previous area: 10 +reviewed_by: A +date: 2026-03-27 +notes: Profile & IW scoring (ts_fitch.cpp IW/profile paths, ts_data.cpp precompute). **Findings:** (1) BUG FIXED — `precompute_profile_delta` used `old_cost = 0.0` for both `s <= 0` (correct: invariant character) and `s > info_max_steps` (WRONG: should use capped max-table value). 
When the divided tree has more steps than the profile table size, the delta was overestimated by the full max-table value, causing overly conservative candidate rejection in profile TBR screening. Conservative bug (no incorrect trees accepted), very low practical impact (only highly homoplastic characters). Fix: added separate branches for `s <= 0`, `0 < s <= info_max_steps`, and `s > info_max_steps` — mirroring the capping logic in `compute_profile()`. Regression test added to test-ts-iw-profile-red10.R. Committed directly to cpp-search (7cff7870). (2) IW `e/(k+e)` delta formula verified correct: `new_cost - old_cost = k / [(k+e+1)(k+e)]`, always positive. `e < 0` guard (below minimum in divided tree) correctly sets delta=0. `e == 0` guard correctly avoids 0/0 NaN. (3) IW `phi`, `eff_k` assignment verified: non-XPIWE uses phi=1.0, eff_k=concavity. XPIWE: eff_k=concavity/f, phi=(1+eff_k)/(1+concavity). (4) PROFILE `concavity=1.0` sentinel correctly activates weighted path via `isfinite()` checks in build_dataset. Profile scoring_mode set before phi/eff_k assignment; phi/eff_k are set to finite values but not used for profile (compute_profile uses info_amounts lookup). (5) `precomputed_steps` offset correctly added in both `compute_profile` and `precompute_profile_delta`. (6) Profile `info_amounts` column-major indexing `[idx + info_max_steps * p]` verified correct. 15 tests pass in test-ts-iw-profile-red10.R. + +Previous area: 9 +reviewed_by: A +date: 2026-03-26 +notes: Wagner & addition trees + constraint impose_constraint (ts_wagner.h/.cpp 595 lines, ts_constraint.h/.cpp 736+144 lines). **Findings:** (1) ts_wagner.h/.cpp: No bugs found. Incremental Wagner scoring (two-pass downpass+uppass) correct. LCA-based constraint mapping (`wagner_map_constraint_nodes`) correct. 3-taxon base case handles all orderings. `biased_wagner_tree()` softmax sampling correct. `random_constrained_tree()` retry logic sound. Recent commits (T-208 biased Wagner, T-213 constraint enforcement, T-214 random constrained tree) all verified. 
(2) ts_constraint.cpp `topology_spr()`: Root-child case (lines 463-492) correctly handles node absorption when `ns >= n_tip`. Bail-out for `ns` as tip (line 467) is correct. (3) LATENT ISSUE (not filed) — `impose_one_pass()` stale `best_node` reference: when a `move_out_root` is a direct child of `best_node`, `topology_spr` relocates `best_node` (it becomes `nx`). Subsequent `move_in` iterations use `collect_edges_in_subtree(tree, best_node, ...)` which now traverses the wrong subtree (best_node was moved to the regraft location, not the original constraint-matching subtree). **Severity: negligible** — mitigated by: (a) outer retry loop (up to `n_splits+1` passes), (b) safety cap bailout (`n_tip/4+2` moves), (c) all callers have retry/validation logic (Wagner retry loop, random_constrained_tree validation, TBR-level constraint enforcement). Adversarial testing: 80/80 pass (10-tip adversarial × 20 seeds, 30-tip nested × 10 seeds, 8-tip root-straddling × 30 seeds). (4) `regraft_violates_constraint()`: DFS timestamp usage correct — timestamps remain valid for ancestor-descendant testing after clip because non-clipped topology is unchanged. MUST_OUTSIDE boundary check (`inside && below != cn`) correct (allows sibling placement). (5) `classify_clip_constraints()`: Rest-of-tree masking with remainder bit clearing correct for both `rest_has_in` and `rest_has_out`. FORBIDDEN classification (clip straddles + rest straddles) correct. (6) All 902 constraint-related tests pass: impose-constraint(88), constraint-small(23), constraint-multi(806), consensus-constrain(8). No new bugs filed. + +Previous area: 4 +reviewed_by: D +date: 2026-03-25 +notes: Parallelism & RNG (ts_parallel.cpp 535 lines, ts_rng.h 56 lines, ts_rng.cpp 52 lines, ts_driven.cpp 887 lines). **Findings:** (1) BUG FIXED — Consensus stability check in parallel main-thread poll loop called update_consensus_stability() every 200ms poll, not per-replicate. 
Unchanged counter incremented on idle polls, causing premature termination when replicates are slow (e.g. 2 threads, 30s/rep, consensus_stable_reps=2 would stop after ~600ms of unchanged polls instead of 2 actual replicates). Fix: track replicates_done at last check, only update when new reps complete. (2) FRAGILITY NOTED — R_CheckUserInterrupt() in try/catch (lines 303-308) relies on longjmp being caught by catch(...), which is ABI-dependent (works on Windows/SJLJ, fragile on Linux/DWARF). If longjmp escapes catch, worker threads are orphaned with dangling pointers. Known pattern in R packages; fixing requires R_UnwindProtect which is R >=3.5.0 only. Low priority — works on primary platform. (3) CONFIRMED CORRECT — DataSet and ConstraintData copy constructors produce deep copies (all std::vector members). Thread-safe for parallel worker copies. (4) CONFIRMED CORRECT — Thread-local RNG setup/teardown in worker_thread() and parallel_resample lambda. Seeds generated from R RNG on main thread before spawning. No R API calls from worker threads in normal operation. (5) CONFIRMED CORRECT — ThreadSafePool mutex covers all member functions; extract_into() called only after join. (6) CONFIRMED CORRECT — stop_flag relaxed ordering appropriate (no ordering deps). Benign TOCTOU in hits_to_best convergence check. (7) MINOR — Rf_error() in ts_wagner.cpp:17 is reachable from worker threads if n_tip<3, which would crash. Practically unreachable (DataSet validated before parallel dispatch). (8) COSMETIC — Dead code at lines 290-299 of ts_parallel.cpp (empty if body). (9) MINOR — Multiple threads can trigger fuse_round simultaneously on same done count (correct but wasteful, serialized by mutex). + +Previous area: 3 +reviewed_by: D +date: 2026-03-25 +Previous area: 9 +reviewed_by: C +date: 2026-03-20 +notes: Wagner & addition trees (ts_wagner.h/.cpp 595 lines, ts_constraint.cpp 350 lines). 
**Findings:** (1) BUG FIXED — `wagner_edge_violates_constraint` (ts_wagner.cpp) and `regraft_violates_constraint` (ts_constraint.cpp) both used `is_ancestor_or_equal(cn, below, ...)` which returns `true` when `below == cn`. For MUST_OUTSIDE tips/clades, this caused the boundary edge (just above the constraint clade) to be incorrectly rejected. Inserting an outside element there makes it a sibling of the constraint clade and does NOT break monophyly. Fix: added `&& below != cn` to the MUST_OUTSIDE rejection branch in both functions. Search quality improvement (wider valid regraft/insertion set); no correctness impact. (2) Added 2 regression tests to `test-ts-wagner.R` (43 total): constrained random Wagner score verification; constrained sequential Wagner verifying `p1 == p2` (R tips 1,2 are sisters under the constraint). (3) Clarified that `ts_random_wagner_tree` without posthoc data cannot guarantee constraint satisfaction for all addition orders: when inside tips span both sides of the rooted tree's root in the 3-taxon start, `cn == root` triggers the existing guard skipping enforcement. The `has_posthoc` retry loop is the correctness guarantee for `MaximizeParsimony` constrained search. (4) Dead `n_added` param and dead `ew_score` computation noted (not filed). 43 Wagner + 18 constraint + 152 driven-search tests pass. + +Previous area: 8 +reviewed_by: B +date: 2026-03-20 +notes: Test suite health + ParsSim log-space convolution. **Findings:** (1) BUG FIXED — `.LogCumSumExp()` in `R/pp_info_extra_step.r` produced `NaN` instead of `-Inf` when both the running accumulator `Lk[k-1]` and the new value `x[k]` are `-Inf`. IEEE 754: `-Inf - (-Inf) = NaN`, propagating through `abs()`, `exp()`, `log1p()`. Reachable if `MaddisonSlatkin()` returns `NEG_INF` for an interior step count (possible by design per `src/MaddisonSlatkin.cpp`). Fix: added `if (is.finite(x[k]) || is.finite(Lk[k]))` guard; keep `Lk[k] = -Inf` when both are `-Inf` (log(0+0) = -Inf, not NaN). 
Added 7 targeted assertions. 131 assertions pass. (2) Active-range bounds verified correct for both Carter binary path (`m = 1..split[2]`, all entries finite) and MaddisonSlatkin path (trailing `-Inf` trimmed by `which(is.finite(logP))`). Interior `-Inf` is the only scenario where the NaN could appear. (3) `.ApproxStepInformation` MC branch: exact anchor at `s_min` and MC-based estimates for `s > s_min` are not jointly normalized; cumulative probability could exceed 1 for high-variance characters. Not a correctness bug in current usage (IC values are clamped to 0), but noted. + +Previous notes (Shiny module wiring) (server.R 200 lines, mod_data.R 589, mod_search.R 962, mod_consensus.R 1333, mod_clustering.R 295, mod_treespace.R 718, mod_downloads.R 167). **Findings:** (1) Forward-ref callbacks (cb_ref): 4 callbacks wired correctly at server.R:162-166 after all modules initialized. Placeholder closures capture cb_ref environment; actual implementations set synchronously before any reactive fires. ✓ (2) Cross-module updateXxxInput: Only one instance — mod_data.R:203 uses parent_session to update "treespace-relators". Fragile (hardcoded namespace string) but correct. All other updateXxxInput calls use module-local session. ✓ (3) Orphaned observers: None found. All observeEvent/observe blocks reference existing inputs within their module namespace. ✓ (4) isolate() in result observers: search result observer (line 811) has exactly one reactive dependency (searchTask$result()); all cleanup in isolate(). Profile prep result observer same pattern. ✓ (5) New progress polling observer correctly gated: checks progressFile(), r$searchNotification, r$searchInProgress before polling. Uses separate reactiveVals from profile prep observer. invalidateLater(500) only scheduled when all gates pass. ✓ (6) ShowConfigs operates on top-level DOM IDs (not namespaced) — correct since UI elements like "whichTree", "consConfig" are defined in ui.R outside modules. 
✓ (7) UpdateActiveTrees reentrancy guard (r$updatingTrees) uses on.exit for cleanup — correct. ✓ (8) Change detection pattern (r$oldkeepNTips, r$oldOutgroup, etc.) prevents reactive cascades from programmatic input updates — correct but complex. ✓ No new bugs or tasks filed. + +Previous: area: 5 +reviewed_by: B +date: 2026-03-19 +notes: Data pipeline & simplification (ts_data.h/.cpp 289 lines, ts_simplify.h/.cpp 370 lines, ts_constraint.h/.cpp 350 lines). **Findings:** (1) FIXED — `build_reduced_dataset()` in ts_sector.cpp did not copy `ds.inapp_state` to the reduced dataset. Default value (-1) meant "no inapplicable state," which is currently harmless because sectors use Fitch scoring (EW/IW/PROFILE) which reads `blk.has_inapplicable` per block. Would be a bug if sectors were extended to support HSJ scoring. Added `rd.data.inapp_state = ds.inapp_state;` after the existing field copies. (2) FIXED — No guard for `n_states > MAX_STATES (32)` in `build_dataset()`. Token bitmasks use `uint32_t`, so `(1u << s)` for s >= 32 is undefined behavior. Added `Rf_error()` check at top of `build_dataset()`. Unlikely in practice (morphological data: 2–10 states; DNA: 4; protein: 20) but defensive. (3) NOT FILED — `build_reduced_dataset()` does not copy HSJ/Sankoff fields (hierarchy_blocks, tip_labels, sankoff_*). Same analysis as #1: sectors don't use those scoring modes. Not worth fixing until sectors support HSJ/XFORM. (4) Simplification correctness verified: uninformative-character detection (classical criterion + 4-caterpillar verification for ambiguous tokens) is correct and conservative. All-inapplicable characters are not simplified (skipped in Phase 1) but score 0, so no correctness impact. (5) Constraint column-major indexing verified: `split_matrix[s + n_splits * t]` correct. Canonicalization (tip 0 outside) handles n_tips as multiple of 64 correctly (remainder=0 skips clearing, all bits valid). DFS timestamp allocation matches tree node count. 
`regraft_violates_constraint` logic verified for MUST_INSIDE/MUST_OUTSIDE. (6) EW offset interaction with IW/Profile verified: uninformative patterns (removed from blocks) contribute 0 to IW score (extra_steps clamped to 0) and correct constant to Profile score (precomputed_steps added back before lookup). (7) Added 10 new test assertions: RSS + XSS + sector_diag with inapplicable characters. All 1679 ts-* pass. + +Previous: area: 4 +reviewed_by: E +date: 2026-03-19 +notes: Parallelism & RNG (ts_parallel.cpp 450 lines, ts_rng.h/.cpp 110 lines, ts_driven.cpp 484 lines). **Findings:** (1) Thread-local RNG: ✓ `ts::thread_rng` and `ts::thread_stop_flag` set before replicate loop at line 96–97 of ts_parallel.cpp. Cleaned up at line 153–154. (2) No R API from workers: ✓ All Rprintf calls in ts_driven.cpp gated by `verbosity >= 2`; parallel calls pass `verbosity = 0` (line 126). All interrupt checks via `ts::check_interrupt()` which dispatches to `thread_stop_flag`. Wagner uses `thread_safe_unif()` and `rng_state_begin/end()`. HSJ and Sankoff modules have zero R API calls. (3) Pool mutex: ✓ All ThreadSafePool public methods hold lock_guard. `fuse_round()` holds lock during tree_fuse + score_tree + pool.add. (4) Atomic stop flag: ✓ Uses `memory_order_relaxed` throughout, fine for simple boolean flag. (5) Seeds: ✓ Pre-generated from R RNG on main thread (lines 190–194, 385–389), `GetRNGstate/PutRNGstate` brackets. (6) DataSet copy correctness: ✓ All HSJ/Sankoff fields (hierarchy_blocks, tip_labels, sankoff_cost_matrices, etc.) are `std::vector`s — default copy constructor deep-copies. (7) HSJ/Sankoff thread safety: ✓ Both `hsj_score()` and `sankoff_score()` are stateless (only use parameters and local variables). No global mutable state. (8) **PERF NOTE** (not a bug): XFORM scoring rebuilds `SankoffData` in every `score_tree()` call (vector allocation in hot path). Could pre-build and cache in DataSet. 
Not filed as a bug since it's optimization-only and XFORM is new/experimental. (9) init.c: 45 entries (43 Rcpp + 2 manual), all arg counts match. (10) Score verification: serial=80→TreeLength=80 ✓, parallel=79→TreeLength=79 ✓ (Vinther2008). No new bugs found. + +Previous: area: 2 (complementary) +reviewed_by: A (complementary to B's focus 2 review) +date: 2026-03-19 +notes: Search topology invariants — deeper analysis of state restoration. **Findings:** (1) FOUND — SPR stale scoring arrays after rejected regraft. In `spr_search` (ts_search.cpp), after `spr_regraft + full_rescore + rejection + spr_unregraft + spr_unclip`, the `restore_saved_states()` only restores nodes on the clip-to-root path (saved during incremental pass). Nodes on the regraft-to-root path that aren't on the clip-to-root path retain prelim/final_/local_cost values from the regrafted topology's `full_rescore`. On subsequent clip iterations, `fitch_incremental_downpass` may read stale prelims for nodes below the new clip ancestor, producing incorrect `divided_length` and indirect evaluations. **Impact: conservative only** — stale arrays affect candidate screening but never acceptance decisions (all moves gated by `full_rescore` verification). Final score always correct (line 374 does `full_rescore`). Confirmed by targeted test: `test-ts-spr-state-restore.R` (33 assertions, EW/IW/NA datasets × multiple starting trees). **Not filed as a bug** because SPR is a secondary search method (TBR is primary in driven pipeline) and the issue is self-correcting over multiple passes. (2) TBR (ts_tbr.cpp): All rejection paths fully correct. Phase 2 restore via `restore_prealloc_undo() + spr_unclip() + saved_postorder` restores arrays exactly. Candidate verification via `save_topology + state_snap.save → apply_tbr_move → full_rescore → reject → restore_topology + state_snap.restore` restores topology + all arrays including postorder. Tabu rejection path identical. 
`states_valid` flag is dead code (always true) — both branches call `full_rescore` so no functional impact. (3) NNI (ts_search.cpp): Standard path: `nni_undo + incremental_downpass` correctly restores prelim/local_cost (second downpass recomputes from restored topology's children). final_ stale after rejection but unused until next acceptance (uppass only on accept). NA path: `score_tree` always does full recomputation, so stale arrays are irrelevant. ✓ (4) Drift (ts_drift.cpp): B's `saved_postorder` fix (line 676-678) verified correct. RFD computation's double-rescore-and-reapply pattern is correct if expensive. `drift_search` outer loop always starts phases with `full_rescore`. Subtree sizes computed once in drift_phase (not updated after moves) — minor search-quality concern, not correctness. (5) Hash collision risk: 64-bit hash, birthday bound ~k²/2^64. For k=10000 rerootings, P(collision) ≈ 5×10⁻¹². Negligible. Test added: test-ts-spr-state-restore.R (Tier 2). + +Previous: area: 3 +reviewed_by: B +date: 2026-03-19 +notes: Ratchet & perturbation (ts_ratchet.cpp 237 lines, ts_sector.cpp 792 lines, ts_fuse.cpp 522 lines). **Findings:** (1) BUG FIXED — `perturb_upweight()` and `perturb_mixed()` in `ts_ratchet.cpp` used `ds.pattern_freq[pat] *= 2` per selected character. When multiple characters share the same pattern index, this gives exponential blowup (`original * 2^N` instead of `original + N`). Only affects IW/profile perturbed landscape (EW uses upweight_mask independently, IW uses pattern_freq exclusively). Could cause integer overflow with highly compressed datasets and high perturbation probability (adaptive tuning at 0.5, 50+ chars sharing a pattern). Fix: changed `*= 2` to `+= 1` (additive, consistent with EW upweight_mask semantics). (2) Ratchet save/restore: PerturbSnapshot correctly saves/restores active_masks, upweight_masks, and pattern_freq vector. Unperturbed search always uses original weights. 
Best-tree topology reset uses copy_topology + build_postorder + reset_states. ✓ (3) upweight_mask has NO effect on IW/profile scoring: fitch_downpass uses upweight_mask for EW step count, but extract_char_steps, compute_iw, compute_profile, precompute_iw_delta, and all indirect_iw_length variants only use pattern_freq. Setting upweight_mask in IW perturbation is harmless but redundant. (4) Sectorial reinsertion revert: RSS/XSS save CladeSnapshot before reinsertion, full-tree rescore after, revert if score worsens. Post-hoc constraint check with revert. All paths maintain valid final_ states for HTU construction. ✓ (5) XSS sectors from same partition are independent (non-overlapping tip sets). Accepting one sector can't corrupt another. (6) CSS uses sector_mask in TBR (no HTU approximation); no revert needed (TBR handles internally). ✓ (7) Fuse tied-score handling: accepts equal-score exchanges only with accept_equal AND actual topology change (prevents infinite loops). Undo on rejection correctly restores clade left/right/parent. ✓ (8) Fuse ancestor stale marking (lines 467-480) is dead code — loop breaks immediately after acceptance. Harmless defensive code. (9) Minor: RSS doesn't update eligible list after equal-score acceptance (stale subtree sizes); efficiency concern only. 1404/1404 ts-* pass. + +Previous: area: 2 +reviewed_by: B +date: 2026-03-19 +previous_notes: Search topology invariants (ts_tbr.cpp, ts_drift.cpp, ts_search.cpp). **Findings:** (1) BUG FIXED — `drift_phase()` in `ts_drift.cpp` sets `saved_postorder` once (line 404) and never updates it after accepted moves. When the last clip candidate is rejected, the stale postorder is restored (line 566). The subsequent `tbr_search` (called from `drift_search`) starts with `full_rescore()` which uses `fitch_downpass()` iterating over the stale postorder — processing nodes before their children, producing wrong scores. 
Impact: first few TBR iterations after drift perturbation have incorrect accept/reject decisions. Self-healing once a move is accepted (build_postorder_prealloc rebuilds). Fix: added `tree.build_postorder()` before return when n_accepted > 0. (2) TBR search (ts_tbr.cpp): Topology save/restore is correct. TopoSnapshot + StateSnapshot used for rejection. StateSnapshot includes postorder (line 263). PreallocUndo used for clip/unclip cycle. saved_postorder kept current by StateSnapshot.save() at line 830 (before each candidate apply). `states_valid` flag is dead code (always true) — minor smell. (3) SPR search (ts_search.cpp): Correct — `build_postorder()` called after every clip/unclip at line 360, regardless of accept/reject. (4) Hash dedup in TBR: 64-bit FNV-1a for virtual_prelim; collision risk negligible. (5) Verified: `apply_tbr_move` / `drift_apply_tbr_move` reroot path is correct for path reversal. All 1397 ts-* pass. + +Previous: area: 1 +reviewed_by: C+A +date: 2026-03-19 +previous_notes: Fitch scoring correctness. **C's review:** Thorough review of ts_fitch.h/.cpp (711 lines), ts_fitch_na.h (290 lines), ts_fitch_na_incr.h (678 lines). Findings: (1) BUG FIXED — `fitch_indirect_length_bounded` and `fitch_indirect_length_cached` did not account for `upweight_mask`, underscoring candidates during ratchet perturbation TBR. The NA-aware variants (`fitch_na_indirect_length_bounded/cached`) were already correct. Also fixed `nx_cost` computation in ts_tbr.cpp, ts_search.cpp, and ts_drift.cpp, plus drift RFD computation — same missing upweight_mask pattern. Impact: TBR screening during ratchet perturbation was slightly inaccurate. Final result correctness unaffected (full_rescore() is authoritative). All 1397 ts-* pass. (2) fitch_downpass_node standalone function correctly omits upweight_mask (callers handle it). (3) Incremental downpass stop condition correct (root check + self-parent guard). 
(4) Incremental uppass dirty propagation correct — clip_ancestor marked dirty, tips processed separately in NA path. (5) NA three-pass algorithm (Brazeau et al.) correctly implemented: Pass 1 NA-aware prelim, Pass 2 uppass with applicability propagation (tips processed separately), Pass 3 corrected scoring with subtree_actives. (6) extract_char_steps correctly uses raw local_cost bits (not upweight-adjusted) for per-pattern counts. (7) fitch_indirect_length uses union-of-finals (correct for non-additive; Goloboff 1996) rather than intersection-then-union. **A's complementary review:** (A1) PROVED mathematically: standard Fitch incremental uppass dirty-flag logic is correct even when downpass stops before root. When downpass stops at node N (prelim unchanged), all nodes below N on the path have correct finals without explicit recomputation. Proof: if fitch(M_old, S) = fitch(M_new, S) and both are intersection-type, then N_final ⊆ M_old ∩ S = M_new ∩ S, so uppass(N_final, M_old) = uppass(N_final, M_new) = N_final. For union cases, fitch(M_old, S) = M_old ∪ S and fitch(M_new, S) = M_new ∪ S; because M_old ∩ S = M_new ∩ S = ∅, equality of the two unions requires M_old = M_new. (A2) FOUND: NA-aware uppass has a theoretical children_app staleness. The NA uppass formula uses `children_app = OR(children's prelims)` which can change even when the parent's prelim is stable — because the NA downpass prelim formula aggregates differently than raw OR. Specifically, case_children depends on children_app, which can differ if a child's prelim changed. This means the stopping node's final_ for NA blocks could be stale. Affects `fitch_na_pass3_score` divided_length. Conservative: full_rescore always catches it. Same class of design choice as extract_divided_steps heuristic. (A3) Confirmed C's upweight_mask fixes are correct. All 1397 ts-* pass. + +Previous: area: 2 +reviewed_by: C +date: 2026-03-25 +notes: Search topology invariants — thorough review of ts_tbr.cpp (1020 lines), ts_drift.cpp (812 lines), ts_search.cpp (426 lines), ts_tree.h/.cpp (602 lines). 
**Findings:** (1) BUG FOUND (T-235) — `spr_search()` stale state arrays after rejected regraft. When a candidate passes indirect screening but `full_rescore` shows it's not actually better, the unclip path only partially restores states: `full_rescore()` at line 374 overwrites ALL node states for the regrafted topology, but `spr_unclip()` at line 396 only restores the clip-to-root incremental saves. Subsequent clips use stale divided_length baselines. Impact: degraded screening for NA/IW datasets through SPR path. Final score always correct (full_rescore at exit). Fix: add `full_rescore(tree, ds)` after `spr_unclip()` on the rejection path. Low practical impact — all presets disable `sprFirst`, and SPR is only called in driven pipeline for constrained+sprFirst. (2) TBR uses correct pattern: unclips BEFORE verification, then saves/restores full StateSnapshot around apply_tbr_move. No stale-state issue. (3) drift_phase: `saved_postorder` set once and never updated after accepted moves — KNOWN, documented in comments (lines 725-731), harmless (postorder always rebuilt before use). (4) drift_phase: subtree_sizes computed once before loop, not updated after accepted moves — heuristic filter only, doesn't affect correctness. (5) TBR `states_valid` variable: initialized true, only ever set to true. The `!states_valid` branch at line 1006 is dead code. (6) TBR FNV-1a hash dedup for virtual_prelim: collision probability negligible with ~50 words. (7) NNI search: non-NA path correctly restores prelim/local_cost via re-downpass after rejected NNI; final_ staleness harmless (only downpass used for evaluation). NA path: `score_tree()` does full downpass+uppass, so stale states don't propagate. (8) Verified constraint enforcement paths in TBR and drift: post-hoc `map_constraint_nodes` check correctly catches TBR rerooting violations; restoration path correctly calls `update_constraint` after revert. 
+EOF 2>&1 diff --git a/.positai/expertise/shiny-app.md b/.positai/expertise/shiny-app.md new file mode 100644 index 000000000..b91be89d4 --- /dev/null +++ b/.positai/expertise/shiny-app.md @@ -0,0 +1,424 @@ +# Shiny App Expertise — TreeSearch + +## Purpose + +This document provides best practices and troubleshooting guidance for developing and maintaining the TreeSearch Shiny interactive application (`inst/Parsimony/app.R`). The app provides a user-friendly interface for phylogenetic tree search with real-time feedback, logging, and publication-ready visualization. + +## App Architecture + +### High-level Structure + +``` +app.R (3683 lines) +├── UI (lines 264-471) +│ ├── Left sidebar (3-column) +│ │ ├── Data loading (file, package datasets) +│ │ ├── Search controls (configure, start, save log) +│ │ ├── Tree loading and sampling +│ │ └── Display configuration (format, outgroup, etc.) +│ └── Main panel (9-column) +│ ├── Plot area with dynamic sizing +│ ├── Plot controls (size, export, concordance, clustering) +│ └── Tree/space visualization panels (conditional display) +│ +├── Server (lines 506-3683) +│ ├── Logging infrastructure (Write, LogCode, LogComment, etc.) 
+│ ├── Data loading (UpdateData, Excel/TNT/PhyDat parsers) +│ ├── Tree management (UpdateAllTrees, UpdateActiveTrees, filtering) +│ ├── Search execution (StartSearch, MaximizeParsimony dispatch) +│ ├── Display rendering (consensus, clustering, tree space visualization) +│ ├── User interactions (observeEvent handlers, reactive computations) +│ └── Export functionality (Newick, Nexus, PDF, PNG, R script logging) +│ +└── Supporting Elements + ├── Palettes (56+ color schemes for taxa) + ├── References (formatted bibliography) + ├── Helper functions (Enquote, EnC, Icon, ErrorPlot) + └── Notification system (Notification function wrapping showNotification) +``` + +### Key Reactive Values (lines 508-517) + +- `r$dataFiles`, `r$excelFiles`, `r$treeFiles` — file counters for temp caching +- `r$dataset` — loaded phyDat object +- `r$allTrees`, `r$trees` — all vs. displayed tree subset +- `r$outgroup` — selected outgroup taxa for rooting +- `r$searchWithout` — taxa to exclude from search +- `r$sortTrees` — whether to reorder edges by clade size (for display) +- `r$plotLog`, `r$cmdLogFile` — logging outputs for export + +### Data Flow + +1. **Data load** → `UpdateData()` (line 797) + - Detects file type (Excel, TNT, PhyDat) + - Caches to temp directory + - Logs code for reproducibility + - Attempts to load trees from same file + +2. **Search** → `StartSearch()` (line 1566) + - Builds or uses existing starting tree + - Dispatches to `MaximizeParsimony()` (C++ engine) + - Logs search code with all parameters + - Updates tree display + +3. 
**Display** → Reactive plot rendering (lines 1731+) + - User selects plot format (individual trees, consensus, clustering, tree space) + - Conditional UI elements show/hide based on selection + - Plots render via R base graphics (not ggplot2) + +## Critical Functions by Purpose + +### Data Loading + +| Function | Lines | Role | +|----------|-------|------| +| `UpdateData()` | 797 | Main dispatcher; handles file/package sources | +| Excel parsing | 830-903 | readxl-based with skip/column controls | +| TNT/PhyDat parsing | 908-949 | Tries multiple formats; caches successfully read files | +| `CacheInput()` | 739 | Copies file to temp for reproducibility | +| Character extraction | 961 | Reads character names/notes for display | + +### Tree Management + +| Function | Lines | Role | +|----------|-------|------| +| `UpdateAllTrees()` | 1145 | Replace all trees; renumber tips consistently | +| `UpdateActiveTrees()` | 1086 | Thin to user-selected range and count | +| `UpdateTreeRange()` | 1067 | Sync range slider with data structures | +| `UpdateNTree()` | 1026 | Update tree count; validate against range | +| `FetchNTree()`, `FetchTreeRange()` | 1012, 1053 | Debounced reactive accessors | + +### Search & Scoring + +| Function | Lines | Role | +|----------|-------|------| +| `StartSearch()` | 1566 | Build starting tree, dispatch MaximizeParsimony, log code | +| `scores()` | 1344 | Cached TreeLength() call on active trees | +| `DisplayTreeScores()` | 1369 | Update results text; show score range and weighting | +| `concavity()` | 1550 | Parse IW exponent or profile mode from input | +| `weighting()` | 1332 | Map UI "on"/"off"/"prof" to concavity values | + +### Rogue Taxon Detection + +| Function | Lines | Role | +|----------|-------|------| +| `Rogues()` | 1775 | Cached Rogue::QuickRogue() call | +| `nNonRogues()` | 1834 | Rogue count at selected p-value | +| `KeptTips()`, `DroppedTips()` | 1949, 1973 | Filter tree tips by rogue analysis | +| `UpdateKeepNTipsRange()` | 1402 
| Validate user input; sync with rogue count | + +### Visualization + +| Function | Lines | Role | +|----------|-------|------| +| `PlottedTree()` | 1731 | Consensus or individual tree, rooted/sorted | +| `concordance()` | 1862 | Calculate split support (multiple measures) | +| `LabelConcordance()` | 1876 | Annotate tree with support values | +| `ConsensusPlot()` | 1982 | Render consensus with rogue drop sequence | +| `TipCols()` | 1840 | Color tips by stability (Rogue::ColByStability) | + +### Logging & Export + +| Function | Lines | Role | +|----------|-------|------| +| `BeginLog()` | 590 | Initialize search log with system info | +| `LogCode()`, `LogComment()` | 692, 704 | Append to R script log | +| `Write()` | 524 | Append to temp log file with indentation | +| `StashTrees()` | 745 | Save trees to Nexus in temp for export | + +## Best Practices + +### 1. Reactive Programming Patterns + +**Use `reactive()` for derived values, `bindCache()` for expensive calls:** +```r +# Simple derived value +weighting <- reactive(switch(input$implied.weights, "on" = Inf, ...)) + +# Cached function (re-run only if dependencies change) +scores <- bindCache(reactive({ TreeLength(r$trees, ...) }), + r$treeHash, r$dataHash, concavity()) +``` + +**Avoid:** +- Direct `input$*` reads in observers (use reactive() wrapper) +- Computing the same expensive value multiple times +- Calling `reactive()` inside `observe()`/`observeEvent()` + +### 2. File Handling + +**Always cache input files to temp directory for reproducibility:** +```r +CacheInput("data", fileName) # Copies to tempdir() + DataFileName(counter) +LogCode(paste0("dataFile <- \"", LastFile("data"), "\"")) +``` + +**Supported formats (auto-detect by extension):** +- `.xlsx` / `.xls` — Excel (readxl + configurable skip/columns) +- `.nex` — Nexus (read.nexus) +- `.tre` / `.txt` — TNT or Newick (ReadTntTree or read.tree/read.nexus) +- Any phyDat-compatible text format (ReadAsPhyDat) + +### 3. 
Logging Code Reproducibility + +**Every significant user action must log equivalent R code:** +```r +LogCode(c( + "newTrees <- MaximizeParsimony(", + " dataset,", + " concavity = 10,", + " maxReplicates = 100", + ")" +)) +``` + +**Use `EnC()` to quote parameters safely:** +```r +# EnC(c("a", "b")) → "c(\"a\", \"b\")" +# EnC("profile") → "\"profile\"" +# EnC(10) → "10" +``` + +**Indentation via `LogIndent()` for nested scopes:** +```r +LogIndent(2) # Indent +2 spaces +LogCode("for (tree in trees) {") +LogIndent(2) +LogCode(" tree <- Consensus(tree, p = 0.5)") +LogIndent(-2) +LogCode("}") +LogIndent(-2) +``` + +### 4. Observing User Input + +**Use debounce for high-frequency inputs (sliders, text boxes):** +```r +PlottedChar <- debounce(reactive({ as.integer(input$plottedChar) }), aJiffy) +``` + +**Use `ignoreInit = TRUE` to skip initialization:** +```r +observeEvent(input$searchConfig, { ... }, ignoreInit = TRUE) +``` + +**Cache tree hashes to detect changes (avoid spurious recalculations):** +```r +observeEvent(r$dataset, { + r$dataHash <- rlang::hash(r$dataset) +}) +r$trees <- thinnedTrees +r$treeHash <- rlang::hash(r$trees) +``` + +### 5. Conditional UI & Show/Hide Elements + +**Use shinyjs id-based show/hide (not class-based):** +```r +# Define in UI with hidden(...) wrapper +hidden(tags$div(id = "displayConfig", ...)) + +# Toggle in server +show("displayConfig", anim = TRUE) # Animated (shinyjs default animType is "slide"; use animType = "fade" to fade in) +hide("displayConfig") # Hides immediately (no animation unless anim = TRUE) +showElement("displayConfig") # shinyjs alias of show(); no animation by default +hideElement("displayConfig") +``` + +**Manage multiple related configs via `ShowConfigs()`:** +```r +observeEvent(input$plotFormat, { + ShowConfigs(switch(input$plotFormat, + "ind" = c("whichTree", "charChooser", "treePlotConfig"), + "cons" = c("consConfig", "branchLegend", "savePlottedTrees"), + "clus" = c("clusConfig", "clusLegend", "savePlottedTrees"), + "" # Default: hide all + )) +}) +``` + +### 6.
Modal Dialogs for Configuration + +**Example: Search configuration modal (line 1220):** +```r +observeEvent(input$searchConfig, { + # Pre-populate with current values + updateSelectInput(session, "concavity", selected = input$concavity) + + showModal(modalDialog( + fluidPage(column(6, ...), column(6, ...)), + title = "Tree search settings", + footer = tagList( + modalButton("Close", icon = Icon("rectangle-xmark")), + actionButton("modalGo", "Start search", icon = Icon("magnifying-glass")) + ), + easyClose = TRUE + )) +}) + +observeEvent(input$modalGo, { + removeModal() + StartSearch() +}) +``` + +## Common Issues & Troubleshooting + +### Issue 1: File Upload Not Working + +**Symptom:** User selects file, nothing happens. + +**Checks:** +- File size < `shiny.maxRequestSize` (default 5MB; app sets 1GB at line 4) +- File extension recognized (Excel, TNT, Nexus, text) +- `readxl` installed for Excel files (auto-install at line 831) +- Check browser console for error messages +- If TNT format: tip labels must be inferrable (will try 4 caterpillar orderings) + +### Issue 2: Search Hangs or No Results + +**Symptom:** Click "Search", progress bar shows, but never completes. + +**Checks:** +- Dataset is valid phyDat (not NULL, has tips) +- Tree space not empty or trivial (≥4 tips recommended) +- Replicates/timeout reasonable (maxReplicates ≥ 1, timeout > search time) +- Check `maxSeconds` timeout — if 0, no timeout; if very small, search aborts early +- Parallel mode (nThreads > 1) is non-deterministic; may find different trees + +**Debugging:** +```r +# In console: +ds <- ReadAsPhyDat("data.nex") +attr(ds, "nr") # Check character count +length(ds) # Check taxon count +tree <- AdditionTree(ds) # Should complete quickly +``` + +### Issue 3: Trees Don't Display / Blank Plot + +**Symptom:** Plot area is empty; no error message. + +**Checks:** +- Trees loaded? (r$trees length > 0) +- Dataset loaded? (needed for consensus/character display) +- Display format selected? 
(default "cons" should show something) +- Outgroup valid? (must be in tree tips) +- Rogue-dropping valid? (can't drop all tips) + +**Debugging:** +```r +# In console: +length(app_env$r$trees) # Should be > 0 +app_env$AnyTrees() # Should be TRUE +app_env$Consensus(app_env$r$trees, p=1) # Should render +``` + +### Issue 4: Logging Code Mismatch + +**Symptom:** Exported R script doesn't reproduce results. + +**Checks:** +- File paths in log correct? (should use temp files like "dataFile-00.txt") +- Parameters logged correctly? (check `Enquote()` results) +- Library calls present? (BeginLog should include all imports) +- Character encoding OK? (use system-appropriate paths) + +**Prevention:** +- Always use `LogCode()` immediately after performing an action +- Test exported script manually in a fresh R session +- Check `tempdir()` for actual cached files + +### Issue 5: Rogue Analysis Crashes or Misses Taxa + +**Symptom:** `Rogues()` returns NULL, or taxa don't appear in drop sequence. + +**Checks:** +- Dataset properly loaded (not NULL) +- Trees properly loaded (at least 1 tree, tip labels match) +- `p` parameter reasonable (0.5 to 1.0; 0.5 = majority rule; default 1.0 = strict consensus) +- Run `Rogue::QuickRogue()` manually to test: + ```r + rogues <- Rogue::QuickRogue(r$trees, neverDrop = input$neverDrop, + fullSeq = TRUE, p = consP()) + ``` + +### Issue 6: Memory Leak or Slowdown Over Time + +**Symptom:** App slows down after many searches; process memory grows. + +**Checks:** +- File caching in `tempdir()` consuming space? (e.g., 1000 searches → 1000s of cached files) +- Large tree objects retained? (clear old results before new search) +- Image caches building up?
(plots rendered reactively, may leak if observer not cleaned up) + +**Prevention:** +- Periodically clear `tempdir()` (not auto-cleared by default) +- Use `on.exit()` to clean up temporary objects inside functions, and offer an explicit cache-clearing handler for files accumulated in `tempdir()`: + ```r + observeEvent(input$clearCache, { + do.call(file.remove, list(dir(tempdir(), full.names=TRUE))) + Notification("Cache cleared", type="message") + }) + ``` + +## Integration with C++ Engine + +### Key Changes from Legacy Morphy + +**Old (MorphyLib):** +```r +# Had to delegate constraints/profile to Morphy() +MaximizeParsimony(dataset, constraint = cons, concavity = "profile") +→ fell back to R-loop Morphy() search +``` + +**New (C++ engine):** +```r +# C++ engine handles everything natively +MaximizeParsimony(dataset, constraint = cons, concavity = "profile", + strategy = "auto", nThreads = 2, verbosity = 1) +``` + +### Strategy Presets (line 1231) + +- **"auto"** — Auto-selects based on dataset size (sprint ≤30, default 31-60, thorough 61+) +- **"sprint"** — 3 ratchet cycles, no drift; minimal sectorial +- **"default"** — 5 ratchet, 2 drift; XSS+RSS+CSS +- **"thorough"** — 20 ratchet, 12 drift; intensive sectorial; adaptive ratchet + +### Weighting Mode (line 1224) + +- **"on"** (Implied) — IW with concavity exponent (k = 10^exponent) +- **"off"** (Equal) — EW (all characters weight 1) +- **"prof"** (Profile) — Profile parsimony (info-theoretic weighting) + +## Testing Checklist + +Before deploying app updates: + +- [ ] Data loads: Excel (with skip/columns), TNT, Nexus, generic text +- [ ] Search runs: EW, IW, profile; small (4 tips), medium (25), large (75+) +- [ ] Logging: exported R script runs in fresh session, reproduces trees +- [ ] Display: individual, consensus, clustering, tree space all render +- [ ] Rogue analysis: correctly identifies and drops unstable taxa +- [ ] Outgroup: rooting works; must be in tree and dataset +- [ ] Export: PDF, PNG, Newick, Nexus files valid +- [ ] Performance: 50+ searches don't slow app significantly +- [ ] Parallel:
nThreads=2 works; results reasonable (non-deterministic) +- [ ] Edge cases: 3-tip tree, single-character dataset, all inapplicable, empty pool + +## Performance Tips + +1. **Limit active tree display** — reduce `whichTree` max range if >100 trees +2. **Cache tree hashes** — avoid re-scoring unchanged trees +3. **Use bounded indirect** — ensure TBR/drift/SPR use `_bounded` variants +4. **Debounce slider inputs** — high-frequency slider updates (default aJiffy ≈ 42ms) +5. **Profile big plots** — use `system.time({ ... })` for consensus/space rendering + +## References + +- **app.R**: Main application file (3683 lines) +- **Related packages**: shiny, shinyjs, bslib, TreeTools, TreeSearch, Rogue, TreeDist +- **C++ search**: MaximizeParsimony() documented in `R/MaximizeParsimony.R` +- **Logging infrastructure**: BeginLog, LogCode, Write functions (lines 590-715) diff --git a/.positai/expertise/tnt.md b/.positai/expertise/tnt.md new file mode 100644 index 000000000..d8c13015a --- /dev/null +++ b/.positai/expertise/tnt.md @@ -0,0 +1,87 @@ +# TNT (Tree analysis using New Technology) + +## Installation + +TNT is installed at `C:\Programs\Phylogeny\tnt\`. + +### Executables + +| Path | Version | Notes | +|------|---------|-------| +| `tnt/tnt.exe` | older | **Do not use.** | +| `tnt/TNT-bin/tnt.exe` | 1.6 | **Use this one.** Console/script mode. | +| `tnt/TNT-bin/wTNT.exe` | 1.6 | Windows GUI version. | + +Always use `C:\Programs\Phylogeny\tnt\TNT-bin\tnt.exe` (version 1.6). + +### Invocation + +**Never launch TNT without passing a script file.** TNT defaults to +interactive mode and will block waiting for keyboard input, hanging any +automated pipeline. + +**Correct pattern** — pass a `.run` script as a positional argument with +trailing semicolon: + +```bash +"C:/Programs/Phylogeny/tnt/TNT-bin/tnt.exe" "myscript.run;" +``` + +This launches TNT in PISH (batch) mode. It reads and executes the script, +then exits when it hits `quit;`. 
+ +**Critical: script files must use `.run` extension.** TNT interprets `.tnt` +files as data files. If you pass a `.tnt` script, TNT will try to parse it +as data and fail with "Can't open .tnt". + +**Critical: script filenames must be purely alphabetic (no digits or +underscores).** TNT parses the filename as a command line — it splits on +digits and underscores, treating the first alphabetic token as a command. +`bench1.run` → command `bench`; `Vinther2008_EW.run` → command `vinther`. +Safe names: `tntbench.run`, `mytest.run`, `abc.run`. + +**Piping via stdin does NOT work reliably** — `echo "..." | tnt.exe` launches +interactive mode (shows ASCII banner) and may hang. + +**Encoding**: TNT stdout contains non-UTF8 progress bar characters. Use +`iconv(output, from = "", to = "UTF-8", sub = "")` to sanitize before +regex matching in R. + +### TNT script basics + +- Commands are terminated by `;` +- `mxram N;` — set memory (MB); must be first command +- `proc <filename>;` — read data file (TNT `.tnt` or Nexus format) +- `xmult;` — heuristic search (new technology search) +- `xmult=hits N replic M;` — search with convergence/replicate limits +- `piwe = K;` — implied weights with concavity constant K +- `xpiwe = K;` — extended implied weights +- `rseed N;` — set random seed +- `timeout HH:MM:SS;` — set search time limit +- `best;` — report best score and tree count +- `length;` — print tree lengths +- `quit;` — exit TNT (essential for non-interactive use) + +### Data format + +TNT can read NEXUS (`.nex`) files and its own format (`.tnt`). +For NEXUS input, use `proc <filename>;` with the `.nex` file as the filename. + +Export from R: `TreeTools::WriteTntCharacters(phyDat_obj, filepath)`.
+ +### Output parsing + +TNT stdout contains parseable lines: +- `"Best score: 78."` or `"Best score: 3.80000."` (IW) — best score +- `"N trees retained"` — number of trees found +- `"Best score hit N times."` — convergence hits +- `"Total rearrangements examined: N."` — total rearrangements + +### Score comparability with TreeSearch + +TNT standard Fitch treats inapplicable tokens as a regular character state +(column-based). TreeSearch uses Brazeau et al. (2019) three-pass algorithm. +For datasets with inapplicable characters, TNT EW scores will generally be +≤ TreeSearch EW scores. For IW, both use Goloboff's `e/(k+e)` formula. + +Example: Vinther2008 — TNT EW = 78, TreeSearch EW = 79. diff --git a/.positai/plans/2026-03-22-1348-full-polytomy-search-for-treesearch-c-engine.md b/.positai/plans/2026-03-22-1348-full-polytomy-search-for-treesearch-c-engine.md new file mode 100644 index 000000000..cba1a8009 --- /dev/null +++ b/.positai/plans/2026-03-22-1348-full-polytomy-search-for-treesearch-c-engine.md @@ -0,0 +1,450 @@ +# Full Polytomy Search for TreeSearch C++ Engine + +**Status:** IN PROGRESS +**Target branch:** `feature/polytomy-search` (from `cpp-search`) +**Target worktree:** `../TS-Polytomy` + +## Motivation + +The TNT benchmark (2026-03-20, `TS-TNT-bench` worktree) shows TreeSearch +falls 1–14 steps behind TNT on datasets with ≥50 taxa. The TNT +outperformance analysis identifies **tree collapsing during search** as the +single biggest remaining algorithmic gap: + +> "Searches that collapse branches with minimum possible length produce more +> effective searches than criteria which collapse fewer branches, both in +> terms of time needed to complete searches, and ability to find shortest +> trees." — Goloboff (2023), Cladistics 39: 229–238 + +The existing `ts_collapsed.h/.cpp` (clip-skipping) was a partial step +toward this, but benchmarks showed 0% skip rate on standard morphological +data because near-optimal **binary** trees have few zero-length edges. 
The +key insight is that collapsing those edges into polytomies *changes the +search topology space*, making TBR/SPR more efficient by eliminating +distinctions that carry no phylogenetic signal. + +### Key literature + +| Reference | Key contribution | +|-----------|-----------------| +| Goloboff (1996), "Methods for faster parsimony analysis", Cladistics 12: 199–220 | §"Collapsing The Trees": partial reoptimization, shortest-path shortcut, asymmetric reachability | +| Goloboff & Farris (2001), "Methods for quick consensus estimation", Cladistics 17: S26–S34 | TBR-collapsing rule: collapse all nodes between source/dest for equal-length rearrangements | +| Goloboff (2023), "Searches, implied weights, and tree collapsing", Cladistics 39: 229–238 | Empirical comparison of collapsing criteria during search; "minimum possible length 0" recommended | +| Day et al. (1985) / TreeDist | O(n·k) strict consensus via compatible-splits method; available in TreeDist | + +### Detailed literature notes (from PDF review 2026-03-22) + +**Goloboff 1996 — §"Collapsing The Trees" (pp. 213–218)** + +1. *Shortest-path test (approximate)*: If no node in the path between the + clipped subtree's original position and the destination is "supported" + (has character-state change), the rearranged tree collapses to the same + polytomy as the original. The tree can be discarded without full + reoptimization. This is the core shortcut that our collapsed-region + skipping approximates. + +2. *Asymmetric reachability*: The shortest-path shortcut creates directed + connectivity — swapping on tree A may find B, but swapping on B may not + find A. Goloboff gives an explicit example (5 taxa, `x000 a100 b011 + c111 d111`) where the dichotomous tree A can reach the trichotomous + tree B, but no resolution of B can reach A because the movement would + cross only unsupported nodes. 
He argues this is acceptable: "heuristic + searches cannot guarantee finding all of the optimal trees, or even any + of them—with or without shortcuts." + +3. *Efficient collapsing via final states*: For characters where the final + state sets don't change after rearrangement (checked by comparing basal + node of clipped subtree against ancestor/descendant of destination + branch), only 10–20% of characters need reoptimization for collapsing. + This maps to our incremental scoring infrastructure. + +4. *Union construct method*: A further optimization that evaluates + destinations "en masse" by computing union state sets for subtrees and + rejecting entire branches when the union construct produces suboptimal + length. Achieved 50% time reduction on congruent datasets (168 taxa), + but no gain on incongruent data. + +**Goloboff & Farris 2001 — "Methods for quick consensus estimation"** + +1. *TBR-collapsing rule*: "when a rearrangement produces a tree of the + same length as the one being swapped, collapsing all of the nodes + between source and destination (and new root, in the case of TBR)." + This is equivalent to saving all equal-length trees and computing their + strict consensus, but uses no extra memory and less time. + +2. *SPR vs TBR collapsing*: TBR-based collapsing eliminates more + spurious groups than SPR-based, with minimal loss of correct groups. + On Zilla (500 taxa): SPR collapsing gives 79.6% true nodes recovered + with 0.63% error rate; TBR gives 79.0% true nodes with 0.48% error + rate. Net effect: TBR collapsing is more reliable. + +3. *RFD (Relative Fit Difference)*: Extends collapsing to suboptimal + trees by measuring `(F-C)/F` where F = favorable fit, C = contradictory + fit. Nodes with RFD below a threshold Q are collapsed. When calculating + rearrangement length, as soon as length increase X > D/(1-Q), the + rearrangement can be abandoned. For Q=0.10, tree collapsing takes only + 5% additional time. 
This could be a future extension (post-2.0.0). + +4. *Pool benefit*: Collapsing trees during swapping means different + dichotomous trees that differ only in "minor" rearrangements collapse + to the same polytomy. The pool then stores more topologically diverse + trees, improving search effectiveness. This directly validates our + Phase 5 (collapsed-topology pool dedup). + +**Goloboff & Morales 2023 — TNT version 1.6** + +1. *Consensus stabilization*: TNT's driven search can stop when the + strict consensus is stable after N hits — analogous to TreeSearch's + `consensusStableReps`. TNT's parallel mode has a coordinator that + centralizes consensus calculation. + +2. *Parallel architecture*: "Builders" create trees via Wagner+TBR+ + sectorial/ratchet/drift, pass them to a "fuser" task. Similar to + TreeSearch's `ThreadSafePool` pattern but using PVM processes rather + than threads. + +3. *Fast consensus*: The user notes that Day et al. (1985) O(n·k) strict + consensus is available via the TreeDist package. This could replace or + supplement the XOR-hash consensus approximation in `ts_pool.cpp` for + more accurate stability detection. Not needed for the polytomy search + itself, but relevant for improving consensus-stability stopping. + +### What TNT does + +TNT collapses zero-length branches **during search** by default (`collapse +3;` = TBR-rule). After each TBR rearrangement is accepted, zero-length +edges are contracted into polytomies. TBR then operates on the collapsed +(non-binary) tree, which has fewer edges to clip and regraft through. The +key benefits are: + +1. **Fewer TBR candidates**: a polytomous tree with k collapsed edges has + ~2k fewer clip candidates and ~2k fewer regraft positions per clip. +2. **Pool deduplication**: collapsed trees that differ only in unsupported + resolution are identical, preventing the pool from filling with + trivially different trees. +3. 
**Better convergence**: the search explores "real" topological + differences rather than wasting effort on unsupported resolutions. + +--- + +## Design decision: Approach B (collapsed-edge set, binary internals) + +After reviewing the codebase, **Approach A** (replacing `left[]`/`right[]` +with multi-child representation) would require rewriting every module — TBR, +SPR, NNI, Fitch scoring, NA scoring, incremental scoring, undo stacks, +Wagner construction, constraint checking, sectorial search, fusing, splits. +This is estimated at 10+ weeks and carries extreme regression risk. + +**Approach B** is both faster to implement and closer to what TNT actually +does. TNT stores trees as binary internally but maintains a set of +"collapsed" edges that modify candidate enumeration and pool comparison. +The binary topology is always available for scoring; collapsed edges just +indicate which resolutions are unsupported. + +### Core idea + +Maintain a `std::vector collapsed` flag array alongside the +existing binary `TreeState`. After each accepted TBR/SPR move + full +rescore: + +1. **Recompute collapsed flags** (already implemented in `ts_collapsed.cpp`) +2. **Skip collapsed clips** in TBR/SPR/drift candidate enumeration +3. **Skip collapsed regraft distinctions**: when regrafting into a region + of consecutive collapsed edges, all positions within that region + produce the same score — evaluate only one representative position +4. 
**Pool comparison uses collapsed form**: two binary trees that collapse + to the same polytomy are treated as duplicates + +### Why this works without changing TreeState + +- Scoring uses the binary tree (exact Fitch downpass/uppass, unchanged) +- Topology manipulation uses binary operations (SPR clip/regraft, unchanged) +- Only candidate **enumeration** changes (skip/merge collapsed regions) +- Pool comparison adds a collapsed-topology hash alongside the existing + binary split hash + +The binary tree is always there as a "refinement" of the collapsed tree. +When a move is accepted that resolves a polytomy (puts signal on a +previously zero-length edge), the collapsed flag simply clears. + +--- + +## Implementation plan + +### Phase 1: Collapsed-region identification (extend existing code) + +**Files:** `src/ts_collapsed.h`, `src/ts_collapsed.cpp` + +The existing `compute_collapsed_flags()` already identifies edges where +clipping cannot improve score. Extend this to also identify **collapsed +regions** — maximal connected subsets of collapsed edges forming a +polytomy: + +```cpp +struct CollapsedRegion { + int representative; // one node in the region (for regraft targeting) + int n_edges; // number of collapsed edges in this region + std::vector<int> nodes; // all nodes with collapsed[node] == 1 in region +}; + +struct CollapsedInfo { + std::vector<uint8_t> collapsed; // per-node flag (existing) + std::vector<int> region_id; // per-node: which region (-1 if not collapsed) + std::vector<CollapsedRegion> regions; // the collapsed regions + int n_collapsed = 0; // total collapsed edges +}; + +void compute_collapsed_info( + const TreeState& tree, + const DataSet& ds, + CollapsedInfo& info); +``` + +This is a simple post-processing step after the existing flag computation: +BFS/DFS from each collapsed node, grouping connected collapsed edges.
+ +**Estimated effort:** 1–2 days + +### Phase 2: TBR clip skipping (already partially done) + +**Files:** `src/ts_tbr.cpp` + +The current code already skips collapsed clips when `!collect_pool`. Verify +this is working correctly and add a **diagnostic counter** (`n_collapsed_skipped`) +to the TBR return value for benchmarking. + +No code change needed beyond the diagnostic counter — Phase 1's extended +flags subsume the existing implementation. + +**Estimated effort:** 0.5 days + +### Phase 3: TBR regraft region merging (the main win) + +**Files:** `src/ts_tbr.cpp` + +This is the key new optimization. When evaluating regraft positions for a +non-collapsed clip: + +**Current behavior:** enumerate all main-tree edges as regraft candidates, +evaluate each independently. + +**New behavior:** for each collapsed region, evaluate only **one +representative regraft position** within the region. All positions within a +collapsed region produce identical scores (because the intermediate nodes +have zero cost and identical state sets — exactly the conditions verified +by `compute_collapsed_flags()`). + +Implementation in the TBR regraft loop: +```cpp +for (auto& [above, below] : main_edges) { + // Skip redundant positions within collapsed regions + if (collapsed_info.collapsed[below] && + collapsed_info.region_id[below] == last_evaluated_region) { + continue; // same region, same score — skip + } + last_evaluated_region = collapsed_info.region_id[below]; + + // ... evaluate regraft as before ... 
+} +``` + +**Correctness argument:** Within a collapsed region, all edges have: +- Zero local cost at parent (condition 1–2 of collapsed flags) +- `prelim[sibling] == prelim[parent]` (condition 3) +- `down2[sibling] == down2[parent]` (condition 4, NA) +- `subtree_actives[sibling] == subtree_actives[parent]` (condition 5, NA) + +Therefore the `final_` states used by `fitch_indirect_length()` at any +edge within the region produce the same `vroot` value, giving identical +scores for all regraft positions in the region. + +**Important subtlety:** The best regraft position's `(above, below)` pair +matters for the actual topology after the move. When a collapsed-region +regraft is chosen, we regraft at the representative position. The resulting +tree will have a different binary resolution of the polytomy, but the same +score and the same collapsed topology. This is equivalent to TNT's behavior. + +**Estimated effort:** 3–5 days (careful correctness verification needed) + +### Phase 4: SPR and drift integration + +**Files:** `src/ts_search.cpp`, `src/ts_drift.cpp` + +Apply the same clip-skipping (already in Phase 2) and regraft-merging +(Phase 3 pattern) to SPR search and drift search. + +For drift: suboptimal-acceptance moves should still skip collapsed clips +(a collapsed clip cannot improve OR change the score, so accepting it +is always a no-op). Regraft merging applies identically. + +**Estimated effort:** 2–3 days + +### Phase 5: Pool deduplication using collapsed form + +**Files:** `src/ts_pool.h`, `src/ts_pool.cpp`, `src/ts_splits.h` + +Currently pool deduplication uses binary split hashes. Two trees that +differ only in unsupported resolution have different split hashes but +should be considered duplicates. + +**Add collapsed-topology hashing:** +1. After computing collapsed flags, identify the "collapsed splits" — + the splits that remain after contracting all collapsed edges. +2. Hash only the non-collapsed splits for pool dedup. +3. 
Use this hash as the primary dedup key; fall back to binary hash + for trees with no collapsed edges (fully resolved). + +Implementation: +```cpp +uint64_t compute_collapsed_hash( + const TreeState& tree, + const CollapsedInfo& info, + int n_tip); +``` + +This is a filtered version of the existing `compute_splits()` + +`hash_single_split()` pipeline — just skip splits corresponding to +collapsed edges. + +**Estimated effort:** 2–3 days + +### Phase 6: Ratchet interaction + +**Files:** `src/ts_ratchet.cpp` + +During ratchet perturbation, character weights change, which means +collapsed flags must be recomputed after perturbation. The ratchet already +calls `tbr_search()` which recomputes flags after each accepted move, so +this should work automatically. + +**One subtlety:** After ratchet perturbation (upweighting/zeroing chars), +some previously collapsed edges may become non-collapsed (the perturbed +weights create artificial signal). This is correct behavior — the +perturbation should explore the full binary space. + +After ratchet un-perturbation (restoring original weights), the full +rescore will re-establish correct collapsed flags. + +**Estimated effort:** 1 day (verification + edge case testing) + +### Phase 7: Sectorial search interaction + +**Files:** `src/ts_sector.cpp` + +For sectorial search, collapsed flags should be computed on the full tree +and passed to the sector TBR. Within a sector: +- Clip candidates that are collapsed in the full tree remain collapsed +- Regraft merging applies within the sector + +Collapsed flags for the **reduced dataset** (sector subproblem) should be +recomputed from the sector's own scoring, not inherited from the full tree. + +**Estimated effort:** 2–3 days + +### Phase 8: Wagner tree collapsing + +**Files:** `src/ts_wagner.cpp` + +After Wagner tree construction, compute collapsed flags before the first +TBR pass. 
Wagner trees typically have many zero-length edges (the greedy +construction often creates unsupported resolutions), so this is where +collapsed-region merging may have the biggest per-tree impact. + +**Estimated effort:** 0.5 days + +### Phase 9: Testing + +**Files:** `tests/testthat/test-ts-polytomy-search.R` (Tier 2) + +1. **Region identification:** hand-built trees with known collapsed + regions; verify region count and membership. +2. **Regraft merging correctness:** verify that evaluating all positions + vs. one-per-region gives identical best scores. +3. **Pool collapsed-hash dedup:** two trees differing only in zero-length + resolution are treated as duplicates. +4. **Score equivalence:** driven search with collapsed optimization + produces same or better scores than without. +5. **IW/Profile mode compatibility.** +6. **NA dataset compatibility.** +7. **Ratchet interaction:** collapsed flags correctly update after + perturbation and un-perturbation. +8. **End-to-end regression:** run existing benchmark datasets, verify + no score degradation. 
+ +**Estimated effort:** 3–4 days + +### Phase 10: Benchmarking + +Re-run the TNT benchmark comparison with collapsed search enabled: +- Same 14 datasets, EW Fitch, 10s and 30s timeout +- Compare scores, timing, and replicates completed +- Focus on the 5 datasets where TreeSearch fell behind + +Also measure: +- Collapsed edge percentage per dataset (at optimum) +- Regraft candidates skipped per TBR pass +- Pool duplicate reduction + +**Estimated effort:** 1–2 days + +--- + +## Risk assessment + +| Risk | Severity | Mitigation | +|------|----------|------------| +| Regraft merging incorrectly skips a productive position | HIGH | Formal correctness proof + extensive unit tests; conservative fallback to evaluate all if collapsed count is low | +| Collapsed flags stale after ratchet perturbation | MEDIUM | Flags always recomputed after full_rescore; verify in ratchet tests | +| Pool collapsed-hash collisions (different topologies hash same) | LOW | Conservative direction (over-dedup); hash collision = treat as duplicate = miss one tree, not wrong scores | +| Negligible benefit on dense morphological data | MEDIUM | TNT benchmarks show the benefit is real; if our data shows otherwise, document and stop | +| Interaction with MPT enumeration | HIGH | Collapsed optimizations MUST be disabled during `collect_pool` (equal-score exploration); already guarded in existing code | + +--- + +## Estimated total effort + +| Phase | Days | Cumulative | +|-------|------|------------| +| 1. Collapsed regions | 1–2 | 1–2 | +| 2. TBR clip (existing) | 0.5 | 1.5–2.5 | +| 3. TBR regraft merging | 3–5 | 4.5–7.5 | +| 4. SPR + drift | 2–3 | 6.5–10.5 | +| 5. Pool dedup | 2–3 | 8.5–13.5 | +| 6. Ratchet | 1 | 9.5–14.5 | +| 7. Sectorial | 2–3 | 11.5–17.5 | +| 8. Wagner | 0.5 | 12–18 | +| 9. Testing | 3–4 | 15–22 | +| 10. Benchmarking | 1–2 | 16–24 | + +**Total: 16–24 agent-days.** Substantially less than the 9–13 weeks +estimated for Approach A (full polytomy representation). 
+ +--- + +## Literature review — COMPLETE (2026-03-22) + +All three papers reviewed from PDF. Key algorithmic details extracted +in the "Detailed literature notes" section above. The Goloboff (2023) +paper on collapsing criteria was not available in PDF but its core +recommendation ("minimum possible length 0" during search) is documented +in the AGENTS.md architecture reference. + +--- + +## Success criteria + +1. **Score parity or improvement** on all 14 TNT benchmark datasets + (no regressions) +2. **Measurable collapsed-edge skip rate** (>0%) on at least the harder + datasets (Wortley2006, Eklund2004, Zanol2014, Zhu2013, Giles2015) +3. **All existing tests pass** (1859 ts-* tests + full R-level suite) +4. **New test file** with ≥15 assertions covering all phases + +--- + +## References + +- Goloboff, P. A. (1996). Methods for faster parsimony analysis. Cladistics, 12, 199–220. +- Goloboff, P. A. & Farris, J. S. (2001). Methods for quick consensus estimation. Cladistics, 17, S26–S34. +- Goloboff, P. A. (2023). Searches, implied weights, and tree collapsing. Cladistics, 39, 229–238. +- Goloboff, P. A. & Catalano, S. A. (2016). TNT version 1.5. Cladistics, 32, 221–238. 
diff --git a/.positai/settings.json b/.positai/settings.json new file mode 100644 index 000000000..bbdb16049 --- /dev/null +++ b/.positai/settings.json @@ -0,0 +1,57 @@ +{ + "model": { + "id": "claude-sonnet-4-6", + "provider": "positai", + "thinkingEffort": "high" + }, + "permission": { + "edit": { + "*.md": "allow", + "*.h": "allow", + "*.cpp": "allow", + "*.R": "allow", + "*.c": "allow", + "*/NAMESPACE": "allow" + }, + "bash": { + "cd C:/Users/pjjg18/GitHub/TreeSearch": "allow", + "Rscript -e \"pkgbuild::compile_dll()\" 2>&1": "allow", + "grep *": "allow", + "head *": "allow", + "cd \"C:/Users/pjjg18/GitHub/TreeSearch\"": "allow", + "Rscript --vanilla -e \"pkgbuild::compile_dll(debug=FALSE)\" 2>&1": "allow", + "cd /c/Users/pjjg18/GitHub/TreeSearch": "allow", + "Rscript -e \"roxygen2::roxygenise(load_code = roxygen2::load_installed)\" 2>&1": "allow", + "tail *": "allow", + "Rscript -e \".libPaths(c('.agent-A', .libPaths())); roxygen2::roxygenise(load_code = roxygen2::load_installed)\" 2>&1": "allow", + "git *": "allow" + }, + "read": { + "*.cpp": "allow" + }, + "external_directory": { + "C:/Users/pjjg18/GitHub/TreeDist/*": "allow", + "C:/Users/pjjg18/GitHub/TreeDist/R/*": "allow", + "C:/Users/pjjg18/GitHub/TreeDist/src/*": "allow", + "C:/Users/pjjg18/GitHub/TreeDist/vignettes/*": "allow", + "C:/Users/pjjg18/.positai/skills/r-package-profiling/references/*": "allow", + "C:/Users/pjjg18/GitHub/TS-MadSlat/R/*": "allow", + "C:/Users/pjjg18/GitHub/TS-MadSlat/inst/benchmarks/*": "allow", + "C:/Users/pjjg18/.positai/skills/r-package-profiling/*": "allow", + "C:/Users/pjjg18/GitHub/TS-MadSlat/src/*": "allow", + "*": "allow" + }, + "skill": { + "r-package-profiling": "allow", + "hamilton-hpc": "allow" + }, + "webfetch": { + "https://repo.r-wasm.org/*": "allow", + "https://agentskills.io/*": "allow", + "https://platform.claude.com/*": "allow", + "https://github.com/*": "allow", + "https://raw.githubusercontent.com/*": "allow", + "https://cran.r-project.org/*": "allow" + } 
+ } +} \ No newline at end of file diff --git a/.positai/skills/hamilton-hpc/SKILL.md b/.positai/skills/hamilton-hpc/SKILL.md new file mode 100644 index 000000000..d69f203b3 --- /dev/null +++ b/.positai/skills/hamilton-hpc/SKILL.md @@ -0,0 +1,256 @@ +--- +name: hamilton-hpc +description: Run benchmarks, builds, and batch R jobs on the Hamilton HPC cluster (Durham University). Covers SSH connection via R's ssh package, SLURM job submission, module loading, TreeSearch installation, and result retrieval. Use when the user mentions Hamilton, HPC, remote benchmarks, SLURM jobs, or running anything on the cluster. +--- + +# Hamilton HPC — Remote Job Execution + +## Connection + +Hamilton is accessed via the R `ssh` package (the local machine has no +CLI SSH config for Hamilton). The session object is typically stored in +the R global environment as `session`. + +Connection credentials may be set in `.Renviron`: + +| Variable | Purpose | +|----------|---------| +| `sshLogin` | `user@host` (e.g. `pjjg18@hamilton8.dur.ac.uk`) | +| `sshKey` | Path to private key (relative to `.Renviron` location) | +| `sshPass` | Passphrase hint / lookup key | + +Try environment variables first, falling back to hardcoded defaults: + +```r +library(ssh) +login <- Sys.getenv("sshLogin", "pjjg18@hamilton8.dur.ac.uk") +key <- Sys.getenv("sshKey", "") +session <- if (nzchar(key)) { + ssh_connect(login, keyfile = key) +} else { + ssh_connect(login) +} +ssh_info(session) # verify connection +``` + +If the session has gone stale (`ssh_info()` errors), reconnect using +the same pattern. + +### Running commands + +`ssh_exec_internal()` runs in a minimal shell **without** module +environments. Commands that need `Rscript`, `gcc`, etc. will fail with +status 127. For anything that requires loaded modules, use SLURM job +submission instead. 
+ +```r +# Simple commands work fine +out <- ssh_exec_internal(session, "ls /nobackup/pjjg18/ts-bench/") +cat(rawToChar(out$stdout)) + +# This will FAIL (status 127) — Rscript not in PATH: +# ssh_exec_internal(session, "Rscript -e '1+1'") +``` + +### File transfer + +```r +# Upload +scp_upload(session, "local_script.R", "/nobackup/pjjg18/ts-bench/local_script.R") + +# Download +scp_download(session, "/nobackup/pjjg18/ts-bench/results/output.csv", "local_output.csv") +``` + +## Working directory layout + +All TreeSearch benchmark work lives under `/nobackup/pjjg18/ts-bench/`: + +| Path | Purpose | +|------|---------| +| `TreeSearch/` | Git clone of the repo (used for builds) | +| `lib-baseline/` | R library with deps + baseline TreeSearch build | +| `lib-optimized/` | R library with alternative TreeSearch build (A/B testing) | +| `results/` | SLURM logs and CSV output files | +| `data/` | Additional datasets | +| `*.R` | Benchmark and build R scripts | +| `*.sh` | SLURM job scripts | + +## Environment modules + +Hamilton uses `module load` for toolchains. 
Every SLURM script must +include: + +```bash +module load r/4.5.1 +module load gcc/14.2 +``` + +Pin single-threaded execution to avoid BLAS/OpenMP contention with +TreeSearch's own threading: + +```bash +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 +``` + +## SLURM job submission + +### Writing a job script + +```bash +#!/bin/bash +#SBATCH --job-name=ts-bench +#SBATCH --output=/nobackup/pjjg18/ts-bench/results/my_job.log +#SBATCH --error=/nobackup/pjjg18/ts-bench/results/my_job.err +#SBATCH -n 1 +#SBATCH --time=0:15:00 +#SBATCH --mem=4000M +#SBATCH -p shared + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +Rscript /nobackup/pjjg18/ts-bench/my_script.R +``` + +Typical resource requests: +- **Benchmarks (30s budget, 5 seeds):** `--time=0:05:00 --mem=4000M -n 1` +- **Benchmarks (120s budget, 5 seeds):** `--time=0:15:00 --mem=4000M -n 1` +- **Package builds:** `--time=0:30:00 --mem=4000M -n 4` +- **Dependency installs:** `--time=0:30:00 --mem=4000M -n 4` + +### Submitting and monitoring from R + +```r +# Upload script then submit +scp_upload(session, "my_job.sh", "/nobackup/pjjg18/ts-bench/my_job.sh") +out <- ssh_exec_internal(session, "cd /nobackup/pjjg18/ts-bench && sbatch my_job.sh") +cat(rawToChar(out$stdout)) # "Submitted batch job 12345678" + +# Parameterized submission (pass variables via --export) +cmd <- sprintf( + "cd /nobackup/pjjg18/ts-bench && sbatch --export=VAR1=%d,VAR2=%d -o results/run_%d.log -e results/run_%d.err my_job.sh", + val1, val2, val1, val1 +) +out <- ssh_exec_internal(session, cmd) + +# Monitor queue +out <- ssh_exec_internal(session, "squeue -u pjjg18 --format='%i %j %T %M'") +cat(rawToChar(out$stdout)) + +# Read results after completion +out <- ssh_exec_internal(session, "cat /nobackup/pjjg18/ts-bench/results/my_job.log") +cat(rawToChar(out$stdout)) +``` + +### Polling for completion + +There is no built-in callback. 
Poll `squeue` or check for output files: + +```r +# Poll until a specific job finishes +poll_job <- function(session, job_id, interval = 30) { + repeat { + out <- ssh_exec_internal(session, sprintf("squeue -j %s -h 2>/dev/null", job_id)) + if (nchar(rawToChar(out$stdout)) == 0) return(TRUE) + Sys.sleep(interval) + } +} +``` + +## Building TreeSearch on Hamilton + +The repo is cloned at `/nobackup/pjjg18/ts-bench/TreeSearch/`. + +### Single-branch rebuild (fast, cached objects) + +When rebuilding the same branch after a small change, in-place install +reuses cached `.o` files — only changed translation units recompile: + +```bash +BENCH=/nobackup/pjjg18/ts-bench +REPO=$BENCH/TreeSearch +LIB=$BENCH/lib-baseline + +cd $REPO +git pull origin +R CMD INSTALL --library=$LIB "$REPO" +``` + +### A/B profiling rebuild (tarball, clean objects) + +When switching branches to build two variants (e.g. `lib-baseline` vs +`lib-optimized`), you **must** use the tarball method. In-place install +compiles `.o` files directly in `src/`, so stale objects from branch A +contaminate the build of branch B. The tarball copies sources to a temp +directory, keeping the repo clean. + +```bash +BENCH=/nobackup/pjjg18/ts-bench +REPO=$BENCH/TreeSearch +LIB=$BENCH/lib-baseline # or lib-optimized + +cd $REPO +git checkout +git pull origin + +rm -f src/*.o src/*.so +TMPDIR=$(mktemp -d) +(cd "$TMPDIR" && R CMD build --no-build-vignettes --no-manual --no-resave-data "$REPO") +R CMD INSTALL --library=$LIB "$TMPDIR"/TreeSearch_*.tar.gz +rm -rf "$TMPDIR" +``` + +The `rm -f src/*.o src/*.so` before each build is belt-and-suspenders: +the tarball method already isolates compilation, but this ensures no +stale artifacts are accidentally picked up if someone later does an +in-place build. 
+ +## Installing R package dependencies + +Submit `install_deps.R` as a SLURM job (needs network access from +compute node): + +```r +lib <- '/nobackup/pjjg18/ts-bench/lib-baseline' +.libPaths(c(lib, .libPaths())) +install.packages( + c('Rcpp', 'ape', 'cli', 'fastmatch', 'lifecycle', 'Rdpack', + 'phangorn', 'TreeTools', 'PlotTools', 'TreeDist', 'Quartet', + 'Rogue', 'protoclust', 'abind'), + lib = lib, + repos = 'https://cloud.r-project.org', + Ncpus = 4L +) +``` + +## Writing benchmark R scripts + +Benchmark scripts load TreeSearch from a specified library: + +```r +lib_dir <- "/nobackup/pjjg18/ts-bench/lib-baseline" +.libPaths(c(lib_dir, .libPaths())) +library(TreeSearch) +library(TreeTools) +``` + +Standard benchmark datasets live in the repo clone: +- `TreeSearch/dev/benchmarks/mbank_X30754.nex` — 180 taxa, 418 patterns + (large-tree benchmark) +- `TreeSearch/inst/` — bundled datasets (Vinther2008, etc.) + +Write CSV results to `/nobackup/pjjg18/ts-bench/results/` for later +download. + +## Hardware + +- **CPU:** AMD EPYC 7702 64-Core (2 sockets, 128 cores, no SMT) +- **Clock:** ~2.0 GHz base +- **Cache:** 32K L1d/L1i, 512K L2, 16M L3 per CCX +- **ISA:** x86_64 with AVX2 support +- **Partition:** `shared` (default for single-node jobs) diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 000000000..e28e074b8 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,1429 @@ +# TreeSearch Multi-Agent Development Notes + +## Current phase: bug-fixing / pre-release (as of 2026-03-20) + +The project is in a **bug-fixing and stabilisation phase** with the goal of +shipping the package. Agents should: + +- Monitor `to-do.md` as usual for task selection. +- **Prioritise bug fixes, test failures, documentation issues, and R CMD check + problems** over new functionality. 
+- **Do not implement new features on `cpp-search` or `main`.** + Feature work is allowed only on dedicated `feature/` branches, and + only when the task is explicitly labelled as a feature and has been approved + for active development. +- When in doubt, prefer a conservative fix (minimal diff, no API changes) over + an ambitious refactor. + +This phase ends when a clean `R CMD check` (0 errors, 0 warnings) is confirmed +and the maintainer signals readiness to tag a release. + +--- + +## Validation workflow — GHA first (mandatory) + +**Use GitHub Actions for all validation:** R CMD check, full test suites, +and benchmarks. Local builds are for **targeted iteration only** (editing +code → building → running one or two specific test files to check your +change). Never run a full test suite or R CMD check locally. + +### GHA dispatch (primary validation path) + +```bash +# Push your branch and dispatch checks +git push -u origin feature/ +cd .. +bash gha-dispatch.sh agent-check.yml feature/ + +# Poll for results +bash gha-poll.sh +``` + +Park the task while waiting and pick up another (see root `AGENTS.md` for +parking protocol). Do **not** block waiting for GHA results. + +### Local builds (targeted iteration only) + +Multiple agents share the same `src/` directory. In-place `R CMD INSTALL .` +compiles `.o` files and links the DLL directly in `src/`, causing races. + +**Always build via tarball** so compilation happens in an isolated temp +directory: + +```bash +SRC=$(pwd) && TMPBUILD=$(mktemp -d) && \ + rm -f src/*.o src/*.dll && \ + (cd "$TMPBUILD" && R CMD build --no-build-vignettes --no-manual --no-resave-data "$SRC") && \ + R CMD INSTALL --library=.agent-X "$TMPBUILD"/TreeSearch_*.tar.gz && \ + rm -rf "$TMPBUILD" +``` + +Key points: +- `rm -f src/*.o src/*.dll` **must** precede every build — stale artifacts slow traversal and corrupt DLLs. 
+- Build into an agent-specific `$TMPBUILD` outside the source tree — avoids tarball collision when multiple agents build concurrently. +- `--no-resave-data` skips unnecessary `.rda` re-saving (not needed for dev installs). + +Run **targeted** tests only: +```bash +Rscript -e "library(TreeSearch, lib.loc='.agent-X'); testthat::test_dir('tests/testthat', filter='test-ts-foo')" +``` + +**Never** use `R CMD INSTALL --library=.agent-X .` (in-place build). + +**Never** install to the default library. On Windows, a loaded DLL locks +the file and blocks other agents. + +**Never** use `devtools::load_all()` or `pkgbuild::compile_dll()` — these +target a shared temp location and will conflict. + +**Never** run full test suites or R CMD check locally — use GHA. + +## Build failure recovery + +### Debug `.o` contamination + +`roxygen2::roxygenise()` (default mode) calls `pkgbuild::compile_dll(debug=TRUE)`, +which leaves debug `.o` files in `src/`. Subsequent `R CMD INSTALL` reuses them, +producing a DLL that crashes at runtime (exit code 127/139). + +**Fix:** `rm -f src/*.o src/*.dll` then rebuild. + +**Prevention:** Never use bare `roxygen2::roxygenise()`. To regenerate docs: +```bash +Rscript -e ".libPaths(c('.agent-X', .libPaths())); roxygen2::roxygenise(load_code = roxygen2::load_installed)" +``` + +### DLL lock + +If `R CMD INSTALL` fails with "Access is denied", another R process has the +DLL loaded. Kill it or wait, then retry. + +### `TreeSearch-init.c` arg count mismatch + +After `Rcpp::compileAttributes()`, **always** run `Rscript check_init.R` to +verify arg counts match between `RcppExports.cpp` and `TreeSearch-init.c`. 
+ +### Quick recovery + +```bash +SRC=$(pwd) && TMPBUILD=$(mktemp -d) && \ + rm -f src/*.o src/*.dll && \ + (cd "$TMPBUILD" && R CMD build --no-build-vignettes --no-manual --no-resave-data "$SRC") && \ + R CMD INSTALL --library=.agent-X "$TMPBUILD"/TreeSearch_*.tar.gz && \ + rm -rf "$TMPBUILD" +Rscript check_init.R +``` + +## CPU limits — max 2 cores per agent + +Use `nThreads = 2L` at most in tests/benchmarks. Never `nThreads = 0L` +(auto-detect). Use `-j2` at most for make. + +## Shared files — coordination rules + +`src/ts_rcpp.cpp` and `src/TreeSearch-init.c` are modified by every agent. +**Append only** — add new entries at the end. Do not reformat or reorder. + +### `concavity` sentinel + +Rcpp can't translate `R_PosInf`. All Rcpp-exported functions use +`concavity = -1.0` as the C++ default (sentinel for "equal weights / Inf"). +Conversion `if (concavity < 0) concavity = HUGE_VAL` happens at three +gateway points in `ts_rcpp.cpp`: `make_dataset()`, `ts_resample_search()`, +`ts_successive_approx()`. + +### `src/Makevars.win` + +**Never leave a `src/Makevars.win` in place.** Debug/PGO/UBSan flags cause +crashes or miscompilation. Delete after any profiling session. + +### `src/TreeSearch-win.def` + +**Keep this file.** It explicitly exports `R_init_TreeSearch` for Windows +DLL builds. Without it, the default `nm | sed` pipeline generates a +`tmp.def` that truncates long C++ mangled symbols, causing linker failures +or corrupt DLLs (especially under `pkgbuild::compile_dll(debug=TRUE)`). + +## Branch structure + +``` +main ← stable, taggable; receives only reviewed bug fixes + └─ cpp-search ← integration branch; all feature work merges here + ├─ feature/cid-consensus + ├─ feature/hsj-polish + └─ feature/ (one per major feature) +``` + +### Rules + +- **`main`**: bug fixes and release tags only. No experiments. +- **`cpp-search`**: integration target. **Agents must not merge directly to + `cpp-search`.** All code changes go through PRs reviewed by the human. 
+ Coordination-only commits (agent logs, to-do.md updates) may be pushed + directly. +- **`feature/*`**: branch from `cpp-search`; contain **code changes only**. + Each feature branch is owned by a single agent at a time. + +### Worktree discipline + +Each worktree directory is **locked to its branch**. The mapping may change +over time (e.g. when a feature merges and the worktree is reassigned), but +**only the human updates this mapping**. + +**Hard rules:** + +1. **Never `git checkout ` on any worktree.** This silently replaces + another agent's (or the human's) working tree and can cause data loss. +2. **Never `git merge` into a worktree's branch** unless that is the explicit + task you were assigned (e.g. "pull cpp-search into feature/X"). +3. To read files from another branch, use `git show :`. + +**Current worktree mapping** (run `git worktree list` to verify): + +| Directory | Branch | Purpose | +|-----------|--------|---------| +| `TreeSearch-a` | `cpp-search` | Main source dir; integration branch | +| `TS-CID-cons` | `feature/cid-consensus` | CID consensus feature (T-150) | +| `TS-NativeSearch` | `feature/native-search` | Native scorer decoupling (T-204) | +| `TS-PruneRI` | `feature/prune-reinsert` | Taxon prune-reinsert perturbation (T-266) | +| `TS-TNT-bench` | `feature/tnt-bench` | TNT benchmarking | + +There is **no permanent worktree for `cpp-search`**. 
To commit to +`cpp-search`, use one of these approaches (in order of preference): + +**Option A — temporary worktree (safest):** +```bash +git worktree add ../TS-tmp- cpp-search +# work in ../TS-tmp-, commit, push +git worktree remove ../TS-tmp- +``` + +**Option B — single-file coordination commit (from a feature worktree):** +```bash +git show cpp-search:agent-X.md > agent-X.md # read current version +# edit the file +git stash # stash any code changes +git checkout cpp-search -- agent-X.md # stage from cpp-search +cp agent-X.md.edited agent-X.md # apply your edit +git add agent-X.md && git commit -m "chore: agent X progress note" +git push origin cpp-search +git checkout HEAD -- agent-X.md # restore feature version +git stash pop # restore code work +``` + +**Never use Option B for multi-file edits.** Use a temporary worktree. + +### Coordination files live on `cpp-search` only + +`to-do.md`, `u.nnn`, `agent-X.md`, `completed-tasks.md`, `coordination.md`, +and `AGENTS.md` are **never committed on feature branches**. When an agent +working on a feature branch needs to log progress or claim a task, they commit +those changes directly to `cpp-search` (coordination-only commit), keeping the +feature branch clean. + +To read coordination files while on a feature branch without switching: +```bash +git show cpp-search:to-do.md +git show cpp-search:agent-X.md +``` + +### Shared files at merge time + +`src/ts_rcpp.cpp` and `src/TreeSearch-init.c` use the existing append-only +convention — merge conflicts resolve cleanly by keeping both appended blocks. +`DESCRIPTION` (Collate field) and `NAMESPACE` require a manual merge pass; +this is expected and should be done carefully at feature-merge time. + +### Feature branch lifecycle + +1. `git checkout cpp-search && git checkout -b feature/` + Optionally create a worktree: `git worktree add ../TS- feature/` +2. Claim task on `cpp-search`'s `to-do.md` (coordination commit). +3. Do all code work on `feature/`. 
Use local targeted tests only + during iteration; use GHA for full validation. +4. When ready: push and dispatch GHA checks: + ```bash + git push -u origin feature/ + bash gha-dispatch.sh agent-check.yml feature/ + ``` +5. On GHA success, open a PR: + ```bash + gh pr create --base cpp-search --head feature/ \ + --title "-nnn: " --body "Agent . ..." + ``` +6. Set `to-do.md` status to `PR #N ()`. Move on. +7. Human reviews and merges the PR. +8. After merge, clean up: + ```bash + git worktree remove ../TS- # if worktree was used + git branch -d feature/ + git push origin --delete feature/ + ``` + +--- + +## Multi-agent workflow protocol + +> **Task IDs:** New tasks use `<letter>-nnn` format (e.g. `A-042`), where +> `<letter>` is your agent letter and `nnn` is your personal counter +> (tracked in `agent-<letter>.md`). Existing `T-nnn` IDs in `to-do.md`, +> `completed-tasks.md`, PRs, and git log are valid and need not be renamed. +> Before adding or removing rows in `to-do.md`, acquire the lock: +> `bash ../../todo-lock.sh . acquire` / `bash ../../todo-lock.sh . release`. + +### Worktree tasks + +Tasks with status `WORKTREE (name)` are actively developed in a dedicated git +worktree (e.g. `C:/Users/pjjg18/GitHub/TS-CID-cons`). **Do not claim or +modify these tasks.** They are reserved for the human developer working in +that worktree. To mark a task as in-flight on a worktree, set its status to +`WORKTREE (name)` where *name* matches the worktree directory basename. + +### Assignment + +On `/assign X`: + +1. Read `agent-X.md`. If a task is already in-progress, resume it. +2. **Triage `aXXX.md` / `a.XXX` bug reports** (see "Shiny bug report intake" below): + a. List all `a[0-9]*.md` **and** `a.[0-9]*` files in the project root + (excluding any `*.claimed-*.md` / `*.claimed-*` files). + b. For each file, check its size first. **Skip files shorter than + 20 characters** (likely mid-edit — the human may still be typing). + Do not rename or touch these files; leave them for a later pass. + c. 
For files ≥20 characters, attempt `mv aXXX.md aXXX.claimed-X.md`. + If the rename fails (file gone or access denied), another agent + claimed it — skip. + d. Create a `to-do.md` task under `### Shiny App` for each valid + report. Assign a `-nnn` ID (your letter + incremented + counter; see "Task IDs" in the master AGENTS.md), a priority based + on severity (default P2), and tag as `[Shiny]`. Use the file's + content as the description/notes. + e. Delete the `aXXX.claimed-X.md` file once the `to-do.md` entry is + written. + f. Repeat for all claimed files before moving on. **Do not start + working a task until all pending reports are triaged.** +3. **Triage user issues** (`u.*` files) before `to-do.md`. See the + parent `../AGENTS.md` "User issue files" section for the full protocol + (scan → claim via rename → triage → delete). While untriaged issues + remain, triaging takes priority over `to-do.md` tasks (an issue may + contain a P0). +4. **Check `remote-jobs.md`** for retrievable results. If a job is listed + as complete (or past its expected duration), retrieve and process the + results before claiming a new task. +5. If no untriaged issues or pending remote results, claim the next OPEN + task from `to-do.md`. + +Set `CONVERSATIONSUMMARY` to `Agent X: `. + +> **Concurrency guard (u.nnn / a.XXX):** Atomic rename (`mv u.001 +> u.001.claimed-X`) ensures exactly one agent wins each file. NTFS +> rename is atomic; losers see "file not found" and skip. + +### During work + +- Update `agent-X.md` after every significant step (crash-recovery record). +- All work uses `.agent-X/` as library directory. +- **All builds, tests, and benchmarks in bash subprocesses** — never in the + RStudio R session. +- **Use GHA for validation** (full test suites, R CMD check, benchmarks). + Local builds are for targeted iteration only (build + run 1–2 test files). + See "Validation workflow" section above. + +### On task completion + +1. **Delete** the task row from `to-do.md`. 
If the task was the last open + row in a section/group, delete the section header too. +2. **Append** a summary row to `completed-tasks.md` under the current date + heading (create a new `## YYYY-MM-DD` heading if needed): + `| -nnn | Short description | X | Brief notes |` +3. Set `agent-X.md` to IDLE. +4. Append a brief entry to this file documenting what changed. +5. Update `coordination.md` if strategic objectives are affected. +6. Take next task. + +### Shiny bug report intake + +The human files Shiny app bugs as individual files in the project root. +Naming convention: + +`a.010`, `a.011`, … (dot-separated, no extension) + +Each file contains a free-text bug description. The human's workflow is: +create file → write bug → save → never touch the file again. + +**Agent responsibility:** Triage all pending bug report files into +`to-do.md` at the start of every `/assign` (step 2 in Assignment above). + +**Claim protocol:** +```bash +# List unclaimed reports (both conventions) +ls a.[0-9]* 2>/dev/null | grep -v 'claimed' + +# Skip short files (< 20 chars) — don't rename, don't touch +wc -c < a.010 # check size first + +# Claim atomically (rename) +mv a.010 a.010.claimed-X + +# Read, triage into to-do.md, then delete +cat a.010.claimed-X +# ... create to-do.md entry ... +rm a.010.claimed-X +``` + +**Skip guard:** Files shorter than 20 characters are likely mid-edit. +Do **not** rename them — just leave them in place for a later pass. +(Renaming and renaming back triggers RStudio "file moved" dialogs.) + +**to-do.md placement:** All Shiny bugs go under `### Shiny App`. Create +the section if it doesn't exist. Default priority P2 unless the content +clearly indicates higher severity (crash = P1, cosmetic = P3). + +**Shiny bug fixes** are committed directly to `cpp-search` (they are +bug fixes in `inst/Parsimony/`, not feature work, so no feature branch +is needed). Use the temporary-worktree approach if changes span multiple +files. 
+ +### Standing tasks + +| ID | Type | Expertise file | +|----|------|---------------| +| S-RED | Red-team review | `.positai/expertise/red-team.md` | +| S-PROF | Performance profiling | `.positai/expertise/profiling.md` | +| S-COORD | Coordination review | `.positai/expertise/coordination.md` | + +Priority: P3 when ≥6 OPEN tasks, P2 when 3–5, P1 when <3. + +### Key files + +| File | Purpose | +|------|---------| +| `a.XXX` | Individual Shiny bug reports (agents triage → `to-do.md`, then delete) | +| `u.nnn` | User issue files (agents triage → `to-do.md`, then delete) | +| `to-do.md` | Task queue (active/open tasks only) | +| `remote-jobs.md` | Pending async jobs (Hamilton SLURM, long GHA) — check at `/assign` | +| `completed-tasks.md` | Archive of completed tasks | +| `coordination.md` | Strategic plan | +| `agent-X.md` | Agent progress log | +| `AGENTS.md` | Conventions + architecture reference | +| `.positai/expertise/*.md` | Standing task methodology | + +## User-level Posit Assistant skills + +The user has Posit Assistant skills installed at `~/.positai/skills/`. +**Load these with the `skill` tool before starting relevant work:** + +| Skill name | When to load | +|------------|-------------| +| `r-package-profiling` | Profiling, benchmarking, VTune, A/B comparison, hotspot analysis | +| `hamilton-hpc` | Hamilton HPC, SLURM jobs, SSH, remote benchmarking | + +Example: `skill(skill: "hamilton-hpc")` before any Hamilton dispatch work. + +## Test file conventions + +All `tests/testthat/test-ts-*.R` files must use `TreeSearch:::` to call +internal C++ bridge functions. Define short local wrappers for readability. + +Shared helpers are in `tests/testthat/helper-ts.R` (`make_ts_data()`, +`ts_score()`, `validate_result()`, `skip_extended()`). + +**Never use `%in%` on Splits objects in test files** — S3 dispatch fails +in the cloned namespace created by `test_check()`. Use `as.logical()` +matrix comparison instead. 
+ +### Test tiering + +Every new `test-ts-*.R` file must be assigned to one of three tiers. +See `tests/testing-strategy.md` for the full rationale. + +| Tier | Guard | When it runs | Use for | +|------|-------|-------------|---------| +| 1 — CRAN | none | always (CRAN + CI + local) | Fast (< ~2 s) API and data-structure unit tests | +| 2 — CI | `skip_on_cran()` at **file level** (first executable line) | CI + local | C++ engine correctness, scoring, search algorithms | +| 3 — Extended | `skip_extended()` at **file level** | `TREESEARCH_EXTENDED_TESTS=true` only | Stress tests, benchmarks, timing measurements | + +**Default for new `test-ts-*` files: Tier 2.** Add `skip_on_cran()` as the +very first executable line (before any helpers or `test_that()` calls): + +```r +# Tier 2: skipped on CRAN; see tests/testing-strategy.md +skip_on_cran() +``` + +Use Tier 3 only for tests that take > ~10 s or are sensitive to machine load. + +## R source file ordering + +`DESCRIPTION` has an explicit `Collate:` field. When adding a new `.R` file, +**update the Collate field** — otherwise R sources alphabetically, which can +break if one file's top-level code depends on a later file. + +## Documentation checks (mandatory) + +After any change to a function signature or roxygen block, run: + +```r +devtools::check_man() +``` + +After writing or updating documentation prose, also run: + +```r +spelling::spell_check_package() +``` + +Both should be clean before committing. These are fast and catch issues +(`check_man` catches Rd parse errors, cross-ref failures, `\usage` mismatches; +`spell_check_package` catches typos in `@description`/`@details`/`@param` text). + +References are added using Rdpack's `\insertCite{}`, with +`\insertAllCited{}` in the references section. + +## Algorithm vignette (mandatory updates) + +`vignettes/search-algorithm.Rmd` documents the search algorithm for +publication. 
**Any change that modifies search behaviour** — new heuristics, +parameter tuning, scoring methods, stopping criteria, pool management, or +rearrangement operators — **must be accompanied by an update to this +vignette.** + +- Published techniques: add a short summary and `@Key` citation. +- Novel contributions: describe the algorithm in enough detail for a reader + to understand the design and rationale. Include empirical results where + available (e.g. benchmark deltas). +- New references: add `@article{Key, ...}` to `inst/REFERENCES.bib`. + +The vignette uses pandoc-style `@Key` citations (same as the other +vignettes), not Rdpack `\insertCite{}`. + +## Architecture reference + +### R-level API + +| Function | Engine | Purpose | +|----------|--------|---------| +| `MaximizeParsimony()` | C++ driven search | Primary search (EW, IW, profile, constraints) | +| `Morphy()` | R-loop + MorphyLib | Legacy search (custom stopping, per-iteration callbacks) | +| `MaximizeParsimony2()` | — | Deprecated alias for `MaximizeParsimony()` | +| `Resample()` | C++ | Jackknife/bootstrap resampling | +| `SuccessiveApproximations()` | C++ | Successive approximations weighting | +| `TreeLength()` | C++ `ts_fitch_score` | Score one or more trees | +| `FastCharacterLength()` | C++ `ts_char_steps` | Per-character step counts | +| `AdditionTree()` | C++ `ts_wagner_tree` | Wagner tree construction | +| `RandomTreeScore()` | C++ (phyDat) or MorphyLib (morphyPtr) | Score a random tree | +| `TaxonInfluence()` | C++ via `MaximizeParsimony()` | Per-taxon search | +| `SearchControl()` | — | Expert parameter constructor for `MaximizeParsimony()` | +| `ParsSim()` | Pure R | Simulate datasets under parsimony (EW/IW/profile) | + +`MaximizeParsimony()` has a backward-compatibility shim: passing old +Morphy-style parameters (`ratchIter`, `tbrIter`, etc.) triggers a deprecation +warning and delegates to `Morphy()`. Scheduled for removal in 2028. + +### Driven search pipeline per replicate + +1. 
Random Wagner tree → NNI warmup → TBR to local optimum +2. XSS sectorial search (if tree large enough) +3. RSS random sectorial search +4. CSS constrained sectorial search +5. Ratchet perturbation to escape local optima +5a. Post-ratchet XSS+RSS+CSS (if `postRatchetSectorial = TRUE`) +6. NNI-perturbation (topology-space escape, if `nniPerturbCycles > 0`) +7. Drift search (accept suboptimal moves) +8. PCSA perturbation (if `annealCycles > 0`) +9. Final TBR polish +10. Add to pool +11. Fuse against pool (every `fuse_interval` replicates) + +Steps 2–9 are wrapped in the `outerCycles` loop (default 1). + +Post-search: TBR plateau enumeration from all pool seeds to find MPTs. + +### Strategy presets (auto-selected by `NTip` and signal density) + +| Preset | Condition | Key settings | +|--------|-----------|-------------| +| sprint | ≤30 tips | 3 ratchet (4%), 0 drift, XSS only, NNI-first | +| default | 31–64 tips; or ≥65 tips with <100 char patterns | 12 ratchet (25%, 5 moves), 0 drift, XSS+RSS, Wagner×3, NNI-first, adaptive level | +| thorough | 65–119 tips with ≥100 char patterns | 20 ratchet (25%, 5 moves, adaptive), 0 NNI-perturb (T-274), 0 drift, XSS+RSS+CSS, Wagner×3, NNI-first, outerCycles=2 | +| large | ≥120 tips with ≥100 char patterns | 12 ratchet (25%, 5 moves, adaptive), 0 NNI-perturb, 0 drift, 1 SA cycle (T=20→0, 5 phases), XSS(3)+RSS(2)+CSS(1), Wagner×1 biased (Goloboff 2014), NNI-first, outerCycles=1, tbrMaxHits=1, sectorMaxSize=100, pruneReinsert=5 cycles NNI-polish (T-289f Stage 5: NNI polish fixes 0-rep failure at 206t; improves 131–180t) | + +**T-264 (2026-03-26):** `consensusStableReps` removed from all presets +(disabled, 0). The previous setting of 3 caused catastrophic early +termination — the search stopped after 3 replicates with unchanged +consensus, using only 7–20% of the time budget on most datasets. 
+ +**Large preset design rationale (T-179, 2026-03-24):** At 180 tips, each TBR +convergence takes ~5–7s, making phases like NNI-perturbation (~5.5s/cycle) and +drift (~4s/cycle) extremely expensive. Systematic benchmarking on mbank_X30754 +(180t, 418p) showed that reducing cycle counts (12 ratchet, 4 drift, no NNI-perturb) +with outerCycles=1 and a single biased Wagner start outperforms the thorough +preset by 4–7 steps (median) at 30–60s budgets and ties at 120s, while +consistently completing more replicates. + +**T-289 Stage 4 (2026-03-28, EPYC 7702, 10 seeds, 5 datasets 131–206t):** +PR (c=5, d=5%, MISSING) vs baseline. 60s: mean Δ=+0.5 steps (neutral); +project3701 146t regresses −12 steps; syab07205 206t: 0 replicates complete +(per-rep cost ~60s, budget exceeded). 120s: mean Δ=−9.1 steps but driven +by project3701 (−37 steps); others ≤6 steps. Replicate ratio 0.82 at 60s, +0.68 at 120s. Decision: disable PR (TBR polish) — 0-rep failure at 206t/60s +is a showstopper. + +**T-289f Stage 5 (2026-03-29, EPYC 7702, 10 seeds, 5 datasets 131–206t):** +PR (c=5, NNI full-tree polish) vs pr_tbr (TBR polish, Stage 4 reference) vs +baseline. pr_tbr at 206t/60s: still 0 reps (confirmed). pr_nni fixes the +0-rep failure (2 reps at 206t/60s). Score deltas vs baseline: project4133 +(131t) ≈0; project3701 (146t) **−178 steps** at 60s, −128 at 120s; project804 +(173t) −9/−2; mbank_X30754 (180t) −4/−7; syab07205 (206t) +17.5 at 60s +(neutral at 120s). **Decision: enable pruneReinsertCycles=5, pruneReinsertNni=TRUE +in large preset.** Note G-006: NNI polish ignores ConstraintData — irrelevant +since large preset does not use topological constraints. + +**Post-T-206 Hamilton HPC baselines (2026-03-26, EPYC 7702, 5 seeds):** +30s median=1202 (range 1189–1214), 60s median=1190 (1190–1202), 120s +median=1185 (1171–1189). Per-replicate median 17.3s (cf. ~60s pre-T-206). 
+The 65–74 step improvement over pre-T-206 Intel baselines is primarily +from the outer cycle reset cap (maxOuterResets=0), not hardware. +Phase distribution: TBR 43.6%, Ratchet 32.2%, SA 7.4% (14% hit rate, +0.8 steps/s — least productive phase). T-248 benchmarked annealCycles +0/1/3: AC=1 (400ms/rep, 40% hit rate) is most cost-effective; AC=3 +(1370ms/rep, 21% hit rate) showed no significant score gain (p>0.5, +n=5 seeds). Large preset reduced to annealCycles=1. + +All presets set `nniFirst = TRUE` (NNI warmup before TBR) and +`sprFirst = FALSE` (SPR is counterproductive when NNI is active — +empirically NNI→TBR outperforms NNI→SPR→TBR). With `nniFirst`, each +Wagner start is NNI-optimized before selection (best of 3 NNI-local optima +rather than 3 raw Wagner scores). `default` also enables `adaptiveLevel = +TRUE` (scale ratchet/drift by hit rate); `thorough` omits it because high +base cycle counts already cover hard landscapes. + +**Ratchet perturbation tuning (2026-03-22)**: Systematic profiling across +all 14 benchmark datasets showed the previous 4% perturbation probability +was far too gentle. With 253 characters (Zhu2013), 4% zeroes only ~10 +characters — insufficient to reshape the landscape. Increasing to 25% +with fewer perturbed TBR moves (5 instead of auto=20) improves median +scores by 3–7 steps on hard datasets while completing fewer but more +productive replicates. 9/14 datasets improved, 4 unchanged, 1 marginal at +10s budget (resolves at 20s). The key insight: the perturbed-phase TBR +should be short (the landscape is warped, so extensive search on it is +wasteful), but the perturbation itself should be aggressive enough to +meaningfully displace the tree from its current basin of attraction. + +Signal-density gate: datasets with few character patterns (<100) have flat +parsimony landscapes where intensive search adds no benefit. 
+ +### Adaptive sectorial search + +XSS and CSS use **adaptive early-exit**: after each round of sector searches ++ global TBR polish, if the overall best score did not improve, remaining +rounds are skipped. This avoids wasting ~7% of replicate time on datasets +where sectorial search is unproductive (e.g. Dikow2009). On productive +datasets (e.g. Zhu2013), the early exit never fires. + +### Conflict-guided RSS + +RSS uses **conflict-guided sector selection**: before each replicate's RSS +phase, `driven_search()` computes a `SplitFrequencyTable` from the pool's +best-score trees. Within `rss_search()`, each internal node's "conflict +score" is `1 − (fraction of pool trees containing that split)`. +Max-descendant conflict is propagated upward, and eligible sector roots +are sampled via `std::discrete_distribution` with weight `1 + 3 × conflict`. +Falls back to uniform selection when the pool has <2 best-score trees or +when conflict variation is negligible. + +### Consensus-stability stopping + +After each replicate, if `consensus_stable_reps > 0` (disabled in all +presets since T-264; available via `SearchControl(consensusStableReps=N)`), +the pool's strict consensus hash is compared to the previous replicate's. +If unchanged for `consensus_stable_reps` consecutive replicates, the +search terminates early. `compute_consensus_hash()` uses +XOR of per-split FNV-1a hashes for O(pool × splits) cost. + +### Adaptive search level + +When `adaptive_level = true`, ratchet and drift cycle counts are scaled +each replicate based on the cumulative hit rate: +- hit_rate > 0.7 → 0.5× (easy landscape) +- hit_rate > 0.4 → 0.75× +- hit_rate < 0.15 → 1.5× (hard landscape) +- else → 1.0× + +### TBR zero-length clip skipping + regraft merging (collapsed flags) + +`compute_collapsed_flags()` (`ts_collapsed.h/.cpp`) identifies edges where +clipping provably cannot improve score. 
Checks 5 conditions: (1) zero +standard-block cost at parent, (2) zero NA-block cost at parent, (3) prelim +preservation (`prelim[sibling] == prelim[parent]`), (4) down2 preservation +(NA), (5) subtree_actives preservation (NA). Works for EW, IW, Profile, +and NA-aware scoring. Integrated into TBR, SPR, and drift search. +Disabled during MPT enumeration (equal-score topologies may exist). +Recomputed after every accepted move. + +**Regraft merging** (Goloboff 1996): within a collapsed region (connected +set of nodes linked by zero-length edges), all regraft positions yield the +same full score. Only boundary edges (entering the region) are evaluated; +interior collapsed edges are skipped via `if (collapsed[below]) continue`. +TBR, SPR, and drift all use this. The `CollapsedRegions` struct exists in +the header but callers use `compute_collapsed_flags()` directly (the +`region_id` field is unused — only the boolean flag array matters). + +**Collapsed-topology pool dedup**: `compute_collapsed_splits()` in +`ts_splits.cpp` produces the split set excluding collapsed edges. Two +binary trees differing only in zero-length resolutions produce the same +collapsed split set → treated as duplicates by `TreePool::add_collapsed()`. +Both serial (`driven_search`) and parallel (`ThreadSafePool`) paths use +collapsed dedup. + +**Benchmark results** (2026-03-22, 4 standard datasets, 3 seeds each): +Skip rate = 0% on all datasets (Vinther2008 23t, Agnarsson2004 62t, +Zhu2013 75t, Dikow2009 88t). Near-optimal trees in these morphological +datasets have negligible zero-length edges. Overhead from flag computation +is negligible. Score equivalence confirmed (enabled vs disabled produce +identical best scores). Benefit expected on sparse/synthetic data. 
+ +### C++ module map + +| Module | Header/Source | Purpose | +|--------|--------------|---------| +| Fitch scoring | `ts_fitch.h/.cpp` | Downpass, uppass, incremental, indirect | +| NA scoring | `ts_fitch_na.h` | Three-pass inapplicable algorithm (Brazeau et al. 2019) | +| NA incremental | `ts_fitch_na_incr.h` | Incremental NA-aware scoring for TBR/drift | +| SIMD | `ts_simd.h` | SSE2/NEON portability layer for bit-parallel ops | +| Data | `ts_data.h/.cpp` | `DataSet`, `CharBlock`, `build_dataset`, simplification | +| Tree | `ts_tree.h/.cpp` | `TreeState`, topology manipulation, `PreallocUndo` | +| Constraint | `ts_constraint.h/.cpp` | Topological constraint enforcement | +| TBR | `ts_tbr.h/.cpp` | TBR search (with sector_mask for CSS) | +| SPR/NNI | `ts_search.h/.cpp` | SPR and NNI search (standalone, not in driven pipeline) | +| Ratchet | `ts_ratchet.h/.cpp` | Perturbation (zero/upweight/mixed, adaptive) | +| Drift | `ts_drift.h/.cpp` | Accept suboptimal moves within AFD/RFD limits | +| Wagner | `ts_wagner.h/.cpp` | Greedy addition tree (incremental scoring, NA-aware) | +| Sectorial | `ts_sector.h/.cpp` | RSS (conflict-guided), XSS, CSS; from-above HTU | +| Fuse | `ts_fuse.h/.cpp` | Tree fusing (in-place exchange) | +| Pool | `ts_pool.h/.cpp` | Dedup, eviction, consensus hash, split frequency table | +| Splits | `ts_splits.h/.cpp` | Bipartition computation, comparison, `hash_single_split()` | +| Driven | `ts_driven.h/.cpp` | Multi-replicate orchestrator | +| Resample | `ts_resample.h/.cpp` | Jackknife, bootstrap, successive approximations | +| Parallel | `ts_parallel.h/.cpp` | `std::thread` inter-replicate parallelism | +| RNG | `ts_rng.h/.cpp` | Thread-safe RNG (`thread_local` dispatch) | +| Simplify | `ts_simplify.h/.cpp` | Character compression and uninformativeness checks | +| Collapsed | `ts_collapsed.h/.cpp` | Zero-length edge detection for clip skipping | +| NNI perturb | `ts_nni_perturb.h/.cpp` | Stochastic NNI-perturbation (IQ-TREE-style topology 
escape) | +| Rcpp bridge | `ts_rcpp.cpp` | All Rcpp-exported functions | + +### Scoring modes + +`ScoringMode` enum in `ts_data.h`: `EW`, `IW`, `PROFILE`. +- **EW**: standard Fitch parsimony +- **IW**: implied weights via `e/(k+e)` where `e = steps - min_steps` +- **PROFILE**: lookup in `info_amounts` table (structurally identical to IW pipeline) + +Profile mode sets `ds.concavity = 1.0` (finite sentinel) so existing +`isfinite()` checks activate the weighted pipeline without code duplication. + +### Parallelism design + +- `std::thread` (not OpenMP) to avoid R memory allocator conflicts +- Per-thread: `DataSet` copy, `ConstraintData` copy, `std::mt19937` RNG +- Shared: `ThreadSafePool` (mutex-guarded), atomic stop flag +- Main thread: pre-generates seeds from R's RNG, polls + `R_CheckUserInterrupt()` and timeout every 200ms +- Worker threads make no R API calls — `ts_rng.h` provides `thread_local` + dispatch (null → R API for serial; set → thread-local for parallel) + +### Scoring notes + +- `.h` file changes (`ts_fitch_na.h`, `ts_fitch_na_incr.h`) may require + `touch src/ts_fitch.cpp` before rebuild if the build system doesn't track + header dependencies. +- Incremental scoring is a **screening heuristic** for candidate selection; + `full_rescore()` / `score_tree()` is always authoritative. +- See `.positai/expertise/fitch-scoring.md` for detailed invariants: + uppass correctness proof, NA staleness analysis, `upweight_mask` audit. + +### Constraint enforcement + +- `build_constraint()` reads R split matrix with **column-major** indexing: + `split_matrix[s + n_splits * t]`. +- Wagner uses LCA-based constraint mapping (`wagner_map_constraint_nodes`) + since splits aren't fully present during incremental construction. +- Wagner has a posthoc retry loop (up to 100 random addition orders) as a + safety net for edge cases. + +## Exported Rcpp functions + +All registered in `ts_rcpp.cpp` and `TreeSearch-init.c`. Run +`Rscript check_init.R` to verify consistency. 
+ +| Function | Module | Purpose | +|----------|--------|---------| +| `ts_fitch_score` | ts_fitch | Score a tree | +| `ts_char_steps` | ts_rcpp | Per-pattern step counts (with simplification offsets) | +| `ts_na_debug_char` | ts_fitch_na | Per-node debug for a single pattern | +| `ts_na_char_steps` | ts_fitch_na | Per-pattern step counts (raw, no offsets) | +| `ts_debug_clip` | ts_fitch | Debug SPR clip/regraft | +| `ts_test_indirect` | ts_fitch | Debug indirect length | +| `ts_nni_search` | ts_search | NNI hill-climbing | +| `ts_spr_search` | ts_search | SPR hill-climbing | +| `ts_tbr_search` | ts_tbr | TBR with plateau exploration | +| `ts_ratchet_search` | ts_ratchet | Ratchet perturbation | +| `ts_drift_search` | ts_drift | Drift search | +| `ts_wagner_tree` | ts_wagner | Wagner tree (specified addition order) | +| `ts_random_wagner_tree` | ts_wagner | Wagner tree (random order) | +| `ts_compute_splits` | ts_splits | Bipartition splits from edge matrix | +| `ts_trees_equal` | ts_splits | Compare two trees | +| `ts_pool_test` | ts_pool | Pool deduplication test | +| `ts_tree_fuse` | ts_fuse | Fuse two trees | +| `ts_sector_diag` | ts_sector | Sectorial search diagnostics | +| `ts_rss_search` | ts_sector | Random Sectorial Search | +| `ts_xss_search` | ts_sector | Exclusive Sectorial Search | +| `ts_driven_search` | ts_driven | Full driven search | +| `ts_resample_search` | ts_resample | One jackknife/bootstrap replicate | +| `ts_successive_approx` | ts_resample | Successive approximations | +| `ts_parallel_resample` | ts_parallel | Batch resample with parallelism | +| `ts_bench_tbr_phases` | ts_rcpp | TBR phase timing diagnostic | + +## MorphyLib deprecation status + +Migration plan in `inst/deprecation/morphy-migration.md`. + +**Already migrated to C++:** `MaximizeParsimony`, `AdditionTree`, `Resample`, +`SuccessiveApproximations`, `TreeLength`, `CharacterLength`, +`FastCharacterLength`, `RandomTreeScore`, `TaxonInfluence`. 
+ +**Still using MorphyLib:** Legacy search functions (`Ratchet`, `Jackknife`, +`MorphyBootstrap`, `CustomSearch`), R-level tree rearrangement functions. +These are candidates for deprecation rather than migration. + +## Shiny app (`inst/Parsimony/`) + +Fully modularized from monolithic `app.R` into Shiny modules: +- `global.R` — library calls, constants, helpers, colours, citations, module UI instantiation +- `ui.R` — `fluidPage(...)` definition using module UI elements +- `server.R` — `AppState()` + module wiring + `ShowConfigs` observer + `onStop()` +- `server/app_state.R` — `AppState()` typed `reactiveValues()` constructor +- `server/logging.R` — session logging infrastructure +- `server/mod_*.R` — 7 Shiny modules (`NS()`/`moduleServer()`) + +**All server logic now lives in modules.** The old `events.R` has been +dissolved; its `ShowConfigs` function and `plotFormat` observer are inlined +in `server.R` (they operate on top-level DOM elements). + +**Modules:** +- `mod_references.R` — references panel (no state) +- `mod_downloads.R` — all 8 download handlers +- `mod_data.R` — data loading + tree management (9 returned reactives). + Uses `cb_ref` forward-reference env for circular deps with consensus module. +- `mod_clustering.R` — clustering analysis + tree distances (5 returned reactives) +- `mod_search.R` — search engine, scoring, weighting. + Owns ExtendedTask, search config modal, result accumulation. +- `mod_treespace.R` — tree space visualization + plot settings (14 returned reactives) +- `mod_consensus.R` — consensus plotting, character mapping, stability/rogue analysis, + concordance, cluster consensus, main plot dispatch, plot logging (1327 lines). + Returns `MainPlot`, `RCode`, `UpdateKeepNTipsRange`, + `UpdateDroppedTaxaDisplay`, `UpdateOutgroupInput`. + +**Important:** Server source files are in `server/` NOT `R/`. 
Shiny 1.5+ +auto-sources all `.R` files in an app's `R/` directory at startup (before +any session exists), which crashes on references to `output`/`input`/`session`. + +Test suite: `NOT_CRAN=true` required for shinytest2 integration tests. +Run from `inst/Parsimony/`: +```bash +NOT_CRAN=true Rscript -e "testthat::test_dir('tests/testthat')" +``` +`setup.R` loads `library(shinytest2)` for `AppDriver` availability. + +**Important:** Integration tests trigger `pkgbuild::compile_dll(debug=TRUE)` +via `load_all()`. `src/TreeSearch-win.def` prevents linker failures from +corrupted auto-generated `tmp.def` on Windows. + +Module tests: `test-mod-references.R` (4), `test-mod-data.R` (9), +`test-mod-clustering.R` (12), `test-mod-treespace.R` (5), +`test-mod-downloads.R` (11), `test-mod-search.R` (28), +`test-mod-consensus.R` (9). +Integration tests: `test-app-smoke.R` (3), `test-Distribution.R` (13), +`test-SearchLog.R` (4), `test-ViewChars.R` (12). Total: 110 assertions. + +## Version and CRAN status + +- **Version**: 2.0.0 (major bump for new `MaximizeParsimony()` API) +- **R CMD check**: 0 ERRORs, 0 WARNINGs, 1 NOTE (R 4.5.2 internal bug) +- **Test suite**: ~9200 R-level + 1859 ts-* + 128 ParsSim + 37 MaddisonSlatkin + 49 recode-hierarchy pass + +## Key design decisions (reference) + +1. **PreallocUndo** (`ts_tree.h`): Pre-allocated flat buffers for TBR/drift + undo stack. Uses `grow()` to dynamically expand when capacity exceeded + (NA uppass saves both internal nodes and tips). Initial capacity `3 * n_node`. + +2. **TBR symmetry breaking** (`ts_tbr.cpp`): FNV-1a hash deduplication of + `virtual_prelim` vectors to skip redundant rerooting evaluations. + +3. **Bounded indirect scoring**: All search modules use `_bounded` variants + that bail out when accumulated score exceeds best candidate. + +4. **Profile parsimony**: Reuses IW indirect pipeline unchanged; only delta + precomputation differs. `ds.concavity = 1.0` sentinel activates weighted + path. 
Max 2 informative states per character; inapplicable → ambiguous. + +5. **MPT enumeration**: Post-search TBR plateau walk from all pool seeds. + `tbr_search()` accepts optional `TreePool* collect_pool` parameter. + +6. **All-ambiguous phyDat guard**: `TreeLength()` and `MaximizeParsimony()` + check for `levels = NULL` / 0-column contrast matrix before calling C++. + +7. **From-above HTU for sectorial search** (`ts_sector.cpp`): + `compute_from_above_for_sector()` computes `from_above[sector_root]` — + the Fitch state-set the rest of the tree sends *down* to the sector + boundary, excluding the sector's own contribution. Used instead of + `final_[parent]` in `build_reduced_dataset()`. O(depth × total_words). + +8. **Split frequency table** (`ts_pool.h/.cpp`): `SplitFrequencyTable` maps + per-split FNV-1a hash → occurrence count across best-score pool trees. + Used by conflict-guided RSS to weight sector selection. The same FNV-1a + hash (`hash_single_split()` in `ts_splits.h`) is used by consensus + hashing and split frequency counting — must stay consistent. + +9. **Consensus-stability hash** (`ts_pool.cpp`): XOR of FNV-1a hashes of + splits present in ALL best-score trees. Updated after each replicate. + Hash collision false-matches are conservative (over-count stability). + +10. **Diversity-aware pool eviction** (`ts_pool.cpp`): When the pool is full + and a new tree ties the worst score, the entry most similar to the new + tree (most shared splits, counted via per-split FNV-1a hash set + membership) is evicted. This maintains topological diversity in the pool, + improving fusing effectiveness. Falls back to arbitrary worst entry when + the new tree is strictly better. + +11. **Cross-replicate consensus constraint tightening** (`ts_driven.cpp`): + When `consensus_constrain = true` and no user constraint is supplied, + after ≥5 replicates, unanimous pool splits are extracted and enforced + as topological constraints via `build_constraint_from_bitsets()`. 
The + TBR/SPR search then avoids breaking established consensus clades. + Constraints are cleared and rebuilt whenever the best score changes. + Sector/fuse operations do not enforce auto-constraints (no posthoc + DataSet is built). + +## Alternative inapplicable-handling algorithms (in progress) + +Plan: `.positai/plans/2026-03-19-0643-alternative-inapplicable-handling-algorithms.md` + +Adding HSJ (Hopkins & St. John 2021) and step-matrix/x-transformation +(Goloboff et al. 2021) scoring as alternatives to the existing Brazeau +et al. (2019) three-pass algorithm. Both require an explicit character +hierarchy specification. + +### New files + +| File | Purpose | Status | +|------|---------|--------| +| `R/CharacterHierarchy.R` | `CharacterHierarchy` S3 class, `validate_hierarchy()`, `hierarchy_from_names()`, `hierarchy_chars()`, `hierarchy_controlling()`, `non_hierarchy_weights()` | Complete, 34 tests passing | +| `tests/testthat/test-CharacterHierarchy.R` | Unit tests for hierarchy specification + weight partitioning | Complete | +| `src/ts_hsj.h` | `HierarchyBlock` struct (with `absent_state`), `hsj_score()` declaration, `partition_weights()` | Complete | +| `src/ts_hsj.cpp` | `partition_weights()`, `fitch_label_char()` (with uppass), `score_hierarchy_block()`, `hsj_score()` | Complete (full-rescore only; not wired to search pipeline) | +| `src/ts_sankoff.h` | `SankoffChar`, `SankoffData` structs, `sankoff_score()`, `sankoff_score_char()`, `sankoff_uppass()` | Complete | +| `src/ts_sankoff.cpp` | Sankoff downpass, uppass, root forcing | Complete | +| `R/recode_hierarchy.R` | `recode_hierarchy()`: x-transformation recoding (Goloboff et al. 
2021) | Complete, 49 tests | +| `tests/testthat/test-recode-hierarchy.R` | Unit tests for recode_hierarchy() | Complete | +| `inst/REFERENCES.bib` | Added `Goloboff2021` entry | Complete | + +### Modified files + +| File | Change | +|------|--------| +| `DESCRIPTION` | Added `CharacterHierarchy.R` to Collate field | +| `R/MaximizeParsimony.R` | Added `hierarchy`, `inapplicable`, `hsj_alpha` params with validation; non-brazeau methods currently `stop()` with "not yet implemented" | +| `src/ts_data.h` | Added `inapp_state` field to `DataSet` (for HSJ) | +| `src/ts_data.cpp` | Populate `inapp_state` in `build_dataset()` | + +### Design decisions + +- `hierarchy` is a **separate argument** to `MaximizeParsimony()` (not a phyDat attribute) +- `inapplicable` and `hsj_alpha` are **top-level args** alongside `concavity` +- Default `hsj_alpha = 1.0` +- IW + hierarchy and Profile + hierarchy: **deferred** +- Constraint interaction: **ignored** for now +- Resampling: **hierarchical** — resample top-level chars; when a controlling primary is sampled, also resample within its block; recurse for nested hierarchies + +### Resampling with hierarchy (T-124) + +`Resample()` now accepts `hierarchy`, `inapplicable`, and `hsj_alpha` +parameters. When `inapplicable != "brazeau"`, resampling is hierarchy-aware: + +- **Resampling units**: each non-hierarchy character = 1 unit; each + top-level hierarchy block (primary + all dependents) = 1 atomic unit. +- **Jackknife**: retain `proportion` of units without replacement. +- **Bootstrap**: sample `n_units` units with replacement (blocks can be + duplicated). +- Per replicate: `.HierarchicalResampleWeights()` computes pattern weights + for non-hierarchy chars and per-block sample counts. `.ResampleHierarchy()` + calls `ts_driven_search` per replicate with filtered HSJ blocks or xform + chars. +- **No C++ changes**: reuses existing `ts_driven_search` HSJ/xform infrastructure. 
+- **Parallelism**: serial R loop over replicates (C++ inter-search parallelism + via `nThreads` still available within each replicate). Adding C++-level + inter-replicate parallelism is a future optimization. + +### Remaining work (Phase 1c–f) + +1. ~~Pass `absent_token`, `n_tokens` from R to C++~~ **Done** (T-115): `absent_state` in HierarchyBlock, `inapp_state` in DataSet. +2. ~~Partition original characters into hierarchy vs non-hierarchy sets~~ **Done** (T-115): `partition_weights()` (C++) and `non_hierarchy_weights()` (R). +3. ~~Implement `hsj_score()` core algorithm in `ts_hsj.cpp`~~ **Done** (T-116): `fitch_label_char()` + `score_hierarchy_block()` + `hsj_score()`. +4. ~~Add Rcpp bridge function for HSJ scoring in `ts_rcpp.cpp`~~ **Done** (T-116): `ts_hsj_score()` registered in init.c. +5. ~~R-side marshalling~~ **Done** (T-116): `build_tip_labels()`, `hierarchy_to_blocks()`. +6. ~~Remove placeholder `stop()`~~ **Done** (T-117): HSJ wired into `score_tree()` dispatch, `ts_driven_search()` bridge, `MaximizeParsimony()`. End-to-end test against paper examples (T-118) + +### Key algorithm notes (HSJ) + +- Paper's Algorithm 1 initializes `a(l) = p(l) = 0` for all leaves. This is + incorrect for enforcing observed leaf states. Correct initialization: + leaf with primary absent → `a(l) = 0, p(l) = INF`; primary present → + `a(l) = INF, p(l) = 0`. Verified against hand-computed example. +- `score_hierarchy_block()` operates per hierarchy block. Non-hierarchy + characters use standard Fitch. Total = Fitch(non-hierarchy) + Σ HSJ(blocks). +- Secondary character labels at internal nodes from Fitch first-pass + (inapplicable treated as a separate state). +- HSJ is full-rescore only (no incremental variant). Performance mitigation: + candidate screening via Fitch, full HSJ only for promising candidates. + +### Phase 2 (step-matrix) — Complete (end-to-end functional) + +Sankoff engine (`ts_sankoff.h/.cpp`) implements downpass, uppass, root forcing. 
+R-level `recode_hierarchy()` combines primary + secondaries into composite +step-matrix character with asymmetric costs (gain:loss = n+1:1). Multistate +secondaries supported (state count = ∏k_i + 1). Nested hierarchies deferred. +Integration complete: `ScoringMode::XFORM` in `score_tree()` dispatches +Fitch(non-hierarchy) + Sankoff(recoded). `MaximizeParsimony()` accepts +`inapplicable = "xform"`. End-to-end search verified. + +### Stochastic NNI-perturbation (T-186) + +`ts_nni_perturb.h/.cpp` implements a topology-space escape mechanism inspired +by IQ-TREE's `doRandomNNIs()` (Nguyen et al. 2015). Complementary to the +weight-perturbation ratchet: the ratchet reshapes the objective function, while +NNI-perturbation directly displaces the tree topology. + +**Algorithm:** Collect all internal NNI edges. For each edge (with probability +`perturb_fraction`, default 0.5), apply a random NNI swap — but skip edges +adjacent to already-swapped edges (two NNIs conflict if their edges share an +endpoint). Track touched nodes in a hash set. After all compatible swaps, +rebuild postorder and full rescore, then TBR to a new local optimum. Repeat +for `n_cycles`. + +**Pipeline placement:** Between ratchet (phase 4) and drift (phase 5) in +`run_single_replicate()`. Disabled by default (`nniPerturbCycles = 0`). +Previously enabled in the `thorough` preset (5 cycles, 0.5 fraction); +**disabled in all presets since T-274 (2026-03-27)** — see benchmark below. + +**R API:** `SearchControl(nniPerturbCycles, nniPerturbFraction)`. +Timings reported as `nni_perturb_ms`. 
+ +**T-274 benchmark (2026-03-27, Agent F): NNI-perturb disabled in thorough preset.** +Per-replicate sampling, 20 seeds, datasets Zhu2013/Giles2015/Dikow2009 (75–88t): + +| Dataset | nni=0 time | nni=5 time | overhead | EB_30s (nni=0) | EB_30s (nni=5) | EB_60s (nni=0) | EB_60s (nni=5) | +|---------|:----------:|:----------:|:--------:|:--------------:|:--------------:|:--------------:|:--------------:| +| Zhu2013 (75t) | 2.3s | 3.9s | +69% | 638.2 | 638.2 | 638.0 | 638.0 | +| Giles2015 (78t) | 2.2s | 3.5s | +59% | 710.1 | 710.1 | 710.0 | 710.0 | +| Dikow2009 (88t) | 4.2s | 7.0s | +67% | 1611.3 | 1611.2 | 1611.1 | 1611.0 | + +NNI-perturb adds 59–69% per-replicate overhead with ≤0.1-step expected-best +benefit at all budgets — well within bootstrap noise. Time-adjusted expected +best is identical across conditions. Set `nniPerturbCycles = 0` in thorough preset. + +### Biased Wagner addition (T-188, 2026-03-23) + +`biased_wagner_tree()` (`ts_wagner.h/.cpp`) samples the taxon-addition +order from a softmax distribution weighted by informativeness score rather +than purely at random. Two criteria available: + +- **GOLOBOFF** (bias=1): `score[t]` = number of non-ambiguous characters + for taxon t. Ref: Goloboff 2014 (*Extended implied weighting*) §3.3. +- **ENTROPY** (bias=2): `score[t]` = Σ_c (n_states_c − |state set for t|). + +**R API:** `SearchControl(wagnerBias = 0L, wagnerBiasTemp = 0.3)`. +Applied only to the first of `wagnerStarts` starts; remaining starts +use random order to preserve basin diversity. + +**Benchmark results** (2026-03-23, 14 standard + crico-174): +- Wagner→TBR gap reduction: ~80% at 174t (random: 1356 steps, Goloboff: 244) +- Score improvement after TBR convergence: ~22 steps at 174t; 1–2 steps at ≤88t +- Anomalous slight regression at 75–100t (Giles2015, Zanol2014); T=0.3 + stochastic is safer than pure greedy (T=0) + +### Outer search cycle loop (T-189, 2026-03-23) + +`outer_cycles` in `SearchParams` / `outerCycles` in `SearchControl()`. 
+Wraps steps 3–6 of `run_single_replicate()` in a configurable outer +loop: [XSS+RSS+CSS → Ratchet → NNI-perturb → Drift → TBR] × N. +Ratchet/NNI-perturb/drift cycles are divided evenly among N outer cycles +(ceiling division, minimum 1 per cycle). + +`outerCycles = 1` (default) is bit-for-bit identical to the previous +linear pipeline. `thorough` preset defaults to `outerCycles = 2`. + +**Pattern:** Matches TNT's `xmult` interleaving (Goloboff 1999 §2.3): +after each ratchet/drift escape, a fresh XSS pass exploits the new +topology before the next perturbation round. + +**Citation tracking:** see `papers.md` in project root for full +reference list used in optimization work. + +### NNI in the driven pipeline + +`nni_search()` in `ts_search.cpp` is implemented but **never called** in the +driven pipeline. At ≤88 tips, NNI is strictly redundant — TBR subsumes it and +completes in <1s per pass, so there's nothing to save. + +**At 180 tips, NNI becomes essential.** TBR evaluates O(n²) candidates per +pass (358 clips × 356 regrafts × rerooting ≈ millions of evaluations), +and a single convergence from Wagner takes many minutes. NNI evaluates O(n) +candidates per pass (178 edges × 2 swaps = 356), roughly 1000× cheaper. +Most improvements during initial descent are NNI-reachable. + +Proposed escalation: NNI → SPR → TBR, gated on `n_tip > ~100`. +See T-178 in `to-do.md`. + +**Empirical comparison at 180 tips** (mbank_X30754, 3 seeds, EW): + +| Strategy | Median score | Median time | +|----------|:-----------:|:-----------:| +| TBR alone | 1427 | 13.6s | +| SPR→TBR | 1360 | 13.1s | +| **NNI→TBR** | **1326** | **6.8s** | +| NNI→SPR→TBR | 1369 | 8.8s | + +NNI→TBR wins on both score AND time (~2× faster, ~100 steps better than +TBR alone). SPR intermediate step adds time without benefit at this scale. +The NNI descent path leads TBR to better basins of attraction. + +**Recommendation:** `nniFirst = TRUE` (always on — NNI costs ~1.5s at +180 tips, negligible at ≤88 tips). 
Replace `sprFirst` with `nniFirst` +for large trees (n_tip > ~80), or just always run NNI since the overhead +is negligible. SPR warmup is counterproductive at 180 tips. + +**Metric note:** When comparing strategies, the right metric is +**time-adjusted expected best** — the expected minimum score from +k = budget / time_per_rep independent replicates, since multi-start search +keeps the best tree. The median measures typical quality, but a strategy +with high variance and occasional excellent finds can dominate if it gets +enough draws. Bootstrap estimation: sample k scores with replacement, take +the min, repeat 5000×, take the mean. + +**Time-adjusted expected best (5 seeds, EW):** + +| Budget | 88t: TBR | 88t: NNI→SPR→TBR | 180t: TBR | 180t: NNI→SPR→TBR | +|--------|:--------:|:-----------------:|:---------:|:-----------------:| +| 20s | 1617 | 1619 (+2) | 1388 | 1278 (−110) | +| 60s | 1617 | 1619 (+2) | 1348 | 1253 (−95) | +| 120s | 1617 | 1619 (+2) | 1337 | 1247 (−90) | + +At ≤88 tips: NNI has a consistent but negligible 2-step penalty (within +noise of MPT enumeration). At 180 tips: NNI saves 90–110 steps. The +crossover is between 88 and 180 tips. No reactive per-run switching +needed — a simple always-on NNI warmup policy is optimal. + +**Escalation strategy:** NNI→TBR (skip SPR) is simplest and nearly +optimal at all sizes. NNI→SPR→TBR adds ~7 steps at 180 tips for +negligible extra time, but adds complexity. SPR alone (without NNI) +is counterproductive at 180 tips (15s vs 7s, worse scores). + +### Large-tree scaling issues (discovered 2026-03-23) + +The 180-taxon `mbank_X30754` dataset (425 chars, 374 informative patterns, +40% missing, 20% inapplicable) exposed: + +1. ~~**`maxTime` triggers Morphy delegation.**~~ **Fixed (T-184)**: + `maxTime` is now intercepted before the Morphy shim check and + mapped to `maxSeconds` with a deprecation warning. The C++ driven + search is used correctly. 
(Note: initial 180-taxon benchmarking + mistakenly used `maxTime`, which in an older version delegated to + `Morphy()` — ~10× slower. All results since T-184 use `maxSeconds`.) +2. **C++ TBR convergence at 180 tips takes ~13s** (Wagner ~2560 → local + optimum ~1420). NNI warmup (~1.5s) followed by TBR reduces this to + ~7s while finding better scores. T-178 filed. +3. **Strategy presets assume replicate time O(seconds).** At 180 tips, + a single replicate (TBR + XSS + ratchet + drift) takes ~60-100s. + Cycle counts and fuse intervals need recalibration for large trees. + +### Benchmarking methodology notes + +**Early vs late search:** The character of the search changes over time. +Early replicates are dominated by initial descent quality (Wagner → local +optimum); late replicates test ratchet/drift escape effectiveness. At ≤88 +tips, 20s gives 10–40 replicates spanning both regimes. At 180 tips, 20s +doesn't complete one replicate. A warm-start benchmark (T-180) would +isolate the escape-effectiveness question. + +**Generalization to large trees:** All 14 existing benchmark datasets are +≤88 tips. Algorithmic choices (e.g. TBR vs NNI warmup, ratchet cycle counts) +that are optimal at 88 tips may be suboptimal at 180+. The 180-taxon dataset +should be added to the benchmark suite as a separate tier (T-181). + +**Brazeau vs EW scoring confound (T-265, 2026-03-26):** TreeSearch +uses the Brazeau et al. (2019) inapplicable algorithm by default, +which penalizes inapplicable-to-applicable transitions. TNT treats +`-` as `?` (standard EW Fitch). On 11 gap datasets, the apparent +mean gap was +17.8 steps; the actual EW-vs-EW gap is only +2.2 steps +(5 datasets at 0 gap). **All TNT comparisons MUST use `fitch_mode()` +to convert inapplicable to missing** for apples-to-apples scoring. +`fitch_mode()` is defined in `bench_intra_fuse.R` and `bench_t265_regression.R`. 
+ +**`maxTime` confound (2026-03-23):** Initial 180-taxon testing used +`maxTime` (legacy Morphy parameter), which silently delegated to the +R-loop `Morphy()` engine. The C++ driven search (via `maxSeconds`) is +~10× faster at 180 tips. All subsequent profiling used the C++ path. + +**180-taxon baseline (C++ driven search, EW, single replicate):** +- Wagner (best of 3): ~2560 steps, 16ms +- NNI convergence: ~1600 steps, 1.5s +- TBR convergence: ~1330 steps, 7s (from NNI-optimal start) +- XSS: additional ~60 steps improvement, 5s +- Total single replicate: ~25s (before ratchet/drift) + +## Search optimization roadmap + +Plan: `.positai/plans/2026-03-21-search-optimizations.md` + +Ranked by priority: +1. ~~Consensus-guided sector targeting~~ — **Done**: RSS weighted by + pool split conflict scores +2. ~~Diverse pool maintenance~~ — **Done**: evict most-similar entry on ties +3. ~~Cross-replicate constraint tightening~~ — **Done**: opt-in via + `consensusConstrain = TRUE` +4. ~~Collapsed-tree clip skipping~~ — **Done**: zero-length edges skipped + in TBR, SPR, and drift. Benchmark shows 0% skip rate on standard + morphological datasets (Vinther2008, Agnarsson2004, Zhu2013, Dikow2009) + because near-optimal trees have few zero-length edges. Negligible + overhead. Benefit expected primarily on sparse/synthetic data. + Full polytomy search remains post-2.0.0. +5. ~~Collapsed-region regraft merging + pool dedup~~ — **Done**: within + collapsed regions (connected zero-length edges), only the boundary + regraft position is evaluated (Goloboff 1996). Collapsed-topology + pool dedup treats trees differing only in zero-length resolutions as + duplicates. Parallel path also uses collapsed dedup. Diversity-aware + pool eviction selects most-similar entry on ties. +6. ~~Strategy preset tuning~~ — **Done**: `default` preset now uses + `wagnerStarts=3`, `sprFirst=TRUE`, `adaptiveLevel=TRUE`; `thorough` + preset uses `sprFirst=TRUE`. +7. 
~~Ratchet perturbation tuning~~ — **Done**: perturbation probability + increased from 4% to 25%, perturbed TBR moves reduced from auto=20 + to 5, ratchet cycles increased from 5 to 10 (default) and kept at + 20 (thorough). Drift cycles increased from 2 to 4 with wider + acceptance (AFD 5, RFD 0.15). Validated on all 14 datasets. +8. ~~Biased Wagner addition~~ — **Done** (T-188): `wagnerBias` + (0=RANDOM, 1=GOLOBOFF, 2=ENTROPY) + `wagnerBiasTemp` in + `SearchControl()`. First Wagner start uses biased addition order; + remaining starts use random for basin diversity. Benchmarked on + 14 standard + crico-174 datasets: 80% Wagner→TBR gap reduction at + 174t; ~1–2 steps improvement at ≤88t (negligible). Goloboff 2014 + §3.3. +9. ~~Outer search cycle loop~~ — **Done** (T-189): `outerCycles` in + `SearchControl()`. Wraps [XSS+RSS+CSS → Ratchet → NNI-perturb → + Drift → TBR] in configurable outer loop; cycles divided evenly. + `thorough` preset defaults to `outerCycles=2`. Matches TNT xmult + pattern. Goloboff 1999 §2.3. Needs benchmarking to validate + improvement on standard datasets. +10. ~~Drift MPT diversity experiment~~ — **Done** (T-254): Drift + (`driftCycles=2`) provides zero score benefit, zero MPT enumeration + benefit, zero topological diversity benefit, and costs 10–22% of + replicates. On Wortley2006, no-drift consistently finds 4 MPTs vs + 1–3 with drift. On Geisler2001/Zhu2013, mean pairwise RF is + identical (7.3 vs 7.4; 11.6 vs 10.2). Drift delays consensus + stability without improving the answer. **Recommendation:** set + `driftCycles=0` in default and thorough presets (T-255). +11. ~~NNI-perturb cycle count at thorough-preset scale~~ — **Done** (T-274): Per-replicate + sampling, 20 seeds, Zhu2013/Giles2015/Dikow2009 (75–88t). NNI-perturb adds 59–69% + per-replicate overhead with ≤0.1-step expected-best benefit at 30s/60s budgets — within + bootstrap noise. 
**Set `nniPerturbCycles = 0` in thorough preset.** Available via + `SearchControl(nniPerturbCycles = N)` for manual use. +12. ~~Size-weighted TBR clip ordering~~ — **Closed** (2026-03-29, `feature/weighted-clip-order`): + Diagnostic instrumentation added (`TBRPassRecord`, `ts_tbr_diagnostics()`); 10 seeds × 4 + datasets (23–88t, random Wagner starts). **Hypothesis FALSIFIED**: tip clips (~51% of all + clips) account for only 22–38% of accepted moves (enrichment 0.43–0.76×). Medium-small + clips (size 2..√n) are the most productive clip type. All three proposed variants + (INV_WEIGHT, TIPS_FIRST, BUCKET) favour tips — the wrong direction. A "medium-first" + ordering might save ~3–7% per productive pass, but the enrichment (≈1.2×) is too small + to justify implementation complexity. Diagnostic code preserved in branch for reference; + no preset change. Branch closed. +13. ~~XSS↔TBR cycling under IW~~ — **Closed** (2026-03-29, `expt/sector-tbr-cycling`): + 5 datasets (62–180t), 20 seeds, EW/IW10/IW3, time-adjusted expected best (TAEB) analysis at 10–120s budgets. + **Original hypothesis (IW benefits ≥2× more from XSS than EW): weak signal — closed.** + IW3 XSS improvement rate ~30% vs EW ~25%; below 2× threshold; magnitudes small. + **Key finding: XSS cycling benefit scales with tree size, not scoring mode.** + At ≤88t: XSS adds 24–57% overhead, TAEB Δ ≈ 0 (multi-start dominates). + At 180t: XSS adds 12–19% overhead, TAEB Δ = −6.8 to −9.8 EW steps at 30–120s; + IW3 Δ = −0.8 to −1.2 score units; 13/20 seeds improve, ~2 cycles to converge. + **Practice**: no IW-specific XSS treatment; existing pipeline adequate. The large + preset's XSS(3)+RSS(2)+CSS(1) is well-justified; the outerCycles=1 setting means + only one XSS pass per replicate — increasing to outerCycles=2 could capture the + ~2-cycle joint convergence observed at 180t, but this interacts with T-269 + (fine-grained interleaving showed diminishing returns from more outer cycles at + ≤88t).
Results: `TS-sector-expt/dev/benchmarks/expt_tbr_xss_v2_results.rds`. +14. ~~Targeted post-clip sector search~~ — **Closed** (2026-03-29, + `expt/sector-tbr-cycling`): Instrumented `tbr_search()` to run + sector-masked TBR on the just-moved clip subtree after each accepted + move. 5 datasets (62–180t), 20 seeds, EW/IW10/IW3. **Hit rate ~35% + across all scoring modes (no IW-specific benefit). But net HARMFUL:** + mbank_X30754 EW TAEB Δ +17 to +34 steps at 30–120s; Zhu2013/Giles2015 + EW +1–2 steps. IW3 tiny benefit (−0.1 to −0.3). **Mechanism**: local + sector refinement after each move changes the global TBR trajectory, + steering into worse basins. This validates the existing pipeline design: + XSS should run as a separate phase AFTER TBR convergence, not + interleaved within individual TBR moves. Results: + `TS-sector-expt/dev/benchmarks/expt_targeted_sector_results.rds`. + +## Benchmarks and profiling + +### MorphoBank external benchmark corpus + +The neotrans repo (`../neotrans/inst/matrices/`) contains ~800 MorphoBank +NEXUS matrices. These complement the 14 bundled datasets and 1 large-tree +dataset for broader strategy validation. + +**Catalogue:** `dev/benchmarks/mbank_catalogue.csv` (659 usable matrices +after ntax≥20 filter and dedup). Regenerate with +`Rscript dev/benchmarks/build_mbank_catalogue.R`. + +**Train/validation split:** Matrices whose MorphoBank project number is +divisible by 5 are **validation** (124 matrices, ~19%). All others are +**training** (535 matrices). The 7 `syab*` files (non-MorphoBank, from a +Systematic Biology paper) are always training. + +**Dedup:** Multi-file projects with ≥95% character identity on shared taxa +(≥80% taxon overlap) are flagged `dedup_drop = TRUE`. Greedy selection keeps +the largest matrix per redundancy cluster. 24 near-duplicates excluded. + +**IMPORTANT:** Validation results must **never** be used to guide strategy +tuning. They confirm generalization only. This is a one-way door. 
+ +**Fixed 25-matrix training sample:** `MBANK_FIXED_SAMPLE` in +`bench_datasets.R` — 7 small, 7 medium, 7 large, 4 xlarge. Selected via +max-min distance on standardized features. Do not modify. Used by +`benchmark_mbank_sample()`. **Fitch track only** (includes 0%-inapp matrices). + +**Fixed 20-matrix Brazeau-track sample:** `MBANK_BRAZEAU_SAMPLE` in +`bench_datasets.R` — 5 small, 6 medium, 6 large, 3 xlarge. Restricted to +training matrices with **pct_inapp ≥ 4%** (where Brazeau scoring materially +differs from Fitch). Do not modify. + +**Key functions** (in `dev/benchmarks/bench_datasets.R`): +- `load_mbank_catalogue()` — loads metadata CSV (excludes dedup by default) +- `load_mbank_sample(cat, n, seed, split)` — stratified random sample +- `load_mbank_datasets(cat, keys)` — load specific matrices by key +- `load_mbank_brazeau_sample(cat)` — fixed 20-matrix Brazeau sample +- `has_meaningful_inapp(cat, threshold)` — filter to pct_inapp ≥ threshold + +**Benchmark runners** (in `dev/benchmarks/bench_framework.R`): +- `benchmark_mbank_sample()` — fixed 25-matrix training sample (routine) +- `benchmark_mbank_sweep(split)` — full training or validation sweep +- `benchmark_mbank_validation()` — validation sweep with prominent warning + +**Benchmark tracks** — strategy tuning uses two distinct tracks, each run +under EW and IW (k=10): + +| Track | Scoring | Datasets | Purpose | +|-------|---------|----------|---------| +| **Fitch** | `fitch_mode()` | 14 bundled + `MBANK_FIXED_SAMPLE` | TNT comparison, core search quality | +| **Brazeau** | Default (Brazeau 2019) | `MBANK_BRAZEAU_SAMPLE` + bundled | NA-algorithm-specific strategy tuning | + +TNT comparisons are Fitch track only. Do not use validation matrices for +tuning — they are a one-way door. + +**TNT comparison suite** lives in `../TS-TNT-bench/`. 
Key files: +- `dev/benchmarks/bench_tnt_compare.R` — runner (smoke/medium/full) +- `dev/benchmarks/tnt_comparison.qmd` — Quarto report (HTML output) +- `dev/benchmarks/.tnt-bench/` — staging dir for TNT I/O +- Requires TNT 1.6 at `C:/Programs/Phylogeny/tnt/TNT-bin/tnt.exe` + +Benchmark scripts in `dev/benchmarks/`. Key files: +- `bench_regression.R` — CI regression test (score quality + timing bounds) +- `bench_framework.R` — Dataset × strategy × replicate grid +- `strategies.md` — Strategy space documentation (full track/sample details, + seed count protocol, sample-size validation) + +**Phase distribution baselines (T-290b, 2026-03-28, Brazeau-sample datasets, +30s, post-T-255 no-drift presets):** + +| Phase | Fitch/EW/default | Fitch/EW/thorough | Brazeau/EW/default | Brazeau/EW/thorough | +|-------|:---:|:---:|:---:|:---:| +| Ratchet | 76% | 65% | 74% | 63% | +| TBR | 8% | 5% | 7% | 4% | +| XSS | 6% | 7% | 5% | 6% | +| RSS | 3% | 10% | 3% | 10% | +| CSS | — | 7% | — | 7% | +| Wagner | 4% | 3% | 9% | 7% | +| Final TBR | 2% | 2% | 2% | 2% | + +*(Drift has been 0% in all presets since T-255. The 2026-03-22 baselines +showing Drift at 24–32% are obsolete.)* + +**Brazeau / Fitch per-phase cost ratios (T-290b, EW):** + +| Phase | default | thorough | +|-------|:-------:|:--------:| +| Wagner | **3.6×** | **3.9×** | +| Ratchet | 1.3× | 1.3× | +| RSS/CSS | 1.3× | 1.3× | +| TBR | 0.9× | 0.9× | + +Wagner is the outlier. All other phases are within 0.9–1.4× of Fitch cost. +Replicate rate under Brazeau is 95–97% of Fitch (landscape not harder). + +**wagnerStarts under Brazeau (T-290b/c, 2026-03-28):** Regime depends on +budget/replicate-time ratio: +- *Multiple reps/budget*: wagnerStarts=1 and 3 near-equivalent (w3 marginally better). +- *~1 rep/budget* (60s at 86t/3660c): wagnerStarts=3 better by 564 steps + — better starting topology pays off.
+- *0 reps/budget* (30s at 86t/3660c): wagnerStarts=1 **better** — Brazeau + Wagner is expensive (~4×), 3 starts consume budget; more TBR time > better + topology at tight budgets. +Current presets are correct: thorough (w3, gets ≥1 rep at 65–119t) ✓; +large (w1, gets 0–1 reps at 120t+) ✓. + +Per-candidate indirect scoring is at the memory-throughput limit (~23 ns at +75 tips). + +## VTune driver scripts — dry-run first + +**Always test a VTune driver script with plain `Rscript` before launching +VTune.** Software-sampling overhead can be 5–20×; if the bare script takes +30 s, VTune may need 10 min. Target < 5 s bare run for a lite driver. + +MaddisonSlatkin is exponential in tip count — even n=20 with k=3 can take +seconds per call. Use small n (≤ 15 for k=3, ≤ 12 for k=4, ≤ 9 for k=5) +and few iterations for VTune drivers. + +### Ratchet tuning validation (2026-03-22) + +Full 14-dataset comparison, optimized vs original defaults (10s budget, +3 seeds each). Median scores shown (lower is better); Delta = Original − Optimized, so positive values are improvements. + +> **Metric note:** Median per-replicate score is adequate for comparing +> parameter changes on a fixed pipeline (same time-per-rep). For comparing +> strategies with different time costs (e.g. NNI→TBR vs TBR), use +> **time-adjusted expected best** instead — see "Metric note" under +> "NNI in the driven pipeline".
+ +| Dataset | Tips | Original | Optimized | Delta | +|---------|:---:|:---:|:---:|:---:| +| Longrich2010 | 20 | 131 | 131 | 0 | +| Vinther2008 | 23 | 79 | 79 | 0 | +| Sansom2010 | 23 | 189 | 189 | 0 | +| DeAssis2011 | 33 | 64 | 64 | 0 | +| Aria2015 | 35 | 143 | 143 | 0 | +| Wortley2006 | 37 | 494 | 491 | +3 | +| Griswold1999 | 43 | 408 | 407 | +1 | +| Schulze2007 | 52 | 165 | 164 | +1 | +| Eklund2004 | 54 | 442 | 441 | +1 | +| Agnarsson2004 | 62 | 778 | 778 | 0 | +| Zanol2014 | 74 | 1338 | 1331 | +7 | +| Zhu2013 | 75 | 649 | 650 | −1 | +| Giles2015 | 78 | 720 | 716 | +4 | +| Dikow2009 | 88 | 1614 | 1614 | 0 | + +Zhu2013 marginal regression at 10s resolves at 20s (median 649→644). +At 20s with 5 seeds: Zhu2013 645/643, Giles2015 712/710, Dikow2009 +1611/1611 (all improvements). diff --git a/DESCRIPTION b/DESCRIPTION index f3f0cc2da..e351a0b99 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: TreeSearch Title: Phylogenetic Analysis with Discrete Character Data -Version: 1.8.0.9000 +Version: 2.0.0 Authors@R: c( person( "Martin R.", 'Smith', @@ -18,15 +18,17 @@ License: GPL (>= 3) Copyright: Incorporates C/C++ code from Morphy Phylogenetic Library by Martin Brazeau (GPL3) -Description: Reconstruct phylogenetic trees from discrete data. +Description: Reconstruct phylogenetic trees from discrete data using a + high-performance C++ search engine with multi-replicate driven search. + Supports equal weights, implied weights (Goloboff, 1993) + and profile parsimony + (Faith and Trueman, 2001) . Inapplicable character states are handled using the algorithm of Brazeau, - Guillerme and Smith (2019) with the "Morphy" - library, under equal or implied step weights. + Guillerme and Smith (2019) . Contains a "shiny" user interface for interactive tree search and exploration of results, including character visualization, rogue taxon detection, tree space mapping, and cluster consensus trees (Smith 2022a, b) , . 
- Profile Parsimony (Faith and Trueman, 2001) , Successive Approximations (Farris, 1969) and custom optimality criteria are implemented. URL: https://ms609.github.io/TreeSearch/ (doc), @@ -36,36 +38,33 @@ Depends: R (>= 4.0) Imports: abind, ape (>= 5.6), - base64enc, cli (>= 3.0), - cluster, colorspace, - fastmap, fastmatch (>= 1.1.3), - fs, - future, - PlotTools, - promises, - protoclust, Rcpp, Rdpack (>= 0.7), - Rogue (> 2.0.0), - shiny (>= 1.6.0), - shinyjs, stats, - stringi, TreeDist (>= 2.6.3), TreeTools (>= 1.16), Suggests: + cluster, + future, knitr, phangorn (>= 2.2.1), + PlotTools, + promises, + protoclust, Quartet, readxl, rmarkdown, - shinytest, + Rogue (> 2.0.0), + shiny (>= 1.8.1), + shinyjs, + shinytest2, spelling, testthat, vdiffr (>= 1.0.0), + zip, Config/Needs/check: callr, pkgbuild, @@ -78,6 +77,50 @@ Config/Needs/website: curl, igraph, pkgdown, +Collate: + 'AdditionTree.R' + 'Bootstrap.R' + 'CharacterHierarchy.R' + 'ClusterStrings.R' + 'Concordance.R' + 'Consistency.R' + 'CustomSearch.R' + 'IWScore.R' + 'ImposeConstraint.R' + 'Jackknife.R' + 'SearchControl.R' + 'MaximizeParsimony.R' + 'Morphy.R' + 'NNI.R' + 'ParsSim.R' + 'PlotCharacter.R' + 'PolEscapa.R' + 'PresentContra.R' + 'QuartetResolution.R' + 'RandomTreeScore.R' + 'Ratchet.R' + 'RcppExports.R' + 'ts-driven-compat.R' + 'ReleaseQuestions.R' + 'recode_hierarchy.R' + 'SPR.R' + 'ScoreSpectrum.R' + 'Sectorial.R' + 'SuccessiveApproximations.R' + 'TBR.R' + 'TaxonInfluence.R' + 'TreeSearch_utilities.R' + 'WhenFirstHit.R' + 'WideSample.R' + 'data.R' + 'data_manipulation.R' + 'length_range.R' + 'mpl_morphy_objects.R' + 'mpl_morphyex.R' + 'pp_info_extra_step.r' + 'tree_length.R' + 'tree_rearrangement.R' + 'zzz.R' RdMacros: Rdpack LinkingTo: Rcpp, diff --git a/NAMESPACE b/NAMESPACE index e542e3959..394e79bd2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,12 +18,16 @@ S3method(TreeLength,list) S3method(TreeLength,multiPhylo) S3method(TreeLength,numeric) S3method(TreeLength,phylo) 
+S3method(print,CharacterHierarchy) +S3method(print,ScoreSpectrum) +S3method(print,SearchControl) S3method(summary,morphyPtr) export(.NonDuplicateRoot) export(.UniqueExceptHits) export(AdditionTree) export(C_MorphyLength) export(Carter1) +export(CharacterHierarchy) export(CharacterLength) export(ClusterStrings) export(ClusteringConcordance) @@ -49,9 +53,13 @@ export(Jackknife) export(LengthAdded) export(Log2Carter1) export(LogCarter1) +export(MaddisonSlatkin) +export(MaddisonSlatkin_clear_cache) export(MaximizeParsimony) +export(MaximizeParsimony2) export(MaximumLength) export(MinimumLength) +export(Morphy) export(MorphyBootstrap) export(MorphyErrorCheck) export(MorphyLength) @@ -62,6 +70,7 @@ export(MultiRatchet) export(MutualClusteringConcordance) export(NNI) export(NNISwap) +export(ParsSim) export(PhyDat2Morphy) export(PhylogeneticConcordance) export(PlotCharacter) @@ -90,6 +99,8 @@ export(SPR) export(SPRMoves) export(SPRSwap) export(SPRWarning) +export(ScoreSpectrum) +export(SearchControl) export(SetMorphyWeights) export(SharedPhylogeneticConcordance) export(SingleCharMorphy) @@ -107,10 +118,17 @@ export(TreeLength) export(TreeSearch) export(UnloadMorphy) export(WhenFirstHit) +export(WideSample) export(WithOneExtraStep) +export(build_tip_labels) export(cNNI) export(cSPR) +export(hierarchy_chars) +export(hierarchy_controlling) +export(hierarchy_from_names) +export(hierarchy_to_blocks) export(is.morphyPtr) +export(mc_fitch_scores) export(mpl_apply_tipdata) export(mpl_attach_rawdata) export(mpl_attach_symbols) @@ -134,10 +152,11 @@ export(mpl_set_parsim_t) export(mpl_translate_error) export(mpl_update_lower_root) export(mpl_update_tip) -importFrom(PlotTools,SpectrumLegend) +export(non_hierarchy_weights) +export(recode_hierarchy) +export(validate_hierarchy) importFrom(Rcpp,compileAttributes) importFrom(Rdpack,reprompt) -importFrom(Rogue,ColByStability) importFrom(TreeDist,ClusteringEntropy) importFrom(TreeDist,ClusteringInfo) 
importFrom(TreeDist,ClusteringInfoDistance) @@ -145,7 +164,6 @@ importFrom(TreeDist,Entropy) importFrom(TreeDist,MutualClusteringInfo) importFrom(TreeDist,SharedPhylogeneticInfo) importFrom(TreeDist,entropy_int) -importFrom(TreeTools,AddTipEverywhere) importFrom(TreeTools,AddUnconstrained) importFrom(TreeTools,CharacterInformation) importFrom(TreeTools,CladeSizes) @@ -160,6 +178,7 @@ importFrom(TreeTools,DropTip) importFrom(TreeTools,EdgeAncestry) importFrom(TreeTools,ImposeConstraint) importFrom(TreeTools,KeepTip) +importFrom(TreeTools,LnUnrooted) importFrom(TreeTools,Log2DoubleFactorial) importFrom(TreeTools,Log2Unrooted) importFrom(TreeTools,Log2UnrootedMult) @@ -185,6 +204,7 @@ importFrom(TreeTools,Renumber) importFrom(TreeTools,RenumberEdges) importFrom(TreeTools,RenumberTips) importFrom(TreeTools,RenumberTree) +importFrom(TreeTools,RootNode) importFrom(TreeTools,RootTree) importFrom(TreeTools,SampleOne) importFrom(TreeTools,SplitConflicts) @@ -204,7 +224,6 @@ importFrom(ape,plot.phylo) importFrom(ape,read.nexus) importFrom(ape,root) importFrom(ape,write.nexus) -importFrom(base64enc,base64encode) importFrom(cli,cli_alert) importFrom(cli,cli_alert_danger) importFrom(cli,cli_alert_info) @@ -214,31 +233,24 @@ importFrom(cli,cli_h1) importFrom(cli,cli_progress_bar) importFrom(cli,cli_progress_done) importFrom(cli,cli_progress_update) -importFrom(cluster,pam) -importFrom(cluster,silhouette) importFrom(colorspace,hex) importFrom(colorspace,max_chroma) importFrom(colorspace,polarLUV) -importFrom(fastmap,fastmap) importFrom(fastmatch,"%fin%") importFrom(fastmatch,fmatch) -importFrom(fs,path_sanitize) -importFrom(future,future) importFrom(graphics,abline) importFrom(graphics,image) importFrom(graphics,mtext) importFrom(graphics,par) -importFrom(promises,future_promise) -importFrom(protoclust,protoclust) -importFrom(shiny,runApp) -importFrom(shinyjs,useShinyjs) importFrom(stats,as.dist) importFrom(stats,cutree) +importFrom(stats,dnorm) importFrom(stats,median) 
importFrom(stats,runif) +importFrom(stats,sd) importFrom(stats,setNames) importFrom(stats,weighted.mean) -importFrom(stringi,stri_paste) importFrom(utils,adist) importFrom(utils,combn) +importFrom(utils,head) useDynLib(TreeSearch, .registration = TRUE) diff --git a/NEWS.md b/NEWS.md index e7cab983d..64832fdf1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,217 @@ -# TreeSearch 1.8.0.9000 (2026-02-05) +# TreeSearch 2.0.0 + +## Breaking changes + +- Implied weighting now applies the missing-entries correction of + Goloboff (2014) by default (`extended_iw = TRUE`). Characters with + many missing entries receive a reduced effective concavity, compensating + for artificially low observed homoplasy. Set `extended_iw = FALSE` to + reproduce pre-2.0.0 behaviour. +- `MaximizeParsimony()` has an entirely new parameter interface. + The previous `MaximizeParsimony()` (R-loop search using MorphyLib) has been + renamed to `Morphy()`. + Code that passes Morphy-style parameters (e.g. `ratchIter`, `tbrIter`, + `maxHits`) to `MaximizeParsimony()` will be automatically forwarded to + `Morphy()` with a deprecation warning. + Update your code to call `Morphy()` directly, or adopt the new + `MaximizeParsimony()` parameters. + This compatibility shim will be removed in a future release. + +## C++ search engine + +`MaximizeParsimony()` is rewritten from the ground up with a native C++ search +engine, replacing the R-loop/MorphyLib backend for equal weights, implied +weights, and profile parsimony. Typical searches are an order of magnitude +faster; inapplicable character handling (Brazeau _et al._ 2019) is built in. + +### New features + +- **`ScoreSpectrum()`**: Chao1-style landscape coverage estimator. Treats + distinct parsimony scores found across replicates as "species" and estimates + how thoroughly the parsimony landscape has been sampled (Good-Turing sample + coverage, Chao1 richness lower bound, unseen score-level fraction). 
The + Shiny app's confidence panel now displays the coverage estimate when + sufficient replicates have been completed. `MaximizeParsimony()` now + returns a `replicate_scores` attribute containing per-replicate local-optimum + scores for this purpose. + +- **Multi-replicate driven search** pipeline: random Wagner tree → TBR → + sectorial search (XSS, RSS, CSS) → ratchet → drift → tree fusing → + final TBR. +- **Parallel search** via `nThreads`: replicates run on independent threads + with a shared tree pool. +- **Timeout** via `maxSeconds`. +- **User-supplied starting tree**: when a `tree` argument is provided, the + first replicate begins from that topology; subsequent replicates use + random Wagner trees. +- **Adaptive strategy presets** via `strategy`: `"auto"` (default) selects + `"sprint"`, `"default"`, or `"thorough"` based on the number of tips. + Explicit parameters always override preset values. +- **Profile parsimony** runs natively in C++; no longer delegates to + `Morphy()`. +- **Topological constraints** enforced natively in C++ (including during + Wagner tree construction and sectorial search). +- **Per-phase timing** returned as a `timings` attribute on the result. +- **MPT enumeration**: after the main search converges, a TBR plateau walk + from each pool tree discovers additional most-parsimonious topologies on the + same and neighbouring score plateaus, up to `poolMaxSize`. + This addresses a common complaint that the previous implementation returned + only one tree when multiple MPTs exist. + +### New parameters for `MaximizeParsimony()` + +- `strategy` — `"auto"` (default), `"sprint"`, `"default"`, `"thorough"`, + or `"none"`. +- `nThreads` — number of parallel worker threads (default 1). +- `maxSeconds` — wall-clock timeout (0 = no limit). +- `sprFirst` — run SPR before TBR in each replicate. +- `ratchetPerturbMode`, `ratchetPerturbMaxMoves`, `ratchetAdaptive` — + configure ratchet perturbation (zero-weight, up-weight, mixed, adaptive). 
+- `driftCycles`, `driftAfdLimit`, `driftRfdLimit` — drift search parameters. +- `xssRounds`, `xssPartitions`, `rssRounds`, `cssRounds`, `cssPartitions`, + `sectorMinSize`, `sectorMaxSize` — sectorial search parameters. +- `fuseInterval`, `fuseAcceptEqual` — tree fusing parameters. +- `poolMaxSize`, `poolSuboptimal` — tree pool management. +- `tbrMaxHits`, `wagnerStarts`, `tabuSize`. +- `nniFirst` — NNI warmup pass before SPR/TBR in each replicate; at + ≥100 tips this substantially improves the Wagner starting-tree quality + at negligible cost for small datasets. +- `postRatchetSectorial` — run a second XSS+RSS+CSS pass after ratchet + perturbation; approximates TNT's interleaved sectorial pattern. + Enabled by default in the `"thorough"` preset. +- `outerCycles`, `maxOuterResets` — repeat the full + \[XSS/RSS/CSS → ratchet → NNI-perturbation → drift → TBR\] sequence + _n_ times per replicate; budget is divided evenly. Enabled in the + `"thorough"` preset (`outerCycles = 2`). +- `wagnerBias`, `wagnerBiasTemp` — bias taxon addition order during Wagner + tree construction toward taxa with more informative characters + (Goloboff 2014), substantially improving starting-tree quality at large + tip counts. +- `perturbStopFactor` — stop after `nTip × perturbStopFactor` consecutive + replicates that fail to improve the best score; provides 2–7× speedup on + converged searches at no score cost. +- `pruneReinsertCycles`, `pruneReinsertDrop`, `pruneReinsertSelection` — + taxon pruning-reinsertion perturbation: drop a fraction of leaves, let + the backbone re-optimise with TBR, then reinsert taxa greedily. + Complementary to the ratchet (which perturbs character weights). +- `nniPerturbCycles`, `nniPerturbFraction` — stochastic NNI-perturbation: + randomly apply NNI swaps to a fraction of internal branches and + reconverge, escaping local optima without altering character weights. 
+- `annealCycles`, `annealPhases`, `annealTStart`, `annealTEnd`, + `annealMovesPerPhase` — multi-cycle PCSA (simulated annealing + perturbation) phase. +- `adaptiveLevel` — dynamically scale ratchet and drift effort per + replicate based on the observed hit rate. +- `adaptiveStart` — Thompson-sampling bandit strategy for starting-tree + selection; adapts over replicates to which strategies yield best scores. +- `enumTimeFraction` — fraction of `maxSeconds` reserved for the MPT + plateau enumeration walk at the end of the search (default 10%). +- `intraFuse` — within-replicate tree fusing against pool donors after TBR + polish; approximates TNT's within-replicate fusing pattern. +- `ratchetTaper` — gradually reduce ratchet perturbation probability as + the pool stabilises, allowing finer local exploration late in the search. +- `consensusConstrain` — lock pool-consensus splits as topological + constraints for subsequent replicates. +- `consensusStableReps` — stop when the strict consensus is unchanged for + this many consecutive replicates (0 = disabled; set e.g. 3 to enable). +- `progressCallback` — R function called after each replicate (for custom + progress reporting). + +### Search output + +- **Convergence summary**: when `verbosity > 0` (the default), + `MaximizeParsimony()` now prints a one-line summary on exit reporting the + best score, number of replicates completed, replicates since last + improvement, number of distinct MPTs found, stop reason (time limit, + target hits, perturbation-stop, or user interrupt), and elapsed time. + The same information is available as named attributes on the returned + tree list. + +### Search optimizations + +- **Collapsed-edge clip skipping**: TBR, SPR, and drift search skip + clips at zero-length edges that provably cannot improve the score, + reducing unnecessary evaluations on sparse data. +- **Conflict-guided sectorial search**: random sectorial search targets + sectors around splits that conflict across pool trees. 
+- **Diversity-aware pool eviction**: when the tree pool is full, the most + topologically similar entry is evicted to maintain diversity. +- **Cross-replicate consensus constraint tightening**: opt-in via + `consensusConstrain = TRUE` in `SearchControl()`. +- **Consensus-stability early stopping**: when `consensusStableReps > 0` in + `SearchControl()`, search stops when the strict consensus of best-score + pool trees has been unchanged for that many consecutive replicates. + Disabled by default. + +### Batch resampling + +- `Resample()` gains `nReplicates` and `nThreads` parameters for batch and + parallel jackknife/bootstrap resampling via a single C++ call. +- `SuccessiveApproximations()` gains `concavity` and `constraint` parameters. + +## Profile parsimony: multi-state support + +- Profile parsimony now supports characters with up to 5 informative states + (previously limited to 2). Characters with 3--5 states use the recursive + algorithm of Maddison & Slatkin (1991). +- New C++ function `MaddisonSlatkin()` computes the number of labelled + histories for multi-state characters. + +## Data simulation + +- New function `ParsSim()` simulates morphological datasets under a parsimony + model (equal weights, implied weights, or profile parsimony). Each + character starts at minimum steps; extra steps are placed one at a time, + verified to increase the Fitch score by exactly 1. + +## Scoring + +- `TreeLength()` and `CharacterLength()` / `FastCharacterLength()` use the + C++ engine for all scoring modes (equal weights, implied weights, profile + parsimony). + +## Backend changes + +- `TaxonInfluence()` now uses `MaximizeParsimony()` internally. +- `AdditionTree()` now uses the C++ Wagner tree engine, with native support + for implied weights, profile parsimony, and constraints. + +## Bug fixes + +- Fix output trees from `MaximizeParsimony()` having invalid preorder + numbering (affected `DropTip()`, distance calculations, and plotting).
+- Fix `fuseInterval = 0` causing a crash (division by zero). +- Fix `is_uninformative()` misclassifying ambiguous characters as + uninformative. +- Fix `compute_fixed_steps()` undercount for all-ambiguous characters. +- Fix IW scoring with missing `min_steps` offset. +- Fix crash when dataset contains only ambiguous (`?`) tokens. + +## Custom search functions + +- `Ratchet()`, `MultiRatchet()`, `Jackknife()`, `MorphyBootstrap()`, and + `TreeSearch()` are no longer deprecated. These functions support pluggable + `TreeScorer` and `EdgeSwapper` functions for custom scoring strategies; + for standard parsimony, use `MaximizeParsimony()`. + +## App improvements (`EasyTrees()`) + +- **Async search**: the session remains responsive while a search is running. +- **Parallel search**: the search settings modal includes a thread count slider + (when multiple cores are available). +- **Tree accumulation**: repeated "Continue search" runs accumulate trees at + the same optimal score, with de-duplication by topology. +- **Search confidence**: after each search, the results pane shows the hit rate + and an estimate of the replicates needed for 95% confidence. +- **Search config modal** reorganized into labelled sections (step weighting, + parallelization, search intensity, results to keep). +- Fix `PlotCharacter()` crash on multifurcating consensus trees. +- Fix first search not appearing to update trees in memory. +- Clarified "Stop after best score found N times" slider label with help text. +- Dataset-adaptive timeout default (1–15 minutes based on dataset size). +- Internal modularization of the Shiny app into proper Shiny modules. + +## Other improvements - New parameters for flexible plotting of `QALegend()`. - `ConcordanceTable()` gains `plot` parameter. 
diff --git a/R/AdditionTree.R b/R/AdditionTree.R index d6a63605b..22f742c6c 100644 --- a/R/AdditionTree.R +++ b/R/AdditionTree.R @@ -12,9 +12,7 @@ #' @template MRS #' @return `AdditionTree()` returns a tree of class `phylo`, rooted on #' `sequence[1]`. -#' @importFrom TreeTools AddUnconstrained AddTipEverywhere MatrixToPhyDat -#' PectinateTree -#' @importFrom cli cli_progress_bar cli_progress_update +#' @importFrom TreeTools PectinateTree Renumber #' @family tree generation functions #' @seealso #' @@ -26,113 +24,93 @@ #' [`TreeTools::ConstrainedNJ()`]( #' https://ms609.github.io/TreeTools/reference/ConstrainedNJ) #' @export -AdditionTree <- function (dataset, concavity = Inf, constraint, sequence) { - - # Initialize missing parameters +AdditionTree <- function(dataset, concavity = Inf, constraint, sequence) { + taxa <- names(dataset) + nTaxa <- length(taxa) + + if (nTaxa < 4L) { + return(PectinateTree(taxa)) + } + + # Build addition order if (missing(sequence)) { sequence <- taxa[[1]] } else if (is.numeric(sequence)) { sequence <- taxa[sequence] } - - nTaxa <- length(taxa) - if (length(taxa) < 4) { - return(PectinateTree(taxa)) - } - unlisted <- setdiff(taxa, sequence) - if (length(unlisted) > 0) { + if (length(unlisted) > 0L) { sequence <- c(sequence, sample(unlisted)) } - if (!missing(constraint)) { - constraint <- AddUnconstrained(constraint, taxa) - } - - # PrepareDataXXX attributes only valid for full dataset - attr(dataset, "info.amounts") <- NULL - attr(dataset, "min.length") <- NULL - attr(dataset, "informative") <- NULL - attr(dataset, "originalIndex") <- NULL - - # Starting tree, rooted on first element in sequence - tree <- PectinateTree(sequence[1:3]) - - cli_progress_bar("Addition tree", total = sum(2 * (4:nTaxa) - 5)) - for (addition in sequence[4:nTaxa]) { - candidates <- AddTipEverywhere(tree, addition) - nCands <- length(candidates) - - theseTaxa <- candidates[[1]][["tip.label"]] - theseData <- .Recompress(dataset[theseTaxa]) - if 
(is.finite(concavity)) { - theseData <- PrepareDataIW(theseData) - } else if (is.character(concavity)) { - theseData <- suppressMessages(PrepareDataProfile(theseData)) - } - - if (!missing(constraint)) { - if (!inherits(constraint, "phyDat")) { - if (is.numeric(constraint) && is.null(dim(constraint))) { - constraint <- t(constraint) - } - constraint <- MatrixToPhyDat(t(as.matrix(constraint))) - } - thisConstr <- constraint[theseTaxa] - if (.ConstraintConstrains(thisConstr)) { - # Constraint constrains theseTaxa - - morphyConstr <- PhyDat2Morphy(thisConstr) - # Calculate constraint minimum score - constraintLength <- sum(MinimumLength(thisConstr, compress = TRUE) * - attr(thisConstr, "weight")) - - .Forbidden <- function (edges) { - preorder_morphy(edges, morphyConstr) != constraintLength - } - - - candidates <- candidates[!vapply(lapply(candidates, `[[`, "edge"), - .Forbidden, logical(1))] - UnloadMorphy(morphyConstr) - } - } - - # Score remaining candidates - if (length(theseData)) { - scores <- TreeLength(candidates, theseData, concavity) - minScore <- which.min(scores) - nMin <- length(minScore) - if (nMin > 1) { - minScore <- minScore[sample.int(nMin, 1)] - } - tree <- candidates[[minScore]] - } else { - tree <- sample(candidates, 1)[[1]] + addition_order <- match(sequence, taxa) + + # Profile parsimony: simplify data and extract info_amounts + useProfile <- !missing(concavity) && identical(concavity, "profile") + profileArgs <- list() + if (useProfile) { + dataset <- PrepareDataProfile(dataset) + infoAmounts <- attr(dataset, "info.amounts") + if (!is.null(infoAmounts) && length(infoAmounts) > 0L) { + profileArgs$infoAmounts <- infoAmounts } - cli_progress_update(nCands) + concavity <- Inf + } + if (is.finite(concavity) && concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") } - tree + + # Extract data matrices + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- 
matrix(unlist(dataset, use.names = FALSE), + nrow = nTaxa, byrow = TRUE) + weight <- at$weight + levels <- at$levels + + # Constraint + consArgs <- list() + if (!missing(constraint)) { + consArgs <- .PrepareConstraint(constraint, dataset) + } + + # Call C++ Wagner tree + searchArgs <- list( + contrast = contrast, + tip_data = tip_data, + weight = weight, + levels = levels, + addition_order = addition_order, + concavity = as.double(concavity) + ) + result <- do.call(ts_wagner_tree, c(searchArgs, consArgs, profileArgs)) + + # Reconstruct phylo from edge matrix + tree <- list( + edge = result$edge, + tip.label = taxa, + Nnode = nTaxa - 1L + ) + class(tree) <- "phylo" + Renumber(tree) } .ConstraintConstrains <- function(constraint) { + if (is.null(constraint) || length(constraint) == 0L) return(FALSE) if (length(constraint[[1]]) < 1) { FALSE } else { contrast <- attr(constraint, "contrast") - if (dim(contrast)[[2]] < 2) { + if (is.null(contrast) || dim(contrast)[[2]] < 2) { FALSE } else { cont <- `mode<-`(contrast, "logical") nLevel <- dim(contrast)[[1]] - # Could be > 2× more efficient using lower.tri exclude <- vapply(seq_len(nLevel), function(i) { colSums(apply(cont, 1, `&`, cont[i, ])) == 0 }, logical(nLevel)) - - # TODO Validate; passes existing tests, but these do not include all - # edge cases, e.g. 
02 03 1 1 splits <- exclude * tabulate(unlist(constraint), nLevel) any(splits[lower.tri(splits)] > 1 & t(splits)[lower.tri(splits)] > 1) } @@ -141,5 +119,5 @@ AdditionTree <- function (dataset, concavity = Inf, constraint, sequence) { .Recompress <- function(dataset) { - MatrixToPhyDat(PhyDatToMatrix(dataset)) + TreeTools::MatrixToPhyDat(TreeTools::PhyDatToMatrix(dataset)) } diff --git a/R/CharacterHierarchy.R b/R/CharacterHierarchy.R new file mode 100644 index 000000000..b089cfc19 --- /dev/null +++ b/R/CharacterHierarchy.R @@ -0,0 +1,539 @@ +#' Define character hierarchy for inapplicable data +#' +#' Specify the dependency structure between characters in a morphological +#' dataset that uses reductive coding. A "controlling primary" character +#' (typically presence/absence of a structure) determines whether its +#' associated "secondary" characters are applicable. Secondary characters +#' can in turn control tertiary characters, and so on. +#' +#' This hierarchy is required by the HSJ +#' \insertCite{Hopkins2021}{TreeSearch} and step-matrix +#' \insertCite{Goloboff2021}{TreeSearch} approaches to inapplicable +#' characters, and is passed to [`MaximizeParsimony()`] via the `hierarchy` +#' argument. +#' +#' @param ... Named arguments where each name is the index of a controlling +#' character (coerced to integer) and each value is an integer vector of +#' the character indices it controls. Use nested [`list()`]s for deeper +#' hierarchies (see Examples). +#' +#' @return An object of class `"CharacterHierarchy"`. 
+#' +#' @examples +#' # Simple: character 1 controls characters 2-5 +#' h <- CharacterHierarchy("1" = 2:5) +#' +#' # Multiple controlling primaries +#' h <- CharacterHierarchy("1" = 2:5, "6" = 7:8) +#' +#' # Nested: char 1 controls 2-5; char 3 further controls 9-10 +#' h <- CharacterHierarchy("1" = list(2, 3, 4, 5, "3" = 9:10)) +#' +#' @references +#' \insertAllCited{} +#' @family tree scoring +#' @seealso [MaximizeParsimony()], [hierarchy_from_names()] +#' @export +CharacterHierarchy <- function(...) { + args <- list(...) + if (length(args) == 0L) { + stop("At least one controlling character must be specified.") + } + tree <- .ParseHierarchyArgs(args) + structure(tree, class = "CharacterHierarchy") +} + +# Parse user args into a normalized tree structure. +# Returns a list of nodes, each: +# list(controlling = int, dependents = int[], children = list(, ...)) +# "children" are sub-hierarchies (controlling secondaries). +.ParseHierarchyArgs <- function(args) { + if (is.null(names(args)) || any(names(args) == "")) { + stop("Every element of `...` must be named with the controlling ", + "character index.") + } + controlling_indices <- suppressWarnings(as.integer(names(args))) + if (anyNA(controlling_indices)) { + stop("Controlling character names must be integer indices.") + } + + lapply(seq_along(args), function(i) { + ctrl <- controlling_indices[i] + val <- args[[i]] + .ParseOneBlock(ctrl, val) + }) +} + +# Parse a single controlling-character block. 
+# val can be: +# - integer vector: simple list of dependent character indices +# - list with mixed named/unnamed elements: unnamed = dependents, +# named = sub-hierarchies (controlling secondaries) +.ParseOneBlock <- function(ctrl, val) { + if (is.numeric(val) && is.null(names(val))) { + # Simple case: vector of dependent indices + return(list( + controlling = as.integer(ctrl), + dependents = as.integer(val), + children = list() + )) + } + if (is.list(val)) { + nms <- names(val) + if (is.null(nms)) nms <- rep("", length(val)) + dependents <- integer(0) + children <- list() + for (j in seq_along(val)) { + if (nms[j] == "") { + # Unnamed: a dependent character index + dependents <- c(dependents, as.integer(val[[j]])) + } else { + # Named: a sub-hierarchy + sub_ctrl <- suppressWarnings(as.integer(nms[j])) + if (is.na(sub_ctrl)) { + stop("Sub-hierarchy names must be integer character indices, got '", + nms[j], "'.") + } + # The sub-controlling character is also a dependent of this block + dependents <- c(dependents, sub_ctrl) + children <- c(children, list(.ParseOneBlock(sub_ctrl, val[[j]]))) + } + } + return(list( + controlling = as.integer(ctrl), + dependents = dependents, + children = children + )) + } + # Scalar + list( + controlling = as.integer(ctrl), + dependents = as.integer(val), + children = list() + ) +} + +#' @export +print.CharacterHierarchy <- function(x, ...) 
{ + cat("CharacterHierarchy\n") + .PrintBlock <- function(node, indent = 1L) { + pad <- strrep(" ", indent) + leaf_deps <- setdiff( + node$dependents, + vapply(node$children, `[[`, integer(1), "controlling") + ) + cat(sprintf("%sChar %d controls: {%s}\n", + pad, node$controlling, + paste(node$dependents, collapse = ", "))) + for (child in node$children) { + .PrintBlock(child, indent + 1L) + } + } + for (node in x) { + .PrintBlock(node) + } + invisible(x) +} + +#' Validate a CharacterHierarchy against a dataset +#' +#' Check that a [`CharacterHierarchy`] object is consistent with a +#' [`phyDat`][phangorn::phyDat] dataset: character indices exist, +#' controlling characters are binary (absent/present), secondaries are +#' coded inapplicable where expected, and no character appears in +#' multiple blocks. +#' +#' @param hierarchy A [`CharacterHierarchy`] object. +#' @param dataset A `phyDat` object. +#' +#' @return `hierarchy`, invisibly (called for side effects: stops with an +#' informative error if validation fails). 
+#' +#' @keywords internal +#' @importFrom utils head +#' @export +validate_hierarchy <- function(hierarchy, dataset) { + if (!inherits(hierarchy, "CharacterHierarchy")) { + stop("`hierarchy` must be a CharacterHierarchy object.") + } + if (!inherits(dataset, "phyDat")) { + stop("`dataset` must be a phyDat object.") + } + + n_char <- length(attr(dataset, "index")) + all_levels <- attr(dataset, "allLevels") + levels <- attr(dataset, "levels") + contrast <- attr(dataset, "contrast") + + # Identify the inapplicable token + inapp_token <- "-" + if (!inapp_token %in% all_levels) { + stop("Dataset does not contain an inapplicable token ('-').") + } + + # Build the original character matrix + idx <- attr(dataset, "index") + orig_mat <- do.call(rbind, lapply(dataset, function(x) { + all_levels[x[idx]] + })) + + # Identify the "0" state (absence) in the controlling primary + absence_state <- "0" + + # Track all characters claimed by any block + + claimed <- integer(0) + + .ValidateBlock <- function(node, depth = 1L) { + ctrl <- node$controlling + deps <- node$dependents + + # Check indices exist + all_idx <- c(ctrl, deps) + bad <- all_idx[all_idx < 1L | all_idx > n_char] + if (length(bad) > 0L) { + stop(sprintf( + "Character index(es) %s out of range [1, %d].", + paste(bad, collapse = ", "), n_char + )) + } + + # Check no double-claiming; a nested block's controller is already claimed as its parent's dependent + overlap <- intersect(if (depth > 1L) deps else all_idx, claimed) + if (length(overlap) > 0L) { + stop(sprintf( + "Character(s) %s appear in multiple hierarchy blocks.", + paste(overlap, collapse = ", ") + )) + } + claimed <<- c(claimed, all_idx) + + # Check controlling character is binary (has exactly states "0" and "1", + # possibly with inapplicable/missing) + ctrl_vals <- unique(orig_mat[, ctrl]) + ctrl_informative <- setdiff(ctrl_vals, c("?", "-")) + if (!all(ctrl_informative %in% c("0", "1"))) { + stop(sprintf( + paste0("Controlling character %d must be binary (states '0' and '1'),", + " but has states: %s."), + ctrl, paste(ctrl_informative, collapse = ", ")
+ )) + } + + # Check secondaries are "-" where controlling is "0" + absent_taxa <- which(orig_mat[, ctrl] == absence_state) + if (length(absent_taxa) > 0L) { + for (d in deps) { + dep_vals <- orig_mat[absent_taxa, d] + bad_taxa <- which(!dep_vals %in% c("-", "?")) + if (length(bad_taxa) > 0L) { + bad_names <- rownames(orig_mat)[absent_taxa[bad_taxa]] + stop(sprintf( + paste0("Secondary character %d has non-inapplicable values for ", + "taxa where controlling character %d is absent: %s."), + d, ctrl, paste(head(bad_names, 5), collapse = ", ") + )) + } + } + } + + # Recurse into children + for (child in node$children) { + .ValidateBlock(child, depth + 1L) + } + } + + for (node in hierarchy) { + .ValidateBlock(node) + } + + invisible(hierarchy) +} + + +#' Construct a CharacterHierarchy from TNT-style character names +#' +#' Parse character names following the TNT convention where controlling +#' characters are named `sup_<tag>` and their dependent characters are +#' named `sub_<tag>[_suffix]`. Tags must match between a controlling +#' character and its dependents. Nested hierarchies are detected when a +#' `sub_<tag>` character is also a `sup_<tag>` for further characters. +#' +#' @param char_names Character vector of names, one per original character. +#' +#' @return A [`CharacterHierarchy`] object, or `NULL` if no hierarchy is +#' detected.
+#' +#' @examples +#' names <- c("sup_tail", "sub_tail_colour", "sub_tail_shape", +#' "sup_wing", "sub_wing_venation", "eyes") +#' hierarchy_from_names(names) +#' +#' @family tree scoring +#' @seealso [CharacterHierarchy()] +#' @export +hierarchy_from_names <- function(char_names) { + if (!is.character(char_names) || length(char_names) == 0L) { + stop("`char_names` must be a non-empty character vector.") + } + + # Find sup_ and sub_ characters + sup_idx <- grep("^sup_", char_names) + sub_idx <- grep("^sub_", char_names) + + if (length(sup_idx) == 0L) { + return(NULL) + } + + # Extract tags + sup_tags <- sub("^sup_", "", char_names[sup_idx]) + sub_tags_full <- sub("^sub_", "", char_names[sub_idx]) + # The tag is the first component before any additional underscore-suffix + # e.g. "sub_tail_colour" → tag = "tail" + sub_tags <- sub("_.*", "", sub_tags_full) + + # Build mapping: tag → controlling index, tag → dependent indices + tag_to_sup <- setNames(sup_idx, sup_tags) + + # Group sub characters by tag + tag_to_subs <- split(sub_idx, sub_tags) + + # Check for sub_ characters referencing nonexistent sup_ tags + orphan_tags <- setdiff(names(tag_to_subs), sup_tags) + if (length(orphan_tags) > 0L) { + warning(sprintf( + "sub_ characters reference tags with no corresponding sup_: %s", + paste(orphan_tags, collapse = ", ") + )) + } + + # Detect nested hierarchies: a sub_ character that is also a sup_ + # Find sub_ chars that are also in sup_idx + sub_also_sup <- intersect(sub_idx, sup_idx) + + # Build hierarchy + # First pass: create flat blocks for all sup_ tags + args <- list() + for (tag in sup_tags) { + ctrl <- tag_to_sup[[tag]] + subs <- tag_to_subs[[tag]] + if (is.null(subs)) subs <- integer(0) + + # Check which subs are themselves controlling (nested hierarchy) + nested_subs <- intersect(subs, sup_idx) + flat_subs <- setdiff(subs, sup_idx) + + if (length(nested_subs) == 0L) { + # Simple block + args[[as.character(ctrl)]] <- as.integer(subs) + } else { + # Nested: 
build list with named sub-hierarchies + block <- as.list(as.integer(flat_subs)) + for (ns in nested_subs) { + ns_tag <- sup_tags[sup_idx == ns] + ns_subs <- tag_to_subs[[ns_tag]] + if (is.null(ns_subs)) ns_subs <- integer(0) + block[[as.character(ns)]] <- as.integer(ns_subs) + } + args[[as.character(ctrl)]] <- block + } + } + + # Filter out sup_ chars whose index also appears in sub_idx + # (they'll be included as children of their parent) + top_level_sup <- setdiff(sup_idx, sub_idx) + if (length(top_level_sup) == 0L) { + # All sup_ characters are also sub_ — circular or all nested. + # Fall back to treating all as top-level with a warning. + warning("All sup_ characters are also sub_ characters. ", + "Treating all as top-level.") + top_level_sup <- sup_idx + } + top_level_ctrls <- as.character(top_level_sup) + args <- args[top_level_ctrls] + + do.call(CharacterHierarchy, args) +} + + +#' Extract all character indices from a hierarchy +#' +#' Returns all character indices (controlling + dependent) referenced by +#' a [`CharacterHierarchy`], useful for partitioning characters into +#' hierarchy vs. non-hierarchy sets. +#' +#' @param hierarchy A [`CharacterHierarchy`] object. +#' +#' @return An integer vector of character indices (unsorted, may contain +#' duplicates if the hierarchy is malformed). +#' +#' @keywords internal +#' @export +hierarchy_chars <- function(hierarchy) { + .CollectIndices <- function(node) { + c(node$controlling, node$dependents, + unlist(lapply(node$children, .CollectIndices))) + } + unique(unlist(lapply(hierarchy, .CollectIndices))) +} + + +#' List top-level controlling characters +#' +#' @param hierarchy A [`CharacterHierarchy`] object. +#' @return Integer vector of top-level controlling character indices. 
+#' @keywords internal +#' @export +hierarchy_controlling <- function(hierarchy) { + vapply(hierarchy, `[[`, integer(1), "controlling") +} + + +#' Build tip_labels matrix for HSJ scoring +#' +#' Converts a `phyDat` dataset into an integer matrix of per-tip per-character +#' state labels (0-based) for the HSJ C++ scoring function. +#' +#' @param dataset A `phyDat` object. +#' @return An integer matrix with `length(dataset)` rows (tips) and +#' `length(attr(dataset, "index"))` columns (original characters). +#' Each entry is a 0-based token index. +#' @keywords internal +#' @export +build_tip_labels <- function(dataset) { + idx <- attr(dataset, "index") + n_tip <- length(dataset) + n_char <- length(idx) + + # dataset is a list of integer vectors (pattern indices per tip) + # Expand via index to original characters, convert to 0-based + mat <- matrix(0L, nrow = n_tip, ncol = n_char) + for (t in seq_len(n_tip)) { + pattern_tokens <- dataset[[t]] # token indices for each pattern + mat[t, ] <- pattern_tokens[idx] - 1L # 0-based + } + mat +} + + +#' Convert CharacterHierarchy to list for C++ +#' +#' Converts a [`CharacterHierarchy`] object into a flat list of hierarchy +#' blocks that can be passed to the C++ `ts_hsj_score()` bridge function. +#' Each block is a list with `primary` (0-based) and `secondaries` (0-based). +#' +#' @param hierarchy A [`CharacterHierarchy`] object. +#' @return A list of lists, each with elements `primary` (integer, 0-based) +#' and `secondaries` (integer vector, 0-based). 
+#' @keywords internal +#' @export +hierarchy_to_blocks <- function(hierarchy) { + .flatten_block <- function(node) { + block <- list( + primary = node$controlling - 1L, + secondaries = node$dependents - 1L + ) + child_blocks <- lapply(node$children, .flatten_block) + c(list(block), unlist(child_blocks, recursive = FALSE)) + } + unlist(lapply(hierarchy, .flatten_block), recursive = FALSE) +} + + +#' Compute non-hierarchy pattern weights +#' +#' Given a `phyDat` dataset and a [`CharacterHierarchy`], returns a weight +#' vector with hierarchy characters' contributions subtracted. +#' Patterns that appear only in hierarchy characters will have weight 0. +#' +#' @param dataset A `phyDat` object. +#' @param hierarchy A [`CharacterHierarchy`] object. +#' +#' @return An integer vector of adjusted pattern weights (same length as +#' `attr(dataset, "weight")`). +#' +#' @keywords internal +#' @export +non_hierarchy_weights <- function(dataset, hierarchy) { + w <- attr(dataset, "weight") + idx <- attr(dataset, "index") + h_chars <- hierarchy_chars(hierarchy) + + adjusted <- as.integer(w) + for (ci in h_chars) { + if (ci < 1L || ci > length(idx)) next + pat <- idx[ci] + if (pat >= 1L && pat <= length(adjusted) && adjusted[pat] > 0L) { + adjusted[pat] <- adjusted[pat] - 1L + } + } + adjusted +} + + +# Generate resampled weights for hierarchical resampling. +# +# Instead of treating every character independently, groups characters into +# resampling units: each non-hierarchy character is one unit, and each +# top-level hierarchy block (primary + all dependents, recursively) is one +# unit. Jackknife or bootstrap operates on these units. 
+# +# Returns a list with: +# non_hierarchy_weights: pattern weights for Fitch scoring (non-hierarchy +# chars only, reflecting which free chars were sampled) +# block_counts: integer vector (length = number of top-level blocks) +# giving how many times each block was sampled (0/1 for jackknife, +# 0+ for bootstrap) +.HierarchicalResampleWeights <- function(dataset, hierarchy, bootstrap, + proportion) { + idx <- attr(dataset, "index") + n_patterns <- length(attr(dataset, "weight")) + n_chars <- length(idx) + + # Collect chars per top-level block (includes nested dependents) + .CollectAll <- function(node) { + c(node$controlling, node$dependents, + unlist(lapply(node$children, .CollectAll))) + } + n_blocks <- length(hierarchy) + block_chars <- lapply(hierarchy, function(node) unique(.CollectAll(node))) + h_chars_set <- unique(unlist(block_chars)) + + free_chars <- setdiff(seq_len(n_chars), h_chars_set) + n_free <- length(free_chars) + n_units <- n_free + n_blocks + + if (n_units < 2L) { + # Degenerate: can't jackknife with < 2 units + return(list( + non_hierarchy_weights = non_hierarchy_weights(dataset, hierarchy), + block_counts = rep(1L, n_blocks) + )) + } + + if (bootstrap) { + sampled <- sample.int(n_units, n_units, replace = TRUE) + } else { + n_keep <- max(1L, ceiling(proportion * n_units)) + n_keep <- min(n_keep, n_units - 1L) + sampled <- sample.int(n_units, n_keep, replace = FALSE) + } + + unit_counts <- tabulate(sampled, nbins = n_units) + + # Non-hierarchy pattern weights from retained free chars + nh_weights <- integer(n_patterns) + for (i in seq_len(n_free)) { + if (unit_counts[i] > 0L) { + pat <- idx[free_chars[i]] + nh_weights[pat] <- nh_weights[pat] + unit_counts[i] + } + } + + block_counts <- unit_counts[n_free + seq_len(n_blocks)] + + list( + non_hierarchy_weights = nh_weights, + block_counts = block_counts + ) +} diff --git a/R/ClusterStrings.R b/R/ClusterStrings.R index 1ee081e7d..4d4962099 100644 --- a/R/ClusterStrings.R +++ b/R/ClusterStrings.R 
@@ -18,8 +18,6 @@ #' paste0("AnotherCluster_", letters[1:6]))) #' @template MRS #' @importFrom utils adist -#' @importFrom cluster pam silhouette -#' @importFrom protoclust protoclust #' @importFrom stats as.dist cutree #' @family utility functions #' @export @@ -27,6 +25,14 @@ ClusterStrings <- function (x, maxCluster = 12) { if (maxCluster < 2L) { stop("`maxCluster` must be at least two.") } + if (!requireNamespace("cluster", quietly = TRUE)) { + stop("Package \"cluster\" is required for ClusterStrings().\n", # nocov + "Install it with: install.packages(\"cluster\")", call. = FALSE) # nocov + } + if (!requireNamespace("protoclust", quietly = TRUE)) { + stop("Package \"protoclust\" is required for ClusterStrings().\n", # nocov + "Install it with: install.packages(\"protoclust\")", call. = FALSE) # nocov + } if (length(unique(x)) < maxCluster) { nom <- unique(x) @@ -42,19 +48,19 @@ ClusterStrings <- function (x, maxCluster = 12) { kInc <- 1 / (nMethodsChecked * nK) pamClusters <- lapply(possibleClusters, function (k) { - pam(dists, k = k) + cluster::pam(dists, k = k) }) pamSils <- vapply(pamClusters, function (pamCluster) { - mean(silhouette(pamCluster)[, 3]) + mean(cluster::silhouette(pamCluster)[, 3]) }, double(1)) bestPam <- which.max(pamSils) pamSil <- pamSils[bestPam] pamCluster <- pamClusters[[bestPam]][["clustering"]] - hTree <- protoclust(as.dist(dists)) + hTree <- protoclust::protoclust(as.dist(dists)) hClusters <- lapply(possibleClusters, function (k) cutree(hTree, k = k)) hSils <- vapply(hClusters, function (hCluster) { - mean(silhouette(hCluster, dists)[, 3]) + mean(cluster::silhouette(hCluster, dists)[, 3]) }, double(1)) bestH <- which.max(hSils) hSil <- hSils[bestH] diff --git a/R/Concordance.R b/R/Concordance.R index 482a488bb..c7f53d15c 100644 --- a/R/Concordance.R +++ b/R/Concordance.R @@ -446,6 +446,17 @@ QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol, #' @param xlab Character giving a label for the x axis. 
#' @param ylab Character giving a label for the y axis. #' @param plot Logical specifying whether to draw the plot. +#' @param marginSize Integer scalar or vector controlling summary margin strips. +#' If a scalar (length 1) and greater than zero, both a left strip and a bottom +#' strip are added, each `marginSize` grid cells wide/tall. +#' If a vector (length > 1), each entry controls one side following the usual +#' `par(mar)` order — `c(bottom, left, top, right)` — where a positive value +#' enables that strip with the given width/height and `NA` or `0` suppresses it. +#' Currently only the bottom (entry 1) and left (entry 2) strips are +#' implemented; further entries are accepted but ignored. +#' The left strip is coloured by the characterwise concordance (weighted mean +#' across edges); the bottom strip by the edgewise concordance (weighted mean +#' across characters). One blank cell separates each strip from the main grid. #' @param \dots Arguments to `abline`, to control the appearance of vertical #' lines marking important edges. #' @returns `ConcordanceTable()` invisibly returns an named list containing: @@ -470,7 +481,8 @@ QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol, #' QALegend(where = c(0.1, 0.4, 0.1, 0.3)) #' #' # View information shared by characters and edges -#' ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3) +#' ConcordanceTable(tree, dataset, largeClade = 3, col = 2, lwd = 3, +#' marginSize = 1:4) #' axis(1) #' axis(2) #' @@ -484,8 +496,9 @@ QALegend <- function(where = c(0.1, 0.3, 0.1, 0.3), n = 5, Col = QACol, #' - [SiteConcordance()]: compute underlying concordance values. #' @export ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, - xlab = "Edge", ylab = "Character", - normalize = TRUE, plot = TRUE, ...) { + xlab = "Edge", ylab = "Character", + normalize = TRUE, plot = TRUE, + marginSize = 0L, ...) 
{ cc <- ClusteringConcordance(tree, dataset, return = "all", normalize = normalize) nodes <- seq_len(dim(cc)[[2]]) @@ -498,11 +511,67 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, quality[is.na(quality)] <- 0 col <- matrix(Col(amount, quality), dim(amount)[[1]], dim(amount)[[2]]) - image(nodes, seq_len(dim(cc)[[3]]), - matrix(1:prod(dim(amount)), dim(amount)[[1]]), - frame.plot = FALSE, axes = FALSE, - col = col, xlab = xlab, ylab = ylab) - + + # Parse marginSize: scalar → both sides; vector → c(bottom, left, ...) + ms <- as.integer(marginSize) + if (length(ms) == 1L) { + ms_bottom <- if (!is.na(ms) && ms > 0L) ms else 0L + ms_left <- ms_bottom + } else { + ms_bottom <- if (!is.na(ms[1L]) && ms[1L] > 0L) ms[1L] else 0L + ms_left <- if (length(ms) >= 2L && !is.na(ms[2L]) && ms[2L] > 0L) ms[2L] else 0L + } + x_offset <- if (ms_left > 0L) ms_left + 1L else 0L + y_offset <- if (ms_bottom > 0L) ms_bottom + 1L else 0L + + if (ms_left > 0L || ms_bottom > 0L) { + n_edges <- dim(cc)[[2]] + n_chars <- dim(cc)[[3]] + + # Marginal concordance: hBest-weighted average of normalized MI + hBest_w <- cc["hBest", , ] + hBest_w[is.na(hBest_w)] <- 0 + # `quality` already has NAs zeroed above + + # Extended layout (x = left→right, y = bottom→top): + # x: [char margin: 1..ms_left] [blank: ms_left+1] [grid: (x_offset+1)..(x_offset+n_edges)] + # y: [edge margin: 1..ms_bottom] [blank: ms_bottom+1] [grid: (y_offset+1)..(y_offset+n_chars)] + # (absent margin ↔ x_offset or y_offset = 0, so that portion of the range vanishes) + nx <- x_offset + n_edges + ny <- y_offset + n_chars + ext_col <- matrix("#FFFFFF", nx, ny) + + xi <- (x_offset + 1L):(x_offset + n_edges) # x indices of main grid + yi <- (y_offset + 1L):(y_offset + n_chars) # y indices of main grid + ext_col[xi, yi] <- col + + if (ms_left > 0L) { + denom_c <- colSums(hBest_w) + char_conc <- pmax(-1, pmin(1, + ifelse(denom_c == 0, 0, colSums(quality * hBest_w) / denom_c))) + char_cols <- Col(rep(1, 
n_chars), char_conc) + for (i in seq_len(ms_left)) ext_col[i, yi] <- char_cols + } + if (ms_bottom > 0L) { + denom_e <- rowSums(hBest_w) + edge_conc <- pmax(-1, pmin(1, + ifelse(denom_e == 0, 0, rowSums(quality * hBest_w) / denom_e))) + edge_cols <- Col(rep(1, n_edges), edge_conc) + for (j in seq_len(ms_bottom)) ext_col[xi, j] <- edge_cols + } + + image(seq_len(nx), seq_len(ny), + matrix(seq_len(nx * ny), nx, ny), + col = as.vector(ext_col), + frame.plot = FALSE, axes = FALSE, + xlab = xlab, ylab = ylab) + } else { + image(nodes, seq_len(dim(cc)[[3]]), + matrix(1:prod(dim(amount)), dim(amount)[[1]]), + frame.plot = FALSE, axes = FALSE, + col = col, xlab = xlab, ylab = ylab) + } + if (largeClade > 1) { cladeSize <- CladeSizes(tree) edge <- tree[["edge"]] @@ -511,7 +580,7 @@ ConcordanceTable <- function(tree, dataset, Col = QACol, largeClade = 0, bigNode <- vapply(as.integer(colnames(cc)), function (node) { all(cladeSize[child[parent == parent[child == node]]] >= largeClade) }, logical(1)) - abline(v = nodes[bigNode] - 0.5, ...) + abline(v = nodes[bigNode] + x_offset - 0.5, ...) } invisible(list(info = info, relInfo = amount, quality = quality, col = col)) } @@ -557,30 +626,32 @@ MutualClusteringConcordance <- function(tree, dataset) { #' @details #' `QuartetConcordance()` is the proportion of quartets (sets of four leaves) #' that are decisive for a split which are also concordant with it -#' (the site concordance factor \insertCite{Minh2020}{TreeSearch}). #' For example, a quartet with the characters `0 0 0 1` is not decisive, as #' all relationships between those leaves are equally parsimonious. #' But a quartet with characters `0 0 1 1` is decisive, and is concordant #' with any tree that groups the first two leaves together to the exclusion #' of the second. +#' In contrast to the site concordance factor +#' \insertCite{Minh2020}{TreeSearch}, `QuartetConcordance()` considers all +#' quartets that are decisive for a branch. 
+#' Doing so circumvents the criticisms of \insertCite{Goloboff2024;textual}{TreeSearch}. #' #' By default, the reported value weights each site by the number of quartets #' it is decisive for. This value can be interpreted as the proportion of #' all decisive quartets that are concordant with a split. #' If `weight = FALSE`, the reported value is the mean of the concordance -#' value for each site. +#' value for each site. #' Consider a split associated with two sites: -#' one that is concordant with 25% of 96 decisive quartets, and -#' a second that is concordant with 75% of 4 decisive quartets. -#' If `weight = TRUE`, the split concordance will be 24 + 3 / 96 + 4 = 27%. -#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. -#' -#' `QuartetConcordance()` is computed exactly, using all quartets, where as -#' other implementations (e.g. IQ-TREE) follow -#' \insertCite{@Minh2020;textual}{TreeSearch} in using a random subsample -#' of quartets for a faster, if potentially less accurate, computation. +#' one that is concordant with 25% of 96 decisive quartets, and +#' a second that is concordant with 75% of 4 decisive quartets. +#' If `weight = TRUE`, the split concordance will be (24 + 3) / (96 + 4) = 27%. +#' If `weight = FALSE`, the split concordance will be mean(75%, 25%) = 50%. +#' +#' `QuartetConcordance()` is computed exactly, using all quartets, +#' rather than a random subsample \insertCite{@cf. @Minh2020}{TreeSearch}. #' Ambiguous and inapplicable tokens are treated as containing no grouping #' information (i.e. `(02)` or `-` are each treated as `?`).
+#' #' @return #' `QuartetConcordance(return = "edge")` returns a numeric vector giving the #' concordance index at each split across all sites; names specify the number of @@ -614,19 +685,32 @@ QuartetConcordance <- function( warning("No overlap between tree labels and dataset.") return(NULL) } + dataset <- dataset[tipLabels, drop = FALSE] splits <- as.Splits(tree, dataset) logiSplits <- vapply(seq_along(splits), function (i) as.logical(splits[[i]]), logical(NTip(dataset))) - characters <- PhyDatToMatrix(dataset, ambigNA = TRUE) + contrast <- attr(dataset, "contrast") charLevels <- attr(dataset, "allLevels") - isAmbig <- rowSums(attr(dataset, "contrast")) > 1 + isInapp <- charLevels == "-" - nonGroupingLevels <- charLevels[isAmbig | isInapp] - characters[characters %in% nonGroupingLevels] <- NA - - charInt <- `mode<-`(characters, "integer") + isAmbig <- rowSums(contrast[, colnames(contrast) != "-", drop = FALSE]) > 1 + isGrouping <- !isAmbig & !isInapp + + # For each grouping level, which column of the contrast matrix does it uniquely set?
+ groupingCols <- apply(contrast[isGrouping, , drop = FALSE] > 0, 1, which) + + levelToInt <- rep(NA_integer_, length(charLevels)) + levelToInt[isGrouping] <- as.integer(groupingCols) + + characters <- PhyDatToMatrix(dataset) + charInt <- array( + levelToInt[match(characters, charLevels)], + dim = dim(characters), + dimnames = dimnames(characters) + ) + raw_counts <- quartet_concordance(logiSplits, charInt) num <- raw_counts$concordant @@ -680,24 +764,22 @@ QuartetConcordance <- function( } } -#' @importFrom fastmap fastmap -.ExpectedMICache <- fastmap() +.ExpectedMICache <- new.env(hash = TRUE, parent = emptyenv()) # @param a must be a vector of length <= 2 # @param b may be longer -#' @importFrom base64enc base64encode .ExpectedMI <- function(a, b) { if (length(a) < 2 || length(b) < 2) { 0 } else { - key <- base64enc::base64encode(mi_key(a, b)) - if (.ExpectedMICache$has(key)) { - .ExpectedMICache$get(key) + key <- mi_key(a, b) + if (!is.null(.ExpectedMICache[[key]])) { + .ExpectedMICache[[key]] } else { ret <- expected_mi(a, b) # Cache: - .ExpectedMICache$set(key, ret) + .ExpectedMICache[[key]] <- ret # Return: ret } diff --git a/R/Consistency.R b/R/Consistency.R index 2aeb4adbc..e084370f6 100644 --- a/R/Consistency.R +++ b/R/Consistency.R @@ -109,8 +109,7 @@ Consistency <- function (dataset, tree, nRelabel = 0, compress = FALSE) { } -#' @importFrom fastmap fastmap -.CharLengthCache <- fastmap() +.CharLengthCache <- new.env(hash = TRUE, parent = emptyenv()) #' Expected length #' @@ -127,7 +126,6 @@ Consistency <- function (dataset, tree, nRelabel = 0, compress = FALSE) { #' #' @export #' @importFrom stats median -#' @importFrom stringi stri_paste #' @family tree scoring #' @template MRS ExpectedLength <- function(dataset, tree, nRelabel = 1000, compress = FALSE) { @@ -151,9 +149,9 @@ ExpectedLength <- function(dataset, tree, nRelabel = 1000, compress = FALSE) { }, integer(nLevels))) .LengthForChar <- function(x) { - key <- stri_paste(c(nRelabel, x), collapse = 
",") - if (.CharLengthCache$has(key)) { - .CharLengthCache$get(key) + key <- paste(c(nRelabel, x), collapse = ",") + if (!is.null(.CharLengthCache[[key]])) { + .CharLengthCache[[key]] } else { patterns <- apply(unname(unique(t( as.data.frame(replicate(nRelabel, sample(rep(seq_along(x), x))))))), @@ -170,7 +168,7 @@ ExpectedLength <- function(dataset, tree, nRelabel = 1000, compress = FALSE) { contrast = rwContrast, class = "phyDat") ret <- median(FastCharacterLength(tree, phy)) - .CharLengthCache$set(key, ret) + .CharLengthCache[[key]] <- ret ret } } diff --git a/R/CustomSearch.R b/R/CustomSearch.R index 0e47c2150..eb16e198f 100644 --- a/R/CustomSearch.R +++ b/R/CustomSearch.R @@ -101,6 +101,8 @@ EdgeListSearch <- function (edgeList, dataset, #' #' Run standard search algorithms (\acronym{NNI}, \acronym{SPR} or \acronym{TBR}) #' to search for a more parsimonious tree. +#' For standard parsimony searches, [`MaximizeParsimony()`] is faster; +#' use `TreeSearch()` when you need a custom `TreeScorer` or `EdgeSwapper`. #' #' For detailed documentation of the "TreeSearch" package, including full #' instructions for loading phylogenetic data into R and initiating and @@ -177,7 +179,6 @@ TreeSearch <- function (tree, dataset, maxIter = 100L, maxHits = 20L, stopAtPeak = FALSE, stopAtPlateau = 0L, verbosity = 1L, ...) { - # initialize tree and data if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { stop("tree must be bifurcating; try rooting with ape::root") } diff --git a/R/Jackknife.R b/R/Jackknife.R index 78ab21b9e..fce0b7941 100644 --- a/R/Jackknife.R +++ b/R/Jackknife.R @@ -1,11 +1,8 @@ #' Jackknife resampling #' #' Resample trees using Jackknife resampling, i.e. removing a subset of -#' characters. -#' -#' The function assumes that `InitializeData()` will return a morphy object; -#' if this doesn't hold for you, post a [GitHub issue]( -#' https://github.com/ms609/TreeSearch/issues/new/) or e-mail the maintainer. +#' characters. 
For standard parsimony, [`Resample()`] is faster; use +#' `Jackknife()` when you need a custom `TreeScorer` or `EdgeSwapper`. #' #' @inheritParams Ratchet #' @param resampleFreq Double between 0 and 1 stating proportion of characters @@ -16,8 +13,8 @@ #' @template MRS #' @importFrom TreeTools RenumberEdges RenumberTips #' @seealso -#' - [`Resample()`]: Jackknife resampling for non-custom searches performed -#' using `MaximizeParsimony()`. +#' - [`Resample()`]: Jackknife and bootstrap resampling using the C++ search +#' engine. #' - [`JackLabels()`]: Label nodes of a tree with jackknife supports. #' @family split support functions #' @family custom search functions @@ -29,7 +26,6 @@ Jackknife <- function(tree, dataset, resampleFreq = 2 / 3, EdgeSwapper = TBRSwap, jackIter = 5000L, searchIter = 4000L, searchHits = 42L, verbosity = 1L, ...) { - # Initialize tree and data if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { stop("tree must be bifurcating; try rooting with ape::root") } diff --git a/R/MaximizeParsimony.R b/R/MaximizeParsimony.R index ef6d237cc..4c4bbec44 100644 --- a/R/MaximizeParsimony.R +++ b/R/MaximizeParsimony.R @@ -1,128 +1,266 @@
+# Internal helper: per-pattern count of taxa coded with an observed token.
+# Used by XPIWE (Goloboff 2014) to compute the extrapolation factor.
+# @param dataset A phyDat object.
+# @return Integer vector with one entry per unique character pattern.
+# @keywords internal
+.ObsCount <- function(dataset) {
+  contrast <- attr(dataset, "contrast")
+  # Position of "-" among the state levels, used as a contrast column index.
+  inappCol <- match("-", attr(dataset, "levels"))
+
+  # A token is "missing" if its contrast row is all ones ("?") ...
+  missingToken <- rowSums(contrast == 1) == ncol(contrast)
+  if (!is.na(inappCol)) {
+    # ... or if only its "-" (inapplicable/gap) column is set: XPIWE
+    # (Goloboff 2014) counts "-" as missing too, as verified against TNT 1.6.
+    pureInapp <- contrast[, inappCol] == 1 & rowSums(contrast) == 1
+    missingToken <- missingToken | pureInapp
+  }
+
+  # One row per taxon, one column per pattern; entries are 1-based token
+  # indices, i.e. rows of `contrast`.
+  tokens <- matrix(unlist(dataset, use.names = FALSE),
+                   nrow = length(dataset), byrow = TRUE)
+
+  # Flag the observed (non-missing) cells, then total each column.
+  observed <- !missingToken[tokens]
+  dim(observed) <- dim(tokens)
+  as.integer(colSums(observed))
+}
+ +# Internal helper: prepare constraint data for C++ engine. +# Returns a named list of constraint arguments (empty list if no constraint). +# @param constraint A phyDat, phylo, or NULL. +# @param dataset A phyDat whose names define the tip ordering. +# @keywords internal +.PrepareConstraint <- function(constraint, dataset) { + if (is.null(constraint)) return(list()) + + if (inherits(constraint, "phylo")) { + constraint <- MatrixToPhyDat(t(as.matrix(constraint))) + } + if (!inherits(constraint, "phyDat")) { + constraint <- MatrixToPhyDat(constraint) + } + + # Match constraint taxa to dataset + consTaxa <- names(constraint) + treeTaxa <- names(dataset) + treeOnly <- setdiff(treeTaxa, consTaxa) + if (length(treeOnly)) { + constraint <- AddUnconstrained(constraint, treeOnly) + } + consOnly <- setdiff(consTaxa, treeTaxa) + if (length(consOnly)) { + warning("Ignoring taxa in constraint missing on tree: ", + paste0(consOnly, collapse = ", ")) + constraint <- constraint[-match(consOnly, consTaxa)] + } + constraint <- constraint[names(dataset)] + + consContrast <- attr(constraint, "contrast") + nConsStates <- ncol(consContrast) + if (nConsStates < 2L) return(list()) + + consMat <- matrix(unlist(constraint, use.names = FALSE), + nrow = length(constraint), byrow = TRUE) + consSplits <- matrix(0L, nrow = ncol(consMat), ncol = length(constraint)) + for (ch in seq_len(ncol(consMat))) { + for (tip in seq_len(length(constraint))) { +
token <- consMat[tip, ch] + if (consContrast[token, nConsStates] == 1 && + consContrast[token, 1] == 0) { + consSplits[ch, tip] <- 1L + } + } + } + + keep <- apply(consSplits, 1, function(row) { + s <- sum(row) + s >= 1 && s < length(constraint) - 1 + }) + consSplits <- consSplits[keep, , drop = FALSE] + if (nrow(consSplits) == 0L) return(list()) + + consWeight <- attr(constraint, "weight") + consExpectedScore <- sum( + MinimumLength(constraint, compress = TRUE) * consWeight + ) + + consTipData <- matrix(unlist(constraint, use.names = FALSE), + nrow = length(constraint), byrow = TRUE) + + list( + consSplitMatrix = consSplits, + consContrast = consContrast, + consTipData = consTipData, + consWeight = as.integer(consWeight), + consLevels = attr(constraint, "levels"), + consExpectedScore = as.integer(consExpectedScore) + ) +} +
+# Strategy presets for adaptive search (Phase 6E), as a named list of
+# SearchControl() results; a function avoids load-order dependency on it.
+.StrategyPresets <- function() list(
+  sprint = SearchControl(
+    tbrMaxHits = 1L, ratchetCycles = 3L, ratchetPerturbProb = 0.04,
+    ratchetPerturbMode = 0L, ratchetAdaptive = FALSE,
+    driftCycles = 0L, xssRounds = 1L, xssPartitions = 4L,
+    rssRounds = 0L, cssRounds = 0L, cssPartitions = 4L,
+    sectorMinSize = 6L, sectorMaxSize = 50L,
+    fuseInterval = 5L, fuseAcceptEqual = FALSE,
+    tabuSize = 0L, wagnerStarts = 1L,
+    nniFirst = TRUE, sprFirst = FALSE
+  ),
+  default = SearchControl(
+    tbrMaxHits = 1L, ratchetCycles = 12L, ratchetPerturbProb = 0.25,
+    ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 5L,
+    ratchetAdaptive = FALSE,
+    driftCycles = 0L,
+    xssRounds = 3L, xssPartitions = 4L,
+    rssRounds = 1L, cssRounds = 0L, cssPartitions = 4L,
+    sectorMinSize = 6L, sectorMaxSize = 50L,
+    fuseInterval = 3L, fuseAcceptEqual = FALSE,
+    tabuSize = 100L, wagnerStarts = 3L,
+    nniFirst = TRUE, sprFirst = FALSE, adaptiveLevel = TRUE,
+    maxOuterResets = 2L
+  ),
+  thorough = SearchControl(
+    tbrMaxHits = 3L, ratchetCycles = 20L, ratchetPerturbProb = 0.25,
+    ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 5L,
+    ratchetAdaptive = TRUE,
+    nniPerturbCycles = 0L,  # T-274: 69% overhead, zero time-adjusted benefit
+    driftCycles = 0L,
+    xssRounds = 5L, xssPartitions = 6L,
+    rssRounds = 3L, cssRounds = 2L, cssPartitions = 6L,
+    sectorMinSize = 6L, sectorMaxSize = 80L,
+    fuseInterval = 2L, fuseAcceptEqual = TRUE,
+    tabuSize = 200L, wagnerStarts = 3L,
+    nniFirst = TRUE, sprFirst = FALSE,
+    outerCycles = 2L,
+    maxOuterResets = 3L,
+    adaptiveStart = TRUE
+  ),
+  # Large-tree preset (>=120 tips): at 180 tips each TBR convergence takes
+  # ~5-7s, so phase costs scale sharply. Key design decisions (T-179):
+  # - Fewer ratchet cycles: 12 (vs thorough 20); drift is off in both (driftCycles = 0)
+  # - No NNI-perturbation: at ~5.5s/cycle, it dominates the budget; ratchet
+  #   provides more diverse escapes per unit time at large-tree scale
+  # - Annealing (1 cycle) replaces drift: linear cooling T=20→0 over 5
+  #   phases uses stochastic TBR with Boltzmann acceptance — cheaper
+  #   per-cycle than drift. 1 cycle (400ms) captures 40% hit rate at
+  #   180 tips; 3 cycles (1370ms) showed no significant score gain (T-248)
+  # - No outer-cycle interleaving: outerCycles=1 avoids re-running expensive
+  #   XSS/RSS/CSS after ratchet (saves ~10s per repeated sectorial pass)
+  # - Single biased-Wagner start: saves ~2.6s vs 3 random starts; biased
+  #   addition (Goloboff 2014) gives near-optimal Wagner at 180 tips
+  # - tbrMaxHits=1: faster TBR passes (fewer equal-score trees explored)
+  # - No adaptiveStart: with ~1 replicate per 60s budget, the bandit has
+  #   no learning opportunity; adaptiveStart empirically regresses here
+  # - Larger sector sizes for proportional tree coverage
+  # - Prune-reinsert with NNI polish (T-289f Stage 5, 2026-03-29): 5 cycles,
+  #   NNI full-tree polish (pruneReinsertNni=TRUE). TBR polish (Stage 4) was
+  #   catastrophic at 206t/60s (0 reps). NNI polish (Stage 5, 5 datasets
+  #   131-206t, 10 seeds, 60s+120s) fixes the 0-rep failure and improves
+  #   median scores at 131-180t (project3701 146t: -178 steps at 60s;
+  #   project804 173t: -9 steps; mbank_X30754 180t: -4 steps at 60s/-7 at
+  #   120s). syab07205 (206t) shows +17.5 steps at 60s but neutral at 120s
+  #   — acceptable given the gains at smaller sizes in range. See G-006 for
+  #   a known limitation (NNI polish ignores ConstraintData; irrelevant here
+  #   since the large preset does not use topological constraints).
+  # Validated on mbank_X30754 (180t, 418p), 5 seeds at 30/60/120s budgets:
+  #   60s: large median=1255 vs thorough 1259 (+4 steps better)
+  #   120s: large median=1250 vs thorough 1250 (tied, 2 reps vs 0-1)
+  #   30s: large median=1276 vs thorough 1283 (+7 steps better)
+  large = SearchControl(
+    tbrMaxHits = 1L, ratchetCycles = 12L, ratchetPerturbProb = 0.25,
+    ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 5L,
+    ratchetAdaptive = TRUE,
+    nniPerturbCycles = 0L,
+    driftCycles = 0L,
+    annealCycles = 1L, annealPhases = 5L, annealTStart = 20, annealTEnd = 0,
+    xssRounds = 3L, xssPartitions = 6L,
+    rssRounds = 2L, cssRounds = 1L, cssPartitions = 6L,
+    sectorMinSize = 8L, sectorMaxSize = 100L,
+    fuseInterval = 3L, fuseAcceptEqual = TRUE,
+    tabuSize = 100L, wagnerStarts = 1L,
+    wagnerBias = 1L, wagnerBiasTemp = 0.3,
+    nniFirst = TRUE, sprFirst = FALSE,
+    outerCycles = 1L,
+    pruneReinsertCycles = 5L, pruneReinsertNni = TRUE,
+    consensusStableReps = 0L
+  )
+)
+ +# Select strategy preset based on dataset size and character count. +# @param nTip Integer number of taxa +# @param nChar Integer number of character patterns (unique columns) +# @return Character name of the strategy preset +# @details +# Empirically calibrated on 15 neotrans matrices (61-86 tips) + 4 +# inapplicable.phyData datasets. Key findings: +# - Datasets with few characters (< 100 patterns) have flat parsimony
+# - Datasets with >= 100 patterns and >= 65 taxa have structured landscapes +# where thorough search finds substantially better trees (7/9 benefited, +# median +14 steps, max +74 steps at 86 tips / 528 chars). +# - At 62 tips (Agnarsson2004, 242 patterns) thorough adds 0 steps; at 65 +# tips (project3617, 361 patterns) it adds 14 steps. +.AutoStrategy <- function(nTip, nChar) { + if (nTip <= 30L) return("sprint") + # Few characters -> flat landscape; thorough search is pointless + if (nChar < 100L) return("default") + # Large trees (>=120 tips): per-replicate cost is high; use scaled preset + # with NNI warmup and biased Wagner (empirically validated on 180-tip data). + if (nTip >= 120L) return("large") + # Enough characters to have a structured landscape; + # moderate-to-large datasets benefit from intensive search + if (nTip >= 65L) return("thorough") + "default" +} + #' Find most parsimonious trees -#' -#' Search for most parsimonious trees using the parsimony ratchet and -#' \acronym{TBR} rearrangements, treating inapplicable data as such using the -#' algorithm of \insertCite{Brazeau2019;textual}{TreeSearch}. -#' -#' Tree search will be conducted from a specified or automatically-generated -#' starting tree in order to find a tree with an optimal parsimony score, -#' under implied or equal weights, treating inapplicable characters as such -#' in order to avoid the artefacts of the standard Fitch algorithm -#' \insertCite{@see @Maddison1993; @Brazeau2019}{TreeSearch}. -#' Tree length is calculated using the MorphyLib C library -#' \insertCite{Brazeau2017}{TreeSearch}. -#' -#' Tree search commences with `ratchIter` iterations of the parsimony ratchet -#' \insertCite{Nixon1999}{TreeSearch}, which bootstraps the input dataset -#' in order to escape local optima. -#' A final round of tree bisection and reconnection (\acronym{TBR}) -#' is conducted to broaden the sampling of trees. 
-#' -#' This function can be called using the R command line / terminal, or through -#' the "shiny" graphical user interface app (type `EasyTrees()` to launch). -#' -#' The optimal strategy for tree search depends in part on how close to optimal -#' the starting tree is, the size of the search space (which increases -#' super-exponentially with the number of leaves), and the complexity of the -#' search space (e.g. the existence of multiple local optima). -#' -#' One possible approach is to employ four phases: -#' -#' 1. Rapid search for local optimum: tree score is typically easy to improve -#' early in a search, because the initial tree is often far from optimal. -#' When many moves are likely to be accepted, running several rounds of search -#' with a low value of `maxHits` and a high value of `tbrIter` allows many -#' trees to be evaluated quickly, hopefully moving quickly to a more promising -#' region of tree space. -#' -#' 2. Identification of local optimum: -#' Once close to a local optimum, a more extensive search -#' with a higher value of `maxHits` allows a region to be explored in more -#' detail. Setting a high value of `tbrIter` will search a local -#' neighbourhood more completely -#' -#' 3. Search for nearby peaks: -#' Ratchet iterations allow escape from local optima. -#' Setting `ratchIter` to a high value searches the wider neighbourhood more -#' extensively for other nearby peaks; `ratchEW = TRUE` accelerates these -#' exploratory searches. Ratchet iterations can be ineffective when `maxHits` -#' is too low for the search to escape its initial location. -#' -#' 4. Extensive search of final optimum. As with step 2, it may be valuable to -#' fully explore the optimum that is found after ratchet searches to be sure -#' that the locally optimal score has been obtained. Setting a high value of -#' `finalIter` performs a thorough search that can give confidence that further -#' searches would not find better (local) trees. 
-#' -#' A search is unlikely to have found a global optimum if: -#' -#' - Tree score continues to improve on the final iteration. If a local optimum -#' has not yet been reached, it is unlikely that a global optimum has -#' been reached. -#' Try increasing `maxHits`. -#' -#' - Successive ratchet iterations continue to improve tree scores. -#' If a recent ratchet iteration improved the score, rather than finding -#' a different region of tree space with the same optimal score, it is likely -#' that still better global optima remain to be found. Try increasing -#' `ratchIter` (more iterations give more chance for improvement) and -#' `maxHits` (to get closer to the local optimum after each ratchet iteration). -#' -#' - Optimal areas of tree space are only visited by a single ratchet iteration. -#' (See vignette: [Exploring tree space]( -#' https://ms609.github.io/TreeSearch/articles/tree-space.html).) -#' If some areas of tree space are only found by one ratchet iteration, there -#' may well be other, better areas that have not yet been visited. -#' Try increasing `ratchIter`. -#' -#' When continuing a tree search, it is usually best to start from an optimal -#' tree found during the previous iteration - there is no need to start from -#' scratch. -#' -#' A more time consuming way of checking that a global optimum has been reached -#' is to repeat a search with the same parameters multiple times, starting -#' from a different, entirely random tree each time. If all searches obtain the -#' same optimal tree score despite their different starting points, -#' this score is likely to correspond to the global optimum. -#' -#' For detailed documentation of the "TreeSearch" package, including full -#' instructions for loading phylogenetic data into R and initiating and -#' configuring tree search, see the -#' [package documentation](https://ms609.github.io/TreeSearch/). 
-#' -#' +#' +#' Performs a multi-replicate driven search for most-parsimonious trees, +#' combining random addition sequence (Wagner) starting trees, TBR +#' rearrangement, exclusive sectorial search (XSS), ratchet perturbation, +#' drift, and tree fusing -- all in compiled C++. +#' +#' The search pipeline follows the "new technology search" approach of +#' \insertCite{Goloboff1999;textual}{TreeSearch}, as implemented in TNT +#' \insertCite{Goloboff2016}{TreeSearch}. +#' Parsimony scoring uses the Fitch +#' \insertCite{Fitch1971}{TreeSearch} algorithm; inapplicable characters +#' are handled with the algorithm of +#' \insertCite{Brazeau2019;textual}{TreeSearch}. +#' Each replicate builds a random addition sequence (Wagner) tree +#' \insertCite{Kluge1969}{TreeSearch}, optimizes it with TBR, +#' applies sectorial search and the parsimony ratchet +#' \insertCite{Nixon1999}{TreeSearch} to escape local optima, then adds +#' the result to a pool of unique topologies. +#' Periodically, tree fusing recombines the best trees in the pool. +#' The search stops when the best score has been independently discovered +#' `targetHits` times, or `maxReplicates` replicates have been completed. +#' +#' Implied weighting is supported natively: set `concavity` to a numeric +#' value (e.g.\sspace{}10). +#' Profile parsimony (`concavity = "profile"`) is supported natively: +#' characters are simplified to binary (max 2 informative states), +#' inapplicable tokens are treated as ambiguous, and per-character +#' information profiles are used for scoring +#' \insertCite{Faith2001}{TreeSearch}. +#' #' @param dataset A phylogenetic data matrix of \pkg{phangorn} class #' \code{phyDat}, whose names correspond to the labels of any accompanying tree. -#' Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. -#' Additive (ordered) characters can be handled using -#' \code{\link[TreeTools]{Decompose}()}. 
#' @param tree (optional) A bifurcating tree of class \code{\link[ape]{phylo}}, -#' containing only the tips listed in `dataset`, from which the search -#' should begin. -#' If unspecified, an [addition tree][AdditionTree()] will be generated from -#' `dataset`, respecting any supplied `constraint`. -#' Edge lengths are not supported and will be deleted. -#' @param ratchIter Numeric specifying number of iterations of the -#' parsimony ratchet \insertCite{Nixon1999}{TreeSearch} to conduct. -#' @param tbrIter Numeric specifying the maximum number of \acronym{TBR} -#' break points on a given tree to evaluate before terminating the search. -#' One "iteration" comprises selecting a branch to break, and evaluating -#' each possible reconnection point in turn until a new tree improves the -#' score. If a better score is found, then the counter is reset to zero, -#' and tree search continues from the improved tree. -#' @param startIter Numeric: an initial round of tree search with -#' `startIter` × `tbrIter` \acronym{TBR} break points is conducted in -#' order to locate a local optimum before beginning ratchet searches. -#' @param finalIter Numeric: a final round of tree search will evaluate -#' `finalIter` × `tbrIter` \acronym{TBR} break points, in order to -#' sample the final optimal neighbourhood more intensely. -#' @param maxHits Numeric specifying the maximum times that an optimal -#' parsimony score may be hit before concluding a ratchet iteration or final -#' search concluded. -#' @param maxTime Numeric: after `maxTime` minutes, stop tree search at the -#' next opportunity. -#' @param quickHits Numeric: iterations on subsampled datasets -#' will retain `quickHits` × `maxHits` trees with the best score. +#' or a `multiPhylo` (first tree used). +#' When supplied, the first replicate uses this topology as its starting +#' point (warm-start), skipping the random Wagner tree construction. +#' Subsequent replicates still begin from random Wagner trees. 
+#' This is useful for continuing a search from a previously found optimum. +#' If unspecified, all replicates start from random Wagner trees. +#' Edge lengths are not supported and will be deleted. #' @param concavity Determines the degree to which extra steps beyond the first #' are penalized. Specify a numeric value to use implied weighting #' \insertCite{Goloboff1993}{TreeSearch}; `concavity` specifies _k_ in @@ -131,893 +269,635 @@ #' \insertCite{Goloboff2018,Smith2019}{TreeSearch}. #' Better still explore the sensitivity of results under a range of #' concavity values, e.g. `k = 2 ^ (1:7)`. -#' Specify `Inf` to weight each additional step equally, -#' (which underperforms step weighting approaches -#' \insertCite{Goloboff2008,Goloboff2018,Goloboff2019,Smith2019}{TreeSearch}). -#' Specify `"profile"` to employ an approximation of profile parsimony +#' Specify `Inf` to weight each additional step equally. +#' Specify `"profile"` to employ profile parsimony #' \insertCite{Faith2001}{TreeSearch}. -#' @param ratchEW Logical specifying whether to use equal weighting during -#' ratchet iterations, improving search speed whilst still facilitating -#' escape from local optima. -#' @param tolerance Numeric specifying degree of suboptimality to tolerate -#' before rejecting a tree. The default, `sqrt(.Machine$double.eps)`, retains -#' trees that may be equally parsimonious but for rounding errors. -#' Setting to larger values will include trees suboptimal by up to `tolerance` -#' in search results, which may improve the accuracy of the consensus tree -#' (at the expense of resolution) \insertCite{Smith2019}{TreeSearch}. +#' @param extended_iw Logical: if `TRUE` (default) and `concavity` is finite, +#' apply the missing-entries correction of +#' \insertCite{Goloboff2014;textual}{TreeSearch}. +#' Characters with missing data receive a reduced effective concavity +#' _k_c_ = _k_ / _f_c_, making their weights drop off faster. 
+#' This compensates for the artificially low homoplasy of poorly sampled +#' characters. Set `FALSE` for legacy Goloboff (1993) behaviour. +#' Ignored when `concavity = Inf` (equal weights) or `"profile"`. +#' @param xpiwe_r Numeric in (0, 1]: proportion of observed homoplasy +#' expected in unobserved (missing) entries. Default 0.5 (following TNT). +#' Only used when `extended_iw = TRUE`. +#' @param xpiwe_max_f Numeric >= 1: maximum extrapolation factor. +#' Characters with very few observed entries are clamped so that the +#' extrapolation factor does not exceed this value. Default 5 (following +#' TNT). Only used when `extended_iw = TRUE`. +#' @param hierarchy A [`CharacterHierarchy`] object specifying which +#' characters are controlling primaries and which are their dependent +#' secondaries. Required when `inapplicable` is `"hsj"` or `"xform"`; +#' ignored when `inapplicable = "bgs"` (the default). +#' See [`CharacterHierarchy()`] for how to construct one, and +#' [`hierarchy_from_names()`] for automated construction from +#' TNT-style character names. +#' @param inapplicable Character: method for handling inapplicable characters. +#' Case-insensitive. +#' See `vignette("inapplicable", package = "TreeSearch")` for details. +#' \describe{ +#' \item{`"bgs"` (default)}{Three-pass algorithm of +#' \insertCite{Brazeau2019;textual}{TreeSearch}, inferring applicability +#' regions from the `"-"` token. No hierarchy required.} +#' \item{`"hsj"`}{Dissimilarity-metric scoring of +#' \insertCite{Hopkins2021;textual}{TreeSearch}. Requires a +#' `hierarchy`; controlled by `hsj_alpha`.} +#' \item{`"xform"`}{Step-matrix recoding approximating maximum homology +#' via x-transformations +#' \insertCite{Goloboff2021;textual}{TreeSearch}. Requires a +#' `hierarchy`.} +#' } +#' @param hsj_alpha Numeric in \[0, 1\]: scaling parameter for secondary- +#' character contributions under the HSJ method. 
0 = secondaries ignored; +#' 1 (default) = secondaries contribute up to 1 per branch per hierarchy +#' block. Only used when `inapplicable = "hsj"`. #' @param constraint Either an object of class `phyDat`, in which case #' returned trees will be perfectly compatible with each character in #' `constraint`; or a tree of class `phylo`, all of whose nodes will occur #' in any output tree. -#' See \code{\link[TreeTools:ImposeConstraint]{ImposeConstraint()}} and -#' [vignette](https://ms609.github.io/TreeSearch/articles/tree-search.html) -#' for further examples. +#' Constraint searches are supported natively: all tree rearrangements +#' are filtered to respect the constraint topology. +#' @param strategy Character: named strategy preset controlling the search +#' heuristic parameters. Presets: +#' \describe{ +#' \item{`"auto"` (default)}{Selects automatically based on dataset size +#' and character count: +#' `"sprint"` for <=30 taxa; `"large"` for >=120 taxa with >=100 +#' character patterns; `"thorough"` for 65-119 taxa with >=100 +#' character patterns; `"default"` otherwise.} +#' \item{`"sprint"`}{Fast search: 3 ratchet cycles, no drift, minimal +#' sectorial. Good for small datasets or quick surveys.} +#' \item{`"default"`}{Balanced: 12 ratchet + sectorial + fusing.} +#' \item{`"thorough"`}{Intensive: 20 ratchet cycles, adaptive +#' perturbation, extra sectorial rounds, NNI perturbation, outer cycle +#' loop. Best for datasets with 65-119 tips and 100+ character patterns.} +#' \item{`"large"`}{Large-tree search (>=120 tips): reduced cycle +#' counts scaled for expensive per-replicate cost, no NNI +#' perturbation, single biased Wagner start (Goloboff 2014), larger +#' sector sizes, 1-cycle simulated annealing instead of drift +#' (linear cooling from T=20 to T=0 over 5 phases). 
Empirically matches +#' or exceeds `"thorough"` at 180 tips across all time budgets.} +#' \item{`"none"`}{Use only the explicitly supplied parameter values.} +#' } +#' All presets other than `"none"` enable consensus-stability stopping: the +#' search stops early if the strict consensus of best-score trees has been +#' unchanged for `consensusStableReps` consecutive replicates. +#' Explicit `control` fields always override the preset; for example, +#' `strategy = "sprint", control = SearchControl(ratchetCycles = 10L)` uses +#' sprint defaults for everything except `ratchetCycles`. +#' @param maxReplicates Integer: maximum number of independent search +#' replicates (default: 96). +#' The default is a multiple of 48 (= LCM(12, 16)) so that replicates +#' divide evenly across common 12- or 16-core machines when running in +#' parallel. +#' For large or complex datasets a higher value improves the chance of +#' finding all MPTs. A rough minimum is +#' `max(10, ceiling(NTip * NChar / 5000))`, where `NChar = sum(weight)`. +#' A warning is issued when an explicit value falls below this threshold +#' for datasets with 30 or more taxa. +#' @param targetHits Integer: stop when the best score has been found +#' independently this many times (default: `max(10, NTip / 5)`). +#' @param maxSeconds Numeric: maximum wall-clock time in seconds for the +#' search. When reached, the current replicate finishes and the search +#' stops. `0` (default) means no time limit. +#' @param nThreads Integer: number of parallel threads for search replicates. +#' \describe{ +#' \item{`1` (default)}{Serial execution -- identical to previous behaviour.} +#' \item{`0`}{Auto-detect: use one fewer thread than the number of CPU +#' cores.} +#' \item{`> 1`}{Use the specified number of worker threads.} +#' } +#' In parallel mode, each replicate runs independently with a shared tree +#' pool. Results may vary across runs with the same `set.seed()` due to +#' thread scheduling nondeterminism.
Use `nThreads = 1` for reproducible +#' results. #' @param verbosity Integer specifying level of messaging; higher values give -#' more detailed commentary on search progress. Set to `0` to run silently. -#' @param \dots Additional parameters to `MaximizeParsimony()`. -#' -#' @return `MaximizeParsimony()` returns a list of trees with class -#' `multiPhylo`. This lists all trees found during each search step that -#' are within `tolerance` of the optimal score, listed in the sequence that -#' they were first visited, and named according to the step in which they were -#' first found; it may contain more than `maxHits` elements. -#' Note that the default search parameters may need to be increased in order for -#' these trees to be the globally optimal trees; examine the messages printed -#' during tree search to evaluate whether the optimal score has stabilized. -#' -#' The return value has the attribute `firstHit`, a named integer vector listing -#' the number of optimal trees visited for the first time in each stage of -#' the tree search. Stages are named: -#' - `seed`: starting trees; -#' - `start`: Initial TBR search; -#' - `ratchN`: Ratchet iteration `N`; -#' - `final`: Final TBR search. -#' The first tree hit for the first time in ratchet iteration three is named -#' `ratch3_1`. -#' +#' more detail. Set to `0` to run silently. +#' @param progressCallback Optional function called with a single list +#' argument containing search progress information. +#' The list includes elements: `replicate`, `max_replicates`, +#' `best_score`, `hits_to_best`, `target_hits`, `pool_size`, +#' `phase` (character), `elapsed` (seconds), and `phase_score`. +#' When `NULL` (default) and `verbosity >= 1` in an interactive session, +#' a `cli` progress bar is created automatically. +#' Supply a custom function (e.g. using [shiny::setProgress()]) +#' to control progress display. +#' @param control A [`SearchControl`] object (or a named list) of low-level +#' search parameters. 
Most users can rely on the `strategy` presets and +#' ignore this argument; see [`SearchControl()`] for full documentation +#' of individual fields. +#' @param ... Backward compatibility: individual control parameters (e.g. +#' `ratchetCycles = 10L`) may still be passed as named arguments. +#' These override the corresponding `control` fields and the strategy +#' preset. +#' Legacy `Morphy()`-style parameters (e.g. `ratchIter`, `tbrIter`) are +#' detected and forwarded to [`Morphy()`] with a deprecation warning. +#' +#' @return A `multiPhylo` object containing the best tree(s) found, with +#' attributes: +#' \describe{ +#' \item{`score`}{Best parsimony score.} +#' \item{`replicates`}{Number of replicates completed.} +#' \item{`hits_to_best`}{Number of independent discoveries of the best +#' score.} +#' \item{`n_topologies`}{Number of distinct topologies in the pool at the +#' best score.} +#' \item{`last_improved_rep`}{1-based index of the replicate that last +#' improved the best score (0 if not tracked, e.g. parallel search).} +#' \item{`timed_out`}{Logical: `TRUE` if the search stopped because +#' `maxSeconds` was exceeded.} +#' \item{`consensus_stable`}{Logical: `TRUE` if the search stopped +#' because the strict consensus was unchanged for +#' `consensusStableReps` consecutive replicates.} +#' \item{`perturb_stop`}{Logical: `TRUE` if the search stopped because +#' `nTip * perturbStopFactor` consecutive replicates failed to improve +#' the best score (see [`SearchControl()`]).} +#' \item{`timings`}{Named numeric vector of cumulative wall-clock time +#' (in milliseconds) spent in each search phase across all replicates: +#' `wagner_ms`, `tbr_ms`, `xss_ms`, `rss_ms`, `css_ms`, `ratchet_ms`, +#' `drift_ms`, `final_tbr_ms`, `fuse_ms`.} +#' \item{`replicate_scores`}{Numeric vector of the best parsimony score +#' found by each completed replicate. 
Passed to [ScoreSpectrum()] for +#' Chao1-style landscape coverage estimation.} +#' } +#' #' @examples -#' ## Only run examples in interactive R sessions -#' if (interactive()) { -#' # launch "shiny" point-and-click interface -#' EasyTrees() -#' -#' # Here too, use the "continue search" function to ensure that tree score -#' # has stabilized and a global optimum has been found -#' } -#' -#' -#' # Load data for analysis in R -#' library("TreeTools") #' data("inapplicable.phyData", package = "TreeSearch") -#' dataset <- inapplicable.phyData[["Asher2005"]] -#' -#' # A very quick run for demonstration purposes -#' trees <- MaximizeParsimony(dataset, ratchIter = 0, startIter = 0, -#' tbrIter = 1, maxHits = 4, maxTime = 1/100, -#' concavity = 10, verbosity = 4) -#' names(trees) -#' cons <- Consensus(trees) +#' dataset <- inapplicable.phyData[["Vinther2008"]] +#' result <- MaximizeParsimony(dataset, maxReplicates = 3L, targetHits = 2L) +#' result +#' attr(result, "score") #' -#' # In actual use, be sure to check that the score has converged on a global -#' # optimum, conducting additional iterations and runs as necessary. -#' -#' if (interactive()) { -#' # Jackknife resampling -#' nReplicates <- 10 -#' jackTrees <- replicate(nReplicates, -#' #c() ensures that each replicate returns a list of trees -#' c(Resample(dataset, trees, ratchIter = 0, tbrIter = 2, startIter = 1, -#' maxHits = 5, maxTime = 1 / 10, -#' concavity = 10, verbosity = 0)) -#' ) -#' -#' # In a serious analysis, more replicates would be conducted, and each -#' # search would undergo more iterations. -#' -#' # Now we must decide what to do with the multiple optimal trees from -#' # each replicate. 
-#' -#' # Set graphical parameters for plotting -#' oPar <- par(mar = rep(0, 4), cex = 0.9) -#' -#' # Take the strict consensus of all trees for each replicate -#' # (May underestimate support) -#' JackLabels(cons, lapply(jackTrees, ape::consensus)) -#' -#' # Take a single tree from each replicate (here, the first) -#' # Potentially problematic if chosen tree is not representative -#' JackLabels(cons, lapply(jackTrees, `[[`, 1)) -#' -#' # Count iteration as support if all most parsimonious trees support a split; -#' # as contradiction if all trees contradict it; don't include replicates where -#' # not all trees agree on the resolution of a split. -#' labels <- JackLabels(cons, jackTrees) -#' -#' # How many iterations were decisive for each node? -#' attr(labels, "decisive") -#' -#' # Show as proportion of decisive iterations -#' JackLabels(cons, jackTrees, showFrac = TRUE) -#' -#' # Restore graphical parameters -#' par(oPar) -#' } -#' -#' # Tree search with a constraint -#' constraint <- MatrixToPhyDat(c(a = 1, b = 1, c = 0, d = 0, e = 0, f = 0)) -#' characters <- MatrixToPhyDat(matrix( -#' c(0, 1, 1, 1, 0, 0, -#' 1, 1, 1, 0, 0, 0), ncol = 2, -#' dimnames = list(letters[1:6], NULL))) -#' MaximizeParsimony(characters, constraint = constraint, verbosity = 0) -#' #' @template MRS -#' -#' @importFrom cli cli_alert cli_alert_danger cli_alert_info cli_alert_success -#' cli_alert_warning cli_h1 -#' cli_progress_bar cli_progress_done cli_progress_update -#' @importFrom fastmatch fmatch -#' @importFrom stats runif -#' @importFrom TreeTools -#' AddUnconstrained -#' CharacterInformation -#' ConstrainedNJ -#' DropTip -#' ImposeConstraint -#' MakeTreeBinary -#' MatrixToPhyDat -#' NTip +#' @family tree scoring +#' @seealso [`Morphy()`] for fine-grained control over the R-level search loop. +#' [`Resample()`] for jackknife and bootstrap resampling. +#' [`SearchControl()`] for expert-level tuning of the search heuristics. 
#' @references #' \insertAllCited{} -#' @seealso -#' Tree search _via_ graphical user interface: [`EasyTrees()`] -#' +#' @importFrom TreeTools NTip RandomTree Renumber RenumberTips RootTree MakeTreeBinary +#' Preorder +#' @importFrom cli cli_alert_success cli_alert_info cli_alert_warning #' @encoding UTF-8 #' @export -MaximizeParsimony <- function(dataset, tree, - ratchIter = 7L, - tbrIter = 2L, - startIter = 2L, finalIter = 1L, - maxHits = NTip(dataset) * 1.8, - maxTime = 60, - quickHits = 1 / 3, - concavity = Inf, - ratchEW = TRUE, - tolerance = sqrt(.Machine[["double.eps"]]), - constraint, - verbosity = 3L) { - - ### User messaging functions ### - .Message <- function (level, ...) { - if (level < verbosity) { - cli_alert(paste0(...)) +MaximizeParsimony <- function( + dataset, + tree, + concavity = Inf, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + hierarchy = NULL, + inapplicable = "bgs", + hsj_alpha = 1.0, + constraint, + strategy = "auto", + maxReplicates = 96L, + targetHits = max(10L, as.integer(NTip(dataset) / 5)), + maxSeconds = 0, + nThreads = 1L, + verbosity = 1L, + progressCallback = NULL, + control = SearchControl(), + ... +) { + + # --- Backward compatibility: intercept maxTime → maxSeconds --- + dots <- list(...) + if ("maxTime" %in% names(dots)) { + if (missing(maxSeconds) || maxSeconds == 0) { + maxSeconds <- as.double(dots[["maxTime"]]) } + .Deprecated(msg = paste0( + "Use `maxSeconds` instead of `maxTime` in MaximizeParsimony().\n", + " `maxTime` was a Morphy()-style parameter; `maxSeconds` is the ", + "equivalent for the new C++ search engine." + )) + dots[["maxTime"]] <- NULL } - .Heading <- function (text, ...) 
{ - if (0 < verbosity) { - cli_h1(text) - if (length(list(...))) { - cli_alert(paste0(...)) - } - } + + # --- Backward compatibility: detect Morphy()-style parameters --- + .morphyParams <- c("ratchIter", "tbrIter", "startIter", "finalIter", + "maxHits", "quickHits", "ratchEW", + "tolerance") + legacyHits <- intersect(names(dots), .morphyParams) + if (length(legacyHits)) { + .Deprecated( + "Morphy", + msg = paste0( + "Parameter", if (length(legacyHits) > 1L) "s", " ", + paste0(sQuote(legacyHits), collapse = ", "), + " belong", if (length(legacyHits) == 1L) "s", " to `Morphy()`,", + " not the new `MaximizeParsimony()`.\n", + " Delegating to `Morphy()`. ", + "Please update your code to call `Morphy()` directly ", + "or use the new MaximizeParsimony() parameters.\n", + " See ?Morphy and ?MaximizeParsimony for details." + ) + ) + morphyArgs <- dots + morphyArgs$dataset <- dataset + if (!missing(tree) && !is.null(tree)) morphyArgs$tree <- tree + if (!missing(concavity)) morphyArgs$concavity <- concavity + if (!missing(constraint)) morphyArgs$constraint <- constraint + if (!missing(verbosity)) morphyArgs$verbosity <- verbosity + return(do.call(Morphy, morphyArgs)) } - .Info <- function (level, ...) { - if (level < verbosity) { - cli_alert_info(paste0(...)) - } + + # --- Resolve control: merge control + ... overrides --- + # Coerce a plain list to SearchControl + if (!inherits(control, "SearchControl")) { + control <- do.call(SearchControl, control) } - .Success <- function (level, ...) { - if (level < verbosity) { - cli_alert_success(paste0(...)) + + # Named ... args that match SearchControl fields override `control` + controlFields <- names(SearchControl()) + controlDots <- dots[intersect(names(dots), controlFields)] + otherDots <- dots[setdiff(names(dots), controlFields)] + if (length(controlDots)) { + for (nm in names(controlDots)) { + control[[nm]] <- controlDots[[nm]] } } - - ### Tree score functions ### - .EWScore <- function (edge, morphyObj, ...) 
{ - preorder_morphy(edge, morphyObj) - } - - .IWScore <- function (edge, morphyObjs, weight, charSeq, concavity, - minLength, target = Inf) { - morphy_iw(edge, morphyObjs, weight, minLength, charSeq, - concavity, target + epsilon) - } - - # Must have same order of parameters as .IWScore, even though minLength unused - .ProfileScore <- function (edge, morphyObjs, weight, charSeq, profiles, - minLength, target = Inf) { - morphy_profile(edge, morphyObjs, weight, charSeq, profiles, - target + epsilon) - } - - .Score <- function (edge) { - if (length(dim(edge)) == 3L) { - edge <- edge[, , 1] + if (length(otherDots)) { + warning("Unknown arguments ignored: ", + paste0(sQuote(names(otherDots)), collapse = ", ")) + } + + # --- Apply strategy preset --- + if (!is.null(strategy) && !identical(strategy, "none")) { + if (identical(strategy, "auto")) { + strategy <- .AutoStrategy(NTip(dataset), + sum(attr(dataset, "weight"))) } - if (profile) { - .ProfileScore(edge, morphyObjects, startWeights, charSeq, profiles) - } else if (iw) { - .IWScore(edge, morphyObjects, startWeights, charSeq, concavity, minLength) - } else { - preorder_morphy(edge, morphyObj) + preset <- .StrategyPresets()[[strategy]] + if (!is.null(preset)) { + # Determine which control fields the user explicitly set. + # Fields are "explicit" if: + # (a) passed via ... (already merged into control above), OR + # (b) control was explicitly supplied and differs from SearchControl() + defaults <- SearchControl() + explicit_via_dots <- names(controlDots) + explicit_via_control <- if ("control" %in% names(match.call())) { + # User passed control = SearchControl(...) 
— honour all fields in it + names(control) + } else { + character(0) + } + explicit <- union(explicit_via_dots, explicit_via_control) + + # Apply preset values for any field the user didn't explicitly set + for (nm in names(preset)) { + if (!(nm %in% explicit)) { + control[[nm]] <- preset[[nm]] + } + } + if (verbosity >= 1L) { + cli::cli_alert_info("Strategy: {.strong {strategy}}") + } + } else if (!identical(strategy, "auto")) { + warning("Unknown strategy '", strategy, "'; using default parameters.") } } - - ### Tree search functions ### - .TBRSearch <- function (Score, name, - edge, morphyObjs, weight, - tbrIter, maxHits, - minLength = NULL, charSeq = NULL, concavity = NULL) { - - iter <- 0L - nHits <- 1L - hold <- array(NA, dim = c(dim(edge), max(maxHits * 1.1, maxHits + 10L))) - maxHits <- ceiling(maxHits) - hold[, , 1] <- edge - bestScore <- Score(edge, morphyObjs, weight, charSeq, concavity, minLength) - bestPlusEps <- bestScore + epsilon - cli_progress_bar(name, total = maxHits, - auto_terminate = FALSE, - clear = verbosity < 3L, - format_done = paste0(" - TBR rearrangement at depth {iter}", - " found score {signif(bestScore)}", - " {nHits} time{?s}.")) - - while (iter < tbrIter) { - iter <- iter + 1L - brkOptions <- sample(3:(nTip * 2 - 2)) - .Message(4L, " New TBR iteration (depth ", iter, - ", score ", signif(bestScore), ")") - cli_progress_update(set = 0, total = length(brkOptions)) - - for (brk in brkOptions) { - cli_progress_update(1, status = paste0("D", iter, ", score ", - signif(bestScore), ", hit ", - nHits, ".")) - .Message(7L, " Break ", brk) - moves <- TBRMoves(edge, brk) - improvedScore <- FALSE - nMoves <- length(moves) - moveList <- sample.int(nMoves) - for (i in seq_along(moveList)) { - move <- moves[[moveList[i]]] - if (.Forbidden(move)) { - .Message(10L, " Skipping prohibited topology") - next - } - moveScore <- Score(move, morphyObjs, weight, charSeq, concavity, - minLength, bestPlusEps) - if (moveScore < bestPlusEps) { - edge <- move - 
if (moveScore < bestScore) { - improvedScore <- TRUE - iter <- 0L - bestScore <- moveScore - bestPlusEps <- bestScore + epsilon - nHits <- 1L - hold[, , 1] <- edge - .Message(5L, " New best score ", signif(bestScore), - " at break ", fmatch(brk, brkOptions), "/", length(brkOptions)) - break - } else { - .Message(6L, " Best score ", signif(bestScore), - " hit again (", nHits, "/", ceiling(maxHits), ")") - nHits <- nHits + 1L - hold[, , nHits] <- edge - if (nHits >= maxHits) break - } - } - # If an early iteration improves the score, a later iteration will - # probably improve it even more; we may as well keep working through - # the list instead of calculating a new one (which takes time) - if (improvedScore && runif(1) < (i / nMoves) ^ 2) break - } - if (nHits >= maxHits) break - pNextTbr <- (fmatch(brk, brkOptions) / length(brkOptions)) ^ 2 - if (improvedScore && runif(1) < pNextTbr) break + + # --- Progress callback: build default cli bar if needed --- + if (is.null(progressCallback) && verbosity >= 1L && interactive()) { + pb_env <- new.env(parent = environment()) + pb_env$id <- cli::cli_progress_bar( + total = as.integer(maxReplicates), + format = paste0( + "Rep {cli::pb_current}/{cli::pb_total}", + " | Best: {best}", + " | Hits: {hits}/{target}" + ), + .auto_close = FALSE, + .envir = pb_env + ) + pb_env$best <- "?" 
+ pb_env$hits <- 0L + pb_env$target <- as.integer(targetHits) + progressCallback <- function(info) { + pb_env$best <- signif(info$best_score, 6) + pb_env$hits <- info$hits_to_best + pb_env$target <- info$target_hits + if (identical(info$phase, "done")) { + cli::cli_progress_done(id = pb_env$id, .envir = pb_env) + } else if (identical(info$phase, "replicate")) { + cli::cli_progress_update( + id = pb_env$id, set = info$replicate, .envir = pb_env + ) } - if (nHits >= maxHits) break } - cli_progress_done() - - # Return: - unique(hold[, , seq_len(nHits), drop = FALSE], MARGIN = 3L) - - } - - - .Search <- function (name = "TBR search", .edge = edge, .hits = searchHits, - .weight = startWeights, .forceEW = FALSE) { - if (length(dim(.edge)) == 3L) { - .edge <- .edge[, , 1] + on.exit( + tryCatch( + cli::cli_progress_done(id = pb_env$id, .envir = pb_env), + error = function(e) NULL + ), + add = TRUE + ) + } + + # --- Progress file callback (for Shiny background futures) --- + if (is.null(progressCallback)) { + progressFile <- Sys.getenv("TREESEARCH_PROGRESS_FILE", "") + if (nzchar(progressFile)) { + progressCallback <- function(info) { + if (identical(info$phase, "replicate")) { + tryCatch( + writeLines(paste(info$replicate, info$max_replicates, + signif(info$best_score, 8), info$hits_to_best, + info$target_hits), + progressFile), + error = function(e) NULL + ) + } + } } - .Message(4L, paste("<<< Begin:", name)) - on.exit(.Message(4L, paste(">>> Complete:", name))) - if (profile && isFALSE(.forceEW)) { - .TBRSearch(.ProfileScore, name, edge = .edge, morphyObjects, - tbrIter = searchIter, maxHits = .hits, - weight = .weight, minLength = minLength, charSeq = charSeq, - concavity = profiles) - - } else if (iw && isFALSE(.forceEW)) { - .TBRSearch(.IWScore, name, edge = .edge, morphyObjects, - tbrIter = searchIter, maxHits = .hits, - weight = .weight, minLength = minLength, charSeq = charSeq, - concavity = concavity) + } + + # --- Profile parsimony: prepare data --- + useProfile 
<- !missing(concavity) && identical(concavity, "profile") + if (useProfile) { + profileApprox <- if (!is.null(dots[["profile_approx"]])) { + dots[["profile_approx"]] } else { - .TBRSearch(.EWScore, name, edge = .edge, morphyObj, - tbrIter = searchIter, maxHits = .hits, - concavity = if(isTRUE(.forceEW)) Inf else concavity) + "auto" } + dataset <- PrepareDataProfile(dataset, approx = profileApprox) + concavity <- Inf # EW on the simplified binary data; profile scores via lookup + } + + # --- Input validation --- + if (!inherits(dataset, "phyDat")) { + stop("`dataset` must be a phyDat object.") + } + + nTip <- length(dataset) + if (nTip < 4L) { + stop("Need at least 4 taxa for tree search.") + } + if (is.null(attr(dataset, "levels")) || ncol(attr(dataset, "contrast")) == 0L) { + stop("Dataset contains no informative character states.") } - - .Timeout <- function() { - if (Sys.time() > stopTime) { - .Info(1L, "Stopping search at ", .DateTime(), ": ", maxTime, - " minutes have elapsed.", - " Best score was ", signif(.Score(bestEdges[, , 1])), ".", - if (maxTime == 60) "\nIncrease `maxTime` for longer runs.") - return (TRUE) + + # --- Validate inapplicable-handling parameters --- + inapplicable <- tolower(inapplicable) + if (inapplicable == "brazeau") inapplicable <- "bgs" + inapplicable <- match.arg(inapplicable, c("bgs", "hsj", "xform")) + if (inapplicable != "bgs") { + if (is.null(hierarchy)) { + stop("A `hierarchy` is required when inapplicable = \"", inapplicable, + "\". 
See ?CharacterHierarchy.") } - - FALSE - } - - .ReturnValue <- function(bestEdges) { - if (verbosity > 0L) { - cli_alert_success(paste0(.DateTime(), - ": Tree search terminated with score {.strong ", - "{signif(.Score(bestEdges[, , 1]))}}")) + if (!inherits(hierarchy, "CharacterHierarchy")) { + stop("`hierarchy` must be a CharacterHierarchy object.") } - firstHit <- attr(bestEdges, "firstHit") - structure(lapply(seq_len(dim(bestEdges)[3]), function (i) { - tr <- tree - tr[["edge"]] <- bestEdges[, , i] - if (any(is.na(outgroup))) { - tr - } else { - RootTree(tr, outgroup) - } - }), - firstHit = firstHit, - names = paste0(rep(names(firstHit), firstHit), "_", unlist(lapply(firstHit, seq_len))), - class = "multiPhylo") - } - - - # Define constants - epsilon <- tolerance - pNextTbr <- 0.33 - profile <- .UseProfile(concavity) - iw <- is.finite(concavity) - constrained <- !missing(constraint) - startTime <- Sys.time() - stopTime <- startTime + as.difftime(maxTime, units = "mins") - - # Initialize tree - startTrees <- NULL - if (missing(tree)) { - tree <- AdditionTree(dataset, constraint = constraint, - concavity = concavity) + validate_hierarchy(hierarchy, dataset) + if (useProfile) { + stop("Profile parsimony is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + if (is.finite(concavity)) { + stop("Implied weighting is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + # xform validation is done; recoding happens below + } + if (!is.numeric(hsj_alpha) || length(hsj_alpha) != 1L || + hsj_alpha < 0 || hsj_alpha > 1) { + stop("`hsj_alpha` must be a single number in [0, 1].") + } + if (is.finite(concavity) && concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") + } + + # --- Starting tree --- + userTree <- !missing(tree) && !is.null(tree) + if (!userTree) { + tree <- TreeTools::RandomTree(nTip, root = TRUE) + tree[["tip.label"]] <- names(dataset) } 
else if (inherits(tree, "multiPhylo")) { - startTrees <- unique(tree) - sampledTree <- sample.int(length(tree), 1) - .Info(2L, paste0("Starting search from {.var tree[[", sampledTree, "]]}")) - tree <- tree[[sampledTree]] - } else if (inherits(tree, "phylo")) { - startTrees <- c(tree) - } - if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { - cli_alert_warning("`tree` is not bifurcating; collapsing polytomies at random") + tree <- tree[[1L]] + } + if (!inherits(tree, "phylo")) { + stop("`tree` must be of class 'phylo'.") + } + + # Make bifurcating if needed + if (dim(tree[["edge"]])[1] != 2L * tree[["Nnode"]]) { tree <- MakeTreeBinary(tree) - if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { - cli_alert_warning("Rooting `tree` on first leaf") - tree <- RootTree(tree, 1) + if (dim(tree[["edge"]])[1] != 2L * tree[["Nnode"]]) { + tree <- RootTree(tree, 1L) } - if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { + if (dim(tree[["edge"]])[1] != 2L * tree[["Nnode"]]) { stop("Could not make `tree` binary.") } } - - # Check tree labels matches dataset + + # --- Match tree tips to dataset --- leaves <- tree[["tip.label"]] taxa <- names(dataset) - treeOnly <- setdiff(leaves, taxa) - datOnly <- setdiff(taxa, leaves) + treeOnly <- setdiff(leaves, taxa) + datOnly <- setdiff(taxa, leaves) if (length(treeOnly)) { - cli_alert_warning(paste0("Ignoring taxa on tree missing in dataset:\n> ", - paste0(treeOnly, collapse = ", "))) - warning("Ignored taxa on tree missing in dataset:\n ", - paste0(treeOnly, collapse = ", ")) - tree <- DropTip(tree, treeOnly) - startTrees <- DropTip(startTrees, treeOnly) + warning("Dropping taxa on tree but not in dataset: ", + paste0(treeOnly, collapse = ", ")) + tree <- TreeTools::DropTip(tree, treeOnly) } if (length(datOnly)) { - cli_alert_warning(paste0("Ignoring taxa in dataset missing on tree:\n> ", - paste0(datOnly, collapse = ", "))) - warning("Ignored taxa in dataset missing on tree:\n> ", + warning("Dropping taxa in dataset but not on 
tree: ", paste0(datOnly, collapse = ", ")) - dataset <- dataset[-fmatch(datOnly, taxa)] - } - if (constrained) { - if (!inherits(constraint, "phyDat")) { - constraint <- MatrixToPhyDat(t(as.matrix(constraint))) - } - consTaxa <- TipLabels(constraint) - treeOnly <- setdiff(tree[["tip.label"]], consTaxa) - if (length(treeOnly)) { - constraint <- AddUnconstrained(constraint, treeOnly) - } - consOnly <- setdiff(consTaxa, tree[["tip.label"]]) - if (length(consOnly)) { - cli_alert_warning( - paste0("Ignoring taxa in constraint missing on tree:\n> ", - paste0(consOnly, collapse = ", "))) - warning("Ignored taxa in constraint missing on tree:\n ", - paste0(consOnly, collapse = ", ")) - constraint <- constraint[-fmatch(consOnly, consTaxa)] - } - constraint <- constraint[names(dataset)] + dataset <- dataset[-match(datOnly, taxa)] } - - + + # Reorder tips to match dataset, put in preorder tree <- Preorder(RenumberTips(tree, names(dataset))) - nTip <- NTip(tree) - edge <- tree[["edge"]] - - # Initialize constraints - if (constrained) { - morphyConstr <- PhyDat2Morphy(constraint) - on.exit(morphyConstr <- UnloadMorphy(morphyConstr), add = TRUE) - constraintWeight <- attr(constraint, "weight") - if (any(constraintWeight > 1)) { - cli_alert_warning("Some constraints are exact duplicates.") - } - # Calculate constraint minimum score - constraintLength <- sum(MinimumLength(constraint, compress = TRUE) * - constraintWeight) - - .Forbidden <- function (edges) { - preorder_morphy(edges, morphyConstr) != constraintLength - } - - # Check that starting tree is consistent with constraints - if (.Forbidden(edge)) { - cli_alert_warning("Modifying `tree` to match `constraint`...") - outgroup <- edge[ - DescendantEdges(parent = edge[, 1], child = edge[, 2])[1, ], - 2] - outgroup <- outgroup[outgroup <= nTip] - tree <- RootTree(ImposeConstraint(tree, constraint), outgroup) - # RootTree leaves `tree` in preorder - edge <- tree[["edge"]] - if (.Forbidden(edge)) { - stop("Could not reconcile 
starting tree with `constraint`. ", - "Are all constraints compatible?") - } - } - - cli_alert_success(paste0("Initialized ", length(constraintWeight), - " distinct constraints.")) - - } else { - .Forbidden <- function (edges) FALSE - } - - - if (edge[1, 2] > nTip) { - outgroup <- edge[ - DescendantEdges(parent = edge[, 1], child = edge[, 2])[1, ], - 2] - outgroup <- outgroup[outgroup <= nTip] - if (length(outgroup) > nTip / 2L) { - outgroup <- seq_len(nTip)[-outgroup] - } - tree <- RootTree(tree, 1) - edge <- tree[["edge"]] - } else { - outgroup <- NA - } - - # Initialize data - if (profile) { - dataset <- PrepareDataProfile(dataset) - originalLevels <- attr(dataset, "levels") - if ("-" %fin% originalLevels) { - #TODO Fixing this will require updating the counts table cleverly - # Or we could use approximate info amounts, e.g. by treating "-" as - # an extra token - cli_alert_info(paste0("Inapplicable tokens \"-\" treated as ambiguous ", - "\"?\" for profile parsimony")) - cont <- attr(dataset, "contrast") - cont[cont[, "-"] != 0, ] <- 1 - attr(dataset, "contrast") <- cont[, colnames(cont) != "-"] - attr(dataset, "levels") <- originalLevels[originalLevels != "-"] - } - profiles <- attr(dataset, "info.amounts") - } - - if ((!iw && !profile) || # Required for equal weights search - (isTRUE(ratchEW) && ratchIter > 0) # For EW ratchet searches - ) { - morphyObj <- PhyDat2Morphy(dataset) - on.exit(morphyObj <- UnloadMorphy(morphyObj), add = TRUE) - } - - if (iw || profile) { - at <- attributes(dataset) - characters <- PhyToString(dataset, ps = "", useIndex = FALSE, - byTaxon = FALSE, concatenate = FALSE) - startWeights <- at[["weight"]] - minLength <- MinimumLength(dataset, compress = TRUE) - morphyObjects <- lapply(characters, SingleCharMorphy) - on.exit(morphyObjects <- vapply(morphyObjects, UnloadMorphy, integer(1)), - add = TRUE) - - nLevel <- length(at[["level"]]) - nChar <- at[["nr"]] - nTip <- length(dataset) - cont <- at[["contrast"]] - if 
(is.null(colnames(cont))) colnames(cont) <- as.character(at[["levels"]]) - simpleCont <- ifelse(rowSums(cont) == 1, - apply(cont != 0, 1, function (x) colnames(cont)[x][1]), - "?") - - - unlisted <- unlist(dataset, use.names = FALSE) - tokenMatrix <- matrix(simpleCont[unlisted], nChar, nTip) - charInfo <- apply(tokenMatrix, 1, CharacterInformation) - needsInapp <- rowSums(tokenMatrix == "-") > 2 - inappSlowdown <- 3L # A guess - # Crude estimate of score added per unit processing time - rawPriority <- charInfo / ifelse(needsInapp, inappSlowdown, 1) - priority <- startWeights * rawPriority - informative <- needsInapp | charInfo > 0 - # Will work from end of sequence to start. - charSeq <- seq_along(charInfo)[informative][order(priority[informative])] - 1L - } else { - startWeights <- unlist(MorphyWeights(morphyObj)[1, ]) # exact == approx - } - - # Initialize variables and prepare search - - nHits <- 1L - tbrStart <- startIter > 0 - tbrEnd <- finalIter > 0 - if (is.null(startTrees)) { - bestEdges <- edge - dim(bestEdges) <- c(dim(bestEdges), 1) - bestScore <- .Score(edge) - } else { - starters <- RenumberTips(startTrees, names(dataset)) - startEdges <- vapply(lapply(starters, Preorder), - `[[`, startTrees[[1]][["edge"]], - "edge") - startScores <- apply(startEdges, 3, .Score) - bestScore <- min(startScores) - bestEdges <- startEdges[, , startScores == bestScore, drop = FALSE] - } - nStages <- sum(tbrStart, ratchIter, tbrEnd) - attr(bestEdges, "firstHit") <- c("seed" = dim(bestEdges)[3], - setNames(double(nStages), - c(if(tbrStart) "start", - if(ratchIter > 0) paste0("ratch", seq_len(ratchIter)), - if(tbrEnd) "final"))) - - .Heading(paste0("BEGIN TREE SEARCH (k = ", concavity, ")"), - "Initial score: {.strong {signif(bestScore)} }") - - - # Find a local optimum - - if (tbrStart) { - searchIter <- tbrIter * startIter - searchHits <- maxHits - - .Heading("Find local optimum", - " TBR depth ", as.integer(searchIter), - "; keeping max ", as.integer(searchHits), - " 
trees; k = ", concavity, ".") - initialScore <- bestScore - - newEdges <- .Search("TBR search 1") - - newBestScore <- .Score(newEdges) - scoreImproved <- newBestScore + epsilon < bestScore - bestEdges <- if (scoreImproved) { - .ReplaceResults(bestEdges, newEdges, 2) - } else { - .CombineResults(bestEdges, newEdges, 2) - } - if (.Timeout()) { - .Info(1L, .DateTime(), ": Timed out with score ", - signif(min(bestScore, newBestScore))) - return(.ReturnValue(bestEdges)) # nocov - } - edge <- bestEdges[, , 1L] - bestScore <- .Score(edge) - if (bestScore < initialScore) { - .Success(2L, "{.strong New best score: {signif(bestScore)} }") - } else { - .Info(1L, .DateTime(), ": Did not beat initial score: ", - "{signif(bestScore)}") - } + + # Ensure root's first child is a tip (for C++ engine compatibility) + if (tree[["edge"]][1L, 2L] > NTip(tree)) { + tree <- RootTree(tree, 1L) } - - searchIter <- tbrIter - searchHits <- maxHits * quickHits - bestPlusEps <- bestScore + epsilon - - - - # Use Parsimony Ratchet to escape local optimum - - if (ratchIter > 0L) { - - .Heading("Escape local optimum", "{ratchIter} ratchet iterations; ", - "TBR depth {ceiling(searchIter)}; ", - "max. 
{ceiling(searchHits)} hits; ", - "k = {concavity}.") - .Info(1L, "{ .DateTime()}: Score to beat: {.strong {signif(bestScore)}}") - - iter <- 0L - while (iter < ratchIter) { - iter <- iter + 1L - .Message(1L, "Ratchet iteration {iter} @ {(.Time())}", - "; score to beat: {.strong {signif(bestScore)} }") - verbosity <- verbosity - 1L - eachChar <- seq_along(startWeights) - deindexedChars <- rep.int(eachChar, startWeights) - resampling <- tabulate(sample(deindexedChars, replace = TRUE), - length(startWeights)) - if (!isTRUE(ratchEW) && (profile || iw)) { - priority <- resampling * rawPriority - sampled <- informative & resampling > 0 - ratchSeq <- seq_along(charInfo)[sampled][order(priority[sampled])] - 1L - ratchetTrees <- .Search("Bootstrapped search", .weight = resampling) - } else { - errors <- vapply(eachChar, function (i) - mpl_set_charac_weight(i, resampling[i], morphyObj), integer(1)) - if (any(errors)) { # nocov start - stop ("Error resampling morphy object: ", - mpl_translate_error(unique(errors[errors < 0L]))) - } - if (mpl_apply_tipdata(morphyObj) -> error) { - stop("Error applying tip data: ", mpl_translate_error(error)) - } # nocov end - - ratchetTrees <- if (ratchEW) { - .Search("EW Bootstrapped search", .forceEW = TRUE) - } else { - .Search("Bootstrapped search") - } - - errors <- vapply(eachChar, function (i) - mpl_set_charac_weight(i, startWeights[i], morphyObj), integer(1)) - if (any(errors)) stop ("Error resampling morphy object: ", - mpl_translate_error(unique(errors[errors < 0L]))) - if (mpl_apply_tipdata(morphyObj) -> error) { - stop("Error applying tip data: ", mpl_translate_error(error)) - } - } - - verbosity <- verbosity + 1L - ratchetStart <- ratchetTrees[, , sample.int(dim(ratchetTrees)[3], 1)] - ratchStartScore <- .Score(ratchetStart) - .Message(2L, "Obtained new starting tree @ {(.Time())}", - " with score: {signif(ratchStartScore)}") - - # nocov start - if (.Timeout()) { - if (ratchetScore + epsilon < bestScore) { - bestEdges <- 
.ReplaceResults(bestEdges, ratchetStart, - 1 + tbrStart + iter) - } - return(.ReturnValue(bestEdges)) - } - # nocov end - - ratchetImproved <- .Search("TBR search", .edge = ratchetStart, - .hits = maxHits) - ratchetScore <- .Score(ratchetImproved[, , 1]) - - if (ratchetScore < bestPlusEps) { - if (ratchetScore + epsilon < bestScore) { - .Success(2L, "{.strong New best score}: {signif(ratchetScore)}") - bestScore <- ratchetScore - bestPlusEps <- bestScore + epsilon - bestEdges <- .ReplaceResults(bestEdges, ratchetImproved, - 1 + tbrStart + iter) - edge <- ratchetImproved[, , sample.int(dim(ratchetImproved)[3], 1)] - } else { - .Info(3L, "Hit best score {.strong {signif(bestScore)}} again") - - edge <- ratchetImproved[, , sample.int(dim(ratchetImproved)[3], 1)] - bestEdges <- .CombineResults(bestEdges, ratchetImproved, - 1 + tbrStart + iter) - } - } else { - if (3L < verbosity) { - cli_alert_danger("Did not hit best score {signif(bestScore)}") - } - } - if (.Timeout()) { - return(.ReturnValue(bestEdges)) # nocov - } + + # --- Extract data matrices --- + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + weight <- at$weight + levels <- at$levels + + # --- Replicate count adequacy check --- + # Warn only when the user explicitly passed maxReplicates. + # Formula: max(10, ceiling(nTip * nChar / 5000)) where nChar = sum(weight). + # Derived from T-069 benchmarks: at 225 taxa / 748 chars a single rep takes + # ~40s and at least ~34 reps are needed to fill the tree pool reliably. + if (!missing(maxReplicates) && nTip >= 30L && verbosity > 0L) { + nChars <- sum(weight) + minReps <- pmax(10L, ceiling(nTip * nChars / 5000L)) + if (maxReplicates < minReps) { + warning( + "With ", nTip, " taxa and ", nChars, " characters, at least ", + minReps, " replicates are recommended for reliable results ", + "(you specified ", maxReplicates, "). 
", + "Consider increasing `maxReplicates` or setting `maxSeconds` ", + "to allow more search time.", + call. = FALSE + ) } } - - # Branch breaking - if (tbrEnd) { - searchIter <- tbrIter * finalIter - searchHits <- maxHits - - .Heading("Sample local optimum", - "TBR depth {searchIter}; keeping {searchHits}", - " trees; k = {concavity}") - .Info(1L, .DateTime(), ": Score: ", signif(bestScore)) - finalEdges <- .Search("Final search") - newBestScore <- .Score(finalEdges[, , 1]) - improved <- newBestScore + epsilon < bestScore - bestEdges <- if (improved) { - .ReplaceResults(bestEdges, finalEdges, 1 + tbrStart + ratchIter + 1) - } else { - .CombineResults(bestEdges, finalEdges, 1 + tbrStart + ratchIter + 1) + + # --- Prepare constraint for C++ engine --- + consArgs <- .PrepareConstraint( + constraint = if (!missing(constraint)) constraint, + dataset = dataset + ) + if (length(consArgs) > 0L && verbosity > 0L) { + cli_alert_info("Constraint: {nrow(consArgs$consSplitMatrix)} split{?s}") + } + + # --- Profile parsimony: extract info_amounts --- + profileArgs <- list() + if (useProfile) { + infoAmounts <- attr(dataset, "info.amounts") + if (!is.null(infoAmounts) && length(infoAmounts) > 0L) { + profileArgs$infoAmounts <- infoAmounts } } - - # Return: - .ReturnValue(bestEdges) -} -#' Combine two edge matrices -#' -#' @param x,y 3D arrays, each slice containing an edge matrix from a tree -#' of class `phylo`. `x` should not contain duplicates. -#' @return A single 3D array containing each unique edge matrix from (`x` and) -#' `y`, with a `firstHit` attribute as documented in [`MaximizeParsimony()`]. 
-#' @template MRS -#' @keywords internal -.CombineResults <- function (x, y, stage) { - xDim <- dim(x) - if (length(xDim) == 2L) { - xDim <- c(xDim, 1L) - } - if (any(duplicated(x, MARGIN = 3L))) { - warning(".CombineResults(x) should not contain duplicates.") - } - - res <- unique(array(c(x, y), dim = xDim + c(0, 0, dim(y)[3])), MARGIN = 3L) - firstHit <- attr(x, "firstHit") - firstHit[stage] <- dim(res)[3] - xDim[3] - attr(res, "firstHit") <- firstHit - - # Return: - res -} + # --- HSJ: prepare hierarchy data for C++ --- + hsjArgs <- list() + useHSJ <- !is.null(hierarchy) && identical(inapplicable, "hsj") + if (useHSJ) { + hsjArgs$hierarchyBlocks <- hierarchy_to_blocks(hierarchy) + hsjArgs$hsjTipLabels <- build_tip_labels(dataset) + hsjArgs$hsjAlpha <- as.double(hsj_alpha) + # Absent state is typically 0 (first level in reductive coding) + hsjArgs$hsjAbsentState <- 0L -#' @rdname dot-CombineResults -#' @param old old array of edge matrices with `firstHit` attribute. -#' @param new new array of edge matrices. -#' @param stage Integer specifying element of `firstHit` in which new hits -#' should be recorded. 
-#' @keywords internal -.ReplaceResults <- function (old, new, stage) { - hit <- attr(old, "firstHit") - hit[] <- 0 - hit[stage] <- dim(new)[3] - structure(new, "firstHit" = hit) -} + # Adjust weights: subtract hierarchy characters so Fitch scores non-hierarchy + adj_weight <- non_hierarchy_weights(dataset, hierarchy) + weight <- as.integer(adj_weight) + } -.Time <- function() { - format(Sys.time(), "%H:%M:%S") -} + # --- Xform: recode hierarchy into step-matrix characters --- + xformArgs <- list() + useXform <- !is.null(hierarchy) && identical(inapplicable, "xform") + if (useXform) { + recoded <- recode_hierarchy(dataset, hierarchy) + xformArgs$xformChars <- recoded$sankoff_chars -.DateTime <- function() { - format(Sys.time(), "%Y-%m-%d %T") -} + # Adjust weights: subtract hierarchy characters so Fitch scores non-hierarchy + adj_weight <- non_hierarchy_weights(dataset, hierarchy) + weight <- as.integer(adj_weight) + } -#' @rdname MaximizeParsimony -#' -#' @param method Unambiguous abbreviation of `jackknife` or `bootstrap` -#' specifying how to resample characters. Note that jackknife is considered -#' to give more meaningful results. -#' -#' @param proportion Numeric between 0 and 1 specifying what proportion of -#' characters to retain under jackknife resampling. -#' -#' @section Resampling: -#' Note that bootstrap support is a measure of the amount of data supporting -#' a split, rather than the amount of confidence that should be afforded the -#' grouping. -#' "Bootstrap support of 100% is not enough, the tree must also be correct" -#' \insertCite{Phillips2004}{TreeSearch}. -#' See discussion in \insertCite{Egan2006;textual}{TreeSearch}; -#' \insertCite{Wagele2009;textual}{TreeSearch}; -#' \insertCite{Simmons2011}{TreeSearch}; -#' \insertCite{Kumar2012;textual}{TreeSearch}. -#' -#' For a discussion of suitable search parameters in resampling estimates, see -#' \insertCite{Muller2005;textual}{TreeSearch}. 
-#' The user should decide whether to start each resampling -#' from the optimal tree (which may be quicker, but result in overestimated -#' support values as searches get stuck in local optima close to the -#' optimal tree) or a random tree (which may take longer as more rearrangements -#' are necessary to find an optimal tree on each iteration). -#' -#' For other ways to estimate clade concordance, see [`SiteConcordance()`]. -#' -#' @return `Resample()` returns a `multiPhylo` object containing a list of -#' trees obtained by tree search using a resampled version of `dataset`. -#' @family split support functions -#' @encoding UTF-8 -#' @export -Resample <- function(dataset, tree, method = "jack", proportion = 2 / 3, - ratchIter = 1L, tbrIter = 8L, finalIter = 3L, - maxHits = 12L, concavity = Inf, - tolerance = sqrt(.Machine[["double.eps"]]), - constraint, verbosity = 2L, - ...) { - - if (!inherits(dataset, "phyDat")) { - stop("`dataset` must be of class `phyDat`.") - } - - index <- attr(dataset, "index") - kept <- switch(pmatch(tolower(method), c("jackknife", "bootstrap")), - { - nKept <- ceiling(proportion * length(index)) - if (nKept < 1L) { - stop("No characters retained. `proportion` must be positive.") - } - if (nKept == length(index)) { - stop("`proportion` too high; no characters deleted.") - } - sample(index, nKept) - }, { - sample(index, length(index), replace = TRUE) - }) - - if (is.null(kept)) { - stop("`method` must be either \"jackknife\" or \"bootstrap\".") - } - - attr(dataset, "index") <- kept - attr(dataset, "weight") <- vapply(seq_len(attr(dataset, "nr")), - function (x) sum(kept == x), - integer(1)) - - MaximizeParsimony(dataset, tree = tree, - ratchIter = ratchIter, tbrIter = tbrIter, - finalIter = finalIter, - maxHits = maxHits, - concavity = concavity, - tolerance = tolerance, constraint = constraint, - verbosity = verbosity, ...) 
-} + # --- IW: compute minimum step counts per character --- + if (is.finite(concavity)) { + minSteps <- as.integer(MinimumLength(dataset, compress = TRUE)) + } -#' Launch tree search graphical user interface -#' -#' @rdname MaximizeParsimony -#' @importFrom cluster pam silhouette -#' @importFrom future future -#' @importFrom PlotTools SpectrumLegend -#' @importFrom promises future_promise -#' @importFrom protoclust protoclust -#' @importFrom Rogue ColByStability -#' @importFrom shiny runApp -#' @importFrom shinyjs useShinyjs -#' @importFrom TreeDist ClusteringInfoDistance -#' @export -EasyTrees <- function () {#nocov start - shiny::runApp(system.file("Parsimony", package = "TreeSearch")) + # --- XPIWE: compute per-pattern observed-taxa counts --- + useXpiwe <- isTRUE(extended_iw) && is.finite(concavity) && !useProfile + if (useXpiwe) { + obsCount <- .ObsCount(dataset) + } + + # --- Run C++ driven search --- + # searchControl: the resolved SearchControl object (already type-coerced) + # runtimeConfig: session-level params not in SearchControl + runtimeConfig <- list( + maxReplicates = as.integer(maxReplicates), + targetHits = as.integer(targetHits), + maxSeconds = as.double(maxSeconds), + verbosity = as.integer(verbosity), + nThreads = as.integer(nThreads), + startEdge = if (userTree) tree[["edge"]] else NULL, + progressCallback = progressCallback + ) + + # scoringConfig: scoring method params + scoringConfig <- list( + min_steps = if (is.finite(concavity)) minSteps else integer(0), + concavity = as.double(concavity), + xpiwe = useXpiwe, + xpiwe_r = as.double(xpiwe_r), + xpiwe_max_f = as.double(xpiwe_max_f), + obs_count = if (useXpiwe) obsCount else integer(0), + infoAmounts = profileArgs$infoAmounts + ) + + # constraintConfig / hsjConfig / xformConfig: NULL when empty + constraintConfig <- if (length(consArgs) > 0L) consArgs + hsjConfig <- if (length(hsjArgs) > 0L) hsjArgs + xformConfig <- if (length(xformArgs) > 0L) xformArgs + + result <- ts_driven_search( + 
contrast, tip_data, weight, levels, + control, runtimeConfig, scoringConfig, + constraintConfig, hsjConfig, xformConfig + ) + + # --- Reconstruct phylo from edge matrices --- + treeTpl <- tree + treeTpl[["edge.length"]] <- NULL + resultTrees <- result$trees + if (length(resultTrees) == 0L) { + resultTrees <- list() + } + outTrees <- lapply(resultTrees, function(edgeMat) { + tr <- treeTpl + tr[["edge"]] <- edgeMat + # C++ edge order may differ from template; renumber to valid preorder + Renumber(tr) + }) + if (length(outTrees) == 0L) { + outTrees <- list(treeTpl) + } + + # --- Output --- + if (verbosity > 0L) { + total_s <- round(sum(unlist(result$timings), na.rm = TRUE) / 1000, 1) + stop_reason <- if (isTRUE(result$timed_out)) "timeout" + else if (isTRUE(result$consensus_stable)) "consensus stable" + else if (isTRUE(result$perturb_stop)) "perturbation limit" + else "replicate limit" + cli_alert_success(paste0( + "Search complete: score {.strong {signif(result$best_score, 7)}}, ", + "{result$replicates} replicate{?s} ", + "(last improved: #{result$last_improved_rep}), ", + "{result$hits_to_best} hit{?s} to best, ", + "{result$n_topologies} MPT{?s}, ", + "stop: {stop_reason}, {total_s}s" + )) + } + + structure( + outTrees, + score = result$best_score, + replicates = result$replicates, + hits_to_best = result$hits_to_best, + n_topologies = result$n_topologies, + last_improved_rep = result$last_improved_rep, + timed_out = isTRUE(result$timed_out), + consensus_stable = isTRUE(result$consensus_stable), + perturb_stop = isTRUE(result$perturb_stop), + timings = unlist(result$timings), + strategy_diagnostics = result$strategy_diagnostics, + replicate_scores = result$replicate_scores, + class = "multiPhylo" + ) } + #' @rdname MaximizeParsimony +#' @usage MaximizeParsimony2(...) +#' @section Deprecated: +#' `MaximizeParsimony2()` is a deprecated alias for `MaximizeParsimony()`. 
#' @export -EasyTreesy <- EasyTrees -#nocov end - -.UseProfile <- function (concavity) { - pmatch(tolower(concavity), "profile", -1L) == 1L +MaximizeParsimony2 <- function(...) { + .Deprecated("MaximizeParsimony") + MaximizeParsimony(...) } diff --git a/R/Morphy.R b/R/Morphy.R new file mode 100644 index 000000000..606f04347 --- /dev/null +++ b/R/Morphy.R @@ -0,0 +1,1353 @@ +#' Tree search using MorphyLib scoring +#' +#' Search for most parsimonious trees using the parsimony ratchet and +#' \acronym{TBR} rearrangements, scoring with the MorphyLib C library +#' \insertCite{Brazeau2017}{TreeSearch}. +#' Supports equal weights, implied weights, and profile parsimony. +#' Treats inapplicable data using the algorithm of +#' \insertCite{Brazeau2019;textual}{TreeSearch}. +#' +#' For most users, [`MaximizeParsimony()`] provides a faster search using the +#' C++ engine, with native support for equal weights, implied weights, profile +#' parsimony, and topological constraints. +#' `Morphy()` is retained for users who need fine-grained control over the +#' R-level search loop (e.g.\sspace{}custom stopping criteria, per-iteration +#' callbacks, or direct access to MorphyLib scoring). +#' +#' Tree search commences with `ratchIter` iterations of the parsimony ratchet +#' \insertCite{Nixon1999}{TreeSearch}, which bootstraps the input dataset +#' in order to escape local optima. +#' A final round of tree bisection and reconnection (\acronym{TBR}) +#' is conducted to broaden the sampling of trees. +#' +#' This function can be called using the R command line / terminal, or through +#' the "shiny" graphical user interface app (type `EasyTrees()` to launch). +#' +#' The optimal strategy for tree search depends in part on how close to optimal +#' the starting tree is, the size of the search space (which increases +#' super-exponentially with the number of leaves), and the complexity of the +#' search space (e.g. the existence of multiple local optima). 
+#' +#' One possible approach is to employ four phases: +#' +#' 1. Rapid search for local optimum: tree score is typically easy to improve +#' early in a search, because the initial tree is often far from optimal. +#' When many moves are likely to be accepted, running several rounds of search +#' with a low value of `maxHits` and a high value of `tbrIter` allows many +#' trees to be evaluated quickly, hopefully moving quickly to a more promising +#' region of tree space. +#' +#' 2. Identification of local optimum: +#' Once close to a local optimum, a more extensive search +#' with a higher value of `maxHits` allows a region to be explored in more +#' detail. Setting a high value of `tbrIter` will search a local +#' neighbourhood more completely. +#' +#' 3. Search for nearby peaks: +#' Ratchet iterations allow escape from local optima. +#' Setting `ratchIter` to a high value searches the wider neighbourhood more +#' extensively for other nearby peaks; `ratchEW = TRUE` accelerates these +#' exploratory searches. Ratchet iterations can be ineffective when `maxHits` +#' is too low for the search to escape its initial location. +#' +#' 4. Extensive search of final optimum. As with step 2, it may be valuable to +#' fully explore the optimum that is found after ratchet searches to be sure +#' that the locally optimal score has been obtained. Setting a high value of +#' `finalIter` performs a thorough search that can give confidence that further +#' searches would not find better (local) trees. +#' +#' A search is unlikely to have found a global optimum if: +#' +#' - Tree score continues to improve on the final iteration. If a local optimum +#' has not yet been reached, it is unlikely that a global optimum has +#' been reached. +#' Try increasing `maxHits`. +#' +#' - Successive ratchet iterations continue to improve tree scores. 
+#' If a recent ratchet iteration improved the score, rather than finding +#' a different region of tree space with the same optimal score, it is likely +#' that still better global optima remain to be found. Try increasing +#' `ratchIter` (more iterations give more chance for improvement) and +#' `maxHits` (to get closer to the local optimum after each ratchet iteration). +#' +#' - Optimal areas of tree space are only visited by a single ratchet iteration. +#' (See vignette: [Exploring tree space]( +#' https://ms609.github.io/TreeSearch/articles/tree-space.html).) +#' If some areas of tree space are only found by one ratchet iteration, there +#' may well be other, better areas that have not yet been visited. +#' Try increasing `ratchIter`. +#' +#' When continuing a tree search, it is usually best to start from an optimal +#' tree found during the previous iteration - there is no need to start from +#' scratch. +#' +#' A more time consuming way of checking that a global optimum has been reached +#' is to repeat a search with the same parameters multiple times, starting +#' from a different, entirely random tree each time. If all searches obtain the +#' same optimal tree score despite their different starting points, +#' this score is likely to correspond to the global optimum. +#' +#' For detailed documentation of the "TreeSearch" package, including full +#' instructions for loading phylogenetic data into R and initiating and +#' configuring tree search, see the +#' [package documentation](https://ms609.github.io/TreeSearch/). +#' +#' +#' @param dataset A phylogenetic data matrix of \pkg{phangorn} class +#' \code{phyDat}, whose names correspond to the labels of any accompanying tree. +#' Perhaps load into R using \code{\link[TreeTools]{ReadAsPhyDat}()}. +#' Additive (ordered) characters can be handled using +#' \code{\link[TreeTools]{Decompose}()}. 
+#' @param tree (optional) A bifurcating tree of class \code{\link[ape]{phylo}}, +#' containing only the tips listed in `dataset`, from which the search +#' should begin. +#' If unspecified, an [addition tree][AdditionTree()] will be generated from +#' `dataset`, respecting any supplied `constraint`. +#' Edge lengths are not supported and will be deleted. +#' @param ratchIter Numeric specifying number of iterations of the +#' parsimony ratchet \insertCite{Nixon1999}{TreeSearch} to conduct. +#' @param tbrIter Numeric specifying the maximum number of \acronym{TBR} +#' break points on a given tree to evaluate before terminating the search. +#' One "iteration" comprises selecting a branch to break, and evaluating +#' each possible reconnection point in turn until a new tree improves the +#' score. If a better score is found, then the counter is reset to zero, +#' and tree search continues from the improved tree. +#' @param startIter Numeric: an initial round of tree search with +#' `startIter` × `tbrIter` \acronym{TBR} break points is conducted in +#' order to locate a local optimum before beginning ratchet searches. +#' @param finalIter Numeric: a final round of tree search will evaluate +#' `finalIter` × `tbrIter` \acronym{TBR} break points, in order to +#' sample the final optimal neighbourhood more intensely. +#' @param maxHits Numeric specifying the maximum times that an optimal +#' parsimony score may be hit before a ratchet iteration or final +#' search is concluded. +#' @param maxTime Numeric: after `maxTime` minutes, stop tree search at the +#' next opportunity. +#' @param quickHits Numeric: iterations on subsampled datasets +#' will retain `quickHits` × `maxHits` trees with the best score. +#' @param concavity Determines the degree to which extra steps beyond the first +#' are penalized. Specify a numeric value to use implied weighting +#' \insertCite{Goloboff1993}{TreeSearch}; `concavity` specifies _k_ in +#' _k_ / (_e_ + _k_). 
A value of 10 is recommended; +#' TNT sets a default of 3, but this is too low in some circumstances +#' \insertCite{Goloboff2018,Smith2019}{TreeSearch}. +#' Better still explore the sensitivity of results under a range of +#' concavity values, e.g. `k = 2 ^ (1:7)`. +#' Specify `Inf` to weight each additional step equally, +#' (which underperforms step weighting approaches +#' \insertCite{Goloboff2008,Goloboff2018,Goloboff2019,Smith2019}{TreeSearch}). +#' Specify `"profile"` to employ an approximation of profile parsimony +#' \insertCite{Faith2001}{TreeSearch}. +#' @param ratchEW Logical specifying whether to use equal weighting during +#' ratchet iterations, improving search speed whilst still facilitating +#' escape from local optima. +#' @param tolerance Numeric specifying degree of suboptimality to tolerate +#' before rejecting a tree. The default, `sqrt(.Machine$double.eps)`, retains +#' trees that may be equally parsimonious but for rounding errors. +#' Setting to larger values will include trees suboptimal by up to `tolerance` +#' in search results, which may improve the accuracy of the consensus tree +#' (at the expense of resolution) \insertCite{Smith2019}{TreeSearch}. +#' @param constraint Either an object of class `phyDat`, in which case +#' returned trees will be perfectly compatible with each character in +#' `constraint`; or a tree of class `phylo`, all of whose nodes will occur +#' in any output tree. +#' See \code{\link[TreeTools:ImposeConstraint]{ImposeConstraint()}} and +#' [vignette](https://ms609.github.io/TreeSearch/articles/tree-search.html) +#' for further examples. +#' @param verbosity Integer specifying level of messaging; higher values give +#' more detailed commentary on search progress. Set to `0` to run silently. +#' @param \dots Additional parameters to `Morphy()`. +#' +#' @return `Morphy()` returns a list of trees with class +#' `multiPhylo`. 
This lists all trees found during each search step that +#' are within `tolerance` of the optimal score, listed in the sequence that +#' they were first visited, and named according to the step in which they were +#' first found; it may contain more than `maxHits` elements. +#' Note that the default search parameters may need to be increased in order for +#' these trees to be the globally optimal trees; examine the messages printed +#' during tree search to evaluate whether the optimal score has stabilized. +#' +#' The return value has the attribute `firstHit`, a named integer vector listing +#' the number of optimal trees visited for the first time in each stage of +#' the tree search. Stages are named: +#' - `seed`: starting trees; +#' - `start`: Initial TBR search; +#' - `ratchN`: Ratchet iteration `N`; +#' - `final`: Final TBR search. +#' The first tree hit for the first time in ratchet iteration three is named +#' `ratch3_1`. +#' +#' @examples +#' ## Only run examples in interactive R sessions +#' if (interactive()) { +#' # launch "shiny" point-and-click interface +#' EasyTrees() +#' +#' # Here too, use the "continue search" function to ensure that tree score +#' # has stabilized and a global optimum has been found +#' } +#' +#' +#' # Load data for analysis in R +#' library("TreeTools") +#' data("inapplicable.phyData", package = "TreeSearch") +#' dataset <- inapplicable.phyData[["Asher2005"]] +#' +#' \donttest{ +#' # A very quick run for demonstration purposes +#' trees <- Morphy(dataset, ratchIter = 0, startIter = 0, +#' tbrIter = 1, maxHits = 4, maxTime = 1/100, +#' concavity = 10, verbosity = 4) +#' names(trees) +#' cons <- Consensus(trees) +#' } +#' +#' # In actual use, be sure to check that the score has converged on a global +#' # optimum, conducting additional iterations and runs as necessary. 
+#' +#' if (interactive()) { +#' # Jackknife resampling +#' nReplicates <- 10 +#' jackTrees <- replicate(nReplicates, +#' #c() ensures that each replicate returns a list of trees +#' c(Resample(dataset, trees, ratchIter = 0, tbrIter = 2, startIter = 1, +#' maxHits = 5, maxTime = 1 / 10, +#' concavity = 10, verbosity = 0)) +#' ) +#' +#' # In a serious analysis, more replicates would be conducted, and each +#' # search would undergo more iterations. +#' +#' # Now we must decide what to do with the multiple optimal trees from +#' # each replicate. +#' +#' # Set graphical parameters for plotting +#' oPar <- par(mar = rep(0, 4), cex = 0.9) +#' +#' # Take the strict consensus of all trees for each replicate +#' # (May underestimate support) +#' JackLabels(cons, lapply(jackTrees, ape::consensus)) +#' +#' # Take a single tree from each replicate (here, the first) +#' # Potentially problematic if chosen tree is not representative +#' JackLabels(cons, lapply(jackTrees, `[[`, 1)) +#' +#' # Count iteration as support if all most parsimonious trees support a split; +#' # as contradiction if all trees contradict it; don't include replicates where +#' # not all trees agree on the resolution of a split. +#' labels <- JackLabels(cons, jackTrees) +#' +#' # How many iterations were decisive for each node? 
+#' attr(labels, "decisive") +#' +#' # Show as proportion of decisive iterations +#' JackLabels(cons, jackTrees, showFrac = TRUE) +#' +#' # Restore graphical parameters +#' par(oPar) +#' } +#' +#' # Tree search with a constraint +#' constraint <- MatrixToPhyDat(c(a = 1, b = 1, c = 0, d = 0, e = 0, f = 0)) +#' characters <- MatrixToPhyDat(matrix( +#' c(0, 1, 1, 1, 0, 0, +#' 1, 1, 1, 0, 0, 0), ncol = 2, +#' dimnames = list(letters[1:6], NULL))) +#' Morphy(characters, constraint = constraint, verbosity = 0) +#' +#' @template MRS +#' +#' @importFrom cli cli_alert cli_alert_danger cli_alert_info cli_alert_success +#' cli_alert_warning cli_h1 +#' cli_progress_bar cli_progress_done cli_progress_update +#' @importFrom fastmatch fmatch +#' @importFrom stats runif +#' @importFrom TreeTools +#' AddUnconstrained +#' CharacterInformation +#' ConstrainedNJ +#' DropTip +#' ImposeConstraint +#' MakeTreeBinary +#' MatrixToPhyDat +#' NTip +#' @references +#' \insertAllCited{} +#' @seealso +#' [`MaximizeParsimony()`] for the faster C++ driven search engine +#' (recommended for most analyses). +#' +#' Tree search _via_ graphical user interface: [`EasyTrees()`] +#' +#' @encoding UTF-8 +#' @export +Morphy <- function(dataset, tree, + ratchIter = 7L, + tbrIter = 2L, + startIter = 2L, finalIter = 1L, + maxHits = NTip(dataset) * 1.8, + maxTime = 60, + quickHits = 1 / 3, + concavity = Inf, + ratchEW = TRUE, + tolerance = sqrt(.Machine[["double.eps"]]), + constraint, + verbosity = 3L) { + + ### User messaging functions ### + .Message <- function (level, ...) { + if (level < verbosity) { + cli_alert(paste0(...)) + } + } + .Heading <- function (text, ...) { + if (0 < verbosity) { + cli_h1(text) + if (length(list(...))) { + cli_alert(paste0(...)) + } + } + } + .Info <- function (level, ...) { + if (level < verbosity) { + cli_alert_info(paste0(...)) + } + } + .Success <- function (level, ...) 
{ + if (level < verbosity) { + cli_alert_success(paste0(...)) + } + } + + ### Tree score functions ### + .EWScore <- function (edge, morphyObj, ...) { + preorder_morphy(edge, morphyObj) + } + + .IWScore <- function (edge, morphyObjs, weight, charSeq, concavity, + minLength, target = Inf) { + morphy_iw(edge, morphyObjs, weight, minLength, charSeq, + concavity, target + epsilon) + } + + # Must have same order of parameters as .IWScore, even though minLength unused + .ProfileScore <- function (edge, morphyObjs, weight, charSeq, profiles, + minLength, target = Inf) { + morphy_profile(edge, morphyObjs, weight, charSeq, profiles, + target + epsilon) + } + + .Score <- function (edge) { + if (length(dim(edge)) == 3L) { + edge <- edge[, , 1] + } + if (profile) { + .ProfileScore(edge, morphyObjects, startWeights, charSeq, profiles) + } else if (iw) { + .IWScore(edge, morphyObjects, startWeights, charSeq, concavity, minLength) + } else { + preorder_morphy(edge, morphyObj) + } + } + + ### Tree search functions ### + .TBRSearch <- function (Score, name, + edge, morphyObjs, weight, + tbrIter, maxHits, + minLength = NULL, charSeq = NULL, concavity = NULL) { + + iter <- 0L + nHits <- 1L + hold <- array(NA, dim = c(dim(edge), max(maxHits * 1.1, maxHits + 10L))) + maxHits <- ceiling(maxHits) + hold[, , 1] <- edge + bestScore <- Score(edge, morphyObjs, weight, charSeq, concavity, minLength) + bestPlusEps <- bestScore + epsilon + cli_progress_bar(name, total = maxHits, + auto_terminate = FALSE, + clear = verbosity < 3L, + format_done = paste0(" - TBR rearrangement at depth {iter}", + " found score {signif(bestScore)}", + " {nHits} time{?s}.")) + + while (iter < tbrIter) { + iter <- iter + 1L + brkOptions <- sample(3:(nTip * 2 - 2)) + .Message(4L, " New TBR iteration (depth ", iter, + ", score ", signif(bestScore), ")") + cli_progress_update(set = 0, total = length(brkOptions)) + + for (brk in brkOptions) { + cli_progress_update(1, status = paste0("D", iter, ", score ", + 
signif(bestScore), ", hit ", + nHits, ".")) + .Message(7L, " Break ", brk) + moves <- TBRMoves(edge, brk) + improvedScore <- FALSE + nMoves <- length(moves) + moveList <- sample.int(nMoves) + for (i in seq_along(moveList)) { + move <- moves[[moveList[i]]] + if (.Forbidden(move)) { + .Message(10L, " Skipping prohibited topology") + next + } + moveScore <- Score(move, morphyObjs, weight, charSeq, concavity, + minLength, bestPlusEps) + if (moveScore < bestPlusEps) { + edge <- move + if (moveScore < bestScore) { + improvedScore <- TRUE + iter <- 0L + bestScore <- moveScore + bestPlusEps <- bestScore + epsilon + nHits <- 1L + hold[, , 1] <- edge + .Message(5L, " New best score ", signif(bestScore), + " at break ", fmatch(brk, brkOptions), "/", length(brkOptions)) + break + } else { + .Message(6L, " Best score ", signif(bestScore), + " hit again (", nHits, "/", ceiling(maxHits), ")") + nHits <- nHits + 1L + hold[, , nHits] <- edge + if (nHits >= maxHits) break + } + } + # If an early iteration improves the score, a later iteration will + # probably improve it even more; we may as well keep working through + # the list instead of calculating a new one (which takes time) + if (improvedScore && runif(1) < (i / nMoves) ^ 2) break + } + if (nHits >= maxHits) break + pNextTbr <- (fmatch(brk, brkOptions) / length(brkOptions)) ^ 2 + if (improvedScore && runif(1) < pNextTbr) break + } + if (nHits >= maxHits) break + } + cli_progress_done() + + # Return: + unique(hold[, , seq_len(nHits), drop = FALSE], MARGIN = 3L) + + } + + + .Search <- function (name = "TBR search", .edge = edge, .hits = searchHits, + .weight = startWeights, .forceEW = FALSE) { + if (length(dim(.edge)) == 3L) { + .edge <- .edge[, , 1] + } + .Message(4L, paste("<<< Begin:", name)) + on.exit(.Message(4L, paste(">>> Complete:", name))) + if (profile && isFALSE(.forceEW)) { + .TBRSearch(.ProfileScore, name, edge = .edge, morphyObjects, + tbrIter = searchIter, maxHits = .hits, + weight = .weight, minLength = 
minLength, charSeq = charSeq, + concavity = profiles) + + } else if (iw && isFALSE(.forceEW)) { + .TBRSearch(.IWScore, name, edge = .edge, morphyObjects, + tbrIter = searchIter, maxHits = .hits, + weight = .weight, minLength = minLength, charSeq = charSeq, + concavity = concavity) + } else { + .TBRSearch(.EWScore, name, edge = .edge, morphyObj, + tbrIter = searchIter, maxHits = .hits, + concavity = if(isTRUE(.forceEW)) Inf else concavity) + } + } + + .Timeout <- function() { + if (Sys.time() > stopTime) { + .Info(1L, "Stopping search at ", .DateTime(), ": ", maxTime, + " minutes have elapsed.", + " Best score was ", signif(.Score(bestEdges[, , 1])), ".", + if (maxTime == 60) "\nIncrease `maxTime` for longer runs.") + return (TRUE) + } + + FALSE + } + + .ReturnValue <- function(bestEdges) { + if (verbosity > 0L) { + cli_alert_success(paste0(.DateTime(), + ": Tree search terminated with score {.strong ", + "{signif(.Score(bestEdges[, , 1]))}}")) + } + firstHit <- attr(bestEdges, "firstHit") + structure(lapply(seq_len(dim(bestEdges)[3]), function (i) { + tr <- tree + tr[["edge"]] <- bestEdges[, , i] + if (any(is.na(outgroup))) { + tr + } else { + RootTree(tr, outgroup) + } + }), + firstHit = firstHit, + names = paste0(rep(names(firstHit), firstHit), "_", unlist(lapply(firstHit, seq_len))), + class = "multiPhylo") + } + + + # Define constants + epsilon <- tolerance + pNextTbr <- 0.33 + profile <- .UseProfile(concavity) + iw <- is.finite(concavity) + if (iw && concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") + } + constrained <- !missing(constraint) + startTime <- Sys.time() + stopTime <- startTime + as.difftime(maxTime, units = "mins") + + # Initialize tree + startTrees <- NULL + if (missing(tree)) { + tree <- AdditionTree(dataset, constraint = constraint, + concavity = concavity) + } else if (inherits(tree, "multiPhylo")) { + startTrees <- unique(tree) + sampledTree <- 
sample.int(length(tree), 1) + .Info(2L, paste0("Starting search from {.var tree[[", sampledTree, "]]}")) + tree <- tree[[sampledTree]] + } else if (inherits(tree, "phylo")) { + startTrees <- c(tree) + } + if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { + cli_alert_warning("`tree` is not bifurcating; collapsing polytomies at random") + tree <- MakeTreeBinary(tree) + if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { + cli_alert_warning("Rooting `tree` on first leaf") + tree <- RootTree(tree, 1) + } + if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { + stop("Could not make `tree` binary.") + } + } + + # Check tree labels matches dataset + leaves <- tree[["tip.label"]] + taxa <- names(dataset) + treeOnly <- setdiff(leaves, taxa) + datOnly <- setdiff(taxa, leaves) + if (length(treeOnly)) { + cli_alert_warning(paste0("Ignoring taxa on tree missing in dataset:\n> ", + paste0(treeOnly, collapse = ", "))) + warning("Ignored taxa on tree missing in dataset:\n ", + paste0(treeOnly, collapse = ", ")) + tree <- DropTip(tree, treeOnly) + startTrees <- DropTip(startTrees, treeOnly) + } + if (length(datOnly)) { + cli_alert_warning(paste0("Ignoring taxa in dataset missing on tree:\n> ", + paste0(datOnly, collapse = ", "))) + warning("Ignored taxa in dataset missing on tree:\n> ", + paste0(datOnly, collapse = ", ")) + dataset <- dataset[-fmatch(datOnly, taxa)] + } + if (constrained) { + if (!inherits(constraint, "phyDat")) { + constraint <- MatrixToPhyDat(t(as.matrix(constraint))) + } + consTaxa <- TipLabels(constraint) + treeOnly <- setdiff(tree[["tip.label"]], consTaxa) + if (length(treeOnly)) { + constraint <- AddUnconstrained(constraint, treeOnly) + } + consOnly <- setdiff(consTaxa, tree[["tip.label"]]) + if (length(consOnly)) { + cli_alert_warning( + paste0("Ignoring taxa in constraint missing on tree:\n> ", + paste0(consOnly, collapse = ", "))) + warning("Ignored taxa in constraint missing on tree:\n ", + paste0(consOnly, collapse = ", ")) + constraint <- 
constraint[-fmatch(consOnly, consTaxa)] + } + constraint <- constraint[names(dataset)] + } + + + tree <- Preorder(RenumberTips(tree, names(dataset))) + nTip <- NTip(tree) + edge <- tree[["edge"]] + + # Initialize constraints + if (constrained) { + morphyConstr <- PhyDat2Morphy(constraint) + on.exit(morphyConstr <- UnloadMorphy(morphyConstr), add = TRUE) + constraintWeight <- attr(constraint, "weight") + if (any(constraintWeight > 1)) { + cli_alert_warning("Some constraints are exact duplicates.") + } + # Calculate constraint minimum score + constraintLength <- sum(MinimumLength(constraint, compress = TRUE) * + constraintWeight) + + .Forbidden <- function (edges) { + preorder_morphy(edges, morphyConstr) != constraintLength + } + + # Check that starting tree is consistent with constraints + if (.Forbidden(edge)) { + cli_alert_warning("Modifying `tree` to match `constraint`...") + outgroup <- edge[ + DescendantEdges(parent = edge[, 1], child = edge[, 2])[1, ], + 2] + outgroup <- outgroup[outgroup <= nTip] + tree <- RootTree(ImposeConstraint(tree, constraint), outgroup) + # RootTree leaves `tree` in preorder + edge <- tree[["edge"]] + if (.Forbidden(edge)) { + stop("Could not reconcile starting tree with `constraint`. 
", + "Are all constraints compatible?") + } + } + + cli_alert_success(paste0("Initialized ", length(constraintWeight), + " distinct constraints.")) + + } else { + .Forbidden <- function (edges) FALSE + } + + + if (edge[1, 2] > nTip) { + outgroup <- edge[ + DescendantEdges(parent = edge[, 1], child = edge[, 2])[1, ], + 2] + outgroup <- outgroup[outgroup <= nTip] + if (length(outgroup) > nTip / 2L) { + outgroup <- seq_len(nTip)[-outgroup] + } + tree <- RootTree(tree, 1) + edge <- tree[["edge"]] + } else { + outgroup <- NA + } + + # Initialize data + if (profile) { + dataset <- PrepareDataProfile(dataset) + originalLevels <- attr(dataset, "levels") + if ("-" %fin% originalLevels) { + #TODO Fixing this will require updating the counts table cleverly + # Or we could use approximate info amounts, e.g. by treating "-" as + # an extra token + cli_alert_info(paste0("Inapplicable tokens \"-\" treated as ambiguous ", + "\"?\" for profile parsimony")) + cont <- attr(dataset, "contrast") + cont[cont[, "-"] != 0, ] <- 1 + attr(dataset, "contrast") <- cont[, colnames(cont) != "-"] + attr(dataset, "levels") <- originalLevels[originalLevels != "-"] + } + profiles <- attr(dataset, "info.amounts") + } + + if ((!iw && !profile) || # Required for equal weights search + (isTRUE(ratchEW) && ratchIter > 0) # For EW ratchet searches + ) { + morphyObj <- PhyDat2Morphy(dataset) + on.exit(morphyObj <- UnloadMorphy(morphyObj), add = TRUE) + } + + if (iw || profile) { + at <- attributes(dataset) + characters <- PhyToString(dataset, ps = "", useIndex = FALSE, + byTaxon = FALSE, concatenate = FALSE) + startWeights <- at[["weight"]] + minLength <- MinimumLength(dataset, compress = TRUE) + morphyObjects <- lapply(characters, SingleCharMorphy) + on.exit(morphyObjects <- vapply(morphyObjects, UnloadMorphy, integer(1)), + add = TRUE) + + nLevel <- length(at[["level"]]) + nChar <- at[["nr"]] + nTip <- length(dataset) + cont <- at[["contrast"]] + if (is.null(colnames(cont))) colnames(cont) <- 
as.character(at[["levels"]]) + simpleCont <- ifelse(rowSums(cont) == 1, + apply(cont != 0, 1, function (x) colnames(cont)[x][1]), + "?") + + + unlisted <- unlist(dataset, use.names = FALSE) + tokenMatrix <- matrix(simpleCont[unlisted], nChar, nTip) + charInfo <- apply(tokenMatrix, 1, CharacterInformation) + needsInapp <- rowSums(tokenMatrix == "-") > 2 + inappSlowdown <- 3L # A guess + # Crude estimate of score added per unit processing time + rawPriority <- charInfo / ifelse(needsInapp, inappSlowdown, 1) + priority <- startWeights * rawPriority + informative <- needsInapp | charInfo > 0 + # Will work from end of sequence to start. + charSeq <- seq_along(charInfo)[informative][order(priority[informative])] - 1L + } else { + startWeights <- unlist(MorphyWeights(morphyObj)[1, ]) # exact == approx + } + + # Initialize variables and prepare search + + nHits <- 1L + tbrStart <- startIter > 0 + tbrEnd <- finalIter > 0 + if (is.null(startTrees)) { + bestEdges <- edge + dim(bestEdges) <- c(dim(bestEdges), 1) + bestScore <- .Score(edge) + } else { + starters <- RenumberTips(startTrees, names(dataset)) + startEdges <- vapply(lapply(starters, Preorder), + `[[`, startTrees[[1]][["edge"]], + "edge") + startScores <- apply(startEdges, 3, .Score) + bestScore <- min(startScores) + bestEdges <- startEdges[, , startScores == bestScore, drop = FALSE] + } + nStages <- sum(tbrStart, ratchIter, tbrEnd) + attr(bestEdges, "firstHit") <- c("seed" = dim(bestEdges)[3], + setNames(double(nStages), + c(if(tbrStart) "start", + if(ratchIter > 0) paste0("ratch", seq_len(ratchIter)), + if(tbrEnd) "final"))) + + .Heading(paste0("BEGIN TREE SEARCH (k = ", concavity, ")"), + "Initial score: {.strong {signif(bestScore)} }") + + + # Find a local optimum + + if (tbrStart) { + searchIter <- tbrIter * startIter + searchHits <- maxHits + + .Heading("Find local optimum", + " TBR depth ", as.integer(searchIter), + "; keeping max ", as.integer(searchHits), + " trees; k = ", concavity, ".") + initialScore <- 
bestScore + + newEdges <- .Search("TBR search 1") + + newBestScore <- .Score(newEdges) + scoreImproved <- newBestScore + epsilon < bestScore + bestEdges <- if (scoreImproved) { + .ReplaceResults(bestEdges, newEdges, 2) + } else { + .CombineResults(bestEdges, newEdges, 2) + } + if (.Timeout()) { + .Info(1L, .DateTime(), ": Timed out with score ", + signif(min(bestScore, newBestScore))) + return(.ReturnValue(bestEdges)) # nocov + } + edge <- bestEdges[, , 1L] + bestScore <- .Score(edge) + if (bestScore < initialScore) { + .Success(2L, "{.strong New best score: {signif(bestScore)} }") + } else { + .Info(1L, .DateTime(), ": Did not beat initial score: ", + "{signif(bestScore)}") + } + } + + searchIter <- tbrIter + searchHits <- maxHits * quickHits + bestPlusEps <- bestScore + epsilon + + + + # Use Parsimony Ratchet to escape local optimum + + if (ratchIter > 0L) { + + .Heading("Escape local optimum", "{ratchIter} ratchet iterations; ", + "TBR depth {ceiling(searchIter)}; ", + "max. {ceiling(searchHits)} hits; ", + "k = {concavity}.") + .Info(1L, "{ .DateTime()}: Score to beat: {.strong {signif(bestScore)}}") + + iter <- 0L + while (iter < ratchIter) { + iter <- iter + 1L + .Message(1L, "Ratchet iteration {iter} @ {(.Time())}", + "; score to beat: {.strong {signif(bestScore)} }") + verbosity <- verbosity - 1L + eachChar <- seq_along(startWeights) + deindexedChars <- rep.int(eachChar, startWeights) + resampling <- tabulate(sample(deindexedChars, replace = TRUE), + length(startWeights)) + if (!isTRUE(ratchEW) && (profile || iw)) { + priority <- resampling * rawPriority + sampled <- informative & resampling > 0 + ratchSeq <- seq_along(charInfo)[sampled][order(priority[sampled])] - 1L + ratchetTrees <- .Search("Bootstrapped search", .weight = resampling) + } else { + errors <- vapply(eachChar, function (i) + mpl_set_charac_weight(i, resampling[i], morphyObj), integer(1)) + if (any(errors)) { # nocov start + stop ("Error resampling morphy object: ", + 
mpl_translate_error(unique(errors[errors < 0L])))
+        }
+        if (mpl_apply_tipdata(morphyObj) -> error) {
+          stop("Error applying tip data: ", mpl_translate_error(error))
+        } # nocov end
+
+        # Search under the resampled character weights
+        ratchetTrees <- if (ratchEW) {
+          .Search("EW Bootstrapped search", .forceEW = TRUE)
+        } else {
+          .Search("Bootstrapped search")
+        }
+
+        # Restore the original character weights before scoring any tree
+        errors <- vapply(eachChar, function (i)
+          mpl_set_charac_weight(i, startWeights[i], morphyObj), integer(1))
+        if (any(errors)) stop ("Error resampling morphy object: ",
+                               mpl_translate_error(unique(errors[errors < 0L])))
+        if (mpl_apply_tipdata(morphyObj) -> error) {
+          stop("Error applying tip data: ", mpl_translate_error(error))
+        }
+      }
+
+      verbosity <- verbosity + 1L
+      # Pick one tree from the perturbed search to seed the next TBR search;
+      # `[, , i]` drops the tree dimension, leaving a plain edge matrix
+      ratchetStart <- ratchetTrees[, , sample.int(dim(ratchetTrees)[3], 1)]
+      ratchStartScore <- .Score(ratchetStart)
+      .Message(2L, "Obtained new starting tree @ {(.Time())}",
+               " with score: {signif(ratchStartScore)}")
+
+      # nocov start
+      if (.Timeout()) {
+        # Only the starting tree has been scored so far in this iteration:
+        # `ratchetScore` is not assigned until the TBR search below has run,
+        # so compare `ratchStartScore` (the score of `ratchetStart`) instead.
+        if (ratchStartScore + epsilon < bestScore) {
+          # .ReplaceResults() and .ReturnValue() read the third (tree)
+          # dimension, so restore it before storing the single edge matrix
+          bestEdges <- .ReplaceResults(
+            bestEdges,
+            array(ratchetStart, dim = c(dim(ratchetStart), 1L)),
+            1 + tbrStart + iter)
+        }
+        return(.ReturnValue(bestEdges))
+      }
+      # nocov end
+
+      # Climb from the perturbed tree under the original weights
+      ratchetImproved <- .Search("TBR search", .edge = ratchetStart,
+                                 .hits = maxHits)
+      ratchetScore <- .Score(ratchetImproved[, , 1])
+
+      if (ratchetScore < bestPlusEps) {
+        if (ratchetScore + epsilon < bestScore) {
+          # Strictly better than anything seen so far: discard earlier trees
+          .Success(2L, "{.strong New best score}: {signif(ratchetScore)}")
+          bestScore <- ratchetScore
+          bestPlusEps <- bestScore + epsilon
+          bestEdges <- .ReplaceResults(bestEdges, ratchetImproved,
+                                       1 + tbrStart + iter)
+          edge <- ratchetImproved[, , sample.int(dim(ratchetImproved)[3], 1)]
+        } else {
+          # Within `epsilon` of the best score: add any new trees to the set
+          .Info(3L, "Hit best score {.strong {signif(bestScore)}} again")
+
+          edge <- ratchetImproved[, , sample.int(dim(ratchetImproved)[3], 1)]
+          bestEdges <- .CombineResults(bestEdges, ratchetImproved,
+                                       1 + tbrStart + iter)
+        }
+      } else {
+        if (3L < verbosity) {
+          cli_alert_danger("Did not hit best score {signif(bestScore)}")
+        }
+      }
+      if (.Timeout()) {
+
return(.ReturnValue(bestEdges)) # nocov
+      }
+    }
+  }
+
+  # Branch breaking
+  if (tbrEnd) {
+    searchIter <- tbrIter * finalIter
+    searchHits <- maxHits
+
+    .Heading("Sample local optimum",
+             "TBR depth {searchIter}; keeping {searchHits}",
+             " trees; k = {concavity}")
+    .Info(1L, .DateTime(), ": Score: ", signif(bestScore))
+    finalEdges <- .Search("Final search")
+    newBestScore <- .Score(finalEdges[, , 1])
+    improved <- newBestScore + epsilon < bestScore
+    bestEdges <- if (improved) {
+      .ReplaceResults(bestEdges, finalEdges, 1 + tbrStart + ratchIter + 1)
+    } else {
+      .CombineResults(bestEdges, finalEdges, 1 + tbrStart + ratchIter + 1)
+    }
+  }
+
+  # Return:
+  .ReturnValue(bestEdges)
+}
+
+#' Combine two edge matrices
+#'
+#' @param x,y 3D arrays, each slice containing an edge matrix from a tree
+#' of class `phylo`. `x` should not contain duplicates.
+#' A single (2D) edge matrix is treated as an array containing one tree.
+#' `y` is stacked behind `x`, so their first two dimensions must agree.
+#' @return A single 3D array containing each unique edge matrix from (`x` and)
+#' `y`, with a `firstHit` attribute as documented in [`Morphy()`].
+#' @template MRS
+#' @keywords internal
+.CombineResults <- function (x, y, stage) {
+  # Read the hit record before touching the dimensions of `x`
+  firstHit <- attr(x, "firstHit")
+
+  # Promote a lone edge matrix to a one-tree stack, so that `MARGIN = 3L`
+  # and `dim()[3]` below are valid for both inputs
+  if (length(dim(x)) == 2L) {
+    dim(x) <- c(dim(x), 1L)
+  }
+  if (length(dim(y)) == 2L) {
+    dim(y) <- c(dim(y), 1L)
+  }
+  xDim <- dim(x)
+
+  if (any(duplicated(x, MARGIN = 3L))) {
+    warning(".CombineResults(x) should not contain duplicates.")
+  }
+
+  # Stack `y` behind `x`, then drop repeated trees (first occurrence is kept)
+  res <- unique(array(c(x, y), dim = xDim + c(0, 0, dim(y)[3])), MARGIN = 3L)
+  # Whatever remains beyond the trees of `x` was first encountered in `stage`
+  firstHit[stage] <- dim(res)[3] - xDim[3]
+  attr(res, "firstHit") <- firstHit
+
+  # Return:
+  res
+}
+
+#' @rdname dot-CombineResults
+#' @param old old array of edge matrices with `firstHit` attribute.
+#' @param new new array of edge matrices.
+#' @param stage Integer specifying element of `firstHit` in which new hits
+#' should be recorded.
+#' @keywords internal
+.ReplaceResults <- function (old, new, stage) {
+  # `new[, , i]` style subsetting drops the tree dimension; promote a lone
+  # edge matrix to a one-tree stack so that `dim(new)[3]` is never `NA` and
+  # the result can still be indexed as `[, , i]`
+  if (length(dim(new)) == 2L) {
+    dim(new) <- c(dim(new), 1L)
+  }
+  # Every tree in `new` counts as first hit in `stage`; earlier hits are
+  # discarded along with the trees they referred to
+  hit <- attr(old, "firstHit")
+  hit[] <- 0
+  hit[stage] <- dim(new)[3]
+  structure(new, "firstHit" = hit)
+}
+
+# Current time of day, formatted HH:MM:SS, for progress messages
+.Time <- function() {
+  format(Sys.time(), "%H:%M:%S")
+}
+
+# Current date and time, formatted YYYY-MM-DD HH:MM:SS, for progress messages
+.DateTime <- function() {
+  format(Sys.time(), "%Y-%m-%d %T")
+}
+
+# Hierarchy-aware resampling: generates hierarchical weights per replicate
+# and calls ts_driven_search with HSJ/xform scoring.
+# This is an internal helper called from Resample() when inapplicable != "bgs".
+.ResampleHierarchy <- function(dataset, hierarchy, inapplicable, hsj_alpha,
+                               method_idx, proportion, nReplicates,
+                               contrast, tip_data, weight, levels, nTip,
+                               concavity, ratchIter, tbrIter,
+                               consArgs, profileArgs, tree) {
+  bootstrap <- (method_idx == 2L)
+
+  # Prepare full HSJ args (before resampling)
+  hsjBase <- list()
+  if (identical(inapplicable, "hsj")) {
+    # Get flat blocks grouped by top-level block
+    .FlattenOneTop <- function(node) {
+      block <- list(
+        primary = node$controlling - 1L,
+        secondaries = node$dependents - 1L
+      )
+      child_blocks <- lapply(node$children, .FlattenOneTop)
+      c(list(block), unlist(child_blocks, recursive = FALSE))
+    }
+    hsjBase$blocks_per_top <- lapply(hierarchy, .FlattenOneTop)
+    hsjBase$hsjTipLabels <- build_tip_labels(dataset)
+    hsjBase$hsjAlpha <- as.double(hsj_alpha)
+    hsjBase$hsjAbsentState <- 0L
+  }
+
+  # Prepare full xform args (before resampling)
+  xformBase <- list()
+  if (identical(inapplicable, "xform")) {
+    recoded <- recode_hierarchy(dataset, hierarchy)
+    xformBase$all_chars <- recoded$sankoff_chars
+  }
+
+  # Driven search params for resampling context (light search per replicate)
+  resampleControl <- SearchControl(
+    tbrMaxHits = as.integer(max(tbrIter, 1L)),
+    ratchetCycles = as.integer(max(ratchIter, 3L)),
+    driftCycles = 0L,
+    xssRounds = 0L,
+    rssRounds = 0L,
+    cssRounds = 0L,
+    fuseInterval = 0L,
+    poolMaxSize = 1L,
+    poolSuboptimal = 0.0
+  )
+  resampleRuntime <- list(
+    maxReplicates =
as.integer(max(ratchIter, 5L)), + targetHits = 2L, + maxSeconds = 0.0, + verbosity = 0L, + nThreads = 1L, + startEdge = NULL, + progressCallback = NULL + ) + resampleScoring <- list( + min_steps = integer(0), + concavity = as.double(concavity), + xpiwe = FALSE, + xpiwe_r = 0.5, + xpiwe_max_f = 5.0, + obs_count = integer(0), + infoAmounts = profileArgs$infoAmounts + ) + + trees <- vector("list", nReplicates) + for (r in seq_len(nReplicates)) { + resamp <- .HierarchicalResampleWeights( + dataset, hierarchy, bootstrap, proportion + ) + + # Build per-replicate hierarchy args based on retained blocks + repHsj <- list() + repXform <- list() + + if (identical(inapplicable, "hsj")) { + # Expand retained flat blocks (supports bootstrap: block sampled >1 time) + rep_blocks <- list() + for (bi in seq_along(resamp$block_counts)) { + if (resamp$block_counts[bi] > 0L) { + top_blocks <- hsjBase$blocks_per_top[[bi]] + for (k in seq_len(resamp$block_counts[bi])) { + rep_blocks <- c(rep_blocks, top_blocks) + } + } + } + repHsj$hierarchyBlocks <- rep_blocks + repHsj$hsjTipLabels <- hsjBase$hsjTipLabels + repHsj$hsjAlpha <- hsjBase$hsjAlpha + repHsj$hsjAbsentState <- hsjBase$hsjAbsentState + } + + if (identical(inapplicable, "xform")) { + rep_xf <- list() + for (bi in seq_along(resamp$block_counts)) { + if (resamp$block_counts[bi] > 0L) { + for (k in seq_len(resamp$block_counts[bi])) { + rep_xf <- c(rep_xf, list(xformBase$all_chars[[bi]])) + } + } + } + repXform$xformChars <- rep_xf + } + + # Call ts_driven_search with resampled weights + constraintCfg <- if (length(consArgs) > 0L) consArgs + hsjCfg <- if (length(repHsj) > 0L) repHsj + xformCfg <- if (length(repXform) > 0L) repXform + + result <- ts_driven_search( + contrast, tip_data, + as.integer(resamp$non_hierarchy_weights), levels, + resampleControl, resampleRuntime, resampleScoring, + constraintCfg, hsjCfg, xformCfg + ) + + # Extract best tree + if (result$pool_size > 0L && length(result$trees) > 0L) { + tr <- structure( + 
list(edge = result$trees[[1L]], + tip.label = names(dataset), + Nnode = nTip - 1L), + class = "phylo" + ) + attr(tr, "score") <- result$best_score + } else { + tr <- if (!is.null(tree) && inherits(tree, "phylo")) tree + else AdditionTree(dataset) + attr(tr, "score") <- result$best_score + } + trees[[r]] <- tr + } + + structure(trees, class = "multiPhylo") +} + + +#' @rdname Morphy +#' +#' @param method Unambiguous abbreviation of `jackknife` or `bootstrap` +#' specifying how to resample characters. Note that jackknife is considered +#' to give more meaningful results. +#' +#' @param proportion Numeric between 0 and 1 specifying what proportion of +#' characters to retain under jackknife resampling. +#' +#' @section Resampling: +#' Note that bootstrap support is a measure of the amount of data supporting +#' a split, rather than the amount of confidence that should be afforded the +#' grouping. +#' "Bootstrap support of 100% is not enough, the tree must also be correct" +#' \insertCite{Phillips2004}{TreeSearch}. +#' See discussion in \insertCite{Egan2006;textual}{TreeSearch}; +#' \insertCite{Wagele2009;textual}{TreeSearch}; +#' \insertCite{Simmons2011}{TreeSearch}; +#' \insertCite{Kumar2012;textual}{TreeSearch}. +#' +#' For a discussion of suitable search parameters in resampling estimates, see +#' \insertCite{Muller2005;textual}{TreeSearch}. +#' The user should decide whether to start each resampling +#' from the optimal tree (which may be quicker, but result in overestimated +#' support values as searches get stuck in local optima close to the +#' optimal tree) or a random tree (which may take longer as more rearrangements +#' are necessary to find an optimal tree on each iteration). +#' +#' For other ways to estimate clade concordance, see [`SiteConcordance()`]. +#' +#' @param nReplicates Integer specifying how many resample replicates to run. +#' Default `1L` runs a single replicate (original behaviour). 
+#' When `> 1`, all replicates are run in a single call, optionally in parallel. +#' @param nThreads Integer specifying the number of threads for parallel +#' resampling. Default `1L` runs serially. Use `0L` for auto-detect. +#' Only effective when `nReplicates > 1`. +#' @param hierarchy A [`CharacterHierarchy`] object specifying which characters +#' are controlled by which primary characters. Required when +#' `inapplicable` is `"hsj"` or `"xform"`. When provided, resampling +#' operates on "units" rather than individual characters: each non-hierarchy +#' character is one unit, and each top-level hierarchy block (primary + +#' all dependents) is one unit. See [`CharacterHierarchy()`]. +#' @param inapplicable Character string specifying the inapplicable-character +#' handling method: `"bgs"` (default), `"hsj"`, or `"xform"`. +#' Case-insensitive; `"brazeau"` is accepted as an alias for `"bgs"`. +#' See [`MaximizeParsimony()`] and `vignette("inapplicable")` for details. +#' @param hsj_alpha Numeric in \[0, 1\] controlling the weight of secondary +#' character variation in HSJ scoring. Default `1.0`. Only used when +#' `inapplicable = "hsj"`. +#' @param extended_iw Logical; if `TRUE` (default), use extended implied +#' weighting (XPIWE; \insertCite{Goloboff2014;textual}{TreeSearch}), +#' which adjusts per-character concavity for missing entries. +#' Ignored when `concavity = Inf` or `"profile"`. +#' @param xpiwe_r Numeric; proportion of homoplasy assumed in missing entries. +#' Default `0.5`. Only used when `extended_iw = TRUE`. +#' @param xpiwe_max_f Numeric; maximum extrapolation factor. +#' Default `5`. Only used when `extended_iw = TRUE`. +#' +#' @return `Resample()` returns a `multiPhylo` object containing one best tree +#' per resample replicate. 
+#' @family split support functions +#' @encoding UTF-8 +#' @export +Resample <- function(dataset, tree, method = "jack", proportion = 2 / 3, + ratchIter = 1L, tbrIter = 8L, finalIter = 3L, + maxHits = 12L, concavity = Inf, + tolerance = sqrt(.Machine[["double.eps"]]), + constraint, verbosity = 2L, + nReplicates = 1L, nThreads = 1L, + hierarchy = NULL, inapplicable = "bgs", + hsj_alpha = 1.0, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + ...) { + + if (!inherits(dataset, "phyDat")) { + stop("`dataset` must be of class `phyDat`.") + } + + method_idx <- pmatch(tolower(method), c("jackknife", "bootstrap")) + if (is.na(method_idx)) { + stop("`method` must be either \"jackknife\" or \"bootstrap\".") + } + + nReplicates <- as.integer(max(nReplicates, 1L)) + nThreads <- as.integer(max(nThreads, 1L)) + + # Validate proportion for jackknife + index <- attr(dataset, "index") + if (method_idx == 1L) { + nKept <- ceiling(proportion * length(index)) + if (nKept < 1L) { + stop("No characters retained. `proportion` must be positive.") + } + if (nKept == length(index)) { + stop("`proportion` too high; no characters deleted.") + } + } + + # --- Validate inapplicable-handling parameters --- + inapplicable <- tolower(inapplicable) + if (inapplicable == "brazeau") inapplicable <- "bgs" + inapplicable <- match.arg(inapplicable, c("bgs", "hsj", "xform")) + if (inapplicable != "bgs") { + if (is.null(hierarchy)) { + stop("A `hierarchy` is required when inapplicable = \"", inapplicable, + "\". 
See ?CharacterHierarchy.") + } + if (!inherits(hierarchy, "CharacterHierarchy")) { + stop("`hierarchy` must be a CharacterHierarchy object.") + } + validate_hierarchy(hierarchy, dataset) + } + if (!is.numeric(hsj_alpha) || length(hsj_alpha) != 1L || + hsj_alpha < 0 || hsj_alpha > 1) { + stop("`hsj_alpha` must be a single number in [0, 1].") + } + + # Profile parsimony: prepare data + useProfile <- identical(concavity, "profile") + if (useProfile) { + if (inapplicable != "bgs") { + stop("Profile parsimony is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + dataset <- PrepareDataProfile(dataset) + concavity <- Inf + } + if (is.finite(concavity) && inapplicable != "bgs") { + stop("Implied weighting is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + if (is.finite(concavity) && concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") + } + + # C++ engine path + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + weight <- at$weight + levels <- at$levels + nTip <- length(dataset) + + # Prepare constraint + consArgs <- .PrepareConstraint( + constraint = if (!missing(constraint)) constraint, + dataset = dataset + ) + + # Profile parsimony: extract info_amounts + profileArgs <- list() + if (useProfile) { + infoAmounts <- attr(dataset, "info.amounts") + if (!is.null(infoAmounts) && length(infoAmounts) > 0L) { + profileArgs$infoAmounts <- infoAmounts + } + } + + # --- Hierarchy-aware resampling path --- + # When inapplicable != "bgs", resample at the unit level (free chars + + # hierarchy blocks) and run driven_search per replicate with HSJ/xform + # scoring. 
+ if (inapplicable != "bgs" && !is.null(hierarchy)) { + return(.ResampleHierarchy( + dataset = dataset, hierarchy = hierarchy, inapplicable = inapplicable, + hsj_alpha = hsj_alpha, method_idx = method_idx, proportion = proportion, + nReplicates = nReplicates, + contrast = contrast, tip_data = tip_data, weight = weight, + levels = levels, nTip = nTip, concavity = concavity, + ratchIter = ratchIter, tbrIter = tbrIter, + consArgs = consArgs, profileArgs = profileArgs, + tree = if (!missing(tree)) tree else NULL + )) + } + + # XPIWE: compute per-pattern observed-taxa counts + useXpiwe <- isTRUE(extended_iw) && is.finite(concavity) && !useProfile + if (useXpiwe) { + obsCount <- .ObsCount(dataset) + } + + searchArgs <- list( + contrast = contrast, + tip_data = tip_data, + weight = weight, + levels = levels, + bootstrap = (method_idx == 2L), + jackProportion = proportion, + maxReplicates = as.integer(max(ratchIter, 5L)), + targetHits = 2L, + tbrMaxHits = as.integer(max(tbrIter, 1L)), + ratchetCycles = as.integer(max(ratchIter, 3L)), + min_steps = if (is.finite(concavity)) + as.integer(MinimumLength(dataset, compress = TRUE)) else integer(0), + concavity = as.double(concavity), + xpiwe = useXpiwe, + xpiwe_r = as.double(xpiwe_r), + xpiwe_max_f = as.double(xpiwe_max_f), + obs_count = if (useXpiwe) obsCount else integer(0) + ) + + if (nReplicates > 1L) { + # Batch mode: run all replicates at once (optionally in parallel) + batchArgs <- c(searchArgs, + list(nReplicates = nReplicates, nThreads = nThreads), + consArgs, profileArgs) + result <- do.call(ts_parallel_resample, batchArgs) + + trees <- vector("list", nReplicates) + for (r in seq_len(nReplicates)) { + em <- result$edges[[r]] + if (nrow(em) == 0L) { + tr <- if (!missing(tree) && inherits(tree, "phylo")) tree + else AdditionTree(dataset) + } else { + tr <- structure( + list(edge = em, + tip.label = names(dataset), + Nnode = nTip - 1L), + class = "phylo" + ) + } + attr(tr, "score") <- result$scores[r] + trees[[r]] <- tr + 
# (tail of the resampling routine whose header lies above: closes the
#  per-replicate loop, returns the batch, then handles a single replicate)
    }
    return(structure(trees, class = "multiPhylo"))
  }

  # Single-replicate path (original behavior)
  result <- do.call(ts_resample_search, c(searchArgs, consArgs, profileArgs))

  if (nrow(result$edge) == 0L) {
    tr <- if (!missing(tree) && inherits(tree, "phylo")) tree
          else AdditionTree(dataset)
    attr(tr, "score") <- result$score
    return(structure(list(tr), class = "multiPhylo"))
  }

  tr <- structure(
    list(edge = result$edge,
         tip.label = names(dataset),
         Nnode = nTip - 1L),
    class = "phylo"
  )
  attr(tr, "score") <- result$score

  structure(list(tr), class = "multiPhylo")
}

#' Launch tree search graphical user interface
#'
#' Opens a "shiny" app for interactive parsimony tree search and results
#' exploration.
#'
#' The app relies on several suggested packages.  If any cannot be loaded,
#' `EasyTrees()` stops with an error that names them and prints a
#' copy-pasteable `install.packages()` call.
#'
#' @return Opens a Shiny application; does not return a value.
#' @seealso [`MaximizeParsimony()`], [`Morphy()`]
#' @importFrom TreeDist ClusteringInfoDistance
#' @export
EasyTrees <- function () {#nocov start
  needed <- c("cluster", "future", "PlotTools", "promises",
              "protoclust", "Rogue", "shiny", "shinyjs")
  # `absent` rather than `missing`, so base::missing() is not shadowed
  absent <- needed[!vapply(needed, requireNamespace,
                           logical(1L), quietly = TRUE)]
  if (length(absent)) {
    quoted <- paste0("\"", absent, "\"", collapse = ", ")
    # install.packages() takes ONE character vector of package names; a bare
    # comma-separated list would pass the second name as the `lib` argument.
    if (length(absent) > 1L) {
      quoted <- paste0("c(", quoted, ")")
    }
    stop("EasyTrees() requires additional packages: ",
         paste(absent, collapse = ", "), ".\n",
         "Install with: install.packages(", quoted, ")",
         call. = FALSE)
  }
  shiny::runApp(system.file("Parsimony", package = "TreeSearch"))
}

#' @rdname EasyTrees
#' @export
EasyTreesy <- EasyTrees
#nocov end

# TRUE when `concavity`, lower-cased, is a (possibly abbreviated) spelling of
# "profile"; anything else, including numbers, yields FALSE.
.UseProfile <- function (concavity) {
  pmatch(tolower(concavity), "profile", -1L) == 1L
}
diff --git a/R/NNI.R b/R/NNI.R
index a66d1e98e..3b526932d 100644
--- a/R/NNI.R
+++ b/R/NNI.R
@@ -27,9 +27,8 @@
 #' @return Returns a tree with class \code{phylo} (if \code{returnAll = FALSE}) or
 #' a set of trees, with class \code{multiPhylo} (if \code{returnAll = TRUE}).
#' -#' @references -#' The algorithm is summarized in -#' \insertRef{Felsenstein2004}{TreeSearch} +#' @references \insertCite{Felsenstein2004}{TreeSearch} +#' \insertAllCited{} #' #' #' @examples @@ -140,7 +139,6 @@ NNISwap <- function (parent, child, nTips = (length(parent) / 2L) + 1L, RenumberEdges(parent, child) } -## TODO use RenumberList #' Double NNI #' #' Returns the edge parameter of the two trees consistent with the speficied \acronym{NNI} rearrangement diff --git a/R/ParsSim.R b/R/ParsSim.R new file mode 100644 index 000000000..bcabb8e37 --- /dev/null +++ b/R/ParsSim.R @@ -0,0 +1,617 @@ +#' Simulate a dataset under parsimony +#' +#' Generates a morphological dataset under a strict parsimony model. +#' Characters are initialized at their minimum step count, then extra steps +#' are allocated one at a time. Each added step must increase the Fitch +#' parsimony score of the character by exactly one -- no "masked" or +#' "overprinted" steps are permitted. +#' +#' Back-mutations (e.g. \eqn{0 \to 1 \to 0}{0 -> 1 -> 0}) are allowed +#' when they genuinely add to the parsimony score. +#' +#' When `concavity` is finite (implied weighting), characters that already +#' carry more homoplasy are more likely to receive additional extra steps, +#' mirroring the mathematical relationship described by the +#' \eqn{k / (k + e)}{k/(k+e)} fit function. +#' +#' @param tree A \code{\link[ape:read.tree]{phylo}} object. If non-binary, +#' resolved to binary with a warning. If unrooted, rooted internally at an +#' arbitrary node. If no edge lengths are present, uniform lengths are used. +#' @param nChar Integer vector: `nChar[1]` gives the number of 2-state +#' characters, `nChar[2]` the number of 3-state characters, and so on. +#' @param nExtraSteps Single integer: total extra steps distributed one at a +#' time across all characters. +#' @param concavity Implied weighting concavity constant. `Inf` (default) +#' gives equal weights (uniform character selection). 
#'   A finite positive
#'   number _k_ gives implied weighting, with selection probability
#'   proportional to `(k + e) / k`. `"profile"` uses profile parsimony
#'   weighting: selection probability is proportional to the inverse of the
#'   phylogenetic information at the current step count, computed via
#'   [StepInformation()] after initialization.
#'   Any other value (including `NA`, negative numbers and strings other
#'   than `"profile"`) is an error.
#' @param rootState Integer vector: initial state at the root node for each
#'   character (default `0L`). If length 1, the same root state is used for
#'   all characters. If length `sum(nChar)`, each character gets its own root
#'   state. Each root state must be in `0:(k-1)` where _k_ is the number of
#'   states for that character.
#' @param missing Controls which cells are replaced with the ambiguous token
#'   `?`. Missing data is applied _after_ the complete simulation, so
#'   attributes such as `extra_steps` and `saturated` reflect the underlying
#'   complete dataset. Accepted forms:
#'   \describe{
#'     \item{**Scalar** (0--1)}{Flat rate: each cell is independently missing
#'       with this probability.}
#'     \item{**List** with `taxon` and/or `character` components}{Per-taxon
#'       and/or per-character rates. Each component is a numeric vector of
#'       probabilities (0--1). `taxon` should be named (matching tip labels)
#'       or length `n_tip`; `character` should be length `sum(nChar)`. Per-cell
#'       probability is `1 - (1 - p_taxon) * (1 - p_char)`.}
#'     \item{**Matrix** (n_tip x total_chars)}{Per-cell probabilities (0--1).
#'       Rows are taxa (named to match tip labels, or in tip order);
#'       columns are characters.}
#'   }
#'   Default `0` (no missing data).
#'
#' @return A `phyDat` object with characters ordered by number of states
#'   (2-state first, then 3-state, and so on). Additional attributes:
#'   \describe{
#'     \item{`saturated`}{Logical vector: `TRUE` where no further step can
#'       legally be added to the character.
#'       Computed at return for all characters.}
#'     \item{`steps_exhausted`}{Logical vector: was each character discovered
#'       to be saturated during the step-placement loop (i.e., selected for
#'       an extra step but no legal edge found; or, under profile weighting,
#'       its new step count has no positive information amount)?}
#'     \item{`extra_steps`}{Integer vector: number of extra steps placed on
#'       each character.}
#'   }
#'
#' @examples
#' tree <- TreeTools::BalancedTree(8)
#' dataset <- ParsSim(tree, nChar = c(20L), nExtraSteps = 10L)
#' TreeLength(tree, dataset)
#'
#' # Implied weighting: steps concentrate on fewer characters
#' dataset_iw <- ParsSim(tree, nChar = c(40L), nExtraSteps = 30L,
#'                       concavity = 3)
#' attr(dataset_iw, "extra_steps")
#'
#' # Profile parsimony weighting
#' dataset_pp <- ParsSim(tree, nChar = c(20L), nExtraSteps = 15L,
#'                       concavity = "profile")
#' attr(dataset_pp, "extra_steps")
#'
#' # 20% missing data injected post-hoc
#' dataset_missing <- ParsSim(tree, nChar = c(20L), nExtraSteps = 10L,
#'                            missing = 0.2)
#'
#' # Per-taxon missing rates (fragmentary taxa)
#' dataset_taxon <- ParsSim(tree, nChar = c(20L), nExtraSteps = 10L,
#'                          missing = list(taxon = c(t1 = 0.8, t2 = 0.5)))
#'
#' @references \insertCite{Goloboff2018}{TreeSearch}
#' \insertAllCited{}
#' @importFrom TreeTools MakeTreeBinary MatrixToPhyDat Postorder RootNode
#' RootTree
#' @family tree scoring
#' @export
ParsSim <- function(tree,
                    nChar = c(100L),
                    nExtraSteps = 0L,
                    concavity = Inf,
                    rootState = 0L,
                    missing = 0) {
  # --- Validate inputs -------------------------------------------------------
  if (!inherits(tree, "phylo")) {
    stop("`tree` must be a phylo object")
  }
  nChar <- as.integer(nChar)
  nExtraSteps <- as.integer(nExtraSteps)
  # NA must be caught explicitly: `any(NA < 0L)` is NA and would make `if` fail
  # with an unhelpful "missing value where TRUE/FALSE needed".
  if (anyNA(nChar) || any(nChar < 0L)) {
    stop("`nChar` values must be non-negative")
  }
  total_chars <- sum(nChar)
  if (total_chars == 0L) stop("`nChar` must specify at least one character")
  if (length(nExtraSteps) != 1L || is.na(nExtraSteps) || nExtraSteps < 0L) {
    stop("`nExtraSteps` must be a single non-negative integer")
  }
  missing_spec <- .pars_sim_validate_missing(missing)

  use_profile <- identical(concavity, "profile")
  # Reject everything that is neither "profile" nor a single positive number.
  # Without this, NA, -Inf or a mistyped string would slip through
  # `is.finite()` as FALSE and silently select equal weights.
  if (!use_profile &&
      (!is.numeric(concavity) || length(concavity) != 1L ||
       is.na(concavity) || concavity <= 0)) {
    stop("`concavity` must be positive (or Inf for equal weights, ",
         "or \"profile\" for profile parsimony)")
  }
  use_iw <- !use_profile && is.finite(concavity)

  # --- Prepare tree ----------------------------------------------------------
  tree_info <- .pars_sim_prepare_tree(tree)

  # --- Determine state counts per character ----------------------------------
  # nChar[j] characters with j + 1 states each, in that order
  n_states_vec <- rep(seq_along(nChar) + 1L, times = nChar)

  # --- Validate and expand rootState ------------------------------------------
  rootState <- as.integer(rootState)
  if (length(rootState) == 1L) {
    rootState <- rep(rootState, total_chars)
  } else if (length(rootState) != total_chars) {
    stop("`rootState` must have length 1 or sum(nChar) (= ", total_chars, ")")
  }
  # `which()` drops NA, so NA root states have to be flagged explicitly
  bad <- which(is.na(rootState) | rootState < 0L | rootState >= n_states_vec)
  if (length(bad) > 0L) {
    stop("`rootState[", bad[1], "]` = ", rootState[bad[1]],
         " is out of range for a ", n_states_vec[bad[1]], "-state character",
         " (must be 0 to ", n_states_vec[bad[1]] - 1L, ")")
  }

  # --- Initialize characters -------------------------------------------------
  char_states <- vector("list", total_chars)
  char_scores <- integer(total_chars)
  for (i in seq_len(total_chars)) {
    init <- .pars_sim_init_char(tree_info, n_states_vec[i], rootState[i])
    char_states[[i]] <- init$node_states
    char_scores[i] <- init$score
  }

  # --- Compute info profiles for profile mode --------------------------------
  info_profiles <- NULL
  if (use_profile) {
    n_tip <- tree_info$n_tip
    info_profiles <- vector("list", total_chars)
    for (i in seq_len(total_chars)) {
      tip_states_i <- char_states[[i]][seq_len(n_tip)]
      info_profiles[[i]] <- StepInformation(tip_states_i)
    }
  }

  # --- Extra step loop -------------------------------------------------------
  extra_steps <- integer(total_chars)
  steps_exhausted <- logical(total_chars)

  if (nExtraSteps > 0L) {
    steps_placed <- 0L
    # Every pass either places one step or retires one character, so the loop
    # ends after at most nExtraSteps + total_chars iterations.
    while (steps_placed < nExtraSteps) {
      available <- which(!steps_exhausted)
      if (length(available) == 0L) {
        warning("All characters saturated after ", steps_placed, " of ",
                nExtraSteps, " extra steps.")
        break
      }

      # Select character
      char_idx <- .pars_sim_select_char(available, extra_steps, concavity,
                                        use_iw, use_profile, char_scores,
                                        info_profiles)

      # Find legal edges
      legal <- .pars_sim_legal_edges(char_states[[char_idx]], tree_info,
                                     char_scores[char_idx],
                                     n_states_vec[char_idx])

      if (is.null(legal)) {
        steps_exhausted[char_idx] <- TRUE
        next
      }

      # Sample legal move weighted by edge length
      move_idx <- .safe_sample_idx(nrow(legal), prob = legal$edge_length)

      # Apply transition
      char_states[[char_idx]] <- .pars_sim_apply_transition(
        char_states[[char_idx]], tree_info,
        legal$edge_idx[move_idx], legal$target_state[move_idx]
      )
      char_scores[char_idx] <- char_scores[char_idx] + 1L
      extra_steps[char_idx] <- extra_steps[char_idx] + 1L
      steps_placed <- steps_placed + 1L

      # In profile mode, mark exhausted when info drops to 0
      if (use_profile) {
        profile <- info_profiles[[char_idx]]
        step_name <- as.character(char_scores[char_idx])
        if (!(step_name %in% names(profile)) ||
            profile[step_name] <= 0) {
          steps_exhausted[char_idx] <- TRUE
        }
      }
    }
  }

  # --- Build phyDat ----------------------------------------------------------
  n_tip <- tree_info$n_tip
  # Rows = tips, columns = characters
  tip_matrix <- vapply(char_states, function(ns) ns[seq_len(n_tip)],
                       integer(n_tip))
  rownames(tip_matrix) <- tree_info$tip_labels

  prob_matrix <- .pars_sim_build_missing_matrix(
    missing_spec, n_tip, total_chars, tree_info$tip_labels
  )

  if (!is.null(prob_matrix)) {
    char_matrix <- matrix(as.character(tip_matrix), nrow = n_tip,
                          dimnames = dimnames(tip_matrix))
    # Namespace-qualified so the call does not depend on a NAMESPACE import
    is_missing <- matrix(stats::runif(n_tip * total_chars), nrow = n_tip) <
      prob_matrix
    char_matrix[is_missing] <- "?"
    result <- MatrixToPhyDat(char_matrix)
  } else {
    result <- MatrixToPhyDat(tip_matrix)
  }

  # --- Calculate saturation for all characters --------------------------------
  # Uses the complete (pre-missing-data) states: TRUE = no legal move remains
  saturated <- vapply(seq_len(total_chars), function(i) {
    is.null(.pars_sim_legal_edges(char_states[[i]], tree_info,
                                  char_scores[i], n_states_vec[i]))
  }, logical(1))

  attr(result, "saturated") <- saturated
  attr(result, "steps_exhausted") <- steps_exhausted
  attr(result, "extra_steps") <- extra_steps

  result
}


# --- Internal helpers --------------------------------------------------------

#' Prepare a tree for simulation
#'
#' Roots the tree on its first tip if unrooted, resolves polytomies (with a
#' warning), reorders edges with `Postorder()` and tabulates each node's
#' children.  Missing edge lengths default to 1 for every edge.
#' @param tree A `phylo` object.
#' @return Named list: edge (postorder matrix), edge_length, n_tip, n_node,
#' root, children (list of child-node vectors), tip_labels.
#' @keywords internal
#' @noRd
.pars_sim_prepare_tree <- function(tree) {
  if (!ape::is.rooted(tree)) {
    tree <- RootTree(tree, tree[["tip.label"]][1])
  }
  if (!ape::is.binary(tree)) {
    warning("Resolving non-binary tree to binary.")
    tree <- MakeTreeBinary(tree)
  }

  tree <- Postorder(tree)
  edge <- tree[["edge"]]
  n_tip <- length(tree[["tip.label"]])
  n_node <- n_tip + tree[["Nnode"]]

  edge_length <- tree[["edge.length"]]
  if (is.null(edge_length)) {
    edge_length <- rep(1, nrow(edge))
  }

  # children[[v]] lists the child nodes of v, in edge-matrix order;
  # tips keep an empty vector
  children <- vector("list", n_node)
  for (i in seq_len(n_node)) children[[i]] <- integer(0)
  for (i in seq_len(nrow(edge))) {
    p <- edge[i, 1]
    children[[p]] <- c(children[[p]], edge[i, 2])
  }

  list(
    edge = edge,
    edge_length = edge_length,
    n_tip = n_tip,
    n_node = n_node,
    root = RootNode(tree),
    children = children,
    tip_labels = tree[["tip.label"]]
  )
}


#' Fitch parsimony score for a single character
#'
#' Pure R Fitch downpass using bit-vector state sets.
#' @param tip_states Integer vector of states (0-indexed) for tips 1..n_tip.
#' @param tree_info List from `.pars_sim_prepare_tree()`.
#' @return Integer parsimony score.
#' @keywords internal
#' @noRd
.pars_sim_fitch_score <- function(tip_states, tree_info) {
  n_tip <- tree_info$n_tip
  n_node <- tree_info$n_node
  edge <- tree_info$edge

  # One bit per state; 0 doubles as "internal node not yet visited"
  sets <- integer(n_node)
  sets[seq_len(n_tip)] <- bitwShiftL(1L, tip_states[seq_len(n_tip)])

  score <- 0L
  # Edges are processed in the stored order, so each child's set must be
  # final before its parent edge is reached (postorder).
  for (i in seq_len(nrow(edge))) {
    p <- edge[i, 1]
    ch <- edge[i, 2]
    if (sets[p] == 0L) {
      # First child seen: parent provisionally inherits its set
      sets[p] <- sets[ch]
    } else {
      inter <- bitwAnd(sets[p], sets[ch])
      if (inter > 0L) {
        sets[p] <- inter
      } else {
        # Disjoint sets: take the union and count one step
        sets[p] <- bitwOr(sets[p], sets[ch])
        score <- score + 1L
      }
    }
  }
  score
}


#' Initialize a character with minimum steps
#'
#' Sets all nodes to `root_state`, then introduces every other state with one
#' transition each, on a randomly selected edge (weighted by edge length).
#'
#' A candidate edge is only eligible if, after the transition, the Fitch
#' score of the tip states equals the number of transitions placed so far.
#' This rules out placements that would wipe an earlier state from every tip
#' (for instance using both edges that descend from the root), which would
#' leave the character with fewer observed states and a lower true score than
#' the one reported.
#'
#' Stops with an error if some state cannot be placed, e.g. because the tree
#' has too few tips for `n_states` states.
#' @param tree_info List from `.pars_sim_prepare_tree()`.
#' @param n_states Number of states of the character.
#' @param root_state State (0-indexed) assigned to every node at the outset.
#' @return List: `node_states` (integer vector, length n_node), `score`
#' (the number of transitions placed, equal to the Fitch score of the tips).
#' @keywords internal
#' @noRd
.pars_sim_init_char <- function(tree_info, n_states, root_state) {
  node_states <- rep(as.integer(root_state), tree_info$n_node)
  edge <- tree_info$edge
  tip_idx <- seq_len(tree_info$n_tip)

  other_states <- setdiff(seq.int(0L, n_states - 1L), root_state)
  n_placed <- 0L
  for (new_state in other_states) {
    # Edges where both endpoints share the same state
    unmarked <- which(node_states[edge[, 1]] == node_states[edge[, 2]])

    # Try each unmarked edge and keep those that add exactly one Fitch step
    trials <- lapply(unmarked, function(e) {
      .pars_sim_apply_transition(node_states, tree_info, e, new_state)
    })
    adds_step <- vapply(trials, function(states) {
      .pars_sim_fitch_score(states[tip_idx], tree_info) == n_placed + 1L
    }, logical(1))

    if (!any(adds_step)) {
      stop("Cannot place ", n_states, " states on a tree with ",
           tree_info$n_tip, " tips")
    }

    candidates <- which(adds_step)
    weights <- tree_info$edge_length[unmarked[candidates]]
    pick <- candidates[.safe_sample_idx(length(candidates), prob = weights)]

    node_states <- trials[[pick]]
    n_placed <- n_placed + 1L
  }

  list(node_states = node_states, score = n_placed)
}


#' Find contiguous region of same-state nodes below a start node
#'
#' DFS from `start_node` following edges where parent and child share the
#' same state.
#' @return List: `region` (all node indices), `tips` (tip-only indices),
#' `boundary_states` (states of nodes just outside the region).
#' @keywords internal
#' @noRd
.pars_sim_find_region <- function(node_states, tree_info, start_node) {
  kids <- tree_info$children
  last_tip <- tree_info$n_tip
  region_state <- node_states[start_node]

  visited <- integer(0)
  leaf_hits <- integer(0)
  outside <- integer(0)
  pending <- start_node          # LIFO stack of nodes still to expand

  repeat {
    n_pending <- length(pending)
    if (n_pending == 0L) break

    # Pop the most recently pushed node
    current <- pending[n_pending]
    pending <- pending[-n_pending]
    visited <- c(visited, current)

    if (current <= last_tip) {
      # Node numbers up to n_tip are tips: record and stop descending
      leaf_hits <- c(leaf_hits, current)
      next
    }

    below <- kids[[current]]
    same <- node_states[below] == region_state
    # Same-state children extend the region; the rest mark its boundary
    pending <- c(pending, below[same])
    outside <- c(outside, node_states[below[!same]])
  }

  list(region = visited, tips = leaf_hits, boundary_states = outside)
}


#' Find all legal (edge, target-state) moves for one character
#'
#' For each unmarked edge (endpoints share state), tries each possible
#' target state. Uses a boundary prefilter followed by Fitch verification.
#' @return Data frame with columns `edge_idx`, `target_state`, `edge_length`,
#' or NULL if no legal moves.
#' @keywords internal
#' @noRd
.pars_sim_legal_edges <- function(node_states, tree_info, current_score,
                                  n_states) {
  edges <- tree_info$edge
  tip_seq <- seq_len(tree_info$n_tip)
  state_pool <- seq.int(0L, n_states - 1L)
  wanted_score <- current_score + 1L

  hit_edge <- integer(0)
  hit_state <- integer(0)
  hit_length <- numeric(0)

  # Only edges whose two ends currently share a state can take a new step
  unmarked <- which(node_states[edges[, 1]] == node_states[edges[, 2]])

  for (e in unmarked) {
    child <- edges[e, 2]
    below <- .pars_sim_find_region(node_states, tree_info, child)

    # Drop the current state, then any state already present just outside the
    # region: switching to it would erase an existing step instead of adding
    # one.
    candidates <- setdiff(state_pool, node_states[child])
    candidates <- candidates[!(candidates %in% below$boundary_states)]

    for (new_state in candidates) {
      # Confirm with a full Fitch pass on the would-be tip states
      trial <- node_states[tip_seq]
      trial[below$tips] <- new_state
      if (.pars_sim_fitch_score(trial, tree_info) == wanted_score) {
        hit_edge <- c(hit_edge, e)
        hit_state <- c(hit_state, new_state)
        hit_length <- c(hit_length, tree_info$edge_length[e])
      }
    }
  }

  if (length(hit_edge) == 0L) {
    return(NULL)
  }

  data.frame(edge_idx = hit_edge,
             target_state = hit_state,
             edge_length = hit_length)
}


#' Apply a transition on an edge
#'
#' Changes the child node and its contiguous same-state region to
#' `new_state`.
#' @return Updated `node_states` vector.
#' @keywords internal
#' @noRd
.pars_sim_apply_transition <- function(node_states, tree_info, edge_idx,
                                       new_state) {
  below <- tree_info$edge[edge_idx, 2]
  affected <- .pars_sim_find_region(node_states, tree_info, below)$region
  node_states[affected] <- new_state
  node_states
}


#' Select a character for the next extra step
#'
#' Draws one element of `available`.  Weights are `1 / information` at the
#' character's current step count under profile weighting, `(k + e) / k`
#' under implied weighting, and uniform otherwise.  With a single candidate,
#' or when every profile weight is zero, no random draw is made.
#' @keywords internal
#' @noRd
.pars_sim_select_char <- function(available, extra_steps, concavity, use_iw,
                                  use_profile = FALSE, char_scores = NULL,
                                  info_profiles = NULL) {
  n_available <- length(available)
  if (n_available == 1L) {
    return(available)
  }

  # Single sampling site; `prob = NULL` is sample.int()'s uniform default
  draw <- function(prob = NULL) {
    available[sample.int(n_available, 1L, prob = prob)]
  }

  if (use_profile) {
    inverse_info <- vapply(available, function(i) {
      profile <- info_profiles[[i]]
      step_name <- as.character(char_scores[i])
      if (!(step_name %in% names(profile))) {
        return(0)
      }
      info <- profile[step_name]
      if (info > 0) 1.0 / info else 0
    }, double(1))

    # All zero: every candidate is info-saturated, so take the first
    if (all(inverse_info == 0)) {
      return(available[1L])
    }
    return(draw(inverse_info))
  }

  if (use_iw) {
    return(draw((concavity + extra_steps[available]) / concavity))
  }

  draw()
}


#' Sample a single index, safe for length-1 vectors
#' @keywords internal
#' @noRd
.safe_sample_idx <- function(n, prob = NULL) {
  # A lone option is returned directly, without touching the RNG or `prob`
  if (n != 1L) sample.int(n, 1L, prob = prob) else 1L
}


#' Validate and parse the `missing` argument
#'
#' Returns a list with `type` ("none", "scalar", "list", "matrix") and
#' the parsed value.
#' @keywords internal
#' @noRd
.pars_sim_validate_missing <- function(missing) {
  # TRUE when `x` holds no NA and every value lies in [0, 1]
  in_unit <- function(x) !anyNA(x) && all(x >= 0 & x <= 1)

  # --- Matrix of per-cell probabilities ---
  if (is.matrix(missing)) {
    if (!is.numeric(missing)) {
      stop("`missing` matrix must be numeric")
    }
    if (!in_unit(missing)) {
      stop("`missing` matrix values must be between 0 and 1")
    }
    return(list(type = "matrix", value = missing))
  }

  # --- List of per-taxon / per-character rates ---
  if (is.list(missing)) {
    allowed <- c("taxon", "character")
    given <- names(missing)

    unknown <- setdiff(given, allowed)
    if (length(unknown) > 0L) {
      stop("`missing` list may only contain 'taxon' and/or 'character' ",
           "components; found: ", paste(unknown, collapse = ", "))
    }
    if (length(missing) == 0L || !any(allowed %in% given)) {
      stop("`missing` list must contain at least one of 'taxon' or 'character'")
    }

    for (comp in allowed[allowed %in% given]) {
      rates <- missing[[comp]]
      if (!(is.numeric(rates) && in_unit(rates))) {
        stop("`missing$", comp, "` must be a numeric vector with ",
             "values between 0 and 1")
      }
    }
    return(list(type = "list", value = missing))
  }

  # --- Single flat rate ---
  rate <- as.double(missing)
  if (length(rate) != 1L || is.na(rate) || rate < 0 || rate > 1) {
    stop("`missing` must be a number between 0 and 1, a list, or a matrix")
  }
  # A rate of exactly zero means "no missing data at all"
  if (rate == 0) list(type = "none") else list(type = "scalar", value = rate)
}


#' Build a per-cell probability matrix from a missing specification
#'
#' @return A n_tip × total_chars matrix of probabilities, or NULL if no
#' missing data should be applied.
#' @keywords internal
#' @noRd
.pars_sim_build_missing_matrix <- function(spec, n_tip, total_chars,
                                           tip_labels) {
  # "none", "scalar" and "matrix" return from inside the switch; any other
  # type falls through to the taxon/character combination below.
  switch(spec$type,
    none = return(NULL),
    scalar = return(matrix(spec$value, nrow = n_tip, ncol = total_chars)),
    matrix = {
      probs <- spec$value
      row_labels <- rownames(probs)
      if (!is.null(row_labels)) {
        # Named rows are put into tip order
        if (!all(tip_labels %in% row_labels)) {
          stop("`missing` matrix row names must include all tip labels")
        }
        probs <- probs[tip_labels, , drop = FALSE]
      }
      if (nrow(probs) != n_tip || ncol(probs) != total_chars) {
        stop("`missing` matrix must have ", n_tip, " rows (taxa) and ",
             total_chars, " columns (characters)")
      }
      return(probs)
    }
  )

  rates <- spec$value
  rate_names <- names(rates)

  # Per-taxon rate; taxa not mentioned by name stay at 0
  taxon_rate <- rep(0, n_tip)
  if ("taxon" %in% rate_names) {
    given <- rates[["taxon"]]
    labels <- names(given)
    if (is.null(labels)) {
      if (length(given) != n_tip) {
        stop("`missing$taxon` must be named or have length ", n_tip)
      }
      taxon_rate <- given
    } else {
      if (!all(labels %in% tip_labels)) {
        stop("Names in `missing$taxon` must be valid tip labels")
      }
      taxon_rate[match(labels, tip_labels)] <- given
    }
  }

  # Per-character rate
  char_rate <- rep(0, total_chars)
  if ("character" %in% rate_names) {
    given <- rates[["character"]]
    if (length(given) != total_chars) {
      stop("`missing$character` must have length ", total_chars)
    }
    char_rate <- given
  }

  # A cell is observed only if it survives both the taxon and character rate
  cell_prob <- 1 - outer(1 - taxon_rate, 1 - char_rate)

  if (all(cell_prob == 0)) NULL else cell_prob
}
diff --git a/R/RandomTreeScore.R b/R/RandomTreeScore.R
index 539637d6f..bc5dc8b31 100644
--- a/R/RandomTreeScore.R
+++ b/R/RandomTreeScore.R
@@ -1,9 +1,12 @@
-#' Parsimony score of random postorder tree
+#' Parsimony score of random tree
 #'
-#' @inheritParams MorphyTreeLength
+#' Generates a random tree topology and returns its parsimony score under
+#' equal weights.
+#' +#' @param dataset A `phyDat` object (recommended) or a Morphy object created +#' with [`PhyDat2Morphy()`] (legacy; deprecated). #' -#' @return `RandomTreeScore()` returns the parsimony score of a random tree -#' for the given Morphy object. +#' @return `RandomTreeScore()` returns a numeric parsimony score. #' @examples #' tokens <- matrix(c( #' 0, "-", "-", 1, 1, 2, @@ -11,21 +14,24 @@ #' 0, "-", "-", 0, 0, 0), byrow = TRUE, nrow = 3L, #' dimnames = list(letters[1:3], NULL)) #' pd <- TreeTools::MatrixToPhyDat(tokens) -#' morphyObj <- PhyDat2Morphy(pd) -#' -#' RandomTreeScore(morphyObj) -#' -#' morphyObj <- UnloadMorphy(morphyObj) +#' RandomTreeScore(pd) +#' @importFrom TreeTools RandomTree #' @export -RandomTreeScore <- function (morphyObj) { - nTip <- mpl_get_numtaxa(morphyObj) +RandomTreeScore <- function(dataset) { + if (inherits(dataset, "morphyPtr")) { + nTip <- mpl_get_numtaxa(dataset) + if (nTip < 2) { + return(0L) + } + return(.Call(`RANDOM_TREE_SCORE`, as.integer(nTip), dataset)) + } + + nTip <- length(dataset) if (nTip < 2) { - # Return: - 0L - } else { - # Return: - .Call(`RANDOM_TREE_SCORE`, as.integer(nTip), morphyObj) + return(0) } + tree <- RandomTree(dataset, root = TRUE) + TreeLength(tree, dataset) } #' Random postorder tree diff --git a/R/Ratchet.R b/R/Ratchet.R index 63c1b2278..280a0b7b8 100644 --- a/R/Ratchet.R +++ b/R/Ratchet.R @@ -2,6 +2,8 @@ #' #' `Ratchet()` uses the parsimony ratchet \insertCite{Nixon1999}{TreeSearch} #' to search for a more parsimonious tree using custom optimality criteria. +#' For standard parsimony searches, [`MaximizeParsimony()`] is faster; +#' use `Ratchet()` when you need a custom `TreeScorer` or `EdgeSwapper`. #' #' For usage pointers, see the #' [vignette](https://ms609.github.io/TreeSearch/articles/custom.html). @@ -84,7 +86,6 @@ Ratchet <- function(tree, dataset, suboptimal = sqrt(.Machine[["double.eps"]]), ...) 
{ epsilon <- sqrt(.Machine[["double.eps"]]) hits <- 0L - # initialize tree and data if (dim(tree[["edge"]])[1] != 2 * tree[["Nnode"]]) { stop("tree must be bifurcating; try rooting with ape::root") } diff --git a/R/RcppExports.R b/R/RcppExports.R index a2bc7abe5..8fbc5db73 100644 --- a/R/RcppExports.R +++ b/R/RcppExports.R @@ -1,6 +1,25 @@ # Generated by using Rcpp::compileAttributes() -> do not edit by hand # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 +#' @rdname Carter1 +#' @examples +#' # Log-probability that a 3-state character (2 "0", 3 "1", 2 "2") needs +#' # exactly 2 steps on a random 7-leaf tree: +#' logp <- MaddisonSlatkin(2, c("0" = 2, "1" = 3, "01" = 0, "2" = 2)) +#' # Convert to an expected number of trees: +#' exp(logp) * TreeTools::NUnrooted(7) +#' +#' @export +MaddisonSlatkin <- function(steps, states) { + .Call(`_TreeSearch_MaddisonSlatkin`, steps, states) +} + +#' @export +#' @keywords internal +MaddisonSlatkin_clear_cache <- function() { + invisible(.Call(`_TreeSearch_MaddisonSlatkin_clear_cache`)) +} + expected_mi <- function(ni, nj) { .Call(`_TreeSearch_expected_mi`, ni, nj) } @@ -53,3 +72,147 @@ all_tbr <- function(edge, break_order) { .Call(`_TreeSearch_all_tbr`, edge, break_order) } +#' Monte Carlo Fitch scores for a single character +#' +#' Generates `n_mc` random trees and scores each with a Fitch parsimony +#' downpass for a single character defined by `state_counts`. +#' Tree generation and scoring are done entirely in C with no R object +#' allocation per tree, making this very fast (~0.01 ms per tree). +#' +#' @param state_counts Integer vector giving the number of tips in each +#' state. Length determines the number of states (k); sum determines +#' the number of tips (n). For example, `c(13, 13, 12)` defines a +#' 3-state character with 38 tips. +#' @param n_mc Number of random trees to generate and score. 
+#' @return Integer vector of length `n_mc` containing the Fitch parsimony +#' score (number of state changes) for each random tree. +#' @keywords internal +#' @export +mc_fitch_scores <- function(state_counts, n_mc) { + .Call(`_TreeSearch_mc_fitch_scores`, state_counts, n_mc) +} + +ts_fitch_score <- function(edge, contrast, tip_data, weight, levels, min_steps = integer(), concavity = -1.0, infoAmounts = NULL, xpiwe = FALSE, xpiwe_r = 0.5, xpiwe_max_f = 5.0, obs_count = integer()) { + .Call(`_TreeSearch_ts_fitch_score`, edge, contrast, tip_data, weight, levels, min_steps, concavity, infoAmounts, xpiwe, xpiwe_r, xpiwe_max_f, obs_count) +} + +ts_na_debug_char <- function(edge, contrast, tip_data, weight, levels, target_pattern) { + .Call(`_TreeSearch_ts_na_debug_char`, edge, contrast, tip_data, weight, levels, target_pattern) +} + +ts_na_char_steps <- function(edge, contrast, tip_data, weight, levels) { + .Call(`_TreeSearch_ts_na_char_steps`, edge, contrast, tip_data, weight, levels) +} + +ts_char_steps <- function(edge, contrast, tip_data, weight, levels) { + .Call(`_TreeSearch_ts_char_steps`, edge, contrast, tip_data, weight, levels) +} + +ts_debug_clip <- function(edge, contrast, tip_data, weight, levels, clip_node_1based) { + .Call(`_TreeSearch_ts_debug_clip`, edge, contrast, tip_data, weight, levels, clip_node_1based) +} + +ts_test_indirect <- function(edge, contrast, tip_data, weight, levels, clip_node_1based, above_1based, below_1based) { + .Call(`_TreeSearch_ts_test_indirect`, edge, contrast, tip_data, weight, levels, clip_node_1based, above_1based, below_1based) +} + +ts_spr_search <- function(edge, contrast, tip_data, weight, levels, maxHits = 20L, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_spr_search`, edge, contrast, tip_data, weight, levels, maxHits, min_steps, concavity) +} + +ts_tbr_search <- function(edge, contrast, tip_data, weight, levels, maxHits = 1L, acceptEqual = FALSE, maxChanges = 0L, min_steps = integer(), concavity = 
-1.0) { + .Call(`_TreeSearch_ts_tbr_search`, edge, contrast, tip_data, weight, levels, maxHits, acceptEqual, maxChanges, min_steps, concavity) +} + +ts_ratchet_search <- function(edge, contrast, tip_data, weight, levels, nCycles = 10L, perturbProb = 0.04, maxHits = 1L, min_steps = integer(), concavity = -1.0, perturbMode = 0L, perturbMaxMoves = 0L, adaptive = FALSE, targetEscapeRate = 0.3) { + .Call(`_TreeSearch_ts_ratchet_search`, edge, contrast, tip_data, weight, levels, nCycles, perturbProb, maxHits, min_steps, concavity, perturbMode, perturbMaxMoves, adaptive, targetEscapeRate) +} + +ts_drift_search <- function(edge, contrast, tip_data, weight, levels, nCycles = 10L, afdLimit = 3L, rfdLimit = 0.1, maxHits = 1L, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_drift_search`, edge, contrast, tip_data, weight, levels, nCycles, afdLimit, rfdLimit, maxHits, min_steps, concavity) +} + +ts_wagner_tree <- function(contrast, tip_data, weight, levels, addition_order = integer(), min_steps = integer(), concavity = -1.0, infoAmounts = NULL, consSplitMatrix = NULL, consContrast = NULL, consTipData = NULL, consWeight = NULL, consLevels = NULL, consExpectedScore = 0L) { + .Call(`_TreeSearch_ts_wagner_tree`, contrast, tip_data, weight, levels, addition_order, min_steps, concavity, infoAmounts, consSplitMatrix, consContrast, consTipData, consWeight, consLevels, consExpectedScore) +} + +ts_random_wagner_tree <- function(contrast, tip_data, weight, levels, min_steps = integer(), concavity = -1.0, infoAmounts = NULL, consSplitMatrix = NULL, consContrast = NULL, consTipData = NULL, consWeight = NULL, consLevels = NULL, consExpectedScore = 0L) { + .Call(`_TreeSearch_ts_random_wagner_tree`, contrast, tip_data, weight, levels, min_steps, concavity, infoAmounts, consSplitMatrix, consContrast, consTipData, consWeight, consLevels, consExpectedScore) +} + +ts_compute_splits <- function(edge, n_tip) { + .Call(`_TreeSearch_ts_compute_splits`, edge, n_tip) +} + 
+ts_trees_equal <- function(edge1, edge2, n_tip) { + .Call(`_TreeSearch_ts_trees_equal`, edge1, edge2, n_tip) +} + +ts_pool_test <- function(edges, scores, n_tip, max_size = 100L, suboptimal = 0.0) { + .Call(`_TreeSearch_ts_pool_test`, edges, scores, n_tip, max_size, suboptimal) +} + +ts_nni_search <- function(edge, contrast, tip_data, weight, levels, maxHits = 20L, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_nni_search`, edge, contrast, tip_data, weight, levels, maxHits, min_steps, concavity) +} + +ts_tree_fuse <- function(edge, contrast, tip_data, weight, levels, pool_edges, pool_scores, accept_equal = FALSE, max_rounds = 10L, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_tree_fuse`, edge, contrast, tip_data, weight, levels, pool_edges, pool_scores, accept_equal, max_rounds, min_steps, concavity) +} + +ts_sector_diag <- function(edge, contrast, tip_data, weight, levels, sector_root_1based) { + .Call(`_TreeSearch_ts_sector_diag`, edge, contrast, tip_data, weight, levels, sector_root_1based) +} + +ts_rss_search <- function(edge, contrast, tip_data, weight, levels, minSectorSize = 6L, maxSectorSize = 50L, acceptEqual = FALSE, rssPicks = 0L, ratchetCycles = 6L, maxHits = 1L, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_rss_search`, edge, contrast, tip_data, weight, levels, minSectorSize, maxSectorSize, acceptEqual, rssPicks, ratchetCycles, maxHits, min_steps, concavity) +} + +ts_xss_search <- function(edge, contrast, tip_data, weight, levels, nPartitions = 4L, xssRounds = 3L, acceptEqual = FALSE, ratchetCycles = 6L, maxHits = 1L, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_xss_search`, edge, contrast, tip_data, weight, levels, nPartitions, xssRounds, acceptEqual, ratchetCycles, maxHits, min_steps, concavity) +} + +ts_driven_search <- function(contrast, tip_data, weight, levels, searchControl, runtimeConfig, scoringConfig, constraintConfig = NULL, hsjConfig = NULL, xformConfig = 
NULL) { + .Call(`_TreeSearch_ts_driven_search`, contrast, tip_data, weight, levels, searchControl, runtimeConfig, scoringConfig, constraintConfig, hsjConfig, xformConfig) +} + +ts_resample_search <- function(contrast, tip_data, weight, levels, bootstrap = FALSE, jackProportion = 2.0 / 3.0, maxReplicates = 5L, targetHits = 2L, tbrMaxHits = 1L, ratchetCycles = 3L, ratchetPerturbProb = 0.04, driftCycles = 0L, min_steps = integer(), concavity = -1.0, consSplitMatrix = NULL, consContrast = NULL, consTipData = NULL, consWeight = NULL, consLevels = NULL, consExpectedScore = 0L, infoAmounts = NULL, xpiwe = FALSE, xpiwe_r = 0.5, xpiwe_max_f = 5.0, obs_count = integer()) { + .Call(`_TreeSearch_ts_resample_search`, contrast, tip_data, weight, levels, bootstrap, jackProportion, maxReplicates, targetHits, tbrMaxHits, ratchetCycles, ratchetPerturbProb, driftCycles, min_steps, concavity, consSplitMatrix, consContrast, consTipData, consWeight, consLevels, consExpectedScore, infoAmounts, xpiwe, xpiwe_r, xpiwe_max_f, obs_count) +} + +ts_parallel_resample <- function(contrast, tip_data, weight, levels, nReplicates = 1L, nThreads = 1L, bootstrap = FALSE, jackProportion = 2.0 / 3.0, maxReplicates = 5L, targetHits = 2L, tbrMaxHits = 1L, ratchetCycles = 3L, ratchetPerturbProb = 0.04, driftCycles = 0L, min_steps = integer(), concavity = -1.0, consSplitMatrix = NULL, consContrast = NULL, consTipData = NULL, consWeight = NULL, consLevels = NULL, consExpectedScore = 0L, infoAmounts = NULL, xpiwe = FALSE, xpiwe_r = 0.5, xpiwe_max_f = 5.0, obs_count = integer()) { + .Call(`_TreeSearch_ts_parallel_resample`, contrast, tip_data, weight, levels, nReplicates, nThreads, bootstrap, jackProportion, maxReplicates, targetHits, tbrMaxHits, ratchetCycles, ratchetPerturbProb, driftCycles, min_steps, concavity, consSplitMatrix, consContrast, consTipData, consWeight, consLevels, consExpectedScore, infoAmounts, xpiwe, xpiwe_r, xpiwe_max_f, obs_count) +} + +ts_successive_approx <- function(contrast, tip_data, 
weight, levels, saK = 3.0, maxSAIter = 20L, maxReplicates = 10L, targetHits = 3L, tbrMaxHits = 1L, ratchetCycles = 5L, ratchetPerturbProb = 0.04, driftCycles = 0L, min_steps = integer(), concavity = -1.0, consSplitMatrix = NULL, consContrast = NULL, consTipData = NULL, consWeight = NULL, consLevels = NULL, consExpectedScore = 0L, infoAmounts = NULL, xpiwe = FALSE, xpiwe_r = 0.5, xpiwe_max_f = 5.0, obs_count = integer()) { + .Call(`_TreeSearch_ts_successive_approx`, contrast, tip_data, weight, levels, saK, maxSAIter, maxReplicates, targetHits, tbrMaxHits, ratchetCycles, ratchetPerturbProb, driftCycles, min_steps, concavity, consSplitMatrix, consContrast, consTipData, consWeight, consLevels, consExpectedScore, infoAmounts, xpiwe, xpiwe_r, xpiwe_max_f, obs_count) +} + +ts_bench_tbr_phases <- function(edge, contrast, tip_data, weight, levels, min_steps = integer(), concavity = -1.0) { + .Call(`_TreeSearch_ts_bench_tbr_phases`, edge, contrast, tip_data, weight, levels, min_steps, concavity) +} + +ts_simplify_diag <- function(contrast, tip_data, weight, levels) { + .Call(`_TreeSearch_ts_simplify_diag`, contrast, tip_data, weight, levels) +} + +ts_hsj_score <- function(edge, contrast, tip_data, weight, levels, hierarchy_blocks_r, alpha, tip_labels_r, absent_state) { + .Call(`_TreeSearch_ts_hsj_score`, edge, contrast, tip_data, weight, levels, hierarchy_blocks_r, alpha, tip_labels_r, absent_state) +} + +ts_sankoff_test <- function(edge, n_states_r, cost_matrices_r, tip_states_r, forced_root_r) { + .Call(`_TreeSearch_ts_sankoff_test`, edge, n_states_r, cost_matrices_r, tip_states_r, forced_root_r) +} + +ts_wagner_bias_bench <- function(contrast, tip_data, weight, levels, min_steps, concavity, bias, temperature, n_reps, run_tbr) { + .Call(`_TreeSearch_ts_wagner_bias_bench`, contrast, tip_data, weight, levels, min_steps, concavity, bias, temperature, n_reps, run_tbr) +} + +ts_test_strategy_tracker <- function(seed, n_draws) { + .Call(`_TreeSearch_ts_test_strategy_tracker`, 
seed, n_draws) +} + +ts_tbr_diagnostics <- function(edge, contrast, tip_data, weight, levels, maxHits = 1L, acceptEqual = FALSE, maxChanges = 0L, min_steps = integer(), concavity = -1.0, clipOrder = 0L) { + .Call(`_TreeSearch_ts_tbr_diagnostics`, edge, contrast, tip_data, weight, levels, maxHits, acceptEqual, maxChanges, min_steps, concavity, clipOrder) +} + diff --git a/R/SPR.R b/R/SPR.R index 0374b0520..ef58b82fd 100644 --- a/R/SPR.R +++ b/R/SPR.R @@ -72,8 +72,8 @@ SPRWarning <- function (parent, child, error) { #' @param mergeEdge the index of an edge on which to merge the broken edge. #' @return This function returns a tree in \code{phyDat} format that has undergone one \acronym{SPR} iteration. #' -#' @references The \acronym{SPR} algorithm is summarized in -#' \insertRef{Felsenstein2004}{TreeSearch} +#' @references \insertCite{Felsenstein2004}{TreeSearch} +#' \insertAllCited{} #' #' @author Martin R. Smith #' @@ -104,8 +104,7 @@ SPR <- function(tree, edgeToBreak = NULL, mergeEdge = NULL) { unique(unlist(lapply(which(notDuplicateRoot), AllSPR, parent = parent, child = child, nEdge = nEdge, notDuplicateRoot = notDuplicateRoot), - recursive = FALSE)) # TODO the fact that we need to use `unique` indicates that - # we're being inefficient here. + recursive = FALSE)) } else { newEdge <- SPRSwap(parent, edge[, 2], edgeToBreak = edgeToBreak, mergeEdge = mergeEdge) @@ -160,7 +159,6 @@ SPRMoves.matrix <- function (tree, edgeToBreak = integer(0)) { unique(.all_spr(tree, edgeToBreak)) } -## TODO Do edges need to be pre-ordered before coming here? #' @describeIn SPR faster version that takes and returns parent and child parameters #' @inheritParams RearrangeEdges #' @param nEdge (optional) integer specifying the number of edges of a tree of @@ -174,7 +172,6 @@ SPRSwap <- function (parent, child, nEdge = length(parent), nNode = nEdge / 2L, edgeToBreak = NULL, mergeEdge = NULL) { if (nEdge < 5) { - # TODO we need to re-root this tree... 
return(list(parent, child)) } @@ -364,7 +361,6 @@ RootedSPR <- function(tree, edgeToBreak = NULL, mergeEdge = NULL) { return (tree) } -## TODO Do edges need to be pre-ordered before coming here? #' @describeIn SPR faster version that takes and returns parent and child parameters #' @return a list containing two elements, corresponding in turn to the rearranged parent and child parameters #' @export @@ -382,8 +378,7 @@ RootedSPRSwap <- function (parent, child, nEdge = length(parent), nNode = nEdge notDuplicateRoot <- .NonDuplicateRoot(parent, child, nEdge) return(unique(unlist(lapply(which(breakable), AllSPR, parent=parent, child=child, nEdge=nEdge, notDuplicateRoot=notDuplicateRoot), - recursive=FALSE))) # TODO the fact that we need to use `unique` indicates that - # we're being inefficient here. + recursive=FALSE))) } rightSide <- DescendantEdges(edge = 1, parent, child, nEdge = nEdge) diff --git a/R/ScoreSpectrum.R b/R/ScoreSpectrum.R new file mode 100644 index 000000000..ef1152681 --- /dev/null +++ b/R/ScoreSpectrum.R @@ -0,0 +1,184 @@ +#' Score-spectrum coverage estimate for parsimony search +#' +#' `ScoreSpectrum()` applies Chao1-style abundance-based richness estimation +#' \insertCite{Chao1984}{TreeSearch} to the distribution of per-replicate +#' parsimony scores returned by [MaximizeParsimony()]. Treating each distinct +#' score value as a "species" and the number of replicates that found it as its +#' "abundance", the estimator quantifies how thoroughly the search has explored +#' the parsimony landscape. +#' +#' The **sample coverage** (Good-Turing estimator) +#' \insertCite{Good1953,Chao2012}{TreeSearch} is: +#' \deqn{\hat{C} = 1 - f_1 / n} +#' where \eqn{f_1} is the number of score levels seen exactly once and \eqn{n} +#' is the total number of replicates. A coverage close to 1 indicates that +#' most of the accessible score landscape has been sampled; low coverage +#' suggests meaningful unexplored territory remains. 
+#'
+#' The **Chao1 lower bound** on total score-level richness is:
+#' \deqn{\hat{S} = S_{\mathrm{obs}} + \frac{f_1^2}{2 f_2}}
+#' When \eqn{f_2 = 0} (no doubleton scores) the second term is replaced by
+#' its bias-corrected form \eqn{f_1(f_1 - 1)/2}.
+#'
+#' @param trees A `multiPhylo` object returned by [MaximizeParsimony()], which
+#'   must carry a `replicate_scores` attribute. Alternatively, a numeric
+#'   vector of per-replicate scores.
+#' @param tol Numeric tolerance for binning floating-point scores. Scores
+#'   that differ by less than `tol` are treated as equal. The default
+#'   (`1e-4`) is suitable for implied-weights and profile-parsimony scores;
+#'   use `0` for strict equality when working with equal-weights (integer)
+#'   scores.
+#'
+#' @return A list of class `"ScoreSpectrum"` with components:
+#' \describe{
+#'   \item{`n_replicates`}{Total completed replicates.}
+#'   \item{`observed_levels`}{Distinct score values observed (\eqn{S_\mathrm{obs}}).}
+#'   \item{`estimated_levels`}{Chao1 lower-bound estimate of total score
+#'     levels (\eqn{\hat{S}}).}
+#'   \item{`coverage`}{Good-Turing sample coverage (\eqn{\hat{C}}).}
+#'   \item{`unseen_fraction`}{Estimated fraction of score levels not yet
+#'     seen: \eqn{1 - S_\mathrm{obs}/\hat{S}}.}
+#'   \item{`best_score`}{The lowest (best) score found.}
+#'   \item{`best_score_reps`}{Number of replicates that reached the best
+#'     score.}
+#'   \item{`f`}{Named integer vector: \eqn{f_k} = number of score levels
+#'     seen exactly \eqn{k} times (frequency spectrum).}
+#'   \item{`replicate_scores`}{The raw per-replicate scores.}
+#' }
+#'
+#' @references
+#' \insertAllCited{}
+#'
+#' @examples
+#' library("TreeTools", quietly = TRUE)
+#' data("Lobo", package = "TreeSearch")
+#' result <- MaximizeParsimony(Lobo.phy, maxReplicates = 4L)
+#' sp <- ScoreSpectrum(result)
+#' print(sp)
+#'
+#' @family search utilities
+#' @export
+ScoreSpectrum <- function(trees, tol = 1e-4) {
+  # Accept either a multiPhylo with attribute or a raw numeric vector
+  if (inherits(trees, "multiPhylo")) {
+    scores <- attr(trees, "replicate_scores")
+    if (is.null(scores)) {
+      stop("`trees` has no `replicate_scores` attribute. ",
+           "Re-run MaximizeParsimony() with this version of TreeSearch.")
+    }
+  } else if (is.numeric(trees)) {
+    scores <- trees
+  } else {
+    stop("`trees` must be a `multiPhylo` from MaximizeParsimony() or a ",
+         "numeric vector of per-replicate scores.")
+  }
+
+  scores <- scores[is.finite(scores)]
+  n <- length(scores)
+
+  if (n < 2L) {
+    return(structure(
+      list(
+        n_replicates = n,
+        observed_levels = if (n == 0L) 0L else 1L,
+        estimated_levels = NA_real_,
+        coverage = NA_real_,
+        unseen_fraction = NA_real_,
+        best_score = if (n > 0L) min(scores) else NA_real_,
+        best_score_reps = if (n > 0L) sum(scores == min(scores)) else 0L,
+        f = integer(0L),
+        replicate_scores = scores
+      ),
+      class = "ScoreSpectrum"
+    ))
+  }
+
+  # Bin scores to handle floating-point equality
+  if (tol > 0) {
+    scores_binned <- round(scores / tol) * tol
+  } else {
+    scores_binned <- scores
+  }
+
+  # Frequency of each distinct score value (abundance vector)
+  abundance <- tabulate(factor(scores_binned))
+  s_obs <- length(abundance) # distinct score levels observed
+
+  # Frequency spectrum: f_k = number of score levels seen exactly k times
+  max_k <- max(abundance)
+  f_k <- tabulate(abundance, nbins = max_k)
+  f1 <- if (max_k >= 1L) f_k[1L] else 0L
+  f2 <- if (max_k >= 2L) f_k[2L] else 0L
+
+  # Good-Turing sample coverage
+  coverage <- 1.0 - f1 / n
+
+  # Chao1 lower-bound estimate of total richness
+  if (f2 > 0L) {
+    s_hat <- s_obs + f1^2 / (2 * f2)
+  } else if (f1 > 1L) {
+    # Bias-corrected form when no doubletons
+    s_hat <- s_obs + f1 * (f1 - 1L) / 2
+  } else {
+    # All observed levels are well-represented
+    s_hat <- s_obs
+  }
+
+  unseen_fraction <- if (s_hat > 0) 1 - s_obs / s_hat else 0
+
+  best_score <- min(scores_binned)
+  best_score_reps <- sum(scores_binned == best_score) # same bin => identical double; `<= best + tol` also swept in the next bin
+
+  # Trim trailing zeros from f_k for a compact spectrum
+  last_nonzero <- max(which(f_k > 0L), 0L)
+  f_k_trimmed <- f_k[seq_len(last_nonzero)]
+  names(f_k_trimmed) <- seq_len(last_nonzero)
+
+  structure(
+    list(
+      n_replicates = n,
+      observed_levels = s_obs,
+      estimated_levels = s_hat,
+      coverage = coverage,
+      unseen_fraction = unseen_fraction,
+      best_score = best_score,
+      best_score_reps = best_score_reps,
+      f = f_k_trimmed,
+      replicate_scores = scores
+    ),
+    class = "ScoreSpectrum"
+  )
+}
+
+#' @export
+print.ScoreSpectrum <- function(x, ...) {
+  if (is.na(x$coverage)) {
+    cat("ScoreSpectrum: insufficient replicates (n =", x$n_replicates, ")\n")
+    return(invisible(x))
+  }
+  cat(sprintf(
+    "Score-spectrum coverage (n = %d replicates)\n",
+    x$n_replicates
+  ))
+  cat(sprintf(
+    " Best score: %.4g (%d replicates)\n",
+    x$best_score, x$best_score_reps
+  ))
+  cat(sprintf(
+    " Score levels seen: %d (est. total: %.1f)\n",
+    x$observed_levels, x$estimated_levels
+  ))
+  cat(sprintf(
+    " Landscape coverage: %.1f%%",
+    100 * x$coverage
+  ))
+  if (x$unseen_fraction > 0.01) {
+    cat(sprintf(" (~%.0f%% of score levels unseen)", 100 * x$unseen_fraction))
+  }
+  cat("\n")
+  if (length(x$f) > 0L) {
+    cat(" Frequency spectrum (f_k): ")
+    cat(paste0("f", names(x$f), "=", x$f, collapse = ", "), "\n")
+  }
+  invisible(x)
+}
diff --git a/R/SearchControl.R b/R/SearchControl.R
new file mode 100644
index 000000000..26fed429a
--- /dev/null
+++ b/R/SearchControl.R
@@ -0,0 +1,380 @@
+#' Expert search heuristic parameters
+#'
+#' Construct a list of low-level search parameters for
+#' [`MaximizeParsimony()`]. Most users can ignore these and rely on the
+#' `strategy` presets (`"sprint"`, `"default"`, `"thorough"`); `SearchControl`
+#' is provided for expert tuning.
+#' +#' The parameters correspond to heuristics described by +#' \insertCite{Goloboff1999;textual}{TreeSearch} +#' (sectorial search, tree drifting, tree fusing) and +#' \insertCite{Nixon1999;textual}{TreeSearch} +#' (parsimony ratchet), as implemented in TNT +#' \insertCite{Goloboff2016}{TreeSearch}. +#' +#' @param tbrMaxHits Integer; number of equally-scoring trees to accept +#' before stopping a TBR pass. +#' @param clipOrder Integer (experimental); clip-ordering strategy for TBR +#' search. Determines the order in which edges are tried as clip points. +#' 0 = random (default); 1 = inverse-weight (fewest descendant taxa first); +#' 2 = tips-first (terminal edges before internal); 3 = bucket ordering; +#' 4 = anti-tip (internal before terminal); 5 = large-first (most descendant +#' taxa first). On datasets with \eqn{\ge}65 tips, \code{clipOrder = 2L} +#' (tips-first) typically increases replicate throughput by 5--15\% by +#' evaluating higher-probability improvement candidates earlier. +#' @param nniFirst Logical; run an NNI pass before SPR/TBR in each replicate? +#' At small tree sizes (\eqn{\le}88 tips) overhead is negligible; at \eqn{\ge}100 tips +#' this significantly accelerates the initial descent from the Wagner tree. +#' @param sprFirst Logical; run an SPR pass before TBR in each replicate? +#' @param tabuSize Integer; tabu list size for TBR plateau exploration. +#' @param wagnerStarts Integer; random Wagner starting trees per replicate. +#' @param ratchetCycles Integer; number of ratchet perturbation cycles. +#' @param ratchetPerturbProb Numeric (0--1); probability of perturbing each +#' character. +#' @param ratchetPerturbMode Integer; 0 = zero-weight only, 1 = up-weight only, +#' 2 = mixed. +#' @param ratchetPerturbMaxMoves Integer; maximum TBR moves per perturbation +#' cycle (0 = automatic). +#' @param ratchetAdaptive Logical; adjust perturbation probability based on +#' within-replicate escape rate? 
+#' @param ratchetTaper Logical; taper ratchet perturbation probability across +#' replicates as the pool stabilizes? When `TRUE`, early replicates use +#' the full `ratchetPerturbProb`; later replicates (with high hit rates) +#' use a reduced probability for finer local exploration. The effective +#' probability is `ratchetPerturbProb * max(floor, 1 - strength * hitRate)` +#' where `hitRate` is the fraction of replicates that found the current +#' best score. Default `FALSE`. +#' @param driftCycles Integer; number of drift search cycles. +#' @param driftAfdLimit Integer; maximum absolute fit difference (steps) for +#' accepting a suboptimal drift move. +#' @param driftRfdLimit Numeric; maximum relative fit difference for +#' accepting a suboptimal drift move. +#' @param xssRounds Integer; rounds of exclusive sectorial search. +#' @param xssPartitions Integer; number of partitions in XSS. +#' @param rssRounds Integer; rounds of random sectorial search. +#' @param cssRounds Integer; rounds of constrained (sector-restricted TBR) +#' sectorial search. +#' @param cssPartitions Integer; number of partitions in CSS. +#' @param sectorMinSize,sectorMaxSize Integer; minimum and maximum clade +#' sizes for sectorial search. +#' @param postRatchetSectorial Logical; when `TRUE`, run XSS+RSS+CSS again +#' after ratchet perturbation using the same round counts. Approximates +#' TNT's interleaved sectorial pattern. Default: `FALSE`. +#' @param fuseInterval Integer; fuse pool trees every _n_ replicates. +#' @param fuseAcceptEqual Logical; accept equally-scoring fused trees? +#' @param intraFuse Logical; fuse the current tree against pool donors +#' within each replicate, after TBR polish. This approximates TNT's +#' within-replicate fusing pattern. Default: `FALSE`. +#' @param poolMaxSize Integer; maximum trees retained in the pool. +#' @param poolSuboptimal Numeric; retain trees that are this many steps +#' worse than the best tree. 0 (default) keeps only optimal trees. 
+#' @param consensusStableReps Integer; stop when the strict consensus of +#' best-score pool trees has been unchanged for this many consecutive +#' replicates. +#' 0 (default) disables this criterion; a typical value is 3--5. +#' When both `consensusStableReps` and `targetHits` are active, the search +#' stops when either criterion is met first. +#' @param perturbStopFactor Integer; stop after +#' `nTip * perturbStopFactor` consecutive replicates that fail to improve +#' the best score. 0 disables this criterion. +#' Default 2, which provides 2.4--6.9\ifelse{html}{×}{x} speedup on +#' converged searches with no score degradation. +#' Complementary to `targetHits`: on hard landscapes where few replicates +#' independently find the best score, `perturbStopFactor` fires first; +#' on easy landscapes, `targetHits` fires first. +#' Inspired by IQ-TREE's unsuccessful-perturbation stopping rule +#' \insertCite{Nguyen2015}{TreeSearch}; adapted from per-perturbation to +#' per-replicate granularity. +#' @param adaptiveLevel Logical; dynamically scale ratchet and drift effort +#' based on the observed hit rate? When `TRUE`, easy landscapes +#' (high hit rate) trigger reduced effort per replicate, while hard +#' landscapes trigger increased effort. Default `FALSE`. +#' @param nniPerturbCycles Integer; number of stochastic NNI-perturbation +#' cycles per replicate. Each cycle randomly applies NNI swaps to a +#' fraction of internal branches, then runs TBR to find a new local +#' optimum. Complementary to the weight-perturbation ratchet: the ratchet +#' perturbs the objective function, while NNI-perturbation perturbs the +#' topology directly. +#' 0 (default) disables NNI perturbation. +#' Inspired by `doRandomNNIs()` in IQ-TREE +#' \insertCite{Nguyen2015}{TreeSearch}. +#' @param nniPerturbFraction Numeric (0--1); fraction of internal branches +#' to swap during each NNI-perturbation cycle. Default 0.5. 
+#' @param pruneReinsertCycles Integer; number of taxon pruning-reinsertion +#' perturbation cycles per replicate. Each cycle drops a fraction of leaves, +#' runs TBR on the reduced tree to let the backbone restructure, then +#' greedily reinserts the dropped taxa via Wagner addition and TBR-polishes +#' the full tree. Complementary to the ratchet (which perturbs character +#' weights) and NNI-perturbation (which perturbs the topology directly). +#' 0 (default) disables this perturbation. +#' @param pruneReinsertDrop Numeric (0--1); fraction of tips to drop per +#' cycle. Default 0.10 (10%). Always drops at least 3 tips and keeps +#' at least 4. +#' @param pruneReinsertSelection Integer; tip selection strategy for choosing +#' which tips to drop: +#' - `0` = random (default). +#' - `1` = instability-weighted: tips whose parent-edge split is rare across +#' pool trees are preferentially dropped. Requires \eqn{\ge}2 pool trees; +#' falls back to random otherwise. +#' - `2` = missing-data-weighted: tips with more ambiguous or inapplicable +#' characters are preferentially dropped. High-missingness taxa are +#' hardest to score correctly and most likely to be trapped in suboptimal +#' positions. +#' - `3` = combined: weight = instability × (1 + normalised missingness). +#' Targets taxa that are both unstably placed and data-poor. +#' @param pruneReinsertTbrMoves Integer; maximum number of TBR moves accepted +#' during the reduced-tree backbone optimisation phase of each +#' prune-reinsert cycle. 0 means run to convergence; the default of 5 +#' mirrors the ratchet design (short perturbation, many diverse cycles) +#' and substantially reduces per-cycle cost on datasets with inapplicable +#' characters (where Brazeau scoring dominates). Increase towards 0 if +#' you prefer thorough backbone optimisation over replicate throughput. +#' @param pruneReinsertFullMoves Integer; maximum TBR moves during the +#' full-tree polish after each prune-reinsert cycle. 
0 (default) runs +#' to convergence. Has no effect when `pruneReinsertNni = TRUE`. +#' @param pruneReinsertNni Logical; if `TRUE`, use NNI (nearest-neighbour +#' interchange) instead of TBR for the full-tree polish step. NNI +#' converges roughly 5x faster than TBR at large tip counts (\eqn{\ge}120), +#' substantially reducing per-cycle cost while still reaching a local +#' optimum before the outer-loop TBR polish. Default `FALSE`. +#' @param consensusConstrain Logical; lock the strict consensus of pool +#' trees as topological constraints for subsequent replicates? When +#' `TRUE`, after enough replicates (\eqn{\ge}5), splits present in ALL +#' best-score pool trees are enforced as constraints, focusing search on +#' uncertain regions. Constraints are cleared whenever a new best score +#' is found. Only active when no user-supplied `constraint` is +#' present. Default `FALSE`. +#' @param wagnerBias Integer; criterion for biasing taxon addition order +#' during Wagner tree construction. 0 = random (default), +#' 1 = Goloboff (2014) non-ambiguous-character priority, +#' 2 = entropy-based state-specificity priority. Biased orders use +#' softmax-weighted sampling for diversity across replicates. +#' @param wagnerBiasTemp Numeric; softmax temperature controlling +#' selectivity of biased Wagner addition (default 0.3). Lower values +#' concentrate sampling on the highest-scoring taxa; higher values +#' approach uniform random. +#' @param outerCycles Integer; number of outer search cycles per replicate +#' (default 1). Each outer cycle runs the full +#' \[XSS/RSS/CSS → ratchet → NNI-perturbation → drift → TBR\] sequence, +#' with perturbation cycles divided evenly among outer iterations. +#' Matches the interleaved sectorial + ratchet pattern of TNT's `xmult` +#' \insertCite{Goloboff1999}{TreeSearch}. +#' @param maxOuterResets Integer; maximum number of improvement-triggered +#' resets of the outer cycle counter (default 0 = no resets, so +#' `outerCycles` is exact). 
When the search finds a new best score during +#' an outer cycle, the counter resets up to this many times, allowing +#' productive re-exploration. Set to \eqn{-1} for unlimited resets. +#' Strategy presets (`"default"`, `"thorough"`) set 2–3. +#' @param annealCycles Integer; number of simulated annealing perturbation +#' cycles (PCSA) per replicate. Each cycle perturbs the current best tree +#' via scheduled SA cooling, then reconverges with TBR. If the result +#' improves on the best, it becomes the new starting point. Effective at +#' escaping deep basins under equal-weights parsimony at \eqn{\ge}100 tips. +#' 0 (default) disables SA perturbation. +#' @param annealPhases Integer; number of temperature steps in the linear +#' cooling schedule per SA cycle (default 5). +#' @param annealTStart Numeric; initial Boltzmann temperature for SA cooling +#' schedule (default 20). Higher temperatures accept more suboptimal moves. +#' @param annealTEnd Numeric; final Boltzmann temperature (default 0 = +#' strict hill-climbing at end of each cycle). +#' @param annealMovesPerPhase Integer; stochastic TBR moves per temperature +#' step (default 0 = number of tips). +#' @param enumTimeFraction Numeric between 0 and 0.5; fraction of `maxSeconds` +#' reserved for MPT enumeration (TBR plateau walk to discover additional +#' equal-score topologies). The main search loop exits at +#' `maxSeconds * (1 - enumTimeFraction)`. Set to 0 to disable the reserve +#' (pre-v1.6 behaviour: enumeration skipped if the main loop times out). +#' Default: `0.1` (10%). +#' @param adaptiveStart Logical; use Thompson-sampling (bandit) strategy +#' selection for starting trees? When `TRUE`, each replicate draws its +#' starting strategy from a pool of options (random Wagner, biased Wagner, +#' random tree, pool ratchet, pool NNI-perturb), adapting to which +#' strategies yield the best scores. Default `FALSE`. +#' +#' @return A named list of class `"SearchControl"`. 
+#'
+#' @examples
+#' # Use defaults
+#' SearchControl()
+#'
+#' # Light ratchet, no drift
+#' SearchControl(ratchetCycles = 5L, ratchetPerturbProb = 0.04,
+#'               driftCycles = 0L)
+#'
+#' @family tree search functions
+#' @seealso [`MaximizeParsimony()`]
+#' @references
+#' \insertAllCited{}
+#' @export
+SearchControl <- function(
+  # TBR
+  tbrMaxHits = 1L,
+  # TBR clip ordering strategy (experimental).
+  # 0L=RANDOM (default), 1L=INV_WEIGHT (w=1/(1+s)), 2L=TIPS_FIRST,
+  # 3L=BUCKET (tips/small/large), 4L=ANTI_TIP (non-tips first),
+  # 5L=LARGE_FIRST (large then small then tips)
+  clipOrder = 0L,
+  nniFirst = TRUE,
+  sprFirst = FALSE,
+  tabuSize = 100L,
+  wagnerStarts = 1L,
+  # Wagner biased addition (Goloboff 2014 §3.3)
+  # 0L = random (default), 1L = Goloboff non-ambiguous score, 2L = entropy
+  wagnerBias = 0L,
+  wagnerBiasTemp = 0.3,
+  # Outer search cycle count (Goloboff 1999 §2.3)
+  # Repeat [XSS → Ratchet → NNI-perturb → Drift → TBR] this many times.
+  # Cycles are divided evenly; default 1 = single pipeline pass.
+  outerCycles = 1L,
+  # Max improvement-triggered resets of the outer cycle counter.
+  # 0 = no resets (outerCycles is exact); -1 = unlimited.
+  # Strategy presets set 2-3 for productive re-exploration.
+  maxOuterResets = 0L,
+  # Ratchet
+  ratchetCycles = 12L,
+  ratchetPerturbProb = 0.25,
+  ratchetPerturbMode = 0L,
+  ratchetPerturbMaxMoves = 5L,
+  ratchetAdaptive = FALSE,
+  ratchetTaper = FALSE,
+  # NNI perturbation
+  nniPerturbCycles = 0L,
+  nniPerturbFraction = 0.5,
+  # Drift
+  driftCycles = 0L,
+  driftAfdLimit = 5L,
+  driftRfdLimit = 0.15,
+  # Sectorial
+  xssRounds = 3L,
+  xssPartitions = 4L,
+  rssRounds = 1L,
+  cssRounds = 0L,
+  cssPartitions = 4L,
+  sectorMinSize = 6L,
+  sectorMaxSize = 50L,
+  postRatchetSectorial = FALSE,
+  # Fuse / pool
+  fuseInterval = 3L,
+  fuseAcceptEqual = FALSE,
+  intraFuse = FALSE,
+  poolMaxSize = 100L,
+  poolSuboptimal = 0,
+  # Stopping criteria
+  consensusStableReps = 0L,
+  perturbStopFactor = 2L,
+  adaptiveLevel = FALSE,
+  consensusConstrain = FALSE,
+  # Taxon pruning-reinsertion (T-266)
+  pruneReinsertCycles = 0L,
+  pruneReinsertDrop = 0.10,
+  pruneReinsertSelection = 0L,
+  pruneReinsertTbrMoves = 5L,
+  pruneReinsertFullMoves = 0L,
+  pruneReinsertNni = FALSE,
+  # Simulated annealing perturbation (PCSA, T-207)
+  annealCycles = 0L,
+  annealPhases = 5L,
+  annealTStart = 20,
+  annealTEnd = 0,
+  annealMovesPerPhase = 0L,
+  # Adaptive starting-tree strategy (T-190)
+  # When TRUE, each replicate draws its starting strategy via Thompson
+  # sampling from {Wagner-random, Wagner-Goloboff, Wagner-entropy,
+  # random-tree, pool-ratchet, pool-NNI-perturb}. Overrides wagnerBias.
+  adaptiveStart = FALSE,
+  enumTimeFraction = 0.1
+) {
+  structure(
+    list(
+      tbrMaxHits = as.integer(tbrMaxHits),
+      clipOrder = as.integer(clipOrder),
+      nniFirst = as.logical(nniFirst),
+      sprFirst = as.logical(sprFirst),
+      tabuSize = as.integer(tabuSize),
+      wagnerStarts = as.integer(wagnerStarts),
+      wagnerBias = as.integer(wagnerBias),
+      wagnerBiasTemp = as.double(wagnerBiasTemp),
+      outerCycles = as.integer(outerCycles),
+      maxOuterResets = as.integer(maxOuterResets),
+      ratchetCycles = as.integer(ratchetCycles),
+      ratchetPerturbProb = as.double(ratchetPerturbProb),
+      ratchetPerturbMode = as.integer(ratchetPerturbMode),
+      ratchetPerturbMaxMoves = as.integer(ratchetPerturbMaxMoves),
+      ratchetAdaptive = as.logical(ratchetAdaptive),
+      ratchetTaper = as.logical(ratchetTaper),
+      nniPerturbCycles = as.integer(nniPerturbCycles),
+      nniPerturbFraction = as.double(nniPerturbFraction),
+      driftCycles = as.integer(driftCycles),
+      driftAfdLimit = as.integer(driftAfdLimit),
+      driftRfdLimit = as.double(driftRfdLimit),
+      xssRounds = as.integer(xssRounds),
+      xssPartitions = as.integer(xssPartitions),
+      rssRounds = as.integer(rssRounds),
+      cssRounds = as.integer(cssRounds),
+      cssPartitions = as.integer(cssPartitions),
+      sectorMinSize = as.integer(sectorMinSize),
+      sectorMaxSize = as.integer(sectorMaxSize),
+      postRatchetSectorial = as.logical(postRatchetSectorial),
+      fuseInterval = as.integer(fuseInterval),
+      fuseAcceptEqual = as.logical(fuseAcceptEqual),
+      intraFuse = as.logical(intraFuse),
+      poolMaxSize = as.integer(poolMaxSize),
+      poolSuboptimal = as.double(poolSuboptimal),
+      consensusStableReps = as.integer(consensusStableReps),
+      perturbStopFactor = as.integer(perturbStopFactor),
+      adaptiveLevel = as.logical(adaptiveLevel),
+      consensusConstrain = as.logical(consensusConstrain),
+      pruneReinsertCycles = as.integer(pruneReinsertCycles),
+      pruneReinsertDrop = as.double(pruneReinsertDrop),
+      pruneReinsertSelection = as.integer(pruneReinsertSelection),
+      pruneReinsertTbrMoves = as.integer(pruneReinsertTbrMoves),
+      pruneReinsertFullMoves = as.integer(pruneReinsertFullMoves),
+      pruneReinsertNni = as.logical(pruneReinsertNni), # logical flag, coerced like nniFirst / intraFuse
+      annealCycles = as.integer(annealCycles),
+      annealPhases = as.integer(annealPhases),
+      annealTStart = as.double(annealTStart),
+      annealTEnd = as.double(annealTEnd),
+      annealMovesPerPhase = as.integer(annealMovesPerPhase),
+      adaptiveStart = as.logical(adaptiveStart),
+      enumTimeFraction = as.double(enumTimeFraction)
+    ),
+    class = "SearchControl"
+  )
+}
+
+#' @export
+print.SearchControl <- function(x, ...) {
+  groups <- list(
+    "TBR" = c("tbrMaxHits", "clipOrder", "nniFirst", "sprFirst", "tabuSize",
+              "wagnerStarts", "wagnerBias", "wagnerBiasTemp", "outerCycles",
+              "maxOuterResets"),
+    "Ratchet" = c("ratchetCycles", "ratchetPerturbProb", "ratchetPerturbMode",
+                  "ratchetPerturbMaxMoves", "ratchetAdaptive",
+                  "ratchetTaper"),
+    "NNI Perturbation" = c("nniPerturbCycles", "nniPerturbFraction"),
+    "Drift" = c("driftCycles", "driftAfdLimit", "driftRfdLimit"),
+    "Prune-Reinsert" = c("pruneReinsertCycles", "pruneReinsertDrop",
+                         "pruneReinsertSelection", "pruneReinsertTbrMoves",
+                         "pruneReinsertFullMoves", "pruneReinsertNni"),
+    "Annealing" = c("annealCycles", "annealPhases", "annealTStart",
+                    "annealTEnd", "annealMovesPerPhase"),
+    "Sectorial" = c("xssRounds", "xssPartitions", "rssRounds",
+                    "cssRounds", "cssPartitions",
+                    "sectorMinSize", "sectorMaxSize",
+                    "postRatchetSectorial"),
+    "Fuse/Pool" = c("fuseInterval", "fuseAcceptEqual", "intraFuse",
+                    "poolMaxSize", "poolSuboptimal"),
+    "Stopping" = c("consensusStableReps", "perturbStopFactor",
+                   "adaptiveLevel",
+                   "consensusConstrain", "adaptiveStart",
+                   "enumTimeFraction")
+  )
+  cat("SearchControl object\n")
+  for (gname in names(groups)) {
+    cat(sprintf(" %s:\n", gname))
+    for (pname in groups[[gname]]) {
+      cat(sprintf(" %-25s = %s\n", pname, format(x[[pname]])))
+    }
+  }
+  invisible(x)
+}
diff --git a/R/SuccessiveApproximations.R b/R/SuccessiveApproximations.R
index
33f204932..104198128 100644 --- a/R/SuccessiveApproximations.R +++ b/R/SuccessiveApproximations.R @@ -8,11 +8,17 @@ #' @param outgroup if not NULL, taxa on which the tree should be rooted #' @param k Constant for successive approximations, see Farris 1969 p. 379 #' @param maxSuccIter maximum iterations of successive approximation -#' @param ratchetHits maximum hits for parsimony ratchet -#' @param searchHits maximum hits in tree search -#' @param searchIter maximum iterations in tree search -#' @param ratchetIter maximum iterations of parsimony ratchet -#' @param suboptimal retain trees that are this proportion less optimal than the optimal tree +#' @param ratchetHits Number of replicates. +#' Internally capped at 100 and passed to the C++ engine as `maxReplicates`. +#' @param searchHits Convergence criterion: stop after finding this many +#' trees with the best score. +#' Internally capped at 10 and passed to the C++ engine as `targetHits`. +#' @param searchIter Unused (retained for backward compatibility). +#' @param ratchetIter Controls ratchet intensity within each replicate. +#' Converted to `ratchetCycles` (approximately `ratchetIter / 500`, +#' capped at 10). +#' @param suboptimal Retain trees that are this proportion less optimal +#' than the optimal tree. #' #' @return `SuccessiveApproximations()` returns a list of class `multiPhylo` #' containing optimal (and slightly suboptimal, if suboptimal > 0) trees. 
@@ -27,49 +33,113 @@ SuccessiveApproximations <- function (tree, dataset, outgroup = NULL, k = 3, maxSuccIter = 20, ratchetHits = 100, searchHits = 50, searchIter = 500, ratchetIter = 5000, verbosity = 0, - suboptimal = 0.1) { - - if (k < 1) stop ("k should be at least 1, see Farris 1969 p.379") - attr(dataset, "sa.weights") <- rep.int(1, length(attr(dataset, "weight"))) - collectSuboptimal <- suboptimal > 0 - - max.node <- max(tree[["edge"]][, 1]) - n.tip <- length(tree[["tip.label"]]) - n.node <- max.node - n.tip - bests <- vector("list", maxSuccIter + 1L) - bestsConsensus <- vector("list", maxSuccIter + 1L) - best <- bests[[1]] <- bestsConsensus[[1]] <- root(tree, outgroup, resolve.root=TRUE) - for (i in seq_len(maxSuccIter) + 1L) { - if (verbosity > 0) message("\nSuccessive Approximations Iteration ", i - 1L) - attr(best, "score") <- NULL - if (suboptimal > 0) { - suboptimalSearch <- suboptimal * sum(attr(dataset, "sa.weights") * - attr(dataset, "weight")) - } - trees <- Ratchet(best, dataset, TreeScorer = SuccessiveWeights, - all = collectSuboptimal, - suboptimal = suboptimalSearch, - rearrangements = "NNI", - ratchetHits=ratchetHits, searchHits = searchHits, - searchIter = searchIter, ratchetIter = ratchetIter, - outgroup = outgroup, verbosity = verbosity - 1) - trees <- unique(trees) - bests[[i]] <- trees - suboptimality <- Suboptimality(trees) - bestsConsensus[[i]] <- consensus(trees[suboptimality == 0]) - if (all.equal(bestsConsensus[[i]], bestsConsensus[[i - 1]])) { - return(bests[2:i]) + suboptimal = 0.1, + concavity = Inf, + constraint = NULL, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5) { + + if (k < 1) stop("k should be at least 1, see Farris 1969 p.379") + + if (!inherits(dataset, "phyDat")) { + stop("`dataset` must be of class `phyDat`.") + } + + # Profile parsimony: prepare data + useProfile <- identical(concavity, "profile") + if (useProfile) { + dataset <- PrepareDataProfile(dataset) + concavity <- Inf + } + if (is.finite(concavity) 
&& concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") + } + + nTip <- length(dataset) + + # Extract data for C++ engine + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = nTip, byrow = TRUE) + weight <- at$weight + levels <- at$levels + + # Prepare constraint + consArgs <- .PrepareConstraint(constraint = constraint, dataset = dataset) + + # Profile parsimony: extract info_amounts + profileArgs <- list() + if (useProfile) { + infoAmounts <- attr(dataset, "info.amounts") + if (!is.null(infoAmounts) && length(infoAmounts) > 0L) { + profileArgs$infoAmounts <- infoAmounts } - best <- trees[suboptimality == 0][[1]] - l.i <- CharacterLength(best, dataset, compress = TRUE) - p.i <- l.i / (n.node - 1) - w.i <- ((p.i)^-k) - 1 - attr(dataset, "sa.weights") <- w.i } - message("Stability not reached.") - - # Return: - structure(bests, class = "multiPhylo") + + # XPIWE: compute per-pattern observed-taxa counts + useXpiwe <- isTRUE(extended_iw) && is.finite(concavity) && !useProfile + if (useXpiwe) { + obsCount <- .ObsCount(dataset) + } + + searchArgs <- list( + contrast = contrast, + tip_data = tip_data, + weight = weight, + levels = levels, + saK = as.double(k), + maxSAIter = as.integer(maxSuccIter), + maxReplicates = as.integer(min(ratchetHits, 100L)), + targetHits = as.integer(min(searchHits, 10L)), + tbrMaxHits = 1L, + ratchetCycles = as.integer(min(ceiling(ratchetIter / 500), 10L)), + min_steps = if (is.finite(concavity)) + as.integer(MinimumLength(dataset, compress = TRUE)) else integer(0), + concavity = as.double(concavity), + xpiwe = useXpiwe, + xpiwe_r = as.double(xpiwe_r), + xpiwe_max_f = as.double(xpiwe_max_f), + obs_count = if (useXpiwe) obsCount else integer(0) + ) + result <- do.call(ts_successive_approx, c(searchArgs, consArgs, profileArgs)) + + if (result$converged && verbosity > 0) { + message("Successive 
approximations converged after ", + result$sa_iterations, " iteration(s).") + } else if (!result$converged) { + message("Stability not reached after ", result$sa_iterations, + " iteration(s).") + } + + # Reconstruct phylo from C++ edge matrix + if (nrow(result$edge) == 0L) { + tr <- if (!missing(tree) && inherits(tree, "phylo")) tree + else AdditionTree(dataset) + attr(tr, "score") <- result$score + } else { + tr <- structure( + list(edge = result$edge, + tip.label = names(dataset), + Nnode = nTip - 1L), + class = "phylo" + ) + attr(tr, "score") <- result$score + } + + if (!is.null(outgroup)) { + tr <- root(tr, outgroup, resolve.root = TRUE) + } + + structure( + list(tr), + score = result$score, + sa_iterations = result$sa_iterations, + converged = result$converged, + class = "multiPhylo" + ) } #' Tree suboptimality diff --git a/R/TBR.R b/R/TBR.R index 3c84dd624..d0bf968ef 100644 --- a/R/TBR.R +++ b/R/TBR.R @@ -39,8 +39,8 @@ TBRWarning <- function (parent, child, error) { #' #' @return `TBR()` returns a tree in \code{phyDat} format that has undergone one #' \acronym{TBR} iteration. -#' @references The \acronym{TBR} algorithm is summarized in -#' \insertRef{Felsenstein2004}{TreeSearch} +#' @references \insertCite{Felsenstein2004}{TreeSearch} +#' \insertAllCited{} #' #' @examples #' library("ape") @@ -102,7 +102,6 @@ TBRMoves.matrix <- function (tree, edgeToBreak = integer(0)) { unique(allMoves) } -## TODO Do edges need to be pre-ordered before coming here? #' @describeIn TBR faster version that takes and returns parent and child #' parameters #' @inheritParams TreeTools::NeworderPhylo @@ -117,7 +116,7 @@ TBRSwap <- function(parent, child, nEdge = length(parent), edgeToBreak = NULL, mergeEdges = NULL) { if (nEdge < 5) { - return (list(parent, child)) #TODO do we need to re-root this tree? 
+ return (list(parent, child)) } # Pick an edge at random @@ -361,7 +360,6 @@ RootedTBRSwap <- function (parent, child, nEdge=length(parent), if (sum(subtreeEdges, -edgesCutAdrift) > 2) { break; # the edge itself, and somewheres else } - # TODO check that all expected selections are valid selectableEdges[edgeToBreak] <- FALSE ###Assert(any(selectableEdges)) edgeToBreak <- SampleOne(which(selectableEdges)) diff --git a/R/TaxonInfluence.R b/R/TaxonInfluence.R index 51397ddfc..99402c7e5 100644 --- a/R/TaxonInfluence.R +++ b/R/TaxonInfluence.R @@ -82,7 +82,7 @@ #' bestTree <- MaximizeParsimony(dataset, verbosity = 0)[[1]] #' #' # Calculate tip influence -#' influence <- TaxonInfluence(dataset, ratchIt = 0, startIt = 0, verbos = 0) +#' influence <- TaxonInfluence(dataset, maxReplicates = 2, verbosity = 0) #' #' # Colour tip labels according to their influence #' upperBound <- 2 * TreeDist::ClusteringEntropy( @@ -106,7 +106,7 @@ #' @family tree scoring #' @importFrom ape read.nexus write.nexus #' @importFrom cli cli_alert_info cli_h1 -#' @importFrom fs path_sanitize + #' @importFrom stats weighted.mean #' @importFrom TreeDist ClusteringInfoDistance #' @encoding UTF-8 @@ -141,19 +141,15 @@ TaxonInfluence <- function( } } - startTree <- MakeTreeBinary(if (inherits(tree, "phylo")) { - tree - } else { - tree[[1]] - }) - if (!inherits(startTree, "phylo")) { + refTree <- if (inherits(tree, "phylo")) tree else tree[[1]] + if (!inherits(refTree, "phylo")) { stop("`tree` must be an object / list of objects of class \"phylo\"") } # Return: vapply(names(dataset), function(leaf) { - leafFile <- paste0(savePath, path_sanitize(leaf), ".nex") + leafFile <- paste0(savePath, gsub("[/\\\\:*?\"<>|[:cntrl:]]", "_", leaf), ".nex") result <- if (useCache && file.exists(leafFile)) { if (verbosity > 1) { @@ -171,7 +167,6 @@ TaxonInfluence <- function( } result <- unique(MaximizeParsimony( dataset = dataset[setdiff(names(dataset), leaf)], - tree = DropTip(startTree, leaf), verbosity = verbosity, 
... )) diff --git a/R/WhenFirstHit.R b/R/WhenFirstHit.R index 5e0fbad36..048c8fff5 100644 --- a/R/WhenFirstHit.R +++ b/R/WhenFirstHit.R @@ -4,7 +4,7 @@ #' This information is read from the `firstHit` attribute if present. #' If not, trees are taken to be listed in the order in which they were found, #' and named according to the search iteration in which they were first hit - -#' the situation when trees found by [`MaximizeParsimony()`] are saved to file. +#' the situation when trees found by [`Morphy()`] are saved to file. #' #' @param trees A list of trees, or a `multiPhylo` object. #' @return `trees`, with a `firstHit` attribute listing the number of trees hit @@ -23,7 +23,7 @@ #' attr(WhenFirstHit(trees), "firstHit") #' @family utility functions #' @seealso -#' - [`MaximizeParsimony()`] +#' - [`Morphy()`] #' @export WhenFirstHit <- function(trees) { if (is.null(attr(trees, "firstHit"))) { diff --git a/R/WideSample.R b/R/WideSample.R new file mode 100644 index 000000000..7b11d3911 --- /dev/null +++ b/R/WideSample.R @@ -0,0 +1,129 @@ +#' Select a topologically diverse subset of trees +#' +#' Selects `n` trees from a `multiPhylo` object that are as +#' topologically distinct from one another as possible, using greedy +#' furthest-point (maximin) selection. This is useful when a search +#' returns many most-parsimonious trees and downstream analyses +#' (consensus, tree-space visualization) need a manageable but +#' diverse subset. +#' +#' Uniform random subsampling of MPTs is misleading: the number of trees +#' in a region of tree space reflects the density of the parsimony +#' landscape, not the likelihood or support for that topology. +#' A random draw over-represents topologies that happen to sit on broad +#' plateaux and under-represents isolated optima. +#' `WideSample()` avoids this bias by selecting for topological spread +#' rather than frequency. +#' +#' @param trees A `multiPhylo` object. +#' @param n Integer: number of trees to retain. 
If `n >= length(trees)`, all +#' trees are returned unchanged. +#' @param method Character: subsampling strategy. +#' \describe{ +#' \item{`"maximin"`}{(Default.) Greedy furthest-point selection: start +#' from a randomly chosen tree, then iteratively add the tree whose +#' minimum distance to the already-selected set is largest. This +#' maximizes topological spread.} +#' \item{`"random"`}{Simple random sample without replacement.} +#' } +#' @param distance Either: +#' \itemize{ +#' \item A function that accepts a `multiPhylo` and returns a `dist` +#' object (default: [TreeDist::ClusteringInfoDistance()]). +#' \item A pre-computed `dist` object whose size matches `length(trees)`. +#' } +#' Ignored when `method = "random"`. +#' +#' @return A `multiPhylo` object of length `min(n, length(trees))`. +#' Attributes of the input (e.g. `score`, `hits_to_best`) are preserved. +#' +#' @examples +#' library("TreeTools") +#' trees <- as.phylo(0:99, nTip = 8) +#' sub10 <- WideSample(trees, 10) +#' length(sub10) # 10 +#' +#' # Pre-computed distances +#' dists <- TreeDist::ClusteringInfoDistance(trees) +#' sub5 <- WideSample(trees, 5, distance = dists) +#' +#' @template MRS +#' @family tree scoring +#' @importFrom TreeDist ClusteringInfoDistance +#' @export +WideSample <- function( + trees, + n, + method = c("maximin", "random"), + distance = TreeDist::ClusteringInfoDistance +) { + method <- match.arg(method) + if (!inherits(trees, "multiPhylo")) { + stop("`trees` must be a multiPhylo object") + } + n <- as.integer(n) + if (length(n) != 1L || is.na(n) || n < 0L) { + stop("`n` must be a single non-negative integer") + } + + nTrees <- length(trees) + if (n >= nTrees) { + return(trees) + } + if (n == 0L) { + return(.SubsetMultiPhylo(trees, integer(0))) + } + + idx <- switch(method, + random = sample.int(nTrees, n), + maximin = .MaximinSubsample(trees, n, distance) + ) + + .SubsetMultiPhylo(trees, sort(idx)) +} + +#' @keywords internal +.MaximinSubsample <- function(trees, n, 
distance) { + nTrees <- length(trees) + if (inherits(distance, "dist")) { + d <- as.matrix(distance) + if (nrow(d) != nTrees) { + stop( + "`distance` has ", nrow(d), " entries but `trees` has ", + nTrees, " trees" + ) + } + } else if (is.function(distance)) { + d <- as.matrix(distance(trees)) + } else { + stop("`distance` must be a function or a dist object") + } + + selected <- integer(n) + selected[1L] <- sample.int(nTrees, 1L) + + # min_dist[i] = min distance from tree i to any selected tree + min_dist <- d[, selected[1L]] + + for (k in seq_len(n - 1L) + 1L) { + # Zero out already-selected trees so they can't be picked again + min_dist[selected[seq_len(k - 1L)]] <- -Inf + selected[k] <- which.max(min_dist) + min_dist <- pmin(min_dist, d[, selected[k]]) + } + + selected +} + +#' Subset a multiPhylo preserving attributes +#' @keywords internal +.SubsetMultiPhylo <- function(trees, idx) { + saved <- attributes(trees) + result <- trees[idx] + # Restore non-standard attributes (e.g. score, hits_to_best) + standard <- c("names", "class") + for (nm in setdiff(names(saved), standard)) { + attr(result, nm) <- saved[[nm]] + } + result +} diff --git a/R/data.R b/R/data.R index 5ba9a21db..9a9670059 100644 --- a/R/data.R +++ b/R/data.R @@ -174,9 +174,8 @@ #' #' @format A single phylogenetic tree saved as an object of class \code{phylo} #' -#' @references -#' \insertRef{Congreve2016}{TreeSearch} -#' \insertRef{Congreve2016dd}{TreeSearch} +#' @references \insertCite{Congreve2016,Congreve2016dd}{TreeSearch} +#' \insertAllCited{} #' #' @examples #' data(referenceTree) diff --git a/R/data_manipulation.R b/R/data_manipulation.R index d3d4b8b4c..6b82e8ec2 100644 --- a/R/data_manipulation.R +++ b/R/data_manipulation.R @@ -1,18 +1,58 @@ +# Feasibility thresholds for MaddisonSlatkin exact computation. +# The split_count is the coefficient of x^floor(n/2) in the generating +# polynomial prod_i (1 + x + ... + x^{a_i}), capturing partition shape. 
+# Calibrated from worst-case (balanced) partition timing experiments +# using bitmask encoding (states at positions 2^(i-1)): +# k=3: n=27 (9,9,9) sc=75 0.97s safe; n=31 (11,10,10) sc=96 1.32s marginal +# k=4: n=13 (4,3,3,3) sc=50 0.36s safe; n=15 (4,4,4,3) sc=70 0.94s marginal +# k=5: n=9 (2,2,2,2,1) sc=35 0.22s safe; n=10 (2,2,2,2,2) sc=51 0.49s +.MS_SC_THRESHOLD <- c(Inf, Inf, 75L, 50L, 35L) + +.MSSplitCount <- function(state_counts) { + counts <- state_counts[state_counts > 0L] + if (!length(counts)) return(0L) + n <- sum(counts) + if (n <= 2L) return(1L) + target <- n %/% 2L + poly <- 1.0 + for (ci in counts) { + new_len <- min(length(poly) + ci, target + 1L) + new_poly <- numeric(new_len) + for (j in seq_len(new_len)) { + lo <- max(1L, j - ci) + hi <- min(j, length(poly)) + if (lo <= hi) new_poly[j] <- sum(poly[lo:hi]) + } + poly <- new_poly + } + if (target + 1L <= length(poly)) poly[target + 1L] else 0.0 +} + #' Prepare data for Profile Parsimony #' -#' Calculates profiles for each character in a dataset. Will also simplify -#' characters, with a warning, where they are too complex for the present -#' implementation of profile parsimony: +#' Calculates profiles for each character in a dataset. +#' Characters with 2 informative states (i.e. states present in more than one +#' taxon) use the exact formula of Carter _et al._ (1990). +#' Characters with 3 or more informative states use the recursive algorithm of +#' Maddison & Slatkin (1991), falling back to a Monte Carlo approximation for +#' large or complex characters. +#' +#' Characters are simplified where necessary, with a warning: #' - inapplicable tokens will be replaced with the ambiguous token #' (i.e. `-` \ifelse{html}{\out{→}}{\eqn{\rightarrow}{-->}} `?`); #' - Ambiguous tokens will be treated as fully ambiguous #' (i.e. `{02}` \ifelse{html}{\out{→}}{\eqn{\rightarrow}{-->}} `?`) -#' - Where more than two states are informative (i.e. 
unambiguously present in -#' more than one taxon), states beyond the two most informative will be -#' ignored. -#TODO can do something more complex like first two to one TS, second two to another #' #' @param dataset dataset of class \code{phyDat} +#' @param approx Character string controlling how profile information amounts +#' are computed for multi-state characters with many tips. +#' `"auto"` (default) uses the exact Maddison & Slatkin calculation when +#' feasible, falling back to a Monte Carlo approximation for large or +#' complex characters. +#' `"mc"` always uses the Monte Carlo approximation; +#' `"exact"` always uses the exact calculation (may be very slow). +#' @param n_mc Integer; number of Monte Carlo samples for the MC +#' approximation. Default 100 000. #' #' @return An object of class `phyDat`, with additional attributes. #' `PrepareDataProfile` adds the attributes: @@ -38,10 +78,11 @@ #' @author Martin R. Smith; written with reference to #' `phangorn:::prepareDataFitch()` #' @importFrom cli cli_alert cli_alert_warning +#' @importFrom fastmatch %fin% #' @family profile parsimony functions #' @encoding UTF-8 #' @export -PrepareDataProfile <- function (dataset) { +PrepareDataProfile <- function (dataset, approx = "auto", n_mc = 100000L) { if ("info.amounts" %fin% names(attributes(dataset))) { # Already prepared return(dataset) @@ -75,83 +116,67 @@ PrepareDataProfile <- function (dataset) { } if (length(ambigs) != 0L) { - # Message unnecessary until multiple informative states are supported - # message("Ambiguous tokens ", paste(at[["allLevels"]][ambigs], collapse = ", "), - # " converted to "?"") dataset[] <- lapply(dataset, function (i) { i[i %fin% ambigs] <- qmLevel i }) } + # Build pattern matrix: rows = patterns (unique characters), cols = tips + nPattern <- max(index) mataset <- matrix(unlist(dataset, recursive = FALSE, use.names = FALSE), - max(index)) + nPattern) + # Transpose to: rows = tips, cols = patterns (matching .RemoveExtraTokens) + 
mataset <- t(mataset) - .RemoveExtraTokens <- function (char, ambiguousTokens) { - unambig <- char[!char %fin% ambiguousTokens] - if (length(unambig) == 0) { - return(matrix(nrow = length(char), ncol = 0)) - } - split <- table(unambig) - ranking <- order(order(split, decreasing = TRUE)) - ignored <- ranking > 2L - if (any(split[ignored] > 1L)) { - warningMsg <- "Can handle max. 2 informative tokens. Dropping others." - if (interactive()) { - cli_alert_warning(warningMsg) # nocov - } else { - warning(warningMsg) - } - } - if (length(ambiguousTokens) == 0) { - stop("No ambiguous token available for replacement") + # --- Strip singletons --- + maxInformative <- 0L + + for (j in seq_len(ncol(mataset))) { + col <- mataset[, j] + nonAmbig <- col[col != qmLevel[1]] + if (length(nonAmbig) == 0L) next + + tab <- table(nonAmbig) + informative <- tab > 1L + nInf <- sum(informative) + + # Convert singletons to ambiguous + singletonTokens <- as.integer(names(tab[!informative])) + if (length(singletonTokens) > 0L) { + mataset[mataset[, j] %in% singletonTokens, j] <- qmLevel[1] } - tokens <- names(split) - most <- tokens[which.min(ranking)] - vapply(setdiff(names(split)[split > 1], most), function (kept) { - simplified <- char - simplified[!simplified %fin% c(most, kept)] <- ambiguousTokens[1] - simplified - }, char) + + maxInformative <- max(maxInformative, nInf) } + - decomposed <- lapply(seq_along(mataset[, 1]), function (i) - .RemoveExtraTokens(mataset[i, ], ambiguousTokens = qmLevel)) - nChar <- vapply(decomposed, dim, c(0, 0))[2, ] - if (sum(nChar) == 0) { + if (maxInformative < 2L) { cli_alert("No informative characters in `dataset`.") + # Construct empty phyDat manually (avoids [.phyDat issues with 0 columns) + dataset[] <- lapply(dataset, function(x) integer(0)) attr(dataset, "info.amounts") <- double(0) - return(dataset[0]) + attr(dataset, "weight") <- integer(0) + attr(dataset, "nr") <- 0L + attr(dataset, "index") <- integer(0) + return(dataset) } - newIndex <- 
seq_len(sum(nChar)) - oldIndex <- rep.int(seq_along(nChar), nChar) - index <- unlist(lapply(index, function (i) { - newIndex[oldIndex == i] - })) - - mataset <- unname(do.call(cbind, decomposed)) - - NON_AMBIG <- 1:2 - AMBIG <- max(NON_AMBIG) + 1L - .Recompress <- function (char, ambiguousTokens) { - tokens <- unique(char) - nonAmbig <- setdiff(tokens, ambiguousTokens) - stopifnot(length(nonAmbig) == 2L) - #available <- setdiff(seq_along(c(nonAmbig, ambiguousTokens)), ambiguousTokens) - - cipher <- seq_len(max(tokens)) - cipher[nonAmbig] <- NON_AMBIG # available[seq_along(nonAmbig)] - cipher[ambiguousTokens] <- AMBIG + + # --- Recompress: normalize tokens to 1..k, AMBIG --- + AMBIG_TOKEN <- maxInformative + 1L + + for (j in seq_len(ncol(mataset))) { + col <- mataset[, j] + nonAmbig <- sort(unique(col[col != qmLevel[1]])) - # Return: - cipher[char] - } - if (length(mataset) == 0) { - cli_alert("No informative characters in `dataset`.") - attr(dataset, "info.amounts") <- double(0) - return(dataset[0]) + newCol <- rep(AMBIG_TOKEN, length(col)) + for (i in seq_along(nonAmbig)) { + newCol[col == nonAmbig[i]] <- i + } + mataset[, j] <- newCol } - mataset <- apply(mataset, 2, .Recompress, qmLevel) + + # --- Deduplicate patterns --- dupCols <- duplicated(t(mataset)) kept <- which(!dupCols) copies <- lapply(kept, function (i) { @@ -169,13 +194,10 @@ PrepareDataProfile <- function (dataset) { mataset <- mataset[, !dupCols, drop = FALSE] dataset[] <- lapply(seq_len(length(dataset)), function (i) mataset[i, ]) - - #TODO when require R4.1: replace with - # info <- apply(mataset, 1, StepInformation, - # ambiguousTokens = c(qmLevel, inappLevel), - # simplify = FALSE) + # --- Compute StepInformation per unique pattern --- info <- lapply(seq_along(mataset[1, ]), function (i) - StepInformation(mataset[, i], ambiguousTokens = AMBIG)) + StepInformation(mataset[, i], ambiguousTokens = AMBIG_TOKEN, + approx = approx, n_mc = n_mc)) maxSteps <- max(vapply(info, @@ -199,12 +221,17 @@ 
PrepareDataProfile <- function (dataset) { attr(dataset, "nr") <- length(weight) attr(dataset, "info.amounts") <- info attr(dataset, "informative") <- colSums(info) > 0 - lvls <- c("0", "1") + + # Dynamic contrast matrix: k states + ambiguous + k <- maxInformative + lvls <- as.character(seq_len(k)) + contMatrix <- rbind(diag(k), rep(1L, k)) + dimnames(contMatrix) <- list(NULL, lvls) + attr(dataset, "levels") <- lvls attr(dataset, "allLevels") <- c(lvls, "?") - attr(dataset, "contrast") <- matrix(c(1,0,1,0,1,1), length(lvls) + 1L, length(lvls), - dimnames = list(NULL, lvls)) - attr(dataset, "nc") <- length(lvls) + attr(dataset, "contrast") <- contMatrix + attr(dataset, "nc") <- as.integer(k) if (!any(attr(dataset, "bootstrap") == "info.amounts")) { attr(dataset, "bootstrap") <- c(attr(dataset, "bootstrap"), "info.amounts") diff --git a/R/mpl_morphy_objects.R b/R/mpl_morphy_objects.R index 66dbf1aa5..d060a1559 100644 --- a/R/mpl_morphy_objects.R +++ b/R/mpl_morphy_objects.R @@ -225,10 +225,9 @@ MorphyErrorCheck <- function(action) { #' Score a tree: [`MorphyTreeLength()`] #' #' @family Morphy API functions -#' @importFrom stringi stri_paste #' @export SingleCharMorphy <- function (char, gap = "inapp") { - char <- stri_paste(c(char, ";"), collapse = "") + char <- paste0(char, ";") entries <- gregexpr("\\{[^\\{]+\\}|\\([^\\()]+\\)|[^;]", char) nTip <- length(entries[[1]]) morphyObj <- mpl_new_Morphy() diff --git a/R/pp_info_extra_step.r b/R/pp_info_extra_step.r index 39388f3b7..3d2a81ea7 100644 --- a/R/pp_info_extra_step.r +++ b/R/pp_info_extra_step.r @@ -8,9 +8,39 @@ #' _e_ extra steps, where _e_ ranges from its minimum possible value #' (i.e. number of different tokens minus one) to its maximum. #' +#' For characters with 2 informative tokens, uses the exact formula of +#' Carter _et al._ (1990) via [LogCarter1()]. 
+#' For characters with 3 or more informative tokens, uses the recursive +#' algorithm of Maddison & Slatkin (1991) via [MaddisonSlatkin()], falling +#' back to a Monte Carlo approximation for large or complex characters. +#' +#' When the Maddison & Slatkin computation would be infeasible (exponential +#' in the number of tips for a given number of tokens), behaviour depends on +#' the `approx` argument. With `"auto"` (default), the exact solver is used +#' where feasible and the Monte Carlo approximation is used otherwise. +#' With `"mc"`, the Monte Carlo approximation is always used. +#' The MC approximation computes the exact +#' minimum-steps probability analytically, uses random trees for the +#' distribution body, and bridges the gap with a log-quadratic interpolation. +#' The exact feasibility threshold depends on the partition shape +#' (balanced partitions are harder); roughly, 3-state characters +#' beyond ~27 tips, 4-state beyond ~13 tips, and 5-state beyond +#' ~9 tips trigger the approximation. +#' With `"exact"`, the full Maddison & Slatkin recursion is forced regardless +#' of cost (may be very slow for large or complex characters). +#' #' @param char Vector of tokens listing states for the character in question. #' @param ambiguousTokens Vector specifying which tokens, if any, correspond to #' the ambiguous token (`?`). +#' @param approx Character string controlling the computation method: +#' `"auto"` (default) uses exact computation when feasible, falling back to +#' Monte Carlo for large or complex characters (see Details); +#' `"mc"` always uses the Monte Carlo approximation; +#' `"exact"` forces exact computation regardless of cost (may be very slow +#' for large or complex characters). +#' @param n_mc Integer. Number of random trees used by the MC approximation. +#' Larger values improve accuracy but increase computation time. +#' Default: 100 000. 
#' #' @return `StepInformation()` returns a numeric vector detailing the amount #' of phylogenetic information (in bits) associated with the character when @@ -24,11 +54,12 @@ #' StepInformation(character) #' @template MRS #' @importFrom fastmatch %fin% -#' @importFrom stats setNames -#' @importFrom TreeTools Log2Unrooted +#' @importFrom stats setNames dnorm sd +#' @importFrom TreeTools Log2Unrooted LnUnrooted NUnrooted NUnrootedMult #' @family profile parsimony functions #' @export -StepInformation <- function (char, ambiguousTokens = c("-", "?")) { +StepInformation <- function (char, ambiguousTokens = c("-", "?"), + approx = "auto", n_mc = 100000L) { NIL <- c("0" = 0) char <- char[!char %fin% ambiguousTokens] if (length(char) == 0) { @@ -48,31 +79,230 @@ StepInformation <- function (char, ambiguousTokens = c("-", "?")) { return(setNames(0, minSteps)) } - if (length(split) > 2L) { - warning("Ignored least informative tokens where more than two informative ", - "tokens present.") - ranked <- order(order(split, decreasing = TRUE)) - split <- split[ranked < 3] + k <- length(split) + nTips <- sum(split) + + # Exact MaddisonSlatkin is only instantiated for k <= 5; larger k always + # uses MC (bitmask Fitch in mc_fitch_scores supports up to 32 states). + # For k <= 5, use partition-aware split_count to decide feasibility. 
+ infeasible <- k > 5L || (k >= 3L && + .MSSplitCount(split) > .MS_SC_THRESHOLD[k]) + + if (identical(approx, "mc") || + (infeasible && !identical(approx, "exact"))) { + return(.ApproxStepInformation(split, n_mc = n_mc, + nSingletons = nSingletons)) + } + + if (k == 2L) { + # Binary: use Carter (fast, exact) + logProfile <- vapply(seq_len(split[2]), LogCarter1, double(1), + split[1], split[2]) + # Convert log-count to log-probability + logP <- logProfile - LnUnrooted(nTips) + reducedMinSteps <- 1L + } else { + # Multi-state (3-5): use MaddisonSlatkin + nStates <- 2L^k - 1L + states <- integer(nStates) + for (i in seq_along(split)) { + states[2L^(i - 1L)] <- split[i] + } + reducedMinSteps <- k - 1L + maxSteps <- nTips - 1L + logP <- tryCatch( + MaddisonSlatkin(reducedMinSteps:maxSteps, states), + error = function(e) NULL + ) + if (is.null(logP) || anyNA(logP)) { + # Exact solver hit capacity limit or timed out; fall back to MC + return(.ApproxStepInformation(split, n_mc = n_mc, + nSingletons = nSingletons)) + } + } + + # Trim trailing -Inf entries (impossible step counts) + finite_idx <- which(is.finite(logP)) + if (length(finite_idx) == 0L) { + return(setNames(0, minSteps)) } + logP <- logP[seq_len(max(finite_idx))] + + # Cumulative information: -log2(cumsum(P)) + ret <- -.LogCumSumExp(logP) / log(2) + + # Name with total step counts (reduced steps + singleton offset) + names(ret) <- seq.int(reducedMinSteps, + reducedMinSteps + length(ret) - 1L) + nSingletons - logProfile <- vapply(seq_len(split[2]), LogCarter1, double(1), - split[1], split[2]) - ret <- setNames(Log2Unrooted(sum(split[1:2])) - - (.LogCumSumExp(logProfile) / log(2)), - seq_len(split[2]) + sum(singletons)) ret[ret < sqrt(.Machine[["double.eps"]])] <- 0 # Floating point error inevitable # Return: ret } +# MC approximation with log-quadratic tail interpolation. +# Returns a named IC vector matching the format of StepInformation(). 
+# +# @param split Integer vector of informative token frequencies (sorted +# decreasing, singletons removed). +# @param n_mc Integer. Number of Monte Carlo trees to score. +# @param nSingletons Integer. Number of singleton tokens (for step offset). +# @return Named numeric vector of IC (bits) by step count. +# @keywords internal +.ApproxStepInformation <- function(split, n_mc = 100000L, nSingletons = 0L) { + k <- length(split) + n <- sum(split) + s_min <- k - 1L + s_max <- n - 1L + + # 1. Exact P(s_min) via product-of-double-factorials formula O(k) + log_p_min <- log(NUnrootedMult(split)) - log(NUnrooted(n)) + + # 2. MC: generate and score random trees via compiled Fitch downpass. + # No R object allocation per tree; ~0.01 ms per tree. + mc_scores <- mc_fitch_scores(split, n_mc) + + mu_hat <- mean(mc_scores) + sd_hat <- sd(mc_scores) + + # 3. Tabulate MC histogram + mc_tab <- tabulate(mc_scores - s_min + 1L, nbins = s_max - s_min + 1L) + # mc_tab[i] = count at step s_min + i - 1 + + # 4. Find the MC body edge: lowest s with >= min_count hits + min_count <- 10L + body_bins <- which(mc_tab >= min_count) + + # 5. Build log-probability vector + steps <- s_min:s_max + log_p <- rep(-Inf, length(steps)) + log_p[1L] <- log_p_min # exact P(s_min) + + if (length(body_bins) >= 2L) { + s_lo_idx <- body_bins[1L] # index into mc_tab / log_p + s_lo <- s_min + s_lo_idx - 1L + + # Fill MC body: all bins from s_lo onward + for (i in s_lo_idx:length(mc_tab)) { + if (mc_tab[i] > 0L) { + log_p[i] <- log(mc_tab[i] / n_mc) + } else { + # Right tail: normal extrapolation (negligible IC contribution) + log_p[i] <- dnorm(s_min + i - 1L, mu_hat, sd_hat, log = TRUE) + } + } + + # 6. 
Log-quadratic interpolation for the gap (s_min, s_lo) + if (s_lo_idx > 2L) { + # Three anchor points: exact P(s_min), plus two lowest good MC bins + s_lo2_idx <- body_bins[2L] + x1 <- s_min + x2 <- s_lo + x3 <- s_min + s_lo2_idx - 1L + y1 <- log_p_min + y2 <- log_p[s_lo_idx] + y3 <- log_p[s_lo2_idx] + + # Solve a + b*x + c*x^2 = y for three points + qfit <- .FitLogQuadratic(x1, y1, x2, y2, x3, y3) + + # Sanity: c < 0 (concave) and monotonically increasing from s_min to s_lo + if (!is.null(qfit) && qfit[3L] < 0) { + gap_s <- seq.int(s_min + 1L, s_lo - 1L) + gap_lp <- qfit[1L] + qfit[2L] * gap_s + qfit[3L] * gap_s^2 + # Check monotonicity + if (all(diff(c(log_p_min, gap_lp, log_p[s_lo_idx])) > 0)) { + for (j in seq_along(gap_s)) { + log_p[gap_s[j] - s_min + 1L] <- gap_lp[j] + } + } else { + # Fallback: log-linear interpolation between anchor and body edge + log_p <- .FillLogLinear(log_p, log_p_min, s_lo_idx) + } + } else { + log_p <- .FillLogLinear(log_p, log_p_min, s_lo_idx) + } + } + # If s_lo_idx == 2, no gap to fill (MC body starts right next to s_min) + } else { + # MC body too sparse — fall back to normal extrapolation for everything + for (i in 2L:length(steps)) { + s <- steps[i] + cnt <- mc_tab[i] + log_p[i] <- if (cnt > 0L) { + log(cnt / n_mc) + } else { + dnorm(s, mu_hat, sd_hat, log = TRUE) + } + } + } + + # 7. Trim trailing negligible entries + finite_idx <- which(is.finite(log_p) & log_p > -700) + if (length(finite_idx) == 0L) { + return(setNames(0, s_min + nSingletons)) + } + log_p <- log_p[seq_len(max(finite_idx))] + steps <- steps[seq_len(max(finite_idx))] + + # 8. Cumulative IC + ret <- -.LogCumSumExp(log_p) / log(2) + names(ret) <- steps + nSingletons + ret[ret < sqrt(.Machine[["double.eps"]])] <- 0 + + ret +} + +# Fit log P(s) = a + b*s + c*s^2 through three points. +# Returns c(a, b, c) or NULL if the system is singular. 
+# @keywords internal
+.FitLogQuadratic <- function(x1, y1, x2, y2, x3, y3) {
+  # Solve the 3x3 system via elimination
+  # Row 2 - Row 1, Row 3 - Row 1
+  # (subtracting row 1 removes the intercept `a`, leaving a 2x2 system in b, c)
+  dx2 <- x2 - x1
+  dx3 <- x3 - x1
+  dy2 <- y2 - y1
+  dy3 <- y3 - y1
+  sx2 <- x2^2 - x1^2
+  sx3 <- x3^2 - x1^2
+
+  # Determinant of the reduced 2x2 system; (near-)zero means no unique fit.
+  det <- dx2 * sx3 - dx3 * sx2
+  if (abs(det) < 1e-12) return(NULL)
+
+  c_coef <- (dx2 * dy3 - dx3 * dy2) / det
+  b_coef <- (dy2 - c_coef * sx2) / dx2
+  # Back-substitute into the first equation to recover the intercept.
+  a_coef <- y1 - b_coef * x1 - c_coef * x1^2
+
+  c(a_coef, b_coef, c_coef)
+}
+
+# Log-linear interpolation: fill gap indices 2..(s_lo_idx - 1) in log_p.
+# log_p[1] must already be set to log_p_min; log_p[s_lo_idx] to the body edge.
+# Returns the modified log_p vector; when s_lo_idx <= 2 there is no gap and
+# log_p is returned unchanged.
+# @keywords internal
+.FillLogLinear <- function(log_p, log_p_min, s_lo_idx) {
+  # No index lies strictly between 1 and s_lo_idx.  `2L:(s_lo_idx - 1L)` would
+  # count *down* in this case and rewrite log_p[1] (and, for s_lo_idx == 1,
+  # divide by a zero gap length), so bail out before building the sequence.
+  if (s_lo_idx <= 2L) {
+    return(log_p)
+  }
+  s_lo_lp <- log_p[s_lo_idx]
+  gap_len <- s_lo_idx - 1L
+  # Slope of the straight line (in log space) from index 1 to s_lo_idx.
+  slope <- (s_lo_lp - log_p_min) / gap_len
+  gap_idx <- seq.int(2L, s_lo_idx - 1L)
+  log_p[gap_idx] <- log_p_min + slope * (gap_idx - 1L)
+  log_p
+}
+
 # Adapted from https://rpubs.com/FJRubio/LSE
+# Guard: when both x[k] and Lk[k-1] are -Inf, the difference is NaN
+# (IEEE 754: -Inf - (-Inf) = NaN), propagating silently. Keep Lk[k] = -Inf.
 .LogCumSumExp <- function (x) {
   n <- length(x)
   Lk <- c(x[1], double(n - 1L))
   for (k in 1L + seq_len(n - 1L)) {
     Lk[k] <- Lk[k - 1]
-    Lk[k] <- max(x[k], Lk[k]) + log1p(exp(-abs(x[k] - Lk[k])))
+    if (is.finite(x[k]) || is.finite(Lk[k])) {
+      Lk[k] <- max(x[k], Lk[k]) + log1p(exp(-abs(x[k] - Lk[k])))
+    }
+    # else both -Inf: Lk[k] stays -Inf (log(0 + 0) = -Inf, not NaN)
   }
 
   # Return:
@@ -81,25 +311,34 @@ StepInformation <- function (char, ambiguousTokens = c("-", "?")) {
 
 #' Number of trees with _m_ steps
 #'
-#' Calculate the number of trees in which Fitch parsimony will reconstruct
-#' _m_ steps, where _a_ leaves are labelled with one state, and _b_ leaves are
-#' labelled with a second state.
+#' Calculate the number of unrooted binary trees on which Fitch parsimony
+#' reconstructs exactly _m_ steps for a character.
+#' +#' `Carter1()` (and its logarithmic variants `Log2Carter1()`, `LogCarter1()`) +#' implement theorem 1 of \insertCite{Carter1990;textual}{TreeTools} for +#' **binary** characters, where _a_ leaves bear one state and _b_ bear the +#' other. #' -#' Implementation of theorem 1 from \insertCite{Carter1990;textual}{TreeTools} +#' `MaddisonSlatkin()` generalises this result to characters with multiple +#' states using the recursive approach of +#' \insertCite{Maddison1991;textual}{TreeSearch}. +#' It returns the **log-probability** (i.e. log of the fraction of unrooted +#' binary trees) for each requested step count. The exact solver supports +#' 2--5 character tokens; for characters with more tokens, use +#' [StepInformation()] with `approx = "mc"` or `approx = "auto"` (default), +#' which falls back to a Monte Carlo approximation automatically. #' -#' @param m Number of steps. +#' @param m,steps Number of steps. #' @param a,b Number of leaves labelled `0` and `1`. +#' @param states Integer vector giving the number of leaves bearing each +#' possible combination of states, laid out in binary fashion. +#' Entry 1 = state `1` (binary `001`), entry 2 = state `2` (binary `010`), +#' entry 3 = ambiguous state `{1,2}` (binary `011`), and so on. +#' Only observed singleton states need non-zero counts; polymorphic entries +#' are typically zero. #' -#' @references +#' @references \insertCite{Steel1993,Steel1995,Steel1996}{TreeSearch} #' \insertAllCited{} -#' -#' See also: -#' -#' \insertRef{Steel1993}{TreeSearch} -#' -#' \insertRef{Steel1995}{TreeSearch} -#' -#' (\insertRef{Steel1996}{TreeSearch}) #' @importFrom TreeTools LogDoubleFactorial #' @examples #' # The character `0 0 0 1 1 1` @@ -278,7 +517,6 @@ WithOneExtraStep <- function (...) { stop("Not implemented.") # nocov start - # TODO test splits <- 2 2 4 sum(vapply(seq_along(splits), function (omit) { backboneSplits <- splits[-omit] omitted.tips <- splits[omit] @@ -292,8 +530,6 @@ WithOneExtraStep <- function (...) 
{ backbones, attachTwoRegions, sum( - # TODO would be quicker to calculate just first half; special case: - # omitted.tips %% 2 vapply(seq_len(omitted.tips - 1), function (first.group) { # For each way of splitsting up the omitted tips, e.g. 1|16, 2|15, 3|14, etc choose(omitted.tips, first.group) * @@ -310,3 +546,13 @@ WithOneExtraStep <- function (...) { # nocov end } } + +#' Clear `MaddisonSlatkin()` cache +#' +#' Releases the internal C++ cache used by `MaddisonSlatkin()`. +#' Needed only in testing or if memory pressure is a concern. +#' +#' @name MaddisonSlatkin_clear_cache +#' @keywords internal +#' @export +NULL diff --git a/R/recode_hierarchy.R b/R/recode_hierarchy.R new file mode 100644 index 000000000..1199d6926 --- /dev/null +++ b/R/recode_hierarchy.R @@ -0,0 +1,190 @@ +#' Recode hierarchical characters as step-matrix characters +#' +#' Implements the x-transformation recoding of +#' \insertCite{Goloboff2021;textual}{TreeSearch}. +#' Each hierarchy block (one controlling primary character plus \eqn{n} +#' secondary characters) is combined into a single step-matrix character +#' with \eqn{\prod k_i + 1} states and an asymmetric cost matrix. +#' +#' @details +#' ## State encoding +#' +#' State 0 represents "primary absent". +#' States \eqn{1 \ldots \prod k_i} represent all possible combinations of +#' secondary character states (where \eqn{k_i} is the number of informative +#' states of secondary character \eqn{i}). +#' +#' ## Cost matrix +#' +#' - **Absent → present (gain):** cost = \eqn{n + 1}, where \eqn{n} is the +#' number of secondary characters. +#' - **Present → absent (loss):** cost = 1. +#' - **Present → present:** Hamming distance (number of secondaries with +#' different states). +#' +#' @param dataset A [`phyDat`][phangorn::phyDat] object. +#' @param hierarchy A [`CharacterHierarchy`] object. 
+#'
+#' @return A list with elements:
+#' \describe{
+#'   \item{`sankoff_chars`}{A list of per-block lists, each containing:
+#'     \describe{
+#'       \item{`n_states`}{Integer, number of states (absent + present combos).}
+#'       \item{`cost_matrix`}{Numeric matrix (\code{n_states × n_states}),
+#'         row-major: \code{cost_matrix[from, to]}.}
+#'       \item{`tip_states`}{Integer vector (length \code{n_tip}, 0-based).
+#'         0 = absent, 1..n_present = present combination,
+#'         -1 = fully ambiguous (all states possible),
+#'         -2 = present but unknown combination.}
+#'       \item{`forced_root_state`}{Integer: -1 (unconstrained).}
+#'       \item{`block_chars`}{Integer vector of original character indices
+#'         (1-based) belonging to this block.}
+#'     }
+#'   }
+#'   \item{`non_hierarchy_indices`}{Integer vector of original character
+#'     indices (1-based) not in any hierarchy block.}
+#' }
+#'
+#' @references
+#' \insertAllCited{}
+#' @family tree scoring
+#' @seealso [CharacterHierarchy()], [MaximizeParsimony()]
+#' @keywords internal
+#' @export
+recode_hierarchy <- function(dataset, hierarchy) {
+  validate_hierarchy(hierarchy, dataset)
+
+  char_index <- attr(dataset, "index")
+  token_set <- attr(dataset, "allLevels")
+  n_char <- length(char_index)
+  n_tip <- length(dataset)
+
+  # Token strings laid out as one row per taxon, one column per character.
+  token_rows <- lapply(dataset, function(taxon) token_set[taxon[char_index]])
+  orig_mat <- do.call(rbind, token_rows)
+
+  # Build the step-matrix description for a single hierarchy block.
+  .recode_block <- function(node) {
+    ctrl <- node$controlling
+    deps <- node$dependents
+
+    if (length(node$children) > 0L) {
+      stop("Nested hierarchies not yet supported in recode_hierarchy(). ",
+           "Block controlled by character ", ctrl, " has sub-hierarchies.")
+    }
+
+    # Per-secondary informative tokens: everything observed except "-" and "?".
+    sec_levels <- lapply(deps, function(d) {
+      observed <- unique(orig_mat[, d])
+      sort(setdiff(observed, c("-", "?")))
+    })
+    sec_nstates <- vapply(sec_levels, length, integer(1))
+    n_sec <- length(deps)
+
+    # One "absent" state plus every combination of secondary states.
+    n_present <- prod(sec_nstates)
+    n_states <- n_present + 1L
+
+    if (n_states > 32L) {
+      warning(sprintf(
+        paste0("Hierarchy block controlled by character %d produces %d states ",
+               "(> 32). Large state spaces may be slow."),
+        ctrl, n_states
+      ))
+    }
+
+    # Rows enumerate the present combinations; expand.grid varies its first
+    # column fastest.  With no secondaries there is a single, empty combination.
+    combo_grid <- if (n_sec > 0L) {
+      as.matrix(expand.grid(lapply(sec_levels, seq_along)))
+    } else {
+      matrix(integer(0), nrow = 1L, ncol = 0L)
+    }
+
+    # --- Cost matrix: row = from, column = to; state 1 is "absent" ---
+    gain_cost <- n_sec + 1L
+    cm <- matrix(0, n_states, n_states)
+    present <- seq_len(n_states)[-1L]
+    cm[1L, present] <- gain_cost  # absent → present
+    cm[present, 1L] <- 1          # present → absent
+    for (from in present) {
+      for (to in present) {
+        if (from != to) {
+          # Number of secondaries whose state differs between the two combos
+          cm[from, to] <- sum(combo_grid[from - 1L, ] != combo_grid[to - 1L, ])
+        }
+      }
+    }
+
+    # --- Tip states ---
+    # Returns the code for taxon `t`: -1 ambiguous primary, 0 absent,
+    # -2 present with an unresolved secondary, else the combination number.
+    encode_tip <- function(t) {
+      pri <- orig_mat[t, ctrl]
+      if (pri == "?") {
+        return(-1L)
+      }
+      if (pri == "0" || pri == "-") {
+        return(0L)
+      }
+      if (n_sec == 0L) {
+        return(1L)
+      }
+
+      # Position of each secondary token among that secondary's levels.
+      # "-" and "?" are never levels, so they surface here as NA.
+      sec_vals <- orig_mat[t, deps]
+      level_indices <- vapply(seq_len(n_sec), function(s) {
+        match(sec_vals[s], sec_levels[[s]])
+      }, integer(1))
+      if (anyNA(level_indices)) {
+        return(-2L)
+      }
+
+      # Mixed-radix encoding (first secondary varies fastest, as in combo_grid)
+      row_idx <- 1L
+      multiplier <- 1L
+      for (s in seq_len(n_sec)) {
+        row_idx <- row_idx + (level_indices[s] - 1L) * multiplier
+        multiplier <- multiplier * sec_nstates[s]
+      }
+      row_idx
+    }
+    tip_states <- vapply(seq_len(n_tip), encode_tip, integer(1))
+
+    list(
+      n_states = n_states,
+      cost_matrix = cm,
+      tip_states = tip_states,
+      forced_root_state = -1L,
+      block_chars = c(ctrl, deps)
+    )
+  }
+
+  # Characters outside every block are reported by index for the caller.
+  in_hierarchy <- hierarchy_chars(hierarchy)
+
+  list(
+    sankoff_chars = lapply(hierarchy, .recode_block),
+    non_hierarchy_indices = setdiff(seq_len(n_char), in_hierarchy)
+  )
+}
diff --git a/R/tree_length.R b/R/tree_length.R
index 122770c43..5c5571027 100644
--- a/R/tree_length.R
+++ b/R/tree_length.R
@@ -1,11 +1,14 @@
 #' Calculate the parsimony score of a tree given a dataset
 #'
-#' `TreeLength()` uses the Morphy library \insertCite{Brazeau2017}{TreeSearch}
-#' to calculate a parsimony score for a tree, handling inapplicable data
-#' according to the algorithm of \insertCite{Brazeau2019;textual}{TreeSearch}.
+#' `TreeLength()` calculates a parsimony score for a tree.
 #' Trees may be scored using equal weights, implied weights
 #' \insertCite{Goloboff1993}{TreeSearch}, or profile parsimony
 #' \insertCite{Faith2001}{TreeSearch}.
+#' Inapplicable characters are handled using the algorithm of
+#' \insertCite{Brazeau2019;textual}{TreeSearch} by default, or
+#' alternatively using the hierarchical scoring of
+#' \insertCite{Hopkins2021;textual}{TreeSearch} when
+#' `inapplicable = "hsj"` and a [`CharacterHierarchy`] is provided.
#' #' @param tree A tree of class `phylo`, a list thereof (optionally of class #' `multiPhylo`), or an integer -- in which case `tree` random trees will be @@ -22,25 +25,42 @@ #' TreeLength(tree, inapplicable.phyData[[1]], concavity = 10) #' TreeLength(tree, inapplicable.phyData[[1]], concavity = "profile") #' TreeLength(5, inapplicable.phyData[[1]]) +#' +#' # HSJ scoring with a character hierarchy +#' dataset6 <- inapplicable.phyData[["Vinther2008"]] +#' hier <- CharacterHierarchy("1" = 2:3) +#' tree6 <- TreeTools::BalancedTree(dataset6) +#' TreeLength(tree6, dataset6, hierarchy = hier, inapplicable = "hsj") #' @seealso -#' - Conduct tree search using [`MaximizeParsimony()`] (command line), -#' [`EasyTrees()`] (graphical user interface), or [`TreeSearch()`] -#' (custom optimality criteria). +#' - Conduct tree search using [`MaximizeParsimony()`] (command line) or +#' [`EasyTrees()`] (graphical user interface). #' #' - See score for each character: [`CharacterLength()`]. #' @family tree scoring #' #' @references #' \insertAllCited{} -#' @author Martin R. Smith (using Morphy C library, by Martin Brazeau) +#' @author Martin R. 
Smith #' @importFrom fastmatch %fin% #' @importFrom TreeTools Renumber RenumberTips TreeIsRooted #' @export -TreeLength <- function(tree, dataset, concavity = Inf) UseMethod("TreeLength") +TreeLength <- function(tree, dataset, concavity = Inf, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + hierarchy = NULL, inapplicable = "bgs", + hsj_alpha = 1.0) { + UseMethod("TreeLength") +} #' @rdname TreeLength #' @export -TreeLength.phylo <- function(tree, dataset, concavity = Inf) { +TreeLength.phylo <- function(tree, dataset, concavity = Inf, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + hierarchy = NULL, inapplicable = "bgs", + hsj_alpha = 1.0) { tipLabels <- tree[["tip.label"]] if (!TreeIsRooted(tree)) { @@ -58,11 +78,48 @@ TreeLength.phylo <- function(tree, dataset, concavity = Inf) { paste(setdiff(tipLabels, names(dataset)), collapse = ", ")) } + if (is.null(attr(dataset, "levels")) || ncol(attr(dataset, "contrast")) == 0L) { + return(0L) + } + if (nTip < length(dataset)) { dataset <- .Recompress(dataset[tree[["tip.label"]]]) } - + + # --- Validate inapplicable-handling parameters --- + inapplicable <- tolower(inapplicable) + if (inapplicable == "brazeau") inapplicable <- "bgs" + inapplicable <- match.arg(inapplicable, c("bgs", "hsj", "xform")) + useHSJ <- !is.null(hierarchy) && identical(inapplicable, "hsj") + if (inapplicable != "bgs") { + if (is.null(hierarchy)) { + stop("A `hierarchy` is required when inapplicable = \"", inapplicable, + "\". 
See ?CharacterHierarchy.") + } + if (!inherits(hierarchy, "CharacterHierarchy")) { + stop("`hierarchy` must be a CharacterHierarchy object.") + } + validate_hierarchy(hierarchy, dataset) + if (.UseProfile(concavity)) { + stop("Profile parsimony is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + if (is.finite(concavity)) { + stop("Implied weighting is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + } + useXform <- !is.null(hierarchy) && identical(inapplicable, "xform") + if (!is.numeric(hsj_alpha) || length(hsj_alpha) != 1L || + hsj_alpha < 0 || hsj_alpha > 1) { + stop("`hsj_alpha` must be a single number in [0, 1].") + } + if (is.finite(concavity)) { + if (concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") + } if (!("min.length" %fin% names(attributes(dataset)))) { dataset <- PrepareDataIW(dataset) } @@ -82,9 +139,22 @@ TreeLength.phylo <- function(tree, dataset, concavity = Inf) { " https://github.com/ms609/TreeSearch/issues/new\n\n", " See above for full tree: ", dput(tree)) } #nocov end - fit <- homoplasies / (homoplasies + concavity) + if (isTRUE(extended_iw)) { + obsCount <- .ObsCount(dataset) + nTaxa <- length(dataset) + # Goloboff (2014) Extension 3, verified against TNT 1.6: + # f = 1 + r * missing / obs (NOT r * total / obs) + f <- pmin(pmax(1 + xpiwe_r * (nTaxa - obsCount) / obsCount, 1), + xpiwe_max_f) + eff_k <- concavity / f + phi <- (1 + eff_k) / (1 + concavity) + } else { + eff_k <- concavity + phi <- 1 + } + fit <- homoplasies / (homoplasies + eff_k) # Return: - sum(fit * weight) + sum(fit * weight * phi) } else if (.UseProfile(concavity)) { dataset <- PrepareDataProfile(dataset) @@ -94,11 +164,42 @@ TreeLength.phylo <- function(tree, dataset, concavity = Inf) { # Return: sum(vapply(which(steps > 0), function(i) info[steps[i], i], double(1)) * attr(dataset, "weight")[steps > 0]) + } else if (useHSJ) { + tree <- 
RenumberTips(Renumber(tree), names(dataset)) + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + adj_weight <- non_hierarchy_weights(dataset, hierarchy) + ts_hsj_score(tree[["edge"]], contrast, tip_data, + as.integer(adj_weight), at$levels, + hierarchy_to_blocks(hierarchy), + as.double(hsj_alpha), + build_tip_labels(dataset), + 0L) + } else if (useXform) { + tree <- RenumberTips(Renumber(tree), names(dataset)) + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + adj_weight <- as.integer(non_hierarchy_weights(dataset, hierarchy)) + recoded <- recode_hierarchy(dataset, hierarchy) + xform <- .PrepareXformArgs(recoded, length(dataset)) + fitch_part <- ts_fitch_score(tree[["edge"]], contrast, tip_data, + adj_weight, at$levels) + res <- ts_sankoff_test(tree[["edge"]], xform$n_states, + xform$cost_matrices, xform$tip_states, + xform$forced_root) + fitch_part + res$score } else { tree <- RenumberTips(Renumber(tree), names(dataset)) - morphyObj <- PhyDat2Morphy(dataset) - on.exit(morphyObj <- UnloadMorphy(morphyObj)) - MorphyTreeLength(tree, morphyObj) + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + ts_fitch_score(tree[["edge"]], contrast, tip_data, + at$weight, at$levels) } } @@ -106,19 +207,64 @@ TreeLength.phylo <- function(tree, dataset, concavity = Inf) { #' @rdname TreeLength #' @importFrom TreeTools RandomTree #' @export -#TODO could be cleverer still and allow TreeLength.edge -TreeLength.numeric <- function(tree, dataset, concavity = Inf) { +TreeLength.numeric <- function(tree, dataset, concavity = Inf, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + hierarchy = NULL, inapplicable = "bgs", + hsj_alpha = 1.0) { TreeLength(lapply(!logical(tree), 
RandomTree, tips = dataset), - dataset = dataset, concavity = concavity) + dataset = dataset, concavity = concavity, + extended_iw = extended_iw, + xpiwe_r = xpiwe_r, xpiwe_max_f = xpiwe_max_f, + hierarchy = hierarchy, inapplicable = inapplicable, + hsj_alpha = hsj_alpha) } #' @rdname TreeLength #' @export -TreeLength.list <- function(tree, dataset, concavity = Inf) { - # Define constants +TreeLength.list <- function(tree, dataset, concavity = Inf, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + hierarchy = NULL, inapplicable = "bgs", + hsj_alpha = 1.0) { iw <- is.finite(concavity) - profile <- .UseProfile(concavity) - + useProfile <- .UseProfile(concavity) + + # --- Validate inapplicable-handling parameters --- + inapplicable <- tolower(inapplicable) + if (inapplicable == "brazeau") inapplicable <- "bgs" + inapplicable <- match.arg(inapplicable, c("bgs", "hsj", "xform")) + useHSJ <- !is.null(hierarchy) && identical(inapplicable, "hsj") + if (inapplicable != "bgs") { + if (is.null(hierarchy)) { + stop("A `hierarchy` is required when inapplicable = \"", inapplicable, + "\". 
See ?CharacterHierarchy.") + } + if (!inherits(hierarchy, "CharacterHierarchy")) { + stop("`hierarchy` must be a CharacterHierarchy object.") + } + validate_hierarchy(hierarchy, dataset) + if (useProfile) { + stop("Profile parsimony is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + if (iw) { + stop("Implied weighting is not currently supported with inapplicable = \"", + inapplicable, "\".") + } + } + useXform <- !is.null(hierarchy) && identical(inapplicable, "xform") + if (!is.numeric(hsj_alpha) || length(hsj_alpha) != 1L || + hsj_alpha < 0 || hsj_alpha > 1) { + stop("`hsj_alpha` must be a single number in [0, 1].") + } + if (iw && concavity <= 0) { + stop("`concavity` must be positive (or Inf for equal weights, ", + "or \"profile\" for profile parsimony).") + } + nTip <- NTip(tree) if (length(unique(nTip)) > 1L) { stop("All trees must bear the same leaves.") @@ -127,9 +273,8 @@ TreeLength.list <- function(tree, dataset, concavity = Inf) { if (nTip < length(dataset)) { dataset <- .Recompress(dataset[TipLabels(tree[[1]])]) } - + tree[] <- RenumberTips(tree, dataset) - tree <- Preorder(tree) tree[] <- lapply(tree, function(tr) { if (TreeIsRooted(tr)) { tr @@ -138,55 +283,73 @@ TreeLength.list <- function(tree, dataset, concavity = Inf) { RootTree(tr, 1) } }) - + nEdge <- unique(vapply(tree, function(tr) dim(tr[["edge"]])[1], integer(1))) if (length(nEdge) > 1L) { stop("Trees have different numbers of edges (", - paste0(nEdge, collapse = ", "), + paste0(nEdge, collapse = ", "), "); try collapsing polytomies?)") } - - edges <- vapply(tree, `[[`, tree[[1]][["edge"]], "edge") - - # Initialize data - if (profile) { - dataset <- PrepareDataProfile(dataset) - profiles <- attr(dataset, "info.amounts") + + if (is.null(attr(dataset, "levels")) || ncol(attr(dataset, "contrast")) == 0L) { + return(rep(0L, length(tree))) } - if (iw || profile) { - at <- attributes(dataset) - characters <- PhyToString(dataset, ps = "", useIndex = FALSE, - byTaxon = 
FALSE, concatenate = FALSE) - weight <- at[["weight"]] - informative <- at[["informative"]] - charSeq <- seq_along(characters) - 1L - - # Save time by dropping uninformative characters - if (!is.null(informative)) { - charSeq <- charSeq[informative] + + # Prepare dataset for C++ engine + if (useProfile) { + dataset <- PrepareDataProfile(dataset) + } else if (iw) { + if (!("min.length" %fin% names(attributes(dataset)))) { + dataset <- PrepareDataIW(dataset) } - morphyObjects <- lapply(characters, SingleCharMorphy) - on.exit(morphyObjects <- vapply(morphyObjects, UnloadMorphy, integer(1)), - add = TRUE) - } else { - morphyObj <- PhyDat2Morphy(dataset) - on.exit(morphyObj <- UnloadMorphy(morphyObj), add = TRUE) - weight <- unlist(MorphyWeights(morphyObj)[1, ]) # exact == approx } - - # Return: - if (iw) { - minLength <- at[["min.length"]] - if (is.null(minLength)) { - minLength <- attr(PrepareDataIW(dataset), "min.length") - } - apply(edges, 3, morphy_iw, morphyObjects, weight, minLength, charSeq, - concavity, Inf) - } else if (profile) { - apply(edges, 3, morphy_profile, morphyObjects, weight, charSeq, profiles, - Inf) + + at <- attributes(dataset) + contrast <- at$contrast + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + weight <- at$weight + levels <- at$levels + + min_steps <- if (iw) as.integer(at[["min.length"]]) else integer(0) + concavity_val <- if (iw) concavity else Inf + infoAmounts <- if (useProfile) at$info.amounts else NULL + + # XPIWE: per-pattern observed-taxa counts + useXpiwe <- isTRUE(extended_iw) && iw && !useProfile + obsCount <- if (useXpiwe) .ObsCount(dataset) else integer(0) + + if (useHSJ) { + adj_weight <- as.integer(non_hierarchy_weights(dataset, hierarchy)) + blocks <- hierarchy_to_blocks(hierarchy) + alpha <- as.double(hsj_alpha) + tip_labels <- build_tip_labels(dataset) + vapply(tree, function(tr) { + ts_hsj_score(tr[["edge"]], contrast, tip_data, adj_weight, levels, + blocks, alpha, 
tip_labels, 0L) + }, double(1)) + } else if (useXform) { + adj_weight <- as.integer(non_hierarchy_weights(dataset, hierarchy)) + recoded <- recode_hierarchy(dataset, hierarchy) + xform <- .PrepareXformArgs(recoded, length(dataset)) + vapply(tree, function(tr) { + fitch_part <- ts_fitch_score(tr[["edge"]], contrast, tip_data, + adj_weight, levels) + res <- ts_sankoff_test(tr[["edge"]], xform$n_states, + xform$cost_matrices, xform$tip_states, + xform$forced_root) + fitch_part + res$score + }, double(1)) } else { - apply(edges, 3, preorder_morphy, morphyObj) + vapply(tree, function(tr) { + ts_fitch_score(tr[["edge"]], contrast, tip_data, weight, levels, + min_steps = min_steps, concavity = concavity_val, + infoAmounts = infoAmounts, + xpiwe = useXpiwe, + xpiwe_r = as.double(xpiwe_r), + xpiwe_max_f = as.double(xpiwe_max_f), + obs_count = obsCount) + }, double(1)) } } @@ -196,7 +359,27 @@ TreeLength.list <- function(tree, dataset, concavity = Inf) { TreeLength.multiPhylo <- TreeLength.list #' @export -TreeLength.NULL <- function(tree, dataset, concavity = Inf) NULL +TreeLength.NULL <- function(tree, dataset, concavity = Inf, + extended_iw = TRUE, + xpiwe_r = 0.5, + xpiwe_max_f = 5, + hierarchy = NULL, inapplicable = "bgs", + hsj_alpha = 1.0) NULL + +# Pack recode_hierarchy() output into the format ts_sankoff_test() expects. 
+.PrepareXformArgs <- function(recoded, n_tip) {
+  # `recoded$sankoff_chars` holds one entry per recoded block; each field is
+  # gathered across blocks into the parallel structures returned below.
+  blocks <- recoded$sankoff_chars
+
+  # Per-block scalars, coerced to integer vectors.
+  state_counts <- vapply(blocks, function(blk) blk$n_states, numeric(1))
+  root_states <- vapply(blocks, function(blk) blk$forced_root_state, numeric(1))
+
+  # Tip codes: one row per tip, one column per block.
+  tip_codes <- matrix(0L, nrow = n_tip, ncol = length(blocks))
+  for (col in seq_along(blocks)) {
+    tip_codes[, col] <- blocks[[col]]$tip_states
+  }
+
+  list(
+    n_states = as.integer(state_counts),
+    cost_matrices = lapply(blocks, function(blk) blk$cost_matrix),
+    tip_states = tip_codes,
+    forced_root = as.integer(root_states)
+  )
+}
 
 #' @rdname TreeLength
 #' @export
@@ -308,43 +491,14 @@ FitchSteps <- function(tree, dataset) {
 
 #' @describeIn CharacterLength Do not perform checks. Use with care: may cause
 #' erroneous results or software crash if variables are in the incorrect format.
-#' @importFrom fastmatch fmatch
-#' @importFrom TreeTools Postorder
 FastCharacterLength <- function(tree, dataset) {
-  nTip <- NTip(tree)
-  levels <- attr(dataset, "levels")
-  morphyObj <- PhyDat2Morphy(dataset, weight = 0)
-  on.exit(morphyObj <- UnloadMorphy(morphyObj))
-
-  maxNode <- nTip + mpl_get_num_internal_nodes(morphyObj)
-  rootNode <- nTip + 1L
-  allNodes <- rootNode:maxNode
-
-  edge <- Postorder(tree)[["edge"]]
-  parent <- edge[, 1]
-  child <- edge[, 2]
-
-  parentOf <- parent[fmatch(seq_len(maxNode), child)]
-  parentOf[rootNode] <- rootNode # Root node's parent is a dummy node
-  leftChild <- child[length(parent) + 1L - fmatch(allNodes, rev(parent))]
-  rightChild <- child[fmatch(allNodes, parent)]
-
-  if (nTip < 1L) {
-    # Run this test after we're sure that morphyObj is a morphyPtr, or lazy
-    # evaluation of nTaxa will cause a crash.
-    stop("Error: ", mpl_translate_error(nTip))
+  at <- attributes(dataset)
+  if (is.null(at$levels) || ncol(at$contrast) == 0L) {
+    return(rep(0L, at$nr))
   }
-
-  vapply(seq_len(attr(dataset, "nr")), function(i) {
-    MorphyErrorCheck(mpl_set_charac_weight(i, 1, morphyObj))
-    on.exit(MorphyErrorCheck(mpl_set_charac_weight(i, 0, morphyObj)))
-    MorphyErrorCheck(mpl_apply_tipdata(morphyObj))
-
-    # Return:
-    .Call(`MORPHYLENGTH`, as.integer(parentOf - 1L),
-          as.integer(leftChild - 1L), as.integer(rightChild - 1L),
-          morphyObj)
-  }, integer(1))
+  tip_data <- matrix(unlist(dataset, use.names = FALSE),
+                     nrow = length(dataset), byrow = TRUE)
+  ts_char_steps(tree[["edge"]], at$contrast, tip_data, at$weight, at$levels)
 }
 
 #' Calculate parsimony score from Morphy object
diff --git a/R/ts-driven-compat.R b/R/ts-driven-compat.R
new file mode 100644
index 000000000..a542fcf80
--- /dev/null
+++ b/R/ts-driven-compat.R
@@ -0,0 +1,210 @@
+# Backward-compatible wrapper for ts_driven_search.
+#
+# Accepts the old flat-argument calling convention used by tests and
+# packs them into the grouped lists expected by ts_driven_search().
+# Production code (MaximizeParsimony, .ResampleHierarchy) calls
+# ts_driven_search() directly with pre-built grouped lists.
+ts_driven_search <- function(
+    contrast,
+    tip_data,
+    weight,
+    levels,
+    # --- New grouped-list interface (used when calling with grouped args) ---
+    searchControl = NULL,
+    runtimeConfig = NULL,
+    scoringConfig = NULL,
+    constraintConfig = NULL,
+    hsjConfig = NULL,
+    xformConfig = NULL,
+    # --- Old flat-argument interface (used by tests) ---
+    maxReplicates = 100L,
+    targetHits = 10L,
+    tbrMaxHits = 1L,
+    ratchetCycles = 10L,
+    ratchetPerturbProb = 0.04,
+    ratchetPerturbMode = 0L,
+    ratchetPerturbMaxMoves = 0L,
+    ratchetAdaptive = FALSE,
+    ratchetTaper = FALSE,
+    driftCycles = 6L,
+    driftAfdLimit = 3L,
+    driftRfdLimit = 0.1,
+    xssRounds = 3L,
+    xssPartitions = 4L,
+    rssRounds = 1L,
+    cssRounds = 1L,
+    cssPartitions = 4L,
+    sectorMinSize = 6L,
+    sectorMaxSize = 50L,
+    postRatchetSectorial = FALSE,
+    fuseInterval = 3L,
+    fuseAcceptEqual = FALSE,
+    poolMaxSize = 100L,
+    poolSuboptimal = 0.0,
+    maxSeconds = 0.0,
+    verbosity = 0L,
+    min_steps = integer(0),
+    concavity = -1.0,
+    consSplitMatrix = NULL,
+    consContrast = NULL,
+    consTipData = NULL,
+    consWeight = NULL,
+    consLevels = NULL,
+    consExpectedScore = 0L,
+    infoAmounts = NULL,
+    tabuSize = 100L,
+    wagnerStarts = 1L,
+    progressCallback = NULL,
+    nThreads = 1L,
+    startEdge = NULL,
+    sprFirst = FALSE,
+    nniFirst = TRUE,
+    hierarchyBlocks = NULL,
+    hsjTipLabels = NULL,
+    hsjAlpha = 1.0,
+    hsjAbsentState = 0L,
+    xformChars = NULL,
+    xpiwe = FALSE,
+    xpiwe_r = 0.5,
+    xpiwe_max_f = 5.0,
+    obs_count = integer(0),
+    consensusStableReps = 0L,
+    perturbStopFactor = 2L,
+    adaptiveLevel = FALSE,
+    consensusConstrain = FALSE,
+    nniPerturbCycles = 0L,
+    nniPerturbFraction = 0.5,
+    wagnerBias = 0L,
+    wagnerBiasTemp = 0.3,
+    outerCycles = 1L,
+    maxOuterResets = 0L,
+    adaptiveStart = FALSE,
+    enumTimeFraction = 0.1,
+    pruneReinsertCycles = 0L,
+    pruneReinsertDrop = 0.10,
+    pruneReinsertSelection = 0L,
+    annealConfig = NULL)
+{
+  # Single hand-off point to the compiled routine: the four data arguments
+  # followed by the six grouped configuration objects, in that order.
+  run_native <- function(control, runtime, scoring, constraint, hsj, xform) {
+    .Call(`_TreeSearch_ts_driven_search`,
+      contrast, tip_data, weight, levels,
+      control, runtime, scoring,
+      constraint, hsj, xform
+    )
+  }
+
+  # Grouped interface: a non-NULL `searchControl` means the caller has already
+  # built the grouped lists, so they are forwarded untouched and every flat
+  # argument is ignored.
+  if (!is.null(searchControl)) {
+    return(run_native(searchControl, runtimeConfig, scoringConfig,
+                      constraintConfig, hsjConfig, xformConfig))
+  }
+
+  # Flat interface: coerce each flat argument and collect it into its group.
+  control <- SearchControl(
+    tbrMaxHits = as.integer(tbrMaxHits),
+    nniFirst = as.logical(nniFirst),
+    sprFirst = as.logical(sprFirst),
+    tabuSize = as.integer(tabuSize),
+    wagnerStarts = as.integer(wagnerStarts),
+    wagnerBias = as.integer(wagnerBias),
+    wagnerBiasTemp = as.double(wagnerBiasTemp),
+    outerCycles = as.integer(outerCycles),
+    maxOuterResets = as.integer(maxOuterResets),
+    ratchetCycles = as.integer(ratchetCycles),
+    ratchetPerturbProb = as.double(ratchetPerturbProb),
+    ratchetPerturbMode = as.integer(ratchetPerturbMode),
+    ratchetPerturbMaxMoves = as.integer(ratchetPerturbMaxMoves),
+    ratchetAdaptive = as.logical(ratchetAdaptive),
+    ratchetTaper = as.logical(ratchetTaper),
+    nniPerturbCycles = as.integer(nniPerturbCycles),
+    nniPerturbFraction = as.double(nniPerturbFraction),
+    driftCycles = as.integer(driftCycles),
+    driftAfdLimit = as.integer(driftAfdLimit),
+    driftRfdLimit = as.double(driftRfdLimit),
+    xssRounds = as.integer(xssRounds),
+    xssPartitions = as.integer(xssPartitions),
+    rssRounds = as.integer(rssRounds),
+    cssRounds = as.integer(cssRounds),
+    cssPartitions = as.integer(cssPartitions),
+    sectorMinSize = as.integer(sectorMinSize),
+    sectorMaxSize = as.integer(sectorMaxSize),
+    postRatchetSectorial = as.logical(postRatchetSectorial),
+    fuseInterval = as.integer(fuseInterval),
+    fuseAcceptEqual = as.logical(fuseAcceptEqual),
+    poolMaxSize = as.integer(poolMaxSize),
+    poolSuboptimal = as.double(poolSuboptimal),
+    consensusStableReps = as.integer(consensusStableReps),
+    perturbStopFactor = as.integer(perturbStopFactor),
+    adaptiveLevel = as.logical(adaptiveLevel),
+    consensusConstrain = as.logical(consensusConstrain),
+    pruneReinsertCycles = as.integer(pruneReinsertCycles),
+    pruneReinsertDrop = as.double(pruneReinsertDrop),
+    pruneReinsertSelection = as.integer(pruneReinsertSelection),
+    adaptiveStart = as.logical(adaptiveStart),
+    enumTimeFraction = as.double(enumTimeFraction)
+  )
+
+  # Annealing settings ride along on the search-control object when supplied.
+  # Explicit NULL checks are used instead of %||% for R < 4.4 compatibility.
+  if (!is.null(annealConfig)) {
+    fallback <- function(value, default) {
+      if (is.null(value)) default else value
+    }
+    phases <- as.integer(fallback(annealConfig$phases, 5L))
+    # Backward compat: when cycles is unspecified it defaults to 1 if there
+    # are any phases, otherwise 0.  Only computed when actually needed.
+    cycles <- annealConfig$cycles
+    if (is.null(cycles)) {
+      cycles <- if (phases > 0L) 1L else 0L
+    }
+    control$annealCycles <- as.integer(cycles)
+    control$annealPhases <- phases
+    control$annealTStart <- as.double(fallback(annealConfig$tStart, 20))
+    control$annealTEnd <- as.double(fallback(annealConfig$tEnd, 0))
+    control$annealMovesPerPhase <-
+      as.integer(fallback(annealConfig$movesPerPhase, 0L))
+  }
+
+  runtime <- list(
+    maxReplicates = as.integer(maxReplicates),
+    targetHits = as.integer(targetHits),
+    maxSeconds = as.double(maxSeconds),
+    verbosity = as.integer(verbosity),
+    nThreads = as.integer(nThreads),
+    startEdge = startEdge,
+    progressCallback = progressCallback
+  )
+
+  scoring <- list(
+    min_steps = min_steps,
+    concavity = as.double(concavity),
+    xpiwe = as.logical(xpiwe),
+    xpiwe_r = as.double(xpiwe_r),
+    xpiwe_max_f = as.double(xpiwe_max_f),
+    obs_count = obs_count,
+    infoAmounts = infoAmounts
+  )
+
+  # Each optional group stays NULL unless its key argument was supplied.
+  constraint <- if (is.null(consSplitMatrix)) NULL else list(
+    consSplitMatrix = consSplitMatrix,
+    consContrast = consContrast,
+    consTipData = consTipData,
+    consWeight = consWeight,
+    consLevels = consLevels,
+    consExpectedScore = as.integer(consExpectedScore)
+  )
+
+  hsj <- if (is.null(hierarchyBlocks)) NULL else list(
+    hierarchyBlocks = hierarchyBlocks,
+    hsjTipLabels = hsjTipLabels,
+    hsjAlpha = as.double(hsjAlpha),
+    hsjAbsentState = as.integer(hsjAbsentState)
+  )
+
+  xform <- if (is.null(xformChars)) NULL else list(xformChars = xformChars)
+
+  run_native(control, runtime, scoring, constraint, hsj, xform)
+}
diff --git a/README.md b/README.md
index 99de30875..0782dd51f 100644
--- a/README.md
+++ b/README.md
@@ -16,14 +16,16 @@ visualization, (Smith 2022b), and cluster consensus trees.
 
 
 
-Inapplicable character states are handled using the algorithm of Brazeau,
-Guillerme and Smith (2019) using the "Morphy" C library (Brazeau _et al_. 2017).
+Tree search uses a compiled C++ engine combining TBR rearrangement, the
+parsimony ratchet, tree drifting, sectorial search, and tree fusing.
+Inapplicable character states are handled using the algorithm of
+Brazeau, Guillerme and Smith (2019).
 Implied weighting (Goloboff, 1993),
-Profile Parsimony (Faith and Trueman, 2001)
-and Successive Approximations (Farris, 1969)
-are implemented;
+Profile Parsimony (Faith and Trueman, 2001),
+Successive Approximations (Farris, 1969),
+and topological constraints are supported natively;
 [custom optimality criteria](https://ms609.github.io/TreeSearch/articles/custom.html)
-and search approaches can also be defined.
+can also be defined.
 
 # Installing in R
 
@@ -72,6 +74,8 @@ type `choco install ffmpeg`; then restart your computer.
 
 Launch a graphical user interface by typing `TreeSearch::EasyTrees()` in the R console.
 For more control over search settings, see [`?MaximizeParsimony()`](https://ms609.github.io/TreeSearch/reference/MaximizeParsimony.html).
+`MaximizeParsimony()` supports equal weights, implied weights, profile parsimony, and topological constraints natively in C++.
+For fine-grained control over the R-level search loop, see [`?Morphy()`](https://ms609.github.io/TreeSearch/reference/Morphy.html).
![Flow charts listing common actions facilitated by TreeSearch](man/figures/Flow.svg) diff --git a/agent-A.md b/agent-A.md new file mode 100644 index 000000000..68a2f29e2 --- /dev/null +++ b/agent-A.md @@ -0,0 +1,110 @@ +# Agent A Progress Log + +## Current Task +**IDLE** — S-PROF round 6 complete, T-204 fix dispatched. + +## S-PROF Round 6 + T-204 fix + T-266 tidy (2026-03-27 ~10:50 GMT) + +### T-266 tidy-up +- Deleted from to-do.md; added to completed-tasks.md (2026-03-27 section). +- Removed feature/prune-reinsert branch (local + origin). +- TS-PruneRI directory orphaned (git metadata already gone); manual cleanup needed. + +### T-204 fix (GHA 23641482723 → 23643078732) +- GHA 23641482723 failed: spelling ERROR ('cleanup'/'phyDat') + deprecated-fn + warnings in examples. Root: T-204 added `.Deprecated()` to `PhyDat2Morphy`/ + `UnloadMorphy`; examples calling those (PhyDat2Morphy.Rd, MorphyWeights.Rd, + GapHandler.Rd, SingleCharMorphy.Rd, Morphy.R constraint example) now emit + warnings in `R CMD check`. +- Fix (eb21c588 on feature/native-search): WORDLIST + `\donttest{}`/ + `suppressWarnings()` wrappers. Re-dispatched as GHA 23643078732. + +### S-PROF Round 6: thorough-preset phase distribution (Zhu2013, 75t) +Built cpp-search HEAD (post T-261+T-262+T-263). Ran MaximizeParsimony with +verbosity=2 to capture phase timings for thorough preset. + +**Phase distribution (3 reps, ~11.2s/rep):** +| Ratchet 46.3% | NNI-perturb 34.3% | RSS 7.4% | CSS 4.4% | XSS 3.2% | TBR 3.2% | + +**Key finding:** NNI-perturb = 34% of time with 14% hit rate (1 step/hit). +TBR is negligible (3%), confirming T-261+T-262+T-263 effectiveness. +Filed T-274 (P2): benchmark nniPerturbCycles=0 vs 5 at thorough-preset scale. + +## T-204 + T-266 fixes (2026-03-27 ~10:15 GMT) +- T-204 (PR #216): GHA 23495097795 was a timing issue — docs commit `f59a193c` landed after the run was dispatched. Current HEAD (11622e90) has correct Rd files. Re-dispatched as 23641482723. 
+- T-266 (PR #235): Standard CI (R-CMD-check + gcc-ASAN) failed after PR opened. R CMD check failure: spelling ERROR — 'warmup' (from T-270 vignette) and 'config' not in WORDLIST for R 4.1 hunspell. Fixed in `de9e5210` (TS-PruneRI). Re-dispatched agent-check as 23641870390. gcc-ASAN/devel failures are infrastructure (rlang compile error), not package issues. + +## S-PR + S-RED focus 1 (2026-03-27 ~10:00 GMT) +- S-PR: Updated T-204 to-do entry with GHA run 23495097795 failure details — undocumented `CleanNativeData`/`NativeBootstrap`/`NativeLength`/`PrepareNativeData`, codoc mismatches in `Jackknife.Rd`/`Ratchet.Rd`/`TreeSearch.Rd`. B needs to regenerate Rd files and add roxygen2 docs. +- S-RED focus 1: Reviewed ts_fitch.h/.cpp, ts_fitch_na.h, ts_fitch_na_incr.h, ts_simd.h. Focus on commits since 2026-03-19 (AVX2 dispatch, FlatBlock flat indirect, XFORM integration). No bugs found. AVX2 ops bit-identical to scalar; flat functions infrastructure only; XFORM no double-count (weight=0 removes hierarchy chars from Fitch blocks); incremental downpass/uppass stopping conditions correct. + +## S-RED Focus 10 + S-PR + T-270 (2026-03-27 ~09:30–10:00 GMT) +- S-RED focus 10: reviewed ts_fitch.cpp IW/Profile paths. BUG FIXED: precompute_profile_delta old_cost=0 when s>info_max_steps. 15 tests pass. commit 7cff7870. +- S-PR: merged cpp-search into #235 (prune-reinsert, clean) and #216 (native-search, clean). #213 (cid-consensus) has ts_tbr.cpp conflict (CID vs T-263 snapshot) — aborted, needs E/human. Closed stale PR #178 (T-272 done). + +## S-COORD Round 31 + T-270 (2026-03-27 ~09:20 GMT) +- T-266 PR #235 opened (GHA passed). +- T-150 GHA 23636944848 FAILED — InfoConsensus.Rd codoc mismatch. Updated T-150 row in to-do.md. +- Filed T-270 (vignette docs for T-257), T-272 (close PR #178). +- Completed T-270: updated vignettes/search-algorithm.Rmd (new pipeline step 5a, post-ratchet sectorial subsection, fixed stale consensusStableReps docs); updated AGENTS.md pipeline. 
commit d8f3c769. +- u.005.claimed-F: skipped (claimed by F). + + +### Session: 2026-03-27 + +Implemented taxon pruning-reinsertion (T-266): a perturbation strategy that +drops ~10% of leaves, TBR-optimizes the reduced backbone, then greedily +re-adds the dropped taxa via Wagner insertion + TBR polish. Complements the +ratchet (weight-space) and NNI-perturbation (topology-space). + +**Commit:** `afbf531f` on `feature/prune-reinsert` + +**Files added:** +- `src/ts_prune_reinsert.h/.cpp` — core algorithm (random + instability-weighted tip selection) +- `tests/testthat/test-ts-prune-reinsert.R` — 44 assertions (Tier 2) + +**Files modified:** +- `src/ts_driven.h/.cpp` — pipeline phase 5c, timing, outer-cycle division +- `src/ts_wagner.h/.cpp` — exposed 3 helpers for reuse +- `src/ts_rcpp.cpp` — param unpacking + timing output +- `R/SearchControl.R` — 3 new params (pruneReinsertCycles/Drop/Selection) +- `R/ts-driven-compat.R` — backward-compat wrapper + +**Local validation:** Build clean, 44/44 prune-reinsert tests pass, +234 related tests (driven/nni-perturb/wagner) pass with no regressions. + +**GHA runs:** +- Run 23634563604: FAIL — `INT_MAX` undeclared on Linux/ARM (missing `<climits>`) +- Run 23635469688: FAIL — Codoc mismatch (SearchControl.Rd not regenerated) +- Run 23636145497: PASS — PR #235 opened to cpp-search + +--- + +## Session: 2026-03-26 — S-RED focus 9 review + +### Completed: S-RED standing task — focus area 9 (Wagner & addition trees) + +Reviewed `ts_wagner.h/.cpp` (595 lines) and `ts_constraint.h/.cpp` (736+144 lines). 
+ +**Key findings:** +- No bugs found in Wagner tree construction (incremental scoring, constraint mapping, 3-taxon base case, biased addition, random constrained tree) +- Latent stale-reference issue in `impose_one_pass()` (best_node relocated when move_out_root is a direct child) — negligible severity, mitigated by retry loops and TBR enforcement +- `regraft_violates_constraint()` DFS timestamp logic verified correct +- `classify_clip_constraints()` bit masking and FORBIDDEN classification correct +- 902 constraint-related tests pass; 80/80 adversarial tests pass + +No new bugs filed. + +### Earlier: T-242 investigation (closed) +Confirmed T-242 was a display bug (ThreadSafePool::extract_into() resetting hits_to_best), not a search quality regression. Actual IW hit rate ~60-67%. + +## Session: 2026-03-25 (evening) — Summary + +### Completed: T-208 + T-211 → PR #229 + +Implemented `random_constrained_tree()` and fixed three `impose_constraint()` +bugs on `feature/random-constrained-tree` (worktree `TS-RCT`). + +**GHA run 23557186264:** 0 FAIL, 10927 PASS on both Ubuntu and Windows. + +**PR #229** created to cpp-search. diff --git a/agent-B.md b/agent-B.md new file mode 100644 index 000000000..e625afc10 --- /dev/null +++ b/agent-B.md @@ -0,0 +1,14 @@ +# Agent B Progress Log + +## Current Task +**IDLE** — T-277 PR #236 open, awaiting human review/merge. 
+ +### Completed this session +- T-275: Prune-reinsert guard for non-EW scoring modes (committed to cpp-search ded9897a) +- T-277: ScoreSpectrum() Chao1 landscape coverage estimator (feature/score-spectrum, PR #236) +- T-230: Gate replicate-count warning behind verbosity > 0 +- T-235: full_rescore after rejected SPR regraft (stale state fix) +- T-226: Remove "Trees in sequence" connect option (meaningless under C++ search) +- S-COORD round 18 +- PR #225 (reduce-imports) merged +- PR #202 closed diff --git a/agent-D.md b/agent-D.md new file mode 100644 index 000000000..27a124f2b --- /dev/null +++ b/agent-D.md @@ -0,0 +1,19 @@ +# Agent D Progress Log + +## Current Task +**Status:** IDLE +**Last completed:** 2026-03-25 + +### Completed this session +- Fixed test-ts-rep-warning.R: verbosity=0L -> 1L in two expect_warning tests (T-230 compat) +- S-RED focus 4 (Parallelism & RNG): Found and fixed consensus stability bug in parallel path (idle polls increment unchanged counter → premature termination) +- Previous session: T-241 (cluster label), T-239 (edge highlighting), T-187 (perturbation-count stopping, PR #226) + +### Pending GHA results +- T-232: needs re-dispatch (original GHA failed on pre-existing test issue, now fixed) +- T-240: GHA 23544604214 +- T-241: GHA 23545261957 +- T-239: GHA 23545538742 +- T-187: GHA 23546574279 (PR #226) +- cpp-search test fix + consensus stability fix: GHA 23546958311 +EOF 2>&1 diff --git a/agent-F.md b/agent-F.md new file mode 100644 index 000000000..d0ed1d7f0 --- /dev/null +++ b/agent-F.md @@ -0,0 +1,18 @@ +# Agent F — Progress Log + +## Current State + +- **Status:** IDLE +- **Date:** 2026-03-29 ~14:30 GMT + +### F-030 — COMPLETE (PR #239, merged) + +TBR clip-ordering: Phase 2 full propagation + documentation fix. +Branch: feature/weighted-clip-order (deleted). Worktree: TS-WeightClip (deregistered; directory pending manual deletion after RStudio session closes). 
+ +### Previous: T-245 — COMPLETE (PR #238, merged 2026-03-28) + +TBR 4-wide candidate batching. + +**Hamilton benchmark:** pending (feature/tbr-batch vs cpp-search, +mbank_X30754 + syab07205_206t, 60s/120s, 10 seeds, EW). diff --git a/agent-c.md b/agent-c.md new file mode 100644 index 000000000..3552d8664 --- /dev/null +++ b/agent-c.md @@ -0,0 +1,25 @@ +# Agent C Progress Log + +## Current Task: T-214 +**Status:** PARKED (C, GHA 23536512228) +**Description:** [Bug] Multi-split constraints not enforced during TBR search. +**Started:** 2026-03-25 + +### Progress +- Claimed task +- Reproduced bug: 63/50 seeds violated constraints on 10-tip trees +- Traced violation to TBR rerooting in `tbr_search()` during ratchet perturbation +- Root cause: `classify_clip_constraints()` marks clips as UNCONSTRAINED when + they contain ALL tips from one side of a constraint split. But TBR rerooting + at an edge between constraint tips and extras puts them on opposite sides of + the attachment edge, destroying the split. +- Implemented two-part fix: + 1. Post-hoc `map_constraint_nodes()` after every accepted TBR/drift move; + reject moves that introduce violations (safety net) + 2. 
FORBIDDEN clip zone for clips where both clip and rest straddle a split + (early rejection optimization) +- Also fixed broken `.ts_driven_search_raw` → `ts_driven_search` callers + (from af7601b refactor) +- Added `test-ts-constraint-multi.R` (Tier 2, 806 assertions): 10/12/15-tip + trees, 2-3 constraint splits, EW + IW +- Committed on cpp-search (62658709d), GHA dispatched diff --git a/agent-e.md b/agent-e.md new file mode 100644 index 000000000..68dcdd570 --- /dev/null +++ b/agent-e.md @@ -0,0 +1,40 @@ +# Agent E — Progress Log + +## Current Task +- **Status:** PARKED — GHA 23690338955 (feature/tbr-batch); Hamilton down + +### T-289f — PR NNI polish cost reduction (2026-03-28) + +Root cause of Stage 4 failure identified: full TBR convergence on the full +tree after every PR cycle (~7s/cycle × 5 = ~35s, before outer TBR runs again). + +Added two new SearchControl() params: +- `pruneReinsertNni = TRUE`: NNI instead of TBR for full-tree polish (~5x cheaper) +- `pruneReinsertFullMoves = N`: limit full-tree TBR moves (0 = converge, backward compat) + +7 files changed: ts_prune_reinsert.h/.cpp, ts_driven.h/.cpp, ts_rcpp.cpp, +R/SearchControl.R, man/SearchControl.Rd. commit 09c93468. + +Stage 5 benchmark script created (bench_pr_stage5_nni.R + t289f_stage5_hamilton.sh). +3 configs × 5 datasets × 2 budgets × 10 seeds = 300 runs. +Committed aa3f16ea. Stage 5 submitted: SLURM 16622224 (~4-6h). + sbatch /nobackup/pjjg18/TreeSearch-a/dev/benchmarks/t289f_stage5_hamilton.sh + +S-RED on new NNI branch: clean. No bugs. Constraint-staleness non-issue +(tbr_search re-syncs cd at entry). Timeout handling correct. + +### T-289 COMPLETE (2026-03-28) + +Stage 4 (multi-dataset validation) results: PR adds ~90% per-replicate overhead +at 206 tips. syab07205/206t: 0 replicates at 60s budget. pruneReinsertCycles=0L +in large preset. commit 74698524. + +### Codoc fix — SearchControl.Rd (E-003, 2026-03-28) +Rd missing pruneReinsertTbrMoves. Fixed manually. commit fdf25673. 
+GHA 23687210711 PASSED. + +### T-291 — bench_framework.R interface (E-004) +benchmark_run() updated to three structured lists. commit f1ed5dfc. + +### S-RED E-005 — ts_strategy.h + ts_temper (no bugs) +### S-RED E-002 — ts_rng + ts_parallel (no bugs) diff --git a/agent-g.md b/agent-g.md new file mode 100644 index 000000000..4b489a1c3 --- /dev/null +++ b/agent-g.md @@ -0,0 +1,36 @@ +# Agent G — Progress Log + +## Current Task +- **Task:** IDLE +- **Status:** Completed T-289f Stage 5 analysis + +## Recently Completed + +### T-289f Stage 5 — Prune-Reinsert NNI vs TBR Polish (2026-03-29) +Hamilton HPC benchmark (SLURM 16622421, 7h runtime). 5 large-tree datasets +(131-206t), 20 seeds, 60s/120s budgets, EW scoring. Three configs: baseline, +pr_nni (NNI polish), pr_tbr (TBR polish). + +**Results:** pr_nni wins 7/10 conditions by expected-best. Huge benefit on +project3701 (146t, -178 median at 60s). Modest benefits at 173-180t. Slight +regression at 206t. pr_tbr harmful (1/9 wins; total starvation at 206t/60s). + +**Decision:** Not enabled in large preset - benefit is dataset-dependent and +reverses at >=206t. Available via SearchControl(pruneReinsertCycles=5, +pruneReinsertNni=TRUE). strategies.md updated. + +### S-COORD Round 45 (2026-03-28) +PRs #237 (T-279) and #238 (T-245) merged; rows deleted from to-do.md. + +### S-RED Focus 30-31 (2026-03-28) +ts_drift.cpp (T-279): correct. ts_fitch.h/ts_tbr.cpp (T-245): correct. +ts_prune_reinsert.h/.cpp: G-006 filed (nni_search lacks ConstraintData*). + +### T-290c wagnerStarts Benchmark (2026-03-28) +wagnerStarts=1 vs 3 under Brazeau scoring, 2 datasets (86-91t). +Current preset assignments confirmed correct. + +--- + +## Earlier completions +See completed-tasks.md for full history. 
diff --git a/check_init.R b/check_init.R new file mode 100644 index 000000000..e3535ef72 --- /dev/null +++ b/check_init.R @@ -0,0 +1,59 @@ +# Compare arg counts between TreeSearch-init.c and RcppExports.cpp + +# Parse TreeSearch-init.c +init_lines <- readLines("src/TreeSearch-init.c") +init_pattern <- '[{]"(_TreeSearch_\\w+)".*,\\s*(\\d+)[}]' +init_matches <- regmatches(init_lines, regexec(init_pattern, init_lines)) +init_matches <- init_matches[lengths(init_matches) > 0] +init_df <- data.frame( + name = vapply(init_matches, `[`, "", 2), + init_args = as.integer(vapply(init_matches, `[`, "", 3)), + stringsAsFactors = FALSE +) + +# Parse RcppExports.cpp +export_lines <- readLines("src/RcppExports.cpp") +export_pattern <- "RcppExport SEXP (_TreeSearch_\\w+)[(]([^)]*)[)]" +export_matches <- regmatches(export_lines, regexec(export_pattern, export_lines)) +export_matches <- export_matches[lengths(export_matches) > 0] +export_df <- data.frame( + name = vapply(export_matches, `[`, "", 2), + export_args = vapply(export_matches, function(m) { + params <- trimws(m[3]) + if (nchar(params) == 0) return(0L) + length(strsplit(params, ",")[[1]]) + }, integer(1)), + stringsAsFactors = FALSE +) + +cat("init.c entries:", nrow(init_df), "\n") +cat("RcppExports.cpp entries:", nrow(export_df), "\n\n") + +# Merge and compare +merged <- merge(init_df, export_df, by = "name", all = TRUE) + +# Mismatches in shared entries +mis <- merged[!is.na(merged$init_args) & !is.na(merged$export_args) & + merged$init_args != merged$export_args, ] +if (nrow(mis) > 0) { + cat("ARG COUNT MISMATCHES:\n") + print(mis, row.names = FALSE) +} else { + cat("All shared entries: arg counts match.\n") +} +cat("\n") + +# In init.c but not RcppExports.cpp +manual <- merged[is.na(merged$export_args), ] +if (nrow(manual) > 0) { + cat("Manual entries (init.c only, not in RcppExports.cpp):", nrow(manual), "\n") + print(manual[, c("name", "init_args")], row.names = FALSE) +} +cat("\n") + +# In RcppExports.cpp but 
missing from init.c +missing_reg <- merged[is.na(merged$init_args), ] +if (nrow(missing_reg) > 0) { + cat("MISSING from init.c (in RcppExports.cpp but not registered):\n") + print(missing_reg[, c("name", "export_args")], row.names = FALSE) +} diff --git a/completed-tasks.md b/completed-tasks.md new file mode 100644 index 000000000..c4e33f086 --- /dev/null +++ b/completed-tasks.md @@ -0,0 +1,441 @@ +# TreeSearch Completed Tasks Archive + +Tasks moved here from `to-do.md` on completion. Newest first. + +--- + +## 2026-03-27 + +| ID | Description | Agent | Notes | +| E-003 | Constrained sector search: stale constraint_node after sector improvement | E | `map_constraint_nodes()` + `compute_dfs_timestamps()` after accepted sector improvement in rss_search/xss_search (both improvement and equal-accepted branches). Same class as T-278/T-279. Commit f1ad0308 labelled "T-280" (superseded by AltHom task, hence E-003). 205/205 constraint+sector tests pass locally. GHA 23650991803. | +| T-278 | Constrained TBR: stale constraint_node after rejected move | E | `map_constraint_nodes()` + `compute_dfs_timestamps()` after topology restoration when constrained move rejected at score check. Also removed dead `if (!states_valid)` branch at final full_rescore. 860 constraint tests pass locally. GHA 23650358613. Commit df3aa71e. | +| T-275 | Prune-reinsert: block non-EW scoring modes | B | Early-return guard in `prune_reinsert_search()` for PROFILE/HSJ/XFORM scoring modes. `build_reduced_dataset()` omits mode-specific fields; guard prevents incorrect reduced-tree scores until each mode is properly wired. | +| T-266 | Taxon pruning-reinsertion perturbation | A | `ts_prune_reinsert.h/.cpp` + pipeline phase 5c + 44 tests. `pruneReinsertCycles`/`pruneReinsertDrop`/`pruneReinsertSelection` in `SearchControl()`. gcc-ASAN/devel failure was rlang infrastructure (PREXPR removed in R-devel); merged by human. Feature branch `feature/prune-reinsert` (PR #235) merged to cpp-search. 
Worktree TS-PruneRI removed. | +|----|-------------|-------|-------| + +## 2026-03-26 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-242 | Agnarsson2004 IW search quality regression | — | **CLOSED — not a bug.** `ThreadSafePool::extract_into()` reset `hits_to_best` to distinct topology count (often 1) instead of actual independent replicate hits. Fix: `bc19667f2` propagates real hit count via `set_hits_to_best()`. Score 50.1872 (XPIWE k=10^0.75) is correct; actual hit rate ~60–67%, not the reported 2%. Search algorithm was unaffected; only Shiny convergence display was wrong. Regression test in `test-ts-parallel.R`. | +| T-265 | Per-replicate search quality investigation | F | **CLOSED — not a bug.** Filed as P1 after T-249 round 3 showed 5–54 step gaps vs TNT, but the gap was a **scoring method confound**: Brazeau inapplicable scores compared against TNT Fitch scores. Correct EW gaps are 0–7 steps (mean 2.2, 5/11 datasets optimal at 120s). Remaining small gaps covered by T-253. Hamilton jobs: 16597207 (Phase 1), 16597240 (Phase 2a, cancelled). | +| T-264 | Disable `consensusStableReps` in presets | F | Fix committed to cpp-search (23e9f57b). GHA 23600674681 PASSED. Removed `consensusStableReps` from sprint/default/thorough presets (fall back to 0 = disabled). Prevents premature early termination when all replicates converge to same consensus. | +| T-249 | Round 3 TNT comparison (Hamilton) | F | 16 datasets × 2 timeouts × 3 seeds on Hamilton (job 16596844). Data in `t249_results/`. Led to T-264 discovery (budget waste) and T-265 investigation (scoring confound). | +| T-256 | Sectorial search intensity experiment | F | Hamilton job 16596760, 4 configs × 5 gap datasets × 3 seeds × 30s. Doubling/tripling xssRounds+rssRounds: no meaningful score improvement (mean gap 6.2/5.2 vs baseline 5.3). `nodrift_3x` config best (mean gap 4.9) but entirely due to 34% more replicates from removing drift, not from extra sectorial rounds. 
Current sectorial intensity (xss=3, rss=1) is sufficient. Unblocks T-257 (negative result: adding rounds alone won't help). | +| T-259 | Ratchet cycle count experiment | F | Same Hamilton job 16596760. Reducing ratchetCycles from 12 to 8/6/4: ratch_8 mixed (mean gap 5.9 vs 5.3, better on 3/5 datasets but +5 steps worse on Geisler2001); ratch_6 clearly worse (7.8); ratch_4 clearly worse (8.5). Current default of 12 is justified. Dataset-dependent variance with only 3 seeds — directional evidence, not definitive. | +| T-260 | Per-evaluation overhead profiling (VTune) | E | VTune 2025.10 hotspot collection on Dikow2009 (88t, EW, 1000 TBR passes). Top 3 hotspots: (1) StateSnapshot save/restore 14.6% — full memcpy of ~190KB per candidate evaluation; (2) reset_states zeroing + tip reload 9.1% — unnecessary std::fill before downpass overwrites; (3) fitch_na_score 29.2% (expected, core algorithm). Non-scoring overhead = 37.8% of TBR time. Combined fix potential ~16–19%. Write-up: `dev/benchmarks/vtune_tbr_analysis.md`. Driver: `dev/vtune-tbr-driver.R`. | +| T-254 | Drift MPT diversity experiment | E | driftCycles=0 vs 2 on 3 datasets (Wortley2006/Zhu2013/Geisler2001), 3 seeds, 30s+120s budgets. Drift provides zero score, MPT, or diversity benefit. Costs 10–22% of replicates. On Wortley2006, no-drift finds 4 MPTs vs 1–3 with drift. Mean RF identical on larger datasets. Unblocks T-255. Write-up: `dev/benchmarks/drift_mpt_analysis.md`. | +| — | maxReplicates default → 96 (multiple of 48 for parallel efficiency) | F | API: 100→96. Shiny: default 96, slider min=48/max=960/step=48. Issue triaged directly (no T-number). Commit `13501b1a`. | +| T-251 | TNT trajectory analysis on gap datasets | E | 3 gap datasets (Geisler2001 +5–9, Zhu2013 +4–6, Wortley2006 +3–4), 30s, 3 seeds. Drift 30–170× less efficient than next-worst phase (16–23% of time, <1% improvement). TNT 1.5–3.6× eval/s throughput despite 32-bit scalar — per-eval overhead negates SIMD. 
TNT does ~67% sectorial search; TS does one pass (6–10% time). Recommendations: eliminate drift from default, increase sectorial rounds. Write-up: `dev/benchmarks/tnt_trajectory_analysis.md`. | +| T-250 | TNT Fitch kernel disassembly | E | TNT=32-bit i386, zero SIMD, 64KB LUT popcount. TreeSearch has ~4× throughput advantage (128-bit SSE2 vs 32-bit scalar). TNT's 3-5× convergence speed is strategic not implementation. Write-up: `dev/benchmarks/tnt_disassembly_analysis.md`. | +| T-248 | SA phase tuning for large preset | E | Hamilton benchmark (mbank_X30754 180t, 5 seeds, 30s/60s). annealCycles=1 (400ms/rep, 40% hit rate) most cost-effective; AC=3 (1370ms/rep, 21% hit rate) no significant score gain (p>0.5). Reduced large preset from AC=3 to AC=1, saves ~1s/rep (~6%). | +| T-232 | [Shiny] "Tips to show" bounces back on decrement | D | Fix committed. Re-validated via GHA 23547582438 (cpp-search PASS). Closed by S-COORD (E). | +| T-240 | [Shiny] Pool suboptimal filter not applied mid-search | D | Fix committed. Re-validated via GHA 23547582438 (cpp-search PASS). Closed by S-COORD (E). | +| T-239 | [Shiny] Cluster consensus: highlight unique edges | D | Feature committed. Re-validated via GHA 23547582438 (cpp-search PASS). Closed by S-COORD (E). | +| T-241 | [Shiny] Show cluster assignment next to tree selector | D | Feature committed. Re-validated via GHA 23547582438 (cpp-search PASS). Closed by S-COORD (E). | +| T-247 | XPIWE search quality investigation (Vinther2008) | E | NOT A BUG. Score discrepancy (3.84382 vs TNT 3.79283) is entirely from different inapplicable handling (Brazeau three-pass vs standard Fitch). TNT's tree scores EW=80 in TreeSearch vs 78 in TNT (2-step inapplicable difference). TreeSearch's tree (EW=79) is genuinely better under three-pass scoring. XPIWE implementation verified correct: uses eff_k in all scoring paths. | +| T-244 | Full-pipeline 180-tip benchmark on Hamilton | E | Large preset on EPYC 7702. 
Median scores: 30s=1202, 60s=1190, 120s=1185. Per-rep median 17.3s. SA phase identified as least productive (7.4% time, 14% hit rate). 65-74 step improvement over pre-T-206 Intel baselines. | + +## 2026-03-25 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-207 | Multi-cycle PCSA perturbation phase | D (PR), F (cleanup) | PR #227 merged to cpp-search. Cherry-picked from `feature/pt-eval`. Includes T-210 fix (SA best-topology tracking). | +| T-210 | [Bug] SA doesn't save best-found topology | D (PR), F (cleanup) | Fixed in T-207 PR #227. `anneal_search` tracks/restores best tree at phase boundaries. | +| T-183 | Pool-seeded Wagner / consensus backbone | F (closed) | Superseded by existing `consensusConstrain` (ts_driven.cpp:565–690), which constrains the entire replicate pipeline, not just Wagner. Run independence concern; marginal starting-tree value given NNI→TBR pipeline. | +| T-196 | [Bug] `extract_divided_steps` wrong for NA+IW | F (cherry-pick) | Cherry-picked from feature/parallel-temper (6dc28a27) to cpp-search. Replaced 3 static copies of extract_divided_steps() with extract_char_steps() in TBR/SPR/drift. ts_temper.cpp already correct via PR #227. | +| T-198–201 | PT core + pipeline integration | — (closed) | Boltzmann PT ruled out by T-199 evaluation: 0% cold↔warm swap acceptance across all datasets. PCSA component cherry-picked as T-207/PR #227. Branch and worktree deleted. Findings preserved in `.positai/expertise/pt-evaluation.md`. | +| T-212 | Test `random_constrained_tree` under RANDOM_TREE | F (S-COORD) | Tests committed by C on cpp-search. GHA failures were from T-214 constraint bug (now fixed; GHA 23542642164 PASS). Closed during S-COORD round 20. | +| T-179 | Large-tree strategy preset (≥120 tips) | G | Completed 2026-03-24, in completed-tasks. Removed stale to-do entry (PR #215 closed). | +| T-182 | Adaptive ratchet perturbation probability | G | PR #221 merged 2026-03-25. Removed stale to-do entry. 
| +| T-226 | [Shiny] Tree space sequence mode: arrows + index labels | A | Kept feature. Replaced `lines()` with `arrows()`, added "Tree index" plotting symbol option. Commit `dbf593f1b`. | +| T-233 | [Shiny] Search summary text too verbose | A | Removed redundant topology count, shortened ruggedness warning. Commit `efbe77ab5`. | +| T-236 | [Shiny] Auto-start search after profile prep | A | `StartSearch()` called from profile prep result observer instead of showing "click Search" notification. Commit `cfb38b070`. | +| T-237 | [Shiny] Concavity slider visible in profile mode after dataset switch | A | Modal re-open didn't re-apply visibility. Fix: conditionally wrap in `hidden()` before `showModal()`. Commit `3903e3fce`. | +| T-238 | [Shiny] Search & profile notifications disappear prematurely | A | `tryCatch` sibling handler bug: `req(FALSE)` in `shiny.silent.error` handler caught by sibling `error` handler. Fix: single `error` handler with `inherits()` check + `stop(e)` re-throw. Commit `609241b65`. (Originally filed as T-235; renumbered to avoid collision with SPR bug.) | +| T-213 | Implement impose_constraint() for post-hoc topology repair | D (cleanup) | Already on cpp-search (a666918ed, PR #223). 88 tests pass. Formal closeout during S-COORD. | +| T-220 | [Shiny] Crash: searchExtendedIw not found when clicking Continue | D | Variable used in LogCode() before assignment. Moved snapshot above LogCode(). Direct fix on cpp-search. | +| T-229 | [Bug] XFORM scoring used IW path for non-hierarchy chars | D | `fitch_score_ew()` missing `ScoringMode::XFORM` in EW branch. MaxP scores wrong (3 vs 7). 1-line fix. S-RED focus 1. | +| T-219 | [Shiny] Dataset dropdown hover state visible | D | Selectize default hover (#f5f5f5) near-invisible on white. Added explicit hover CSS (#dde6ed). | +| T-211 | Stale `final_` in temper candidate scoring | C | Analyzed: conservative-only. 
Stale `final_` after clip/evaluate/restore biases Boltzmann screening but not verified acceptance (`temper_full_rescore` gates all accepted moves). Fix would require per-candidate full rescore or save/restore of all `final_` arrays — cost exceeds negligible SA benefit. Closed as not worth fixing. | + +## 2026-03-24 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-190 | Adaptive starting-tree strategy mixing (bandit) | A | Thompson sampling over 4 fresh-start arms (3 Wagner variants + random tree). 33 unit tests, vignette section, strategy diagnostics attribute. Benchmarked on 75–88 tip datasets (neutral to slight benefit). Squash-merged via PR #214. | +| T-177 | Bug fix: mid-TBR/SPR timeout | G | `check_timeout` callback threaded through `tbr_search`, `spr_search`, `nni_search`. 282 targeted tests pass. | +| T-205 | Fix flaky test-pp-random-tree.R on Windows | G | MWC RNG in build_postorder.h uses static global state not seeded by set.seed(). Widened binomial bounds (stringency 0.005→1e-6) and increased nTrees (6000→12000, 12000→24000) across all tests. False-positive rate: ~0.0002% per run (was ~1%). GHA pass: run 23501977394. | +| T-203 | Simulated annealing for large trees | G | Linear cooling schedule (T_start→T_end over N phases) using stochastic TBR + Boltzmann acceptance. `ts_temper.h/.cpp` (Layer 1: stochastic_tbr_phase ported from T-198; Layer 3: anneal_search). Wired into driven pipeline between drift and final TBR polish. `SearchControl(annealPhases, annealTStart, annealTEnd, annealMovesPerPhase)`. `large` preset: drift disabled, 5 annealing phases T=20→0. 19 new tests; all pass. Merged to `cpp-search` (conflict with `enumTimeFraction` resolved). | +| T-197 | Fix `concavity = 0` NaN in `precompute_iw_delta` | D | C++ guard for e==0 avoids 0/0 NaN. R entry points already validate concavity>0. Added 8 new validation tests (MaximizeParsimony, SuccessiveApproximations, TreeLength, AdditionTree). 169 tests pass. 
| +| T-195 | GHA benchmark workflow | D | `agent-benchmark.yml` + `bench_regression.R` CLI args (`--datasets`, `--budget`, `--output`, `--threads`, `--lib`). 14 datasets with max_score/ref_time_s. CSV artifact upload. Commit `7a80e67a`. | +| T-202 | Fix MPT enumeration skipped on timeout | B | Two-phase timeout: main loop exits at `budget*(1-enumTimeFraction)`, reserving remainder for plateau walk. New `SearchControl(enumTimeFraction=0.1)`. PR #217 merged. | +| T-179 | Large-tree strategy preset (>=120 tips) | G | Tuned via systematic benchmarking on mbank_X30754 (180t, 418p). Key: NNI-perturb too expensive at 5.5s/cycle; ratchet 12, drift 4, no NNI-perturb, outerCycles=1, single biased Wagner, tbrMaxHits=1. 60s: median 1255 vs thorough 1259; 120s: tied at 1250 but 2 reps vs 0-1; 30s: 1276 vs 1283. Commit `fab1e52c`. | +| S-RED | Red-team focus 10: Profile & IW scoring | B | Filed T-196 (P2): `extract_divided_steps` NA+IW bug — four static copies read `local_cost` for NA blocks instead of three-pass corrected steps, mispricing IW candidate screening. Filed T-197 (P3): `concavity = 0` → NaN, no validation. Verified: profile `concavity = 1.0` sentinel correct; `precompute_profile_delta` precomputed_steps offset correct; all indirect IW variants structurally correct. | +| S-RED | Red-team focus 11: T-190/T-202/XPIWE merge review | F | Filed T-208 (P2): `random_topology_tree()` ignores constraints — bandit RANDOM_TREE arm can produce constraint-violating starting tree when `adaptiveStart=true` (thorough preset). TBR blocks all constraint-relevant moves, tree returned to user unvalidated. Verified: XPIWE scoring correctness (eff_k, phi, sector copy, resampling). T-202 two-phase timeout correct (both serial and parallel). T-190 StrategyTracker correct (Thompson sampling, decay, RNG safety). | +| T-194 | Stratified sample selection + dedup profiling | B | Profiled 35 multi-file projects; flagged 24 near-duplicates (≥95% char identity) as `dedup_drop`. 
Post-dedup: 659 usable (535 train, 124 val). Selected fixed 25-matrix training sample via max-min distance (`MBANK_FIXED_SAMPLE`). Dedup integrated into `build_mbank_catalogue.R`. Documented in `strategies.md` and `AGENTS.md`. | +| T-191 | MorphoBank matrix catalogue | B | Scanned 801 .nex files from neotrans/inst/matrices/. 797 parsed OK (4 failures). 683 usable after ntax≥20 filter: 554 training, 129 validation (project%5==0). Includes multi-matrix projects and 7 syab files. Output: `dev/benchmarks/mbank_catalogue.csv`. Script: `dev/benchmarks/build_mbank_catalogue.R`. | +| T-192 | External dataset loading functions | B | Added to bench_datasets.R: `load_mbank_catalogue()`, `load_mbank_datasets()`, `load_mbank_sample()` (stratified by tier), `load_mbank_split()`. Path auto-resolved to `neotrans/inst/matrices/`. | +| T-193 | MorphoBank benchmark runner integration | B | Added to bench_framework.R: `benchmark_mbank_sample()` (routine ~25 matrix training sample), `benchmark_mbank_sweep()` (full split), `benchmark_mbank_validation()` (one-way-door validation with warning). All results tagged with `source` column. End-to-end tested: 4 matrices x default x 5s. | + +## 2026-03-23 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-177 | Bug fix: mid-TBR/SPR timeout callback | G | Verified `check_timeout` callback threaded through `tbr_search`, `spr_search`, and `nni_search` — all poll periodically (every `n_tip` clips) and bail mid-pass. Driven pipeline, ratchet, drift, NNI-perturb all pass callback. 282 targeted tests pass (0 fail). Closing as complete. | +| — | Ratchet perturbation tuning: 4%→25%, moves 20→5, cycles 5→10 | Human+AI | Systematic sweep across 14 datasets. 9 improved, 4 unchanged, 1 marginal at 10s (resolves at 20s). Commit `f1ae7edb`. | +| — | Drift→ratchet reallocation: driftCycles 4→2, ratchetCycles 10→12 | Human+AI | Drift ~0 per-replicate improvement; ratchet is strictly better use of budget. Commit `7ae01181`. 
| +| — | Large-tree profiling: 180-taxon dataset analysis | Human+AI | Discovered NNI essential at >100 tips, timeout bug in TBR, strategy presets not calibrated for large trees. Filed T-177 through T-183. | +| T-185 | Inspect IQ-TREE for parsimony search acceleration ideas | G | Reviewed `iqtree.cpp`/`iqtree.h` source. Top idea: stochastic NNI-perturbation (complement to ratchet). Also: diverse starting trees, adaptive perturbation scaling, perturbation-count stopping. Batch NNI not worthwhile (see `.positai/expertise/batch-nni.md`). | +| T-186 | Stochastic NNI-perturbation as escape mechanism | G | New `ts_nni_perturb.h/cpp`: random compatible NNI swaps on ~50% of branches + TBR re-optimization. Integrated between ratchet and drift in driven pipeline. `SearchControl(nniPerturbCycles, nniPerturbFraction)`. `thorough` preset: 5 cycles. 28 new test assertions; 1792 ts-* pass. | +| T-178 | NNI warmup in driven pipeline | G | NNI always-on (`nni_first = true` default). Each Wagner start NNI-optimized before selection (best of NNI-local optima). SPR auto-skipped when NNI active (NNI→TBR empirically optimal). Constraint guard: NNI warmup disabled when constraints active (nni_search lacks constraint support). All presets updated: `nniFirst = TRUE, sprFirst = FALSE`. 1846 ts-* pass. | +| T-156 | XPIWE C++ core | G | Already implemented in feature/xpiwe commit c7a41712. Verified: ScoringMode::XPIWE enum, eff_k[]/phi[] per-pattern vectors in DataSet, build_dataset() computes adjusted concavity, compute_iw()/precompute_iw_delta() use per-pattern eff_k[p]. Branch builds clean, 1677 ts-* pass. | +| T-157 | XPIWE Rcpp bridge | G | Already implemented in c7a41712. xpiwe bool + xpiwe_r + xpiwe_max_f + obs_count params in make_dataset, ts_fitch_score, ts_driven_search, ts_resample_search, ts_successive_approx, ts_parallel_resample. TreeSearch-init.c updated. | +| T-158 | XPIWE R API | G | Already implemented in c7a41712. 
extended_iw param in MaximizeParsimony(), TreeLength() (all S3 methods), Resample(), SuccessiveApproximations(). Silently ignored when EW/profile. SearchControl() correctly omitted (scoring property, not search control). | +| T-159 | XPIWE Tests | G | Already implemented in c7a41712. 18 XPIWE-specific tests in test-ts-xpiwe.R: formula unit tests, 8-taxon missing-data scenario, TNT validation gallery (Vinther2008, Sano2011, Sansom2010 stored reference k-values). All pass. | +| T-160 | XPIWE Docs + NEWS | G | Rd docs and NEWS already in c7a41712. Added vignette paragraph to profile-scores.Rmd explaining XPIWE formula (eff_k, phi, extrapolation factor). Added 'cdot' to WORDLIST. spell_check_package() clean. Commit ea602512. | +| T-161 | XPIWE Shiny GUI | G | Added "Implied (extended)" as default step weighting in Shiny search config modal. Both "Implied (extended)" and "Implied" share concavity slider. `extendedIw()` reactive threaded through scores(), searchTask, StartSearch(). Updated shinytest2 snapshots. 42 search module tests + 13 Distribution tests pass. Commit 6da9a861. | +| T-162 | XPIWE Shiny citation | G | Added Goloboff 2014 citation to global.R and references panel (Tree Search section). Always shown since XPIWE is default. Commit a553a325. | +| T-184 | maxTime → maxSeconds alias | G | Already implemented in commit fafd5d0e. Intercepts maxTime before Morphy detection, maps to maxSeconds with .Deprecated() warning, removes from dots. Removed maxTime from .morphyParams list. Verified working. | +| T-163 | Search confidence composite diagnostic | G | Replaced exp(-K) with tighter binomial bound (1-K/R)^R, falling back to exp(-K) when K==R. Added optional nTopologies/lastImprovedRep params (wired by T-164). Ruggedness warning when K/R < 0.3 and R >= 5. Limited independence flag when nTopologies==1. 58 search module tests pass. Commit 2d2115cb. | +| T-164 | Wire pool stats to Shiny search confidence | G | Added `count_at_best()` to TreePool. 
Initialized new DrivenResult fields in parallel path. Wired to Shiny: nTopologies=length(allTrees), lastImprovedRep from search attrs, reset on weighting/concavity/dataset change. 58 module tests pass. Commit 16c02dc7. | +| T-181 | Add 180-taxon dataset to benchmark suite | G | Added mbank_X30754 (180t, 425c, 11 states, 40% missing, 20.5% inapp) as large-tree benchmark tier. `LARGE_BENCHMARK_NAMES`, `load_large_benchmark_datasets()`, `benchmark_large()`. Commit adec48b6. | +| T-180 | Warm-start benchmark infrastructure | G | `bench_warmstart.R`: `compute_warmstart_tree()` (sprint→TBR optimum), `warmstart_run()` (single rep from warm start), `warmstart_benchmark()` (grid), `warmstart_summary()`. Isolates ratchet/drift escape from initial descent. Verified: Vinther2008 sprint→80, warm-start→79. Commit 13a019e3. | + +--- + +## 2026-03-20 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-148 | Red-team: ParsSim log-space convolution (S-RED focus 8) | B | Fixed `.LogCumSumExp` NaN bug: `-Inf - (-Inf) = NaN` in IEEE 754 when both accumulator and new value are `-Inf`. Guard added; 7 new assertions pass. | +| T-165 | Shiny: reset run stats on concavity/weighting change | C | Added stat-reset to `observeEvent(input$concavity)` and `observeEvent(input$implied.weights)` in `mod_search.R`. Trees kept; hits/reps/bestScore cleared. 2 new tests (50 total pass). | +| T-163 | C++ cancel checks in MPT enumeration + final TBR polish | C | Root cause: a "continue" search fills the pool quickly, leading to MPT enumeration with no cancel checkpoint. Added `check_cancel()` between seeds in the MPT while-loop and `check_timeout()` after final TBR polish in `run_single_replicate`. 152 driven-search tests pass. | +| T-166 | Shiny crash: `tipLabels()` subscript out-of-bounds on empty `r$trees` | A | `mod_data.R#107`: added `if (!length(r$trees)) return(character(0L))` guard. Fixes both `DatasetMatchesTrees` (#126) and `UpdateActiveTrees` (#203) stack traces. 
Resolves T-169 as downstream consequence. | +| T-167 | Shiny: "cannot open file" warning spam in progress poll | A | `mod_search.R`: added `if (!file.exists(pf)) return()` after `invalidateLater(500)`. Observer now silently skips until C++ creates the file. | +| T-168 | Shiny: `LEFT == RIGHT` recycling warning from `ape::read.nexus` | A | Upstream `ape` bug (bracket-index comparison). Suppressed with `suppressWarnings()` at both `read.nexus` call sites in `mod_data.R`. | +| T-169 | Shiny: `updateSliderInput()` value outside `[min, max]` when trees → 0 | A | `mod_data.R#231`: guarded entire tree-range/count slider update block with `if (nTrees > 0L)`; controls hidden via `parentHide` anyway when no trees present. | +| T-170 | Shiny: "no data loaded" in Configure modal; search terminates after second data load | A | Root cause was T-166 crash propagating through `DatasetMatchesTrees()` inside `tryCatch` error handler; fixed as consequence of T-166. | +| T-171 | Shiny: `LengthAdded` matrix-recycling and unknown-scoring warnings | A | `mod_consensus.R`: `PolEscVal` reactive now guards with `setequal(tipLabels(), names(r$dataset))`; returns `NULL` when taxa sets differ (tree has superset taxa vs dataset). | +| T-172 | Shiny: run counter accumulates across concavity changes | A | Root cause: Configure modal created `sliderInput(value=1L)` on each open, sending `input$concavity=1` to server and firing reset observer spuriously. Fixed by initialising all modal inputs from current `input$*` values (`mod_search.R`). | +| T-173 | Shiny: stop-when-N probability always ~37% for any dataset when `targetHits=1` | A | `global.R`: `SearchConfidenceText()` now appends "— increase 'Stop when N runs hit best' for a tighter estimate" when K=R and R≤5, guiding user toward more informative settings. 
| +| T-164 | Shiny UX: confusing run count when reducing max_runs during continued search | A | `global.R`: `SearchConfidenceText()` now accepts `nSearches`; says "total runs across N searches" when N > 1. `mod_search.R`: tooltip updated to explain per-search vs cumulative distinction; `helpText` added below maxReplicates slider in Configure modal. | +| S-RED | Red-team focus 9: Wagner & addition trees | C | BUG FIXED: boundary-edge false positive in `wagner_edge_violates_constraint` (ts_wagner.cpp) and `regraft_violates_constraint` (ts_constraint.cpp). Both rejected the edge directly above the constraint clade for MUST_OUTSIDE elements (`is_ancestor_or_equal(cn,below)` true when `below==cn`). Fix: `&& below != cn` guard. Search quality improvement, no correctness impact. +2 Wagner tests (43 total). 152 driven-search + 18 constraint tests pass. | +| T-175 | Shiny: search progress indicators vanish after 3-4 seconds (worker startup) | A | `mod_search.R`: both `searchTask` and `profilePrepTask` result observers used `validation = function(e) req(FALSE)` to handle "task running" state. In Shiny 1.8+, `ExtendedTask$result()` throws class `c("shiny.silent.error", "shiny.output.progress", ...)` — NOT "validation". The wrong handler name caused `error = function(e) NULL` to catch every "still running" signal, triggering premature cleanup (notification removed, `r$searchInProgress = FALSE`, `DisplayTreeScores()` called) ~3–4 s into each search (when the worker process starts and status flips to "running"). Fix: rename both `validation =` to `shiny.silent.error =`. | +| T-176 | Shiny: misleading error + wrong dataset when data file uploaded to tree loader | A | `mod_data.R`: after all tree-load attempts fail, try `ReadTntAsPhyDat` then `ReadAsPhyDat` on the file. If either succeeds, set `r$dataset`, `r$chars`, `r$charNotes` directly and notify "No trees found — loaded N taxa and M characters as dataset"; `observeEvent(r$dataset)` clears incompatible trees. 
If neither succeeds, keep existing "Trees not in a recognized format" message. Fixes (1) misleading error wording and (2) search continuing on previously-loaded dataset. | +| T-174 | Shiny: spurious "Inferring tip labels from dataset" + console warning spam on tree file load | A | `mod_data.R`: (1) `readLines(tmpFile)` in NA/NaN retry branch wrapped in `suppressWarnings` — suppresses benign EOF warning leaking to console. (2) `withCallingHandlers` inside `ReadTntTree` tryCatch muffles "incomplete final line" EOF warnings before they reach the outer `warning` handler (which is for genuine TNT tip-label warnings only). Also added `error = function(e) NULL` handlers to both inner tryCatches (was bare `NULL`). | + +--- + +## 2026-03-19 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-151 | Shiny: dataset observer clears bundled trees (blank plot / 0 trees) | B | `observeEvent(r$dataset,…)` unconditionally cleared `r$allTrees` after `UpdateData()` loaded trees from same .nex; fix: guard with `HaveData() && !DatasetMatchesTrees()`. All 31 bundled datasets affected. 2 new regression tests; 11 mod-data + 1681 ts-* pass. | +| — | Fix inapplicable.Rmd vignette CSL URL (404) | A | Remote `raw.githubusercontent.com` CSL URL returned 404; changed to local `../inst/apa-old-doi-prefix.csl` (matching all other vignettes). Also un-claimed stale Agent C GUI issue in `issues.md`. | +| T-142 | Shiny: Add TreeSearch logo to app header | A | Inline SVG (magnifier + 3-tip tree) added to `inst/Parsimony/ui.R` line 14; flex div wraps icon + h1. No external asset file needed. | +| S-COORD | Coordination review round 8 | A | T-144 fixed → CRAN unblocked. ~9835 pass/0 fail. T-141/T-140/T-097 confirmed complete. Standing tasks P1. 
| +| T-144 | Fix 15 CRAN Tier 1 test failures (PrepareDataProfile regression) | A | Added binary-reduction warning to PrepareDataProfile; fixed empty-phyDat return (avoid dataset[0] crash in new TreeTools); updated test-data_manipulation.R (17→17 pass) and test-Concordance.R (67→67 pass) to match new behavior. 0 regressions. | +| T-129 | Shiny: Evaluate progressive search result display | A | Briefing in `.positai/briefing-progressive-results.md`. Rec: progress file polling via existing C++ progress_callback infrastructure (~50 lines). Do not stream partial trees (misleading). Filed T-141. | +| S-PROF | Standing: Performance profiling (round 3) | A | EW no regression post T-115–T-124. HSJ ~0.6× EW (Fitch screening effective). XFORM ~1.7× EW (Sankoff overhead in Ratchet/Drift). Hierarchical resampling faster per-rep but no inter-replicate parallelism (1.1× vs 2.5× for Brazeau). No new optimization tasks. | +| S-RED | Red-team focus 7: Shiny module wiring | B | Reviewed all 7 modules + server.R. Forward-ref callbacks correct, cross-module updates correct (1 fragile parent_session usage documented), no orphaned observers, isolate() patterns correct, new progress observer correctly gated. No bugs found. | +| T-143 | Shiny: Per-replicate search progress display | B | Progress file polling: R callback writes `rep max_rep score hits target` to TREESEARCH_PROGRESS_FILE on each replicate. Shiny polls every 500ms via invalidateLater. Fixed C++ callback to fire regardless of verbosity. Updated test-ts-progress.R expectations. 1681 ts-* + 42 mod-search pass. | +| S-RED | Red-team focus 5: Data pipeline & simplification | B | Fixed `build_reduced_dataset` missing `inapp_state` copy (latent HSJ bug). Added `n_states > MAX_STATES` guard in `build_dataset`. Verified simplification, constraint indexing, EW offset interaction. 10 new NA-sector tests. 1679 ts-* pass. 
| +| T-130 | Shiny: Search cancellation button | B | Already fully implemented: hidden Stop button shown during search, file-based cancel signal polled by C++ (serial + parallel), cleanup on completion. 3 dedicated testServer tests, 38/38 mod-search pass. | +| T-147 | Shiny: "Tips to show" starts at 0, dimension error | F | Same root cause as T-146: `UpdateKeepNTipsRange()` reactive never consumed on init. Added `observe(UpdateKeepNTipsRange())`. 92/92 module tests pass. | +| T-146 | Shiny: "Root on" selectize not populated on launch | F | `UpdateOutgroupInput()` was a reactive never consumed on init. Added `observe(UpdateOutgroupInput())` in mod_consensus.R. 92/92 module tests pass. | +| T-145 | Shiny: Defer profile prep to search start | F | Removed auto-trigger on mode change; profile prep only runs when user clicks Search in profile mode. Error notification → silent LogMsg. On success: "Profile scores ready — click Search to start." 92/92 module tests pass. | +| T-097 | Verify: Sun 2018 continued-search indicator | F | Verified via SearchLog integration test (4/4 pass): EW→IW consecutive search works, `searchCount` increments. Sun2018 loads/scores correctly (54t, 225c, EW=802). Core fix in T-090, additional fix in T-138. | +| T-140 | Shiny: Profile prep error on dataset selection | F | Resolved by T-138 (lazy async prep). Added re-invocation guard: if user changes dataset while profile prep is running, cancel in-flight task via signal file and retry after 200ms. 88/88 module tests pass. | +| T-138 | Shiny: "Continue search" doesn't start new search | F | Root cause: `bindCache` on `scores()` prevented re-evaluation after dataset switch. Fix: replaced with plain `reactive()`, added `r$searchInProgress` flag, async profile prep with cancel, cancel button UI, starting-tree error handling. 5 new testServer tests. | +| S-RED | Red-team review (focus 6) | E | Verified T-137/139/141 fixes. Scores correct (serial+parallel). hits≤reps confirmed. 
1676 ts-* pass, 88 module pass. 42 pre-existing failures from human's profile parsimony refactor (tracked by T-144). No new bugs. | +| T-141 | Shiny: Sun 2018 blank plot area despite trees | E | When loading a dataset whose file has no trees, stale trees from the previous dataset persisted with incompatible tips → blank plot. Fix: clear old trees when they don't match the new dataset. 88 module tests pass. | +| T-139 | Shiny: Hit/rep count forgets previous searches | E | Bug: `parallel_driven_search()` captured `hits_to_best` AFTER MPT enumeration, inflating count. Fixed to capture before enumeration (matching serial path). Added clamp in `SearchConfidenceText()` as safety net. 1669 ts-* + 275 parallel pass. | +| T-137 | Shiny: Stop button leaves UI stuck on "Stopping…" | E | C++: threaded cancel callback into `ratchet_search()`/`drift_search()` for faster response. Shiny: cancel observer removes notification immediately; result observer uses `searchInProgress` flag for robust cleanup. 1669 ts-* + 88 module tests pass. | +| T-136 | Shiny: Use `WideSample()` for tree thinning | F | Replaced `seq.int()` with `WideSample()` maximin selection in `mod_data.R`. Updated Distribution snapshots. 119/120 Shiny pass. | +| T-124 | Inapplicable: Hierarchical resampling in `Resample()` | F | `hierarchy`, `inapplicable`, `hsj_alpha` params added. Resamples at unit level (free chars + hierarchy blocks). `.HierarchicalResampleWeights()` + `.ResampleHierarchy()`. 72 tests, 1669 ts-* pass. | +| T-123 | Inapplicable: `TreeLength()` HSJ + xform extension | D | Added `hierarchy`, `inapplicable`, `hsj_alpha` to all TreeLength methods. HSJ via `ts_hsj_score()`, xform via Fitch + `ts_sankoff_test()`. 24 new tests, 73/73 tree_length pass. | +| T-125 | Inapplicable: Documentation & vignette | F | `vignettes/inapplicable.Rmd`: Brazeau/HSJ/xform approaches, hierarchy specification, worked example, comparison table. Updated MaximizeParsimony docs. 
| +| T-122 | Sankoff: Tests against Goloboff et al. examples | F | 80 assertions in test-ts-xform.R: gain/loss asymmetry, secondary variation penalty, HSJ vs xform cross-validation, Fitch+Sankoff mixed scoring, 8-tip search, deterministic seeds, gain cost scaling. | +| T-121 | Sankoff: Integration with search pipeline | F | `ScoringMode::XFORM`, `score_tree()` dispatch (Fitch+Sankoff), `ts_driven_search` bridge, `MaximizeParsimony()` xform wiring. End-to-end xform search works. | +| T-120 | Sankoff: C++ optimization engine | F | Fixed incorrect asymmetric cost test expectation (score 2, not 3). 24/24 Sankoff pass. | +| T-119 | Sankoff: R-level recoding function | F | `recode_hierarchy()`: primary+secondaries → Sankoff char, asymmetric cost, Hamming distance. 49 tests. | +| T-134 | HSJ: Fix secondary dissimilarity d always 0 | F | Added Fitch uppass to `fitch_label_char()`. 37/37 HSJ pass, 1509 ts-* pass. | +| T-133 | ParsSim: per-taxon/per-character missing rates | A | List/matrix inputs; Bernoulli per-cell sampling. 128/128 ParsSim pass. | +| T-132 | ParsSim: missing data (`?`) support — flat rate | A | `missing` param (0–1) injects `?` post-hoc. 97/97 ParsSim pass. | +| T-131 | Fix: 39 stale IW reference values in test-ts-iw.R | A | Recomputed after T-113 NA bit-stripping. 86/86 IW pass. | +| T-128 | Shiny: Rename "Mode" label to "Step weighting" | ? | Trivial label change. | +| T-120 | Sankoff: C++ optimization engine | F | `ts_sankoff.h/.cpp`, 24 unit tests. | +| T-118 | HSJ: End-to-end tests against paper examples | C | 123 assertions in test-ts-hsj.R. Found d=0 bug → T-134. | +| T-117 | HSJ: Wire into search pipeline + remove placeholder | D | `ScoringMode::HSJ`, `score_tree()` dispatch, driven search bridge. | +| T-116 | HSJ: Rcpp bridge + R marshalling | D | `ts_hsj_score()`, `build_tip_labels()`, `hierarchy_to_blocks()`. 
| +| T-115 | HSJ: Token mapping & DataSet partitioning | D | `inapp_state` in DataSet, `absent_state` in HierarchyBlock, `partition_weights()`. | +| T-114 | ParsSim: rootState vector handling + docs | D | Validate length/range, per-character scalar indexing. 80 ParsSim pass. | +| T-113 | Fix: T-097 NA ambiguity code fix incomplete | B | Bit-stripping in `build_dataset()`. 12/12 NA-ambig pass. | +| T-112 | Port missing MaddisonSlatkin tests from branch | D | All branch tests already present. 37/37 pass. | +| T-111 | ParsSim: extended test suite + edge cases | ? | 9 extended tests. 66/66 ParsSim pass. | +| T-110 | ParsSim: profile character selection | D | `concavity = "profile"` mode. 67 ParsSim pass. | +| T-109 | ParsSim: core implementation (EW + IW) | F | `R/ParsSim.R` with 7 internal helpers. 35 tests. | +| T-108 | Fix: pattern_freq multiplicative blowup in IW ratchet | B (S-RED) | `+= 1` instead of `*= 2`. 1404 ts-* pass. | +| T-105 | Docs + CRAN prep for multi-state profile | D | Updated vignette, roxygen, NEWS.md. | +| T-104 | Integration tests for multi-state profile parsimony | D | 6 test blocks in test-ts-profile.R. | +| T-103 | Generalize PrepareDataProfile() for multi-state | B | Removed binary decomposition, dynamic contrast matrix. | +| T-102 | Generalize StepInformation() for multi-state | B | 3–5 state dispatches to MaddisonSlatkin. 64 new tests. | +| T-101 | Port MaddisonSlatkin.cpp to main branch | D | 1401-line C++, 24 test assertions. | +| T-100 | Fix: `{-,X}` ambiguity tokens treated as applicable | B (S-RED) | Stale postorder in drift_phase. Also: bit-stripping in ts_data.cpp. | +| T-099 | Confidence text: hover tooltip | B | Title tooltip on summary + modal. 28/28 mod-search pass. | +| T-098 | Fix overconfident search confidence probability | B | `exp(-K)` conservative bound, Laplace smoothing. 76/76 mod pass. | +| T-097 | Fix: {inapplicable,state} ambiguity tokens scored as applicable | F | `{-,S}` tips collapsed to pure NA. 7 new tests. 
1404 ts-* pass. | +| T-096 | Fix: upweight_mask missing in EW bounded/cached indirect scoring | C (S-RED) | 6 sites fixed. 1397 ts-* pass. | +| T-095 | Assess MorphyLib decoupling for custom search | ? | Thin wrappers around `ts_fitch_score()`. Design in morphy-migration.md. | +| T-094 | Un-deprecate custom search functions | B | Removed `.Deprecated()` from Ratchet, Jackknife, etc. | +| T-093 | Remove testthat-problems.rds artifact | B | Deleted + .gitignore. | +| T-092 | Fix: hits_to_best inflated by MPT enumeration | B | Capture hits_to_best before plateau walk. 1397 ts-* pass. | +| T-091 | Shiny: Terminology cleanup — "runs" not "replicates" | ? | Already resolved by T-088/T-089/T-090. | +| T-090 | Shiny: Search-in-progress indicator fix + results display refresh | B, C | Moved notification before tree selection; always call `DisplayTreeScores()`. 94/94 shinytest2 pass. | +| T-089 | Shiny: Confidence text rewrite + adaptive slider note | B | "Probability that a better tree exists: ~X%". 92/92 shinytest2 pass. | +| T-088 | Shiny: Search config modal cleanup | B | Renamed labels, removed helpText, restructured layout. 92/92 shinytest2 pass. | + +## 2026-03-18 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-087 | Doc: `.AutoStrategy()` docs vs code mismatch | A | Fixed roxygen + AGENTS.md. | +| T-086 | Replace `suppressWarnings` with `expect_warning` in test-ts-profile.R | D | | +| T-085 | R CMD check refresh: 0E/0W/3N (benign) | C | Added test artifacts to `.Rbuildignore`. | +| T-084 | NEWS.md: Shiny/EasyTrees improvements section | F | | +| T-082 | Shiny: nThreads slider | — | Already implemented (triage). | +| T-083 | Shiny: Search config modal restructured | — | Already implemented. | +| T-081 | Shiny: Timeout slider verified done | F | Dataset-adaptive scaling. 
| +| T-080 | Shiny: renamed stop label | C | | +| T-079 | Shiny: Fix `maxProjDim` crash | A | | +| T-078 | Shiny: `PlotCharacter()` crash on multifurcating tree | C | | +| T-077 | Shiny: post-search confidence display | D | | +| T-076 | Wrap PrepareDataProfile warnings in test-ts-profile.R | C | | +| T-075 | Bench: ns/candidate cost linear in n_blocks | A | | +| T-072 | Shiny: result observer accumulates trees at matching score | C | | +| T-071 | Shiny: first-search trees unchanged (fixed via T-072) | C | | +| T-070 | Fix ts_wagner.cpp -Wcomment | C | | +| T-065 | Shiny mod: testServer() coverage pass | C | 92 assertions total. | +| T-064 | Shiny mod: Dissolve events.R + final integration | C | 75/75 shinytest2 pass. | +| T-063 | Shiny mod: mod_consensus | C | 1327-line module. 75/75 shinytest2 pass. | +| T-062 | Shiny mod: mod_data | B | Absorbs data.R + trees.R. 70/70 shinytest2 pass. | +| T-061 | Shiny mod: mod_search | A | 51/51 shinytest2 pass. | +| T-060 | Shiny mod: mod_clustering | B | 57/57 shinytest2 pass. | +| T-059 | Shiny mod: mod_treespace | A | | +| T-058 | Shiny mod: mod_downloads | E | 11-assertion testServer. | +| T-057 | Shiny mod: mod_references | ? | 4-assertion testServer. | +| T-056 | Shiny mod: AppState typed store | B | 34+ fields, 51/51 shinytest2 pass. | +| T-055 | Shiny mod: Three-file skeleton + source extraction | B | Split `app.R` → 3 files + 11 server/ files. | +| T-054 | Shiny mod: Capture shinytest2 baselines | B | FAIL 0 / WARN 0 / PASS 29. 
| + +## 2026-03-17 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-052 | Shiny: seed=TRUE, cache keys, library() fixes | A | | +| T-051 | Shiny: Removed redundant TreeDist install.packages | A | | +| T-050 | Shiny: `[1:3]` → `head(, 3)` in Excel preview | A | | +| T-049 | Shiny: Dead `r$newTrees` — added missing write | A | | +| T-048 | Shiny: onStop cleanup — tempdir() prefix + Excel cleanup | A | | +| T-047 | Shiny: `which.min(NULL)` crash guard | A | | +| T-045 | Shinylive plan for EasyTrees | D | Plan written, sub-tasks TBD. | +| T-044 | Documentation: custom.Rmd vignette framing | A | | +| T-043 | Documentation: SuccessiveApproximations() parameter names | D | | +| T-042 | Documentation: DESCRIPTION, README, cross-references | D | | +| T-041 | Fix Shiny app (EasyTrees) for new MaximizeParsimony params | D | | +| T-040 | All-ambiguous phyDat guard in TreeLength/MaximizeParsimony | D | | +| T-039 | Fully-resolving constraint crash: column-major indexing bug | A+D | | +| T-038 | Final CRAN prep: version bump 1.9.0, R CMD check | D | | +| T-037 | Post-T-025 cleanup: sprFirst default, regression benchmark | C | | +| T-036 | R-level test coverage for new MaximizeParsimony features | E | | +| T-035 | Deprecate legacy search functions | C | | +| T-034 | Migrate RandomTreeScore to C++ engine | A | | +| T-033 | Write NEWS.md for next release | D | | +| T-032 | Disable CSS by default (cssRounds=0) | C | | +| T-031 | Named strategy presets in R interface | C | | +| T-030 | Tier 1 MorphyLib migration: TreeLength + CharacterLength → C++ | C | | +| T-029 | Default parameter tuning: driftCycles 6→2, ratchetCycles 10→5 | C | | +| T-028 | Pass `min_steps` for IW scoring in MaximizeParsimony() | C | | +| T-027 | progressCallback SIGSEGV (secondary symptom of T-025) | E | | +| T-026 | Performance regression benchmark script | C | | +| T-025 | PreallocUndo buffer overflow (P0 crash) | E | | +| T-024 | Parallel resample (jackknife/bootstrap) | E | 
| +| T-023 | Expose `maxSeconds` in MaximizeParsimony R interface | B | | +| T-022 | R CMD check preparation | D | | +| T-021 | Fix pre-existing test failures in legacy test files | A | | +| T-020 | Fix RcppExports/init.c mismatch for Wagner bridges | A+B | | +| T-019 | Migrate AdditionTree() to C++ Wagner engine | E | | +| T-018 | Pass user-supplied starting tree to C++ engine | B | | +| T-017 | Add test coverage for ambiguous-token simplification | B | | +| T-016 | Add `precomputed_steps` offset in `precompute_profile_delta` | B | | +| T-015 | Copy scoring_mode + simplification fields in build_reduced_dataset | B | | +| T-014 | Fix `compute_fixed_steps` for all-ambiguous characters | B | | +| T-013 | Fix `is_uninformative` for ambiguous tokens | B | | +| T-012 | SPR→TBR escalation in driven search | E | | +| T-011 | Documentation refresh (Morphy, MaximizeParsimony, vignette, README) | B | | +| T-010 | MorphyLib deprecation plan | B | | +| T-009 | Audit rearrange.cpp dead code | B | | +| T-008 | Fix test-ts-simd.R exit code 127 crash (fuseInterval=0) | B | | +| T-007 | Wagner NA-incremental scoring | E | | +| T-006 | Audit and clean R-level TODOs | C | | +| T-005 | Phase 6E: Adaptive strategy selection | A | | +| T-004 | Phase 6D: Benchmarking framework | D | | +| T-003 | Phase 6C: Define strategy space | B | | +| T-002 | Phase 6B: Curate benchmark dataset suite | B | | +| T-001 | Phase 6A: Per-phase timing instrumentation | A | | + +## 2026-03-20 +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-149 | Profile MaddisonSlatkin → VTune hotspot analysis | A | logB/logPVec cache overhead dominates (53% DLL); skill updated | +| T-152 | LSEAccumulator long double→double + expl→exp | A | 26% avg speedup across k=3..5 boundary cases | +| T-153 | std::isfinite→NEG_INF compare in MaddisonSlatkin.cpp | A | bundled with T-152 | +| S-PROF | Threshold comment update + fpl_d/fpl_u batching in LogPVec | A | ~15% further speedup; k=3:27/k=4:19/k=5:13 
new thresholds | + +| T-154 | OAFlatMap for logB_cache/logPVec_cache in SolverT | A | Probe-layer OA map eliminates node ptr-chase; deque backing for logPVec stable refs; fixes latent LogB() dangling-ref UB; bd883459. Speedup vs post-fpl-batch baseline: k=3 n=27 ~4.4×, k=4 n=19 ~3.5×, k=5 n=13 ~12×. No formal A/B build (Windows AV makes double-build ~60 min); derived from S-PROF sweep timings. | + +## 2026-03-22 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| — | MaddisonSlatkin: test freeze fix (align pp-multistate with split_count gate) | A | Commit 4b95c37a | +| — | MaddisonSlatkin: Carter O(1) closed-form for k=2 + LSE lookup tables + buffer reuse | A | Commit 93851932. k=2 >100× speedup; k=3 ~5-19% | +| — | MaddisonSlatkin: threshold recalibration with correct bitmask encoding | A | Commit b8666825. OLD thresholds used wrong state encoding. k=3: 136→75, k=4: 100→50, k=5: 100→35 | +| — | MaddisonSlatkin: wall-clock time budget safety valve | A | Commit 536f7ff9. 2s chrono budget; every LogB/LogPVec call checks clock (~20ns overhead). Blowup returns NA + warning. 137 tests pass | +| — | Collapsed-region regraft merging + collapsed pool dedup | — | Goloboff (1996) approach: skip zero-length edges as clip candidates, skip interior collapsed regraft positions (boundary-only evaluation), diversity-aware pool eviction on ties, collapsed-topology pool dedup. 0% skip rate on standard morph datasets but improves pool diversity. 35b5ad99 | +| — | Strip dead CollapsedRegions code from hot path | — | region_id/n_regions never read by consumers; all callers now use compute_collapsed_flags() directly. a16373a7 | +| — | Collapsed dedup in parallel search path | — | ThreadSafePool::add_collapsed() + worker thread + fuse_round updated. 3648beb9 | +| — | Cross-replicate consensus constraint tightening | — | Opt-in consensusConstrain=TRUE: after ≥5 reps, lock pool strict consensus as topological constraints. Clears on new best score. 
build_constraint_from_bitsets(), extract_consensus_splits(). 09f69915 | +| — | Strategy preset tuning | — | default: wagnerStarts=3, sprFirst=TRUE, adaptiveLevel=TRUE. thorough: sprFirst=TRUE. Bundled in 09f69915 | + +## 2026-03-23 +| T-188 | Biased Wagner addition — API integration | Human+AI | `wagnerBias` (0=random, 1=Goloboff, 2=entropy) + `wagnerBiasTemp` in `SearchControl()` and `ts_driven_search`. First Wagner start uses biased order; remaining starts use random for diversity. Goloboff 2014 §3.3. Benchmarked: 80% Wagner→TBR gap reduction at 174t; marginal on ≤88t. | +| T-189 | Outer search cycle loop | Human+AI | `outerCycles` param in `SearchControl()`. Wraps [XSS+RSS+CSS → Ratchet → NNI-perturb → Drift → TBR] in outer loop, distributing cycles evenly. Matches TNT xmult pattern (Goloboff 1999 §2.3). `thorough` preset defaults to `outerCycles=2`. Backward-compatible: default=1. | +| T-184 | `maxTime` → `maxSeconds` alias | Human+AI | Already implemented in b8e56e2b. `maxTime` intercepted before `.morphyParams` check, mapped to `maxSeconds` with deprecation warning, routed to C++ engine. Verified: timings attribute present, Morphy() not called. | + +## 2026-03-24 (evening) +| T-209 | NNI perturbation constraint guard | E | Gate on `(!cd || !cd->active)`. PR #220 merged. | +| T-206 | Outer cycle reset cap / maxOuterResets | E+A | Cap resets to maxOuterResets (default 3). PR #218 merged. | +| T-210 | SA best-tree tracking across phases | C | `anneal_search()` saves/restores best tree at phase boundaries. Commit `e204d0a0` on feature/pt-eval. | +| T-182 | Adaptive ratchet perturbation probability | G+E+A | hitRate-based tapering. PR #221 created. | + +## 2026-03-25 +| T-208 | random_topology_tree ignores constraints | G+A | WAGNER_RANDOM fallback. Cherry-picked to cpp-search (24427c9a). PR #219 closed. | +| T-211 | Stale final_ in temper candidate scoring | C | Closed: conservative-only impact, not worth fixing. 
| + +## 2026-03-25 (morning) +| T-215 | cli progress bar `::` resolution fix | A | `pb_env` parent `baseenv()` → `environment()` in `MaximizeParsimony()`. Commit 908860d25. | +| T-216 | Shiny app `"brazeau"` → `"bgs"` | A | 8 occurrences in `mod_search.R` (comparisons, defaults, selectInput value). Commit 908860d25. | +| T-217 | `tree = NULL` in `MaximizeParsimony()` Morphy path | A | Added `!is.null(tree)` guard on Morphy delegation (line 485). Main path already correct. Commit 908860d25. | +| T-218 | Simplification transforms corrupt NA scoring | A | Full fix: `has_genuine_inapp` flag in `SimplifiedPattern` gates Phase 1 on genuine `-` only; `?`-only characters use Fitch (transforms safe). `build_dataset()` uses flag instead of token scanning. Initial conservative fix `08054102f`, full fix `c32e213bd`. | +| T-221 | [Shiny] Crash loop in cluster consensus concordance | B | `LabelConcordance()` guard `!is.null()` → `inherits(, "phylo")`. Commit `bc5313c22`. | +| T-222 | [Shiny] "Align tips" does nothing in Characters on trees | B | `Display` callback always set edge.length=1; now NULL when tipsRight checked. Commit `b23580823`. | +| T-223 | [Shiny] Tree plot left-aligned with excess white space | B | Display now sets edge.length=NULL (cladogram) to fill width. "Align tips" checkbox now redundant. Commit `280aa446d`. | +| T-224 | [Shiny] `.ts_driven_search_raw` not found | B | Already fixed by T-214 (commit `62658709d`): renamed broken callers from af7601b refactor. | +| T-225 | [Shiny] Tree space Connect shows nothing | B | `mapLines` checkbox was unnamespaced; module couldn't see it. Passed as reactive parameter. Commit `14277d04f`. | +| T-228 | [Shiny] Modal shows "Implied" not "Implied (extended)" | B | Fallback `"on"` → `"xpiwe"` to match reactive default. Commit `63e86f237`. | +| T-227 | [Shiny] Dataset dropdown hover polish | B | `:hover` pseudo-class for hover, `.selected` retains blue bg on hover. Commit `fd401ec81`. 
| +| S-COORD | Coordination review round 18 | B | Created coordination.md (local). PR #210 GHA in flight. No unresolved bugs on cpp-search. 3 OPEN P3 tasks deferred. | + +## 2026-03-25 (afternoon) +| T-214 | Multi-split constraints not enforced during TBR search | C | RANDOM_TREE falls back to Wagner when constrained; fuse uses direct map_constraint_nodes check. 26/26 + 806/806 tests pass. GHA 23542642164 PASS (both platforms). Commit `3f8792c5f`. | +| T-231 | [Shiny] Search stopping criteria mismatch | C | Root cause: consensus stability stopping preempts targetHits without explanation. Fix: capture `consensus_stable`/`timed_out` from search result, pass `stopReason` to `SearchConfidenceText()`. User now sees "Search stopped: consensus tree unchanged across recent replicates." Commit `e411117b4`. | +| S-RED | Red-team review focus 2: Search topology invariants | C | Reviewed ts_tbr.cpp, ts_drift.cpp, ts_search.cpp, ts_tree.h/.cpp (2856 lines). Found T-235: SPR stale state arrays after rejected regraft (NA/IW screening degradation, low practical impact). TBR/drift/NNI topology invariants verified correct. | +| T-230 | [Shiny] Replicate-count warning when verbosity=0 | B | Gate behind verbosity > 0L. Stale-install part already fixed by T-214. Commit a37984dfa. | +| T-234 | [Shiny] Context-dependent references | D | references_server now takes weighting reactive; Goloboff1993 shown for IW/XPIWE, Goloboff2014 for XPIWE only. Standing refs always shown. 3 new tests. Commit `4cfb37e12`. | +| T-235 | [Bug] SPR stale state after rejected regraft | B | full_rescore after spr_unclip on rejection path. Commit aafeed219. 78 tests pass. | +| T-226 | [Shiny] Remove "Trees in sequence" connect mode | B | Pool order is arbitrary under C++ search; option removed from UI. Commit 898e5e03c. | +TASKEOF 2>&1 +| S-COORD | Coordination review round 19 | D | T-214 GHA passed (23542642164). T-212 unblocked, re-dispatched by B. T-233/T-236/T-237 completed by A. 
Shiny bug backlog cleared (5 triaged this session, all resolved). 3 OPEN specific tasks remain. Updated S-PR notes. | +| S-RED | Red-team review focus 3: Ratchet & perturbation | D | Reviewed ts_ratchet.cpp (244), ts_sector.cpp (1007), ts_fuse.cpp (522). All 9 scenarios verified correct. Perturbation save/restore (IW/XPIWE), sector acceptance logic, fuse node mapping, root-structure validation all correct. No bugs found. | +| T-240 | [Shiny] Pool suboptimal filter not applied mid-search | D | Accumulation path deduped but didn't filter by tolerance. Added TreeLength rescore + filter on combined trees. Commit `340d78381`. | +| S-RED | Red-team review focus 4: Parallelism & RNG | D | Found and fixed parallel consensus stability bug: idle polls incremented unchanged counter causing premature termination. Also noted R_CheckUserInterrupt/longjmp fragility. 9 items verified, 1 bug fixed. | +| T-208 | random_constrained_tree() — constraint-aware random topology | A | Builds topology respecting all constraint splits. Orders splits smallest→largest, assigns tips to tightest enclosing split, resolves randomly. PR #229. | +| T-211 | impose_constraint() bail-out, return value, root-child bugs | A | Three fixes: threshold n_tip/4→n_tip, return -1 on bail-out, new topology_spr() for root-child moves. PR #229. | +| T-187 | Perturbation-count stopping rule (perturbStopFactor) | D+Z | IQ-TREE-style stopping rule. Benchmarked across 10 datasets: PSF=2 gives 2.4–6.9x speedup with 0 score loss. Default changed from 0 to 2. PR #226 merged. | +| T-258 | Intra-replicate fusing | F | Added `intraFuse` param to SearchControl/DrivenParams. Fuses current tree against pool donors after TBR polish (step 6b). Disabled in parallel mode (already has between-replicate fusing). Critical fix: `build_postorder()` + `reset_states()` after `tree_fuse()` to prevent segfault from stale state arrays. 221 tests pass. 
Merged directly to cpp-search (`924bfb35`). Benchmark script: `dev/benchmarks/bench_intra_fuse.R`. | +| T-243 | FlatBlock metadata, flat EW indirect functions, TBR prefetch | E | PR #230 merged to cpp-search. Confirmed 1.4% speedup at 180 tips on Hamilton (median 11.538→11.360s, p=0.001, n=10). TS-HotLoop worktree removed. | + +## 2026-03-26 (afternoon) +| T-255 | Reduce drift in default and thorough presets | E | Set `driftCycles=0` in default and thorough presets. T-254 confirmed drift has zero score, MPT, or diversity benefit and costs 10–22% of replicates. GHA 23598220226 PASS (ARM64 + Windows, 0 errors, 0 warnings). Also fixed: SearchControl.Rd codoc mismatch (0152daa3), flaky timeout test perturbStopFactor (161e0e1b). | +| T-260 | VTune TBR per-evaluation overhead profiling | E | Dikow2009 88t, EW, 1000 TBR passes, 30.96s CPU. Top hotspots: StateSnapshot save/restore 14.6%, reset_states zeroing 9.1%, fitch_na_score 29.2%. Non-scoring overhead = 37.8%. Filed T-261/T-262/T-263. Write-up: `dev/benchmarks/vtune_tbr_analysis.md`. | +EOF 2>&1 +| T-261 | Eliminate std::fill zeroing in reset_states() | E | Removed 5 redundant std::fill(0) calls from reset_states() in ts_tree.cpp. Audited all Fitch scoring passes to confirm every array entry written before read. PR #232, merged to cpp-search. | +| T-262 | Bulk memcpy for tip state loading | E | Replaced element-by-element tip copy with std::memcpy() in load_tip_states(). Combined T-261+T-262 = 8.6% TBR speedup (Dikow2009, 88t). PR #232, merged to cpp-search. | +EOF 2>&1 + +## 2026-03-26 + +| T-265 | Per-replicate search quality regression — RESOLVED as scoring method confound | E | T-249/T-264 compared Brazeau-scored TreeSearch to EW-scored TNT. Apparent mean gap +17.8 steps; actual EW-vs-EW gap +2.2 steps. 5/11 datasets at 0 gap. R2-equiv/R2-modern/auto preset all find identical Brazeau scores — no preset or engine regression. Also found stale .agent-E library caused T-249 early termination artifact. 
| +| T-249 | TNT comparison round 3 — validated | E | Hamilton job 16596844 results validated. Large apparent gaps were scoring method confound (Brazeau vs EW). Future TNT comparisons must use fitch_mode() for apples-to-apples. | +| T-264 | consensusStableReps fix — verified | E | GHA 23600674681 passed both platforms. Scoring confound resolved; fix is correct. | + +## 2026-03-27 + +| ID | Description | Agent | Notes | +|----|-------------|-------|-------| +| T-267 | MaddisonSlatkin 5-state test resilience | A | Test now skips when computation hits the 2s time budget on slow CI machines, instead of failing with NA. | +ENDMARK 2>&1 +| T-263 | Hoist StateSnapshot save to once per TBR pass | F | Eliminated per-candidate memcpy save/restore of prelim/final_/down2/subtree_actives (~190 KB at 88 tips). Saved once before TBR pass, restored once after. PR #231 merged 2026-03-27. | +| T-246 | AVX2 runtime dispatch for Fitch SIMD operations | F | Widened ts_simd.h from SSE2 (128-bit) to AVX2 (256-bit) with `__builtin_cpu_supports("avx2")` runtime detection and SSE2 fallback. Estimated 5–10% on multi-block datasets. PR #233 merged 2026-03-27. | +| T-257 | Post-ratchet sectorial search pass | F | Added second sectorial pass after ratchet: [XSS+RSS+CSS → Ratchet → XSS+RSS+CSS → TBR]. Controlled by `postRatchetSectorial` in SearchControl(). PR #234 merged 2026-03-27. | +| T-270 | Algorithm vignette + AGENTS.md for T-257 post-ratchet sectorial | A | Added pipeline step 5a, new "Post-ratchet sectorial pass" subsection in vignette; fixed stale consensusStableReps preset docs (T-264); updated AGENTS.md pipeline steps. commit d8f3c769. | +EOF 2>&1 +| T-272 | Close stale PR #178 (concordance, Aug 2025 DRAFT) | A | Closed via `gh pr close 178`. | +EOF 2>&1 +| S-RED-10 | Red-team focus 10: Profile & IW scoring | A | BUG FIXED: precompute_profile_delta old_cost=0 when s>info_max_steps. commit 7cff7870. 
| +| T-268 | Branch housekeeping: prune 11 stale local branches, update AGENTS.md worktree table | F | Deleted branches: 143-start-search-from-best-tree, 175-discord, 207-calls-to-rf_error, PlotCharacter-rooting, constraint-addition, copilot/fix-maximumlength-crash, keep-subopt, kmeans++, pol-escapa-negative, taxon-influence, tbr-fix. Updated worktree table (TS-anneal/MadSlat/ParsSim/PTeval/Xpiwe removed; TS-PruneRI added). Triaged u.005. commit 838b14c1. | +| T-273 | Fix flat_blocks.active_mask staleness during ratchet | F | Preventive: sync ds.flat_blocks[b].active_mask in perturb_zero(), perturb_mixed(), restore_perturb_state(). Currently safe (zero call sites for flat indirect), prevents future bug if flat variants wired into ratchet TBR dispatch. commit 44547484. | +| T-274 | NNI-perturb cycle count at thorough-preset scale | F | Per-replicate sampling, 20 seeds, Zhu2013/Giles2015/Dikow2009 (75–88t). NNI-perturb adds 59–69% overhead, ≤0.1-step expected-best benefit — zero practical advantage. Set nniPerturbCycles=0 in thorough preset. bench_t274_nni_perturb.R. | + +## 2026-03-27 + +| F-003 | T-276: convergence summary after MaximizeParsimony() | 3 | perturb_stop in Rcpp bridge + structure attrs; verbosity summary (score/reps/last_improved/n_MPTs/stop_reason/elapsed); docs + 3 new tests. commit 7f4aca29. GHA 23647640670. | +| F-004 | T-252: Hamilton MorphoBank training-set baseline benchmark | 3 | 25-matrix fixed training sample at 30/60/120s, 5 seeds, EW. SLURM 16599543 complete. Results: ≤35t converge at 30s (0 gap); 36-65t near-optimal; 66-135t still improving at 120s (up to 238 steps); project4284 (4062t) can't finish 1 replicate. CSVs in dev/benchmarks/. T-253 unblocked. | +| F-005 | S-RED focus 5: ts_parallel.cpp | 1 | Bug: result.perturb_stop not initialized (UB) and not set to true when perturb-stop fires in parallel path. Serial path had both correct. commit 1a640b73. GHA 23648703841. 
| +| B-003 | T-277: ScoreSpectrum() Chao1 coverage estimator | 8 | C++ replicate_scores vector, Rcpp bridge, ScoreSpectrum() R function, Shiny coverage note, 8 tests. PR #236 merged 2026-03-27. | +| E-001 | NEWS.md SearchControl parameter additions | 18 | Added nniFirst, postRatchetSectorial, outerCycles/maxOuterResets, wagnerBias/BiasTemp, perturbStopFactor, pruneReinsertCycles/Drop/Selection, nniPerturbCycles/Fraction, anneal*, adaptiveLevel, adaptiveStart, enumTimeFraction, intraFuse, ratchetTaper, consensusConstrain, consensusStableReps. Added Search output section (convergence summary T-276). Fixed consensusStableReps bullet. commit 8ab23af2. | +| F-006 | T-253: Gap characterization by dataset features | 2 | ntax is dominant predictor (ρ≈0.63 in two independent datasets). pct_missing/inapp weakly correlated but confounded with ntax. nchar matters only at extremes (>2000). Results: t253_gap_characterization.md + 2 CSVs in dev/benchmarks/. T-245 (TBR batching) identified as top priority for >75-taxon regime. | +| F-007 | S-RED focus 6: ts_tbr.cpp (1025 lines) | 2 | Two findings: (1) dead-code latent bug line 1013 — `full_rescore()` return not captured in `!states_valid` branch (unreachable currently). (2) Real bug: after constrained move is applied+score-rejected+restored, `cd->constraint_node` is stale. Next clip's `classify_clip_constraints` uses wrong mapping. T-278 filed. | +| F-008 | S-RED focus 8: ts_drift.cpp (796+42 lines) | 2 | Two bugs fixed in drift_phase(): (1) update_constraint() missing after RFD reject in IW path (lines 647-649) and EW path (line 689) — stale cd->constraint_node causes false constraint violations on next clip. (2) Redundant full_rescore in EW reject path: return value at line 657 discarded, second call at line 689 eliminated. Both fixed in feature/drift-constraint-fix commit e85ec84f. T-279 filed+fixed. | +| F-009 | S-RED focus 9: ts_fuse.cpp (521+41 lines) | 0 | No bugs. 
Minor design note: is_ancestor_split using raw bitsets may be slightly off in rounds 2+ if TBR moved tip0 away from root (conservative impact — no score correctness issue). All key invariants (full rescore, undo/redo, canonical matching, lazy donor caching) correct. | +| F-010 | S-RED focus 10: ts_driven.cpp (1056 lines) | 0 | No bugs. Confirmed correct: goto-finish, adaptive_params, SA best-tree save/restore, MPT check_enum_timeout (full deadline), intra-fuse const pool. Extra score_tree calls at verbosity>=2 harmless. | +| F-011 | S-RED focus 11: ts_sector.cpp (1007 lines) | 0 | No bugs. Key guards: root_ok in search_sector, full rescore after reinsertion, HTU revert on worsening, score_tree before build_reduced_dataset, post-hoc constraint check. from_above uses EW Fitch approximation (accepted). | +| F-012 | S-RED focus 12: ts_pool.cpp (335 lines) | 0 | No critical bugs. Minor finding: tbr_search collect_pool uses add() (full splits) but driven_search uses add_collapsed() — mixed entries can miss duplicates for zero-length-edge trees; conservative/rare. Core logic (hits_to_best, evict, diversity eviction, consensus hash, sft) all correct. | +| F-013 | S-RED focus 14: ts_constraint.cpp (735+143 lines) | 0 | No bugs. build_constraint canonicalization, map_constraint_nodes postorder scan, classify_clip_constraints 4-case logic, regraft_violates_constraint MUST_OUTSIDE boundary exception, topology_spr root-child case, impose_constraint bounded multi-pass repair — all correct. Confirmed constraint staleness bugs (T-278, T-279, E-003) are in callers, not in this module. | +| E-002 | SearchControl Rd completeness check | F | All 47 formals documented; all consumed by C++. Only gap: print.SearchControl missing maxOuterResets (TBR group) and enumTimeFraction (Stopping group). Fixed in b100b9d4. | +EOF 2>&1 +| F-014 | S-RED focus 15: ts_wagner.cpp (1133+102 lines) | 0 | No bugs. Incremental Fitch scoring (downpass+uppass with early termination) correct. 
LCA-based constraint mapping during construction correct (cn==root skip, MUST_OUTSIDE boundary exception). softmax_sample_order numerical stability verified. random_constrained_tree buffer safety verified via canonicalization invariant (tip 0 always outside prevents all-tips-in-one-split overflow). resolve_randomly subtree_root tracking effectively dead (defensive). Constraint retry loops (100 attempts) silently return on all-fail; driven search impose_constraint is additional safety net. | +EOF 2>&1 + +## 2026-03-28 + +| E-003 | SearchControl.Rd codoc fix (pruneReinsertTbrMoves) | 0 | GHA: all platforms "Codoc mismatches from SearchControl.Rd". pruneReinsertTbrMoves added to function + roxygen but Rd not regenerated. Added to \usage and \arguments manually. commit fdf25673. GHA 23687210711 dispatched. | +EOF 2>&1 +| G-001 | T-290: Brazeau-track phase profiling + wagnerStarts analysis | 3 | Phase profiling {Brazeau,Fitch}x{EW,IW10}x{default,thorough} on 6 datasets (23-173t), 30s, 3 seeds. Key: Wagner 3.6-5.2x more expensive under Brazeau; ratchet 1.1-1.3x; rep rate near-identical. wagnerStarts=3 explained by med_reps=0 datasets — better starting topology dominates when TBR convergence takes >30s. Fitch-tuned presets appropriate for Brazeau. strategies.md updated. T-290c quick confirmatory run launched locally. Results in TS-TNT-bench dev/benchmarks/t290_results/. | +| E-004 | T-291: bench_framework.R benchmark_run() interface update | 0 | Replaced flat do.call() with three structured lists (searchControl/runtimeConfig/scoringConfig). do.call(SearchControl, strategy) fills defaults. commit f1ed5dfc. | +| E-005 | S-RED: ts_strategy.h + ts_temper.h/.cpp | 0 | No bugs. Naming: stochastic_tbr_phase does SPR not TBR (internal only). Intra-phase best-tree not tracked in anneal_search (known design limit, T-210 addressed inter-phase). All correctness invariants correct. | +| E-006 | T-289: Prune-reinsert benchmark — COMPLETE | 5 | Stages 1-4 complete. 
Stage 4 (5 datasets, 131-206t, 10 seeds): PR overhead too high at 60s for large trees (0 reps at 206t). Modest improvement at 120s (mean -9.1 steps, outlier-driven). Decision: disable PR in large preset. commit 74698524. | +| F-028 | F-027 WORDLIST fix: restore 'config'+'warmup' for R 4.1 CI | 0 | ef83e8db was removed by mistake in e6ad6e3e; restored. GHA 23656560997 PASSED. | +| F-029 | T-269: Fine-grained sectorial interleaving benchmark | 4 | 4 datasets (37–88t), outer_cycles 1/2/4/10/20, 5 seeds, 30s. Higher outer_cycles uniformly reduces replicate throughput (20 cycles → 9 vs 54 reps on Dikow2009) with no score improvement. Decision: outerCycles=2 in thorough preset is optimal; no change needed. | +| F-015 | T-279: Constrained drift stale constraint_node + redundant rescore | 1 | PR #237 merged 2026-03-28. Fixed update_constraint() missing after IW+EW RFD reject in drift_phase(); removed redundant full_rescore in EW reject path. ts_drift.cpp commit e85ec84f. | +| F-016 | T-245: TBR 4-wide candidate batching | 1 | PR #238 merged 2026-03-28. fitch_indirect_cached_flat_x4()/na variant; TBR rerooting inner loop batch-of-4 for flat EW path; IW/upweight fallback to scalar. GHA 23690208221 PASS. Hamilton benchmark pending. | +EOF 2>&1 +| G-002 | T-289f Stage 5: Prune-reinsert NNI vs TBR polish (Hamilton) | 1 | SLURM 16622421 (7h, EPYC 7702). 5 datasets (131-206t), 20 seeds, 60/120s, EW. pr_nni wins 7/10 EB conditions (huge on project3701 -178 median at 60s; modest at 173-180t; regresses at 206t). pr_tbr harmful (1/9 wins; 0 reps at 206t/60s). Decision: not enabled in preset. strategies.md updated. CSV: TS-TNT-bench/dev/benchmarks/t289f_results/. | + +## 2026-03-29 + +| PA-001 | TBR clip ordering experiment: Phase 1 diagnostic | 0 | Instrumented tbr_search() with per-pass diagnostics (ClipOrder enum, TBRPassRecord, ts_tbr_diagnostics() bridge). Ran 10 seeds × 4 datasets (23–88t). 
Finding: tip clips (~51% of clips) account for only 22–38% of accepted moves (enrichment 0.43–0.76×). Medium-small clips (2..√n) are most productive. All three proposed ordering variants (INV_WEIGHT, TIPS_FIRST, BUCKET) favour tips — the opposite of productive direction. Phase 4 outcome: hypothesis FALSIFIED. Branch feature/weighted-clip-order to be closed. | +| PA-002 | XSS↔TBR cycling experiment: IW vs EW + large-tree scaling | 0 | 5 datasets (62–180t), 20 seeds, 3 scoring modes (EW/IW10/IW3), 774s total. **Hypothesis (IW benefits more from XSS than EW): weak signal — IW3 rate ~30% vs EW ~25%, well below the 2× threshold.** Real finding: benefit scales with tree SIZE, not scoring mode. At ≤88t: XSS cycling adds 24–57% overhead with zero TAEB benefit (hundreds of reps per budget anyway). At 180t: XSS cycling adds only 12–19% overhead and yields TAEB Δ of −6.8 to −9.8 EW steps (30–120s budgets); IW3 Δ −0.8 to −1.2 score units. 13/20 seeds improve at 180t under all scoring modes. Mean 1.8–2.0 cycles to joint convergence. **Decision: no IW-specific XSS treatment needed; existing XSS pipeline adequate; consider outerCycles=2 for large preset.** Plan: `.positai/plans/2026-03-29-0609-...`. Results: `TS-sector-expt/dev/benchmarks/expt_tbr_xss_v2_results.rds`. | +| PA-003 | Targeted post-clip sector search: C++ instrumentation | 0 | Instrumented `tbr_search()` with `targeted_sector` flag: after each accepted strict-improvement TBR move, runs sector-masked TBR on the just-moved clip subtree. 5 datasets (62–180t), 20 seeds, EW/IW10/IW3, 1214s. **Targeting hit rate ~35% regardless of scoring mode (EW 34.7%, IW3 36.1%) — no IW-specific benefit.** But targeting is NET HARMFUL: mbank_X30754 EW TAEB Δ +17 to +34 steps at 30–120s; Zhu2013/Giles2015 EW +1–2 steps. IW3 shows tiny benefit (−0.1 to −0.3). Mechanism: local sector refinement after each move changes the global TBR trajectory, steering into worse basins. 
This explains why the existing pipeline (XSS as a separate phase AFTER TBR convergence) is correct — interleaving within TBR is counterproductive. **Decision: hypothesis CLOSED. Do not implement.** Plan: `.positai/plans/2026-03-29-0731-...`. Results: `TS-sector-expt/dev/benchmarks/expt_targeted_sector_results.rds`. | +| E-001 | T-289f Stage 5: PR-NNI polish benchmark + large preset update | 0 | SLURM 16622483 (EPYC 7702, 7h12m). 300 runs: 5 datasets (131–206t), 3 configs (baseline/pr_nni/pr_tbr), 2 budgets, 10 seeds. pr_tbr (TBR polish) confirmed catastrophic at 206t/60s (0 reps). pr_nni (NNI polish) fixes 0-rep failure and improves 131–180t: project3701 146t −178 steps (60s), −128 steps (120s); project804 173t −9/−2; mbank_X30754 180t −4/−7. syab07205 206t: +17.5 at 60s (neutral at 120s). Decision: enabled pruneReinsertCycles=5, pruneReinsertNni=TRUE in large preset. Results: dev/benchmarks/t289f_pr_nni_polish.csv. | +EOF 2>&1 +| E-002 | G-006: Fix nni_full ignoring ConstraintData in prune_reinsert_search() | 0 | Guard: `if (params.nni_full && (!cd || !cd->active))` — when constraints are active, fall through to TBR (which enforces them), exactly mirroring the `nni_wagner` guard in ts_driven.cpp. One-line fix in ts_prune_reinsert.cpp with explanatory comment. Only affects users who combine pruneReinsertNni=TRUE with topological constraints; no preset does this. | +EOF 2>&1 +| F-030 | TBR clip-ordering Phase 2: full propagation + docs fix | 1 | Phase 1 had null result because clip_order only reached ~10% of TBR calls (Wagner warmup + final polish). Fixed by adding clip_order to RatchetParams and SectorParams, propagating through ts_driven.cpp, and applying at all TBR call sites in ts_ratchet.cpp and ts_sector.cpp. Benchmark (5 seeds, 30s): TIPS_FIRST +8–13% throughput on Zhu2013/Dikow2009 (thorough preset, 75–88t); neutral/−2% on Agnarsson2004 (default, 62t). No preset defaults changed (clipOrder=0 RANDOM). GHA 23708949592 PASS. PR #239 open → cpp-search. 
| +EOF 2>&1 diff --git a/coordination.md b/coordination.md new file mode 100644 index 000000000..c98951b1a --- /dev/null +++ b/coordination.md @@ -0,0 +1,760 @@ +# TreeSearch — Strategic Coordination + +## S-COORD Round 46 Summary (2026-03-29 07:40 BST, Agent E) + +**T-289f complete — Stage 5 NNI polish benchmark + large preset update:** +SLURM 16622483 completed (7h12m, EPYC 7702). 300 runs: 5 datasets (131–206t), +configs baseline/pr_nni/pr_tbr, 60s+120s, 10 seeds. +- pr_tbr (TBR polish): confirmed Stage 4 failure — syab07205 (206t) still 0 reps at 60s. +- pr_nni (NNI polish): fixes 0-rep failure; improves 131–180t: project3701 (146t) −178 + steps at 60s / −128 at 120s; project804 (173t) −9/−2; mbank_X30754 (180t) −4/−7. + syab07205 (206t) +17.5 at 60s, neutral at 120s — acceptable. +- Decision: **pruneReinsertCycles=5, pruneReinsertNni=TRUE enabled in large preset**. + commit 4a549eb4. Results: dev/benchmarks/t289f_pr_nni_polish.csv. AGENTS.md updated. + +**G-006 fixed — NNI constraint guard in prune_reinsert_search():** +One-line guard `if (params.nni_full && (!cd || !cd->active))` in ts_prune_reinsert.cpp. +When constraints active, falls through to TBR (which enforces them). Mirrors the +`nni_wagner` guard in ts_driven.cpp. Task deleted from to-do.md. + +**GHA 23703257153 in progress** on cpp-search (covers 4a549eb4 + G-006 fix). + +**PR status:** +- #213 (T-150, CID consensus): GHA PASS, awaiting human merge. +- #216 (T-204, native search): GHA PASS, awaiting human merge. +- #210 (cpp-search→main, DRAFT): Re-run 23702009435 in progress; previous failure was + Windows covr only (transient/infra — tests passed FAIL 0/PASS 11021). + +**Task queue:** Extremely sparse. Only standing tasks + T-280–288 (all WORKTREE/AltHom). +Standing tasks at **P1** (<3 open specific tasks). + +**Next:** S-RED (review alt-homology modules when T-280 merges, or review ts_search.cpp +and ts_nni_perturb.cpp which haven't been reviewed). S-PR to check PR status. 
+ +## S-COORD Round 42 Summary (2026-03-28 16:10 GMT, Agent F) + +**T-269 complete — Fine-grained sectorial interleaving (30s, 4 datasets, outer_cycles 1/2/4/10/20):** +Higher outer_cycles uniformly reduces replicate throughput with no score benefit. +At outer_cycles=20: Dikow2009 gets 9 reps vs 54 at baseline; Zhu2013 gets 16 vs 88. +Scores are flat or marginally worse at high outer_cycles. The current outerCycles=2 in the +thorough preset is optimal; no preset change needed. + +**T-289 complete (E) — Stage 4 confirms disable-PR decision:** +5 datasets 131–206t, 10 seeds, 60s/120s. Key: syab07205 (206t) gets 0 PR reps at 60s +(per-rep cost ≈ 60s, budget exceeded). project3701 (146t) regresses 12 steps mean at 60s. +commit 746985243 disables pruneReinsertCycles in large preset. Available via SearchControl(). + +**F-027 WORDLIST fix — PASSED (GHA 23656560997).** Both 'config' and 'warmup' restored. + +**PR #210 (cpp-search→main):** codoc fix fdf25673 in place; R-CMD-check run 23688837232 +in progress. Previous pre-existing failures (Windows covr, R-devel rlang, ASAN TBB ODR) +are infra issues, not package check failures. + +**Open PRs:** #213 (T-150, GHA PASS), #216 (T-204, GHA PASS), #237 (T-279, GHA PASS). +All three await human merge. + +**Task queue:** T-245 (P3, TBR batching) is only open specific task. Standing tasks P1. +S-RED next: ts_mc_fitch.cpp, ts_tabu.h, ts_prune_reinsert.h (222 lines, unreviewed). + +## S-COORD Round 41 Summary (2026-03-28 14:35 GMT, Agent E) + +**Codoc fix — SearchControl.Rd (E-003):** +All R-CMD-check platforms failing on PR #210 since 2026-03-28 06:25 with +"Codoc mismatches from SearchControl.Rd". Root cause: commit 22f929cf +(`pruneReinsertTbrMoves` param, T-289) added the parameter to the function +and roxygen `@param` but the Rd file was not regenerated. Fix: manually added +`pruneReinsertTbrMoves = 5L` to `\usage` and its `\item` to `\arguments` in +`man/SearchControl.Rd`. Commit fdf25673. 
PR #210 CI re-triggered (run +23687279706, pending). Agent-check GHA 23687210711 also dispatched. + +**T-289 Stage 4 — Hamilton SLURM 16621426:** +Stage 3 confirmed MISSING criterion (sel=2, c=5, d=5%) gives −14.7 steps at +180t/60s. Large preset updated. Stage 4 validating across 5 matrices +(131–206t) at 60s/120s, 10 seeds, 200 runs. Submitted 2026-03-28 ~08:00 GMT, +~5h wall time. SSH unavailable — poll later. + +**F-027 WORDLIST fix (GHA 23656560997) — PASSED.** Resolved. + +**PR status:** +- #210 (cpp-search→main): CI re-running with codoc fix; was failing since 06:25. +- #213 (T-150, CID consensus): GHA 23650002703 PASS, awaiting merge. +- #216 (T-204, native search): GHA 23649607006 PASS, awaiting merge. +- #237 (T-279, drift constraint fix): GHA 23650290962 PASS, awaiting merge. + +**Task queue:** T-289 PARKED (Hamilton), T-269 PARKED (Hamilton), T-245/T-290/T-291 +OPEN. 3 open specific tasks → standing tasks P2 effective. + +**GHA 23687804562 results (PR #210, post-codoc-fix):** All 5 release platforms +PASS. Remaining failures are pre-existing infra issues: Windows covr path, R-devel +rlang DLL, ASAN RcppParallel TBB ODR. All in dep-install or coverage steps, not +"Check package". PR #210 ready for human review. + +**T-291 complete (E-004):** bench_framework.R benchmark_run() updated to new +ts_driven_search structured-list interface. commit f1ed5dfc. + +**Next:** Poll Hamilton for T-289 Stage 4 results when SSH is available. + +## S-COORD Round 39 Summary (2026-03-27 16:05 GMT) + +**GHA results confirmed (all PASS):** +- GHA 23653228247 (F-015: ratchet constraint staleness) — **PASSED** +- GHA 23653513217 (F-016: NNI-perturb constraint staleness) — **PASSED** +- GHA 23653782359 (F-018: prune-reinsert constraint staleness) — **PASSED** + +All constraint-staleness fixes now validated on both platforms. The full sweep +(TBR T-278, drift T-279, sector E-003, ratchet F-015, NNI-perturb F-016, +prune-reinsert F-018) is complete. 
All 6 constrained search modules now +consistently call `update_constraint(tree, *cd)` after any topology revert. + +**Hamilton SSH unavailable** — can't poll T-289 (SLURM 16607721) or T-269 +(SLURM 16607719/16607720). Jobs were submitted ~1.5h ago; T-289 ETA ~2.7h +from submission, so likely still running. Results will be in `t289_results/` +and `t269_results/` when complete. + +**PR status:** #213 (T-150), #216 (T-204), #237 (T-279) still awaiting human +merge. No new PRs needed (F-015/016/018 were direct cpp-search commits). + +**Task queue:** T-289 PARKED, T-269 PARKED, T-245 OPEN (only specific open +task). Standing tasks now **P1** (<3 open specific tasks). + +**Agent F next:** S-RED focus 23 (ts_fitch.cpp, 844+288 lines — core Fitch +scoring engine). + +## S-COORD Round 37 Summary (2026-03-27 15:15 GMT) + +**T-289 dispatched (F):** Prune-reinsert benchmark Stage 1 submitted to Hamilton +SLURM 16606222. 13 configs × 4-5 datasets × 5 seeds × 30s ≈ 325 runs, ETA ~2.7h. +Fixed Rscript invocation bug in t289_hamilton.sh: `Rscript -e "expr" file.R` does +NOT source file.R. Use `export R_LIBS_USER; Rscript file.R` (T-252 pattern). +Also committed bench_prune_reinsert.R which was untracked. commits 5b0c0ad5 + 03e981f8. + +Note: t265_hamilton.sh has the same Rscript bug but T-265 is complete. + +**F-015 / S-RED focus 16 — ts_ratchet.cpp (259+61 lines):** +Bug found and fixed directly to cpp-search (same pattern as E-003). +**Constraint staleness after best_tree revert:** in ratchet_search() non-escape +path, `update_constraint(tree, *cd)` was missing after copy_topology(best_tree) + +build_postorder + reset_states. Next cycle's perturbed TBR used stale DFS timestamps. +Same class as T-278/T-279/E-003. commit ae6a3528. GHA 23653228247 running. +All other invariants correct: save/restore state, FlatBlock sync (only active_mask +needed — FlatBlock has no upweight_mask field), perturb modes, adaptive tuning. 
+ +**PR status:** #213 (T-150), #216 (T-204), #237 (T-279) all GHA-passed, awaiting +human merge. No change since round 36. + +**Task queue:** T-289 PARKED, T-245 OPEN (P3), T-269 OPEN (P3). Standing tasks P2 +(effective 3 open tasks counting T-289 parked). + +**Agent F next:** Park T-289 GHA (23653228247). Take T-269 (fine-grained sectorial +interleaving benchmark) — this can run locally on the Hamilton session. + + +Last updated: 2026-03-27 14:55 GMT (S-COORD round 35 by F) + +## S-COORD Round 35 Summary (2026-03-27 14:55 GMT) + +**T-253 complete (F):** Gap characterization by dataset features done. +ntax is dominant predictor of search difficulty (ρ≈0.63 in both T-265 fitch-mode gaps +and T-252 mbank convergence gaps). nchar matters only at extremes (>2000). pct_missing/ +pct_inapp weakly correlated but likely confounded with ntax. T-245 (TBR batching) +confirmed as highest-priority next step for ≥75-taxon regime. Results in +`dev/benchmarks/t253_gap_characterization.md`. commit d05638e5. + +**T-150 WORDLIST fix (F):** "Splitwise" was missing from inst/WORDLIST — the spell-check +test failure root cause. Added and re-dispatched as GHA 23648875258. Previous GHA +23648267378 failed on this (and only this) issue. + +**3 GHAs running:** +- 23648875258 (T-150, feature/cid-consensus, PR #213) +- 23648401936 (T-204, feature/native-search, PR #216) +- 23648703841 (S-RED fix, cpp-search: perturb_stop in parallel path) + +**Task queue:** 2 unblocked OPEN specific tasks (T-245, T-269) + E-002 (soft-blocked +on T-150/T-204 merge) + E-001 (ASSIGNED E). Standing tasks at **P2** (3–5 open). +Next priority: S-RED focus 6 (ts_tbr.cpp review) while GHAs run. + +**Agent F next:** S-RED focus 6. + +## S-COORD Round 34 Summary (2026-03-27 13:45 GMT) + +**T-277 (ScoreSpectrum, B):** Merged via PR #236 to cpp-search. Removed from to-do.md; added to completed-tasks.md. + +**T-276 (convergence summary, F):** DONE. GHA 23647640670 PASS. Removed from to-do.md. 
+ +**S-RED focus 5 (ts_parallel.cpp, F):** Bug fixed — `result.perturb_stop` not initialized (UB) and not set in parallel path. commit 1a640b73. GHA 23648703841 running. + +**ASan.yml fix (E):** `pak::pak("r-lib/rlang")` approach broken — GitHub dev rlang 1.1.7.9000 also embeds `PREXPR` in `src/rlang/rlang-types.h`. New approach: patch CRAN source tarball with `#ifndef PREXPR / #define PREXPR(x) R_PromiseExpr(x) / #endif` shim before `R CMD INSTALL`. commit 05261c34. GHA 23648993981 dispatched to verify. + +**Agent C file stale:** agent-c.md still shows T-214 as PARKED, but T-214 was completed (GHA 23542642164 PASS, per completed-tasks.md). C should update agent-c.md on next assignment. + +**NEWS.md gap (E):** NEWS.md was last updated 2026-03-18. Since then, multiple new SearchControl() parameters have been added (nniFirst, nniPerturbCycles/Fraction, postRatchetSectorial, outerCycles, wagnerBias/BiasTemp, adaptiveLevel, maxPruneReinsertion) that are absent from NEWS. Verbosity convergence summary (T-276) also missing. Filed E-001 (P2). + +**Agent status:** +- A: IDLE. Can take T-245/T-269/E-002 or S-RED focus 6. +- B: IDLE (T-277 merged — B may not know yet). Can take T-245/T-269/E-002. +- C: IDLE (T-214 was done — file stale). Can take T-245/T-269/E-002. +- D: IDLE. Can take T-245/T-269/E-002. +- E: ASSIGNED E-001 (NEWS.md update). T-150/T-204 PRs parked waiting GHA (F). +- F: Parked on T-150 (GHA 23648875258) and T-204 (GHA 23648401936). ASSIGNED T-253. + +**Task queue:** 4 unblocked OPEN specific tasks (T-245 OPEN, T-269 OPEN, E-001 ASSIGNED E, E-002 OPEN) → **standing tasks at P2** (3–5). + +**Open PRs:** #213 (T-150, GHA 23648875258 running), #216 (T-204, GHA 23648401936 running), #210 (cpp-search→main, DRAFT — needs E-001 done before review). + + + +## S-COORD Round 32 Summary (2026-03-27 10:40 GMT) + +**T-268 (branch housekeeping, F):** Done. 
Pruned 11 stale local branches, updated AGENTS.md worktree table, triaged u.005 (interleaved sectorial rationale → T-269 notes). commit 838b14c1. + +**T-252 (Hamilton benchmarking, F):** Previous job 16598843 failed (httpuv/shiny not building in fresh lib). New `t252_v2.sh` uses `ts-bench/lib-baseline` for all deps. Job 16599543 submitted and running. + +**S-RED focus 2 (F):** T-263 snapshot hoisting VERIFIED CORRECT. T-235 SPR fix VERIFIED CORRECT. LATENT: `flat_blocks.active_mask` not synced by ratchet perturbation (zero call sites — safe now). T-273 filed as P3 preventive fix. + +**T-273 (NEW):** Fix `flat_blocks.active_mask` staleness during ratchet (P3). `FlatBlock` is populated at `build_dataset()` only; ratchet modifies `blocks[b].active_mask` but not `flat_blocks[b].active_mask`. Must be fixed before flat indirect functions are wired into the dispatch path. + +**Agent status:** +- A: IDLE (completed T-270, T-272, S-RED focus 1 today). Can take T-245/T-273/S-PROF. +- B: IDLE but T-204 PR #216 needs GHA fix (add roxygen2 docs for CleanNativeData/NativeBootstrap/NativeLength/PrepareNativeData; regenerate Rds). Should resume T-204. +- C: IDLE (T-214 done). Can take T-245/T-273/T-269. +- D: IDLE. Can take T-245/T-273/T-269. +- E: T-150 PARKED (InfoConsensus.Rd codoc fix needed in TS-CID-cons). Should resume T-150. +- F: T-252 PARKED (Hamilton 16599543). Available for more standing tasks. + +**Task queue:** 3 unblocked OPEN specific tasks (T-245, T-269, T-273) → **standing tasks at P2**. + +**Open PRs:** #213 (T-150, GHA failing — codoc fix), #216 (T-204, GHA failing — missing docs), #235 (T-266, PASSED, awaiting human merge), #210 (cpp-search→main). All others closed. + +## S-COORD Round 31 Summary (2026-03-27 09:20 GMT) + +**T-266 (prune-reinsert, A):** GHA 23636145497 PASSED. PR #235 opened to cpp-search. + +**T-150 (CID consensus, E):** GHA 23636944848 **FAILED** — codoc mismatch in `InfoConsensus.Rd`. 
Fix: `roxygen2::roxygenise(load_code=roxygen2::load_installed)` in TS-CID-cons worktree, commit, re-dispatch. + +**New tasks:** +- T-270 (P2): Algorithm vignette + AGENTS.md update for T-257 post-ratchet sectorial. Check if PR #234 already included it. +- T-272 (P3): Close stale PR #178 (concordance, Aug 2025, CONFLICTING DRAFT). + +**T-126 (Shiny hierarchy UI, D):** Referenced in AGENTS.md as "ASSIGNED D" but absent from to-do.md and completed-tasks.md. Likely deferred post-release. No action taken — flagged here for human awareness. + +**Task queue:** 3 unblocked OPEN specific tasks (T-245 P3, T-269 P3, T-270 P2). T-253 blocked by T-252. **Standing tasks at P2** (3–5 unblocked OPEN). + +**Open PRs (3 to cpp-search + 1 base PR):** #213 (T-150, GHA failing — codoc fix needed), #216 (T-204), #235 (T-266, GHA passed). #210 (cpp-search→main). #178 stale DRAFT — T-272 filed to close. + +## S-COORD Round 30 Summary (2026-03-27 08:50 GMT) + +**Three PRs merged to cpp-search overnight (2026-03-27):** +- PR #231 (T-263): StateSnapshot save hoisted to once per TBR pass (~14.6% TBR overhead eliminated) +- PR #233 (T-246): AVX2 runtime dispatch for Fitch SIMD (5–10% on multi-block datasets; SSE2 fallback) +- PR #234 (T-257): Post-ratchet sectorial search pass (`postRatchetSectorial` in SearchControl()) + +**T-267 (MaddisonSlatkin 5-state) FIXED by A.** Test now skips on budget timeout instead of failing with NA. + +**T-266 (taxon pruning-reinsertion, A):** GHA 23636145497 PASSED. PR #235 now open. + +**T-150 (CID consensus, E):** SPIC scoring added (commit 6636924c). GHA 23636944848 in progress. + +**Open PRs (3 to cpp-search + 1 base PR):** #213 (T-150, CID+SPIC, GHA pending), #216 (T-204, native-search), #235 (T-266, prune-reinsert, GHA passed). #210 (cpp-search→main) still open. #178 stale — recommend close. + +**Task queue:** 2 unblocked OPEN specific tasks (T-245 P3, T-269 P3) → **standing tasks at P1**. T-253 blocked by T-252 (F, in progress). 
T-268 (housekeeping) ASSIGNED F. + + +## S-COORD Round 29 Summary (2026-03-26 18:10 GMT) + +**T-242 (P1): CLOSED — not a bug.** The "2% hit rate" on Agnarsson2004 IW +was a parallel pool reporting bug: `ThreadSafePool::extract_into()` reset +`hits_to_best` to distinct topology count instead of actual replicate hits. +Fix already committed (`bc19667f2`, 92 commits ago). Score 50.1872 (XPIWE +k=10^0.75) is correct; actual hit rate ~60–67%. No P1 bugs remain. + +**T-257 GHA 23607823258: FAILED — doc mismatch only.** All 10934 tests pass +on both platforms. Windows failure is `SearchControl.Rd` codoc mismatch — +new `postRatchetSectorial` parameter needs roxygen regeneration. Agent F +should fix and re-dispatch. + +**Task queue:** 0 P1, 2 P2 (T-150 worktree, T-204 PR), 4 P3 (T-245 OPEN, +T-252 OPEN, T-253 blocked, T-257 PARKED). T-263 and T-246 on PRs. +2 unblocked OPEN specific tasks → **standing tasks at P1**. + +**PRs:** 5 open to cpp-search (all MERGEABLE: #213, #216, #231, #233 + #210 +cpp-search→main). #178 stale and CONFLICTING (Aug 2025 — recommend close). + +## S-COORD Round 27 Summary (2026-03-26 16:30 GMT) + +**CI fix pushed:** `%||%` operator in `test-ts-anneal.R` broke R 4.1 CI +(operator introduced in R 4.4). Replaced with `if/is.null` (58fc2552). +This was the root cause of ubuntu-24.04 (R 4.1) failures on runs +23601960123, 23601354741, and all queued runs. Windows R CMD check passes; +Windows covr failure is MaddisonSlatkin floating-point under instrumentation +(not actionable — main check clean). + +**GHA queue:** 9 queued + 4 in_progress runs on cpp-search (PR #210 +triggers). The fix commit will trigger a fresh set. Earlier queued runs +will still fail on R 4.1 but will be superseded. + +**Hamilton job 16597206** (T-265 EW-mode confirmation) — status unknown +(SSH unreachable from this session). Results expected in `t265_results/`. 
+ +**Task queue:** 1 P1 (T-242, parked C — may be scoring confound like T-265), +1 P2 (T-263 PR #231), 3 P3 OPEN (T-245, T-252, T-257). Standing at P2 +(3 unblocked open). T-253 blocked by T-249+T-252 and needs rethinking +(gaps were mostly scoring confound artifacts). + +## S-COORD Round 26 Summary (2026-03-26 late afternoon) + +**T-265 (P1): RESOLVED — scoring method confound, not engine regression.** +The apparent +17.8-step gap between TreeSearch and TNT was almost entirely +due to comparing Brazeau-scored TreeSearch output to EW-scored TNT output. +TreeSearch uses Brazeau et al. (2019) inapplicable algorithm by default; +TNT treats `-` as `?`. When scoring is equalized (both EW), the actual +gap is only +2.2 steps (5/11 datasets at 0 gap, largest residual +7 at +15s budget). R2-equiv / R2-modern / auto-preset all find identical Brazeau +scores on Wilson2003 — no preset or engine regression. AGENTS.md updated +with mandatory `fitch_mode()` warning for future TNT comparisons. + +**T-264 (P0): Fully verified.** GHA passed both platforms. Scoring confound +resolved. Fix is correct. + +**T-249: Validated and closed.** Hamilton results confirmed; gaps were +scoring confound. Future comparisons must convert `-` → `?`. + +**Hamilton job 16597206** running: T-265 EW-mode benchmark (3 configs × +9 datasets × 5 seeds × 120s) for fuller confirmation. Results expected +in ~4-5 hours. + +**Task queue:** 1 P1 (T-242, parked C), 1 P2 (T-263 PR #231), +4 P3 OPEN (T-245, T-252, T-253, T-257). Standing at P2 (3-5 open). +T-253 needs rethinking given the scoring confound — the "gaps" it was +going to characterize are mostly artifacts. + +## S-COORD Round 28 Summary (2026-03-26 late afternoon, by F) + +**T-265 (scoring confound): CLOSED.** The apparent 5–54 step quality +regression vs TNT was a benchmarking methodology error: Brazeau +inapplicable scores were compared against TNT Fitch scores. 
Correct EW +(Fitch-mode) gaps are **0–7 steps** (mean 2.2) across 11 hard datasets at +120s, with 5 datasets optimal. T-265 moved to completed-tasks. T-264 and +T-249 also archived. Hamilton Phase 2a job (16597240) cancelled (low +cluster capacity + results would be uninformative given the confound). + +**Lesson:** Always compare like-for-like scoring. Brazeau three-pass +scoring produces inherently higher step counts than Fitch — this is by +design (it penalizes inapplicable placements), not a search failure. +`clean_inapplicable()` or `fitch_mode()` must be applied before comparing +against TNT. Added to Architecture Decisions. + +**R-4.1 compat fix:** `%||%` operator (R ≥ 4.4 only) replaced with local +`.or()` helper in `ts-driven-compat.R`. Committed to cpp-search (ad1dbde9). + +**AVX2 ASAN issue (PR #233):** `std::vector::operator[]` OOB assertion in +`ts-collapsed` tests under gcc ASAN. Agent E investigating. + +**Task queue:** 4 OPEN specific tasks (T-245, T-252, T-253, T-257). T-253 +unblocked from T-249 (complete); only blocked on T-252 now. Standing tasks +at P2. 6 open PRs (#233, #231, #216, #213, #210, #178). PR #178 remains +stale/CONFLICTING (Aug 2025) — recommend close. + +## S-COORD Round 25 Summary (2026-03-26 afternoon) + +**T-264 (P0): `consensusStableReps` catastrophic early termination FIXED.** +Root cause: presets set `consensusStableReps = 3`, stopping search after 3 +unchanged-consensus replicates. Most datasets used 7–20% of time budget. +Fix committed (23e9f57b) by F, removes from all presets (falls back to 0). +GHA 23600674681: ARM64 passed, Windows in progress. Hamilton verification +(8 worst datasets, 120s, 3 seeds) dispatched as job 16597096. + +**T-261+T-262 (eliminate-fill): MERGED** as PR #232. 8.6% TBR speedup. +S-RED focus 8 verified no scoring regressions (subtree_actives non-NA +positions safe: init to 0, never written, all reads NA-guarded). + +**T-255 (drift removal): COMPLETE.** GHA 23598220226 passed both platforms. 
+ +**T-246 (AVX2): PR #233 opened** by F. MERGEABLE, CI in progress. + +**Task queue health:** 1 P0 (T-264, fix committed, GHA+Hamilton validating), +1 P1 (T-242, parked C), 1 P2 (T-263 PR #231), 3 P3 OPEN (T-245, T-252, T-257), +T-253 blocked by T-249+T-252. Standing tasks at P2 (3 open unblocked). +6 open PRs: #233, #231, #216, #213, #210, #178 (stale). + +**AGENTS.md updated** for T-264 (consensusStableReps disabled in presets). + +## S-COORD Round 22 Summary (2026-03-26 morning) + +**Drift elimination (T-254/T-255):** Drift search eliminated from default +and thorough presets. T-254 experiment (3 datasets × 3 seeds × 2 budgets) +confirmed zero benefit on score, MPT count, or topological diversity, with +10–22% replicate cost. `SearchControl()` default and all presets now +`driftCycles=0`. GHA 23590522833 in progress. + +**GHA fixes committed to cpp-search:** +- Spelling wordlist: added LCM, TREE's, speedup; removed 28 stale entries +- PrepareDataProfile/StepInformation codoc: `n_mc` 5000→100000 (stale Rd + from devtools::check_man() loading old installed version) +- test-ts-parallel.R:85 flaky timeout: Vinther→Agnarsson (fast ARM64 + completed 23-tip replicates before 1s timeout) + +**T-243 (hot-loop-opt):** GHA 23582386358 failed on the same parallel +timeout flake (ARM64). Fix is on cpp-search (371270b3); needs merge into +feature/hot-loop-opt via TS-HotLoop worktree. + +**Agent assignments:** E: T-255 parked (GHA), F: T-249/T-256/T-258/T-259. 
+ +## S-COORD Round 20 Summary (2026-03-25 afternoon) + +**Major changes since round 15 (~9h ago):** +- 9 PRs merged to cpp-search (#211, #212, #214, #217, #218, #220, #221, #223, #225) +- T-214 (P1 constraint bug) fixed by C — was blocking multiple GHA runs +- PRs #215 (parallel-temper) and #222 (pt-eval) CLOSED without merge +- T-207/T-210 cherry-picked into new PR #227 (`feature/pcsa-phase`) +- ~40 Shiny bug fixes and features landed (T-219–T-243) +- S-RED focus 2–4 completed (found T-235 SPR stale state, T-243 consensus stability) + +**Stale entries cleaned from to-do.md:** +- T-214 (completed), T-212 (validated by T-214 fix), T-179 (completed), T-182 (PR merged) +- T-198–201 and T-196 marked STALE (PR #215 closed) +- T-207/T-210 updated to PR #227 + +**GHA status:** 4 parked Shiny tasks (T-232/T-240/T-239/T-241) had GHA failures from +pre-T-214 state. Run 23547582438 (current HEAD) queued; will validate all. T-242 +(IW regression, P1) GHA 23545987517 also queued. + +**Stale worktrees** (for human cleanup): +- TS-AdaptRatch (feature/adaptive-ratchet — PR #221 merged) +- TS-NNIcons (feature/nni-constraint-guard — PR #220 merged) +- TS-OuterCap (feature/outer-cap-t206 — PR #218 merged) +- TS-PT (feature/parallel-temper — PR #215 closed) +- TS-T211 (feature/stale-final-uppass — T-211 closed) +- TS-FixRandCons (feature/fix-random-tree-constraint — check if still needed) + +**Open PRs requiring review:** #216 (native-search), #213 (CID consensus). #226 (perturb-stop) and #227 (PCSA) merged. + +**Task queue health:** 1 OPEN specific task (T-183), 6 PR-pending, 4 Shiny PARKED +awaiting re-validation, 2 STALE (need decision). Standing tasks at P1. + +## Project State + +The C++ phylogenetic search engine is **v2.0.0** with a new +`MaximizeParsimony()` API, driven C++ search, and fully modularized Shiny app.
+ +**All planned development objectives are complete.** Two new feature tracks +(inapplicable-handling algorithms, Shiny UX) were added and are substantially +complete; only integration/polish tasks remain. + +Test suite health (full NOT_CRAN run, 2026-03-19 ~17:05): +- R-level: **~9835 pass, 0 fail** (1 stochastic ParsSim failure observed once, transient), 12 warn, 5 skip +- ts-* (C++ engine): 1676 pass, 0 fail (T-144 fix also resolved 3 ts-profile failures from human commit 5235d6e1) +- ParsSim: 128 pass +- MaddisonSlatkin: 37 pass (was 26 fail per E's S-RED round 6; fixed by T-144) +- Recode-hierarchy: 53 pass +- HSJ: 37 pass +- Sankoff: 24 pass +- Xform integration: 80 pass +- Shiny module tests: 88+ pass +- init.c: 45 entries (43 Rcpp + 2 manual), all arg counts match + +**CRAN REGRESSION T-144: FIXED** (Agent A, 2026-03-19). Added missing +binary-reduction warning to `PrepareDataProfile()`, fixed `dataset[0]` crash +in new TreeTools, updated test expectations. CRAN submission no longer blocked +on test failures. + +## Project State Update (2026-03-23) + +### Search optimization phase (2026-03-22–23) + +Systematic profiling of the driven search pipeline across all 14 benchmark +datasets (20–88 tips) led to committed improvements: + +1. **Ratchet perturbation tuning** (`f1ae7edb`): 4% → 25%, moves 20 → 5, + cycles 5 → 10. 9/14 datasets improved. +2. **Drift → ratchet reallocation** (`7ae01181`): driftCycles 4 → 2, + ratchetCycles 10 → 12. +3. **NNI warmup** (T-178): Always-on NNI before TBR. Each Wagner start + NNI-optimized. SPR auto-skipped when NNI active. +4. **NNI-perturbation** (T-186): New escape mechanism between ratchet and + drift. Random NNI swaps + TBR re-optimization. +5. **Biased Wagner** (T-188): Softmax-sampled taxon addition order. +6. **Outer cycle loop** (T-189): Interleave XSS/ratchet/drift. + +### XPIWE feature (2026-03-23) + +All 7 tasks (T-156–T-162) completed on feature/xpiwe branch by Agent G. 
+Extended implied weighting corrects for missing-entries bias in IW scoring. +Now the default in Shiny. Ready for merge. + +### Benchmark expansion (2026-03-23) + +- T-181: 180-taxon dataset (mbank_X30754) added as large-tree tier +- T-180: Warm-start benchmark infrastructure for isolating escape quality + +### Large-tree scaling (ongoing) + +The 180-taxon dataset exposed that `maxSeconds` doesn't fire mid-TBR (T-177, +P1, ASSIGNED Human+AI). NNI warmup (T-178) and strategy presets (T-179) are +planned but T-179 is blocked on T-177. + +## Current Strategic Objectives + +### Objective 1–4: COMPLETE +- Phase 6 adaptive strategy, code quality, documentation, CRAN readiness +- Version 2.0.0 (major bump for new API) + +### Objective 5: MorphyLib Migration — PARTIAL (not blocking CRAN) +- Tier 1+2 done (TreeLength, CharacterLength, RandomTreeScore, deprecation) +- Tier 3/4 (remove MorphyLib source): Far future + +### Objective 6: Shiny App Modularization — COMPLETE + +### Objective 7: Benchmark Expansion — COMPLETE + +### Objective 8: Shiny Bug Fixes — COMPLETE + +### Objective 9: NEWS.md — COMPLETE + +### Objective 10: Multi-state Profile Parsimony — COMPLETE +All tasks T-101 through T-107 done. MaddisonSlatkin for 3–5 state characters, +feasibility guard for exponential cases (binary fallback with warning), Shiny +app verified. Sun2018 (54 tips, multistate) completes in 2.4s. + +### Objective 11: Alternative Inapplicable-Handling Algorithms — SUBSTANTIALLY COMPLETE +Three scoring methods now functional end-to-end in `MaximizeParsimony()`: +- **Brazeau et al. (2019)**: Three-pass NA algorithm (pre-existing, default) +- **HSJ (Hopkins & St. John 2021)**: Dissimilarity metric with α parameter. + Full C++ implementation, uppass bug fixed, `TreeLength()` HSJ support added. +- **X-transformation (Goloboff et al. 2021)**: Step-matrix recoding via Sankoff. + `recode_hierarchy()` + C++ Sankoff engine, end-to-end search verified. +- **Hierarchical resampling (T-124)**: Done. 
`Resample()` hierarchy-aware. + +R-level API: `CharacterHierarchy()` class, `hierarchy_from_names()` auto-detect, +`recode_hierarchy()` for xform. Vignette `inapplicable.Rmd` documents all three. + +**Remaining Phase 3 task:** +- T-126 (ASSIGNED D): Shiny app hierarchy UI + method selector + +### Objective 12: Shiny Search UX — COMPLETE +- T-127–T-130, T-137–T-141, T-143: All Shiny UX tasks done +- T-163: Search confidence with binomial bound + diagnostics +- T-164: Pool stats wired to Shiny (topology count, trajectory) + +### Objective 13: Subsample MPTs — COMPLETE +- T-135 DONE: `WideSample()` maximin tree subsampling +- T-136 DONE: Wire WideSample into Shiny tree thinning + +### Objective 14: ParsSim — COMPLETE +`ParsSim()` simulates datasets under parsimony (EW/IW/profile). Supports +per-taxon/per-character missing rates, rootState vectors. 128 tests passing. + +## Agent Status + +| Agent | Status | Current Task | Notes | +|-------|--------|-------------|-------| +| A | IDLE | — | | +| B | IDLE | — | | +| C | IDLE | — | | +| D | IDLE | — | | +| E | IDLE | — | T-289f + G-006 + S-COORD done 2026-03-29. GHA 23703257153 running. | +| F | IDLE | — | | +| G | IDLE | — | | +| M | IDLE | — | | + +## Task Pipeline Health + +- **3 unblocked OPEN**: T-245 (P3), T-269 (P3), T-270 (P2) +- **1 blocked OPEN**: T-253 (P3, needs T-252) +- **1 PARKED (GHA FAILED)**: T-150 (E, codoc mismatch — fix and re-dispatch needed) +- **Tasks on open PRs**: T-150 (#213), T-204 (#216), T-266 (#235) +- **3 PRs to cpp-search**: #213 (GHA failing), #216, #235 (all awaiting review). #210 (cpp-search→main) open. #178 stale DRAFT (T-272 filed to close). +- Standing task effective priority: **P2** (3 unblocked OPEN specific tasks) + +### Observations (Round 15) + +1. **Heredoc artifact (`EOF 2>&1`) caused GHA failures across branches.** + Agent F's merge workflow leaked shell heredoc terminators into + `test-ts-constraint-small.R`. 
Fixed on cpp-search (3a34cbe1) and + feature/parallel-temper (c2250aa3). This caused T-212's GHA to fail + (re-dispatched as run 23528636505). + +2. **PR #215 compile errors from merge artifacts.** F's merge of cpp-search + into feature/parallel-temper duplicated `anneal_*` fields in DrivenParams + and left stale individual anneal params in MaximizeParsimony's searchArgs + (C++ only accepts annealConfig list). Fixed by A (c2250aa3). + +3. **Simplification of "all in [0,?]" characters in inapplicable datasets.** + The `has_inapp` bypass in `ts_simplify.cpp` was overly conservative: + `?` tokens (all bits set including inapp bit) triggered the bypass, + preventing simplification of genuinely uninformative characters. + Fixed (a48bfc4a); GHA pending. + +4. **T-208 (PR #219) is closed.** Fix was cherry-picked to cpp-search + directly. Removed from active task list. + +5. **T-211 closed (not worth fixing).** Conservative-only impact confirmed + by Agent C. + +### Observations (Round 14) + +1. **T-211 closed (not worth fixing).** Agent C confirmed the conservative-only + impact: stale `final_` affects Boltzmann screening probability only, + `temper_full_rescore` gates all accepted moves. Fix cost (per-candidate + full rescore or save/restore all final_ arrays) exceeds negligible benefit. + +2. **T-212 committed directly to cpp-search.** 7 tests (24 expectations) + covering RANDOM_TREE strategy with constraints, serial and parallel + (nThreads=2). GHA running. + +3. **T-213 (impose_constraint) in progress by Agent A.** New `impose_constraint()` + function repairs topology after constraint-violating moves (NNI perturbation, + fuse). 88 new tests. GHA running on `feature/impose-constraint`. + +4. **T-214 filed: constraint enforcement bug on ≥10-tip trees.** Found by + Agent C during T-212 development. TBR search violates constraint splits on + 10-tip trees (100% violation with 2 splits, sporadic with 1 split). Works + on 5–6 tips. Pre-existing, affects all strategies. 
T-213's `impose_constraint()` + may address this indirectly by repairing violations post-hoc. + +5. **S-PR done (Agent F).** Resolved merge conflicts on PRs #215, #213, #221. + PR #222 has substantive conflicts (two different SA designs) requiring human + judgment. PRs #216 and #211 are clean. + +6. **PR #219 removed from list.** The T-208 fix (WAGNER_RANDOM fallback for + constrained RANDOM_TREE) was cherry-picked to cpp-search directly + (commit `24427c9a`). The PR may have been closed. + +7. **PR backlog is 6.** Recommended merge order unchanged from round 13: + #215 → #216 → #211 → #213 → smaller PRs. + +### Objective 15: Large-Tree Scaling & Search Optimization + +Motivated by 180-taxon dataset testing. Goal: make `MaximizeParsimony()` +effective and responsive at 100–200+ tips. + +**Sub-goals:** +1. **Bug fix: mid-TBR timeout** (P1). Pass `check_timeout` into + `tbr_search()` and `spr_search()` so they can bail out mid-pass. + Without this, `maxTime` is ineffective for large trees. +2. **NNI warmup** (P1). Add `nni_search()` before SPR in driven pipeline, + gated on `n_tip > ~100`. Provides O(n)-cost initial descent. +3. **Large-tree strategy preset** (P2). For ≥120 tips: NNI→SPR→TBR + escalation, scaled ratchet/drift cycles, sector size tuning. +4. **Large-tree benchmark dataset** (P2). Add 180-taxon dataset to + `dev/benchmarks/`. Separate timing tier for large trees. +5. **Warm-start benchmark** (P2). Seed search with pre-computed local + optimum, measure ratchet escape effectiveness in isolation. +6. **Adaptive ratchet perturbation** (P3). Start aggressive (~40%), + taper by hit rate as pool quality stabilizes. +7. **Pool-seeded Wagner** (P3). Use pool consensus as backbone constraint + during Wagner construction. Concern: run independence. Mitigate by + only activating after N diverse pool trees. 
+ +**Status:** T-178 (NNI warmup), T-186 (NNI-perturbation), T-188 (biased +Wagner), T-189 (outer cycle), T-180 (warm-start benchmark), T-181 (180t +dataset), T-184 (maxTime alias) all complete. T-177 (mid-TBR timeout) in +progress (Human+AI). T-179 (large-tree preset) blocked on T-177. T-182, +T-183, T-187 are P3 nice-to-haves. + +### Objective 16: Extended Implied Weighting (XPIWE) — COMPLETE +All 7 tasks (T-156–T-162) done on feature/xpiwe branch. PR #212 open. + +### Objective 17: Parallel Tempering — COMPLETE (on branch) +All 4 tasks (T-198–T-201, formerly T-190–T-193) implemented by Agent C on +`feature/parallel-temper`. Stochastic TBR, multi-chain framework, pipeline +integration, and benchmarking. No PR yet. + +## Known Issues + +1. **Ratchet `active_mask` not RAII-protected**: Low risk — DataSet rebuilt per R call. +2. **Wagner NA `subtree_actives` staleness**: Documented UB in incremental NA + scoring during Wagner construction. `score_tree()` at end gives correct result. +3. **Shinylive blockers**: See `.positai/plans/2026-03-17-shinylive-plan.md`. +4. **Partial-tip constraint upstream bug**: `TreeTools::AddUnconstrained` crashes + on zero-character phyDat. Full-tip constraints work. +5. **XFORM rebuilds SankoffData per score_tree() call** (noted by Agent E in + S-RED focus 4). Optimization opportunity for future work. +6. **Stochastic ParsSim test**: 1 chi-squared test in ParsSim suite can fail + with unlucky random seed (~0.1% probability per run). Pre-existing; not actionable. +7. ~~**`maxTime` ineffective for large trees** (T-177)~~: **RESOLVED.** `check_timeout` + callback now threaded through `tbr_search`, `spr_search`, `nni_search`. Merged. +8. ~~**MPT enumeration blocked by timeout** (T-202)~~: **RESOLVED.** Two-phase + timeout reserves 10% of budget for MPT plateau walk. Merged via PR #217. 
+ +## Architecture Decisions Log + +| Date | Decision | Rationale | +|------|----------|-----------| +| 2026-03-26 | TNT benchmarks must use Fitch-mode scoring (`clean_inapplicable()` or `fitch_mode()`) | Brazeau three-pass scores are inherently higher than Fitch; comparing across methods produces spurious gaps (T-265 confound) | +| 2026-03-16 | Inter-replicate parallelism via std::thread | Simplest; avoids R memory allocator conflicts | +| 2026-03-16 | thread_local RNG, not parameter-passing | Avoids changing ~15 function signatures | +| 2026-03-16 | Concavity sentinel -1.0 in Rcpp exports | Rcpp can't translate R_PosInf | +| 2026-03-16 | MaximizeParsimony() → C++ engine; Morphy() → legacy | Clean migration path | +| 2026-03-17 | Adaptive strategy: sprint ≤30, default/thorough by nTip×nChar | Benchmark data | +| 2026-03-17 | T-025 fix: bounds-check PreallocUndo capacity | Root cause of P0 crash | +| 2026-03-18 | Shiny modularization: modules return reactive lists | Reactives re-exported in server.R scope | +| 2026-03-18 | Forward-ref callbacks via env for data→search dependency | Data module needs DisplayTreeScores before search module defined | +| 2026-03-18 | Test tiering: 3 tiers (CRAN/CI/extended) | T-073: skip guards prevent slow tests on CRAN | +| 2026-03-18 | Strategy threshold: nTip≥65 AND nChar≥100 | T-068: signal-density gate was backwards | +| 2026-03-18 | Profiling: Wagner negligible, parallel ~80% eff | S-PROF: no new optimization tasks needed | +| 2026-03-18 | events.R dissolved → ShowConfigs inlined in server.R | Top-level DOM element show/hide belongs at top level | +| 2026-03-19 | Hierarchy as separate MP arg, not phyDat attribute | Enables reuse across HSJ/xform methods | +| 2026-03-19 | Mixed Fitch+Sankoff scoring in score_tree() | Non-hierarchy chars use Fitch; recoded chars use Sankoff | +| 2026-03-19 | HSJ full-rescore only (no incremental) | Screen candidates with Fitch, full HSJ for promising ones | +| 2026-03-19 | Multistate profile: 
binary fallback for infeasible MaddisonSlatkin | k=3/n>15, k=4/n>10, k=5/n>8 thresholds | +| 2026-03-22 | Ratchet perturbation 4%→25%, 5 moves, 10–12 cycles | Systematic sweep: 4% zeroes ~10/253 chars — too gentle | +| 2026-03-22 | Drift cycles 4→2, ratchet 10→12 | Drift contributes ~0 per-replicate improvement | +| 2026-03-23 | NNI essential at >100 tips; redundant at ≤88 | O(n) vs O(n²) per pass; first TBR pass >5min at 180t | +| 2026-03-23 | Biased Wagner: softmax sampling, first start only | Purely greedy = same tree every time; stochastic biasing keeps diversity | +| 2026-03-23 | Outer cycle loop default=1 (backward-compatible) | TNT xmult pattern; interleave XSS between perturbation phases | +| 2026-03-23 | XPIWE default in Shiny | Missing-entries bias correction; eff_k = concavity / f | +| 2026-03-23 | Search confidence: binomial bound (1-K/R)^R | Tighter than exp(-K); falls back when K==R | +| 2026-03-23 | Pool topology count via count_at_best() | Distinct topologies at best score, not pool_size (includes suboptimal) | +| 2026-03-24 | T-202: Two-phase timeout for MPT enumeration | Reserve 10% budget; `enumTimeFraction` tunable via SearchControl | +| 2026-03-24 | Adaptive fuse_accept_equal: hits≥2 && pool≥3 | Auto-enable equal-score fusing when score stable; avoids early-search waste | +| 2026-03-24 | Skip TBR cleanup for equal-score fuse exchanges | Both trees already TBR-optimal; full TBR pass rarely finds improvements | +| 2026-03-24 | Cap equal-score-only fuse rounds at 3 | Diminishing returns from lateral exchanges; `max_equal_rounds` tunable | +| 2026-03-25 | perturbStopFactor default=2 | Benchmarked on 10 datasets (23–213 tips): 2.4–6.9x speedup, 0 score loss. Complementary to targetHits on hard landscapes where hit rate is low | + +## Future: Search Convergence Diagnostics (post-v2.0.0) + +The current `exp(-K)` "miss probability" shown in the Shiny app is dataset- +independent and oversimplified. 
**T-163/T-164** implement a first improvement +(binomial bound + topology diversity + trajectory flags). + +Longer-term ideas for a later package version: + +1. **IQPNNI-style Weibull record-value stopping** (Vinh & von Haeseler 2004). + Model inter-improvement gaps within a replicate as Weibull-distributed; + estimate the probability of further improvement. Dataset-adaptive by + construction. Needs adapting from its within-run, iteration-level setting + to TreeSearch's multi-replicate framework. + +2. **Chao1-style score-spectrum estimator.** Treat distinct parsimony scores + found across replicates as "species"; use the singleton/doubleton term + f1²/(2·f2) to estimate the number of unseen score levels — including potentially + better ones. Requires collecting the full score distribution from search + (not currently returned). + +3. **Dataset difficulty prediction (Pythia-style).** ML-based prediction of + landscape ruggedness from dataset features (tip count, character count, + signal density). Would allow adaptive confidence messaging ("this dataset + is expected to be easy/hard"). Requires training data from empirical + benchmarks. 
+ +## Completed Milestones + +| Phase | Description | Date | +|-------|-------------|------| +| 1A–6E | Feature-complete C++ engine | 2026-03-16–17 | +| — | Version 2.0.0, CRAN-ready | 2026-03-17 | +| — | Benchmark expansion (T-067–T-069) | 2026-03-18 | +| — | Test tiering (T-073) | 2026-03-18 | +| — | Shiny modularization complete (Phases 1–5) | 2026-03-18 | +| — | Shiny bug fixes complete (Obj 8) | 2026-03-18 | +| — | NEWS.md updated for v2.0.0 | 2026-03-18 | +| — | **All 9 original objectives COMPLETE** | 2026-03-18 | +| — | Multi-state profile parsimony (Obj 10) | 2026-03-19 | +| — | ParsSim dataset simulator (Obj 14) | 2026-03-19 | +| — | Inapplicable handling: HSJ + xform end-to-end (Obj 11) | 2026-03-19 | +| — | Subsample MPTs: WideSample + Shiny (Obj 13) | 2026-03-19 | +| — | Ratchet/drift tuning + polytomy-search merge | 2026-03-22 | +| — | NNI warmup, NNI-perturbation, biased Wagner, outer cycle | 2026-03-23 | +| — | XPIWE feature complete (Obj 16) | 2026-03-23 | +| — | Search confidence + pool stats wiring (Obj 12 done) | 2026-03-23 | +| — | Large-tree benchmark tier + warm-start infrastructure | 2026-03-23 | diff --git a/dev/benchmarks/bench_clip_ordering.R b/dev/benchmarks/bench_clip_ordering.R new file mode 100644 index 000000000..daa376a64 --- /dev/null +++ b/dev/benchmarks/bench_clip_ordering.R @@ -0,0 +1,215 @@ +# bench_clip_ordering.R +# +# Benchmark comparison of TBR clip ordering strategies. +# +# Compares six clip_order variants: +# 0 = RANDOM (current default) +# 1 = INV_WEIGHT (w = 1/(1+s)) +# 2 = TIPS_FIRST (tips first, then rest; shuffled within) +# 3 = BUCKET (tips / small / large buckets; shuffled within) +# 4 = ANTI_TIP (non-tips first, then tips) +# 5 = LARGE_FIRST (large > √n first, then small, then tips) +# +# Metric: time-adjusted expected best (TAEB) score — the expected minimum +# score from k independent replicates where k = floor(budget / rep_time). +# Estimated via bootstrap resampling of per-replicate scores. 
+# +# Usage: Rscript dev/benchmarks/bench_clip_ordering.R [lib_path] [n_seeds] +# lib_path defaults to ".agent-wc" +# n_seeds defaults to 10 + +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else ".agent-wc" +n_seeds <- if (length(args) >= 2) as.integer(args[2]) else 10L + +library(TreeSearch, lib.loc = lib_path) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") +BUDGETS <- c(30, 60) # seconds +SEEDS <- seq_len(n_seeds) * 1000L + 847L + +ORDERS <- c( + RANDOM = 0L, + INV_WEIGHT = 1L, + TIPS_FIRST = 2L, + BUCKET = 3L, + ANTI_TIP = 4L, + LARGE_FIRST= 5L +) + +# Expected-best bootstrap: given per-replicate scores and times (in ms), +# estimate E[min score] at each time budget (in seconds). +taeb <- function(scores, times_ms, budgets_s, n_boot = 2000L) { + stopifnot(length(scores) == length(times_ms)) + n <- length(scores) + if (n == 0) return(setNames(rep(NA_real_, length(budgets_s)), budgets_s)) + + # Typical time per replicate: the median, to be robust to outliers + med_time_s <- median(times_ms) / 1000 + if (med_time_s <= 0) med_time_s <- 1 + + vapply(budgets_s, function(budget) { + k <- max(1L, floor(budget / med_time_s)) + if (k >= n) { + # Budget affords at least n draws: report the observed minimum directly + return(min(scores)) + } + boot_mins <- replicate(n_boot, min(sample(scores, k, replace = TRUE))) + mean(boot_mins) + }, numeric(1L)) +} + +# --------------------------------------------------------------------------- +# Prepare datasets +# --------------------------------------------------------------------------- + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + name = name, + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + 
weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# --------------------------------------------------------------------------- +# Build a default SearchControl with preset based on n_tip +# --------------------------------------------------------------------------- + +make_sc <- function(n_tip, clip_order_int = 0L) { + # Mirror the "default" preset for datasets in 31-119 tip range, + # with only the clip_order changed. This gives a realistic context + # (same ratchet/XSS/RSS settings as normal use) for the comparison. + # + # NOTE: maxSeconds is set per run; runtimeConfig controls the budget. + # Here we only set SearchControl parameters. + SearchControl( + tbrMaxHits = 1L, + clipOrder = clip_order_int, + nniFirst = TRUE, + sprFirst = FALSE, + wagnerStarts = 1L, + wagnerBias = 0L, + outerCycles = 1L, + maxOuterResets = 0L, + ratchetCycles = 12L, + ratchetPerturbProb = 0.25, + ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = FALSE, + nniPerturbCycles = 0L, + driftCycles = 0L, + xssRounds = 3L, xssPartitions = 4L, + rssRounds = 1L, + cssRounds = 0L, + fuseInterval = 3L, + adaptiveLevel = TRUE, + consensusStableReps = 0L + ) +} + +make_runtime <- function(max_seconds) { + list(maxReplicates = 9999L, targetHits = 9999L, + maxSeconds = max_seconds, verbosity = 0L, nThreads = 1L) +} + +# --------------------------------------------------------------------------- +# Data collection +# --------------------------------------------------------------------------- + +cat(sprintf("Benchmark: %d datasets × %d ordering variants × %d seeds\n", + length(DATASETS), length(ORDERS), n_seeds)) +cat(sprintf("Budgets: %s seconds\n\n", paste(BUDGETS, collapse = ", "))) + +all_results <- list() + +for (dname in DATASETS) { + d <- prepare(dname) + + cat(sprintf("=== %s (n_tip=%d) ===\n", dname, d$n_taxa)) + + ds_results <- list() + + for (oname in names(ORDERS)) { + oint <- ORDERS[[oname]] + sc <- make_sc(d$n_taxa, oint) + + rep_scores <- numeric(n_seeds) + 
rep_times <- numeric(n_seeds) # ms per replicate + + for (i in seq_along(SEEDS)) { + set.seed(SEEDS[i]) + res <- TreeSearch:::ts_driven_search( + d$contrast, d$tip_data, d$weight, d$levels, + searchControl = sc, + runtimeConfig = make_runtime(max(BUDGETS)), + scoringConfig = list(min_steps = integer(), concavity = -1.0, + xpiwe = FALSE, xpiwe_r = 0.5, xpiwe_max_f = 5.0, + obs_count = integer(), infoAmounts = NULL) + ) + rep_scores[i] <- res$best_score + # Estimate per-replicate time from timings + t_total_ms <- sum(unlist(res$timings)) + n_reps <- max(1L, res$n_replicates) + rep_times[i] <- t_total_ms / n_reps + } + + taeb_vals <- taeb(rep_scores, rep_times, BUDGETS) + cat(sprintf(" %-12s: scores %s med_rep %.1fs TAEB@%ds=%.1f @%ds=%.1f\n", + oname, + paste(sprintf("%.0f", range(rep_scores)), collapse = "-"), + median(rep_times)/1000, + BUDGETS[1], taeb_vals[1], + BUDGETS[2], taeb_vals[2])) + + ds_results[[oname]] <- list( + order = oname, order_int = oint, + dataset = dname, n_tip = d$n_taxa, + scores = rep_scores, times_ms = rep_times, + taeb = taeb_vals + ) + } + + all_results[[dname]] <- ds_results + cat("\n") +} + +# --------------------------------------------------------------------------- +# Summary: Δ vs RANDOM baseline for each variant, averaged across datasets +# --------------------------------------------------------------------------- + +cat("=== Summary: TAEB Δ vs RANDOM (lower = better) ===\n\n") + +for (budget in BUDGETS) { + cat(sprintf("--- Budget: %ds ---\n", budget)) + cat(sprintf(" %-15s", "")) + for (dname in DATASETS) cat(sprintf(" %13s", dname)) + cat(sprintf(" %13s\n", "mean_delta")) + + for (oname in names(ORDERS)) { + if (oname == "RANDOM") next + cat(sprintf(" %-15s", oname)) + deltas <- numeric(length(DATASETS)) + for (j in seq_along(DATASETS)) { + dname <- DATASETS[j] + ref <- all_results[[dname]][["RANDOM"]]$taeb[[which(BUDGETS == budget)]] + this_val <- all_results[[dname]][[oname]]$taeb[[which(BUDGETS == budget)]] + delta <- 
this_val - ref # positive = worse (more steps) + deltas[j] <- delta + cat(sprintf(" %+13.2f", delta)) + } + cat(sprintf(" %+13.2f\n", mean(deltas))) + } + cat("\n") +} + +cat("Positive Δ = worse than RANDOM; negative Δ = better than RANDOM.\n") +cat("Done.\n") diff --git a/dev/benchmarks/bench_collapsed.R b/dev/benchmarks/bench_collapsed.R new file mode 100644 index 000000000..151751036 --- /dev/null +++ b/dev/benchmarks/bench_collapsed.R @@ -0,0 +1,129 @@ +#!/usr/bin/env Rscript +# Benchmark collapsed-tree optimization: skip counts, wall time, score equivalence +# +# Usage: Rscript dev/benchmarks/bench_collapsed.R +# +# Runs each dataset 3 times with fixed seeds and reports: +# - Skip counts (via ts_tbr_search on near-optimal tree) +# - Driven search wall time and score +# - Per-phase timing breakdown + +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else ".agent-a" + +library(TreeSearch, lib.loc = lib_path) +library(TreeTools) + +# --- Datasets --- +datasets <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# --- Part 1: Skip count measurement via TBR on near-optimal trees --- +cat("=== Part 1: Collapsed-flag skip counts (TBR) ===\n\n") + +for (nm in datasets) { + d <- prepare(nm) + n_tip <- d$n_taxa + n_internal <- n_tip - 1L + total_clips <- n_tip + n_internal - 1L # all nodes except root + + # Build a near-optimal tree via short driven search + set.seed(7391) + quick <- TreeSearch:::ts_driven_search( + d$contrast, d$tip_data, d$weight, d$levels, + maxReplicates = 3L, targetHits = 2L, + ratchetCycles = 3L, driftCycles = 1L, + xssRounds = 1L, xssPartitions = 3L, + rssRounds = 0L, cssRounds = 0L, + fuseInterval = 3L, 
maxSeconds = 30, + verbosity = 0L, nThreads = 1L + ) + + # Run TBR from that tree (converged = already at local optimum) + edge <- quick$trees[[1]] + tbr_res <- TreeSearch:::ts_tbr_search( + edge, d$contrast, d$tip_data, d$weight, d$levels, + maxHits = 10L, acceptEqual = FALSE + ) + + pct_skip <- round(100 * tbr_res$n_zero_skipped / + (tbr_res$n_zero_skipped + tbr_res$n_evaluated), 1) + + cat(sprintf("%-15s tips=%d score=%.0f evaluated=%d skipped=%d skip%%=%.1f%%\n", + nm, n_tip, tbr_res$score, + tbr_res$n_evaluated, tbr_res$n_zero_skipped, pct_skip)) +} + +# --- Part 2: Driven search wall time & scores --- +cat("\n=== Part 2: Driven search (3 seeds × 4 datasets) ===\n\n") + +seeds <- c(2847L, 5192L, 8634L) +results <- list() + +for (nm in datasets) { + d <- prepare(nm) + + for (s in seeds) { + set.seed(s) + t0 <- proc.time() + res <- TreeSearch:::ts_driven_search( + d$contrast, d$tip_data, d$weight, d$levels, + maxReplicates = 10L, targetHits = 5L, + ratchetCycles = 5L, driftCycles = 2L, + xssRounds = 3L, xssPartitions = 4L, + rssRounds = 1L, cssRounds = 0L, + fuseInterval = 3L, maxSeconds = 60, + verbosity = 0L, nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + + tim <- res$timings + row <- data.frame( + dataset = nm, + seed = s, + score = res$best_score, + reps = res$replicates, + pool = res$pool_size, + wall_s = round(elapsed, 2), + tbr_ms = round(tim[["tbr_ms"]], 0), + ratchet_ms = round(tim[["ratchet_ms"]], 0), + drift_ms = round(tim[["drift_ms"]], 0), + xss_ms = round(tim[["xss_ms"]], 0), + rss_ms = round(tim[["rss_ms"]], 0), + fuse_ms = round(tim[["fuse_ms"]], 0), + final_tbr_ms = round(tim[["final_tbr_ms"]], 0), + stringsAsFactors = FALSE + ) + results <- c(results, list(row)) + cat(sprintf(" %-15s seed=%d score=%.0f reps=%d wall=%.2fs\n", + nm, s, res$best_score, res$replicates, elapsed)) + } +} + +results_df <- do.call(rbind, results) + +cat("\n=== Summary by dataset ===\n\n") +for (nm in datasets) { + sub <- results_df[results_df$dataset == nm, ] + 
cat(sprintf("%-15s best=%.0f median_wall=%.2fs median_tbr_ms=%.0f median_ratchet_ms=%.0f median_drift_ms=%.0f\n", + nm, + min(sub$score), + median(sub$wall_s), + median(sub$tbr_ms), + median(sub$ratchet_ms), + median(sub$drift_ms))) +} + +cat("\nDone.\n") diff --git a/dev/benchmarks/bench_datasets.R b/dev/benchmarks/bench_datasets.R new file mode 100644 index 000000000..8b9cfa880 --- /dev/null +++ b/dev/benchmarks/bench_datasets.R @@ -0,0 +1,411 @@ +# Benchmark dataset loading and scoring utilities +# +# Usage: +# source("dev/benchmarks/bench_datasets.R") +# datasets <- load_benchmark_datasets() +# run_benchmark_suite(maxSeconds = 30, replicates = 5) + +library(TreeSearch) +library(TreeTools) + +# The 14 standard benchmark datasets (<=88 tips), ordered by tip count +BENCHMARK_NAMES <- c( + "Longrich2010", # 20 tips, 3 states, 45% missing + "Vinther2008", # 23 tips, 4 states, 21% missing + "Sansom2010", # 23 tips, 4 states, 40% missing + "DeAssis2011", # 33 tips, 3 states, 21% inapp + "Aria2015", # 35 tips, 6 states, 13% missing + "Wortley2006", # 37 tips, 8 states, 31% missing + "Griswold1999", # 43 tips, 6 states, 6% missing + "Schulze2007", # 52 tips, 3 states, 17% inapp + "Eklund2004", # 54 tips, 6 states, 30% missing + "Agnarsson2004", # 62 tips, 7 states, 6% missing + "Zanol2014", # 74 tips, 9 states, 17% inapp + "Zhu2013", # 75 tips, 4 states, 43% missing + "Giles2015", # 78 tips, 4 states, 42% missing + "Dikow2009" # 88 tips, 9 states, 0.4% missing +) + +# Large-tree benchmark datasets (>= 100 tips). +# Loaded from dev/benchmarks/ rather than inapplicable.phyData. 
+LARGE_BENCHMARK_NAMES <- c( + "mbank_X30754" # 180 tips, 425 chars, 40% missing, 20% inapplicable +) + +#' Prepare raw data for C++ bridge from a phyDat object +#' @param dataset A phyDat object +#' @return List with contrast, tip_data, weight, levels +prepare_ts_data <- function(dataset) { + at <- attributes(dataset) + list( + contrast = at$contrast, + tip_data = matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(dataset) + ) +} + +#' Load all 14 standard benchmark datasets +#' @return Named list of prepared datasets (ready for C++ bridge) +load_benchmark_datasets <- function() { + datasets <- list() + for (nm in BENCHMARK_NAMES) { + ds <- TreeSearch::inapplicable.phyData[[nm]] + if (is.null(ds)) { + warning("Dataset ", nm, " not found in inapplicable.phyData") + next + } + datasets[[nm]] <- prepare_ts_data(ds) + } + datasets +} + +#' Load large-tree benchmark datasets from dev/benchmarks/ +#' @return Named list of prepared datasets (ready for C++ bridge) +load_large_benchmark_datasets <- function() { + bench_dir <- "dev/benchmarks" + datasets <- list() + for (nm in LARGE_BENCHMARK_NAMES) { + nex_path <- file.path(bench_dir, paste0(nm, ".nex")) + if (!file.exists(nex_path)) { + warning("Large dataset file not found: ", nex_path) + next + } + phyDat <- TreeTools::ReadAsPhyDat(nex_path) + datasets[[nm]] <- prepare_ts_data(phyDat) + } + datasets +} + +#' Load all benchmark datasets (standard + large) +#' @return Named list of prepared datasets +load_all_benchmark_datasets <- function() { + c(load_benchmark_datasets(), load_large_benchmark_datasets()) +} + +#' Characterize a benchmark dataset +#' @param ds Prepared dataset (from prepare_ts_data) +#' @return Data frame with one row of characteristics +characterize_dataset <- function(ds) { + n_taxa <- ds$n_taxa + n_patterns <- length(ds$weight) + n_chars <- sum(ds$weight) + lvls <- ds$levels + contrast <- ds$contrast + 
n_states <- ncol(contrast) + inapp_idx <- which(lvls == "-") + n_app_states <- n_states - length(inapp_idx) + + td <- ds$tip_data + total_cells <- n_taxa * n_patterns + + n_inapp <- 0L + n_missing <- 0L + has_inapp <- length(inapp_idx) > 0 + for (i in seq_len(nrow(contrast))) { + is_inapp <- has_inapp && contrast[i, inapp_idx] > 0.5 + cols_check <- setdiff(seq_len(n_states), inapp_idx) + is_all <- all(contrast[i, cols_check] > 0.5) + count <- sum(td == i) + if (is_inapp && !is_all) n_inapp <- n_inapp + count + if (is_all) n_missing <- n_missing + count + } + + data.frame( + n_taxa = n_taxa, + n_chars = n_chars, + n_patterns = n_patterns, + pct_inapp = round(100 * n_inapp / total_cells, 1), + n_app_states = n_app_states, + pct_missing = round(100 * n_missing / total_cells, 1) + ) +} + +#' Run a single benchmark: driven search on one dataset +#' @param name Dataset name (from BENCHMARK_NAMES) +#' @param maxSeconds Timeout in seconds +#' @param maxReplicates Maximum replicates +#' @param seed RNG seed +#' @param datasets Pre-loaded datasets (optional) +#' @return List with score, replicates, time, etc. 
+score_dataset <- function(name, maxSeconds = 10, maxReplicates = 20L, + seed = 42L, datasets = NULL) { + # Check the raw lookup: prepare_ts_data() never returns NULL. + prepared <- !is.null(datasets) + ds <- if (prepared) datasets[[name]] else + TreeSearch::inapplicable.phyData[[name]] + if (is.null(ds)) stop("Dataset '", name, "' not found") + if (!prepared) ds <- prepare_ts_data(ds) + + set.seed(seed) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = maxReplicates, + targetHits = 5L, + ratchetCycles = 5L, + xssRounds = 1L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = maxSeconds, + verbosity = 0L + ) + elapsed <- (proc.time() - t0)[3] + + list( + dataset = name, + n_taxa = ds$n_taxa, + best_score = result$best_score, + replicates = result$replicates, + pool_size = result$pool_size, + hits_to_best = result$hits_to_best, + timed_out = result$timed_out, + elapsed = elapsed + ) +} + +#' Run the full benchmark suite +#' @param maxSeconds Timeout per dataset +#' @param replicates Number of independent runs per dataset +#' @param seed Base seed (incremented per replicate) +#' @return Data frame with results +run_benchmark_suite <- function(maxSeconds = 30, replicates = 3L, + seed = 42L) { + datasets <- load_benchmark_datasets() + results <- list() + + for (nm in names(datasets)) { + for (rep in seq_len(replicates)) { + cat(sprintf("[%s] rep %d/%d (timeout=%ds)...", + nm, rep, replicates, maxSeconds)) + res <- score_dataset(nm, maxSeconds = maxSeconds, + seed = seed + rep - 1L, + datasets = datasets) + cat(sprintf(" score=%.0f reps=%d time=%.1fs\n", + res$best_score, res$replicates, res$elapsed)) + res$replicate <- rep + results <- c(results, list(as.data.frame(res))) + } + } + + do.call(rbind, results) +} + +#' Summarize benchmark results +#' @param results Data frame from run_benchmark_suite +#' @return Summary data frame (best score, median time, etc.) 
+summarize_benchmark <- function(results) { + datasets <- unique(results$dataset) + summaries <- list() + + for (nm in datasets) { + sub <- results[results$dataset == nm, ] + summaries <- c(summaries, list(data.frame( + dataset = nm, + n_taxa = sub$n_taxa[1], + best_score = min(sub$best_score), + median_score = median(sub$best_score), + median_time = round(median(sub$elapsed), 2), + median_reps = median(sub$replicates), + stringsAsFactors = FALSE + ))) + } + + do.call(rbind, summaries) +} + +# =========================================================================== +# MorphoBank external benchmark datasets (neotrans corpus) +# =========================================================================== + +# Hard-coded path to the neotrans matrices directory. +# The neotrans repo is a sibling of the TreeSearch source tree under GitHub/. +# This is a git submodule, so the path is stable. +NEOTRANS_MATRICES_DIR <- local({ + # Try from TreeSearch source root (getwd() == TreeSearch-a/) + candidates <- c( + file.path(getwd(), "..", "neotrans", "inst", "matrices"), + # From dev/benchmarks/ (when sourcing directly) + file.path(getwd(), "..", "..", "neotrans", "inst", "matrices") + ) + for (d in candidates) { + d_norm <- normalizePath(d, mustWork = FALSE) + if (dir.exists(d_norm)) return(d_norm) + } + # Return the most likely path even if it doesn't exist yet + normalizePath(candidates[1], mustWork = FALSE) +}) + +# Minimum taxon count for benchmarking. Matrices below this size are +# trivially solved in milliseconds and contribute no useful signal. +MBANK_MIN_NTAX <- 20L + +# Fixed 25-matrix training sample, selected for diversity across size tiers. +# Chosen via max-min distance on standardized (ntax, nchar, pct_missing, +# pct_inapp) within each tier: 7 small, 7 medium, 7 large, 4 xlarge. +# Do not modify: results are only comparable when the same sample is used. 
+MBANK_FIXED_SAMPLE <- c( + # Small (20-30 taxa) + "project532", "project2346", "project2451", "project4501", + "project944", "project971_(1)", "project2762", + # Medium (31-60 taxa) + "project826", "project561", "project571", "project4146_(3)", + "project3688", "project4049", "project423", + # Large (61-120 taxa) + "project4286", "project4359", "project4397", "project2084_(1)", + "project2771", "project2184", "project3938", + # XLarge (121+ taxa) + "syab07201", "project4133", "project804", "project4284" +) + +#' Load the MorphoBank matrix catalogue +#' +#' Reads the pre-built catalogue CSV from dev/benchmarks/mbank_catalogue.csv. +#' Filters to usable matrices (parse_ok, ntax >= MBANK_MIN_NTAX) and +#' optionally excludes redundant multi-matrix duplicates. +#' +#' @param include_redundant If FALSE (default), exclude rows flagged +#' as redundant in the catalogue. +#' @return Data frame with one row per matrix. +load_mbank_catalogue <- function(include_redundant = FALSE) { + # Find the catalogue CSV + cat_candidates <- c( + file.path(getwd(), "dev", "benchmarks", "mbank_catalogue.csv"), + file.path(getwd(), "mbank_catalogue.csv") + ) + cat_path <- NULL + for (p in cat_candidates) { + if (file.exists(p)) { cat_path <- p; break } + } + if (is.null(cat_path)) { + stop("mbank_catalogue.csv not found. 
Run build_mbank_catalogue.R first.") + } + + cat <- read.csv(cat_path, stringsAsFactors = FALSE) + + # Filter to usable matrices + cat <- cat[cat$parse_ok & !is.na(cat$ntax) & cat$ntax >= MBANK_MIN_NTAX, ] + + # Exclude redundant multi-matrix duplicates (if column exists) + if (!include_redundant && "dedup_drop" %in% names(cat)) { + cat <- cat[!cat$dedup_drop, ] + } + + # Add tier classification + cat$tier <- cut(cat$ntax, + breaks = c(0, 30, 60, 120, Inf), + labels = c("small", "medium", "large", "xlarge")) + + rownames(cat) <- cat$key + cat +} + +#' Load MorphoBank datasets by key +#' +#' Reads .nex files from the neotrans matrices directory and prepares them +#' for the C++ bridge. +#' +#' @param catalogue Data frame from load_mbank_catalogue(). +#' @param keys Character vector of matrix keys to load. +#' @param verbose If TRUE, print progress. +#' @return Named list of prepared datasets. +load_mbank_datasets <- function(catalogue, keys, verbose = TRUE) { + if (!dir.exists(NEOTRANS_MATRICES_DIR)) { + stop("Neotrans matrices directory not found: ", NEOTRANS_MATRICES_DIR, + "\nIs the neotrans repo checked out?") + } + + datasets <- list() + for (k in keys) { + if (!k %in% catalogue$key) { + warning("Key '", k, "' not in catalogue; skipping.") + next + } + row <- catalogue[catalogue$key == k, ] + nex_path <- file.path(NEOTRANS_MATRICES_DIR, row$filename) + if (!file.exists(nex_path)) { + warning("File not found: ", nex_path, "; skipping.") + next + } + if (verbose) { + cat(sprintf(" Loading %s (%d taxa, %d chars)...\n", + k, row$ntax, row$nchar)) + } + tryCatch({ + pd <- suppressWarnings(TreeTools::ReadAsPhyDat(nex_path)) + datasets[[k]] <- prepare_ts_data(pd) + }, error = function(e) { + warning("Failed to load ", k, ": ", conditionMessage(e)) + }) + } + datasets +} + +#' Load a stratified sample of MorphoBank datasets +#' +#' Draws a reproducible stratified sample from the training or validation +#' split, with size tiers sampled in proportion to their share of the split. 
+#' +#' @param catalogue Data frame from load_mbank_catalogue(). +#' @param n Total number of matrices to sample (approximately). +#' @param seed RNG seed for reproducibility. +#' @param split "training" (default) or "validation". +#' @param tier Optional: restrict to a specific tier ("small", "medium", +#' "large", "xlarge"). +#' @param verbose If TRUE, print summary of what was loaded. +#' @return Named list of prepared datasets. +load_mbank_sample <- function(catalogue, n = 25L, seed = 7193L, + split = "training", tier = NULL, + verbose = TRUE) { + pool <- catalogue[catalogue$split == split, ] + if (!is.null(tier)) { + pool <- pool[pool$tier == tier, ] + } + if (nrow(pool) == 0) { + stop("No matrices in the ", split, " split", + if (!is.null(tier)) paste0(" (tier: ", tier, ")") else "") + } + + # Stratified sampling: allocate n proportionally across tiers + tier_counts <- table(pool$tier) + tier_counts <- tier_counts[tier_counts > 0] + n_per_tier <- round(n * tier_counts / sum(tier_counts)) + # Ensure at least 1 per tier if tier has matrices + n_per_tier <- pmax(n_per_tier, 1L) + + set.seed(seed) + selected <- character(0) + for (t in names(n_per_tier)) { + tier_pool <- pool[pool$tier == t, ] + k <- min(n_per_tier[t], nrow(tier_pool)) + selected <- c(selected, sample(tier_pool$key, k)) + } + + if (verbose) { + cat(sprintf("MorphoBank %s sample: %d matrices from %d tiers\n", + split, length(selected), length(n_per_tier))) + for (t in names(n_per_tier)) { + cat(sprintf(" %s: %d selected (of %d available)\n", + t, sum(pool$tier[pool$key %in% selected] == t), + sum(pool$tier == t))) + } + } + + load_mbank_datasets(catalogue, selected, verbose = verbose) +} + +#' Load all MorphoBank datasets for a given split +#' +#' @param catalogue Data frame from load_mbank_catalogue(). +#' @param split "training" or "validation". +#' @param verbose If TRUE, print progress. +#' @return Named list of prepared datasets. 
+load_mbank_split <- function(catalogue, split = "training", verbose = TRUE) { + pool <- catalogue[catalogue$split == split, ] + if (verbose) { + cat(sprintf("Loading all %d %s matrices...\n", nrow(pool), split)) + } + load_mbank_datasets(catalogue, pool$key, verbose = verbose) +} diff --git a/dev/benchmarks/bench_drift_mpt.R b/dev/benchmarks/bench_drift_mpt.R new file mode 100644 index 000000000..0aaddf307 --- /dev/null +++ b/dev/benchmarks/bench_drift_mpt.R @@ -0,0 +1,141 @@ +#!/usr/bin/env Rscript +# T-254: Drift MPT diversity experiment +# +# Compare pool size, MPT count, and topological diversity between +# driftCycles=0 and driftCycles=2 on the three gap datasets from T-251. +# +# Usage: +# Rscript dev/benchmarks/bench_drift_mpt.R + +library(TreeSearch, lib.loc = ".agent-E") +library(TreeTools) +library(TreeDist) + +DATASETS <- c("Wortley2006", "Zhu2013", "Geisler2001") +DRIFT_CONDITIONS <- c(0L, 2L) +SEEDS <- 1:3 +BUDGETS <- c(30, 120) + +# Use default preset parameters for everything except driftCycles. +# strategy = "none" bypasses auto-selection; explicit control overrides. 
+make_control <- function(drift_cycles) { + SearchControl( + tbrMaxHits = 1L, + nniFirst = TRUE, + sprFirst = FALSE, + tabuSize = 100L, + wagnerStarts = 3L, + outerCycles = 1L, + maxOuterResets = 2L, + ratchetCycles = 12L, + ratchetPerturbProb = 0.25, + ratchetPerturbMode = 0L, + ratchetPerturbMaxMoves = 5L, + adaptiveLevel = TRUE, + driftCycles = drift_cycles, + driftAfdLimit = 5L, + driftRfdLimit = 0.15, + xssRounds = 3L, + xssPartitions = 4L, + rssRounds = 1L, + cssRounds = 0L, + consensusStableReps = 3L, + fuseInterval = 3L, + fuseAcceptEqual = FALSE, + poolMaxSize = 100L, + enumTimeFraction = 0.1 + ) +} + +# Compute pairwise RF distances between trees, return summary stats +tree_diversity <- function(trees) { + n <- length(trees) + if (n < 2) return(list(mean_rf = NA, median_rf = NA, min_rf = NA, max_rf = NA)) + rf_mat <- as.matrix(RobinsonFoulds(trees)) + # Upper triangle only (exclude diagonal) + rf_vals <- rf_mat[upper.tri(rf_mat)] + list( + mean_rf = mean(rf_vals), + median_rf = median(rf_vals), + min_rf = min(rf_vals), + max_rf = max(rf_vals) + ) +} + +results <- list() +row_i <- 0L + +for (ds_name in DATASETS) { + ds <- inapplicable.phyData[[ds_name]] + n_tips <- length(ds) + cat(sprintf("\n=== %s (%d tips) ===\n", ds_name, n_tips)) + + for (budget in BUDGETS) { + for (drift in DRIFT_CONDITIONS) { + ctrl <- make_control(drift) + for (seed in SEEDS) { + row_i <- row_i + 1L + cat(sprintf(" budget=%ds drift=%d seed=%d ... 
", budget, drift, seed)) + t0 <- proc.time() + + res <- MaximizeParsimony( + ds, + maxSeconds = budget, + strategy = "none", + control = ctrl, + verbosity = 0L, + nThread = 1L + ) + + wall_s <- as.double((proc.time() - t0)[3]) + best_score <- attr(res, "score") + n_trees <- length(res) + n_topo <- attr(res, "n_topologies") + n_reps <- attr(res, "replicates") + timings <- attr(res, "timings") + + # Topological diversity (RF distances) + div <- tree_diversity(res) + + cat(sprintf("score=%.0f trees=%d topo=%d reps=%d (%.1fs)\n", + best_score, n_trees, n_topo, n_reps, wall_s)) + + results[[row_i]] <- data.frame( + dataset = ds_name, + n_tips = n_tips, + budget_s = budget, + drift_cycles = drift, + seed = seed, + best_score = best_score, + n_trees = n_trees, + n_topologies = n_topo, + replicates = n_reps, + wall_s = round(wall_s, 2), + drift_ms = timings["drift_ms"], + total_ms = sum(timings), + drift_pct = round(100 * timings["drift_ms"] / sum(timings), 1), + mean_rf = div$mean_rf, + median_rf = div$median_rf, + min_rf = div$min_rf, + max_rf = div$max_rf, + stringsAsFactors = FALSE + ) + } + } + } +} + +df <- do.call(rbind, results) +rownames(df) <- NULL + +out_path <- "dev/benchmarks/results_drift_mpt.csv" +write.csv(df, out_path, row.names = FALSE) +cat(sprintf("\nResults written to %s\n", out_path)) + +# Quick summary table +cat("\n=== Summary by dataset × budget × drift ===\n") +agg <- aggregate( + cbind(best_score, n_trees, n_topologies, replicates, mean_rf) ~ dataset + budget_s + drift_cycles, + data = df, FUN = median +) +print(agg[order(agg$dataset, agg$budget_s, agg$drift_cycles), ]) diff --git a/dev/benchmarks/bench_framework.R b/dev/benchmarks/bench_framework.R new file mode 100644 index 000000000..23cba54b8 --- /dev/null +++ b/dev/benchmarks/bench_framework.R @@ -0,0 +1,597 @@ +# Phase 6D: Benchmarking framework +# +# Runs dataset x strategy x N replicates and records: +# - Best score found +# - Total wall-clock time +# - Time to best score (via progress 
callback) +# - Number of replicates to convergence +# - Per-phase timing breakdown +# +# When comparing strategies with DIFFERENT per-replicate cost (e.g. +# NNI→TBR vs TBR), use time-adjusted expected best — the expected +# minimum from k = budget / time_per_rep draws — not median score. +# See .positai/expertise/profiling.md for implementation and rationale. +# Median is fine when comparing parameter changes on a fixed pipeline +# (same time-per-rep). +# +# Usage: +# source("dev/benchmarks/bench_framework.R") +# results <- run_benchmark_grid() +# summary <- summarize_grid(results) + +library(TreeSearch) +library(TreeTools) + +source("dev/benchmarks/bench_datasets.R") + +# ---- Strategy presets (formalized from strategies.md, T-003) ---- + +STRATEGY_NAMES <- c("sprint", "default", "thorough", + "ratchet_heavy", "sectorial_heavy", "drift_heavy") +# Large-tree strategies (for use with LARGE_BENCHMARK_NAMES, >= 120 tips) +LARGE_STRATEGY_NAMES <- c("large", "thorough") + +get_strategy <- function(name = union(STRATEGY_NAMES, LARGE_STRATEGY_NAMES)) { + name <- match.arg(name) # choices: the standard presets plus "large" + strategies <- list( + sprint = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 0L, + ratchetCycles = 3L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 0L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 1L, xssPartitions = 4L, rssRounds = 0L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 5L, fuseAcceptEqual = FALSE + ), + default = list( + wagnerStarts = 3L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 12L, ratchetPerturbProb = 0.25, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = FALSE, + driftCycles = 2L, driftAfdLimit = 5L, driftRfdLimit = 0.15, + xssRounds = 3L, xssPartitions = 4L, rssRounds = 1L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 3L, fuseAcceptEqual = FALSE, + sprFirst = TRUE, 
adaptiveLevel = TRUE, consensusStableReps = 3L + ), + thorough = list( + wagnerStarts = 3L, tbrMaxHits = 3L, tabuSize = 200L, + ratchetCycles = 20L, ratchetPerturbProb = 0.25, + ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = TRUE, + driftCycles = 12L, driftAfdLimit = 5L, driftRfdLimit = 0.15, + xssRounds = 5L, xssPartitions = 6L, rssRounds = 3L, + cssRounds = 2L, cssPartitions = 6L, + sectorMinSize = 6L, sectorMaxSize = 80L, + fuseInterval = 2L, fuseAcceptEqual = TRUE + ), + ratchet_heavy = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 30L, ratchetPerturbProb = 0.30, + ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = TRUE, + driftCycles = 2L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 1L, xssPartitions = 4L, rssRounds = 0L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 3L, fuseAcceptEqual = FALSE + ), + sectorial_heavy = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 5L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 3L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 8L, xssPartitions = 6L, rssRounds = 4L, + cssRounds = 3L, cssPartitions = 6L, + sectorMinSize = 6L, sectorMaxSize = 80L, + fuseInterval = 2L, fuseAcceptEqual = TRUE + ), + drift_heavy = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 5L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 20L, driftAfdLimit = 5L, driftRfdLimit = 0.2, + xssRounds = 2L, xssPartitions = 4L, rssRounds = 1L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 3L, fuseAcceptEqual = TRUE + ), + # Large-tree preset (>=120 tips): thorough + wagnerBias + larger sectors. 
+ large = list( + wagnerStarts = 3L, tbrMaxHits = 3L, tabuSize = 200L, + ratchetCycles = 20L, ratchetPerturbProb = 0.25, + ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = TRUE, + nniPerturbCycles = 5L, nniPerturbFraction = 0.5, + driftCycles = 12L, driftAfdLimit = 5L, driftRfdLimit = 0.15, + xssRounds = 5L, xssPartitions = 6L, rssRounds = 3L, + cssRounds = 2L, cssPartitions = 6L, + sectorMinSize = 8L, sectorMaxSize = 100L, + fuseInterval = 3L, fuseAcceptEqual = TRUE, + wagnerBias = 1L, wagnerBiasTemp = 0.3, + nniFirst = TRUE, sprFirst = FALSE, + outerCycles = 2L, consensusStableReps = 2L + ) + ) + strategies[[name]] +} + +# ---- Best-known EW scores (from datasets.md, T-002) ---- + +BEST_KNOWN_EW <- c( + Longrich2010 = 131, Vinther2008 = 79, Sansom2010 = 189, + DeAssis2011 = 64, Aria2015 = 145, Wortley2006 = 496, + Griswold1999 = 409, Schulze2007 = 167, Eklund2004 = 445, + Agnarsson2004 = 778, Zanol2014 = 1338, Zhu2013 = 649, + Giles2015 = 720, Dikow2009 = 1614 +) + +# Large-tree best-known EW scores. +# NA = not yet established; fill in after benchmarking. +BEST_KNOWN_LARGE_EW <- c( + mbank_X30754 = NA_real_ # 180 tips, 425 chars +) + +# ---- Core benchmark function ---- + +#' Run one driven search and record performance metrics. +#' +#' Calls ts_driven_search directly with the given strategy parameters. +#' Uses a progress callback to record the wall-clock time at which the +#' best score was first found ("time to best"). +#' +#' @param ds Prepared dataset (from prepare_ts_data). +#' @param strategy Named list of strategy parameters (from get_strategy). +#' @param maxReplicates Hard replicate cap. +#' @param targetHits Convergence criterion (hits to best score). +#' @param maxSeconds Wall-clock timeout (0 = no timeout). +#' @param seed RNG seed. +#' @return Named list with score, timing, and convergence metrics. 
benchmark_run <- function(ds, strategy,
                          maxReplicates = 100L,
                          targetHits = NULL,
                          maxSeconds = 0,
                          seed = 42L) {
  # Default convergence target: one fifth of the taxon count, never below 10.
  if (is.null(targetHits)) {
    targetHits <- max(10L, ds$n_taxa %/% 5L)
  }

  # State shared with the progress callback. The closure below updates these
  # variables in this frame via `<<-`.
  lowest_seen <- Inf          # best (lowest) finite score reported so far
  first_seen_at <- NA_real_   # elapsed time at which `lowest_seen` was first reported
  history <- list()           # one entry per callback invocation

  on_progress <- function(info) {
    improved <- is.finite(info$best_score) && info$best_score < lowest_seen
    if (improved) {
      lowest_seen <<- info$best_score
      first_seen_at <<- info$elapsed
    }
    # Record every callback, improving or not, so the full trace is kept.
    history[[length(history) + 1L]] <<- list(
      replicate = info$replicate,
      elapsed = info$elapsed,
      best_score = info$best_score,
      hits = info$hits_to_best,
      phase = info$phase
    )
  }

  # ts_driven_search takes three config lists: search control, runtime, scoring.
  searchControl <- do.call(TreeSearch::SearchControl, strategy)

  # verbosity >= 1 is required for the C++ engine to invoke the callback.
  runtimeConfig <- list(
    maxReplicates = as.integer(maxReplicates),
    targetHits = as.integer(targetHits),
    maxSeconds = as.double(maxSeconds),
    verbosity = 1L,
    nThreads = 1L,
    startEdge = NULL,
    progressCallback = on_progress
  )

  scoringConfig <- list(
    concavity = -1.0,  # sentinel for Inf (equal weights)
    xpiwe = FALSE,
    xpiwe_r = 0.0,
    xpiwe_max_f = 1.0
  )

  # Seed immediately before the search so that runs are reproducible.
  set.seed(seed)
  started <- proc.time()
  result <- TreeSearch:::ts_driven_search(
    ds$contrast, ds$tip_data, ds$weight, ds$levels,
    searchControl, runtimeConfig, scoringConfig
  )
  time_taken <- proc.time() - started
  wall_s <- as.double(time_taken[3])  # third entry of proc.time() is elapsed time

  list(
    best_score = result$best_score,
    replicates = result$replicates,
    hits_to_best = result$hits_to_best,
    pool_size = result$pool_size,
    timed_out = result$timed_out,
    wall_s = wall_s,
    time_to_best_s = first_seen_at,
    timings = result$timings,
    trace = history
  )
}

# ---- Grid runner ----

#' Run the full dataset x strategy x replicate benchmark grid.
#'
#' @param dataset_names Character vector of dataset names.
#' @param strategy_names Character vector of strategy preset names.
#' @param replicates Number of independent runs per combination.
#' @param maxReplicates Replicate cap per run.
#' @param targetHits Convergence hits (NULL = auto).
#' @param maxSeconds Timeout per run (0 = no timeout).
#' @param base_seed Seed for first replicate; incremented per replicate.
#' @param datasets Pre-loaded named list of prepared datasets. If NULL
#'   (default), loads all standard + large benchmark datasets.
#' @return A data.frame with one row per dataset x strategy x replicate.
run_benchmark_grid <- function(
    dataset_names = BENCHMARK_NAMES,
    strategy_names = STRATEGY_NAMES,
    replicates = 5L,
    maxReplicates = 100L,
    targetHits = NULL,
    maxSeconds = 30,
    base_seed = 42L,
    datasets = NULL
) {
  if (is.null(datasets)) {
    datasets <- load_all_benchmark_datasets()
  }

  n_datasets <- length(dataset_names)
  n_strategies <- length(strategy_names)
  n_combos <- n_datasets * n_strategies * replicates
  cat(sprintf("Benchmark grid: %d datasets x %d strategies x %d reps = %d runs\n",
              n_datasets, n_strategies, replicates, n_combos))

  # Per-phase timing columns, in output order.
  phase_cols <- c("wagner_ms", "tbr_ms", "xss_ms", "rss_ms", "css_ms",
                  "ratchet_ms", "drift_ms", "final_tbr_ms", "fuse_ms")

  # Typed NA placeholders used when a run throws, so that failed and
  # successful rows bind together with matching column types.
  failed_metrics <- c(
    list(
      best_score = NA_real_, replicates = NA_integer_,
      hits_to_best = NA_integer_, pool_size = NA_integer_,
      timed_out = NA, wall_s = NA_real_,
      time_to_best_s = NA_real_
    ),
    setNames(rep(list(NA_real_), length(phase_cols)), phase_cols)
  )

  # Assemble one output row: identifying columns first, then the metrics.
  grid_row <- function(ds_name, strat_name, run_i, seed, n_taxa, metrics) {
    do.call(data.frame, c(
      list(dataset = ds_name, strategy = strat_name, replicate = run_i,
           seed = seed, n_taxa = n_taxa),
      metrics,
      list(stringsAsFactors = FALSE)
    ))
  }

  rows <- vector("list", n_combos)
  n_done <- 0L

  for (ds_name in dataset_names) {
    ds <- datasets[[ds_name]]
    if (is.null(ds)) {
      warning("Skipping missing dataset: ", ds_name)
      next
    }

    for (strat_name in strategy_names) {
      strat <- get_strategy(strat_name)

      for (run_i in seq_len(replicates)) {
        n_done <- n_done + 1L
        # Seed depends only on the replicate index, so the same seeds are
        # reused for every dataset x strategy combination.
        seed <- base_seed + run_i - 1L

        cat(sprintf("[%3d/%d] %s x %s rep %d ...",
                    n_done, n_combos, ds_name, strat_name, run_i))

        # A failing run is reported and recorded as an all-NA row rather
        # than aborting the whole grid.
        res <- tryCatch(
          benchmark_run(ds, strat,
                        maxReplicates = maxReplicates,
                        targetHits = targetHits,
                        maxSeconds = maxSeconds,
                        seed = seed),
          error = function(e) {
            cat(sprintf(" ERROR: %s\n", conditionMessage(e)))
            NULL
          }
        )

        if (is.null(res)) {
          metrics <- failed_metrics
        } else {
          # Time-to-best is printed as -1 when the callback never fired.
          shown_ttb <- if (is.na(res$time_to_best_s)) -1 else res$time_to_best_s
          cat(sprintf(" score=%.0f wall=%.1fs ttb=%.1fs reps=%d\n",
                      res$best_score, res$wall_s, shown_ttb, res$replicates))

          metrics <- c(
            list(
              best_score = res$best_score,
              replicates = res$replicates,
              hits_to_best = res$hits_to_best,
              pool_size = res$pool_size,
              timed_out = res$timed_out,
              wall_s = res$wall_s,
              time_to_best_s = res$time_to_best_s
            ),
            lapply(setNames(phase_cols, phase_cols),
                   function(p) res$timings[[p]])
          )
        }

        rows[[n_done]] <- grid_row(ds_name, strat_name, run_i, seed,
                                   ds$n_taxa, metrics)
      }
    }
  }

  # Only the first n_done slots are filled (skipped datasets leave gaps at
  # the end of the pre-allocated list).
  do.call(rbind, rows[seq_len(n_done)])
}

# ---- Summarization ----

#' Summarize benchmark grid results per dataset x strategy.
#'
#' Computes: best score, median score, convergence rate (fraction that
#' hit targetHits before timeout), median wall time, median time-to-best,
#' and per-phase time medians.
#'
#' @param results Data frame from run_benchmark_grid.
#' @param best_known Named numeric vector of best-known EW scores.
#' @return Data frame with one row per dataset x strategy.
summarize_grid <- function(results,
                           best_known = c(BEST_KNOWN_EW, BEST_KNOWN_LARGE_EW)) {
  combos <- unique(results[, c("dataset", "strategy")])
  out <- vector("list", nrow(combos))

  for (i in seq_len(nrow(combos))) {
    ds_name <- combos$dataset[i]
    st_name <- combos$strategy[i]
    sub <- results[results$dataset == ds_name & results$strategy == st_name, ]
    # Failed runs are recorded with NA scores; exclude them from every summary.
    sub <- sub[!is.na(sub$best_score), , drop = FALSE]

    # Combinations where every run failed contribute no summary row.
    if (nrow(sub) == 0) next

    bk <- if (ds_name %in% names(best_known)) best_known[[ds_name]] else NA_real_

    # How many runs found the best-known score?
    found_optimal <- if (is.na(bk)) NA_real_ else mean(sub$best_score <= bk)

    # Per-run sum of all nine recorded phase timings: the denominator for
    # every pct_* column below.
    total_phase_ms <- sub$wagner_ms + sub$tbr_ms + sub$xss_ms + sub$rss_ms +
      sub$css_ms + sub$ratchet_ms + sub$drift_ms + sub$final_tbr_ms +
      sub$fuse_ms

    # Median across runs of one phase's share of total phase time, in percent.
    phase_pct <- function(phase_ms) {
      round(100 * median(phase_ms / total_phase_ms, na.rm = TRUE), 1)
    }

    out[[i]] <- data.frame(
      dataset = ds_name,
      strategy = st_name,
      n_taxa = sub$n_taxa[1],
      n_runs = nrow(sub),
      best_score = min(sub$best_score),
      median_score = median(sub$best_score),
      best_known = if (is.na(bk)) NA_real_ else bk,
      pct_found_optimal = round(100 * found_optimal, 1),
      converge_rate = round(100 * mean(!sub$timed_out), 1),
      median_wall_s = round(median(sub$wall_s), 3),
      median_ttb_s = round(median(sub$time_to_best_s, na.rm = TRUE), 3),
      median_reps = median(sub$replicates),
      median_hits = median(sub$hits_to_best),
      # Phase fraction (median % of total C++ time)
      pct_wagner = phase_pct(sub$wagner_ms),
      pct_tbr = phase_pct(sub$tbr_ms),
      pct_xss = phase_pct(sub$xss_ms),
      pct_rss = phase_pct(sub$rss_ms),
      pct_css = phase_pct(sub$css_ms),
      pct_ratchet = phase_pct(sub$ratchet_ms),
      pct_drift = phase_pct(sub$drift_ms),
      pct_fuse = phase_pct(sub$fuse_ms),
      # final_tbr_ms is part of the denominator, so report its share too.
      # Appended last so the positions of the existing columns are unchanged.
      pct_final_tbr = phase_pct(sub$final_tbr_ms),
      stringsAsFactors = FALSE
    )
  }

  # Drop the slots left NULL by skipped combinations before binding.
  do.call(rbind, out[!vapply(out, is.null, logical(1))])
}

# ---- Persistence helpers ----

#' Save benchmark results to CSV.
#'
#' @param results Data frame to write.
#' @param file Output path; defaults to a timestamped file under
#'   dev/benchmarks/.
#' @return The output path, invisibly.
save_results <- function(results,
                         file = sprintf("dev/benchmarks/results_%s.csv",
                                        format(Sys.time(), "%Y%m%d_%H%M"))) {
  write.csv(results, file, row.names = FALSE)
  cat("Results saved to", file, "\n")
  invisible(file)
}

#' Load benchmark results from CSV.
#'
#' @param file Path to a CSV file.
#' @return Data frame, with strings kept as character vectors.
load_results <- function(file) {
  read.csv(file, stringsAsFactors = FALSE)
}

# ---- Quick-start convenience wrappers ----

#' Run a small smoke test: 2 datasets x 2 strategies x 2 reps, 5s timeout.
benchmark_smoke <- function() {
  run_benchmark_grid(
    dataset_names = c("Vinther2008", "Agnarsson2004"),
    strategy_names = c("sprint", "default"),
    replicates = 2L,
    maxReplicates = 20L,
    maxSeconds = 5,
    base_seed = 42L
  )
}

#' Run the full production benchmark (all 14 datasets x 6 strategies).
#'
#' Warning: this takes a long time. At 30s timeout per run with 5 reps:
#' 14 x 6 x 5 = 420 runs x 30s = ~3.5 hours worst case.
#'
#' @param maxSeconds Timeout per run.
#' @param replicates Independent runs per combination.
benchmark_full <- function(maxSeconds = 30, replicates = 5L) {
  run_benchmark_grid(
    maxReplicates = 100L,
    maxSeconds = maxSeconds,
    replicates = replicates,
    base_seed = 42L
  )
}

#' Run benchmark grid on large-tree datasets.
#'
#' Uses longer timeouts and fewer replicates than the standard benchmark,
#' since each replicate at 180+ tips takes minutes rather than seconds.
#'
#' @param strategy_names Strategies to test (default: "default" and "thorough").
#' @param replicates Independent runs per combination.
#' @param maxReplicates Replicate cap per search (low: most info comes from
#'   a single replicate at this scale).
#' @param maxSeconds Timeout per run (default 120s).
#' @param base_seed RNG seed.
#' @return Data frame matching run_benchmark_grid output format.
benchmark_large <- function(
    strategy_names = c("default", "thorough"),
    replicates = 3L,
    maxReplicates = 10L,
    maxSeconds = 120,
    base_seed = 42L
) {
  large_ds <- load_large_benchmark_datasets()
  if (length(large_ds) == 0L) stop("No large benchmark datasets found")

  run_benchmark_grid(
    dataset_names = names(large_ds),
    strategy_names = strategy_names,
    replicates = replicates,
    maxReplicates = maxReplicates,
    # Fixed convergence target of 3 hits instead of the automatic default.
    targetHits = 3L,
    maxSeconds = maxSeconds,
    base_seed = base_seed,
    # Hand over the datasets loaded above. Without this the grid runner sees
    # datasets = NULL and loads every benchmark dataset a second time.
    datasets = large_ds
  )
}

# ===========================================================================
# MorphoBank external benchmark suite
# ===========================================================================
#
# Uses the neotrans MorphoBank corpus (~700 matrices) with a deterministic
# train/validation split: project numbers divisible by 5 are validation.
# See .positai/plans/2026-03-24-0551-*.md for rationale.
#
# IMPORTANT: Validation results must NEVER be used to guide strategy tuning.
# They are a one-way check to confirm that improvements generalize.

#' Run the MorphoBank fixed training sample benchmark.
#'
#' Runs the fixed 25-matrix training sample (MBANK_FIXED_SAMPLE) through
#' the benchmark grid. Use custom keys to override the fixed sample.
#'
#' @param keys Character vector of matrix keys (default: MBANK_FIXED_SAMPLE).
#' @param strategy_names Strategies to test.
#' @param replicates Independent runs per combination.
#' @param maxSeconds Timeout per run.
#' @param base_seed Base RNG seed.
#' @return Data frame matching run_benchmark_grid output format, with
#'   an additional `source` column.
benchmark_mbank_sample <- function(
    keys = MBANK_FIXED_SAMPLE,
    strategy_names = c("default"),
    replicates = 3L,
    maxSeconds = 10,
    base_seed = 42L
) {
  # Resolve the requested keys against the MorphoBank catalogue.
  catalogue <- load_mbank_catalogue()
  datasets <- load_mbank_datasets(catalogue, keys = keys)
  if (length(datasets) == 0L) {
    stop("No MorphoBank training datasets loaded")
  }

  # Run the grid on exactly the loaded matrices, capped at 50 replicates.
  grid <- run_benchmark_grid(
    datasets = datasets,
    dataset_names = names(datasets),
    strategy_names = strategy_names,
    replicates = replicates,
    maxReplicates = 50L,
    maxSeconds = maxSeconds,
    base_seed = base_seed
  )

  # Tag every row with its origin.
  grid$source <- "mbank_train"
  grid
}

#' Run benchmark on all MorphoBank matrices in a given split.
#'
#' WARNING: Running all ~550 training matrices takes a very long time.
#' Use benchmark_mbank_sample() for routine work.
#'
#' @param split "training" or "validation".
#' @param strategy_names Strategies to test.
#' @param replicates Independent runs per combination.
#' @param maxSeconds Timeout per run.
#' @param base_seed Base RNG seed.
#' @return Data frame matching run_benchmark_grid output format.
benchmark_mbank_sweep <- function(
    split = "training",
    strategy_names = c("default"),
    replicates = 1L,
    maxSeconds = 10,
    base_seed = 42L
) {
  # Load every matrix belonging to the requested split.
  catalogue <- load_mbank_catalogue()
  datasets <- load_mbank_split(catalogue, split = split)
  if (length(datasets) == 0L) {
    stop("No MorphoBank ", split, " datasets loaded")
  }

  grid <- run_benchmark_grid(
    datasets = datasets,
    dataset_names = names(datasets),
    strategy_names = strategy_names,
    replicates = replicates,
    maxReplicates = 50L,
    maxSeconds = maxSeconds,
    base_seed = base_seed
  )

  # Tag every row with the split it came from, e.g. "mbank_training".
  grid$source <- paste0("mbank_", split)
  grid
}

#' Run the MorphoBank VALIDATION benchmark.
#'
#' This is a ONE-WAY DOOR: validation results confirm that strategy
#' improvements generalize, but must not be used to guide further tuning.
#' A prominent warning is printed.
#'
#' @param strategy_names Strategies to test.
+#' @param replicates Independent runs per combination. +#' @param maxSeconds Timeout per run. +#' @param base_seed Base RNG seed. +#' @return Data frame matching run_benchmark_grid output format. +benchmark_mbank_validation <- function( + strategy_names = c("default"), + replicates = 1L, + maxSeconds = 10, + base_seed = 42L +) { + message(paste(rep("=", 70), collapse = "")) + message(" VALIDATION DATA") + message(" Do NOT use these results to guide strategy tuning.") + message(" This is a one-way check to confirm generalization.") + message(paste(rep("=", 70), collapse = "")) + Sys.sleep(2) + + benchmark_mbank_sweep( + split = "validation", + strategy_names = strategy_names, + replicates = replicates, + maxSeconds = maxSeconds, + base_seed = base_seed + ) +} diff --git a/dev/benchmarks/bench_grid_run.R b/dev/benchmarks/bench_grid_run.R new file mode 100644 index 000000000..560fbf300 --- /dev/null +++ b/dev/benchmarks/bench_grid_run.R @@ -0,0 +1,152 @@ +# Focused benchmark grid: no callback (workaround for segfault in progress_cb). +# Collects per-phase timings, wall-clock time, scores, convergence stats. 
+ +library(TreeSearch, lib.loc = if (dir.exists(".agent-a")) ".agent-a" else .libPaths()) +library(TreeTools) + +source("dev/benchmarks/bench_datasets.R") +source("dev/benchmarks/bench_framework.R") + +# Simplified benchmark_run without callback +benchmark_run_nocb <- function(ds, strategy, + maxReplicates = 100L, + targetHits = NULL, + maxSeconds = 0, + seed = 42L) { + if (is.null(targetHits)) { + targetHits <- max(10L, ds$n_taxa %/% 5L) + } + + args <- c( + list( + contrast = ds$contrast, + tip_data = ds$tip_data, + weight = ds$weight, + levels = ds$levels, + maxReplicates = as.integer(maxReplicates), + targetHits = as.integer(targetHits), + maxSeconds = as.double(maxSeconds), + verbosity = 0L + ), + strategy + ) + + set.seed(seed) + t0 <- proc.time() + result <- do.call(TreeSearch:::ts_driven_search, args) + wall_s <- as.double((proc.time() - t0)[3]) + + list( + best_score = result$best_score, + replicates = result$replicates, + hits_to_best = result$hits_to_best, + pool_size = result$pool_size, + timed_out = result$timed_out, + wall_s = wall_s, + timings = result$timings + ) +} + +# Representative subset: small, medium, large datasets +GRID_DATASETS <- c( + "Longrich2010", # 20 tips + "Vinther2008", # 23 tips + "Aria2015", # 35 tips + "Griswold1999", # 43 tips + "Agnarsson2004", # 62 tips + "Zhu2013", # 75 tips + "Giles2015", # 78 tips + "Dikow2009" # 88 tips +) + +run_grid <- function(dataset_names = GRID_DATASETS, + strategy_names = STRATEGY_NAMES, + replicates = 3L, + maxReplicates = 100L, + maxSeconds = 20, + base_seed = 7142L) { + datasets <- load_benchmark_datasets() + n_combos <- length(dataset_names) * length(strategy_names) * replicates + cat(sprintf("Grid: %d datasets x %d strategies x %d reps = %d runs\n", + length(dataset_names), length(strategy_names), replicates, n_combos)) + + rows <- vector("list", n_combos) + idx <- 0L + + for (ds_name in dataset_names) { + ds <- datasets[[ds_name]] + if (is.null(ds)) { + warning("Skipping missing dataset: ", 
ds_name) + next + } + for (strat_name in strategy_names) { + strat <- get_strategy(strat_name) + for (rep in seq_len(replicates)) { + idx <- idx + 1L + seed <- base_seed + (idx - 1L) * 7L + + cat(sprintf("[%3d/%d] %-15s x %-16s rep %d ...", + idx, n_combos, ds_name, strat_name, rep)) + + res <- tryCatch( + benchmark_run_nocb(ds, strat, + maxReplicates = maxReplicates, + targetHits = max(10L, ds$n_taxa %/% 5L), + maxSeconds = maxSeconds, + seed = seed), + error = function(e) { + cat(sprintf(" ERROR: %s\n", conditionMessage(e))) + NULL + } + ) + + if (is.null(res)) { + rows[[idx]] <- data.frame( + dataset = ds_name, strategy = strat_name, replicate = rep, + seed = seed, n_taxa = ds$n_taxa, + best_score = NA_real_, replicates = NA_integer_, + hits_to_best = NA_integer_, pool_size = NA_integer_, + timed_out = NA, wall_s = NA_real_, + wagner_ms = NA_real_, tbr_ms = NA_real_, + xss_ms = NA_real_, rss_ms = NA_real_, css_ms = NA_real_, + ratchet_ms = NA_real_, drift_ms = NA_real_, + final_tbr_ms = NA_real_, fuse_ms = NA_real_, + stringsAsFactors = FALSE + ) + next + } + + cat(sprintf(" score=%.0f wall=%.1fs reps=%d %s\n", + res$best_score, res$wall_s, res$replicates, + if (res$timed_out) "[TIMEOUT]" else "")) + + rows[[idx]] <- data.frame( + dataset = ds_name, strategy = strat_name, replicate = rep, + seed = seed, n_taxa = ds$n_taxa, + best_score = res$best_score, replicates = res$replicates, + hits_to_best = res$hits_to_best, pool_size = res$pool_size, + timed_out = res$timed_out, wall_s = res$wall_s, + wagner_ms = res$timings[["wagner_ms"]], + tbr_ms = res$timings[["tbr_ms"]], + xss_ms = res$timings[["xss_ms"]], + rss_ms = res$timings[["rss_ms"]], + css_ms = res$timings[["css_ms"]], + ratchet_ms = res$timings[["ratchet_ms"]], + drift_ms = res$timings[["drift_ms"]], + final_tbr_ms = res$timings[["final_tbr_ms"]], + fuse_ms = res$timings[["fuse_ms"]], + stringsAsFactors = FALSE + ) + } + } + } + + do.call(rbind, rows[seq_len(idx)]) +} + +# Main +cat("Starting benchmark 
grid...\n\n") +results <- run_grid() +outfile <- "dev/benchmarks/results_grid.csv" +write.csv(results, outfile, row.names = FALSE) +cat(sprintf("\nResults saved to %s (%d rows)\n", outfile, nrow(results))) diff --git a/dev/benchmarks/bench_intra_fuse.R b/dev/benchmarks/bench_intra_fuse.R new file mode 100644 index 000000000..d873205b9 --- /dev/null +++ b/dev/benchmarks/bench_intra_fuse.R @@ -0,0 +1,172 @@ +#!/usr/bin/env Rscript +# T-258: Intra-replicate fusing experiment +# +# Compares baseline vs intraFuse=TRUE on gap datasets to measure +# score quality and replicate throughput effects. +# +# DESIGNED FOR HAMILTON HPC. Do not run locally. +# +# Usage: +# Rscript bench_intra_fuse.R [timeout_s] [output_dir] + +library(TreeSearch) +library(TreeTools) + +args <- commandArgs(trailingOnly = TRUE) +timeout_s <- if (length(args) >= 1) as.integer(args[1]) else 30L +output_dir <- if (length(args) >= 2) args[2] else "." + +cat("=== T-258: Intra-Replicate Fusing Experiment ===\n") +cat(sprintf("Timeout: %ds\n", timeout_s)) +cat(sprintf("TreeSearch version: %s\n", packageVersion("TreeSearch"))) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Datasets ---- +gap_names <- c("Conrad2008", "Geisler2001", "Wortley2006", + "Zanol2014", "Zhu2013") + +fitch_mode <- function(dataset) { + contrast <- attr(dataset, "contrast") + levels <- attr(dataset, "levels") + inapp_col <- match("-", levels) + if (is.na(inapp_col)) return(dataset) + for (i in seq_len(nrow(contrast))) { + if (contrast[i, inapp_col] == 1 && sum(contrast[i, ]) == 1) { + contrast[i, ] <- 1 + } + } + attr(dataset, "contrast") <- contrast + dataset +} + +datasets <- lapply( + setNames(gap_names, gap_names), + function(nm) fitch_mode(inapplicable.phyData[[nm]]) +) + +# ---- Configurations ---- +configs <- list( + baseline = list(label = "baseline", desc = "default preset, no intra-fuse"), + intra_fuse = list(label = "intra_fuse", desc = "default preset + intraFuse=TRUE", + intraFuse = 
TRUE) +) + +seeds <- c(1L, 2L, 3L, 4L, 5L) # 5 seeds for better signal +total_runs <- length(configs) * length(datasets) * length(seeds) +cat(sprintf("Configs: %d, Datasets: %d, Seeds: %d -> %d total runs\n\n", + length(configs), length(datasets), length(seeds), total_runs)) + +# ---- TNT reference scores ---- +tnt_best <- c( + Conrad2008 = 1725, Geisler2001 = 1293, Wortley2006 = 479, + Zanol2014 = 1261, Zhu2013 = 624 +) + +# ---- Run experiments ---- +results <- data.frame( + dataset = character(), n_tips = integer(), n_chars = integer(), + config = character(), seed = integer(), timeout_s = integer(), + score = numeric(), n_trees = integer(), replicates = integer(), + hits = integer(), wall_s = numeric(), + tnt_best = numeric(), gap = numeric(), + stringsAsFactors = FALSE +) + +run_idx <- 0L +for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + cat(sprintf("--- Config: %s (%s) ---\n", cfg$label, cfg$desc)) + + for (ds_name in gap_names) { + ds <- datasets[[ds_name]] + ntip <- NTip(ds) + nchar <- sum(attr(ds, "weight")) + + for (seed in seeds) { + run_idx <- run_idx + 1L + cat(sprintf("[%d/%d] %s / %s / seed=%d ... 
", + run_idx, total_runs, cfg$label, ds_name, seed)) + + set.seed(seed) + + call_args <- list( + dataset = ds, + concavity = Inf, + maxReplicates = 96L, + targetHits = 5L, + maxSeconds = as.double(timeout_s), + strategy = "auto", + verbosity = 0L, + nThreads = 1L + ) + override_names <- setdiff(names(cfg), c("label", "desc")) + for (nm in override_names) { + call_args[[nm]] <- cfg[[nm]] + } + + t0 <- proc.time() + result <- tryCatch( + do.call(MaximizeParsimony, call_args), + error = function(e) { + warning("Error: ", ds_name, "/", cfg$label, ": ", conditionMessage(e)) + structure(list(), class = "multiPhylo", + score = NA_real_, pool_size = NA_integer_, + replicates = NA_integer_, hits_to_best = NA_integer_) + } + ) + wall_s <- as.double((proc.time() - t0)[3]) + + sc <- attr(result, "score") + tnt_ref <- tnt_best[ds_name] + gap <- if (!is.na(sc)) sc - tnt_ref else NA_real_ + + cat(sprintf("score=%s (gap=%s) in %.1fs (%d reps)\n", + if (is.na(sc)) "NA" else format(sc, nsmall = 0), + if (is.na(gap)) "NA" else sprintf("%+d", gap), + wall_s, + if (is.na(attr(result, "replicates"))) 0L + else attr(result, "replicates"))) + + results <- rbind(results, data.frame( + dataset = ds_name, n_tips = ntip, n_chars = nchar, + config = cfg$label, seed = seed, timeout_s = timeout_s, + score = sc, n_trees = length(result), + replicates = if (is.na(attr(result, "replicates"))) NA_integer_ + else attr(result, "replicates"), + hits = if (is.na(attr(result, "hits_to_best"))) NA_integer_ + else attr(result, "hits_to_best"), + wall_s = wall_s, + tnt_best = tnt_ref, gap = gap, + stringsAsFactors = FALSE + )) + } + } + cat("\n") +} + +# ---- Write results ---- +outfile <- file.path(output_dir, + sprintf("t258_intra_fuse_%ds_%s.csv", + timeout_s, + format(Sys.time(), "%Y%m%d_%H%M"))) +write.csv(results, outfile, row.names = FALSE) +cat(sprintf("\nResults saved to: %s\n", outfile)) + +# ---- Summary ---- +cat("\n=== Summary: Median score by config x dataset ===\n\n") +for (ds_name in 
gap_names) { + cat(sprintf(" %s (TNT best: %d)\n", ds_name, tnt_best[ds_name])) + for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + sub <- results[results$dataset == ds_name & results$config == cfg$label, ] + med_score <- median(sub$score, na.rm = TRUE) + med_gap <- median(sub$gap, na.rm = TRUE) + best_score <- min(sub$score, na.rm = TRUE) + med_reps <- median(sub$replicates, na.rm = TRUE) + cat(sprintf(" %-15s median=%7.1f (gap %+5.1f) best=%7.1f reps=%.0f\n", + cfg$label, med_score, med_gap, best_score, med_reps)) + } + cat("\n") +} + +cat(sprintf("\nFinished: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_large_preset.R b/dev/benchmarks/bench_large_preset.R new file mode 100644 index 000000000..93115cb58 --- /dev/null +++ b/dev/benchmarks/bench_large_preset.R @@ -0,0 +1,115 @@ +# bench_large_preset.R +# +# Validates the T-179 "large" strategy preset against "thorough" on the +# 180-taxon mbank_X30754 dataset. +# +# Run from package root: +# Rscript dev/benchmarks/bench_large_preset.R +# +# Results saved to dev/benchmarks/results_large_preset.csv + +.libPaths(c(".agent-X", .libPaths())) +library(TreeSearch) +library(TreeTools) + +SRC <- getwd() +source(file.path(SRC, "dev/benchmarks/bench_datasets.R")) +# Pull updated presets from source (no rebuild needed for pure-R changes) +source(file.path(SRC, "R/SearchControl.R")) +source(file.path(SRC, "R/MaximizeParsimony.R")) + +BUDGET_S <- 60 # 60s per run — allows ~1 replicate at 180 tips +SEEDS <- c(1031L, 2847L, 7193L, 4561L, 8822L) +OUT_FILE <- file.path(SRC, "dev/benchmarks/results_large_preset.csv") + +cat("TreeSearch version:", as.character(packageVersion("TreeSearch")), "\n") +cat(sprintf("Budget: %ds | Seeds: %d\n\n", BUDGET_S, length(SEEDS))) + +# Load 180-taxon dataset +large_ds_list <- load_large_benchmark_datasets() +ds_180 <- large_ds_list[["mbank_X30754"]] +if (is.null(ds_180)) stop("mbank_X30754 not found") +cat(sprintf("Dataset: mbank_X30754 | %d 
taxa | %d patterns\n\n", + ds_180$n_taxa, length(ds_180$weight))) + +# Use R-level SearchControl presets (sourced above) +presets <- .StrategyPresets() +conditions <- list( + large = unclass(presets[["large"]]), + thorough = unclass(presets[["thorough"]]) +) +conditions <- lapply(conditions, function(x) { attr(x, "class") <- NULL; x }) + +total_runs <- length(conditions) * length(SEEDS) +cat(sprintf("Total runs: %d conditions x %d seeds = %d\n\n", + length(conditions), length(SEEDS), total_runs)) + +rows <- list() +idx <- 0L + +for (cond_name in names(conditions)) { + strat <- conditions[[cond_name]] + for (seed in SEEDS) { + idx <- idx + 1L + cat(sprintf("[%d/%d] %-10s | seed %d ... ", + idx, total_runs, cond_name, seed)) + flush.console() + + t_start <- proc.time() + set.seed(seed) + result <- tryCatch( + do.call(TreeSearch:::ts_driven_search, + c(list(contrast = ds_180$contrast, + tip_data = ds_180$tip_data, + weight = ds_180$weight, + levels = ds_180$levels, + maxReplicates = 500L, + targetHits = max(10L, ds_180$n_taxa %/% 5L), + maxSeconds = as.double(BUDGET_S), + verbosity = 0L), + strat)), + error = function(e) { cat("ERROR:", conditionMessage(e), "\n"); NULL } + ) + wall_s <- as.double((proc.time() - t_start)[3]) + + if (is.null(result)) next + + cat(sprintf("score=%.0f reps=%d wall=%.1fs\n", + result$best_score, result$replicates, wall_s)) + + rows[[idx]] <- data.frame( + condition = cond_name, seed = seed, + best_score = result$best_score, + replicates = result$replicates, + hits_to_best = result$hits_to_best, + wall_s = wall_s, + stringsAsFactors = FALSE + ) + } +} + +results_df <- do.call(rbind, rows) +write.csv(results_df, OUT_FILE, row.names = FALSE) +cat("\nResults written to:", OUT_FILE, "\n") + +# Summary +cat("\n===== large vs thorough on mbank_X30754 (180 tips, 60s budget) =====\n") +cat(sprintf("%-12s %8s %8s %8s %8s\n", + "Condition", "Min", "Median", "Max", "Med.reps")) +for (cond in names(conditions)) { + r <- results_df[results_df$condition 
== cond & !is.na(results_df$best_score), ] + cat(sprintf("%-12s %8.0f %8.0f %8.0f %8.0f\n", + cond, min(r$best_score), median(r$best_score), + max(r$best_score), median(r$replicates))) +} + +# Per-seed comparison +cat("\nPer-seed comparison (large - thorough, negative = large better):\n") +for (s in SEEDS) { + lrg <- results_df$best_score[results_df$condition == "large" & results_df$seed == s] + thr <- results_df$best_score[results_df$condition == "thorough" & results_df$seed == s] + if (length(lrg) == 1 && length(thr) == 1) { + cat(sprintf(" seed %d: large=%4.0f thorough=%4.0f delta=%+.0f\n", + s, lrg, thr, lrg - thr)) + } +} diff --git a/dev/benchmarks/bench_memory.R b/dev/benchmarks/bench_memory.R new file mode 100644 index 000000000..91d19df61 --- /dev/null +++ b/dev/benchmarks/bench_memory.R @@ -0,0 +1,168 @@ +# Phase 3D: Memory layout profiling +# +# Measures TBR phase breakdown and scaling across tree sizes. +# Run with: source("dev/benchmarks/bench_memory.R") + +library(TreeSearch) +library(TreeTools) + +# --- Helper: prepare dataset args for Rcpp call --- +prep_ds <- function(dataset) { + at <- attributes(dataset) + contrast <- at$contrast + storage.mode(contrast) <- "double" + # phyDat stores data as list of integer vectors (one per taxon) + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + storage.mode(tip_data) <- "integer" + weight <- at$weight + levels <- at$levels + + # min_steps from contrast matrix + min_steps <- apply(contrast, 2, function(x) sum(x > 0)) - 1L + min_steps <- pmax(min_steps, 0L) + + list(contrast = contrast, tip_data = tip_data, weight = weight, + levels = levels, min_steps = min_steps) +} + +# --- Helper: get random tree edge matrix for n tips --- +make_tree_edge <- function(dataset) { + tree <- RandomTree(names(dataset), root = TRUE) + tree$edge +} + +# --- Helper: generate synthetic dataset --- +make_synthetic <- function(n_tips, n_chars = 200, na_prob = 0.1) { + tree <- 
RandomTree(n_tips, root = TRUE) + mat <- matrix( + sample(c("0", "1", "-"), n_tips * n_chars, replace = TRUE, + prob = c((1 - na_prob) / 2, (1 - na_prob) / 2, na_prob)), + n_tips, n_chars, + dimnames = list(tree$tip.label, NULL) + ) + MatrixToPhyDat(mat) +} + +# --- Benchmark one dataset --- +bench_one <- function(dataset, label, n_reps = 3) { + ds_args <- prep_ds(dataset) + edge <- make_tree_edge(dataset) + + results <- vector("list", n_reps) + for (i in seq_len(n_reps)) { + edge <- make_tree_edge(dataset) # different random tree each rep + results[[i]] <- TreeSearch:::ts_bench_tbr_phases( + edge, ds_args$contrast, ds_args$tip_data, + ds_args$weight, ds_args$levels, + ds_args$min_steps + ) + } + + # Average across reps + avg <- function(field) mean(vapply(results, `[[`, numeric(1), field)) + + data.frame( + label = label, + n_tips = results[[1]]$n_tips, + n_node = results[[1]]$n_node, + n_blocks = results[[1]]$n_blocks, + total_words = results[[1]]$total_words, + total_chars = results[[1]]$total_chars, + has_na = results[[1]]$has_na, + score = avg("score"), + n_clips = avg("n_clips"), + n_candidates = avg("n_candidates"), + # Timing (microseconds) + full_rescore_us = avg("time_full_rescore_us"), + clip_incr_us = avg("time_clip_incr_us"), + indirect_us = avg("time_indirect_us"), + unclip_us = avg("time_unclip_us"), + snap_save_us = avg("time_snapshot_save_us"), + snap_restore_us = avg("time_snapshot_restore_us"), + snap_bytes = avg("snapshot_bytes"), + stringsAsFactors = FALSE + ) +} + +# --- Run benchmarks --- +cat("=== Phase 3D Memory Layout Profiling ===\n\n") + +set.seed(7382) + +# Empirical datasets +cat("Benchmarking empirical datasets...\n") +data("inapplicable.phyData", package = "TreeSearch") + +empirical_results <- list() +for (name in c("Vinther2008", "Agnarsson2004")) { + cat(" ", name, "...\n") + empirical_results[[name]] <- bench_one( + inapplicable.phyData[[name]], name, n_reps = 3 + ) +} + +# Synthetic datasets of increasing size +cat("Benchmarking 
synthetic datasets...\n") +sizes <- c(20, 50, 100, 200) +synthetic_results <- list() +for (n in sizes) { + label <- paste0("synth_", n) + cat(" ", label, "...\n") + ds <- make_synthetic(n, n_chars = 200, na_prob = 0.1) + synthetic_results[[label]] <- bench_one(ds, label, n_reps = 3) +} + +# Combine results +all_results <- do.call(rbind, c(empirical_results, synthetic_results)) + +# --- Display --- +cat("\n=== Results ===\n\n") +print(all_results[, c("label", "n_tips", "n_blocks", "total_words", + "n_clips", "n_candidates")]) + +cat("\n=== Timing breakdown (microseconds, total across all clips) ===\n\n") +timing_cols <- c("label", "n_tips", "full_rescore_us", "clip_incr_us", + "indirect_us", "unclip_us", "snap_save_us", "snap_restore_us") +print(all_results[, timing_cols], digits = 3) + +# Compute fractions +cat("\n=== Time fractions (clip+incr / indirect / unclip) ===\n\n") +total_pass <- all_results$clip_incr_us + all_results$indirect_us + + all_results$unclip_us +fracs <- data.frame( + label = all_results$label, + n_tips = all_results$n_tips, + pct_clip_incr = round(100 * all_results$clip_incr_us / total_pass, 1), + pct_indirect = round(100 * all_results$indirect_us / total_pass, 1), + pct_unclip = round(100 * all_results$unclip_us / total_pass, 1), + snap_save_per_op_us = round(all_results$snap_save_us, 1), + snap_restore_per_op_us = round(all_results$snap_restore_us, 1), + snap_KB = round(all_results$snap_bytes / 1024, 1) +) +print(fracs) + +# Per-candidate timing +cat("\n=== Per-candidate indirect timing ===\n\n") +per_cand <- data.frame( + label = all_results$label, + n_tips = all_results$n_tips, + n_candidates = round(all_results$n_candidates), + indirect_us_total = round(all_results$indirect_us), + ns_per_candidate = round(1000 * all_results$indirect_us / + all_results$n_candidates, 1) +) +print(per_cand) + +# Scaling analysis +cat("\n=== Scaling analysis (synthetic datasets) ===\n\n") +synth <- all_results[grepl("synth", all_results$label), ] +if 
(nrow(synth) >= 3) { + fit <- lm(log(indirect_us) ~ log(n_tips), data = synth) + cat("Indirect time scaling exponent:", round(coef(fit)[2], 2), + "(expected ~2 for O(n^2))\n") + fit2 <- lm(log(n_candidates) ~ log(n_tips), data = synth) + cat("Candidate count scaling exponent:", round(coef(fit2)[2], 2), "\n") +} + +cat("\nDone.\n") diff --git a/dev/benchmarks/bench_nni_survey.R b/dev/benchmarks/bench_nni_survey.R new file mode 100644 index 000000000..14350b77f --- /dev/null +++ b/dev/benchmarks/bench_nni_survey.R @@ -0,0 +1,184 @@ +# NNI survey: measure batch-NNI feasibility +# +# For each dataset, builds Wagner trees and surveys all NNI candidates to +# count how many moves improve the score. This measures the theoretical +# payoff of batch/simultaneous NNI at different search stages. +# +# Usage: Rscript dev/benchmarks/bench_nni_survey.R LIB_PATH (LIB_PATH: library directory named "...-X" that holds the package TreeSearch.X) + +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else stop("Usage: Rscript bench_nni_survey.R LIB_PATH") +.libPaths(c(lib_path, .libPaths())) + +pkg_name <- basename(lib_path) +agent_letter <- sub(".*-", "", pkg_name) # text after the last "-" in the directory name +renamed <- paste0("TreeSearch.", agent_letter) +library(renamed, character.only = TRUE) +if (is.null(.Internal(getRegisteredNamespace("TreeSearch")))) + .Internal(registerNamespace("TreeSearch", asNamespace(renamed))) # lets the TreeSearch::: calls below resolve to the renamed package + +library(TreeTools) + +prepare_ts_data <- function(dataset) { + at <- attributes(dataset) + list( + contrast = at$contrast, + tip_data = matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(dataset) + ) +} + +build_wagner <- function(ds, seed) { + set.seed(seed) + TreeSearch:::ts_wagner_tree(ds$contrast, ds$tip_data, ds$weight, ds$levels) +} + +run_survey <- function(edge_mat, ds) { + TreeSearch:::ts_nni_survey( + edge_mat, ds$contrast, ds$tip_data, ds$weight, ds$levels + ) +} + +run_nni <- function(edge_mat, ds, maxHits = 20L) { + TreeSearch:::ts_nni_search( + edge_mat,
ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxHits = maxHits + ) +} + +analyze_survey <- function(survey) { + deltas <- survey$delta + n_candidates <- length(deltas) + n_improving <- sum(deltas < 0) + n_equal <- sum(deltas == 0) + + edge_ids <- survey$edge + best_per_edge <- tapply(deltas, edge_ids, min) + n_edges_improving <- sum(best_per_edge < 0) + + total_improvement <- -sum(deltas[deltas < 0]) + best_improvement <- if (n_improving > 0) -min(deltas) else 0L + + data.frame( + base_score = survey$base_score, + n_edges = survey$n_edges, + n_candidates = n_candidates, + n_improving = n_improving, + n_equal = n_equal, + n_edges_improving = n_edges_improving, + total_improvement = total_improvement, + best_single_improvement = best_improvement, + pct_edges_improving = round(100 * n_edges_improving / survey$n_edges, 1) + ) +} + +# All standard Fitch datasets (no inapplicable-dominant ones) +DATASETS <- c( + "Vinther2008", # 23 tips + "Griswold1999", # 43 tips + "Eklund2004", # 54 tips + "Agnarsson2004", # 62 tips + "Zhu2013", # 75 tips + "Giles2015", # 78 tips + "Dikow2009" # 88 tips +) + +SEEDS <- c(1742L, 5281L, 8093L, 3647L, 9210L) + +cat("=== NNI Survey: Batch-NNI Feasibility ===\n") +cat("Date:", format(Sys.time(), "%Y-%m-%d %H:%M"), "\n\n") + +all_wagner <- list() +all_converged <- list() + +for (nm in DATASETS) { + ds_raw <- TreeSearch::inapplicable.phyData[[nm]] + if (is.null(ds_raw)) { cat("SKIP:", nm, "\n"); next } + ds <- prepare_ts_data(ds_raw) + n_tips <- ds$n_taxa + + cat(sprintf("\n--- %s (%d tips, %d edges) ---\n", nm, n_tips, n_tips - 2L)) + + for (seed in SEEDS) { + # Stage 1: Wagner tree + wagner <- build_wagner(ds, seed) + survey_w <- run_survey(wagner$edge, ds) + info_w <- analyze_survey(survey_w) + info_w$dataset <- nm + info_w$n_tips <- n_tips + info_w$seed <- seed + info_w$stage <- "wagner" + + cat(sprintf(" seed=%d Wagner: score=%d, %d/%d edges improving (total delta=%d, best=%d)\n", + seed, as.integer(info_w$base_score), + 
info_w$n_edges_improving, info_w$n_edges, + info_w$total_improvement, info_w$best_single_improvement)) + + all_wagner <- c(all_wagner, list(info_w)) + + # Stage 2: After NNI convergence (maxHits=20, full plateau search) + nni_result <- run_nni(wagner$edge, ds, maxHits = 20L) + survey_c <- run_survey(nni_result$edge, ds) + info_c <- analyze_survey(survey_c) + info_c$dataset <- nm + info_c$n_tips <- n_tips + info_c$seed <- seed + info_c$stage <- "nni_converged" + info_c$nni_moves <- nni_result$n_moves + info_c$nni_iterations <- nni_result$n_iterations + + cat(sprintf(" NNI converged: score=%d (%d moves, %d iter), %d improving edges\n", + as.integer(info_c$base_score), + nni_result$n_moves, nni_result$n_iterations, + info_c$n_edges_improving)) + + all_converged <- c(all_converged, list(info_c)) + } +} + +wagner_df <- do.call(rbind, all_wagner) +converged_df <- do.call(rbind, all_converged) + +cat("\n\n========================================\n") +cat("=== SUMMARY: Wagner Tree Surveys ===\n") +cat("========================================\n\n") + +for (nm in unique(wagner_df$dataset)) { + sub <- wagner_df[wagner_df$dataset == nm, ] + csub <- converged_df[converged_df$dataset == nm, ] + cat(sprintf("%s (%d tips, %d NNI edges):\n", nm, sub$n_tips[1], sub$n_edges[1])) + cat(sprintf(" Wagner scores: %d-%d (median %d)\n", + min(as.integer(sub$base_score)), + max(as.integer(sub$base_score)), + as.integer(median(sub$base_score)))) + cat(sprintf(" Improving edges: %d-%d (median %.0f, %.0f%% of edges)\n", + min(sub$n_edges_improving), max(sub$n_edges_improving), + median(sub$n_edges_improving), + median(sub$pct_edges_improving))) + cat(sprintf(" Total delta: %d-%d steps (median %d)\n", + min(sub$total_improvement), max(sub$total_improvement), + as.integer(median(sub$total_improvement)))) + cat(sprintf(" Best single move: %d-%d steps\n", + min(sub$best_single_improvement), + max(sub$best_single_improvement))) + cat(sprintf(" NNI-converged: score %d-%d (%d-%d moves)\n\n", + 
min(as.integer(csub$base_score)), + max(as.integer(csub$base_score)), + min(csub$nni_moves), max(csub$nni_moves))) +} + +cat("\n=== Key Finding: Batch Size (improving edges on Wagner trees) ===\n") +cat(sprintf("%-15s %5s %10s %10s %10s %10s\n", + "Dataset", "Tips", "Med.Batch", "Max.Batch", "%Edges", "Med.Delta")) +for (nm in unique(wagner_df$dataset)) { + sub <- wagner_df[wagner_df$dataset == nm, ] + cat(sprintf("%-15s %5d %10.0f %10d %9.0f%% %10d\n", + nm, sub$n_tips[1], + median(sub$n_edges_improving), + max(sub$n_edges_improving), + median(sub$pct_edges_improving), + as.integer(median(sub$total_improvement)))) +} diff --git a/dev/benchmarks/bench_outer_cycles.R b/dev/benchmarks/bench_outer_cycles.R new file mode 100644 index 000000000..ac0f56da2 --- /dev/null +++ b/dev/benchmarks/bench_outer_cycles.R @@ -0,0 +1,163 @@ +# bench_outer_cycles.R +# +# Compares thorough preset with outerCycles=1 vs outerCycles=2 across all 14 +# standard benchmark datasets. Uses 3 seeds x 20s time budget per condition. 
+# +# Run from package root via: +# Rscript dev/benchmarks/bench_outer_cycles.R +# +# Results saved to dev/benchmarks/results_outer_cycles.csv + +.libPaths(c(".agent-X", .libPaths())) +library(TreeSearch) +library(TreeTools) + +SRC <- getwd() +source(file.path(SRC, "dev/benchmarks/bench_datasets.R")) +source(file.path(SRC, "dev/benchmarks/bench_framework.R")) + +BUDGET_S <- 20 +SEEDS <- c(1031L, 2847L, 7193L) +OUT_FILE <- file.path(SRC, "dev/benchmarks/results_outer_cycles.csv") + +cat("TreeSearch version:", as.character(packageVersion("TreeSearch")), "\n") +cat(sprintf("Budget: %ds | Seeds: %d\n", BUDGET_S, length(SEEDS))) + +# Build thorough strategy base (matches get_strategy("thorough") in bench_framework.R) +thorough_base <- list( + wagnerStarts = 3L, + tbrMaxHits = 3L, + tabuSize = 200L, + ratchetCycles = 20L, + ratchetPerturbProb = 0.25, + ratchetPerturbMode = 2L, + ratchetPerturbMaxMoves = 5L, + ratchetAdaptive = TRUE, + driftCycles = 12L, + driftAfdLimit = 5L, + driftRfdLimit = 0.15, + xssRounds = 5L, + xssPartitions = 6L, + rssRounds = 3L, + cssRounds = 2L, + cssPartitions = 6L, + sectorMinSize = 6L, + sectorMaxSize = 80L, + fuseInterval = 2L, + fuseAcceptEqual = TRUE, + nniFirst = TRUE, + sprFirst = FALSE, + consensusStableReps = 3L +) + +conditions <- list( + thorough_1 = c(thorough_base, list(outerCycles = 1L)), + thorough_2 = c(thorough_base, list(outerCycles = 2L)) +) + +datasets <- load_benchmark_datasets() +cat("Datasets loaded:", length(datasets), "\n\n") + +total_runs <- length(BENCHMARK_NAMES) * length(conditions) * length(SEEDS) +cat(sprintf("Total runs: %d x %d conditions x %d seeds = %d\n\n", + length(BENCHMARK_NAMES), length(conditions), length(SEEDS), total_runs)) + +rows <- list() +idx <- 0L + +for (ds_name in BENCHMARK_NAMES) { + ds <- datasets[[ds_name]] + if (is.null(ds)) { warning("Skipping ", ds_name); next } + + for (cond_name in names(conditions)) { + strat <- conditions[[cond_name]] + + for (seed in SEEDS) { + idx <- idx + 1L + 
cat(sprintf("[%3d/%d] %-14s | %-12s | seed %d ... ", + idx, total_runs, ds_name, cond_name, seed)) + flush.console() + + t_start <- proc.time() + set.seed(seed) + result <- tryCatch( + do.call(TreeSearch:::ts_driven_search, + c(list(contrast = ds$contrast, + tip_data = ds$tip_data, + weight = ds$weight, + levels = ds$levels, + maxReplicates = 200L, + targetHits = max(10L, ds$n_taxa %/% 5L), + maxSeconds = as.double(BUDGET_S), + verbosity = 0L), + strat)), + error = function(e) { + cat("ERROR:", conditionMessage(e), "\n"); NULL + } + ) + wall_s <- as.double((proc.time() - t_start)[3]) + + if (is.null(result)) { + rows[[idx]] <- data.frame( + dataset = ds_name, condition = cond_name, seed = seed, + n_taxa = ds$n_taxa, best_score = NA_real_, + replicates = NA_integer_, hits_to_best = NA_integer_, + wall_s = wall_s, stringsAsFactors = FALSE + ) + next + } + + cat(sprintf("score=%.0f reps=%d wall=%.1fs\n", + result$best_score, result$replicates, wall_s)) + + rows[[idx]] <- data.frame( + dataset = ds_name, + condition = cond_name, + seed = seed, + n_taxa = ds$n_taxa, + best_score = result$best_score, + replicates = result$replicates, + hits_to_best = result$hits_to_best, + wall_s = wall_s, + stringsAsFactors = FALSE + ) + } + } +} + +results_df <- do.call(rbind, rows) +write.csv(results_df, OUT_FILE, row.names = FALSE) +cat("\nResults written to:", OUT_FILE, "\n") + +# Quick summary +library(dplyr) +summary_tbl <- results_df |> + filter(!is.na(best_score)) |> + group_by(dataset, n_taxa, condition) |> + summarise(median_score = median(best_score), + median_reps = median(replicates), + .groups = "drop") |> + tidyr::pivot_wider(names_from = condition, + values_from = c(median_score, median_reps)) |> + mutate(delta = median_score_thorough_2 - median_score_thorough_1) |> + arrange(n_taxa) + +cat("\n===== outerCycles=2 vs outerCycles=1 (lower score = better) =====\n") +cat(sprintf("%-16s %5s %8s %8s %6s %5s %5s\n", + "Dataset", "Tips", "OC1_score", "OC2_score", "Delta", + 
"OC1_reps", "OC2_reps")) +cat(strrep("-", 68), "\n") +for (i in seq_len(nrow(summary_tbl))) { + r <- summary_tbl[i, ] + cat(sprintf("%-16s %5d %8.0f %8.0f %+6.1f %5.0f %5.0f\n", + r$dataset, r$n_taxa, + r$median_score_thorough_1, r$median_score_thorough_2, + r$delta, + r$median_reps_thorough_1, r$median_reps_thorough_2)) +} +improved <- sum(summary_tbl$delta < -0.5, na.rm = TRUE) +unchanged <- sum(abs(summary_tbl$delta) <= 0.5, na.rm = TRUE) +worse <- sum(summary_tbl$delta > 0.5, na.rm = TRUE) +cat(strrep("-", 68), "\n") +cat(sprintf("Improved: %d Unchanged: %d Worse: %d\n", + improved, unchanged, worse)) diff --git a/dev/benchmarks/bench_pr_stage2_mbank.R b/dev/benchmarks/bench_pr_stage2_mbank.R new file mode 100644 index 000000000..1178653d7 --- /dev/null +++ b/dev/benchmarks/bench_pr_stage2_mbank.R @@ -0,0 +1,212 @@ +#!/usr/bin/env Rscript +# T-289c: Prune-reinsert Stage 2 — mbank_X30754 (180t) only, Brazeau scoring +# +# DESIGNED FOR HAMILTON HPC. Do not run locally. +# +# Stage 1 (13 configs × 5 datasets × 5 seeds × 30s) showed: +# - ≤88t: PR is net-negative (replicate cost >> score gain). No further testing. +# - 180t: Real signal. Best configs by mean delta vs baseline: +# pr_c3_d10: −8.0 (4/5 seeds), pr_c5_d10: −6.6 (5/5 seeds, most consistent) +# pr_c5_d05: −6.8 (4/5), pr_c3_d05: −4.8 (3/5) +# pr_c1_d10: −2.8 (3/5) — weak but cheap +# d≥20% with c≥3 rarely completes a replicate in 30s. +# +# Stage 2 goals: +# 1. Confirm signal at 60s (≥2 completed replicates per seed). +# 2. Narrow to best cycle/drop combination. +# 3. Test selection=1 (greedy insertion) for top-2 configs. NOTE(review): bench_pr_stage3_mbank.R describes selection 1 as INSTABILITY-weighted dropping; confirm which wording is right. +# +# Configs tested (7 + baseline = 8 total): +# baseline, pr_c1_d10, +# pr_c3_d05, pr_c3_d10, pr_c3_d10_sel1, +# pr_c5_d05, pr_c5_d10, pr_c5_d10_sel1 +# +# Grid: 8 configs × 1 dataset × 10 seeds × 60s ≈ 80 min wall time. +# +# Usage: +# Rscript bench_pr_stage2_mbank.R [timeout_s] [output_dir] +# timeout_s: search budget in seconds. Default: 60 +# output_dir: where to write CSV.
Default: "." +# +# Output: t289c_stage2_{timeout}s.csv + +library(TreeSearch) +library(TreeTools) + +args <- commandArgs(trailingOnly = TRUE) +timeout_s <- if (length(args) >= 1) as.integer(args[1]) else 60L +output_dir <- if (length(args) >= 2) args[2] else "." + +cat("=== T-289c: Prune-Reinsert Stage 2 (mbank, Brazeau) ===\n") +cat(sprintf("Timeout: %ds | TreeSearch %s\n", timeout_s, + packageVersion("TreeSearch"))) +cat(sprintf("Output: %s\n", output_dir)) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Load 180-tip dataset ---- +mbank_path <- Sys.glob("/nobackup/*/TreeSearch-a/dev/benchmarks/mbank_X30754.nex") +if (length(mbank_path) == 0) { + mbank_path <- file.path(dirname(dirname(dirname(getwd()))), + "TreeSearch-a", "dev", "benchmarks", "mbank_X30754.nex") +} +if (length(mbank_path) > 0) mbank_path <- mbank_path[1] +if (!file.exists(mbank_path)) stop("mbank_X30754.nex not found") +cat("Loading:", mbank_path, "\n") +ds <- ReadAsPhyDat(mbank_path) +cat(sprintf(" %d taxa, %d patterns\n\n", length(ds), sum(attr(ds, "weight")))) + +seeds <- 1:10 + +# ---- Config grid ---- +# +# Stage 1 top performers (all random selection, pr_selection=0): +# pr_c3_d10: mean delta −8.0, 4/5 seeds improved +# pr_c5_d10: mean delta −6.6, 5/5 seeds improved ← most consistent +# pr_c5_d05: mean delta −6.8, 4/5 +# pr_c3_d05: mean delta −4.8, 3/5 +# pr_c1_d10: mean delta −2.8, 3/5 — cheap reference +# +# Also test selection=1 (greedy insertion) for the top-2 configs. 
+configs <- list( + baseline = list( + label = "baseline", + desc = "No prune-reinsert (auto preset)", + pr_cycles = 0L, pr_drop = 0.0, pr_selection = 0L + ), + pr_c1_d10 = list( + label = "pr_c1_d10", + desc = "PR 1 cycle, 10% drop, random", + pr_cycles = 1L, pr_drop = 0.10, pr_selection = 0L + ), + pr_c3_d05 = list( + label = "pr_c3_d05", + desc = "PR 3 cycles, 5% drop, random", + pr_cycles = 3L, pr_drop = 0.05, pr_selection = 0L + ), + pr_c3_d10 = list( + label = "pr_c3_d10", + desc = "PR 3 cycles, 10% drop, random", + pr_cycles = 3L, pr_drop = 0.10, pr_selection = 0L + ), + pr_c3_d10_sel1 = list( + label = "pr_c3_d10_sel1", + desc = "PR 3 cycles, 10% drop, greedy insertion", + pr_cycles = 3L, pr_drop = 0.10, pr_selection = 1L + ), + pr_c5_d05 = list( + label = "pr_c5_d05", + desc = "PR 5 cycles, 5% drop, random", + pr_cycles = 5L, pr_drop = 0.05, pr_selection = 0L + ), + pr_c5_d10 = list( + label = "pr_c5_d10", + desc = "PR 5 cycles, 10% drop, random", + pr_cycles = 5L, pr_drop = 0.10, pr_selection = 0L + ), + pr_c5_d10_sel1 = list( + label = "pr_c5_d10_sel1", + desc = "PR 5 cycles, 10% drop, greedy insertion", + pr_cycles = 5L, pr_drop = 0.10, pr_selection = 1L + ) +) + +total_runs <- length(configs) * length(seeds) +cat(sprintf("Configs: %d, Seeds: %d -> %d total runs\n\n", + length(configs), length(seeds), total_runs)) + +# ---- Run experiments ---- +results <- data.frame( + dataset = character(), n_tips = integer(), n_patterns = integer(), + config = character(), seed = integer(), timeout_s = integer(), + score = numeric(), n_trees = integer(), replicates = integer(), + hits = integer(), wall_s = numeric(), + pr_cycles = integer(), pr_drop = numeric(), pr_selection = integer(), + stringsAsFactors = FALSE +) + +ntip <- length(ds) +npat <- sum(attr(ds, "weight")) +run_idx <- 0L + +for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + cat(sprintf("\n--- %s: %s ---\n", cfg$label, cfg$desc)) + + for (s in seeds) { + run_idx <- run_idx + 1L + 
cat(sprintf(" [%d/%d] seed=%d ... ", run_idx, total_runs, s)) + + set.seed(s) + t0 <- proc.time() + + tryCatch({ + if (cfg$pr_cycles == 0L) { + res <- MaximizeParsimony( + ds, + maxSeconds = timeout_s, + strategy = "auto", + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + } else { + res <- MaximizeParsimony( + ds, + maxSeconds = timeout_s, + strategy = "auto", + pruneReinsertCycles = cfg$pr_cycles, + pruneReinsertDrop = cfg$pr_drop, + pruneReinsertSelection = cfg$pr_selection, + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + } + + elapsed <- (proc.time() - t0)[3] + best_score <- attr(res, "score") + n_trees <- length(res) + reps <- attr(res, "replicates") + hits <- attr(res, "hits") + + cat(sprintf("score=%g, reps=%d, %.1fs\n", best_score, reps, elapsed)) + + results <- rbind(results, data.frame( + dataset = "mbank_X30754", n_tips = ntip, n_patterns = npat, + config = cfg$label, seed = s, timeout_s = timeout_s, + score = best_score, n_trees = n_trees, replicates = reps, + hits = hits, wall_s = elapsed, + pr_cycles = cfg$pr_cycles, pr_drop = cfg$pr_drop, + pr_selection = cfg$pr_selection, + stringsAsFactors = FALSE + )) + }, error = function(e) { + cat(sprintf("ERROR: %s\n", conditionMessage(e))) + }) + } + + # Save after each config (crash recovery) + outfile <- file.path(output_dir, + sprintf("t289c_stage2_%ds.csv", timeout_s)) + write.csv(results, outfile, row.names = FALSE) +} + +# ---- Save final ---- +outfile <- file.path(output_dir, sprintf("t289c_stage2_%ds.csv", timeout_s)) +write.csv(results, outfile, row.names = FALSE) +cat(sprintf("\n=== Results written to %s (%d rows) ===\n", + outfile, nrow(results))) + +# ---- Quick summary ---- +cat("\n--- Mean scores by config ---\n") +agg <- aggregate(score ~ config, data = results, FUN = mean) +bl <- agg$score[agg$config == "baseline"] +agg$delta <- round(agg$score - bl, 2) +agg <- 
agg[order(agg$delta), ] +print(agg, row.names = FALSE) + +cat(sprintf("\nCompleted: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_pr_stage3_mbank.R b/dev/benchmarks/bench_pr_stage3_mbank.R new file mode 100644 index 000000000..9eff3c5f6 --- /dev/null +++ b/dev/benchmarks/bench_pr_stage3_mbank.R @@ -0,0 +1,192 @@ +#!/usr/bin/env Rscript +# T-289d: Prune-reinsert Stage 3 — new drop criteria (MISSING, COMBINED) +# +# DESIGNED FOR HAMILTON HPC. Do not run locally. +# +# Stage 2 (9 configs x 10 seeds x 60s, mbank_X30754) established: +# - All PR configs improve over baseline at 180t. +# - Instability-weighted dropping (sel=1) beats random (sel=0) by 1.8–3.3 steps. +# - pr_c5_d05 (−12.3 steps, 3.0 reps) best cost-quality ratio at sel=0. +# - pr_c5_d10_sel1 (−14.1 steps, 2.2 reps) best overall. +# - Gap: pr_c5_d05_sel1 not tested. +# +# Stage 3 goals: +# 1. Fill gap: pr_c5_d05_sel1 (instability-weighted at cheapest good config). +# 2. Benchmark new criteria: MISSING (sel=2), COMBINED (sel=3) at d05 and d10. +# 3. Reference repeats: baseline + pr_c5_d05_sel0 + pr_c5_d10_sel1 for +# within-run comparability (avoids cross-run seed variance). +# +# Grid: 8 configs × 1 dataset × 10 seeds × 60s ≈ 87 min wall time. +# +# Drop criteria (pruneReinsertSelection): +# 0 = RANDOM uniform random +# 1 = INSTABILITY weighted by positional instability in pool +# 2 = MISSING weighted by ambiguous/inapplicable character count +# 3 = COMBINED instability × (1 + normalised missingness) +# +# Usage: +# Rscript bench_pr_stage3_mbank.R [timeout_s] [output_dir] + +library(TreeSearch) +library(TreeTools) + +args <- commandArgs(trailingOnly = TRUE) +timeout_s <- if (length(args) >= 1) as.integer(args[1]) else 60L +output_dir <- if (length(args) >= 2) args[2] else "." 
+ +cat("=== T-289d: Prune-Reinsert Stage 3 (new criteria, mbank, Brazeau) ===\n") +cat(sprintf("Timeout: %ds | TreeSearch %s\n", timeout_s, + packageVersion("TreeSearch"))) +cat(sprintf("Output: %s\n", output_dir)) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Load 180-tip dataset ---- +mbank_path <- Sys.glob("/nobackup/*/TreeSearch-a/dev/benchmarks/mbank_X30754.nex") +if (length(mbank_path) == 0) { + mbank_path <- file.path(dirname(dirname(dirname(getwd()))), + "TreeSearch-a", "dev", "benchmarks", "mbank_X30754.nex") +} +if (length(mbank_path) > 0) mbank_path <- mbank_path[1] +if (!file.exists(mbank_path)) stop("mbank_X30754.nex not found") +cat("Loading:", mbank_path, "\n") +ds <- ReadAsPhyDat(mbank_path) +cat(sprintf(" %d taxa, %d patterns\n\n", length(ds), sum(attr(ds, "weight")))) + +seeds <- 1:10 + +# ---- Config grid ---- +# +# Notation: pr_c{cycles}_d{drop%}_sel{selection} +# References from Stage 2 included for within-run comparability. +configs <- list( + baseline = list( + label = "baseline", desc = "No prune-reinsert", + pr_cycles = 0L, pr_drop = 0.0, pr_selection = 0L + ), + # --- d=5%, c=5: cheapest good config from Stage 2 --- + pr_c5_d05_sel0 = list( + label = "pr_c5_d05_sel0", desc = "c5 d5% random (Stage2 ref)", + pr_cycles = 5L, pr_drop = 0.05, pr_selection = 0L + ), + pr_c5_d05_sel1 = list( + label = "pr_c5_d05_sel1", desc = "c5 d5% instability (gap)", + pr_cycles = 5L, pr_drop = 0.05, pr_selection = 1L + ), + pr_c5_d05_sel2 = list( + label = "pr_c5_d05_sel2", desc = "c5 d5% missing (new)", + pr_cycles = 5L, pr_drop = 0.05, pr_selection = 2L + ), + pr_c5_d05_sel3 = list( + label = "pr_c5_d05_sel3", desc = "c5 d5% combined (new)", + pr_cycles = 5L, pr_drop = 0.05, pr_selection = 3L + ), + # --- d=10%, c=5: Stage 2 overall winner config --- + pr_c5_d10_sel1 = list( + label = "pr_c5_d10_sel1", desc = "c5 d10% instability (Stage2 ref)", + pr_cycles = 5L, pr_drop = 0.10, pr_selection = 1L + ), + pr_c5_d10_sel2 
= list( + label = "pr_c5_d10_sel2", desc = "c5 d10% missing (new)", + pr_cycles = 5L, pr_drop = 0.10, pr_selection = 2L + ), + pr_c5_d10_sel3 = list( + label = "pr_c5_d10_sel3", desc = "c5 d10% combined (new)", + pr_cycles = 5L, pr_drop = 0.10, pr_selection = 3L + ) +) + +total_runs <- length(configs) * length(seeds) +cat(sprintf("Configs: %d, Seeds: %d -> %d total runs\n\n", + length(configs), length(seeds), total_runs)) + +# ---- Run experiments ---- +results <- data.frame( + dataset = character(), n_tips = integer(), n_patterns = integer(), + config = character(), seed = integer(), timeout_s = integer(), + score = numeric(), n_trees = integer(), replicates = integer(), + hits = integer(), wall_s = numeric(), + pr_cycles = integer(), pr_drop = numeric(), pr_selection = integer(), + stringsAsFactors = FALSE +) + +ntip <- length(ds) +npat <- sum(attr(ds, "weight")) +run_idx <- 0L +outfile <- file.path(output_dir, sprintf("t289d_stage3_%ds.csv", timeout_s)) + +for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + cat(sprintf("\n--- %s: %s ---\n", cfg$label, cfg$desc)) + + for (s in seeds) { + run_idx <- run_idx + 1L + cat(sprintf(" [%d/%d] seed=%d ... 
", run_idx, total_runs, s)) + + set.seed(s) + t0 <- proc.time() + + tryCatch({ + if (cfg$pr_cycles == 0L) { + res <- MaximizeParsimony( + ds, + maxSeconds = timeout_s, + strategy = "auto", + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + } else { + res <- MaximizeParsimony( + ds, + maxSeconds = timeout_s, + strategy = "auto", + pruneReinsertCycles = cfg$pr_cycles, + pruneReinsertDrop = cfg$pr_drop, + pruneReinsertSelection = cfg$pr_selection, + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + } + + elapsed <- (proc.time() - t0)[3] + best_score <- attr(res, "score") + reps <- attr(res, "replicates") + hits <- attr(res, "hits") + + cat(sprintf("score=%g, reps=%d, %.1fs\n", best_score, reps, elapsed)) + + results <- rbind(results, data.frame( + dataset = "mbank_X30754", n_tips = ntip, n_patterns = npat, + config = cfg$label, seed = s, timeout_s = timeout_s, + score = best_score, n_trees = length(res), replicates = reps, + hits = hits, wall_s = elapsed, + pr_cycles = cfg$pr_cycles, pr_drop = cfg$pr_drop, + pr_selection = cfg$pr_selection, + stringsAsFactors = FALSE + )) + }, error = function(e) { + cat(sprintf("ERROR: %s\n", conditionMessage(e))) + }) + } + + # Save after each config (crash recovery) + write.csv(results, outfile, row.names = FALSE) +} + +write.csv(results, outfile, row.names = FALSE) +cat(sprintf("\n=== Results written to %s (%d rows) ===\n", outfile, nrow(results))) + +# ---- Quick summary ---- +cat("\n--- Mean delta vs baseline ---\n") +bl_mean <- mean(results$score[results$config == "baseline"]) +agg <- aggregate(score ~ config + pr_selection, data = results, FUN = mean) +agg$delta <- round(agg$score - bl_mean, 2) +agg <- agg[order(agg$delta), ] +print(agg, row.names = FALSE) + +cat(sprintf("\nCompleted: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_pr_stage4_validation.R 
b/dev/benchmarks/bench_pr_stage4_validation.R new file mode 100644 index 000000000..6199f03c2 --- /dev/null +++ b/dev/benchmarks/bench_pr_stage4_validation.R @@ -0,0 +1,161 @@ +# bench_pr_stage4_validation.R +# +# T-289e: Prune-reinsert Stage 4 — multi-dataset validation at large-tree scale +# +# DESIGNED FOR HAMILTON HPC. Do not run locally. +# +# Stage 3 (mbank_X30754, 180t, 10 seeds, 60s) confirmed: +# MISSING criterion (sel=2) best or tied at d=5% and d=10%: +# pr_c5_d05_sel2: mean delta -14.7 (SE 5.9, 3.0 reps) +# PR enabled in large preset: c=5, d=5%, sel=MISSING +# +# Stage 4 goals: +# 1. Verify PR benefit generalises across 5 independent large matrices +# spanning 131-206 tips. +# 2. Check whether benefit persists or baseline catches up at 120s budget. +# +# Datasets (all training-split MorphoBank): +# mbank_X30754: 180t, 425p, 20% inapp (anchor — Stage 2/3 calibration) +# project4133: 131t, 349p, 6% inapp +# project3701: 146t, 324p, 15% inapp +# project804: 173t, 569p, 31% inapp +# syab07205: 206t, 748p, 4% inapp +# +# Configs (2): +# baseline: large preset, pruneReinsertCycles = 0 (no PR) +# pr_large: large preset, pruneReinsertCycles = 5, drop = 0.05, sel = MISSING (2) +# +# Grid: 5 datasets × 2 configs × 2 budgets × 10 seeds = 200 runs +# Expected wall time: ~5-6h on a single Hamilton node. +# +# Usage: +# Rscript bench_pr_stage4_validation.R [output_dir] +# output_dir: where to write CSV. Default: "." +# +# Output: t289e_stage4_validation.csv + +suppressPackageStartupMessages({ + library(TreeSearch) + library(TreeTools) +}) + +args <- commandArgs(trailingOnly = TRUE) +output_dir <- if (length(args) >= 1) args[1] else "."
+ +cat("=== T-289e: Prune-Reinsert Stage 4 Validation ===\n") +cat(sprintf("TreeSearch %s\n", packageVersion("TreeSearch"))) +cat(sprintf("Output: %s\n", output_dir)) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Dataset definitions (cluster glob first, then a path three levels above getwd()) ---- +neotrans_dir <- Sys.glob("/nobackup/*/neotrans/inst/matrices") +if (length(neotrans_dir) == 0) { + # Fallback: sibling of TreeSearch-a + neotrans_dir <- file.path(dirname(dirname(dirname(getwd()))), + "neotrans", "inst", "matrices") +} +neotrans_dir <- neotrans_dir[1] +if (!dir.exists(neotrans_dir)) stop("neotrans matrices directory not found: ", neotrans_dir) + +mbank_path <- Sys.glob("/nobackup/*/TreeSearch-a/dev/benchmarks/mbank_X30754.nex") +if (length(mbank_path) == 0) { + mbank_path <- file.path(dirname(dirname(dirname(getwd()))), + "TreeSearch-a", "dev", "benchmarks", "mbank_X30754.nex") +} +mbank_path <- mbank_path[1] +if (!file.exists(mbank_path)) stop("mbank_X30754.nex not found") + +dataset_defs <- list( + list(key = "mbank_X30754", path = mbank_path), + list(key = "project4133", path = file.path(neotrans_dir, "project4133.nex")), + list(key = "project3701", path = file.path(neotrans_dir, "project3701.nex")), + list(key = "project804", path = file.path(neotrans_dir, "project804.nex")), + list(key = "syab07205", path = file.path(neotrans_dir, "syab07205.nex")) +) + +# ---- Config grid (SearchControl objects, passed to MaximizeParsimony via control=) ---- +sc_baseline <- SearchControl(pruneReinsertCycles = 0L) +sc_pr_large <- SearchControl( + pruneReinsertCycles = 5L, + pruneReinsertDrop = 0.05, + pruneReinsertSelection = 2L +) + +configs <- list( + baseline = sc_baseline, + pr_large = sc_pr_large +) + +budgets <- c(60L, 120L) +seeds <- 1:10 + +# ---- Output (CSV appended row by row; text fields use plain double quotes so read.csv() strips them, as with write.csv() in Stages 2-3) ---- +out_file <- file.path(output_dir, "t289e_stage4_validation.csv") +out_cols <- c("dataset","n_tips","n_patterns","config","seed","timeout_s", + "score","n_trees","replicates","hits","wall_s", + "pr_cycles","pr_drop","pr_selection") +write(paste(dQuote(out_cols, q = FALSE), collapse = ","),
out_file) + +total_runs <- length(dataset_defs) * length(configs) * length(budgets) * length(seeds) +cat(sprintf("Total runs: %d\n\n", total_runs)) +run_i <- 0L + +for (ddef in dataset_defs) { + cat(sprintf("--- Loading: %s ---\n", ddef$key)) + ds <- tryCatch(ReadAsPhyDat(ddef$path), error = function(e) { + cat(sprintf(" ERROR loading %s: %s\n", ddef$key, e$message)) + NULL + }) + if (is.null(ds)) next + n_tips <- length(ds) + n_patterns <- sum(attr(ds, "weight")) + cat(sprintf(" %d taxa, %d patterns\n\n", n_tips, n_patterns)) + + for (budget in budgets) { + for (cfg_name in names(configs)) { + sc <- configs[[cfg_name]] + for (seed in seeds) { + run_i <- run_i + 1L + cat(sprintf("[%d/%d] %s | %s | budget=%ds | seed=%d ... ", + run_i, total_runs, ddef$key, cfg_name, budget, seed)) + t0 <- proc.time()[["elapsed"]] + + res <- tryCatch( + MaximizeParsimony( + dataset = ds, + maxSeconds = budget, + nThreads = 2L, + seed = seed, + verbosity = 0L, + control = sc + ), + error = function(e) { + cat(sprintf("ERROR: %s\n", e$message)) + NULL + } + ) + + wall_s <- proc.time()[["elapsed"]] - t0 + if (is.null(res)) next # failed runs get no CSV row + + score <- attr(res, "score") + n_trees <- length(res) + replicates <- attr(res, "replicates") + hits <- attr(res, "hits") + + pr_cycles <- if (!is.null(sc$pruneReinsertCycles)) sc$pruneReinsertCycles else 0L + pr_drop <- if (!is.null(sc$pruneReinsertDrop)) sc$pruneReinsertDrop else 0.05 + pr_sel <- if (!is.null(sc$pruneReinsertSelection)) sc$pruneReinsertSelection else 0L + + row <- sprintf('%s,%d,%d,%s,%d,%d,%g,%d,%d,%d,%.3f,%d,%.2f,%d', + dQuote(ddef$key, q = FALSE), n_tips, n_patterns, dQuote(cfg_name, q = FALSE), + seed, budget, score, n_trees, replicates, hits, wall_s, + pr_cycles, pr_drop, pr_sel) + write(row, out_file, append = TRUE) + cat(sprintf("score=%g reps=%d wall=%.1fs\n", score, replicates, wall_s)) + } + } + } +} + +cat(sprintf("\nDone.
%s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_pr_stage5_nni.R b/dev/benchmarks/bench_pr_stage5_nni.R new file mode 100644 index 000000000..cecccf520 --- /dev/null +++ b/dev/benchmarks/bench_pr_stage5_nni.R @@ -0,0 +1,163 @@ +# bench_pr_stage5_nni.R +# +# T-289f: Prune-reinsert Stage 5 — NNI full-tree polish cost reduction +# +# DESIGNED FOR HAMILTON HPC. Do not run locally. +# +# Stage 4 conclusion: PR (TBR full polish) is disqualified for the large preset +# at 60s budget because per-cycle cost is too high (~60s at 206 tips, leaving +# 0 replicates). Stage 5 asks whether NNI full-tree polish (pruneReinsertNni=TRUE, +# ~5x cheaper at large n) restores PR's value. +# +# Hypothesis: PR's benefit comes from topological displacement, not from the +# quality of post-reinsert local search. NNI reaches a local optimum sufficient +# to identify improvements; outer-loop TBR then polishes to full convergence. +# +# Three configs: +# baseline: large preset, pruneReinsertCycles=0 (no PR) +# pr_nni: large preset, c=5, d=5%, MISSING, NNI=TRUE (new cheap option) +# pr_tbr: large preset, c=5, d=5%, MISSING, NNI=FALSE (Stage 4 reference) +# +# Same 5 datasets as Stage 4 (131-206 tips, training-split MorphoBank). +# +# Grid: 5 datasets x 3 configs x 2 budgets x 10 seeds = 300 runs +# Expected wall time: ~4-6h (pr_nni ~5x faster than pr_tbr). +# +# Usage: +# Rscript bench_pr_stage5_nni.R [output_dir] +# output_dir: where to write CSV. Default: "." +# +# Output: t289f_pr_nni_polish.csv + +suppressPackageStartupMessages({ + library(TreeSearch) + library(TreeTools) +}) + +args <- commandArgs(trailingOnly = TRUE) +output_dir <- if (length(args) >= 1) args[1] else "." 
+ +cat("=== T-289f: Prune-Reinsert Stage 5 — NNI Polish ===\n") +cat(sprintf("TreeSearch %s\n", packageVersion("TreeSearch"))) +cat(sprintf("Output: %s\n", output_dir)) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Dataset definitions ---- +neotrans_dir <- Sys.glob("/nobackup/*/neotrans/inst/matrices") +if (length(neotrans_dir) == 0) { + neotrans_dir <- file.path(dirname(dirname(dirname(getwd()))), + "neotrans", "inst", "matrices") +} +neotrans_dir <- neotrans_dir[1] +if (!dir.exists(neotrans_dir)) stop("neotrans matrices directory not found: ", neotrans_dir) + +mbank_path <- Sys.glob("/nobackup/*/TreeSearch-a/dev/benchmarks/mbank_X30754.nex") +if (length(mbank_path) == 0) { + mbank_path <- file.path(dirname(dirname(dirname(getwd()))), + "TreeSearch-a", "dev", "benchmarks", "mbank_X30754.nex") +} +mbank_path <- mbank_path[1] +if (!file.exists(mbank_path)) stop("mbank_X30754.nex not found") + +dataset_defs <- list( + list(key = "mbank_X30754", path = mbank_path), + list(key = "project4133", path = file.path(neotrans_dir, "project4133.nex")), + list(key = "project3701", path = file.path(neotrans_dir, "project3701.nex")), + list(key = "project804", path = file.path(neotrans_dir, "project804.nex")), + list(key = "syab07205", path = file.path(neotrans_dir, "syab07205.nex")) +) + +# ---- Config grid ---- +sc_baseline <- SearchControl( + pruneReinsertCycles = 0L +) +sc_pr_nni <- SearchControl( + pruneReinsertCycles = 5L, + pruneReinsertDrop = 0.05, + pruneReinsertSelection = 2L, # MISSING + pruneReinsertNni = TRUE # new: NNI polish instead of TBR +) +sc_pr_tbr <- SearchControl( + pruneReinsertCycles = 5L, + pruneReinsertDrop = 0.05, + pruneReinsertSelection = 2L, # MISSING + pruneReinsertNni = FALSE # Stage 4 reference: TBR full convergence +) + +configs <- list( + baseline = sc_baseline, + pr_nni = sc_pr_nni, + pr_tbr = sc_pr_tbr +) + +budgets <- c(60L, 120L) +seeds <- 1:10 + +# ---- Output ---- +out_file <- file.path(output_dir, 
"t289f_pr_nni_polish.csv") +out_cols <- c("dataset", "n_tips", "n_patterns", "config", "seed", "timeout_s", + "score", "n_trees", "replicates", "hits", "wall_s", + "pr_cycles", "pr_nni") +write(paste0('"', out_cols, '"', collapse = ","), out_file) # double-quoted CSV header; shQuote() is shell quoting and its style varies by platform + +total_runs <- length(dataset_defs) * length(configs) * length(budgets) * length(seeds) +cat(sprintf("Total runs: %d\n\n", total_runs)) +run_i <- 0L + +for (ddef in dataset_defs) { + cat(sprintf("--- Loading: %s ---\n", ddef$key)) + ds <- tryCatch(ReadAsPhyDat(ddef$path), error = function(e) { + cat(sprintf(" ERROR loading %s: %s\n", ddef$key, e$message)) + NULL + }) + if (is.null(ds)) next + n_tips <- length(ds) + n_patterns <- sum(attr(ds, "weight")) + cat(sprintf(" %d taxa, %d patterns\n\n", n_tips, n_patterns)) + + for (budget in budgets) { + for (cfg_name in names(configs)) { + sc <- configs[[cfg_name]] + for (seed in seeds) { + run_i <- run_i + 1L + cat(sprintf("[%d/%d] %s | %s | budget=%ds | seed=%d ... ", + run_i, total_runs, ddef$key, cfg_name, budget, seed)) + t0 <- proc.time()[["elapsed"]] + + res <- tryCatch( + MaximizeParsimony( + dataset = ds, + maxSeconds = budget, + nThreads = 2L, + seed = seed, + verbosity = 0L, + control = sc + ), + error = function(e) { + cat(sprintf("ERROR: %s\n", e$message)) + NULL + } + ) + + wall_s <- proc.time()[["elapsed"]] - t0 + if (is.null(res)) next + + score <- attr(res, "score") + n_trees <- length(res) + replicates <- attr(res, "replicates") + hits <- attr(res, "hits") + pr_cycles <- if (!is.null(sc$pruneReinsertCycles)) sc$pruneReinsertCycles else 0L + pr_nni_val <- if (!is.null(sc$pruneReinsertNni)) as.integer(sc$pruneReinsertNni) else 0L + + row <- sprintf('"%s",%d,%d,"%s",%d,%d,%g,%d,%d,%d,%.3f,%d,%d', # text fields double-quoted so read.csv() parses them with its default quote + ddef$key, n_tips, n_patterns, cfg_name, + seed, budget, score, n_trees, replicates, hits, wall_s, + pr_cycles, pr_nni_val) + write(row, out_file, append = TRUE) + cat(sprintf("score=%g reps=%d wall=%.1fs\n", score, replicates, wall_s)) + } + } + } +} + 
+cat(sprintf("\nDone. %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_profile_round2.R b/dev/benchmarks/bench_profile_round2.R new file mode 100644 index 000000000..f2570cea0 --- /dev/null +++ b/dev/benchmarks/bench_profile_round2.R @@ -0,0 +1,181 @@ +# Profiling round 2: Fresh baselines and detailed phase analysis +# Agent F, S-PROF, 2026-03-17 +# +# Run via: Rscript -e "library(TreeSearch, lib.loc='.agent-f'); source('dev/benchmarks/bench_profile_round2.R')" + +library(TreeSearch, lib.loc = ".agent-f") +library(TreeTools) + +# Representative datasets spanning the size range +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# ---- Section 1: End-to-end with timings attribute ---- + +cat("=== Section 1: End-to-end with per-phase timings ===\n\n") + +for (nm in DATASETS) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips) ---\n", nm, ds$n_taxa)) + + # 3 runs, take medians + timings_list <- list() + wall_times <- numeric(3) + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 5L, + targetHits = 3L, + ratchetCycles = 5L, + driftCycles = 5L, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 60, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + wall_times[run] <- elapsed + timings_list[[run]] <- result$timings + cat(sprintf(" Run %d: %.3fs wall, score=%.0f, reps=%d\n", + run, elapsed, result$best_score, result$replicates)) + } + + # Median wall time + med_wall <- median(wall_times) + # 
Median per-phase (element-wise) + med_timings <- sapply(names(timings_list[[1]]), function(ph) { + median(sapply(timings_list, function(t) t[[ph]])) + }) + cpp_total <- sum(med_timings) + r_overhead <- med_wall * 1000 - cpp_total + + cat(sprintf("\n Median wall: %.3fs\n", med_wall)) + cat(" Per-phase (median ms):\n") + for (ph in names(med_timings)) { + pct <- if (cpp_total > 0) 100 * med_timings[[ph]] / cpp_total else 0 + cat(sprintf(" %-12s %8.1f ms (%4.1f%%)\n", ph, med_timings[[ph]], pct)) + } + cat(sprintf(" %-12s %8.1f ms (C++ total)\n", "TOTAL", cpp_total)) + cat(sprintf(" %-12s %8.1f ms (R overhead: %.1f%% of wall)\n\n", + "R overhead", r_overhead, 100 * r_overhead / (med_wall * 1000))) +} + +# ---- Section 2: IW comparison ---- + +cat("=== Section 2: IW mode comparison ===\n\n") + +for (nm in c("Vinther2008", "Zhu2013")) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips, IW k=10) ---\n", nm, ds$n_taxa)) + + wall_times <- numeric(3) + timings_list <- list() + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + concavity = 10.0, + maxReplicates = 5L, + targetHits = 3L, + ratchetCycles = 5L, + driftCycles = 5L, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 60, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + wall_times[run] <- elapsed + timings_list[[run]] <- result$timings + cat(sprintf(" Run %d: %.3fs wall, score=%.2f, reps=%d\n", + run, elapsed, result$best_score, result$replicates)) + } + + med_wall <- median(wall_times) + med_timings <- sapply(names(timings_list[[1]]), function(ph) { + median(sapply(timings_list, function(t) t[[ph]])) + }) + cpp_total <- sum(med_timings) + + cat(sprintf("\n Median wall: %.3fs\n", med_wall)) + cat(" Per-phase (median ms):\n") + for (ph in names(med_timings)) { + pct <- if (cpp_total > 0) 100 * 
med_timings[[ph]] / cpp_total else 0 + cat(sprintf(" %-12s %8.1f ms (%4.1f%%)\n", ph, med_timings[[ph]], pct)) + } + cat(sprintf(" %-12s %8.1f ms (C++ total)\n\n", "TOTAL", cpp_total)) +} + +# ---- Section 3: Scaling test ---- + +cat("=== Section 3: Scaling — single TBR pass timing ===\n\n") + +for (nm in DATASETS) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips) ---\n", nm, ds$n_taxa)) + + # Single replicate, no sectorial/ratchet/drift — just Wagner+TBR + wall_times <- numeric(3) + timings_list <- list() + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 1L, + targetHits = 1L, + ratchetCycles = 0L, + driftCycles = 0L, + xssRounds = 0L, + rssRounds = 0L, + cssRounds = 0L, + fuseInterval = 0L, + maxSeconds = 60, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + wall_times[run] <- elapsed + timings_list[[run]] <- result$timings + } + + med_wall <- median(wall_times) + med_timings <- sapply(names(timings_list[[1]]), function(ph) { + median(sapply(timings_list, function(t) t[[ph]])) + }) + + cat(sprintf(" Wagner: %6.1f ms\n", med_timings[["wagner_ms"]])) # NOTE(review): "_ms"-suffixed keys as read in bench_profile_round2b.R/round2c.R; confirm against ts_driven_search() + cat(sprintf(" TBR: %6.1f ms\n", med_timings[["tbr_ms"]])) # an absent name would make [[ ]] stop with "subscript out of bounds" + cat(sprintf(" Wall: %6.1f ms\n", med_wall * 1000)) + cat(sprintf(" R ovhd: %6.1f ms\n\n", med_wall * 1000 - sum(med_timings))) +} + +cat("=== Profiling complete ===\n") diff --git a/dev/benchmarks/bench_profile_round2b.R b/dev/benchmarks/bench_profile_round2b.R new file mode 100644 index 000000000..97891e4fc --- /dev/null +++ b/dev/benchmarks/bench_profile_round2b.R @@ -0,0 +1,203 @@ +# Profiling round 2b: Drift/ratchet deep dive + scaling +# Agent F, S-PROF, 2026-03-17 + +library(TreeSearch, lib.loc = ".agent-f") +library(TreeTools) + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = 
FALSE), + nrow = length(ds), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# ---- Section 3: Drift cycle count sensitivity ---- + +cat("=== Section 3: Drift cycle count sensitivity ===\n\n") + +# How does drift time scale with cycle count? +# The question: are we doing too many drift cycles for the benefit? + +for (nm in c("Zhu2013", "Dikow2009")) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips) ---\n", nm, ds$n_taxa)) + cat(sprintf(" %-8s %8s %8s %8s %8s\n", "dCycles", "drift_ms", "total_ms", "score", "reps")) + + for (dc in c(0L, 1L, 2L, 3L, 5L, 10L)) { + scores <- numeric(3) + drift_ms <- numeric(3) + total_ms <- numeric(3) + reps <- numeric(3) + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 3L, + targetHits = 3L, + ratchetCycles = 5L, + driftCycles = dc, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 120, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + scores[run] <- result$best_score + total_ms[run] <- elapsed * 1000 + drift_ms[run] <- result$timings[["drift_ms"]] + reps[run] <- result$replicates + } + + cat(sprintf(" %-8d %8.0f %8.0f %8.0f %8.0f\n", + dc, median(drift_ms), median(total_ms), + median(scores), median(reps))) + } + cat("\n") +} + +# ---- Section 4: Ratchet cycle count sensitivity ---- + +cat("=== Section 4: Ratchet cycle count sensitivity ===\n\n") + +for (nm in c("Zhu2013", "Dikow2009")) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips) ---\n", nm, ds$n_taxa)) + cat(sprintf(" %-8s %8s %8s %8s %8s\n", "rCycles", "ratch_ms", "total_ms", "score", "reps")) + + for (rc in c(0L, 1L, 2L, 3L, 5L, 10L)) { + scores <- numeric(3) + ratch_ms <- numeric(3) + total_ms <- numeric(3) + reps <- numeric(3) + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() 
+ result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 3L, + targetHits = 3L, + ratchetCycles = rc, + driftCycles = 5L, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 120, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + scores[run] <- result$best_score + total_ms[run] <- elapsed * 1000 + ratch_ms[run] <- result$timings[["ratchet_ms"]] + reps[run] <- result$replicates + } + + cat(sprintf(" %-8d %8.0f %8.0f %8.0f %8.0f\n", + rc, median(ratch_ms), median(total_ms), + median(scores), median(reps))) + } + cat("\n") +} + +# ---- Section 5: CSS effectiveness ---- + +cat("=== Section 5: CSS vs no CSS ===\n\n") + +for (nm in c("Zhu2013", "Dikow2009")) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips) ---\n", nm, ds$n_taxa)) + + for (css in c(0L, 1L, 2L)) { + scores <- numeric(3) + css_ms <- numeric(3) + total_ms <- numeric(3) + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 3L, + targetHits = 3L, + ratchetCycles = 5L, + driftCycles = 5L, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = css, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 120, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + scores[run] <- result$best_score + css_ms[run] <- result$timings[["css_ms"]] + total_ms[run] <- elapsed * 1000 + } + + cat(sprintf(" cssRounds=%d: css_ms=%6.0f total_ms=%6.0f score=%6.0f\n", + css, median(css_ms), median(total_ms), median(scores))) + } + cat("\n") +} + +# ---- Section 6: Wagner + TBR-only (no perturbation) ---- + +cat("=== Section 6: Wagner + TBR only (scaling) ===\n\n") + +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") + +for (nm in DATASETS) { + ds <- prepare(nm) + wall_times <- numeric(5) + 
tbr_ms <- numeric(5) + wagner_ms <- numeric(5) + + for (run in 1:5) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 1L, + targetHits = 1L, + ratchetCycles = 0L, + driftCycles = 0L, + xssRounds = 0L, + rssRounds = 0L, + cssRounds = 0L, + fuseInterval = 0L, + maxSeconds = 60, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + wall_times[run] <- elapsed + tbr_ms[run] <- result$timings[["tbr_ms"]] + wagner_ms[run] <- result$timings[["wagner_ms"]] + } + + cat(sprintf(" %s (%2d tips): Wagner=%5.1f ms, TBR=%7.1f ms, Wall=%7.1f ms\n", + nm, ds$n_taxa, + median(wagner_ms), median(tbr_ms), median(wall_times) * 1000)) +} + +cat("\n=== Profiling complete ===\n") diff --git a/dev/benchmarks/bench_profile_round2c.R b/dev/benchmarks/bench_profile_round2c.R new file mode 100644 index 000000000..8ad786920 --- /dev/null +++ b/dev/benchmarks/bench_profile_round2c.R @@ -0,0 +1,179 @@ +# Profiling round 2c: Parallel scaling + quality impact of reduced cycles +# Agent F, S-PROF, 2026-03-17 + +library(TreeSearch, lib.loc = ".agent-f") +library(TreeTools) + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# ---- Section 7: Quality impact with more statistical power ---- + +cat("=== Section 7: Drift/ratchet tuning — quality impact (10 seeds) ===\n\n") + +run_config <- function(ds, drift, ratchet, seed) { + set.seed(seed) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 5L, + targetHits = 3L, + ratchetCycles = ratchet, + driftCycles = drift, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 
3L, + fuseInterval = 5L, + maxSeconds = 120, + verbosity = 0L, + nThreads = 1L + ) + elapsed <- (proc.time() - t0)[3] + c(score = unname(result$best_score), time = unname(elapsed), reps = unname(result$replicates)) +} + +configs <- list( + "d5_r5" = c(drift = 5, ratchet = 5), # current default + "d2_r2" = c(drift = 2, ratchet = 2), # reduced + "d2_r5" = c(drift = 2, ratchet = 5), # drift only reduced + "d5_r2" = c(drift = 5, ratchet = 2), # ratchet only reduced + "d0_r5" = c(drift = 0, ratchet = 5), # no drift + "d5_r0" = c(drift = 5, ratchet = 0) # no ratchet +) + +seeds <- 7301:7310 + +for (nm in c("Zhu2013", "Dikow2009")) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips, 10 seeds) ---\n", nm, ds$n_taxa)) + cat(sprintf(" %-8s %8s %8s %8s %8s %8s\n", + "config", "med_scr", "mean_scr", "min_scr", "med_time", "mean_t")) + + for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + sc <- numeric(length(seeds)) + tm <- numeric(length(seeds)) + for (i in seq_along(seeds)) { + r <- run_config(ds, cfg[["drift"]], cfg[["ratchet"]], seeds[i]) + sc[i] <- r[["score"]] + tm[i] <- r[["time"]] + } + + cat(sprintf(" %-8s %8.0f %8.1f %8.0f %8.1f %8.1f\n", + cfg_name, median(sc), mean(sc), min(sc), + median(tm), mean(tm))) + } + cat("\n") +} + +# ---- Section 8: Parallel scaling ---- + +cat("=== Section 8: Parallel scaling ===\n\n") + +for (nm in c("Zhu2013")) { + ds <- prepare(nm) + cat(sprintf("--- %s (%d tips) ---\n", nm, ds$n_taxa)) + cat(sprintf(" %-10s %8s %8s %8s\n", "nThreads", "time_ms", "score", "reps")) + + for (nt in c(1L, 2L)) { + times <- numeric(3) + scores <- numeric(3) + reps <- numeric(3) + + for (run in 1:3) { + set.seed(7300 + run) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 5L, + targetHits = 5L, + ratchetCycles = 5L, + driftCycles = 5L, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + 
maxSeconds = 120, + verbosity = 0L, + nThreads = nt + ) + elapsed <- (proc.time() - t0)[3] + times[run] <- elapsed + scores[run] <- result$best_score + reps[run] <- result$replicates + } + + cat(sprintf(" %-10d %8.0f %8.0f %8.0f\n", + nt, median(times) * 1000, median(scores), median(reps))) + } + cat("\n") +} + +# ---- Section 9: Per-replicate cost breakdown ---- + +cat("=== Section 9: Per-replicate cost (ms/rep) ===\n\n") + +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") +cat(sprintf(" %-15s %4s %8s %8s %8s %8s %8s %8s %8s\n", + "dataset", "tips", "wagner", "tbr", "sect", "ratch", "drift", "fTBR", "TOTAL")) + +for (nm in DATASETS) { + ds <- prepare(nm) + + all_timings <- list() + all_reps <- numeric(3) + + for (run in 1:3) { + set.seed(7300 + run) + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 5L, + targetHits = 3L, + ratchetCycles = 5L, + driftCycles = 5L, + xssRounds = 1L, + rssRounds = 1L, + cssRounds = 1L, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 120, + verbosity = 0L, + nThreads = 1L + ) + all_timings[[run]] <- result$timings + all_reps[run] <- result$replicates + } + + med_reps <- median(all_reps) + med_t <- sapply(names(all_timings[[1]]), function(ph) { + median(sapply(all_timings, function(t) t[[ph]])) + }) + + # Per-rep average + pr <- med_t / med_reps + sect <- pr[["xss_ms"]] + pr[["rss_ms"]] + pr[["css_ms"]] + total <- sum(pr) + + cat(sprintf(" %-15s %4d %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f %8.1f\n", + nm, ds$n_taxa, + pr[["wagner_ms"]], pr[["tbr_ms"]], sect, + pr[["ratchet_ms"]], pr[["drift_ms"]], + pr[["final_tbr_ms"]], total)) +} + +cat("\n=== Profiling complete ===\n") diff --git a/dev/benchmarks/bench_prune_reinsert.R b/dev/benchmarks/bench_prune_reinsert.R new file mode 100644 index 000000000..a88a83d90 --- /dev/null +++ b/dev/benchmarks/bench_prune_reinsert.R @@ -0,0 +1,297 @@ +#!/usr/bin/env Rscript +# T-289: 
Prune-reinsert perturbation benchmark +# +# DESIGNED FOR HAMILTON HPC. Do not run locally (hours of wall time). +# +# Evaluates taxon prune-reinsert (T-266) as perturbation strategy: +# - Does it improve scores vs baseline (no prune-reinsert)? +# - Optimal cycle count (1, 3, 5)? +# - Optimal drop fraction (0.05, 0.10, 0.20, 0.30)? +# - RANDOM vs INSTABILITY tip selection? +# - Complement or substitute for ratchet? +# - Scaling with dataset size (37t → 180t)? +# +# Design: Two-stage grid. +# Stage 1 ("sweep"): coarse grid on 5 datasets, 5 seeds, 30s budget. +# Identifies best cycle count and drop fraction. +# Stage 2 ("confirm"): best configs + baseline on 5 datasets, +# 5 seeds, 30s + 60s budgets. Also tests INSTABILITY selection +# and ratchet-replacement. +# +# Usage: +# Rscript bench_prune_reinsert.R [stage] [timeout_s] [output_dir] +# stage: 1 (sweep) or 2 (confirm). Default: 1 +# timeout_s: search budget in seconds. Default: 30 +# output_dir: where to write CSV results. Default: "." +# +# Output: t289_stage{1,2}_{timeout}s.csv + +library(TreeSearch) +library(TreeTools) + +args <- commandArgs(trailingOnly = TRUE) +stage <- if (length(args) >= 1) as.integer(args[1]) else 1L +timeout_s <- if (length(args) >= 2) as.integer(args[2]) else 30L +output_dir <- if (length(args) >= 3) args[3] else "." + +cat("=== T-289: Prune-Reinsert Benchmark ===\n") +cat(sprintf("Stage: %d, Timeout: %ds\n", stage, timeout_s)) +cat(sprintf("TreeSearch version: %s\n", packageVersion("TreeSearch"))) +cat(sprintf("Output dir: %s\n", output_dir)) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Datasets ---- +# 5 datasets spanning tip-count range, chosen for enough landscape +# difficulty that perturbation quality matters. 
+bench_names <- c( + "Wortley2006", # 37 tips — small, gap dataset + "Agnarsson2004", # 62 tips — medium + "Zhu2013", # 75 tips — hard, high missing + "Dikow2009" # 88 tips — largest standard +) + +# Convert inapplicable to missing for EW Fitch scoring (match TNT) +fitch_mode <- function(dataset) { + contrast <- attr(dataset, "contrast") + levels <- attr(dataset, "levels") + inapp_col <- match("-", levels) + if (is.na(inapp_col)) return(dataset) + for (i in seq_len(nrow(contrast))) { + if (contrast[i, inapp_col] == 1 && sum(contrast[i, ]) == 1) { + contrast[i, ] <- 1 + } + } + attr(dataset, "contrast") <- contrast + dataset +} + +datasets <- lapply( + setNames(bench_names, bench_names), + function(nm) fitch_mode(inapplicable.phyData[[nm]]) +) + +# Also load 180-tip dataset if available +mbank_path <- file.path(dirname(dirname(getwd())), + "TreeSearch-a", "dev", "benchmarks", + "mbank_X30754.nex") +if (!file.exists(mbank_path)) { + mbank_path <- Sys.glob("/nobackup/*/TreeSearch-a/dev/benchmarks/mbank_X30754.nex") + if (length(mbank_path) > 0) mbank_path <- mbank_path[1] +} +if (length(mbank_path) == 1 && file.exists(mbank_path)) { + cat("Loading 180-tip dataset from:", mbank_path, "\n") + mbank <- fitch_mode(ReadAsPhyDat(mbank_path)) + datasets[["mbank_X30754"]] <- mbank + bench_names <- c(bench_names, "mbank_X30754") +} else { + cat("180-tip dataset not found; skipping mbank_X30754\n") +} + +# TNT reference scores (EW Fitch mode) +tnt_best <- c( + Wortley2006 = 479, Agnarsson2004 = 718, + Zhu2013 = 624, Dikow2009 = 1603, + mbank_X30754 = 1164 +) + +seeds <- 1:5 + +# ---- Baseline control (current auto preset, no prune-reinsert) ---- +make_baseline <- function() { + # No prune-reinsert; everything else at default + SearchControl( + pruneReinsertCycles = 0L, + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L + ) +} + +# ---- Build config grid ---- +build_configs <- function(stage) { + cfgs <- list() + + # Baseline: no prune-reinsert + 
cfgs[["baseline"]] <- list( + label = "baseline", + desc = "No prune-reinsert (auto preset)", + control = NULL # use strategy = "auto" + ) + + if (stage == 1L) { + # Stage 1: sweep cycles × drop_fraction, RANDOM selection only + for (cyc in c(1L, 3L, 5L)) { + for (drop in c(0.05, 0.10, 0.20, 0.30)) { + tag <- sprintf("pr_c%d_d%02d", cyc, round(drop * 100)) + cfgs[[tag]] <- list( + label = tag, + desc = sprintf("PR: %d cycles, %.0f%% drop, random", + cyc, drop * 100), + pr_cycles = cyc, + pr_drop = drop, + pr_selection = 0L # RANDOM + ) + } + } + } else { + # Stage 2: best configs from stage 1 + INSTABILITY + ratchet-replacement + # (Placeholder — human fills in best configs after stage 1 analysis) + # Default stage 2 tests a reasonable spread: + for (cyc in c(1L, 3L)) { + for (drop in c(0.10, 0.20)) { + for (sel in c(0L, 1L)) { + sel_tag <- if (sel == 0L) "rand" else "inst" + tag <- sprintf("pr_c%d_d%02d_%s", cyc, round(drop * 100), sel_tag) + cfgs[[tag]] <- list( + label = tag, + desc = sprintf("PR: %d cycles, %.0f%% drop, %s", + cyc, drop * 100, sel_tag), + pr_cycles = cyc, + pr_drop = drop, + pr_selection = sel + ) + } + } + } + + # Ratchet-replacement: prune-reinsert WITH reduced ratchet + for (cyc in c(3L, 5L)) { + tag <- sprintf("pr_c%d_d10_noratch", cyc) + cfgs[[tag]] <- list( + label = tag, + desc = sprintf("PR: %d cycles, 10%% drop, ratchet halved", cyc), + pr_cycles = cyc, + pr_drop = 0.10, + pr_selection = 0L, + ratchet_override = TRUE # signal to halve ratchet cycles + ) + } + } + + cfgs +} + +configs <- build_configs(stage) +total_runs <- length(configs) * length(datasets) * length(seeds) +cat(sprintf("Configs: %d, Datasets: %d, Seeds: %d -> %d total runs\n\n", + length(configs), length(datasets), length(seeds), total_runs)) + +# ---- Run experiments ---- +results <- data.frame( + dataset = character(), n_tips = integer(), n_patterns = integer(), + config = character(), seed = integer(), timeout_s = integer(), + score = numeric(), n_trees = integer(), 
replicates = integer(), + hits = integer(), wall_s = numeric(), + pr_cycles = integer(), pr_drop = numeric(), pr_selection = integer(), + ratchet_halved = logical(), + tnt_best = numeric(), gap = numeric(), + stringsAsFactors = FALSE +) + +run_idx <- 0L +for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + cat(sprintf("\n--- Config: %s (%s) ---\n", cfg$label, cfg$desc)) + + for (ds_name in bench_names) { + ds <- datasets[[ds_name]] + ntip <- length(ds) + npat <- sum(attr(ds, "weight")) + + for (s in seeds) { + run_idx <- run_idx + 1L + cat(sprintf(" [%d/%d] %s / %s / seed=%d ... ", + run_idx, total_runs, ds_name, cfg$label, s)) + + set.seed(s) + t0 <- proc.time() + + tryCatch({ + if (cfg_name == "baseline") { + res <- MaximizeParsimony( + ds, + maxSeconds = timeout_s, + strategy = "auto", + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + } else { + # Pass prune-reinsert params as ... args so the auto preset still + # governs everything else (ratchetCycles, wagnerStarts, etc.). + # Using control = SearchControl(...) marks ALL fields as explicit, + # which discards the preset — see MaximizeParsimony.R lines 532-543. 
+ extra_args <- list( + ds, + maxSeconds = timeout_s, + strategy = "auto", + pruneReinsertCycles = cfg$pr_cycles, + pruneReinsertDrop = cfg$pr_drop, + pruneReinsertSelection = cfg$pr_selection, + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + + # Ratchet-replacement mode: halve ratchet cycles + if (isTRUE(cfg$ratchet_override)) { + extra_args$ratchetCycles <- 6L # halved from preset default 12 + } + + res <- do.call(MaximizeParsimony, extra_args) + } + + elapsed <- (proc.time() - t0)[3] + best_score <- attr(res, "score") + n_trees <- length(res) + reps <- attr(res, "replicates") + hits <- attr(res, "hits") + tnt_ref <- tnt_best[ds_name] + gap <- if (!is.na(tnt_ref)) best_score - tnt_ref else NA_real_ + + cat(sprintf("score=%g, gap=%s, reps=%d, %.1fs\n", + best_score, + if (is.na(gap)) "?" else sprintf("%+d", gap), + reps, elapsed)) + + results <- rbind(results, data.frame( + dataset = ds_name, n_tips = ntip, n_patterns = npat, + config = cfg$label, seed = s, timeout_s = timeout_s, + score = best_score, n_trees = n_trees, replicates = reps, + hits = hits, wall_s = elapsed, + pr_cycles = if (is.null(cfg$pr_cycles)) 0L else cfg$pr_cycles, + pr_drop = if (is.null(cfg$pr_drop)) 0 else cfg$pr_drop, + pr_selection = if (is.null(cfg$pr_selection)) NA_integer_ + else cfg$pr_selection, + ratchet_halved = isTRUE(cfg$ratchet_override), + tnt_best = tnt_ref, gap = gap, + stringsAsFactors = FALSE + )) + }, error = function(e) { + cat(sprintf("ERROR: %s\n", conditionMessage(e))) + }) + } + } +} + +# ---- Save results ---- +outfile <- file.path( + output_dir, + sprintf("t289_stage%d_%ds.csv", stage, timeout_s) +) +write.csv(results, outfile, row.names = FALSE) +cat(sprintf("\n=== Results written to %s (%d rows) ===\n", + outfile, nrow(results))) + +# ---- Quick summary ---- +cat("\n--- Median scores by config × dataset ---\n") +agg <- aggregate(score ~ config + dataset, data = results, FUN = median) +agg_wide <- 
reshape(agg, direction = "wide", idvar = "config", + timevar = "dataset", v.names = "score") +print(agg_wide, row.names = FALSE) + +cat(sprintf("\nCompleted: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_prune_reinsert_brazeau.R b/dev/benchmarks/bench_prune_reinsert_brazeau.R new file mode 100644 index 000000000..de365312c --- /dev/null +++ b/dev/benchmarks/bench_prune_reinsert_brazeau.R @@ -0,0 +1,250 @@ +#!/usr/bin/env Rscript +# T-289b: Prune-reinsert perturbation benchmark — Brazeau (default) scoring +# +# DESIGNED FOR HAMILTON HPC. Do not run locally (hours of wall time). +# +# Parallel companion to bench_prune_reinsert.R (Fitch/EW mode). +# Uses TreeSearch's default Brazeau et al. (2019) inapplicable algorithm, +# which is what package users actually experience. +# +# Comparison with TNT absolute scores is NOT valid here (different algorithms). +# Comparisons are: PR config vs baseline, both in Brazeau mode. +# +# Design: same two-stage grid as the Fitch companion. +# Stage 1 ("sweep"): coarse grid on 5 datasets, 5 seeds, 30s budget. +# Stage 2 ("confirm"): best configs from Stage 1, 30s + 60s budgets. +# +# Usage: +# Rscript bench_prune_reinsert_brazeau.R [stage] [timeout_s] [output_dir] +# stage: 1 (sweep) or 2 (confirm). Default: 1 +# timeout_s: search budget in seconds. Default: 30 +# output_dir: where to write CSV results. Default: "." +# +# Output: t289b_stage{1,2}_{timeout}s.csv ("b" = Brazeau mode) + +library(TreeSearch) +library(TreeTools) + +args <- commandArgs(trailingOnly = TRUE) +stage <- if (length(args) >= 1) as.integer(args[1]) else 1L +timeout_s <- if (length(args) >= 2) as.integer(args[2]) else 30L +output_dir <- if (length(args) >= 3) args[3] else "." 
+ +cat("=== T-289b: Prune-Reinsert Benchmark (Brazeau scoring) ===\n") +cat(sprintf("Stage: %d, Timeout: %ds\n", stage, timeout_s)) +cat(sprintf("TreeSearch version: %s\n", packageVersion("TreeSearch"))) +cat(sprintf("Output dir: %s\n", output_dir)) +cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) + +# ---- Datasets (NO fitch_mode conversion; default Brazeau scoring) ---- +bench_names <- c( + "Wortley2006", # 37 tips — small, gap dataset + "Agnarsson2004", # 62 tips — medium + "Zhu2013", # 75 tips — hard, high missing + "Dikow2009" # 88 tips — largest standard +) + +datasets <- lapply( + setNames(bench_names, bench_names), + function(nm) inapplicable.phyData[[nm]] +) + +# Also load 180-tip dataset if available +mbank_path <- file.path(dirname(dirname(getwd())), + "TreeSearch-a", "dev", "benchmarks", + "mbank_X30754.nex") +if (!file.exists(mbank_path)) { + mbank_path <- Sys.glob("/nobackup/*/TreeSearch-a/dev/benchmarks/mbank_X30754.nex") + if (length(mbank_path) > 0) mbank_path <- mbank_path[1] +} +if (length(mbank_path) == 1 && file.exists(mbank_path)) { + cat("Loading 180-tip dataset from:", mbank_path, "\n") + datasets[["mbank_X30754"]] <- ReadAsPhyDat(mbank_path) + bench_names <- c(bench_names, "mbank_X30754") +} else { + cat("180-tip dataset not found; skipping mbank_X30754\n") +} + +seeds <- 1:5 + +# ---- Build config grid (identical to Fitch companion) ---- +build_configs <- function(stage) { + cfgs <- list() + + cfgs[["baseline"]] <- list( + label = "baseline", + desc = "No prune-reinsert (auto preset)", + control = NULL + ) + + if (stage == 1L) { + for (cyc in c(1L, 3L, 5L)) { + for (drop in c(0.05, 0.10, 0.20, 0.30)) { + tag <- sprintf("pr_c%d_d%02d", cyc, round(drop * 100)) + cfgs[[tag]] <- list( + label = tag, + desc = sprintf("PR: %d cycles, %.0f%% drop, random", + cyc, drop * 100), + pr_cycles = cyc, + pr_drop = drop, + pr_selection = 0L + ) + } + } + } else { + for (cyc in c(1L, 3L)) { + for (drop in c(0.10, 0.20)) { + for 
(sel in c(0L, 1L)) { + sel_tag <- if (sel == 0L) "rand" else "inst" + tag <- sprintf("pr_c%d_d%02d_%s", cyc, round(drop * 100), sel_tag) + cfgs[[tag]] <- list( + label = tag, + desc = sprintf("PR: %d cycles, %.0f%% drop, %s", + cyc, drop * 100, sel_tag), + pr_cycles = cyc, + pr_drop = drop, + pr_selection = sel + ) + } + } + } + + for (cyc in c(3L, 5L)) { + tag <- sprintf("pr_c%d_d10_noratch", cyc) + cfgs[[tag]] <- list( + label = tag, + desc = sprintf("PR: %d cycles, 10%% drop, ratchet halved", cyc), + pr_cycles = cyc, + pr_drop = 0.10, + pr_selection = 0L, + ratchet_override = TRUE + ) + } + } + + cfgs +} + +configs <- build_configs(stage) +total_runs <- length(configs) * length(datasets) * length(seeds) +cat(sprintf("Configs: %d, Datasets: %d, Seeds: %d -> %d total runs\n\n", + length(configs), length(datasets), length(seeds), total_runs)) + +# ---- Run experiments ---- +results <- data.frame( + dataset = character(), n_tips = integer(), n_patterns = integer(), + config = character(), seed = integer(), timeout_s = integer(), + score = numeric(), n_trees = integer(), replicates = integer(), + hits = integer(), wall_s = numeric(), + pr_cycles = integer(), pr_drop = numeric(), pr_selection = integer(), + ratchet_halved = logical(), + stringsAsFactors = FALSE +) + +run_idx <- 0L +for (cfg_name in names(configs)) { + cfg <- configs[[cfg_name]] + cat(sprintf("\n--- Config: %s (%s) ---\n", cfg$label, cfg$desc)) + + for (ds_name in bench_names) { + ds <- datasets[[ds_name]] + ntip <- length(ds) + npat <- sum(attr(ds, "weight")) + + for (s in seeds) { + run_idx <- run_idx + 1L + cat(sprintf(" [%d/%d] %s / %s / seed=%d ... 
", + run_idx, total_runs, ds_name, cfg$label, s)) + + set.seed(s) + t0 <- proc.time() + + tryCatch({ + if (cfg_name == "baseline") { + res <- MaximizeParsimony( + ds, + maxSeconds = timeout_s, + strategy = "auto", + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + } else { + extra_args <- list( + ds, + maxSeconds = timeout_s, + strategy = "auto", + pruneReinsertCycles = cfg$pr_cycles, + pruneReinsertDrop = cfg$pr_drop, + pruneReinsertSelection = cfg$pr_selection, + consensusStableReps = 0L, + nniPerturbCycles = 0L, + driftCycles = 0L, + verbosity = 0L, + nThreads = 1L + ) + + if (isTRUE(cfg$ratchet_override)) { + extra_args$ratchetCycles <- 6L + } + + res <- do.call(MaximizeParsimony, extra_args) + } + + elapsed <- (proc.time() - t0)[3] + best_score <- attr(res, "score") + n_trees <- length(res) + reps <- attr(res, "replicates") + hits <- attr(res, "hits") + + cat(sprintf("score=%g, reps=%d, %.1fs\n", best_score, reps, elapsed)) + + results <- rbind(results, data.frame( + dataset = ds_name, n_tips = ntip, n_patterns = npat, + config = cfg$label, seed = s, timeout_s = timeout_s, + score = best_score, n_trees = n_trees, replicates = reps, + hits = hits, wall_s = elapsed, + pr_cycles = if (is.null(cfg$pr_cycles)) 0L else cfg$pr_cycles, + pr_drop = if (is.null(cfg$pr_drop)) 0 else cfg$pr_drop, + pr_selection = if (is.null(cfg$pr_selection)) NA_integer_ + else cfg$pr_selection, + ratchet_halved = isTRUE(cfg$ratchet_override), + stringsAsFactors = FALSE + )) + }, error = function(e) { + cat(sprintf("ERROR: %s\n", conditionMessage(e))) + }) + } + } +} + +# ---- Save results ---- +outfile <- file.path( + output_dir, + sprintf("t289b_stage%d_%ds.csv", stage, timeout_s) +) +write.csv(results, outfile, row.names = FALSE) +cat(sprintf("\n=== Results written to %s (%d rows) ===\n", + outfile, nrow(results))) + +# ---- Quick summary ---- +cat("\n--- Median scores by config × dataset ---\n") +agg <- aggregate(score ~ 
config + dataset, data = results, FUN = median) +agg_wide <- reshape(agg, direction = "wide", idvar = "config", + timevar = "dataset", v.names = "score") +print(agg_wide, row.names = FALSE) + +# ---- Delta vs baseline ---- +cat("\n--- Median delta vs baseline (negative = improvement) ---\n") +bl <- agg[agg$config == "baseline", c("dataset", "score")] +names(bl)[2] <- "baseline_score" +agg2 <- merge(agg[agg$config != "baseline", ], bl, by = "dataset") +agg2$delta <- agg2$score - agg2$baseline_score +delta_wide <- reshape(agg2[, c("config", "dataset", "delta")], + direction = "wide", idvar = "config", + timevar = "dataset", v.names = "delta") +print(delta_wide, row.names = FALSE) + +cat(sprintf("\nCompleted: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z"))) diff --git a/dev/benchmarks/bench_regression.R b/dev/benchmarks/bench_regression.R new file mode 100644 index 000000000..93bf803bc --- /dev/null +++ b/dev/benchmarks/bench_regression.R @@ -0,0 +1,216 @@ +#!/usr/bin/env Rscript +# Performance regression benchmark for TreeSearch C++ engine. +# Run after every significant code change to catch quality or speed regressions. +# +# Usage: +# Rscript dev/benchmarks/bench_regression.R [lib_path] +# Rscript dev/benchmarks/bench_regression.R --datasets=Vinther2008,Zhu2013 --budget=30 +# Rscript dev/benchmarks/bench_regression.R --datasets=all --budget=20 --output=results.csv +# +# Arguments (positional, legacy): +# lib_path Library path for TreeSearch (default: auto-detect) +# +# Arguments (named): +# --lib=PATH Library path (overrides positional) +# --datasets=NAMES Comma-separated dataset names, or "all" (default: core 3) +# --budget=SECS Per-dataset time budget in seconds (default: 30) +# --output=FILE Write CSV results to FILE (in addition to stdout) +# --threads=N Number of threads (default: 1) +# +# Each benchmark runs in its own subprocess to isolate any crashes. +# +# Asserts: +# 1. Score quality: each dataset must reach its max allowed score. +# 2. 
Timing: no dataset should take more than 3x its reference time.
+#
+# Exit code 0 = pass, 1 = regression detected.
+
+# --- Parse arguments ---
+args <- commandArgs(trailingOnly = TRUE)
+
+named_args <- list()  # "--key=value" flags keyed by name; a bare "--key" is stored as "true"
+positional_args <- character(0)  # every argument that does not start with "--"
+for (arg in args) {
+  if (grepl("^--", arg)) {
+    parts <- strsplit(sub("^--", "", arg), "=", fixed = TRUE)[[1]]
+    named_args[[parts[1]]] <- if (length(parts) > 1) paste(parts[-1], collapse = "=") else "true"  # re-join so values containing "=" survive
+  } else {
+    positional_args <- c(positional_args, arg)
+  }
+}
+
+`%||%` <- function(a, b) if (is.null(a)) b else a  # null-coalescing default: b only when a is NULL
+
+lib_path <- named_args[["lib"]] %||%
+  (if (length(positional_args)) positional_args[1] else NULL)  # --lib wins over the legacy positional form
+budget <- as.numeric(named_args[["budget"]] %||% "30")
+output_file <- named_args[["output"]]  # NULL when no CSV output was requested
+n_threads <- as.integer(named_args[["threads"]] %||% "1")
+dataset_arg <- named_args[["datasets"]]  # NULL selects the default core datasets
+
+# --- Reference data ---
+# Max scores are ~1-2% above optimal to allow for stochastic variation.
+# ref_time_s is the expected time at budget=30s with 1 thread.
+all_benchmarks <- list( + Vinther2008 = list(n_tip = 23, max_score = 80, ref_time_s = 1.0), + Agnarsson2004 = list(n_tip = 62, max_score = 785, ref_time_s = 5.0), + Zhu2013 = list(n_tip = 75, max_score = 662, ref_time_s = 8.0), + Longrich2010 = list(n_tip = 20, max_score = 132, ref_time_s = 0.5), + Sansom2010 = list(n_tip = 23, max_score = 190, ref_time_s = 0.8), + DeAssis2011 = list(n_tip = 33, max_score = 66, ref_time_s = 1.0), + Aria2015 = list(n_tip = 35, max_score = 145, ref_time_s = 1.5), + Wortley2006 = list(n_tip = 37, max_score = 500, ref_time_s = 2.0), + Griswold1999 = list(n_tip = 43, max_score = 415, ref_time_s = 3.0), + Schulze2007 = list(n_tip = 52, max_score = 168, ref_time_s = 4.0), + Eklund2004 = list(n_tip = 54, max_score = 450, ref_time_s = 4.0), + Zanol2014 = list(n_tip = 74, max_score = 1345, ref_time_s = 7.0), + Giles2015 = list(n_tip = 78, max_score = 725, ref_time_s = 7.0), + Dikow2009 = list(n_tip = 88, max_score = 1625, ref_time_s = 10.0) +) + +# Select datasets +default_names <- c("Vinther2008", "Agnarsson2004", "Zhu2013") +if (is.null(dataset_arg) || dataset_arg == "") { + selected_names <- default_names +} else if (tolower(dataset_arg) == "all") { + selected_names <- names(all_benchmarks) +} else { + selected_names <- trimws(strsplit(dataset_arg, ",")[[1]]) + unknown <- setdiff(selected_names, names(all_benchmarks)) + if (length(unknown)) { + stop("Unknown datasets: ", paste(unknown, collapse = ", "), + "\nAvailable: ", paste(names(all_benchmarks), collapse = ", ")) + } +} + +benchmarks <- all_benchmarks[selected_names] + +# Resolve library path +if (is.null(lib_path)) { + candidates <- c(Sys.glob(".agent-*"), Sys.glob(".builds/TreeSearch-*")) + if (length(candidates)) { + lib_path <- candidates[1] + cat("Auto-detected library:", lib_path, "\n") + } else { + lib_path <- .libPaths()[1] + } +} + +cat("=== TreeSearch Performance Regression Benchmark ===\n") +cat(sprintf(" Library: %s\n", lib_path)) +cat(sprintf(" Datasets: %s\n", 
paste(selected_names, collapse = ", ")))
+cat(sprintf(" Budget: %gs per dataset\n", budget))  # %g: budget is a double and may be fractional
+cat(sprintf(" Threads: %d\n\n", n_threads))
+
+n_pass <- 0L
+n_fail <- 0L
+results <- list()  # one single-row data.frame per dataset, keyed by dataset name
+
+for (nm in names(benchmarks)) {
+  bm <- benchmarks[[nm]]
+  cat(sprintf("--- %s (%d tips) ---\n", nm, bm$n_tip))
+
+  script <- sprintf('
+    library(TreeSearch, lib.loc = "%s")
+    library(TreeTools)
+    ds <- TreeSearch::inapplicable.phyData[["%s"]]
+    at <- attributes(ds)
+    contrast <- at$contrast
+    tip_data <- matrix(unlist(ds, use.names = FALSE), nrow = length(ds), byrow = TRUE)
+    weight <- at$weight
+    levels <- at$levels
+    set.seed(4217)
+    t0 <- proc.time()
+    result <- TreeSearch:::ts_driven_search(
+      contrast, tip_data, weight, levels,
+      maxReplicates = 100L, targetHits = 3L,
+      ratchetCycles = 12L, driftCycles = 2L,
+      xssRounds = 3L, xssPartitions = 4L,
+      rssRounds = 1L, cssRounds = 0L,
+      fuseInterval = 3L, maxSeconds = %g,
+      verbosity = 0L, nThreads = %dL
+    )
+    elapsed <- (proc.time() - t0)[3]
+    cat(result$best_score, elapsed, result$replicates, sep = " ")
+  ', lib_path, nm, budget, n_threads)
+
+  tf <- tempfile(fileext = ".R")
+  writeLines(script, tf)
+  timeout <- max(budget * 3, 60)  # subprocess wall-clock cap: 3x the budget, never under 60s
+  output <- tryCatch(
+    system2("Rscript", tf, stdout = TRUE, stderr = FALSE, timeout = timeout),
+    error = function(e) paste("ERROR:", conditionMessage(e))
+  )
+  unlink(tf)
+
+  if (length(output) == 0 || startsWith(output[length(output)], "ERROR")) {
+    cat(" CRASHED or timed out\n")
+    n_fail <- n_fail + 1L
+    results[[nm]] <- data.frame(
+      dataset = nm, n_tip = bm$n_tip,
+      score = NA, elapsed = NA, replicates = NA, status = "CRASH",
+      stringsAsFactors = FALSE
+    )
+    next
+  }
+
+  vals <- strsplit(trimws(output[length(output)]), "\\s+")[[1]]  # last stdout line: "score elapsed replicates"; "\\s+" is the regex \s+ (a bare "\s" does not parse)
+  if (length(vals) < 2) {
+    cat(" Unexpected output:", output[length(output)], "\n")
+    n_fail <- n_fail + 1L
+    results[[nm]] <- data.frame(
+      dataset = nm, n_tip = bm$n_tip,
+      score = NA, elapsed = NA, replicates = NA, status = "ERROR",
+      stringsAsFactors = FALSE
+
) + next + } + + score <- as.numeric(vals[1]) + elapsed <- as.numeric(vals[2]) + reps <- if (length(vals) >= 3) as.integer(vals[3]) else NA_integer_ + + score_ok <- score <= bm$max_score + time_limit <- bm$ref_time_s * 3 * (budget / 30) + time_ok <- elapsed <= time_limit + + status <- if (score_ok && time_ok) "PASS" else "FAIL" + cat(sprintf(" Score: %.0f (max: %d) %s\n", + score, bm$max_score, + if (score_ok) "OK" else "REGRESSION")) + cat(sprintf(" Time: %.2fs (limit: %.1fs) %s\n", + elapsed, time_limit, + if (time_ok) "OK" else "REGRESSION")) + if (!is.na(reps)) cat(sprintf(" Reps: %d\n", reps)) + cat(sprintf(" Result: %s\n\n", status)) + + if (status == "PASS") n_pass <- n_pass + 1L + else n_fail <- n_fail + 1L + + results[[nm]] <- data.frame( + dataset = nm, n_tip = bm$n_tip, + score = score, elapsed = elapsed, replicates = reps, status = status, + stringsAsFactors = FALSE + ) +} + +cat(sprintf("=== Summary: %d PASS, %d FAIL ===\n", n_pass, n_fail)) + +# Write CSV output if requested +if (!is.null(output_file)) { + df <- do.call(rbind, results) + df$budget_s <- budget + df$threads <- n_threads + df$timestamp <- format(Sys.time(), "%Y-%m-%dT%H:%M:%S") + dir.create(dirname(output_file), showWarnings = FALSE, recursive = TRUE) + write.csv(df, output_file, row.names = FALSE) + cat(sprintf("Results written to %s\n", output_file)) +} + +if (n_fail > 0L) { + cat("\nREGRESSIONS DETECTED.\n") + quit(status = 1L) +} else { + cat("\nAll benchmarks passed.\n") + quit(status = 0L) +} diff --git a/dev/benchmarks/bench_score_micro.R b/dev/benchmarks/bench_score_micro.R new file mode 100644 index 000000000..2f11e2a58 --- /dev/null +++ b/dev/benchmarks/bench_score_micro.R @@ -0,0 +1,62 @@ +# Micro-benchmark: just Fitch scoring, no search +# Usage: Rscript dev/benchmarks/bench_score_micro.R +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else ".agent-pgo" + +library(TreeSearch, lib.loc = lib_path) +library(TreeTools) + 
+data("inapplicable.phyData") + +prep_ds <- function(dataset) { + at <- attributes(dataset) + contrast <- at$contrast + storage.mode(contrast) <- "double" + tip_data <- matrix(unlist(dataset, use.names = FALSE), + nrow = length(dataset), byrow = TRUE) + storage.mode(tip_data) <- "integer" + weight <- at$weight + levels <- at$levels + min_steps <- apply(contrast, 2, function(x) sum(x > 0)) - 1L + min_steps <- pmax(min_steps, 0L) + list(contrast = contrast, tip_data = tip_data, weight = weight, + levels = levels, min_steps = min_steps) +} + +for (nm in c("Agnarsson2004", "Dikow2009")) { + ds <- inapplicable.phyData[[nm]] + ds_args <- prep_ds(ds) + + set.seed(7294) + tree <- RandomTree(names(ds), root = TRUE) + edge <- tree$edge + + # Time many scoring calls + n_iter <- 500L + t0 <- system.time({ + for (i in seq_len(n_iter)) { + TreeSearch:::ts_fitch_score( + edge, ds_args$contrast, ds_args$tip_data, + ds_args$weight, ds_args$levels, ds_args$min_steps + ) + } + }) + cat(nm, ": ", n_iter, " scores in ", t0["elapsed"], "s (", + round(t0["elapsed"] / n_iter * 1000, 2), " ms/score)\n", sep = "") +} + +# TBR phase breakdown +for (nm in c("Agnarsson2004", "Dikow2009")) { + ds <- inapplicable.phyData[[nm]] + ds_args <- prep_ds(ds) + + set.seed(7294) + edge <- RandomTree(names(ds), root = TRUE)$edge + + r <- TreeSearch:::ts_bench_tbr_phases( + edge, ds_args$contrast, ds_args$tip_data, + ds_args$weight, ds_args$levels, ds_args$min_steps + ) + cat(nm, " TBR: indirect=", r$time_indirect_us, "us, clip_incr=", + r$time_clip_incr_us, "us, total_candidates=", r$n_candidates, "\n", sep = "") +} diff --git a/dev/benchmarks/bench_simd.R b/dev/benchmarks/bench_simd.R new file mode 100644 index 000000000..2ec3f03f9 --- /dev/null +++ b/dev/benchmarks/bench_simd.R @@ -0,0 +1,128 @@ +# Phase 3E SIMD benchmark: measure TBR search performance. +# +# This benchmark compares SIMD-enabled TBR performance across dataset sizes. 
+# Since SIMD is compiled in (no runtime toggle), we measure absolute timings
+# and per-candidate costs to verify the Phase 3D profiling baseline is met
+# or improved.
+#
+# Usage: Rscript dev/benchmarks/bench_simd.R
+
+library(TreeSearch)
+library(TreeTools)
+
+cat("Phase 3E SIMD Benchmark\n")
+cat("=======================\n\n")
+
+# Helper: time n_reps TBR searches from a pectinate start tree; prints and returns a one-row data.frame (label, n_tip, median_s)
+bench_tbr <- function(dataset, n_reps = 3, label = "") {
+  ds <- list(
+    contrast = attr(dataset, "contrast"),
+    tip_data = t(vapply(dataset, I, dataset[[1]])),
+    weight = attr(dataset, "weight"),
+    levels = attr(dataset, "levels")
+  )
+  n_tip <- length(dataset)
+  tree <- Preorder(PectinateTree(dataset))
+
+  times <- vapply(seq_len(n_reps), function(i) {
+    set.seed(4200 + i)  # distinct but reproducible seed per repetition
+    t0 <- proc.time()
+    TreeSearch:::ts_tbr_search(
+      tree$edge, ds$contrast, ds$tip_data, ds$weight, ds$levels,
+      maxHits = 5L
+    )
+    elapsed <- (proc.time() - t0)[["elapsed"]]
+    elapsed
+  }, numeric(1))
+
+  med <- median(times)
+  cat(sprintf(" %-30s tips=%3d median=%.3fs (%s)\n",  # list every repetition, however many n_reps requested
+              label, n_tip, med, paste(sprintf("%.3f", times), collapse = ", ")))
+  data.frame(label = label, n_tip = n_tip, median_s = med,
+             stringsAsFactors = FALSE)
+}
+
+# Helper: time n_reps driven searches with fixed small settings; prints and returns a one-row data.frame (label, n_tip, median_s)
+bench_driven <- function(dataset, n_reps = 3, label = "") {
+  n_tip <- length(dataset)
+  ds <- list(
+    contrast = attr(dataset, "contrast"),
+    tip_data = t(vapply(dataset, I, dataset[[1]])),
+    weight = attr(dataset, "weight"),
+    levels = attr(dataset, "levels")
+  )
+
+  times <- vapply(seq_len(n_reps), function(i) {
+    set.seed(4200 + i)  # distinct but reproducible seed per repetition
+    t0 <- proc.time()
+    TreeSearch:::ts_driven_search(
+      ds$contrast, ds$tip_data, ds$weight, ds$levels,
+      maxReplicates = 2L, targetHits = 2L,
+      ratchetCycles = 2L, driftCycles = 0L,
+      xssPartitions = 2L, rssRounds = 0L,
+      cssRounds = 0L, cssPartitions = 2L,
+      fuseInterval = 0L, poolMaxSize = 2L,
+      maxSeconds = 30, verbosity = 0L
+    )
+    elapsed <- (proc.time() - t0)[["elapsed"]]
+    elapsed
+  }, numeric(1))
+
+  med <- median(times)
+  cat(sprintf(" %-30s tips=%3d median=%.3fs (%s)\n",  # list every repetition, however many n_reps requested
+              label, n_tip, med, paste(sprintf("%.3f", times), collapse = ", ")))
+  data.frame(label = label, n_tip = n_tip, median_s = med,
+             stringsAsFactors = FALSE)
+}
+
+# ---- TBR benchmarks ----
+cat("TBR search (5 hits to best):\n")
+results_tbr <- list()
+
+for (ds_name in c("Vinther2008", "Agnarsson2004", "Wills2012",
+                  "Aria2015", "Zhu2013")) {
+  dataset <- inapplicable.phyData[[ds_name]]
+  results_tbr[[ds_name]] <- bench_tbr(dataset, label = ds_name)
+}
+
+# DNA dataset (optional: needs phangorn; skipped instead of aborting when it cannot be loaded)
+suppressWarnings(try(data("Laurasiatherian", package = "phangorn"), silent = TRUE))
+if (exists("Laurasiatherian")) results_tbr[["Laurasiatherian"]] <-
+  bench_tbr(Laurasiatherian, label = "Laurasiatherian (DNA)")
+
+cat("\nDriven search (2 replicates, 30s timeout):\n")
+results_driven <- list()
+
+for (ds_name in c("Vinther2008", "Agnarsson2004", "Zhu2013")) {
+  dataset <- inapplicable.phyData[[ds_name]]
+  results_driven[[ds_name]] <- bench_driven(dataset, label = ds_name)
+}
+
+# Phase benchmark diagnostic (if available). NOTE(review): sibling scripts pass min_steps and read time_*_us fields - confirm this call and these field names
+cat("\nTBR phase timing (Phase 3D diagnostic):\n")
+for (ds_name in c("Vinther2008", "Zhu2013")) {
+  dataset <- inapplicable.phyData[[ds_name]]
+  ds <- list(
+    contrast = attr(dataset, "contrast"),
+    tip_data = t(vapply(dataset, I, dataset[[1]])),
+    weight = attr(dataset, "weight"),
+    levels = attr(dataset, "levels")
+  )
+  tree <- Preorder(PectinateTree(dataset))
+  set.seed(7777)
+  ph <- tryCatch(
+    TreeSearch:::ts_bench_tbr_phases(
+      tree$edge, ds$contrast, ds$tip_data, ds$weight, ds$levels,
+      maxHits = 3L
+    ),
+    error = function(e) NULL  # any failure of the diagnostic is swallowed and the dataset skipped
+  )
+  if (!is.null(ph)) {
+    cat(sprintf(" %s: clip=%.1fms indirect=%.1fms verify=%.1fms total=%.1fms\n",
+                ds_name,
+                ph$clip_us / 1000, ph$indirect_us / 1000,
+                ph$verify_us / 1000, ph$total_us / 1000))
+  }
+}
+
+cat("\nBenchmark complete.\n")
diff --git a/dev/benchmarks/bench_stress_large.R b/dev/benchmarks/bench_stress_large.R
new file mode 100644
index 000000000..47308da31
--- /dev/null
+++ b/dev/benchmarks/bench_stress_large.R
@@ -0,0 +1,268 @@ +# T-069: Stress test at 150–225 taxa +# Agent F, 2026-03-18 +# +# Three large neotrans matrices: project175 (165t), project3763 (205t), syab07204 (225t) +# Goals: +# 1. Per-phase timing and phase distribution at large size +# 2. TBR pass micro-benchmark via ts_bench_tbr_phases +# 3. Pool behaviour (pool_size, replicates, fuse events) +# 4. Scaling exponent for indirect scoring vs smaller datasets +# +# Run via: +# Rscript --vanilla -e "library(TreeSearch, lib.loc='.agent-f'); source('dev/benchmarks/bench_stress_large.R')" + +library(TreeSearch, lib.loc = ".agent-f") +library(TreeTools) + +NEOTRANS_DIR <- "../neotrans/inst/matrices" +MATRICES <- c("project175.nex", "project3763.nex", "syab07204.nex") + +# ---- Helpers ---------------------------------------------------------------- + +load_nex <- function(file) { + path <- file.path(NEOTRANS_DIR, file) + ReadAsPhyDat(path) +} + +prep_ds <- function(phyDat) { + at <- attributes(phyDat) + contrast <- at$contrast + storage.mode(contrast) <- "double" + tip_data <- matrix(unlist(phyDat, use.names = FALSE), + nrow = length(phyDat), byrow = TRUE) + storage.mode(tip_data) <- "integer" + weight <- at$weight + levels <- at$levels + # min_steps: number of non-zero contrast entries minus 1, clamped to 0 + min_steps <- pmax(apply(contrast, 2, function(x) sum(x > 0)) - 1L, 0L) + list(contrast = contrast, tip_data = tip_data, weight = weight, + levels = levels, min_steps = min_steps, + n_taxa = length(phyDat), n_chars = ncol(tip_data)) +} + +# ---- Section 1: Load matrices and summarise --------------------------------- + +cat("=== T-069 Large-Matrix Stress Test ===\n\n") +cat("=== Section 1: Dataset summary ===\n\n") + +datasets <- list() +for (f in MATRICES) { + cat(" Loading", f, "...\n") + pd <- load_nex(f) + ds <- prep_ds(pd) + datasets[[f]] <- ds + inappl_pct <- if (!is.null(attributes(pd)$levels) && + "-" %in% attributes(pd)$levels) { + round(100 * mean(unlist(pd) == which(attributes(pd)$levels == "-")), 
1) + } else 0 + cat(sprintf(" %s: %d taxa, %d chars, inapplicable_pct=%.1f%%\n", + f, ds$n_taxa, ds$n_chars, inappl_pct)) +} + +# ---- Section 2: TBR pass micro-benchmark ------------------------------------ + +cat("\n=== Section 2: TBR pass micro-benchmark (ts_bench_tbr_phases) ===\n\n") + +tbr_results <- list() +for (f in MATRICES) { + ds <- datasets[[f]] + cat(sprintf(" %s (%d tips)...\n", f, ds$n_taxa)) + + reps_raw <- vector("list", 3) + for (i in 1:3) { + set.seed(4100 + i) + tree <- RandomTree(ds$n_taxa, root = TRUE) + reps_raw[[i]] <- TreeSearch:::ts_bench_tbr_phases( + tree$edge, + ds$contrast, ds$tip_data, ds$weight, ds$levels, + ds$min_steps + ) + } + + avg <- function(field) mean(vapply(reps_raw, `[[`, numeric(1), field)) + row <- data.frame( + file = f, + n_tips = reps_raw[[1]]$n_tips, + n_blocks = reps_raw[[1]]$n_blocks, + total_words = reps_raw[[1]]$total_words, + has_na = reps_raw[[1]]$has_na, + n_clips = avg("n_clips"), + n_candidates = avg("n_candidates"), + full_rescore_us = avg("time_full_rescore_us"), + clip_incr_us = avg("time_clip_incr_us"), + indirect_us = avg("time_indirect_us"), + unclip_us = avg("time_unclip_us"), + snap_save_us = avg("time_snapshot_save_us"), + snap_restore_us = avg("time_snapshot_restore_us"), + snap_bytes = avg("snapshot_bytes"), + stringsAsFactors = FALSE + ) + tbr_results[[f]] <- row + cat(sprintf(" clips=%.0f cands=%.0f indirect=%.0fms snap=%.1fKB\n", + row$n_clips, row$n_candidates, + row$indirect_us / 1000, row$snap_bytes / 1024)) +} + +tbr_df <- do.call(rbind, tbr_results) +rownames(tbr_df) <- NULL + +cat("\nTBR phase timing (μs, per pass):\n") +print(tbr_df[, c("file", "n_tips", "n_blocks", "full_rescore_us", + "clip_incr_us", "indirect_us", "unclip_us", + "snap_save_us", "snap_restore_us")], digits = 4) + +cat("\nPer-candidate indirect timing (ns):\n") +ns_cand <- round(1000 * tbr_df$indirect_us / tbr_df$n_candidates, 1) +print(data.frame(file = tbr_df$file, n_tips = tbr_df$n_tips, + n_candidates = 
round(tbr_df$n_candidates), + indirect_total_ms = round(tbr_df$indirect_us / 1000, 1), + ns_per_candidate = ns_cand)) + +# ---- Section 3: Scaling vs smaller datasets -------------------------------- +# +# Pull synthetic-series data from bench_memory.R baselines if available, +# otherwise run a quick synthetic series here. + +cat("\n=== Section 3: Scaling analysis ===\n\n") + +# Quick synthetic series: 20, 50, 100, 200, + new 225 point from tbr_df +make_synthetic <- function(n_tips, n_chars = 200, na_prob = 0.1) { + tree <- RandomTree(n_tips, root = TRUE) + mat <- matrix( + sample(c("0", "1", "-"), n_tips * n_chars, replace = TRUE, + prob = c((1 - na_prob) / 2, (1 - na_prob) / 2, na_prob)), + n_tips, n_chars, + dimnames = list(tree$tip.label, NULL) + ) + MatrixToPhyDat(mat) +} + +bench_tbr_one <- function(n_tips, n_chars = 200, na_prob = 0.1, seed = 4200) { + set.seed(seed) + pd <- make_synthetic(n_tips, n_chars, na_prob) + ds <- prep_ds(pd) + set.seed(seed + 1) + tree <- RandomTree(n_tips, root = TRUE) + r <- TreeSearch:::ts_bench_tbr_phases( + tree$edge, + ds$contrast, ds$tip_data, ds$weight, ds$levels, + ds$min_steps + ) + data.frame( + n_tips = n_tips, + n_candidates = r$n_candidates, + indirect_us = r$time_indirect_us, + clip_incr_us = r$time_clip_incr_us + ) +} + +synth_sizes <- c(20, 50, 100, 150, 200, 225) +cat(" Synthetic scaling series:", paste(synth_sizes, collapse = ", "), "tips...\n") +synth_rows <- lapply(synth_sizes, function(n) { + cat(" n =", n, "\n") + bench_tbr_one(n) +}) +synth_df <- do.call(rbind, synth_rows) +print(synth_df) + +# Fit scaling exponents +if (nrow(synth_df) >= 4) { + fit_indirect <- lm(log(indirect_us) ~ log(n_tips), data = synth_df) + fit_candidates <- lm(log(n_candidates) ~ log(n_tips), data = synth_df) + fit_clip <- lm(log(clip_incr_us) ~ log(n_tips), data = synth_df) + cat(sprintf("\nScaling exponents (log-log fit):\n")) + cat(sprintf(" indirect_us ~ n^%.2f (expected ~2.0)\n", coef(fit_indirect)[2])) + cat(sprintf(" 
n_candidates ~ n^%.2f (expected ~2.0)\n", coef(fit_candidates)[2])) + cat(sprintf(" clip_incr_us ~ n^%.2f\n", coef(fit_clip)[2])) +} + +# ---- Section 4: Full driven search (default params, 2 seeds) --------------- + +cat("\n=== Section 4: Full driven search at default params ===\n\n") +cat(" (maxReplicates=2, nThreads=2, default strategy)\n\n") + +driven_results <- list() +for (f in MATRICES) { + ds <- datasets[[f]] + cat(sprintf("--- %s (%d tips, %d chars) ---\n", f, ds$n_taxa, ds$n_chars)) + + # Auto-select strategy: replicate what MaximizeParsimony() does + # For large matrices, thorough if nChar < 100 AND nTip >= 65 + nTip <- ds$n_taxa + nChar <- ds$n_chars + use_thorough <- (nTip >= 65) && (nChar < 100) + if (use_thorough) { + ratchet <- 20L; drift <- 12L; xss <- 1L; rss <- 1L; css <- 0L + strat_name <- "thorough" + } else { + ratchet <- 5L; drift <- 2L; xss <- 1L; rss <- 1L; css <- 0L + strat_name <- "default" + } + cat(sprintf(" Strategy: %s (ratchet=%d, drift=%d)\n", strat_name, ratchet, drift)) + + run_list <- list() + for (seed_i in 1:2) { + set.seed(4300 + seed_i) + t0 <- proc.time() + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 2L, + targetHits = 1L, + ratchetCycles = ratchet, + driftCycles = drift, + xssRounds = xss, + rssRounds = rss, + cssRounds = css, + cssPartitions = 3L, + xssPartitions = 3L, + fuseInterval = 5L, + maxSeconds = 300, + verbosity = 0L, + nThreads = 2L + ) + elapsed <- (proc.time() - t0)[3] + run_list[[seed_i]] <- list(result = result, elapsed = elapsed) + cat(sprintf(" seed %d: %.2fs score=%.1f reps=%d pool=%d\n", + seed_i, elapsed, result$best_score, + result$replicates, result$pool_size)) + } + + # Per-phase breakdown from first run + r1 <- run_list[[1]]$result + if (!is.null(r1$timings)) { + timings <- r1$timings + cpp_total <- sum(unlist(timings)) + cat(sprintf(" Per-phase breakdown (seed 1):\n")) + for (ph in names(timings)) { + pct <- if (cpp_total > 0) 100 * 
timings[[ph]] / cpp_total else 0 + cat(sprintf(" %-12s %7.0f ms (%4.1f%%)\n", ph, timings[[ph]], pct)) + } + cat(sprintf(" %-12s %7.0f ms (C++ total)\n", "TOTAL", cpp_total)) + } + + driven_results[[f]] <- list( + file = f, + n_tips = ds$n_taxa, + n_chars = ds$n_chars, + strategy = strat_name, + score1 = run_list[[1]]$result$best_score, + score2 = run_list[[2]]$result$best_score, + time1 = run_list[[1]]$elapsed, + time2 = run_list[[2]]$elapsed, + pool1 = run_list[[1]]$result$pool_size, + reps1 = run_list[[1]]$result$replicates + ) + cat("\n") +} + +cat("=== Summary table ===\n\n") +summary_df <- do.call(rbind, lapply(driven_results, as.data.frame)) +rownames(summary_df) <- NULL +print(summary_df) + +# Save results +out_path <- "dev/benchmarks/stress_large_results.csv" +write.csv(summary_df, out_path, row.names = FALSE) +cat("\nResults written to", out_path, "\n") + +cat("\n=== T-069 complete ===\n") diff --git a/dev/benchmarks/bench_subprocess.R b/dev/benchmarks/bench_subprocess.R new file mode 100644 index 000000000..14a6ef216 --- /dev/null +++ b/dev/benchmarks/bench_subprocess.R @@ -0,0 +1,124 @@ +# Subprocess-isolated benchmark: each run in its own Rscript process. +# Workaround for T-025 (ratchet-triggered optimization-dependent UB that +# causes segfaults on consecutive ts_driven_search calls). 
+
+library(TreeSearch)
+library(TreeTools)
+
+source("dev/benchmarks/bench_datasets.R")
+source("dev/benchmarks/bench_framework.R")
+
+# Datasets (names in TreeSearch::inapplicable.phyData) run by the grid,
+# listed in order of increasing tip count.
+GRID_DATASETS <- c(
+  "Longrich2010",   # 20 tips
+  "Vinther2008",    # 23 tips
+  "Aria2015",       # 35 tips
+  "Griswold1999",   # 43 tips
+  "Agnarsson2004",  # 62 tips
+  "Zhu2013",        # 75 tips
+  "Giles2015",      # 78 tips
+  "Dikow2009"       # 88 tips
+)
+
+# Run one benchmark in its own Rscript process.
+#
+# The search runs in a child process so that a crash (see T-025) only loses
+# that run rather than the whole grid.
+#
+# Arguments:
+#   ds_name        name of a dataset in TreeSearch::inapplicable.phyData
+#   strat_name     strategy name, handed to get_strategy() in the child
+#   seed           integer RNG seed, set in the child before the search
+#   maxSeconds     search budget in whole seconds; the child process is
+#                  given 30 s beyond this before system2() times it out
+#   maxReplicates  maximum number of search replicates
+#
+# Returns a one-row data.frame, or NULL when the child crashed, timed out or
+# did not print the expected 15 comma-separated values.
+run_one_subprocess <- function(ds_name, strat_name, seed, maxSeconds = 20L,
+                               maxReplicates = 100L) {
+  # Child script. The final cat() prints 15 comma-separated fields that are
+  # parsed positionally below.
+  script <- sprintf('
+library(TreeSearch, lib.loc = if (dir.exists(".agent-a")) ".agent-a" else .libPaths())
+library(TreeTools)
+source("dev/benchmarks/bench_datasets.R")
+source("dev/benchmarks/bench_framework.R")
+ds <- prepare_ts_data(TreeSearch::inapplicable.phyData[["%s"]])
+strat <- get_strategy("%s")
+targetHits <- max(10L, ds$n_taxa %%/%% 5L)
+args <- c(
+  list(contrast = ds$contrast, tip_data = ds$tip_data,
+       weight = ds$weight, levels = ds$levels,
+       maxReplicates = %dL, targetHits = targetHits,
+       maxSeconds = %d, verbosity = 0L),
+  strat)
+set.seed(%dL)
+t0 <- proc.time()
+result <- do.call(TreeSearch:::ts_driven_search, args)
+wall <- as.double((proc.time() - t0)[3])
+cat(result$best_score, result$replicates, result$hits_to_best,
+    result$pool_size, as.integer(result$timed_out), wall,
+    result$timings[["wagner_ms"]], result$timings[["tbr_ms"]],
+    result$timings[["xss_ms"]], result$timings[["rss_ms"]],
+    result$timings[["css_ms"]], result$timings[["ratchet_ms"]],
+    result$timings[["drift_ms"]], result$timings[["final_tbr_ms"]],
+    result$timings[["fuse_ms"]], sep = ",")
+', ds_name, strat_name, maxReplicates, maxSeconds, seed)
+
+  tf <- tempfile(fileext = ".R")
+  writeLines(script, tf)
+  on.exit(unlink(tf), add = TRUE)
+
+  # Launch the Rscript that belongs to the running R installation, not
+  # whichever one happens to be first on PATH.
+  rscript <- file.path(R.home("bin"), "Rscript")
+  out <- tryCatch(
+    system2(rscript, c("--no-save", tf),
+            stdout = TRUE, stderr = FALSE, timeout = maxSeconds + 30L),
+    error = function(e) NA_character_
+  )
+
+  if (length(out) == 0 || is.na(out[1])) return(NULL)
+
+  # Only the last output line carries the result; anything non-numeric there
+  # (e.g. a partial error message) is rejected by the field-count check.
+  vals <- suppressWarnings(
+    as.numeric(strsplit(out[length(out)], ",")[[1]])
+  )
+  if (length(vals) != 15) return(NULL)
+
+  data.frame(
+    dataset = ds_name, strategy = strat_name, seed = seed,
+    n_taxa = length(TreeSearch::inapplicable.phyData[[ds_name]]),
+    best_score = vals[1], replicates = vals[2], hits_to_best = vals[3],
+    pool_size = vals[4], timed_out = as.logical(vals[5]),
+    wall_s = vals[6],
+    wagner_ms = vals[7], tbr_ms = vals[8], xss_ms = vals[9],
+    rss_ms = vals[10], css_ms = vals[11], ratchet_ms = vals[12],
+    drift_ms = vals[13], final_tbr_ms = vals[14], fuse_ms = vals[15],
+    stringsAsFactors = FALSE
+  )
+}
+
+# Run every dataset x strategy x replicate combination, each in its own
+# subprocess, and write the successful rows to dev/benchmarks/results_grid.csv.
+#
+# Arguments:
+#   dataset_names   datasets to run (default GRID_DATASETS)
+#   strategy_names  strategies to run; the default STRATEGY_NAMES is presumably
+#                   defined by one of the sourced bench_* files (TODO confirm)
+#   replicates      runs per dataset/strategy pair
+#   maxSeconds      per-run search budget in seconds
+#   base_seed       seed of the first run; run i uses base_seed + 7 * (i - 1)
+#
+# Returns (invisibly) the combined data.frame, or NULL if no run succeeded,
+# in which case no file is written and earlier results are left untouched.
+run_grid_safe <- function(dataset_names = GRID_DATASETS,
+                          strategy_names = STRATEGY_NAMES,
+                          replicates = 3L,
+                          maxSeconds = 20L,
+                          base_seed = 7142L) {
+  n_combos <- length(dataset_names) * length(strategy_names) * replicates
+  cat(sprintf("Grid: %d datasets x %d strategies x %d reps = %d runs (subprocess)\n",
+              length(dataset_names), length(strategy_names), replicates, n_combos))
+
+  rows <- vector("list", n_combos)
+  idx <- 0L
+
+  for (ds_name in dataset_names) {
+    for (strat_name in strategy_names) {
+      for (rep_idx in seq_len(replicates)) {
+        idx <- idx + 1L
+        seed <- base_seed + (idx - 1L) * 7L
+
+        cat(sprintf("[%3d/%d] %-15s x %-16s rep %d ... ",
+                    idx, n_combos, ds_name, strat_name, rep_idx))
+
+        res <- run_one_subprocess(ds_name, strat_name, seed,
+                                  maxSeconds = maxSeconds)
+        if (is.null(res)) {
+          cat("CRASH/ERROR\n")
+          next
+        }
+        cat(sprintf("score=%.0f wall=%.1fs reps=%d %s\n",
+                    res$best_score, res$wall_s, res$replicates,
+                    if (isTRUE(res$timed_out)) "[TIMEOUT]" else ""))
+        rows[[idx]] <- res
+      }
+    }
+  }
+
+  ok <- !vapply(rows, is.null, logical(1))
+  if (!any(ok)) {
+    cat("\nNo run succeeded; nothing written.\n")
+    return(invisible(NULL))
+  }
+
+  result <- do.call(rbind, rows[ok])
+  outfile <- "dev/benchmarks/results_grid.csv"
+  write.csv(result, outfile, row.names = FALSE)
+  cat(sprintf("\nResults saved to %s (%d rows)\n", outfile, nrow(result)))
+  invisible(result)
+}
+
+# Main
+cat("Starting subprocess-isolated benchmark grid...\n\n")
+results <- run_grid_safe()
diff --git a/dev/benchmarks/bench_t252_mbank_training.R b/dev/benchmarks/bench_t252_mbank_training.R
new file mode 100644
index 000000000..8f39a0b10
--- /dev/null
+++ b/dev/benchmarks/bench_t252_mbank_training.R
@@ -0,0 +1,129 @@
+#!/usr/bin/env Rscript
+# T-252: MorphoBank training-set baseline benchmark
+#
+# Runs the fixed 25-matrix training sample at 30s, 60s, and 120s budgets
+# with the "default" strategy, 5 seeds per combination.
+# Total: 25 matrices x 3 budgets x 5 seeds = 375 runs.
+# Estimated wall time: ~4–5 hours (most runs hit timeout).
+#
+# Usage:
+#   Rscript bench_t252_mbank_training.R [output_dir]
+#     output_dir: where to write CSV results. Default: "."
+#
+# Requires: TreeSearch (installed), neotrans corpus in ../neotrans/
+
+args <- commandArgs(trailingOnly = TRUE)
+outdir <- if (length(args) >= 1) args[1] else "."
+
+# Find the repo root (this script lives in dev/benchmarks/)
+# When run from repo root (cd $REPO; Rscript dev/benchmarks/...), getwd() is it.
+repo_root <- getwd()
+if (!file.exists(file.path(repo_root, "DESCRIPTION"))) {
+  # Not started from the repo root: locate this script and go two levels up.
+  # Under Rscript the script path is passed as --file=<path>; under source()
+  # it is the sourcing frame's `ofile`. Fall back to getwd() if neither works.
+  file_arg <- grep("^--file=", commandArgs(trailingOnly = FALSE), value = TRUE)
+  script_dir <- if (length(file_arg) > 0) {
+    dirname(normalizePath(sub("^--file=", "", file_arg[1]), mustWork = FALSE))
+  } else {
+    tryCatch(
+      dirname(normalizePath(sys.frame(1)$ofile)),
+      error = function(e) getwd()
+    )
+  }
+  repo_root <- normalizePath(file.path(script_dir, "..", ".."),
+                             mustWork = FALSE)
+}
+
+# Create and resolve the output directory BEFORE changing directory, so that
+# a relative path is taken relative to where the script was invoked and a
+# missing directory is found now rather than after the first budget has run.
+dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
+outdir <- normalizePath(outdir, mustWork = FALSE)
+setwd(repo_root)
+
+cat("=== T-252: MorphoBank Training-Set Benchmark ===\n")
+cat("Repo root:", repo_root, "\n")
+cat("Output dir:", outdir, "\n")
+cat("Started:", format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "\n\n")
+
+library(TreeSearch)
+library(TreeTools)
+
+source("dev/benchmarks/bench_datasets.R")
+source("dev/benchmarks/bench_framework.R")
+
+# ---- Configuration ----
+BUDGETS <- c(30, 60, 120) # seconds
+N_SEEDS <- 5L
+BASE_SEED <- 3847L
+STRATEGY <- "default"
+
+# ---- Load training matrices ----
+# load_mbank_catalogue(), load_mbank_datasets() and MBANK_FIXED_SAMPLE are not
+# defined in this script; presumably they come from the sourced bench_* files.
+cat("Loading MorphoBank catalogue...\n")
+catalogue <- load_mbank_catalogue()
+cat(sprintf("Catalogue: %d usable matrices\n", nrow(catalogue)))
+
+cat(sprintf("Loading %d fixed training matrices...\n",
+            length(MBANK_FIXED_SAMPLE)))
+datasets <- load_mbank_datasets(catalogue, keys = MBANK_FIXED_SAMPLE)
+cat(sprintf("Successfully loaded: %d matrices\n\n", length(datasets)))
+
+if (length(datasets) == 0) {
+  stop("No datasets loaded. Is the neotrans repo available?")
+}
+
+# ---- Characterize datasets ----
+cat("Dataset characteristics:\n")
+chars <- do.call(rbind, lapply(names(datasets), function(nm) {
+  ch <- characterize_dataset(datasets[[nm]])
+  ch$key <- nm
+  ch
+}))
+chars <- chars[order(chars$n_taxa), ]
+print(chars[, c("key", "n_taxa", "n_chars", "n_patterns",
+                "pct_missing", "pct_inapp", "n_app_states")])
+cat("\n")
+
+# ---- Run benchmarks ----
+# NOTE(review): `strat` is not used below (run_benchmark_grid() receives the
+# strategy name). The call is kept in case it validates STRATEGY — confirm.
+strat <- get_strategy(STRATEGY)
+all_results <- list()
+
+for (budget in BUDGETS) {
+  cat(sprintf("\n========== Budget: %ds ==========\n", budget))
+
+  results <- run_benchmark_grid(
+    dataset_names = names(datasets),
+    strategy_names = STRATEGY,
+    replicates = N_SEEDS,
+    maxReplicates = 100L,
+    maxSeconds = budget,
+    base_seed = BASE_SEED,
+    datasets = datasets
+  )
+  results$budget_s <- budget
+  results$source <- "mbank_training"
+
+  # Save intermediate results per budget, so a later failure does not lose
+  # the budgets that have already completed.
+  outfile <- file.path(
+    outdir,
+    sprintf("t252_mbank_%ds_%s.csv", budget,
+            format(Sys.time(), "%Y%m%d_%H%M"))
+  )
+  write.csv(results, outfile, row.names = FALSE)
+  cat(sprintf("Saved %d rows to %s\n", nrow(results), outfile))
+
+  all_results[[as.character(budget)]] <- results
+}
+
+# ---- Combine and save final results ----
+final <- do.call(rbind, all_results)
+final_file <- file.path(outdir,
+                        sprintf("t252_mbank_all_%s.csv",
+                                format(Sys.time(), "%Y%m%d_%H%M")))
+write.csv(final, final_file, row.names = FALSE)
+cat(sprintf("\n=== Final results: %d rows saved to %s ===\n",
+            nrow(final), final_file))
+
+# ---- Summary statistics ----
+cat("\n=== Summary by budget ===\n")
+for (budget in BUDGETS) {
+  sub <- final[final$budget_s == budget, ]
+  cat(sprintf("\n--- %ds budget (%d runs) ---\n", budget, nrow(sub)))
+  cat(sprintf(" Median score: %.1f\n", median(sub$best_score, na.rm = TRUE)))
+  cat(sprintf(" Timed out: %d/%d (%.0f%%)\n",
+              sum(sub$timed_out, na.rm = TRUE), nrow(sub),
+              100 * mean(sub$timed_out, na.rm = TRUE)))
+  cat(sprintf(" Median replicates: %.0f\n",
+              median(sub$replicates, na.rm = TRUE)))
+  cat(sprintf(" Median wall time: %.1fs\n",
+              median(sub$wall_s, na.rm = TRUE)))
+}
+
+cat("\n=== Completed:", format(Sys.time(), "%Y-%m-%d %H:%M:%S"), "===\n")
diff --git a/dev/benchmarks/bench_t265_regression.R b/dev/benchmarks/bench_t265_regression.R
new file mode 100644
index 000000000..f17ff7b89
--- /dev/null
+++ b/dev/benchmarks/bench_t265_regression.R
@@ -0,0 +1,289 @@
+#!/usr/bin/env Rscript
+# T-265: Per-replicate search quality regression diagnosis
+#
+# DESIGNED FOR HAMILTON HPC. Do not run locally.
+#
+# Tests whether the quality regression is in preset params vs engine code
+# by comparing 3 configurations on the datasets with largest TNT gaps:
+#
+#   r2_equiv    — Minimal pipeline matching R2 structure: 12 ratchet (4%,
+#                 auto moves), 2 drift, no sectorial, 1 Wagner, no tabu,
+#                 no NNI warmup. Tests what R2 actually ran.
+#   r2_modern   — R2 structure + modern ratchet tuning: 12 ratchet (25%,
+#                 5 moves), 0 drift, 1 Wagner, no sectorial, no tabu,
+#                 NNI warmup ON. Tests whether modern ratchet params help
+#                 with a minimal pipeline.
+#   auto_preset — Current auto-selected preset (default or thorough).
+#                 Tests whether added complexity helps or hurts.
+#
+# If r2_equiv or r2_modern produce better scores -> preset complexity is
+# the problem. If all configs show the same regression -> engine code issue.
+#
+# Usage:
+#   Rscript bench_t265_regression.R [timeout_s] [output_dir]
+#
+# Default: 120s budget, output to current directory.
+
+library(TreeSearch)
+library(TreeTools)
+
+args <- commandArgs(trailingOnly = TRUE)
+timeout_s <- if (length(args) >= 1) as.integer(args[1]) else 120L
+output_dir <- if (length(args) >= 2) args[2] else "."
+
+# Fail before any search starts if the arguments are unusable: a non-numeric
+# timeout parses to NA, and a missing output directory would otherwise only
+# be noticed when the results are written.
+if (is.na(timeout_s) || timeout_s <= 0L) {
+  stop("timeout_s must be a positive whole number of seconds")
+}
+dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
+
+cat("=== T-265: Per-Replicate Quality Regression Diagnosis ===\n")
+cat(sprintf("Timeout: %ds\n", timeout_s))
+cat(sprintf("TreeSearch version: %s\n", packageVersion("TreeSearch")))
+cat(sprintf("Output dir: %s\n", output_dir))
+cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")))
+
+# ---- Datasets ----
+# 8 datasets with largest persistent TNT gaps, plus Wilson2003 from T-265
+gap_names <- c(
+  "Wortley2006", "Eklund2004", "Wilson2003", "Conrad2008",
+  "Geisler2001", "Zanol2014", "Zhu2013", "Giles2015", "Dikow2009"
+)
+
+# Convert inapplicable to missing for EW Fitch scoring (match TNT).
+#
+# Every contrast row that codes only the inapplicable token "-" is set to
+# all ones, i.e. fully ambiguous. Datasets without a "-" level are returned
+# unchanged.
+fitch_mode <- function(dataset) {
+  contrast <- attr(dataset, "contrast")
+  levels <- attr(dataset, "levels")
+  inapp_col <- match("-", levels)
+  if (is.na(inapp_col)) return(dataset)
+  only_inapp <- contrast[, inapp_col] == 1 & rowSums(contrast) == 1
+  contrast[only_inapp, ] <- 1
+  attr(dataset, "contrast") <- contrast
+  dataset
+}
+
+datasets <- lapply(
+  setNames(gap_names, gap_names),
+  function(nm) fitch_mode(inapplicable.phyData[[nm]])
+)
+
+# ---- Configurations ----
+# Each entry has a `label` (written to the CSV), a `desc` (printed), and,
+# except for auto_preset, a `control` object passed to MaximizeParsimony().
+configs <- list(
+  r2_equiv = list(
+    label = "r2_equiv",
+    desc = "R2 pipeline: 12 ratchet (4%), 2 drift, no sectorial, no tabu",
+    control = SearchControl(
+      ratchetCycles = 12L,
+      ratchetPerturbProb = 0.04,
+      ratchetPerturbMode = 0L,
+      ratchetPerturbMaxMoves = 0L,
+      ratchetAdaptive = FALSE,
+      driftCycles = 2L,
+      driftAfdLimit = 5L,
+      driftRfdLimit = 0.15,
+      xssRounds = 0L, rssRounds = 0L, cssRounds = 0L,
+      wagnerStarts = 1L,
+      tabuSize = 0L,
+      nniFirst = FALSE, sprFirst = FALSE,
+      perturbStopFactor = 0L,
+      adaptiveLevel = FALSE,
+      maxOuterResets = 0L,
+      outerCycles = 1L,
+      fuseInterval = 5L,
+      fuseAcceptEqual = FALSE,
+      poolMaxSize = 100L,
+      consensusStableReps = 0L,
+      nniPerturbCycles = 0L,
+      annealCycles = 0L,
+      adaptiveStart = FALSE,
+      enumTimeFraction = 0.1
+    )
+  ),
+  r2_modern = list(
+    label = "r2_modern",
+    desc = "R2 structure + modern ratchet (25%, 5 moves), NNI warmup, no drift",
+    control = SearchControl(
+      ratchetCycles = 12L,
+      ratchetPerturbProb = 0.25,
+      ratchetPerturbMode = 0L,
+      ratchetPerturbMaxMoves = 5L,
+      ratchetAdaptive = FALSE,
+      driftCycles = 0L,
+      xssRounds = 0L, rssRounds = 0L, cssRounds = 0L,
+      wagnerStarts = 1L,
+      tabuSize = 0L,
+      nniFirst = TRUE, sprFirst = FALSE,
+      perturbStopFactor = 0L,
+      adaptiveLevel = FALSE,
+      maxOuterResets = 0L,
+      outerCycles = 1L,
+      fuseInterval = 5L,
+      fuseAcceptEqual = FALSE,
+      poolMaxSize = 100L,
+      consensusStableReps = 0L,
+      nniPerturbCycles = 0L,
+      annealCycles = 0L,
+      adaptiveStart = FALSE,
+      enumTimeFraction = 0.1
+    )
+  ),
+  auto_preset = list(
+    label = "auto_preset",
+    desc = "Current auto-selected preset (default or thorough)"
+    # No control override — uses strategy = "auto"
+  )
+)
+
+seeds <- 1:5
+total_runs <- length(configs) * length(datasets) * length(seeds)
+cat(sprintf("Configs: %d, Datasets: %d, Seeds: %d -> %d total runs\n",
+            length(configs), length(datasets), length(seeds), total_runs))
+
+# TNT reference scores (from bench_intra_fuse.R and T-265 notes)
+tnt_best <- c(
+  Wortley2006 = 479, Eklund2004 = 438, Wilson2003 = 860,
+  Conrad2008 = 1725, Geisler2001 = 1293,
+  Zanol2014 = 1261, Zhu2013 = 624,
+  Giles2015 = 670, Dikow2009 = 1603
+)
+
+# ---- Run experiments ----
+results <- data.frame(
+  dataset = character(), n_tips = integer(), n_patterns = integer(),
+  auto_strategy = character(),
+  config = character(), seed = integer(), timeout_s = integer(),
+  score = numeric(), n_trees = integer(), replicates = integer(),
+  hits = integer(), wall_s = numeric(),
+  tnt_best = numeric(), gap = numeric(),
+  stringsAsFactors = FALSE
+)
+
+out_file <- file.path(output_dir,
+                      sprintf("t265_results_%ds.csv", timeout_s))
+
+# Build one results row. The outcome fields default to NA so that the same
+# constructor serves failed runs; `gap` is NA whenever the score or the TNT
+# reference is NA. unname() stops the names carried by `tnt_best[...]` from
+# becoming data-frame row names.
+make_row <- function(ds_name, ntip, npat, auto_strat, label, seed, elapsed,
+                     score = NA_real_, n_trees = NA_integer_,
+                     replicates = NA_integer_, hits = NA_integer_) {
+  tnt_ref <- unname(tnt_best[ds_name])
+  data.frame(
+    dataset = ds_name, n_tips = ntip, n_patterns = npat,
+    auto_strategy = auto_strat,
+    config = label, seed = seed, timeout_s = timeout_s,
+    score = score, n_trees = n_trees, replicates = replicates,
+    hits = hits, wall_s = elapsed,
+    tnt_best = tnt_ref, gap = score - tnt_ref,
+    stringsAsFactors = FALSE
+  )
+}
+
+run_idx <- 0L
+for (cfg_name in names(configs)) {
+  cfg <- configs[[cfg_name]]
+  cat(sprintf("\n--- Config: %s (%s) ---\n", cfg$label, cfg$desc))
+
+  for (ds_name in gap_names) {
+    ds <- datasets[[ds_name]]
+    ntip <- length(ds)
+    # NOTE(review): this is the sum of the pattern weights, which is what
+    # bench_t274 calls n_char; confirm "n_patterns" is the intended label.
+    npat <- sum(attr(ds, "weight"))
+    # Recorded for reference only. Presumably mirrors the package's automatic
+    # preset choice — TODO confirm thresholds against MaximizeParsimony().
+    auto_strat <- if (ntip <= 30) "sprint"
+                  else if (npat < 100) "default"
+                  else if (ntip >= 120) "large"
+                  else if (ntip >= 65) "thorough"
+                  else "default"
+
+    for (s in seeds) {
+      run_idx <- run_idx + 1L
+      cat(sprintf(" [%d/%d] %s / %s / seed=%d ... ",
+                  run_idx, total_runs, ds_name, cfg$label, s))
+
+      set.seed(s)
+      t0 <- proc.time()
+
+      tryCatch({
+        if (cfg_name == "auto_preset") {
+          res <- MaximizeParsimony(
+            ds,
+            maxSeconds = timeout_s,
+            strategy = "auto",
+            verbosity = 0L,
+            nThreads = 1L
+          )
+        } else {
+          res <- MaximizeParsimony(
+            ds,
+            maxSeconds = timeout_s,
+            strategy = "none",
+            control = cfg$control,
+            verbosity = 0L,
+            nThreads = 1L
+          )
+        }
+
+        elapsed <- (proc.time() - t0)[[3]]
+        best_score <- attr(res, "score")
+        n_trees <- length(res)
+        reps <- attr(res, "replicates")
+        hits <- attr(res, "hits")
+        tnt_ref <- tnt_best[ds_name]
+        gap <- if (!is.na(tnt_ref)) best_score - tnt_ref else NA_real_
+
+        # %+g rather than %+d: a non-integer gap must not raise an error
+        # here, which would record a successful search as a failure.
+        cat(sprintf("score=%g, gap=%s, reps=%d, %.1fs\n",
+                    best_score,
+                    if (is.na(gap)) "?" else sprintf("%+g", gap),
+                    reps, elapsed))
+
+        results <- rbind(results, make_row(
+          ds_name, ntip, npat, auto_strat, cfg$label, s, elapsed,
+          score = best_score, n_trees = n_trees, replicates = reps,
+          hits = hits
+        ))
+      }, error = function(e) {
+        elapsed <- (proc.time() - t0)[[3]]
+        cat(sprintf("ERROR: %s (%.1fs)\n", conditionMessage(e), elapsed))
+        # Handler runs in its own function scope, hence <<-.
+        results <<- rbind(results, make_row(
+          ds_name, ntip, npat, auto_strat, cfg$label, s, elapsed
+        ))
+      })
+    }
+
+    # Checkpoint after every dataset so that a job killed at the HPC
+    # wall-time limit still leaves the completed runs on disk.
+    write.csv(results, out_file, row.names = FALSE)
+  }
+}
+
+# ---- Save results ----
+write.csv(results, out_file, row.names = FALSE)
+cat(sprintf("\nResults saved to: %s\n", out_file))
+
+# ---- Summary ----
+cat("\n=== Summary by config × dataset (median score, median gap) ===\n\n")
+for (ds_name in gap_names) {
+  sub <- results[results$dataset == ds_name, ]
+  if (nrow(sub) == 0) next
+  cat(sprintf(" %s (%dt, %dp, auto=%s, TNT=%s):\n",
+              ds_name, sub$n_tips[1], sub$n_patterns[1],
+              sub$auto_strategy[1],
+              if (is.na(tnt_best[ds_name])) "?" else tnt_best[ds_name]))
+  for (cfg_name in names(configs)) {
+    cfg_sub <- sub[sub$config == configs[[cfg_name]]$label, ]
+    # Skip configs with no successful run: min()/max() of an all-NA column
+    # would only produce Inf values and warnings.
+    if (nrow(cfg_sub) == 0 || all(is.na(cfg_sub$score))) next
+    med_score <- median(cfg_sub$score, na.rm = TRUE)
+    med_gap <- median(cfg_sub$gap, na.rm = TRUE)
+    min_score <- min(cfg_sub$score, na.rm = TRUE)
+    max_score <- max(cfg_sub$score, na.rm = TRUE)
+    med_reps <- median(cfg_sub$replicates, na.rm = TRUE)
+    unique_scores <- length(unique(na.omit(cfg_sub$score)))
+    cat(sprintf(" %-14s median=%7.0f (range %g-%g), gap=%+.0f, reps=%.0f, unique_scores=%d\n",
+                configs[[cfg_name]]$label, med_score, min_score, max_score,
+                med_gap, med_reps, unique_scores))
+  }
+}
+
+# ---- Per-replicate convergence check ----
+cat("\n=== Score diversity across seeds (do all seeds find the same score?) ===\n\n")
+for (ds_name in gap_names) {
+  sub <- results[results$dataset == ds_name, ]
+  if (nrow(sub) == 0) next
+  cat(sprintf(" %s:\n", ds_name))
+  for (cfg_name in names(configs)) {
+    cfg_sub <- sub[sub$config == configs[[cfg_name]]$label, ]
+    if (nrow(cfg_sub) == 0) next
+    scores <- na.omit(cfg_sub$score)
+    if (length(scores) == 0) next
+    n_unique <- length(unique(scores))
+    cat(sprintf(" %-14s scores: %s (%d unique)\n",
+                configs[[cfg_name]]$label,
+                paste(scores, collapse = ", "),
+                n_unique))
+  }
+}
+
+cat(sprintf("\nCompleted: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")))
diff --git a/dev/benchmarks/bench_t269_interleaving.R b/dev/benchmarks/bench_t269_interleaving.R
new file mode 100644
index 000000000..90081885e
--- /dev/null
+++ b/dev/benchmarks/bench_t269_interleaving.R
@@ -0,0 +1,201 @@
+#!/usr/bin/env Rscript
+# T-269: Fine-grained sectorial interleaving benchmark
+#
+# DESIGNED FOR HAMILTON HPC. Do not run locally (hours of wall time).
+#
+# Tests whether fine-grained interleaving of sectorial search with ratchet
+# perturbation improves score quality. The key question: does performing
+# one sectorial pass per ratchet cycle (outerCycles = ratchetCycles) help
+# compared to the current thorough preset (outerCycles = 2)?
+#
+# Design:
+#   - Thorough preset as base (ratchetCycles=20, XSS+RSS+CSS, outerCycles=2)
+#   - Vary outerCycles ∈ {1, 2, 4, 10, 20} while holding ratchetCycles=20
+#   - 4 standard gap datasets (37–88 tips), 5 seeds, 30s + 60s budgets
+#   - EW scoring throughout (inapplicable → missing via fitch_mode)
+#
+#   outerCycles=1:  all 20 ratchet cycles in one block, then 1 sectorial pass
+#   outerCycles=2:  2 × 10 ratchet + 2 sectorial passes (current thorough)
+#   outerCycles=4:  4 × 5 ratchet + 4 sectorial passes
+#   outerCycles=10: 10 × 2 ratchet + 10 sectorial passes
+#   outerCycles=20: 20 × 1 ratchet + 20 sectorial passes (TNT pattern)
+#
+# Usage:
+#   Rscript bench_t269_interleaving.R [timeout_s] [output_dir]
+#     timeout_s: search budget in seconds. Default: 30
+#     output_dir: where to write CSV results. Default: "."
+#
+# Output: t269_interleaving_{timeout}s.csv
+
+library(TreeSearch)
+library(TreeTools)
+
+args <- commandArgs(trailingOnly = TRUE)
+timeout_s <- if (length(args) >= 1) as.integer(args[1]) else 30L
+output_dir <- if (length(args) >= 2) args[2] else "."
+
+# Reject unusable arguments before hours of searching: a non-numeric timeout
+# parses to NA, and the output directory must exist when results are saved.
+if (is.na(timeout_s) || timeout_s <= 0L) {
+  stop("timeout_s must be a positive whole number of seconds")
+}
+dir.create(output_dir, showWarnings = FALSE, recursive = TRUE)
+
+cat("=== T-269: Fine-Grained Sectorial Interleaving Benchmark ===\n")
+cat(sprintf("Timeout: %ds\n", timeout_s))
+cat(sprintf("TreeSearch version: %s\n", packageVersion("TreeSearch")))
+cat(sprintf("Output dir: %s\n", output_dir))
+cat(sprintf("Started: %s\n\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")))
+
+# ---- Datasets ----
+# 4 standard datasets with persistent TNT gaps — range 37–88 tips.
+# inapplicable converted to missing for EW Fitch (match TNT).
+#
+# Every contrast row that codes only the inapplicable token "-" is set to
+# all ones, i.e. fully ambiguous. Datasets without a "-" level are returned
+# unchanged.
+fitch_mode <- function(dataset) {
+  contrast <- attr(dataset, "contrast")
+  levels <- attr(dataset, "levels")
+  inapp_col <- match("-", levels)
+  if (is.na(inapp_col)) return(dataset)
+  only_inapp <- contrast[, inapp_col] == 1 & rowSums(contrast) == 1
+  contrast[only_inapp, ] <- 1
+  attr(dataset, "contrast") <- contrast
+  dataset
+}
+
+bench_names <- c("Wortley2006", "Agnarsson2004", "Zhu2013", "Dikow2009")
+datasets <- lapply(
+  setNames(bench_names, bench_names),
+  function(nm) fitch_mode(inapplicable.phyData[[nm]])
+)
+
+# TNT reference scores (EW Fitch mode, from T-265)
+tnt_best <- c(
+  Wortley2006 = 479, Agnarsson2004 = 718,
+  Zhu2013 = 624, Dikow2009 = 1603
+)
+
+seeds <- 1:5
+
+# ---- Configs ----
+# Fixed thorough-preset parameters (ratchetCycles=20, no drift, no NNI-perturb)
+# outerCycles varies: 1, 2, 4, 10, 20.
+outer_cycles_grid <- c(1L, 2L, 4L, 10L, 20L)
+
+# Build the SearchControl for one value of outerCycles; every other setting
+# is held fixed so that outerCycles is the only thing that varies.
+build_control <- function(outer_cycles) {
+  SearchControl(
+    # Thorough preset base
+    ratchetCycles = 20L,
+    ratchetPerturbProb = 0.25,
+    ratchetPerturbMode = 2L,
+    ratchetPerturbMaxMoves = 5L,
+    ratchetAdaptive = FALSE, # off for cleaner comparison
+    # Vary this:
+    outerCycles = outer_cycles,
+    # Sectorial
+    xssRounds = 5L,
+    rssRounds = 5L,
+    cssRounds = 2L,
+    # No drift/NNI-perturb
+    driftCycles = 0L,
+    nniPerturbCycles = 0L,
+    # Other thorough settings
+    wagnerStarts = 3L,
+    nniFirst = TRUE,
+    consensusStableReps = 0L
+  )
+}
+
+configs <- setNames(
+  lapply(outer_cycles_grid, build_control),
+  sprintf("outer_%02d", outer_cycles_grid)
+)
+
+total_runs <- length(configs) * length(datasets) * length(seeds)
+cat(sprintf("Configs: %d (outerCycles: %s), Datasets: %d, Seeds: %d -> %d total runs\n\n",
+            length(configs),
+            paste(outer_cycles_grid, collapse = "/"),
+            length(datasets), length(seeds), total_runs))
+
+# ---- Run experiments ----
+results <- data.frame(
+  dataset = character(), n_tips = integer(), n_patterns = integer(),
+  outer_cycles = integer(), seed = integer(), timeout_s = integer(),
+  score = numeric(), n_trees = integer(), replicates = integer(),
+  wall_s = numeric(), tnt_best = numeric(), gap = numeric(),
+  stringsAsFactors = FALSE
+)
+
+# Build one results row. The outcome fields default to NA so that failed runs
+# are recorded too (as bench_t265 does) instead of vanishing from the CSV.
+# unname() stops the names carried by `tnt_best[...]` from becoming row names.
+make_row <- function(ds_name, ntip, npat, oc, seed, elapsed,
+                     score = NA_real_, n_trees = NA_integer_,
+                     replicates = NA_integer_) {
+  tnt_ref <- unname(tnt_best[ds_name])
+  data.frame(
+    dataset = ds_name, n_tips = ntip, n_patterns = npat,
+    outer_cycles = oc, seed = seed, timeout_s = timeout_s,
+    score = score, n_trees = n_trees, replicates = replicates,
+    wall_s = elapsed,
+    tnt_best = tnt_ref, gap = score - tnt_ref,
+    stringsAsFactors = FALSE
+  )
+}
+
+run_idx <- 0L
+for (i in seq_along(configs)) {
+  ctrl <- configs[[i]]
+  # Take the value from the grid rather than reading it back out of the
+  # control object, so this does not depend on SearchControl's internals.
+  oc <- outer_cycles_grid[[i]]
+  cat(sprintf("\n--- outerCycles = %d ---\n", oc))
+
+  for (ds_name in bench_names) {
+    ds <- datasets[[ds_name]]
+    ntip <- length(ds)
+    # NOTE(review): sum of pattern weights (bench_t274 calls this n_char);
+    # confirm "n_patterns" is the intended label.
+    npat <- sum(attr(ds, "weight"))
+
+    for (s in seeds) {
+      run_idx <- run_idx + 1L
+      cat(sprintf(" [%d/%d] %s / oc=%d / seed=%d ... ",
+                  run_idx, total_runs, ds_name, oc, s))
+
+      set.seed(s)
+      t0 <- proc.time()
+
+      tryCatch({
+        # NOTE(review): bench_t265 passes strategy = "none" alongside
+        # `control`; confirm the default strategy does not override `ctrl`.
+        res <- MaximizeParsimony(
+          ds,
+          maxSeconds = timeout_s,
+          control = ctrl,
+          verbosity = 0L,
+          nThreads = 1L
+        )
+
+        elapsed <- (proc.time() - t0)[[3]]
+        best_score <- attr(res, "score")
+        n_trees <- length(res)
+        reps <- attr(res, "replicates")
+        tnt_ref <- tnt_best[ds_name]
+        gap <- if (!is.na(tnt_ref)) best_score - tnt_ref else NA_real_
+
+        # %+g rather than %+d: a non-integer gap must not raise an error
+        # here, which would discard a successful search.
+        cat(sprintf("score=%g, gap=%s, reps=%d, %.1fs\n",
+                    best_score,
+                    if (is.na(gap)) "?" else sprintf("%+g", gap),
+                    reps, elapsed))
+
+        results <- rbind(results, make_row(
+          ds_name, ntip, npat, oc, s, elapsed,
+          score = best_score, n_trees = n_trees, replicates = reps
+        ))
+      }, error = function(e) {
+        elapsed <- (proc.time() - t0)[[3]]
+        cat(sprintf("ERROR: %s\n", conditionMessage(e)))
+        # Handler runs in its own function scope, hence <<-.
+        results <<- rbind(results, make_row(ds_name, ntip, npat, oc, s, elapsed))
+      })
+    }
+  }
+}
+
+# ---- Save results ----
+outfile <- file.path(
+  output_dir,
+  sprintf("t269_interleaving_%ds.csv", timeout_s)
+)
+write.csv(results, outfile, row.names = FALSE)
+cat(sprintf("\n=== Results written to %s (%d rows) ===\n",
+            outfile, nrow(results)))
+
+# ---- Quick summary ----
+# aggregate() stops with an error when there is nothing to aggregate, so only
+# summarise when at least one run produced a gap.
+scored <- results[!is.na(results$gap), ]
+if (nrow(scored) == 0) {
+  cat("\nNo successful runs with a TNT reference; summary skipped.\n")
+} else {
+  cat("\n--- Median gap by outerCycles × dataset ---\n")
+  agg <- aggregate(gap ~ outer_cycles + dataset, data = scored, FUN = median,
+                   na.rm = TRUE)
+  agg_wide <- reshape(agg, direction = "wide", idvar = "outer_cycles",
+                      timevar = "dataset", v.names = "gap")
+  names(agg_wide) <- sub("gap\\.", "", names(agg_wide))
+  print(agg_wide[order(agg_wide$outer_cycles), ], row.names = FALSE)
+
+  cat("\n--- Median gap by outerCycles (pooled) ---\n")
+  agg2 <- aggregate(gap ~ outer_cycles, data = scored, FUN = median,
+                    na.rm = TRUE)
+  print(agg2[order(agg2$outer_cycles), ], row.names = FALSE)
+}
+
+cat(sprintf("\nCompleted: %s\n", format(Sys.time(), "%Y-%m-%d %H:%M:%S %Z")))
diff --git a/dev/benchmarks/bench_t274_nni_perturb.R b/dev/benchmarks/bench_t274_nni_perturb.R
new file mode 100644
index 000000000..6f90bcfeb
--- /dev/null
+++ b/dev/benchmarks/bench_t274_nni_perturb.R
@@ -0,0 +1,180 @@
+# bench_t274_nni_perturb.R
+#
+# T-274: Benchmark nniPerturbCycles=0 vs 5 at thorough-preset scale.
+#
+# S-PROF round 6 found NNI-perturb = 34.3% of Zhu2013 (75t) thorough-preset
+# search time with only 14% hit rate and ~1-step mean improvement.
+# This benchmark tests whether removing NNI-perturb improves time-adjusted
+# expected best score at 30s and 60s budgets on 65–88 tip datasets.
+#
+# METHODOLOGY: Per-replicate sampling.
+#   - maxReplicates=1 per run, many seeds → per-replicate score distribution
+#   - time_per_rep estimated from wall time
+#   - expected_best(scores, k=floor(budget/median_time)) at 30s/60s
+#
+# Usage:
+#   Rscript dev/benchmarks/bench_t274_nni_perturb.R [lib_path]
+#   Default lib_path = .agent-F
+#
+# Results: dev/benchmarks/results_t274_nni_perturb.csv
+# Run time: ~12-18 min (3 datasets x 2 conditions x 20 seeds)
+
+args <- commandArgs(trailingOnly = TRUE)
+lib_path <- if (length(args) >= 1) args[[1L]] else ".agent-F"
+.libPaths(c(lib_path, .libPaths()))
+library(TreeSearch)
+library(TreeTools)
+
+cat("TreeSearch version:", as.character(packageVersion("TreeSearch")), "\n")
+cat("Date:", format(Sys.time(), "%Y-%m-%d %H:%M"), "\n\n")
+
+# ------------------------------------------------------------
+# Configuration
+# ------------------------------------------------------------
+DATASETS <- c("Zhu2013", "Giles2015", "Dikow2009") # 75, 78, 88 tips
+BUDGETS_S <- c(30, 60)
+N_SEEDS <- 20L
+# Exactly two conditions: the summary reports (first - second) as the delta.
+NNI_CONDITIONS <- c(0L, 5L)
+OUT_FILE <- "dev/benchmarks/results_t274_nni_perturb.csv"
+stopifnot(length(NNI_CONDITIONS) == 2L)
+
+# Seeds — fixed for reproducibility
+set.seed(4718)
+seeds <- sample.int(99999L, N_SEEDS)
+
+# ------------------------------------------------------------
+# expected_best: bootstrap estimate of expected minimum from k draws
+# ------------------------------------------------------------
+# scores: per-replicate scores; k: number of draws per bootstrap sample;
+# n_boot: number of bootstrap samples. Indexing through sample.int() avoids
+# sample()'s special case for a length-one numeric `x` (it would sample from
+# 1:x) while giving the same draws for longer vectors.
+expected_best <- function(scores, k, n_boot = 5000L) {
+  n <- length(scores)
+  mean(replicate(n_boot, min(scores[sample.int(n, k, replace = TRUE)])))
+}
+
+# ------------------------------------------------------------
+# Per-replicate runs
+# ------------------------------------------------------------
+total_runs <- length(DATASETS) * length(NNI_CONDITIONS) * N_SEEDS
+cat(sprintf("Total runs: %d datasets x %d conditions x %d seeds = %d\n\n",
+            length(DATASETS), length(NNI_CONDITIONS), N_SEEDS, total_runs))
+
+rows <- list()
+idx <- 0L
+
+for (ds_name in DATASETS) {
+  dataset <- TreeSearch::inapplicable.phyData[[ds_name]]
+  if (is.null(dataset)) {
+    warning("Dataset not found: ", ds_name)
+    next
+  }
+  n_taxa <- length(dataset)
+  n_char <- sum(attr(dataset, "weight"))
+
+  cat(sprintf("=== %s (%dt, %dc) ===\n", ds_name, n_taxa, n_char))
+
+  for (nni_cycles in NNI_CONDITIONS) {
+    cond_label <- if (nni_cycles == 0L) "nni=0" else sprintf("nni=%d", nni_cycles)
+
+    for (seed in seeds) {
+      idx <- idx + 1L
+      cat(sprintf("[%3d/%d] %-12s | %-6s | seed %5d ... ",
+                  idx, total_runs, ds_name, cond_label, seed))
+      flush.console()
+
+      set.seed(seed)
+      t0 <- proc.time()[[3L]]
+      result <- tryCatch(
+        # Pass nniPerturbCycles via ... so it overrides the thorough preset
+        # for just that parameter, leaving all other thorough params intact.
+        MaximizeParsimony(dataset,
+                          strategy = "thorough",
+                          nniPerturbCycles = as.integer(nni_cycles),
+                          maxReplicates = 1L,
+                          nThreads = 1L,
+                          verbosity = 0L),
+        error = function(e) {
+          cat("ERROR:", conditionMessage(e), "\n")
+          NULL
+        }
+      )
+      wall_s <- proc.time()[[3L]] - t0
+
+      # Failed runs are kept as NA rows so they remain visible in the CSV;
+      # the analysis below filters them out.
+      if (is.null(result)) {
+        rows[[idx]] <- data.frame(
+          dataset = ds_name, n_taxa = n_taxa, nni_cycles = nni_cycles,
+          seed = seed, best_score = NA_real_, wall_s = NA_real_,
+          stringsAsFactors = FALSE
+        )
+        next
+      }
+
+      best_score <- min(attr(result, "score"), na.rm = TRUE)
+      cat(sprintf("score=%.0f wall=%.1fs\n", best_score, wall_s))
+
+      rows[[idx]] <- data.frame(
+        dataset = ds_name,
+        n_taxa = n_taxa,
+        nni_cycles = nni_cycles,
+        seed = seed,
+        best_score = best_score,
+        wall_s = wall_s,
+        stringsAsFactors = FALSE
+      )
+    }
+    cat("\n")
+  }
+}
+
+results_df <- do.call(rbind, rows)
+write.csv(results_df, OUT_FILE, row.names = FALSE)
+cat("\nResults written to:", OUT_FILE, "\n\n")
+
+# ------------------------------------------------------------
+# Analysis: Time-adjusted expected best
+# ------------------------------------------------------------
+
+# Summarise one dataset/condition subset at one budget: median wall time per
+# replicate, the number of replicates k that fit in the budget, and the
+# bootstrap expected best of k draws. Returns NULL with fewer than 5 runs.
+summarise_condition <- function(d, budget) {
+  if (nrow(d) < 5L) return(NULL)
+  med_time <- median(d$wall_s, na.rm = TRUE)
+  k <- max(1L, floor(budget / med_time))
+  list(med_time = med_time, k = k,
+       eb = expected_best(d$best_score, k), n = nrow(d))
+}
+
+# Each dataset x budget x condition is bootstrapped once and cached here, so
+# the detailed listing and the summary table report identical numbers.
+eb_key <- function(ds_name, budget, nni) paste(ds_name, budget, nni, sep = "|")
+eb_cache <- list()
+
+cat("===== Time-adjusted expected best (lower score = better) =====\n\n")
+
+for (ds_name in DATASETS) {
+  sub <- results_df[results_df$dataset == ds_name & !is.na(results_df$best_score), ]
+  cat(sprintf("--- %s ---\n", ds_name))
+
+  for (budget in BUDGETS_S) {
+    cat(sprintf(" Budget = %ds:\n", budget))
+    for (nni in NNI_CONDITIONS) {
+      s <- summarise_condition(sub[sub$nni_cycles == nni, ], budget)
+      if (is.null(s)) {
+        cat(sprintf(" nni=%d: insufficient data\n", nni))
+        next
+      }
+      eb_cache[[eb_key(ds_name, budget, nni)]] <- s
+      cat(sprintf(" nni=%d: median_time=%.1fs, k=%d reps, expected_best=%.1f (n=%d)\n",
+                  nni, s$med_time, s$k, s$eb, s$n))
+    }
+  }
+  cat("\n")
+}
+
+# Summary table: per budget, the expected best of each condition followed by
+# delta = (first condition - second condition), i.e. nni=0 minus nni=5.
+cat("===== Expected-best delta (nni=0 vs nni=5, negative = nni=0 better) =====\n")
+col_labels <- unlist(lapply(BUDGETS_S, function(b) {
+  c(sprintf("%ds_nni%d", b, NNI_CONDITIONS), sprintf("%ds_delta", b))
+}))
+cat(sprintf("%-14s %s\n", "Dataset",
+            paste(sprintf("%9s", col_labels), collapse = " ")))
+cat(strrep("-", 14 + 10 * length(col_labels)), "\n")
+
+for (ds_name in DATASETS) {
+  cells <- unlist(lapply(BUDGETS_S, function(b) {
+    # A cache miss (insufficient data) yields NULL and prints as "N/A".
+    s <- lapply(NNI_CONDITIONS, function(nni) eb_cache[[eb_key(ds_name, b, nni)]])
+    eb_txt <- vapply(
+      s,
+      function(x) if (is.null(x)) "N/A" else sprintf("%.1f", x$eb),
+      character(1)
+    )
+    delta_txt <- if (is.null(s[[1L]]) || is.null(s[[2L]])) {
+      "N/A"
+    } else {
+      sprintf("%+.1f", s[[1L]]$eb - s[[2L]]$eb)
+    }
+    c(eb_txt, delta_txt)
+  }))
+  cat(sprintf("%-14s %s\n", ds_name,
+              paste(sprintf("%9s", cells), collapse = " ")))
+}
+
+cat("\n")
+cat("Interpretation: delta = expected_best(nni=0) - expected_best(nni=5).\n")
+cat("Negative delta = nni=0 is better (removes overhead).\n")
+cat("Positive delta = nni=5 is better (perturbation value exceeds overhead).\n")
diff --git a/dev/benchmarks/bench_trajectory.R b/dev/benchmarks/bench_trajectory.R
new file mode 100644
index 000000000..bb7fef113
--- /dev/null
+++ b/dev/benchmarks/bench_trajectory.R
@@ -0,0 +1,416 @@
+#!/usr/bin/env Rscript
+# T-251: TNT vs TreeSearch trajectory comparison
+#
+# Captures per-replicate search trajectories from
both engines on the
+# datasets where TNT has the largest score advantage. Focuses on:
+#   - Score vs wall-clock time
+#   - Rearrangements per improvement (TNT) vs phase cost per improvement (TS)
+#   - Escape effectiveness (delta from ratchet/drift/sectorial)
+#
+# Usage:
+#   source("dev/benchmarks/bench_trajectory.R")
+#   results <- trajectory_compare()        # all 3 gap datasets, 30s
+#   results <- trajectory_compare_quick()  # Wortley2006 only, 10s
+
+library(TreeSearch)
+library(TreeTools)
+library(dplyr)
+
+TNT_EXE <- "C:/Programs/Phylogeny/tnt/TNT-bin/tnt.exe"
+STAGING_DIR <- ".tnt-bench"
+dir.create(STAGING_DIR, showWarnings = FALSE, recursive = TRUE)
+
+# Datasets with largest persistent gaps (from T-249):
+# Geisler2001 +21, Zhu2013 +8, Wortley2006 +7, Conrad2008 +5, Zanol2014 +4
+GAP_DATASETS <- c("Geisler2001", "Zhu2013", "Wortley2006")
+
+# ---- Data preparation ----
+
+# Prepare one dataset for both engines.
+#
+# `name` is a dataset name in inapplicable.phyData. Inapplicable tokens are
+# recoded as missing, the matrix is exported to STAGING_DIR/<name>.tnt for
+# TNT, and the pieces used by the TreeSearch C++ bridge are returned as a
+# list (contrast, tip_data, weight, levels, plus size metadata).
+prepare_dataset <- function(name) {
+  ds <- inapplicable.phyData[[name]]
+  # Convert inapplicable to missing to match TNT's default Fitch scoring
+  mat <- PhyDatToMatrix(ds)
+  mat[mat == "-"] <- "?"
+  ds_clean <- MatrixToPhyDat(mat)
+
+  # Export for TNT
+  tnt_path <- file.path(STAGING_DIR, paste0(name, ".tnt"))
+  WriteTntCharacters(ds_clean, filepath = tnt_path)
+
+  # Prepare for TreeSearch C++ bridge
+  at <- attributes(ds_clean)
+  list(
+    name = name,
+    phyDat = ds_clean,
+    contrast = at$contrast,
+    tip_data = matrix(unlist(ds_clean, use.names = FALSE),
+                      nrow = length(ds_clean), byrow = TRUE),
+    weight = at$weight,
+    levels = at$levels,
+    n_taxa = length(ds_clean),
+    n_chars = sum(at$weight),
+    # Relative to STAGING_DIR, which is TNT's working directory when run.
+    tnt_file = paste0(name, ".tnt")
+  )
+}
+
+# ---- TNT trajectory capture ----
+
+# Run TNT's xmult search on `data_file` (a file name inside STAGING_DIR) and
+# return the parsed trajectory (see parse_tnt_trajectory()).
+#
+#   timeout_s  TNT search timeout in seconds; the process itself is given a
+#              further 60 s before system2() times it out
+#   seed       TNT random seed
+#   hits, reps xmult `hits` and `replic` settings
+run_tnt_trajectory <- function(data_file, timeout_s = 30, seed = 1,
+                               hits = 10L, reps = 100L) {
+  # Warnings from system2() are muffled below, so a missing executable would
+  # otherwise show up only as an empty trajectory. Fail loudly instead.
+  if (!file.exists(TNT_EXE) && !nzchar(Sys.which(TNT_EXE))) {
+    stop("TNT executable not found: ", TNT_EXE)
+  }
+
+  commands <- c(
+    "mxram 1024;",
+    sprintf("proc %s;", data_file),
+    "hold 10000;",
+    sprintf("rseed %d;", seed),
+    sprintf("timeout %d:%02d:%02d;",
+            timeout_s %/% 3600, (timeout_s %% 3600) %/% 60, timeout_s %% 60),
+    sprintf("xmult=hits %d replic %d;", hits, reps),
+    "best;",
+    "quit;"
+  )
+
+  script_path <- file.path(STAGING_DIR, "tntbench.run")
+  writeLines(commands, script_path)
+
+  # TNT resolves the script and data file relative to its working directory.
+  old_wd <- setwd(STAGING_DIR)
+  on.exit(setwd(old_wd), add = TRUE)
+
+  t0 <- proc.time()
+  output <- withCallingHandlers(
+    system2(TNT_EXE, args = "tntbench.run;",
+            stdout = TRUE, stderr = TRUE, timeout = timeout_s + 60),
+    warning = function(w) invokeRestart("muffleWarning")
+  )
+  wall_s <- as.double((proc.time() - t0)[3])
+
+  # Drop bytes that are not valid in the native encoding before parsing.
+  output <- iconv(output, from = "", to = "UTF-8", sub = "")
+  parse_tnt_trajectory(output, wall_s)
+}
+
+# Parse captured TNT console output.
+#
+# Returns a list with:
+#   trajectory            data.frame of per-replicate progress rows, or NULL
+#                         if none were found
+#   total_rearrangements  from "Total rearrangements examined:", else NA
+#   best_score            from "Best score:", else NA
+#   wall_s                the wall time passed in
+#   raw_output            the unparsed output lines
+parse_tnt_trajectory <- function(output, wall_s) {
+  # TNT uses \r for progress bars — split on \r to get individual lines
+  raw_text <- paste(output, collapse = "\n")
+  all_lines <- unlist(strsplit(raw_text, "[\r\n]+"))
+  all_lines <- trimws(all_lines)
+
+  # Parse per-replicate lines:
+  # "1 SECT 6 1301 1301 0:00:01 22,678,443"
+  # "5 FUSE 20 ------ ------ 0:00:04 100,410,686"
+  # Score and Best Score fields can be "------"
+  rep_pattern <- "(\\d+)\\s+(SECT|FUSE|RATCH|DRIFT|CSS|RAT|RAS|SPR|TBR|FUS)\\s+(\\d+)\\s+(-{2,}|\\d+)\\s+(-{2,}|\\d+)\\s+(\\d+:\\d+:\\d+)\\s+([0-9,]+)"
+
+  reps <- list()
+  for (line in all_lines) {
+    # A single \r-separated chunk may contain several progress records.
+    m <- regmatches(line, gregexpr(rep_pattern, line, perl = TRUE))[[1]]
+    for (match in m) {
+      # parts[1] is the whole match; parts[2:8] are the seven capture groups.
+      parts <- regmatches(match, regexec(rep_pattern, match, perl = TRUE))[[1]]
+      if (length(parts) >= 8) {
+        time_parts <- as.integer(strsplit(parts[7], ":")[[1]])
+        secs <- time_parts[1] * 3600 + time_parts[2] * 60 + time_parts[3]
+        reps[[length(reps) + 1]] <- data.frame(
+          replicate = as.integer(parts[2]),
+          algorithm = parts[3],
+          trees = as.integer(parts[4]),
+          score = if (grepl("-", parts[5])) NA_integer_ else as.integer(parts[5]),
+          best_score = if (grepl("-", parts[6])) NA_integer_ else as.integer(parts[6]),
+          time_s = secs,
+          rearrangements = as.numeric(gsub(",", "", parts[8])),
+          stringsAsFactors = FALSE
+        )
+      }
+    }
+  }
+
+  # Parse totals (use raw_text which includes all \r-separated content)
+  total_rearr <- NA_real_
+  m <- regmatches(raw_text, regexpr("Total rearrangements examined:\\s+([0-9,]+)", raw_text))
+  if (length(m) == 1) {
+    total_rearr <- as.numeric(gsub("[^0-9]", "", sub("Total rearrangements examined:\\s+", "", m)))
+  }
+
+  best_score <- NA_real_
+  m <- regmatches(raw_text, regexpr("Best score:\\s+([0-9.]+)", raw_text))
+  if (length(m) == 1) best_score <- as.numeric(sub("Best score:\\s+", "", m))
+
+  list(
+    trajectory = if (length(reps) > 0) do.call(rbind, reps) else NULL,
+    total_rearrangements = total_rearr,
+    best_score = best_score,
+    wall_s = wall_s,
+    raw_output = output
+  )
+}
+
+# ---- TreeSearch trajectory capture ----
+
+run_ts_trajectory <- function(ds, timeout_s = 30, seed = 1,
+                              hits = 10L, reps = 100L) {
+  # Capture verbosity=2 output by redirecting Rprintf
+  set.seed(seed)
+
+  # Use a text connection to capture the C++ Rprintf output
+  log_file <- tempfile(fileext = ".txt")
+  t0 <- proc.time()
+
+  # Capture C++ Rprintf output via output diversion
+ log_con <- file(log_file, open = "wt") + sink(log_con, type = "output") + + result <- tryCatch( + TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = as.integer(reps), + targetHits = as.integer(hits), + maxSeconds = as.double(timeout_s), + verbosity = 2L, + nThreads = 1L, + # Match current default strategy + ratchetCycles = 12L, + ratchetPerturbProb = 0.25, + driftCycles = 2L, + nniFirst = TRUE, + outerCycles = 1L, + maxOuterResets = 2L, + adaptiveLevel = TRUE + ), + finally = { + sink(type = "output") + close(log_con) + } + ) + wall_s <- as.double((proc.time() - t0)[3]) + + log_lines <- readLines(log_file, warn = FALSE) + unlink(log_file) + + parse_ts_trajectory(log_lines, result, wall_s) +} + +parse_ts_trajectory <- function(log_lines, result, wall_s) { + # Parse per-replicate, per-phase data from verbosity=2 output + # Format: " Phase score: NNNN [NNN ms total]" + # Replicate headers: "Replicate N/M" or "Replicate N/M (best: N, pool: N, hits: N)" + + phases <- list() + current_rep <- 0L + cumulative_ms <- 0 + + for (line in log_lines) { + # Replicate header + rep_match <- regmatches(line, regexec("Replicate (\\d+)/(\\d+)", line))[[1]] + if (length(rep_match) >= 2) { + current_rep <- as.integer(rep_match[2]) + next + } + + # Phase line: " Phase score: NNNN [NNN ms]" or " Phase score: NNNN [NNN ms total]" + phase_match <- regmatches( + line, + regexec("^\\s+(\\S+)\\s+.*score:\\s+(\\d+)\\s+\\[(\\d+\\.?\\d*)\\s+ms", line) + )[[1]] + if (length(phase_match) >= 4) { + phase_name <- sub("_.*", "", phase_match[2]) + score <- as.integer(phase_match[3]) + ms <- as.numeric(phase_match[4]) + + phases[[length(phases) + 1]] <- data.frame( + replicate = current_rep, + phase = phase_name, + score = score, + phase_ms = ms, + stringsAsFactors = FALSE + ) + next + } + + # Wagner line: " wag_rand+NNI tree score: NNNN [NNN ms]" + wag_match <- regmatches( + line, + regexec("^\\s+wag.*score:\\s+(\\d+)\\s+\\[(\\d+\\.?\\d*)\\s+ms", line) 
+ )[[1]] + if (length(wag_match) >= 3) { + phases[[length(phases) + 1]] <- data.frame( + replicate = current_rep, + phase = "Wagner", + score = as.integer(wag_match[2]), + phase_ms = as.numeric(wag_match[3]), + stringsAsFactors = FALSE + ) + next + } + + # Outer cycle reset line + reset_match <- regmatches( + line, + regexec("Outer cycle improved.*\\((\\d+) -> (\\d+)\\)", line) + )[[1]] + if (length(reset_match) >= 3) { + phases[[length(phases) + 1]] <- data.frame( + replicate = current_rep, + phase = "Reset", + score = as.integer(reset_match[3]), + phase_ms = 0, + stringsAsFactors = FALSE + ) + } + } + + trajectory <- if (length(phases) > 0) do.call(rbind, phases) else NULL + + list( + trajectory = trajectory, + best_score = result$best_score, + replicates = result$replicates, + hits = result$hits_to_best, + wall_s = wall_s, + timings = result$timings, + log_lines = log_lines + ) +} + +# ---- Main comparison ---- + +trajectory_compare <- function(datasets = GAP_DATASETS, + timeout_s = 30, seeds = 1:3) { + results <- list() + + for (nm in datasets) { + cat(sprintf("\n=== %s ===\n", nm)) + ds <- prepare_dataset(nm) + cat(sprintf(" %d taxa, %d chars\n", ds$n_taxa, ds$n_chars)) + + for (seed in seeds) { + cat(sprintf(" Seed %d: ", seed)) + key <- paste0(nm, "_s", seed) + + # TNT + cat("TNT... ") + tnt <- run_tnt_trajectory(ds$tnt_file, timeout_s = timeout_s, + seed = seed, hits = 10L, reps = 100L) + cat(sprintf("%.0f (%.1fs, %.0fM rearr) | ", tnt$best_score, + tnt$wall_s, tnt$total_rearrangements / 1e6)) + + # TreeSearch + cat("TS... 
") + ts <- run_ts_trajectory(ds, timeout_s = timeout_s, + seed = seed, hits = 10L, reps = 100L) + cat(sprintf("%.0f (%.1fs, %d reps)\n", ts$best_score, + ts$wall_s, ts$replicates)) + + results[[key]] <- list( + dataset = nm, seed = seed, n_taxa = ds$n_taxa, n_chars = ds$n_chars, + tnt = tnt, ts = ts + ) + } + } + + results +} + +trajectory_compare_quick <- function() { + trajectory_compare(datasets = "Wortley2006", timeout_s = 10, seeds = 1:2) +} + +# ---- Analysis helpers ---- + +summarize_trajectories <- function(results) { + rows <- list() + for (key in names(results)) { + r <- results[[key]] + tnt <- r$tnt + ts <- r$ts + + # TNT trajectory summary + tnt_traj <- tnt$trajectory + tnt_n_reps <- if (!is.null(tnt_traj)) max(tnt_traj$replicate) else NA + tnt_rearr_per_s <- if (!is.na(tnt$total_rearrangements) && tnt$wall_s > 0) { + round(tnt$total_rearrangements / tnt$wall_s / 1e6, 1) + } else NA + + # TreeSearch trajectory summary + ts_traj <- ts$trajectory + ts_n_phases <- if (!is.null(ts_traj)) nrow(ts_traj) else NA + + # Phase cost breakdown (ms) + tm <- unlist(ts$timings) + total_ms <- sum(tm) + ratchet_pct <- round(100 * tm["ratchet_ms"] / total_ms, 1) + tbr_pct <- round(100 * tm["tbr_ms"] / total_ms, 1) + drift_pct <- round(100 * tm["drift_ms"] / total_ms, 1) + xss_pct <- round(100 * tm["xss_ms"] / total_ms, 1) + css_pct <- round(100 * tm["css_ms"] / total_ms, 1) + + rows[[key]] <- data.frame( + dataset = r$dataset, seed = r$seed, + n_taxa = r$n_taxa, n_chars = r$n_chars, + tnt_score = tnt$best_score, + tnt_wall_s = round(tnt$wall_s, 2), + tnt_reps = tnt_n_reps, + tnt_rearr_M = round(tnt$total_rearrangements / 1e6, 1), + tnt_rearr_per_s_M = tnt_rearr_per_s, + ts_score = ts$best_score, + ts_wall_s = round(ts$wall_s, 2), + ts_reps = ts$replicates, + gap = ts$best_score - tnt$best_score, + ratchet_pct = ratchet_pct, tbr_pct = tbr_pct, + drift_pct = drift_pct, xss_pct = xss_pct, css_pct = css_pct, + stringsAsFactors = FALSE + ) + } + do.call(rbind, rows) +} + +# 
Extract per-replicate best score trajectory from TreeSearch log +ts_replicate_trajectory <- function(ts_result) { + traj <- ts_result$trajectory + if (is.null(traj)) return(NULL) + + # Get final score per replicate (last phase entry per replicate) + library(dplyr) + traj |> + group_by(replicate) |> + summarise( + rep_score = last(score), + total_phase_ms = sum(phase_ms), + n_phases = n(), + n_resets = sum(phase == "Reset"), + .groups = "drop" + ) |> + mutate( + best_so_far = cummin(rep_score), + improved = rep_score < lag(best_so_far, default = Inf) + ) +} + +# Compare escape effectiveness: how often does each perturbation phase +# actually improve the score? +ts_phase_effectiveness <- function(ts_result) { + traj <- ts_result$trajectory + if (is.null(traj)) return(NULL) + + # For each replicate, track score before and after each phase + traj |> + group_by(replicate) |> + mutate( + prev_score = lag(score, default = first(score)), + delta = prev_score - score, # positive = improvement + improved = delta > 0 + ) |> + ungroup() |> + filter(phase != "Wagner", phase != "Reset") |> + group_by(phase) |> + summarise( + n = n(), + n_improved = sum(improved), + hit_rate = round(mean(improved), 3), + mean_delta = round(mean(delta[improved]), 1), + total_ms = sum(phase_ms), + ms_per_improvement = if (sum(improved) > 0) { + round(sum(phase_ms) / sum(improved)) + } else NA_real_, + .groups = "drop" + ) |> + arrange(desc(hit_rate)) +} diff --git a/dev/benchmarks/bench_warmstart.R b/dev/benchmarks/bench_warmstart.R new file mode 100644 index 000000000..2cb3ccda4 --- /dev/null +++ b/dev/benchmarks/bench_warmstart.R @@ -0,0 +1,220 @@ +# Warm-start benchmark: measure ratchet/drift escape effectiveness +# +# Seeds search with a pre-computed TBR-optimal tree to isolate +# perturbation quality from initial descent quality. 
+# +# Usage: +# source("dev/benchmarks/bench_framework.R") +# source("dev/benchmarks/bench_warmstart.R") +# ws <- warmstart_benchmark("Agnarsson2004", replicates = 20) +# warmstart_summary(ws) + +library(TreeSearch) +library(TreeTools) + +source("dev/benchmarks/bench_datasets.R") + +#' Compute a TBR-optimal tree via a short sprint search. +#' +#' Runs a fast search (sprint strategy, 1 replicate) to produce a local +#' optimum. This tree serves as the warm-start seed for escape benchmarks. +#' +#' @param ds Prepared dataset (from prepare_ts_data). +#' @param seed RNG seed for the sprint search. +#' @return Named list with `edge` (edge matrix), `score` (optimum score), +#' and `tree` (phylo object) for inspection. +compute_warmstart_tree <- function(ds, seed = 7381L) { + set.seed(seed) + result <- TreeSearch:::ts_driven_search( + ds$contrast, ds$tip_data, ds$weight, ds$levels, + maxReplicates = 1L, + targetHits = 1L, + ratchetCycles = 0L, + driftCycles = 0L, + xssRounds = 0L, + rssRounds = 0L, + cssRounds = 0L, + nniPerturbCycles = 0L, + maxSeconds = 0, + verbosity = 0L + ) + if (result$pool_size == 0) stop("Sprint search produced no trees") + + edge_mat <- result$trees[[1]] + list( + edge = edge_mat, + score = result$best_score + ) +} + +#' Run a single warm-started replicate. +#' +#' Passes the pre-computed tree via `startEdge`, runs 1 replicate with +#' the given strategy. Since the starting tree is already TBR-optimal, +#' the initial TBR phase converges immediately; only ratchet/drift/XSS +#' perturbations can improve the score. +#' +#' @param ds Prepared dataset. +#' @param start_edge Edge matrix from compute_warmstart_tree(). +#' @param strategy Named list of strategy params (from get_strategy). +#' @param seed RNG seed for this replicate. +#' @param maxSeconds Timeout. +#' @return Named list with metrics. 
warmstart_run <- function(ds, start_edge, strategy,
                          seed = 42L, maxSeconds = 30,
                          baseline_score = Inf) {
  # baseline_score: score of the warm-start tree. Only progress reports with
  #   a strictly lower score count as an improvement. The default (Inf) keeps
  #   the historical behaviour, in which the first finite score reported is
  #   recorded as an "improvement" even when it merely equals the warm-start
  #   score; pass the warm-start score to measure genuine escapes.

  # Mutable state shared with the progress callback
  cb_env <- new.env(parent = emptyenv())
  cb_env$best <- baseline_score
  cb_env$time_to_improvement <- NA_real_
  cb_env$trace <- list()

  progress_cb <- function(info) {
    # Overwritten on every new best, so the stored value is the elapsed time
    # of the most recent improvement reported.
    if (is.finite(info$best_score) && info$best_score < cb_env$best) {
      cb_env$best <- info$best_score
      cb_env$time_to_improvement <- info$elapsed
    }
    cb_env$trace[[length(cb_env$trace) + 1L]] <- list(
      replicate = info$replicate,
      elapsed = info$elapsed,
      best_score = info$best_score,
      phase = info$phase
    )
  }

  # Fixed arguments first, then the strategy's own parameters.
  # NOTE(review): a strategy entry that repeats one of the names below would
  # reach do.call() twice — confirm get_strategy() never returns them.
  args <- c(
    list(
      contrast = ds$contrast,
      tip_data = ds$tip_data,
      weight = ds$weight,
      levels = ds$levels,
      maxReplicates = 1L,
      targetHits = 1L,
      maxSeconds = as.double(maxSeconds),
      verbosity = 1L,
      startEdge = start_edge,
      progressCallback = progress_cb
    ),
    strategy
  )

  set.seed(seed)
  t0 <- proc.time()
  result <- do.call(TreeSearch:::ts_driven_search, args)
  wall_s <- as.double((proc.time() - t0)[3])

  list(
    best_score = result$best_score,
    wall_s = wall_s,
    time_to_improvement_s = cb_env$time_to_improvement,
    timed_out = result$timed_out,
    timings = result$timings,
    trace = cb_env$trace
  )
}

#' Run warm-start escape benchmark for one dataset.
#'
#' First computes a TBR-local-optimum via sprint, then runs multiple
#' warm-started replicates with varying seeds and strategies.
#'
#' @param ds_name Dataset name (from BENCHMARK_NAMES or LARGE_BENCHMARK_NAMES).
#' @param strategy_names Strategies to test.
#' @param replicates Independent warm-started runs per strategy.
#' @param maxSeconds Timeout per run.
#' @param warmstart_seed Seed for the initial sprint search.
#' @param base_seed Base seed for warm-started replicates.
#' @return Data frame with one row per strategy x replicate.
warmstart_benchmark <- function(
  ds_name,
  strategy_names = c("default", "thorough"),
  replicates = 10L,
  maxSeconds = 30,
  warmstart_seed = 7381L,
  base_seed = 42L
) {
  all_ds <- load_all_benchmark_datasets()
  ds <- all_ds[[ds_name]]
  if (is.null(ds)) stop("Dataset '", ds_name, "' not found")

  cat(sprintf("Computing warm-start tree for %s (%d tips)...\n",
              ds_name, ds$n_taxa))
  ws <- compute_warmstart_tree(ds, seed = warmstart_seed)
  cat(sprintf("Warm-start score: %.5g\n\n", ws$score))

  # Single definition of the output schema, shared by successful and failed
  # runs so every row has identical columns for rbind().
  result_row <- function(strat_name, rep_idx, seed,
                         best_score = NA_real_, improvement = NA_real_,
                         wall_s = NA_real_, time_to_improvement_s = NA_real_,
                         timed_out = NA) {
    data.frame(
      dataset = ds_name, n_taxa = ds$n_taxa,
      strategy = strat_name, replicate = rep_idx, seed = seed,
      warmstart_score = ws$score,
      best_score = best_score,
      improvement = improvement,
      wall_s = wall_s,
      time_to_improvement_s = time_to_improvement_s,
      timed_out = timed_out,
      stringsAsFactors = FALSE
    )
  }

  rows <- list()
  for (strat_name in strategy_names) {
    strat <- get_strategy(strat_name)
    for (rep_idx in seq_len(replicates)) {
      seed <- base_seed + rep_idx - 1L
      cat(sprintf("[%s rep %d/%d] ...", strat_name, rep_idx, replicates))

      # A failed run is reported and recorded as an all-NA row rather than
      # aborting the remaining replicates.
      res <- tryCatch(
        warmstart_run(ds, ws$edge, strat, seed = seed,
                      maxSeconds = maxSeconds),
        error = function(e) {
          cat(sprintf(" ERROR: %s\n", conditionMessage(e)))
          NULL
        }
      )

      if (is.null(res)) {
        rows <- c(rows, list(result_row(strat_name, rep_idx, seed)))
        next
      }

      improvement <- ws$score - res$best_score
      cat(sprintf(" score=%.5g improvement=%.5g time=%.1fs\n",
                  res$best_score, improvement, res$wall_s))

      # warmstart_run() stamps a time as soon as any finite score is
      # reported, so a run that never beat the warm-start tree can still
      # carry one. Report NA for such runs so that summaries taken with
      # na.rm = TRUE describe escaping runs only.
      escaped <- isTRUE(improvement > 0)
      rows <- c(rows, list(result_row(
        strat_name, rep_idx, seed,
        best_score = res$best_score,
        improvement = improvement,
        wall_s = res$wall_s,
        time_to_improvement_s =
          if (escaped) res$time_to_improvement_s else NA_real_,
        timed_out = res$timed_out
      )))
    }
  }

  do.call(rbind, rows)
}

#' Summarize warm-start benchmark results.
#'
#' @param results Data frame from warmstart_benchmark.
+#' @return Summary per strategy: median improvement, escape rate, timing. +warmstart_summary <- function(results) { + strats <- unique(results$strategy) + summaries <- list() + for (st in strats) { + sub <- results[results$strategy == st & !is.na(results$best_score), ] + if (nrow(sub) == 0) next + escaped <- sub$improvement > 0 + summaries <- c(summaries, list(data.frame( + strategy = st, + n_runs = nrow(sub), + warmstart_score = sub$warmstart_score[1], + best_found = min(sub$best_score), + median_score = median(sub$best_score), + median_improvement = median(sub$improvement), + escape_rate = round(100 * mean(escaped), 1), + median_wall_s = round(median(sub$wall_s), 2), + median_tti_s = round(median(sub$time_to_improvement_s, na.rm = TRUE), 2), + stringsAsFactors = FALSE + ))) + } + do.call(rbind, summaries) +} diff --git a/dev/benchmarks/benchmark_mp2.R b/dev/benchmarks/benchmark_mp2.R new file mode 100644 index 000000000..7cd7c0ef0 --- /dev/null +++ b/dev/benchmarks/benchmark_mp2.R @@ -0,0 +1,83 @@ +# Benchmark: MaximizeParsimony2 (C++ driven search) vs MaximizeParsimony (R loop) +# +# Compares wall-clock time and best score found on a selection of datasets +# from inapplicable.phyData, using equal-weight Fitch parsimony throughout. + +library(TreeSearch) +library(TreeTools) + +data("inapplicable.phyData") + +#' Convert inapplicable tokens to fully ambiguous for pure Fitch EW scoring +#' @param ds A phyDat object +#' @return The modified phyDat with "-" treated as "?" 
+strip_inapp <- function(ds) { + cont <- attr(ds, "contrast") + lvls <- attr(ds, "levels") + dash_col <- which(lvls == "-") + if (length(dash_col) == 0L) return(ds) + # Tokens that code for "-": make them fully ambiguous over applicable states + has_dash <- cont[, dash_col] == 1 + app_cols <- setdiff(seq_len(ncol(cont)), dash_col) + cont[has_dash, app_cols] <- 1 + # Drop the "-" state column + cont <- cont[, -dash_col, drop = FALSE] + attr(ds, "contrast") <- cont + attr(ds, "levels") <- lvls[-dash_col] + ds +} + +bench_datasets <- c( + "Vinther2008", # 23 tips, 50 chars + "Asher2005", # 23 tips, 125 chars + "Wortley2006", # 37 tips, 105 chars + "Wills2012", # 55 tips, 87 chars + "Agnarsson2004", # 62 tips, 225 chars + "Dikow2009" # 88 tips, 204 chars +) + +results <- data.frame( + dataset = character(), tips = integer(), patterns = integer(), + mp2_score = numeric(), mp1_score = numeric(), score_diff = numeric(), + mp2_time = numeric(), mp1_time = numeric(), speedup = numeric(), + stringsAsFactors = FALSE +) + +for (nm in bench_datasets) { + ds <- strip_inapp(inapplicable.phyData[[nm]]) + n_tip <- length(ds) + n_pat <- attr(ds, "nr") + cat("\n---", nm, "(", n_tip, "tips,", n_pat, "pat) ---\n") + + # --- MaximizeParsimony2 (C++ driven search) --- + set.seed(6218) + t2 <- system.time({ + r2 <- MaximizeParsimony2(ds, verbosity = 0L) + }) + s2 <- TreeLength(r2[[1]], ds) + + # --- MaximizeParsimony (R loop) --- + set.seed(6218) + t1 <- system.time({ + r1 <- MaximizeParsimony(ds, ratchIter = 7L, tbrIter = 2L, + maxHits = n_tip * 1.8, maxTime = 5, + verbosity = 0L) + }) + s1 <- TreeLength(r1[[1]], ds) + + cat(" MP2:", s2, sprintf("(%.2fs, %d reps)", t2["elapsed"], + attr(r2, "replicates")), + " MP1:", s1, sprintf("(%.2fs)", t1["elapsed"]), + " diff:", s2 - s1, "\n") + + results <- rbind(results, data.frame( + dataset = nm, tips = n_tip, patterns = n_pat, + mp2_score = s2, mp1_score = s1, score_diff = s2 - s1, + mp2_time = t2["elapsed"], mp1_time = t1["elapsed"], + speedup 
= t1["elapsed"] / t2["elapsed"], + stringsAsFactors = FALSE + )) +} + +cat("\n\n=== SUMMARY ===\n") +print(results, row.names = FALSE) diff --git a/dev/benchmarks/build_mbank_catalogue.R b/dev/benchmarks/build_mbank_catalogue.R new file mode 100644 index 000000000..84cf450e5 --- /dev/null +++ b/dev/benchmarks/build_mbank_catalogue.R @@ -0,0 +1,301 @@ +#!/usr/bin/env Rscript +# Build a catalogue of MorphoBank matrices from the neotrans corpus. +# +# Scans neotrans/inst/matrices/*.nex, attempts to parse each as phyDat, +# and records metadata (ntax, nchar, patterns, missing%, inapplicable%). +# +# Output: dev/benchmarks/mbank_catalogue.csv +# +# Run from the TreeSearch source root: +# Rscript dev/benchmarks/build_mbank_catalogue.R +# +# Or from dev/benchmarks/: +# Rscript build_mbank_catalogue.R + +library(TreeTools) + +# --- Path resolution --- +find_neotrans_dir <- function() { + candidates <- c( + file.path(getwd(), "..", "neotrans", "inst", "matrices"), + file.path(getwd(), "..", "..", "neotrans", "inst", "matrices"), + file.path(dirname(getwd()), "neotrans", "inst", "matrices") + ) + for (d in candidates) { + d <- normalizePath(d, mustWork = FALSE) + if (dir.exists(d)) return(d) + } + stop("Cannot find neotrans/inst/matrices/. 
",
       "Run from TreeSearch source root or dev/benchmarks/.")
}

# Locate the directory the catalogue CSV is written to.
#
# Looks for the directory that holds bench_datasets.R, trying in order:
# <cwd>/dev/benchmarks (run from the source root), <cwd>/inst/benchmarks
# (legacy layout) and <cwd> itself (run from inside the benchmarks directory).
# If no marker file is found, falls back to an existing <cwd>/dev/benchmarks
# or <cwd>/inst/benchmarks directory. Stops when nothing matches.
#
# The original probed only inst/benchmarks, although the script header, this
# function's comment and its error message all refer to dev/benchmarks, so
# the documented invocation from the source root could not resolve.
find_output_dir <- function() {
  repo_dirs <- c(
    file.path(getwd(), "dev", "benchmarks"),
    file.path(getwd(), "inst", "benchmarks")
  )
  for (d in c(repo_dirs, getwd())) {
    if (file.exists(file.path(d, "bench_datasets.R"))) return(d)
  }
  # Fall back to a benchmarks directory that exists but lacks the marker file
  for (d in repo_dirs) {
    if (dir.exists(d)) return(d)
  }
  stop("Cannot find dev/benchmarks/ directory.")
}

neotrans_dir <- find_neotrans_dir()
output_dir <- find_output_dir()

cat("Neotrans matrices dir:", neotrans_dir, "\n")
cat("Output dir:", output_dir, "\n")

# --- Find all .nex files ---
nex_files <- list.files(neotrans_dir, pattern = "\\.nex$",
                        full.names = TRUE, recursive = FALSE)
cat("Found", length(nex_files), ".nex files\n")

# --- Parse each file and collect metadata ---

# Summarise the size and completeness of a phyDat object.
#
# dataset: phyDat object; its `contrast`, `levels` and `weight` attributes
#          are read.
# Returns a list: ntax, nchar (sum of pattern weights), n_patterns, n_states
# (contrast columns excluding "-"), pct_missing and pct_inapp.
# A token counts as "missing" when it allows every applicable state, and as
# "inapplicable" when it allows "-" but is not missing.
# NOTE(review): both percentages are taken over taxa x unique patterns and are
# not weighted by pattern frequency — confirm that is the intended denominator.
characterize_phyDat <- function(dataset) {
  at <- attributes(dataset)
  contrast <- at$contrast
  lvls <- at$levels
  n_taxa <- length(dataset)
  n_patterns <- length(at$weight)
  n_chars <- sum(at$weight)
  n_states <- ncol(contrast)

  inapp_idx <- which(lvls == "-")
  n_app_states <- n_states - length(inapp_idx)

  # Token indices, one row per taxon and one column per pattern
  td <- matrix(unlist(dataset, use.names = FALSE),
               nrow = n_taxa, byrow = TRUE)
  total_cells <- n_taxa * n_patterns

  n_inapp <- 0L
  n_missing <- 0L
  has_inapp <- length(inapp_idx) > 0
  # Applicable-state columns; the same for every token, so computed once
  cols_check <- setdiff(seq_len(n_states), inapp_idx)
  for (i in seq_len(nrow(contrast))) {
    is_inapp <- has_inapp && contrast[i, inapp_idx] > 0.5
    is_all <- length(cols_check) > 0 && all(contrast[i, cols_check] > 0.5)
    count <- sum(td == i)
    if (is_inapp && !is_all) n_inapp <- n_inapp + count
    if (is_all) n_missing <- n_missing + count
  }

  list(
    ntax = n_taxa,
    nchar = n_chars,
    n_patterns = n_patterns,
    n_states = n_app_states,
    pct_missing = round(100 * n_missing / total_cells, 1),
    pct_inapp = round(100 * n_inapp / total_cells, 1)
  )
}

results <- vector("list", length(nex_files))

for (i in
seq_along(nex_files)) { + f <- nex_files[i] + bname <- basename(f) + + # Extract project ID and matrix index + if (grepl("^project", bname, ignore.case = TRUE)) { + proj_num <- as.integer(sub("^project(\\d+).*", "\\1", bname, + ignore.case = TRUE)) + # Multi-matrix index: "project1037 (2).nex" -> 2 + if (grepl("\\(\\d+\\)", bname)) { + mat_idx <- as.integer(sub(".*\\((\\d+)\\).*", "\\1", bname)) + } else { + mat_idx <- NA_integer_ + } + source_type <- "morphobank" + } else if (grepl("^syab", bname, ignore.case = TRUE)) { + proj_num <- NA_integer_ + mat_idx <- NA_integer_ + source_type <- "syab" + } else { + proj_num <- NA_integer_ + mat_idx <- NA_integer_ + source_type <- "other" + } + + # Unique key for this matrix + key <- sub("\\.nex$", "", bname, ignore.case = TRUE) + key <- gsub(" ", "_", key) + + # Assign split + if (!is.na(proj_num) && proj_num %% 5 == 0) { + split <- "validation" + } else { + split <- "training" + } + + # Try to parse + row <- list( + key = key, + filename = bname, + project_id = proj_num, + matrix_idx = mat_idx, + source_type = source_type, + split = split, + ntax = NA_integer_, + nchar = NA_integer_, + n_patterns = NA_integer_, + n_states = NA_integer_, + pct_missing = NA_real_, + pct_inapp = NA_real_, + parse_ok = FALSE, + error_message = "" + ) + + tryCatch({ + pd <- ReadAsPhyDat(f) + chars <- characterize_phyDat(pd) + row$ntax <- chars$ntax + row$nchar <- chars$nchar + row$n_patterns <- chars$n_patterns + row$n_states <- chars$n_states + row$pct_missing <- chars$pct_missing + row$pct_inapp <- chars$pct_inapp + row$parse_ok <- TRUE + }, error = function(e) { + row$error_message <<- conditionMessage(e) + }, warning = function(w) { + # Warnings during parsing are common (e.g. 
"Duplicate taxon names") + # Try to continue + tryCatch({ + pd <- suppressWarnings(ReadAsPhyDat(f)) + chars <- characterize_phyDat(pd) + row$ntax <<- chars$ntax + row$nchar <<- chars$nchar + row$n_patterns <<- chars$n_patterns + row$n_states <<- chars$n_states + row$pct_missing <<- chars$pct_missing + row$pct_inapp <<- chars$pct_inapp + row$parse_ok <<- TRUE + row$error_message <<- paste("WARNING:", conditionMessage(w)) + }, error = function(e2) { + row$error_message <<- paste("WARNING:", conditionMessage(w), + "; ERROR:", conditionMessage(e2)) + }) + }) + + results[[i]] <- as.data.frame(row, stringsAsFactors = FALSE) + + if (i %% 50 == 0 || i == length(nex_files)) { + cat(sprintf(" [%d/%d] %s\n", i, length(nex_files), bname)) + } +} + +catalogue <- do.call(rbind, results) + +# --- Dedup: flag near-duplicate multi-file matrices --- +# Multi-file projects (e.g. "project1037 (1).nex", "project1037 (2).nex") often +# contain the same character data with minor taxon-sampling differences. We flag +# redundant copies so the benchmark loader can exclude them by default. +# +# Method: for each project with multiple usable files, load all matrices, +# compute pairwise character identity on shared taxa, and greedily keep the +# largest (most taxa) representative from each cluster of >=95% identical pairs. 
+ +usable_mask <- catalogue$parse_ok & !is.na(catalogue$ntax) & catalogue$ntax >= 20 +catalogue$dedup_drop <- FALSE + +usable_multi <- catalogue[usable_mask & !is.na(catalogue$matrix_idx), ] +if (nrow(usable_multi) > 0) { + usable_multi$project <- sub("_\\(\\d+\\)$", "", usable_multi$key) + proj_counts <- table(usable_multi$project) + multi_projects <- names(proj_counts[proj_counts >= 2]) + + cat(sprintf("\nDedup: checking %d multi-file projects (%d matrices)...\n", + length(multi_projects), + sum(usable_multi$project %in% multi_projects))) + + drop_keys <- character(0) + + for (proj in multi_projects) { + rows <- usable_multi[usable_multi$project == proj, ] + keys <- rows$key + mats <- list() + for (j in seq_len(nrow(rows))) { + fpath <- file.path(neotrans_dir, rows$filename[j]) + tryCatch({ + mats[[rows$key[j]]] <- suppressWarnings(ReadAsPhyDat(fpath)) + }, error = function(e) NULL) + } + if (length(mats) < 2) next + + # Build pairwise character-identity matrix + mk <- names(mats) + identity_mat <- matrix(NA_real_, length(mk), length(mk), + dimnames = list(mk, mk)) + for (a in seq_len(length(mk) - 1)) { + for (b in (a + 1):length(mk)) { + taxa_a <- names(mats[[mk[a]]]) + taxa_b <- names(mats[[mk[b]]]) + common <- intersect(taxa_a, taxa_b) + # Require >=80% taxon overlap with the smaller matrix + if (length(common) < 0.8 * min(length(taxa_a), length(taxa_b))) next + mat_a <- as.matrix(mats[[mk[a]]])[common, , drop = FALSE] + mat_b <- as.matrix(mats[[mk[b]]])[common, , drop = FALSE] + if (ncol(mat_a) != ncol(mat_b)) next + identity_mat[mk[a], mk[b]] <- mean(mat_a == mat_b, na.rm = TRUE) + identity_mat[mk[b], mk[a]] <- identity_mat[mk[a], mk[b]] + } + } + + # Greedy dedup: sort by ntax desc, keep first, drop near-dups + sorted_keys <- rows$key[order(-rows$ntax, -rows$nchar)] + kept <- character(0) + for (k in sorted_keys) { + is_dup <- FALSE + for (kk in kept) { + ci <- identity_mat[k, kk] + if (!is.na(ci) && ci >= 0.95) { is_dup <- TRUE; break } + } + if (is_dup) 
drop_keys <- c(drop_keys, k) + else kept <- c(kept, k) + } + } + + catalogue$dedup_drop[catalogue$key %in% drop_keys] <- TRUE + cat(sprintf("Dedup: flagged %d near-duplicate matrices for exclusion.\n", + length(drop_keys))) +} + +# --- Summary --- +cat("\n=== Catalogue Summary ===\n") +cat("Total files scanned:", nrow(catalogue), "\n") +cat("Parse OK:", sum(catalogue$parse_ok), "\n") +cat("Parse failed:", sum(!catalogue$parse_ok), "\n") +cat("\nAfter ntax >= 20 filter:\n") +usable <- catalogue$parse_ok & !is.na(catalogue$ntax) & catalogue$ntax >= 20 +cat(" Usable (before dedup):", sum(usable), "\n") +cat(" Dedup dropped:", sum(usable & catalogue$dedup_drop), "\n") +usable_dedup <- usable & !catalogue$dedup_drop +cat(" Usable (after dedup):", sum(usable_dedup), "\n") +cat(" Training:", sum(usable_dedup & catalogue$split == "training"), "\n") +cat(" Validation:", sum(usable_dedup & catalogue$split == "validation"), "\n") + +cat("\nSize tiers (after dedup):\n") +usable_cat <- catalogue[usable_dedup, ] +usable_cat$tier <- cut(usable_cat$ntax, + breaks = c(0, 30, 60, 120, Inf), + labels = c("Small(20-30)", "Medium(31-60)", + "Large(61-120)", "XLarge(121+)")) +print(table(usable_cat$split, usable_cat$tier)) + +cat("\nParse failures:\n") +if (any(!catalogue$parse_ok)) { + fails <- catalogue[!catalogue$parse_ok, c("key", "error_message")] + for (j in seq_len(nrow(fails))) { + cat(sprintf(" %s: %s\n", fails$key[j], + substr(fails$error_message[j], 1, 80))) + } +} + +# --- Save --- +out_path <- file.path(output_dir, "mbank_catalogue.csv") +write.csv(catalogue, out_path, row.names = FALSE) +cat("\nCatalogue written to:", out_path, "\n") diff --git a/dev/benchmarks/datasets.md b/dev/benchmarks/datasets.md new file mode 100644 index 000000000..f9fb34706 --- /dev/null +++ b/dev/benchmarks/datasets.md @@ -0,0 +1,112 @@ +# Benchmark Dataset Suite + +Selected from the 30 `inapplicable.phyData` datasets bundled with TreeSearch. 
Criteria: cover small → large tip counts, varying inapplicable proportions,
varying state counts, and varying matrix densities (% missing data).

## Dataset Selection

| # | Dataset | Tips | Chars | Patterns | %Inapp | States | %Missing | Category |
|---|---------|------|-------|----------|--------|--------|----------|----------|
| 1 | Longrich2010 | 20 | 93 | 80 | 4.2 | 3 | 45.3 | Small, high missing |
| 2 | Vinther2008 | 23 | 57 | 50 | 6.1 | 4 | 21.0 | Small, moderate |
| 3 | Sansom2010 | 23 | 109 | 97 | 6.1 | 4 | 40.0 | Small, high missing |
| 4 | DeAssis2011 | 33 | 50 | 36 | 21.4 | 3 | 0.2 | Medium-small, high inapp |
| 5 | Aria2015 | 35 | 50 | 50 | 6.7 | 6 | 12.7 | Medium-small, multi-state |
| 6 | Wortley2006 | 37 | 105 | 105 | 2.7 | 8 | 31.4 | Medium, many states |
| 7 | Griswold1999 | 43 | 137 | 118 | 6.2 | 6 | 5.6 | Medium, dense matrix |
| 8 | Schulze2007 | 52 | 58 | 57 | 16.7 | 3 | 2.4 | Medium, high inapp, dense |
| 9 | Eklund2004 | 54 | 131 | 131 | 7.8 | 6 | 29.8 | Medium, moderate |
| 10 | Agnarsson2004 | 62 | 242 | 225 | 6.9 | 7 | 6.1 | Large, many chars, dense |
| 11 | Zanol2014 | 74 | 213 | 210 | 16.8 | 9 | 11.9 | Large, high inapp, many states |
| 12 | Zhu2013 | 75 | 253 | 253 | 12.4 | 4 | 42.6 | Large, high missing |
| 13 | Giles2015 | 78 | 236 | 236 | 11.8 | 4 | 41.5 | Large, high missing+inapp |
| 14 | Dikow2009 | 88 | 220 | 204 | 1.2 | 9 | 0.4 | Largest, dense, many states |

## Selection Rationale

- **Size range**: 20 → 88 tips (4.4× range). Covers small (exhaustive-feasible)
  through large (heuristic-only).
- **Inapplicable variation**: 1.2% (Dikow) → 21.4% (DeAssis). Tests the
  NA three-pass scoring path under varying load.
- **State count variation**: 3–9 applicable states. Affects `total_words`
  (state word count per block) and thus inner-loop iteration count.
- **Missing data variation**: 0.2% (DeAssis) → 45.3% (Longrich).
High missing
  data creates more ambiguous tokens, affecting scoring and simplification.
- **Dense vs sparse**: DeAssis (0.2% missing) and Dikow (0.4% missing) are
  nearly complete matrices; Longrich (45.3%) and Zhu (42.6%) are sparse.

## Best-Known EW Scores

Scores from the C++ driven search engine (5 replicates, 5s timeout per
dataset, `set.seed(42)`). These are the standard Fitch parsimony scores
(not inapplicable-aware). Published tree scores from `inapplicable.trees`
are generally higher because they may not be optimized for standard Fitch.

| Dataset | C++ Best | Published Tree | Notes |
|---------|----------|---------------|-------|
| Longrich2010 | 131 | 167 | |
| Vinther2008 | 79 | 93 | |
| Sansom2010 | 189 | — | |
| DeAssis2011 | 64 | 89 | |
| Aria2015 | 145 | 185 | |
| Wortley2006 | 496 | 518 | |
| Griswold1999 | 409 | 511 | |
| Schulze2007 | 167 | 212 | |
| Eklund2004 | 445 | 496 | |
| Agnarsson2004 | 778 | 1035 | |
| Zanol2014 | 1338 | 1802 | |
| Zhu2013 | 649 | 810 | |
| Giles2015 | 720 | 1005 | |
| Dikow2009 | 1614 | 1646 | |

Note: C++ scores are lower than published because (a) the published trees
were optimized for a different scoring method (inapplicable-aware), and
(b) our driven search may find better trees. These scores were obtained
with `set.seed(42)`, 10s timeout, 10 replicates. (TODO: this disagrees with
the "5 replicates, 5s timeout" stated at the top of this section — confirm
which settings produced the table and make the two statements match.) Either
way these are short runs: use `bench_datasets.R` with longer search times for
authoritative best-known scores.

## Large-Tree Benchmark Datasets

Separate tier for datasets >= 100 tips, loaded from `dev/benchmarks/`.
These have fundamentally different search dynamics: single TBR convergence
takes seconds to minutes, replicates take minutes rather than sub-second.

| # | Dataset | Tips | Chars | Patterns | %Missing | %Inapp | Source |
|---|---------|------|-------|----------|----------|--------|--------|
| L1 | mbank_X30754 | 180 | 425 | 418 | 40% | 20.5% | MorphoBank P30754 |

### mbank_X30754

MorphoBank project X30754 (downloaded 2025-06-16).
180 taxa, 425 characters +with ~40% missing data and ~20% inapplicable entries. This is a realistic +large morphological matrix that exposes scaling issues in the search engine: +NNI warmup is essential, single TBR convergence takes ~13s, and the standard +strategy presets (calibrated for ≤88 tips) are poorly suited. + +Best-known EW score: TBD (to be established after systematic benchmarking). + +## Usage + +```r +source("dev/benchmarks/bench_datasets.R") + +# Load standard benchmark datasets (14 datasets, ≤88 tips) +datasets <- load_benchmark_datasets() + +# Load large-tree benchmark datasets (≥100 tips) +large <- load_large_benchmark_datasets() + +# Load all (standard + large) +all_ds <- load_all_benchmark_datasets() + +# Score a single dataset +score_dataset("Vinther2008", maxSeconds = 10) + +# Run standard benchmark suite +run_benchmark_suite(maxSeconds = 30, replicates = 5) + +# Run large-tree benchmark (from bench_framework.R) +# source("dev/benchmarks/bench_framework.R") +# benchmark_large(maxSeconds = 120) +``` diff --git a/dev/benchmarks/diag_clip_ordering.R b/dev/benchmarks/diag_clip_ordering.R new file mode 100644 index 000000000..1a3722b19 --- /dev/null +++ b/dev/benchmarks/diag_clip_ordering.R @@ -0,0 +1,286 @@ +# diag_clip_ordering.R +# +# Diagnostic script for the size-weighted TBR clip ordering experiment. +# +# Purpose: Characterise baseline (random) TBR clip ordering behaviour to test +# whether the small-clip-first hypothesis holds empirically. +# +# For each dataset and seed, builds a random Wagner starting tree, runs +# ts_tbr_diagnostics() to convergence, and accumulates per-pass records. +# Produces three summary tables: +# +# 1. Accepted clip size breakdown by bucket (tips / small / large). +# Key question: are tip clips over-represented in accepted moves +# relative to their uniform expectation? +# +# 2. Clips tried before acceptance (productive passes). 
+# Key question: is n_clips_tried typically large enough that a +# small-first ordering could meaningfully reduce it? +# +# 3. Evaluation budget split: productive vs null passes. +# Key question: what fraction of TBR work is "wasted" in null passes? +# +# Usage: Rscript dev/benchmarks/diag_clip_ordering.R [lib_path] +# lib_path defaults to ".agent-wc" + +args <- commandArgs(trailingOnly = TRUE) +lib_path <- if (length(args) >= 1) args[1] else ".agent-wc" + +library(TreeSearch, lib.loc = lib_path) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- + +DATASETS <- c("Vinther2008", "Agnarsson2004", "Zhu2013", "Dikow2009") +SEEDS <- c(1847L, 2956L, 3712L, 4519L, 5823L, 6401L, 7238L, 8145L, 9032L, 9871L) + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +prepare <- function(name) { + ds <- TreeSearch::inapplicable.phyData[[name]] + at <- attributes(ds) + list( + name = name, + contrast = at$contrast, + tip_data = matrix(unlist(ds, use.names = FALSE), + nrow = length(ds), byrow = TRUE), + weight = at$weight, + levels = at$levels, + n_taxa = length(ds) + ) +} + +# Bucket label for a clip of subtree size s given n_tip. +# Tip: s == 1 +# Small: 2 <= s <= floor(sqrt(n_tip)) +# Large: s > floor(sqrt(n_tip)) +clip_bucket <- function(s, n_tip) { + sq <- floor(sqrt(n_tip)) + ifelse(s == 1, "tip", + ifelse(s <= sq, "small", "large")) +} + +# Expected fraction of clips in each bucket for a binary rooted tree with +# n_tip leaves. Total clips = 2*(n_tip-1). +# Tip clips (s==1) : exactly n_tip +# Non-tip clips : n_tip - 2 +# Among non-tip, sizes 2..n_tip-1. 
Approximate uniform distribution: +# small (2..floor(sqrt)) : floor(sqrt)-1 sizes +# large (floor(sqrt)+1..n_tip-1): n_tip-1-floor(sqrt) sizes +# This is approximate (not all sizes appear equally often), but adequate +# for comparison against observed acceptance fractions. +expected_bucket_fracs <- function(n_tip) { + n_clips <- 2L * (n_tip - 1L) + sq <- floor(sqrt(n_tip)) + n_tip_c <- n_tip # tip clips + n_nontip <- n_tip - 2L # non-tip clips + n_small_c <- sq - 1L # sizes 2..sq (approximate, may be 0) + n_large_c <- n_nontip - n_small_c + list( + tip = n_tip_c / n_clips, + small = max(0, n_small_c) / n_clips, + large = max(0, n_large_c) / n_clips + ) +} + +# --------------------------------------------------------------------------- +# Data collection +# --------------------------------------------------------------------------- + +cat("Collecting TBR pass diagnostics (", length(SEEDS), "seeds per dataset)...\n\n", + sep = "") + +all_records <- list() + +for (dname in DATASETS) { + d <- prepare(dname) + n_tip <- d$n_taxa + sq <- floor(sqrt(n_tip)) + exp <- expected_bucket_fracs(n_tip) + + cat(sprintf("Dataset: %-15s n_tip=%d sqrt_n=%d total_clips=%d\n", + dname, n_tip, sq, 2L*(n_tip-1L))) + + ds_records <- vector("list", length(SEEDS)) + + for (i in seq_along(SEEDS)) { + set.seed(SEEDS[i]) + + # Random Wagner starting tree + wag <- TreeSearch:::ts_random_wagner_tree( + d$contrast, d$tip_data, d$weight, d$levels + ) + + # TBR to convergence with per-pass diagnostics (default clip_order = RANDOM) + res <- TreeSearch:::ts_tbr_diagnostics( + wag$edge, d$contrast, d$tip_data, d$weight, d$levels + ) + + passes <- res$passes + passes$dataset <- dname + passes$seed <- SEEDS[i] + passes$n_tip <- n_tip + passes$n_clips <- 2L * (n_tip - 1L) + passes$final_score <- res$score + passes$bucket <- clip_bucket(passes$accepted_clip_size, n_tip) + # bucket is only meaningful for productive passes; set NA for null passes + passes$bucket[!passes$productive] <- NA_character_ + + 
ds_records[[i]] <- passes + } + + all_records[[dname]] <- do.call(rbind, ds_records) + recs <- all_records[[dname]] + prod <- recs[recs$productive, ] + null <- recs[!recs$productive, ] + + cat(sprintf(" Passes: %d productive=%d (%.0f%%) null=%d (%.0f%%)\n", + nrow(recs), nrow(prod), 100*nrow(prod)/nrow(recs), + nrow(null), 100*nrow(null)/nrow(recs))) + + if (nrow(prod) > 0) { + tip_obs <- mean(prod$accepted_clip_size == 1) + tip_exp <- exp$tip + enrich <- tip_obs / tip_exp + cat(sprintf(" Tip-clip acceptance: observed=%.0f%% expected=%.0f%% enrichment=%.2fx\n", + 100*tip_obs, 100*tip_exp, enrich)) + cat(sprintf(" Clips tried before accept: median=%g mean=%.1f (out of %d clips)\n", + median(prod$n_clips_tried), mean(prod$n_clips_tried), 2L*(n_tip-1L))) # %g: median may be x.5 + cat(sprintf(" Final score range: %.0f – %.0f\n", + min(recs$final_score), max(recs$final_score))) + } + cat("\n") +} + +combined <- do.call(rbind, all_records) +prod_all <- combined[combined$productive, ] + +# --------------------------------------------------------------------------- +# Table 1: Accepted clip size bucket breakdown +# --------------------------------------------------------------------------- + +cat("=== Table 1: Accepted clip size breakdown (productive passes only) ===\n\n") + +fmt_pct <- function(x) sprintf("%.1f%%", 100 * x) + +bucket_tbl <- do.call(rbind, lapply(DATASETS, function(dname) { + p <- prod_all[prod_all$dataset == dname, ] + n_tip <- p$n_tip[1] + exp <- expected_bucket_fracs(n_tip) + tot <- nrow(p) + + tip_obs <- mean(p$accepted_clip_size == 1) + small_obs <- mean(p$accepted_clip_size > 1 & + p$accepted_clip_size <= floor(sqrt(n_tip))) + large_obs <- mean(p$accepted_clip_size > floor(sqrt(n_tip))) + + data.frame( + dataset = dname, + n_tip = n_tip, + n_prod_passes = tot, + tip_obs = fmt_pct(tip_obs), + tip_exp = fmt_pct(exp$tip), + tip_enrichment = round(tip_obs / exp$tip, 2), + small_obs = fmt_pct(small_obs), + small_exp = fmt_pct(exp$small), + large_obs = fmt_pct(large_obs), +
large_exp = fmt_pct(exp$large) + ) +})) + +print(bucket_tbl, row.names = FALSE) + +# --------------------------------------------------------------------------- +# Table 2: Clips tried before acceptance +# --------------------------------------------------------------------------- + +cat("\n=== Table 2: Clips tried in productive passes ===\n") +cat("(n_clips_tried includes the accepted clip itself; 1 = first clip accepted)\n\n") + +tried_tbl <- do.call(rbind, lapply(DATASETS, function(dname) { + p <- prod_all[prod_all$dataset == dname, ] + n_clips <- p$n_clips[1] + tried <- p$n_clips_tried + + data.frame( + dataset = dname, + n_clips = n_clips, + n_prod_passes = nrow(p), + pct_first_clip = fmt_pct(mean(tried == 1)), + pct_within_5 = fmt_pct(mean(tried <= 5)), + pct_within_10pct = fmt_pct(mean(tried <= 0.1 * n_clips)), + median_tried = median(tried), + mean_tried = round(mean(tried), 1), + median_position = round(median(tried) / n_clips, 2) + ) +})) + +print(tried_tbl, row.names = FALSE) + +# --------------------------------------------------------------------------- +# Table 3: Evaluation budget — productive vs null passes +# --------------------------------------------------------------------------- + +cat("\n=== Table 3: Evaluation budget by pass type ===\n\n") + +eval_tbl <- do.call(rbind, lapply(DATASETS, function(dname) { + d <- combined[combined$dataset == dname, ] + prod <- d[d$productive, ] + null <- d[!d$productive, ] + tot <- sum(d$n_candidates_evaluated) + + data.frame( + dataset = dname, + n_prod_passes = nrow(prod), + n_null_passes = nrow(null), + pct_evals_prod = fmt_pct(sum(prod$n_candidates_evaluated) / tot), + pct_evals_null = fmt_pct(sum(null$n_candidates_evaluated) / tot), + med_evals_prod = if (nrow(prod) > 0) median(prod$n_candidates_evaluated) else NA_real_, + med_evals_null = if (nrow(null) > 0) median(null$n_candidates_evaluated) else NA_real_ + ) +})) + +print(eval_tbl, row.names = FALSE) + +# 
--------------------------------------------------------------------------- +# Hypothesis assessment +# --------------------------------------------------------------------------- + +cat("\n=== Hypothesis assessment ===\n") +cat("H: small clips (s=1) are over-represented in accepted moves,\n") +cat(" AND n_clips_tried is large enough that ordering would help.\n\n") + +for (dname in DATASETS) { + p <- prod_all[prod_all$dataset == dname, ] + n_clips <- p$n_clips[1] + n_tip <- p$n_tip[1] + enrich <- (mean(p$accepted_clip_size == 1)) / + (expected_bucket_fracs(n_tip)$tip) + med_pos <- median(p$n_clips_tried) / n_clips # fraction of clips needed + + # Potential saving if tips-first: E[position of accepted tip clip in random + # order] - E[position in tips-first order]. Very roughly: + # random E[pos] ≈ n_clips/2; tips-first E[pos] ≈ n_tip/2. + # saving_fraction ≈ (n_clips/2 - n_tip/2) / n_clips = (1 - n_tip/n_clips)/2 ≈ 0.25 + # But only beneficial if tip clips ARE more commonly accepted (enrich > 1). + + verdict <- if (enrich >= 2.0 && med_pos >= 0.25) { + "STRONGLY SUPPORTS ordering (high enrichment + late acceptance)" + } else if (enrich >= 1.5 && med_pos >= 0.15) { + "SUPPORTS ordering (moderate enrichment + moderate position)" + } else if (enrich >= 1.5) { + "PARTIAL (enrichment, but acceptance mostly in first few clips)" + } else if (enrich < 0.8) { + "CONTRADICTS hypothesis (large clips accepted more often)" + } else { + "NEUTRAL (no consistent tip-clip enrichment)" + } + + cat(sprintf(" %-15s: tip enrichment=%.2fx median_pos=%.2f -> %s\n", + dname, enrich, med_pos, verdict)) +} + +cat("\nDone.\n") diff --git a/dev/benchmarks/drift_mpt_analysis.md b/dev/benchmarks/drift_mpt_analysis.md new file mode 100644 index 000000000..1299ec6fc --- /dev/null +++ b/dev/benchmarks/drift_mpt_analysis.md @@ -0,0 +1,99 @@ +# T-254: Drift MPT Diversity Experiment + +## Question + +Drift search consumes 15–19% of wall time but contributes <1% of score +improvement (T-251). 
Before reducing it, we need to check whether drift +helps **MPT enumeration** — finding topologically distinct optimal trees +that the post-search TBR plateau walk uses as seeds. + +## Design + +- **Datasets**: Wortley2006 (37t), Zhu2013 (75t), Geisler2001 (68t) +- **Conditions**: `driftCycles=0` vs `driftCycles=2` (default preset value) +- **Seeds**: 1, 2, 3 +- **Budgets**: 30s (primary, equal-budget), 120s (with consensus stopping) +- **Other params**: All match `default` preset (ratchet 12 cycles, 25% + perturbation, XSS 3 rounds, etc.) +- **Metrics**: best score, pool tree count, n_topologies, replicates + completed, mean pairwise Robinson-Foulds distance + +### Equal-budget design + +The primary comparison uses `consensusStableReps=0` to disable +consensus-stability early stopping. This ensures both conditions use the +full 30s budget, avoiding the confound that no-drift converges to consensus +stability faster (fewer replicates needed to stabilize the strict consensus). + +## Results (30s, equal budget) + +| Dataset | Drift | Med score | Med trees | Med reps | Med RF | Drift % | +|-------------|:-----:|:---------:|:---------:|:--------:|:------:|:-------:| +| Geisler2001 | 0 | 1295 | 100 | 27 | 7.3 | 0 | +| Geisler2001 | 2 | 1295 | 100 | 25 | 7.4 | 18 | +| Wortley2006 | 0 | 482 | 4 | 75 | 17.3 | 0 | +| Wortley2006 | 2 | 482 | 2 | 62 | 10.0 | 15 | +| Zhu2013 | 0 | 638 | 100 | 26 | 11.6 | 0 | +| Zhu2013 | 2 | 638 | 100 | 19 | 10.2 | 17 | + +### Replicate cost + +| Dataset | Reps (d=0) | Reps (d=2) | Loss | +|-------------|:----------:|:----------:|:----:| +| Geisler2001 | 27 | 24 | 10% | +| Wortley2006 | 76 | 61 | 20% | +| Zhu2013 | 25 | 20 | 22% | + +### Key findings + +1. **Score quality**: Identical. Both conditions find the same best score + on all datasets at all seeds. + +2. **MPT count**: On Wortley2006, no-drift consistently finds 4 MPTs + (all 3 seeds) while drift finds 1–3 (median 2). On larger datasets, + both fill the 100-tree pool. 
Drift does NOT help MPT enumeration. + +3. **Topological diversity**: Mean pairwise RF distances are essentially + identical on Geisler2001 (7.3 vs 7.4 out of max 132). On Zhu2013, + no-drift shows slightly higher RF (11.6 vs 10.2 out of max 146). + On Wortley2006, no-drift has higher RF (17.3 vs 10.0 out of max 70). + **Drift does not improve topological diversity.** + +4. **Replicate throughput**: Enabling drift yields 10–22% fewer replicates + in the same wall time. Each independent replicate starts from a random + Wagner tree, providing more diverse initial basins than drift's local + perturbation within a single basin. + +5. **Consensus stability confound**: With consensus stopping enabled + (120s budget), no-drift reaches consensus stability 2–3× faster and + stops early. Drift prevents early stabilization (by perturbing into + slightly different topologies) but the extra time produces no better + scores or more MPTs. This means drift actively delays convergence + without adding value. + +## Conclusion + +**Drift can be safely eliminated from the default preset.** It provides: +- Zero score benefit (confirmed both here and in T-251) +- Zero MPT enumeration benefit (fewer MPTs on Wortley2006) +- Zero topological diversity benefit +- Negative throughput impact (10–22% fewer replicates) + +The time saved should be reallocated to additional replicates (which +provide genuinely independent basin sampling via random Wagner starts). + +## Recommendation for T-255 + +- **default**: `driftCycles = 0` (was 2) +- **sprint**: already 0 (no change) +- **thorough**: reduce from 12 to 0 or 1. The thorough preset has many + other escape mechanisms (NNI-perturbation, adaptive ratchet, outer + cycles) that make drift redundant.
+- **large**: already 0 (no change) + +## Scripts and data + +- `dev/benchmarks/bench_drift_mpt.R` — full experiment script +- `dev/benchmarks/results_drift_mpt_30s.csv` — 30s with consensus stopping +- `dev/benchmarks/results_drift_mpt_120s.csv` — 120s with consensus stopping +- `dev/benchmarks/results_drift_mpt_30s_nostop.csv` — 30s equal-budget (primary) diff --git a/dev/benchmarks/mbank_X30754.nex b/dev/benchmarks/mbank_X30754.nex new file mode 100644 index 000000000..4e6507490 --- /dev/null +++ b/dev/benchmarks/mbank_X30754.nex @@ -0,0 +1,5240 @@ +#NEXUS + + [ File output by Morphobank v3.0 (http://www.morphobank.org); 2025-06-16 11.36.14 ] + + BEGIN TAXA; + DIMENSIONS NTAX=180; + TAXLABELS + 'Orstenoloricus shergoldii' + 'Gastrotricha' + 'Lineus' + 'Solenogastres' + 'Nereis' + 'Ancalagon minor' + 'Fieldia lanceolata' + 'Scolecofurca rara' + 'Markuelia lauriei' + 'Shergoldana australiensis' + 'Xinliscolex intermedius' + 'Shanscolex decorus' + 'Qinscolex spinosus' + 'Zhongpingscolex qinensis' + 'Eokinorhynchus rarus' + 'Eopriapulites sphinx' + 'Eolorica deadwoodensis' + 'Nanaloricus mysticus' + 'Armorloricus elegans' + 'Spinoloricus turbatio' + 'Rugiloricus carolinensis' + 'Pliciloricus corvus' + 'Urnaloricus ibenae' + 'Wataloricus japonicus' + 'Tenuiloricus shirayamai' + 'Patuloricus tangaroa' + 'Scaberiloricus samba' + 'Franciscideres kalenesos' + 'Antygomonas paulae' + 'Campyloderes cf vanhoeffeni' + 'Centroderes spinosus' + 'Echinoderes dujardinii' + 'Zelinkaderes klepali' + 'Cateria gerlachi' + 'Dracoderes abei' + 'Paracentrophyes anurus' + 'Pycnophyes zelinkaei' + 'Chordodes' + 'Nectonema' + 'Euchromadora' + 'Odontophora' + 'Kinonchulus' + 'Anatonchus' + 'Acanthopriapulus horridus' + 'Halicryptus spinulosus' + 'Maccabeus' + 'Meiopriapulus fijiensis' + 'Priapulopsis bicaudatus' + 'Priapulus caudatus' + 'Tubiluchus lemburgi' + 'Tubiluchus vanuatensis' + 'Euperipatoides' + 'Plicatoperipatus' + 'Ooperipatellus' + 'Archechiniscus bahamensis' + 'Batillipes 
pennaki' + 'Batillipes phreaticus' + 'Coronarctus yurupari' + 'Coronarctus laubieri' + 'Dipodarctus susannae' + 'Wingstrandarctus unsculptus' + 'Neoarctus primigenius' + 'Neostygarctus oceanopolis' + 'Renaudarctus fossorius' + 'Mesostygarctus spiralis' + 'Parastygarctus renaudae' + 'Raiarctus jesperi' + 'Styraconyx nanoqsunguak' + 'Actinarctus neretinus' + 'Isoechiniscoides sifae' + 'Neoechiniscoides aski' + 'Oreella chugachii' + 'Echiniscus testudo' + 'Multipseudechiniscus raneyi' + 'Testechiniscus spitsbergensis' + 'Pseudechiniscus suillus' + 'Cornechiniscus imperfectus' + 'Milnesium berladnicorum' + 'Milnesium swolenski' + 'Milnesium tardigradum' + 'Austeruseus faeroensis' + 'Mesocrista revelata' + 'Hypsibius dujardini' + 'Beron leggi' + 'Calohypsibius ornatus' + 'Fractonotus verrucosus' + 'Cryoconicus kaczmareki' + 'Haplomacrobiotus utahensis' + 'Doryphoribius dawkinsi' + 'Paradoryphoribius chronocaribbeus' + 'Halobiotus crispae' + 'Macrobiotus paulinae' + 'Dactylobiotus ovimutans' + 'Richtersius coronifer' + 'Sicyophorus rarus' + 'Sirilorica carlsbergi' + 'Acosmia' + 'Eximipriapulus globocaudata' + 'Laojieella thecata' + 'Ottoia prolifica' + 'Ottoia tricuspida' + 'Paratubiluchus bicaudatus' + 'Priapulites konecniorum' + 'Selkirkia columbia' + 'Paraselkirkia sinica' + 'Xiaoheiqingella peculiaris' + 'Xystoscolex boreogyrus' + 'Chalazoscolex pharkus' + 'Louisella pedunculata' + 'Corynetis brevis' + 'GUANDUSCOLEX minor' + 'MAOTIANSHANIA cylindrica' + 'PALAEOSCOLEX piscatorum' + 'SCHISTOSCOLEX umbilicatus' + 'SCATHASCOLEX minor' + 'WRONASCOLEX antiquus' + 'WRONASCOLEX iacoborum' + 'YUNNANOSCOLEX magnus' + 'MAFANGSCOLEX yunnanensis' + 'Cricocosmia n. sp.' 
+ 'CRICOCOSMIA jinningensis' + 'TABELLISCOLEX hexagonus' + 'Tylotites petiolaris' + 'Xenusion' + 'Hadranax' + 'Aysheaia' + 'Siberion' + 'Onychodictyon ferox' + 'Diania' + 'Paucipodia' + 'Cardiodictyon' + 'Microdictyon' + 'Onychodictyon gracilis' + 'Thanahita distos' + 'Orstenotubulus' + 'Tritonychus phanerosarkus' + 'Carbotubulus' + 'Hallucigenia sparsa' + 'Hallucigenia fortis' + 'Hallucigenia hongmeia' + 'Facivermis yunnanicus' + 'Luolishania' + 'Ovatiovermis cribratus' + 'Collinsium' + 'Collinsovermis monstruosus' + 'Emu Bay Collins monster' + 'Acinocricus' + 'Antennacanthopodia' + 'Helenodora' + 'Tertiapatus dominicanus' + 'Siberian Orsten tardigrade' + 'Youti yuanshi' + 'Megadictyon' + 'Jianshanopodia' + 'Cucumericrus' + 'Kerygmachela' + 'Pambdelurion' + 'Omnidens qiongqii' + 'Parapeytoia' + 'Kylinxia' + 'Isoxys' + 'Stanleycaris' + 'Opabinia' + 'Utaurora' + 'Caryosyntrips camurus' + 'Amplectobelua symbrachiata' + 'Anomalocaris canadensis' + 'Cambroraster falcatus' + 'Hurdia victoria' + 'Cf. Peytoia' + 'Peytoia nathorsti' + 'Aegirocassis benmoulai' + 'Lyrarapax unguispinus' + 'Schinderhannes' + 'Chengjiangocaris' + 'Fuxianhuia' + 'Leanchoilia' + 'Alalcomenaeus' + 'Misszhouia longicaudata' + 'Kuamaia lata' + ; + ENDBLOCK; + + BEGIN CHARACTERS; + DIMENSIONS NCHAR=425; + FORMAT DATATYPE=STANDARD GAP=- MISSING=? 
SYMBOLS="0123456789A"; + CHARLABELS + [1] 'General organization: Voluminous primary body cavity' + [2] 'General organization: Aspect ratio of body length to (maximum) trunk width in adult' + [3] 'General organization: Clear differentiation of dorsal and ventral trunk' + [4] 'General organization: Paired appendages' + [5] 'General organization: Anus position' + [6] 'General organization: Mouth opening position' + [7] 'General organization: Mouth orientation' + [8] 'Introvert: Distinct introvert' + [9] 'Introvert: Triangular proboscis' + [10] 'Introvert: Invaginable' + [11] 'Introvert: Extent of invagination' + [12] 'Introvert: Two rings of introvert retractors attach through the collar-shaped brain' + [13] 'Introvert: Trichoscalids' + [14] 'Introvert: Trichoscalids: Nature of separation between trichoscalids and Zone I armature' + [15] 'Introvert: Trichoscalids: Number per ring' + [16] 'Introvert: Trichoscalids: Number of rings' + [17] 'Introvert: Trichoscalids: Basal plates' + [18] 'Introvert: Trichoscalids: Articulation' + [19] 'Introvert: Trichoscalids: Morphology' + [20] 'Introvert: Trichoscalids: Doubled' + [21] 'Introvert: Zone I armature' + [22] 'Introvert: Elements that comprise first three circlets define number of longitudinal rows of elements on the introvert' + [23] 'Introvert: Zone I armature: Direction' + [24] 'Introvert: Zone I armature: Number of circlets' + [25] 'Introvert: Zone I armature: Elements in two superposed series' + [26] 'Introvert: Zone I armature: Arranged in rows' + [27] 'Introvert: Zone I armature: Row orientation' + [28] 'Introvert: Zone I armature: Extent' + [29] 'Introvert: Zone I armature: Cuticularized' + [30] 'Introvert: Zone I armature: Solid elements' + [31] 'Introvert: Zone I armature: Elongate elements' + [32] 'Introvert: Zone I armature: Element curvature' + [33] 'Introvert: Zone I armature: Bifurcating elements' + [34] 'Introvert: Zone I armature: Elements are dentate' + [35] 'Introvert: Zone I armature: Elements comprise 
articulated units' + [36] 'Introvert: Zone I armature: Elements bear setules' + [37] 'Introvert: Zone I armature: Telescopic elements' + [38] 'Introvert: Zone I armature: Hooded elements' + [39] 'Introvert: Zone I armature: Intrinsic musculature' + [40] 'Introvert: Symmetry: Pentaradial' + [41] 'Introvert: Symmetry: Twentyfive-fold' + [42] 'Introvert: Symmetry: Hexaraidal' + [43] 'Pharynx: Large dorsal tooth' + [44] 'Pharynx: Pre-oral chamber' + [45] 'Pharynx: Annulations' + [46] 'Pharynx: Eversion' + [47] 'Pharynx: Eversion: Permanent' + [48] 'Pharynx: Eversion: Introvert or pharynx employed in locomotion' + [49] 'Pharynx: Eversion: Zone III eversible' + [50] 'Pharynx: Eversion: Size when everted' + [51] 'Pharynx: Eversion: Zone III fully inversible' + [52] 'Pharynx: Symmetry: Pharyngeal lumina symmetry' + [53] 'Pharynx: Zone II armature' + [54] 'Pharynx: Zone II armature: Contact area' + [55] 'Pharynx: Zone II armature: Disposition' + [56] 'Pharynx: Zone II armature: Differentiated elements' + [57] 'Pharynx: Zone II armature: Differentiated elements: Number of enlarged plates' + [58] 'Pharynx: Zone II armature: Furrowed folds' + [59] 'Pharynx: Zone II armature: Nodes on outer face' + [60] 'Pharynx: Zone II armature: Nodes on inner face' + [61] 'Pharynx: Zone II armature: Element constitution' + [62] 'Pharynx: Zone II armature: Elements in proximal circlet' + [63] 'Pharynx: Zone II armature: Aspect ratio' + [64] 'Pharynx: Zone II armature: Multiple cusps' + [65] 'Pharynx: Zone II armature: Spinose projections from inner face' + [66] 'Pharynx: Zone II armature: Spinose projections from inner face: Number' + [67] 'Pharynx: Zone II armature: Proximal circlet fused to introvert' + [68] 'Pharynx: Zone III wider than Zone II' + [69] 'Pharynx: Proximal region: Unarmed region between Zone II and Zone III' + [70] 'Pharynx: Proximal region: Cuticular reinforcement' + [71] 'Pharynx: Proximal region: Oral ridges' + [72] 'Pharynx: Proximal region: Oral ridges: Number' + [73] 
'Pharynx: Proximal region: Oral ridges: Furcae' + [74] 'Pharynx: Proximal region: Oral ridges: Differentiated series' + [75] 'Pharynx: Proximal region: Fenestrae' + [76] 'Pharynx: Zone III armature' + [77] 'Pharynx: Zone III armature: Complexity' + [78] 'Pharynx: Zone III armature: Retained to adulthood' + [79] 'Pharynx: Zone III armature: Composition' + [80] 'Pharynx: Zone III armature: Disposition' + [81] 'Pharynx: Zone III armature: Radial extent' + [82] 'Pharynx: Zone III armature: Number of circlets' + [83] 'Pharynx: Zone III armature: Number of pentagonal circlets in proximal region' + [84] 'Pharynx: Zone III armature: Proximal circlet: Number of elements' + [85] 'Pharynx: Zone III armature: Proximal circlet: Which multiple of five' + [86] 'Pharynx: Zone III armature: Proximal circlet: Dorsal element reduced' + [87] 'Pharynx: Zone III armature: Proximal circlet: Alternating size' + [88] 'Pharynx: Zone III armature: Proximal circlet: Prominent central spine in elements' + [89] 'Pharynx: Zone III armature: Proximal circlet: Prominent central spine: Recurved (hooked)' + [90] 'Pharynx: Zone III armature: Proximal circlet: Additional robust spines (multispinose) or pectinate fringe on elements' + [91] 'Pharynx: Zone III armature: Proximal circlet: Elements comprise articulated units' + [92] 'Pharynx: Zone III armature: Proximal circlet: Massively reduced' + [93] 'Pharynx: Zone III armature: Proximal circlet: Morphologically differentiated' + [94] 'Pharynx: Zone III armature: Ring fold' + [95] 'Pharynx: Zone III armature: Middle circlets of Zone III armature reduced' + [96] 'Pharynx: Zone III armature: Middle circlets: Element morphology' + [97] 'Pharynx: Zone III armature: Distal circlets: Morphologically distinct' + [98] 'Pharynx: Zone III armature: Distal circlets: Element morphology' + [99] 'Pharynx: Zone III armature: Distal circlets: Trend of element size' + [100] 'Pharynx: Zone III armature: Intrinsic muscles of outer oral styles' + [101] 'Pharynx: Zone III 
armature: Placoids' + [102] 'Pharynx: Zone III armature: Placoids: Type' + [103] 'Pharynx: Zone III armature: Microplacoid' + [104] 'Pharynx: Zone III armature: Reinforcement of pharynx cuticle' + [105] 'Pharynx: Buccal tube: Apophysis for the insertion of the stylet muscle' + [106] 'Pharynx: Buccal tube: Apophysis: Type' + [107] 'Pharynx: Terminal bulb' + [108] 'Neck: Forms segment-like ring' + [109] 'Neck: Encircled by ring of cuticular plates' + [110] 'Neck: Cuticular neck plates: Form closing mechanism when adult head retracted into trunk' + [111] 'Neck: Cuticular neck plates: Closing apparatus: Symmetry' + [112] 'Neck: Cuticular neck plates: Number' + [113] 'Neck: Cuticular neck plates: Distal margin shape' + [114] 'Neck: Cuticular neck plates: Attachment to first trunk segment' + [115] 'Head region: Amphids' + [116] 'Head region: Amphids: Fovea shape' + [117] 'Head region: Anterodorsal lobe' + [118] 'Head region: Anterior region covered by sclerites' + [119] 'Head region: Head shield (cephalic shield) formed by fused cephalic segments' + [120] 'Head region: Dorsal isolated sclerite: Position' + [121] 'Head region: Dorsal isolated sclerite: Shape' + [122] 'Head region: Dorsal isolated sclerite: Reticulate ornament' + [123] 'Head region: Degree of attachment of dorsal isolated sclerite on head' + [124] 'Head region: Isolated lateral sclerites, forming tripartite carapace' + [125] 'Head region: Isolated lateral sclerites: Shape' + [126] 'Head region: Ventral isolated sclerite' + [127] 'Head region: Anterior trunk flexure in coronal plane' + [128] 'Head region: Swelling of anteriormost trunk ' + [129] 'Head region: Paired anterior projections' + [130] 'Head region: Paired anterior projections: Incorporated into lips' + [131] 'Head region: Paired anterior projection: Sensory field' + [132] 'Head region: Paired anterior projections: Position of Cirri A' + [133] 'Head region: Club or dome-shaped chemosensory organ' + [134] 'Ocular structures' + [135] 'Ocular 
structures: Number' + [136] 'Ocular structures: Compound eyes' + [137] 'Ocular structures: Compound eyes: Attachment' + [138] 'Ocular structures: Compound eyes: Posterior displacement' + [139] 'Cephalic/anterior appendages: Protocerebral appendage pair: Sclerotization' + [140] 'Cephalic/anterior appendages: Protocerebral appendage pair: Arthrodial membranes' + [141] 'Cephalic/anterior appendages: Pre-ocular (protocerebral) limb pair: Structurally differentiated from trunk appendages' + [142] 'Cephalic/anterior appendages: Protocerebral appendage pair: Podomeres' + [143] 'Cephalic/anterior appendages: Protocerebral appendages: Podomeres: Differentiation' + [144] 'Cephalic/anterior appendages: Protocerebral appendages: Podomeres: Distal taper' + [145] 'Cephalic/anterior appendages: Protocerebral appendage pair: Position' + [146] 'Cephalic/anterior appendages: Protocerebral appendage pair: Posterior shift' + [147] 'Cephalic/anterior appendages: Protocerebral appendages: Directly adjacent to one another' + [148] 'Cephalic/anterior appendages: Protocerebral appendages: Basal adjacency' + [149] 'Cephalic/anterior appendages: Protocerebral appendages: Mechanical fusion' + [150] 'Cephalic/anterior appendages: Protocerebral appendage pair: Loss of claws' + [151] 'Cephalic/anterior appendages: Protocerebral appendage pair: Ventral spine series' + [152] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Number' + [153] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Height' + [154] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Accessory spines' + [155] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Accessory spine distribution' + [156] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Alternation' + [157] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Width' + [158] 
'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Base to tip thickness' + [159] 'Cephalic/anterior appendages: Protocerebral appendages: Ventral spine/spinules: Tip orientation' + [160] 'Cephalic/anterior appendages: Protocerebral spine series: Lateral spine series' + [161] 'Cephalic/anterior appendages: Protocerebral appendage pair: Multifurcate distal termination' + [162] 'Cephalic/anterior appendages: Protocerebral appendages: Kink' + [163] 'Cephalic/anterior appendages: Protocerebral appendages: Pincer' + [164] 'Cephalic/anterior appendages: Protocerebral appendages: Outer spines' + [165] 'Cephalic/anterior appendages: Protocerebral appendages: Accessory gnathal spines' + [166] 'Cephalic/anterior appendages: Post-ocular (post-protocerebral) appendages: Arthrodial membranes' + [167] 'Cephalic/anterior appendages: Nature of post-ocular lobopodous inner branch' + [168] 'Cephalic/anterior appendages: Deutocerebral limb pair structurally differentiated from trunk appendages' + [169] 'Cephalic/anterior appendages: Nature of sclerotized first post-ocular (deutocerebral) appendage' + [170] 'Cephalic/anterior appendages: Nature of lobopodous first post-ocular (deutocerebral) appendage' + [171] 'Cephalic/anterior appendages: Inner blade of deutocerebral jaw with diastema' + [172] 'Cephalic/anterior appendages: Nature of lobopodous second post-ocular (tritocerebral) appendage' + [173] 'Cephalic/anterior appendages: Nature of arthropodized second post-ocular (tritocerebral) appendage' + [174] 'Trunk region: Annulations' + [175] 'Trunk region: Annulations: Organization' + [176] 'Trunk region: Annulations: Annulations become indistinct in undifferentiated anterior trunk' + [177] 'Trunk region: Annulations: Branching of annular rings' + [178] 'Trunk region: Epidermal segmentation' + [179] 'Trunk region: Dorsal integument sclerotized to form sternal plates' + [180] 'Trunk region: Sternal plates: Connected by arthrodial membranes' + [181] 'Trunk 
region: Sternal plates: First sternite: Anterior margin with lateral projections' + [182] 'Trunk region: Sternal plates: First sternite: Anterior margin with medial notch' + [183] 'Trunk region: Sternal plates: First sternite: Posterior ventral spine' + [184] 'Trunk region: Sternal plates: Second segment is a single ring' + [185] 'Trunk region: Sternal plates: Differentiation in third and fourth segment' + [186] 'Trunk region: Sternal plates: Present in trunk segments 7+' + [187] 'Trunk region: Sternal plates: Posterior sternite differentiated' + [188] 'Trunk region: Sternal plates: Posterior sternite: Dorsal extension of margins' + [189] 'Trunk region: Sternal plates: Posterior sternite: Dorsal extension of margins: Extended into spinose process' + [190] 'Trunk region: Sternal plates: Posterior sternite: Lateral terminal spines' + [191] 'Trunk region: Sternal plates: Posterior sternite: Lateral accessory spines' + [192] 'Trunk region: Sternal plates: Posterior sternite: Medial spine' + [193] 'Trunk region: Sternal plates: Posterior sternite: Medial spine: Muscles' + [194] 'Trunk region: Sternal plates: Posterior sternite: Lateroventral notches in margins' + [195] 'Trunk region: Sternal plates: Setae' + [196] 'Trunk region: Sternal plates: Scales' + [197] 'Trunk region: Sternal plates: Secondary fringe' + [198] 'Trunk region: Serially repeated mid-gut glands' + [199] 'Trunk region: Narrowing posteriad' + [200] 'Trunk region: Differentiated anterior trunk' + [201] 'Trunk region: Middle of trunk bears single pair of elongated lateral cuspidate spines' + [202] 'Trunk region: Flosculi or sensory spots' + [203] 'Trunk region: Sensory spots: Flosculi' + [204] 'Trunk region: Sensory spots: Flosculi: Petals' + [205] 'Trunk region: Sensory spots: Flosculi: Petals: Number' + [206] 'Trunk region: Papillae on trunk annulations' + [207] 'Trunk region: Epidermal papillae in two ventral rows' + [208] 'Trunk region: Lorica' + [209] 'Trunk region: Lorica: Retained to adulthood' + 
[210] 'Trunk region: Lorica: Cuticle thickened in dorsal and ventral plicae' + [211] 'Trunk region: Lorica: Series of lorical plates' + [212] 'Trunk region: Lorica: Number of plates per series' + [213] 'Trunk region: Lorica: Differentiated dorsal and ventral plates' + [214] 'Epidermal sclerites: Present on adult trunk' + [215] 'Epidermal sclerites: Comprise a stack of nested elements' + [216] 'Epidermal sclerites: Integumentary trunk sclerites' + [217] 'Epidermal sclerites: Trunk sclerites: Heavily phosphatized' + [218] 'Epidermal sclerites: Trunk sclerites: Shape' + [219] 'Epidermal sclerites: Trunk sclerites: Nodes' + [220] 'Epidermal sclerites: Trunk sclerites: Nodes: Number of rings' + [221] 'Epidermal sclerites: Trunk sclerites: Nodes: Number in central ring is constant' + [222] 'Epidermal sclerites: Trunk sclerites: Nodes: Number in central ring' + [223] 'Epidermal sclerites: Trunk sclerites: Nodes: Exact number of nodes in central ring (if three to six)' + [224] 'Epidermal sclerites: Trunk sclerites: Differentiated anterior region' + [225] 'Epidermal sclerites: Trunk sclerites: Distribution: Complete rings' + [226] 'Epidermal sclerites: Trunk sclerites: Distribution' + [227] 'Epidermal sclerites: Trunk sclerites: Rows: Sclerite fields per annulation' + [228] 'Epidermal sclerites: Trunk sclerites: Rows: Distribution of sclerites within fields' + [229] 'Epidermal sclerites: Trunk sclerites: Distribution: Row arrangement' + [230] 'Epidermal sclerites: Trunk sclerites: Microplates present in addition to plates' + [231] 'Epidermal sclerites: Trunk sclerites: Tessellation' + [232] 'Epidermal sclerites: Sparse specialized sclerites' + [233] 'Epidermal sclerites: Sparse specialized sclerites: Trunk tubuli' + [234] 'Epidermal sclerites: Sparse specialized sclerites: Tumuli (small sclerites)' + [235] 'Epidermal sclerites: Sparse specialized sclerites: Tumuli: Radial supporting buttresses' + [236] 'Epidermal sclerites: Enlarged sclerites' + [237] 'Epidermal sclerites: 
Enlarged sclerites: Regular distribution' + [238] 'Epidermal sclerites: Enlarged sclerites: Transverse bands: Maximum elements per band' + [239] 'Epidermal sclerites: Enlarged sclerites: Transverse bands: Frequency' + [240] 'Epidermal sclerites: Enlarged sclerites: Transverse bands: Spacing' + [241] 'Epidermal sclerites: Enlarged sclerites: Transverse bands: Intersegmental dorsal plates' + [242] 'Epidermal sclerites: Enlarged sclerites: Transverse bands: Consistent size' + [243] 'Epidermal sclerites: Enlarged sclerites: Transverse bands: Pseudosegmental dorsal plates' + [244] 'Epidermal sclerites: Enlarged sclerites: Proportions' + [245] 'Epidermal sclerites: Enlarged sclerites: Acute distal termination' + [246] 'Epidermal sclerites: Enlarged sclerites: Acute distal termination: Curvature' + [247] 'Epidermal sclerites: Enlarged sclerites: Shape of distal margins' + [248] 'Epidermal sclerites: Enlarged sclerites: Degree of sclerotization' + [249] 'Epidermal sclerites: Enlarged sclerites: Lateral flanges' + [250] 'Epidermal sclerites: Enlarged sclerites: Ornament' + [251] 'Epidermal sclerites: Enlarged sclerites: Ornament: Bosses at net junctions' + [252] 'Trunk appendages: Sclerotization' + [253] 'Trunk appendages: Longitudinal (gill-like) wrinkling on distal part of (outer branch) flaps-v2' + [254] 'Trunk appendages: Trunk exites' + [255] 'Trunk appendages: Trunk exites: Form' + [256] 'Trunk appendages: Trunk exites: Fused with endopod to form biramous appendage' + [257] 'Trunk appendages: Dorsal flaps' + [258] 'Trunk appendages: Antero-posteriorly compressed protopodite with gnathobasic endites in post-deutocerebral appendage pair' + [259] 'Trunk appendages: Exite distribution' + [260] 'Trunk appendages: Shape of lobopodous appendages' + [261] 'Trunk appendages: Secondary structures on non-sclerotized (lobopodous) limbs' + [262] 'Trunk appendages: Nature of secondary structure' + [263] 'Trunk appendages: Type of secondary structure' + [264] 'Trunk appendages: 
Length of spines on secondary structure' + [265] 'Trunk appendages: Papillae on non-sclerotized (lobopodous) limbs' + [266] 'Trunk appendages: Finger-like elements in distal tip of limbs' + [267] 'Trunk appendages: Papillae with terminal spine' + [268] 'Trunk appendages: Discs ' + [269] 'Trunk appendages: Claws on trunk limbs' + [270] 'Trunk appendages: Claws: Shape of base' + [271] 'Trunk appendages: Claws: Position' + [272] 'Trunk appendages: Claws: Multiple branches' + [273] 'Trunk appendages: Claws: Multiple branches: Type' + [274] 'Trunk appendages: Claws: Multiple branches: Type: Symmetry of fused claws' + [275] 'Trunk appendages: Claws: Multiple branches: Primary branch' + [276] 'Trunk appendages: Claws: Multiple branches: Branch angle' + [277] 'Claws: Multiple branches: Anterior claws: Connection between primary and secondary branch' + [278] 'Claws: Multiple branches: Anterior claws: Symmetry of primary and secondary branches with respect to median plane of leg' + [279] 'Claws: Multiple branches: Anterior claws: External claw primary branch connection to basal section' + [280] 'Claws: Multiple branches: Anterior claws: Angular insertion of external claw secondary branch to basal section' + [281] 'Claws: Multiple branches: Anterior claws: Basal section subdivided into stem/peduncle and distal section' + [282] 'Claws: Multiple branches: Anterior claws: Base extension' + [283] 'Claws: Multiple branches: Anterior claws: Base extension: Type' + [284] 'Claws: Multiple branches: Posterior claws: Connection between primary and secondary branch' + [285] 'Claws: Multiple branches: Posterior claws: Symmetry of primary and secondary branches with respect to leg median plane' + [286] 'Claws: Multiple branches: Posterior claws: Primary branch connection to basal section' + [287] 'Claws: Multiple branches: Posterior claws: Angular insertion of secondary branch to basal section' + [288] 'Claws: Multiple branches: Posterior claws: Basal section subdivided into basal and 
distal sections' + [289] 'Claws: Multiple branches: Posterior claws: Base extension' + [290] 'Claws: Multiple branches: Posterior claws: Base extension: Type' + [291] 'Trunk appendages: Maximum number of claws on walking limbs' + [292] 'Trunk appendages: Number of claws varies between appendages' + [293] 'Trunk appendages: Nature of claws on each trunk limb' + [294] 'Trunk appendages: Differentiated distal foot in lobopodous trunk limbs' + [295] 'Trunk appendages: Telescopic lobopodous limbs' + [296] 'Trunk appendages: External branch expressed as lateral flaps (body extends laterally into imbricated, unsclerotized flaps)-v2' + [297] 'Trunk appendages: Strengthening rays in lateral flaps' + [298] 'Trunk appendages: Posterior tapering of lateral flaps' + [299] 'Trunk appendages: Anteriormost limb pair hypertrophied' + [300] 'Trunk appendages: Anterior limbs reduced' + [301] 'Trunk appendages: Lobopodous limbs differentiated into two batches of multiple anterior/long and posterior/short limbs' + [302] 'Trunk appendages: Number of limbs on differentiated anterior trunk' + [303] 'Trunk appendages: Nature of lobopodous limbs on differentiated anterior trunk' + [304] 'Trunk appendages: Appendages comprise 15 or more podomeres' + [305] 'Trunk appendages: Leg plate' + [306] 'Posterior termination: Limbless posterior extension of the lobopodous trunk' + [307] 'Posterior termination: Posterior trunk divided into appendages' + [308] 'Posterior termination: Posterior tagma composed of three paired lateral flaps' + [309] 'Posterior termination: Direction of claws on posteriormost appendage pair' + [310] 'Posterior termination: Posterior trunk appendages: Structural differentiation' + [311] 'Posterior termination: Posterior trunk appendages: Structural differentiation: Nature' + [312] 'Posterior termination: Posterior trunk appendages: Tail: Nature' + [313] 'Posterior termination: Posterior trunk appendages: Tail: Shape' + [314] 'Posterior termination: Posterior trunk with 
localised bulbous widening' + [315] 'Posterior termination: Caudal appendage' + [316] 'Posterior termination: Caudal appendage: Eversible' + [317] 'Posterior termination: Caudal appendage: Length' + [318] 'Posterior termination: Caudal appendages: Divided' + [319] 'Posterior termination: Caudal appendage: Single' + [320] 'Posterior termination: Caudal appendage: Position' + [321] 'Posterior termination: Caudal appendage: Surface' + [322] 'Posterior termination: Spinneret' + [323] 'Posterior termination: Posterior projections (i.e. spines or hooks)' + [324] 'Posterior termination: Posterior projections: Sclerotization' + [325] 'Posterior termination: Posterior projections: Basal diameter >20% trunk diameter' + [326] 'Posterior termination: Posterior projections: Number' + [327] 'Posterior termination: Posterior projections: Arrangement' + [328] 'Posterior termination: Posterior ring papillae' + [329] 'Posterior termination: Posterior abdomen greatly extensible' + [330] 'Posterior termination: Posterior warts' + [331] 'Posterior termination: Posterior wart size' + [332] 'Musculature: Skeletal musculature' + [333] 'Musculature: Longitudinal peripheral musculature' + [334] 'Musculature: Ventromedian longitudinal muscle' + [335] 'Musculature: Longitudinal muscle attachment points' + [336] 'Musculature: Longitudinal muscle attachment points: Position on tegumental plate' + [337] 'Musculature: Circular peripheral musculature' + [338] 'Musculature: Circular musculature inside longitudinal musculature' + [339] 'Musculature: Loss of dorsoventral muscles in segment 1' + [340] 'Musculature: Box-truss' + [341] 'Musculature: Heart' + [342] 'Musculature: Pharynx protractor muscles' + [343] 'Neuroanatomy: Nerve cord location' + [344] 'Neuroanatomy: Ventral nerve cord: Paired' + [345] 'Neuroanatomy: Paired ventral nerve cords: Symmetry' + [346] 'Neuroanatomy: Paired ventral nerve cords: Merge caudally' + [347] 'Neuroanatomy: Paired ventral nerve cords: Paired ganglia' + [348] 
'Neuroanatomy: Paired ventral nerve cords: Position' + [349] 'Neuroanatomy: Paired ventral nerve cords: Medial interpedal commissures' + [350] 'Neuroanatomy: VNC with morphologically discrete condensed hemiganglia connected by medial commissures' + [351] 'Neuroanatomy: Regularly spaced peripheral nerves along the entire length of the nerve cord' + [352] 'Neuroanatomy: Nerve cord has orthogonal organization' + [353] 'Neuroanatomy: Orthogonal nerve cord: Complete ring commissures' + [354] 'Neuroanatomy: Segmental leg nerves shifted anteriorly relative to appendages' + [355] 'Neuroanatomy: Segmental leg nerves paired' + [356] 'Neuroanatomy: Stomatogastric ganglion' + [357] 'Neuroanatomy: Circumpharyngeal brain' + [358] 'Neuroanatomy: Circumpharyngeal brain: Subpharyngeal main region with weak suprapharyngeal commissure' + [359] 'Neuroanatomy: Dorsal condensed brain' + [360] 'Neuroanatomy: Dorsal condensed brain: Neuromeres' + [361] 'Neuroanatomy: Mouth innervation relative to brain neuromeres' + [362] 'Neuroanatomy: Dorsal nerve cord' + [363] 'Neuroanatomy: Dorsal nerve cord: Paired' + [364] 'Neuroanatomy: Brain neuropil sandwiched by perikarya' + [365] 'Neuroanatomy: Apical brain composed of perikarya' + [366] 'Neuroanatomy: Tooth ganglia connected by diagonal nerve net' + [367] 'Organ systems: Perigenital area: Cloaca' + [368] 'Organ systems: Perigenital area: Cloaca in both sexes' + [369] 'Organ systems: Perigenital area: Urogenital system attached to the body wall by a ligament' + [370] 'Organ systems: Perigenital area: Seminal receptacle: External ' + [371] 'Organ systems: Perigenital area: Perigenital setae' + [372] 'Organ systems: Perigenital area: Clavulae' + [373] 'Organ systems: Perigenital area: Clavulae: Stalk length' + [374] 'Organ systems: Perigenital area: Clavulae: Distal bulb' + [375] 'Organ systems: Perigenital area: Bullulae' + [376] 'Organ systems: Expanded anterior gut' + [377] 'Organ systems: Polythyridium' + [378] 'Organ systems: Protonephridia' 
+ [379] 'Organ systems: Protonephridia: Integrated into the gonad' + [380] 'Organ systems: Protonephridia: Compound filter, built by two or more terminal cells' + [381] 'Organ systems: Protonephridia: Sieve plates' + [382] 'Organ systems: Protonephridia: Terminal cells with circumciliary microvilli' + [383] 'Organ systems: Tube' + [384] 'Cellular structure: Flagellate spermatozoa' + [385] 'Cellular structure: Primary constituent of cuticle' + [386] 'Cellular structure: Layer of cuticle containing abundant chitin' + [387] 'Cellular structure: Middle layer of cuticle has distinct composition' + [388] 'Cellular structure: Nucleation of "peritoneal" membrane' + [389] 'Cellular structure: Pillar-like structure in the epicuticle' + [390] 'Cellular structure: Tanycytes' + [391] 'Cellular structure: Cross-wise fibres in cuticle' + [392] 'Cellular structure: Large helical fibres in cuticle' + [393] 'Cellular structure: Egg ornamentation' + [394] 'Larval morphology: Developmental mode' + [395] 'Larval morphology: Cuticle dorso-ventrally flattened with six accordion-like lateral plates' + [396] 'Larval morphology: Neck crenulated like an accordion' + [397] 'Larval morphology: Larvae/juveniles with long pharynx retractor muscles' + [398] 'Larval morphology: Body divided into proboscis + abdomen' + [399] 'Larval morphology: Diaphragm separates larval thorax from abdomen' + [400] 'Larval morphology: Pair of spines at anterior of larval abdomen' + [401] 'Larval morphology: Caudal spines or appendages at posterior of larval abdomen' + [402] 'Larval morphology: Buccal canal morphology' + [403] 'Larval morphology: Large mesenchyme cells' + [404] 'Larval morphology: Higgins larva' + [405] 'Higgins larva: Head-trunk dimensions' + [406] 'Higgins larva: Thorax-abdomen length dimensions' + [407] 'Higgins larva: Thorax ornamentation' + [408] 'Higgins larva: Thorax wrinkles: Nature' + [409] 'Higgins larva: Lorica composition' + [410] 'Higgins larva: Closing plates on ventral side of 
thorax' + [411] 'Higgins larva: Head and thorax separated by collar region' + [412] 'Higgins larva: Mouth cone with oral teeth' + [413] 'Higgins larva: Inner armature' + [414] 'Higgins larva: Clavoscalids with distal units forming lobe with a hook' + [415] 'Higgins larva: Number of Row 2 scalids' + [416] 'Higgins larva: Row 2 scalids: Small, pincher-shaped claw' + [417] 'Higgins larva: Bifurcated scalids in penultimate row' + [418] 'Higgins larva: Alternating trifurcated and kite-shaped scalids in posteriormost row' + [419] 'Higgins larva: Anteroventral setae' + [420] 'Higgins larva: Anterolateral setae' + [421] 'Higgins larva: Short ventral tube-like setae' + [422] 'Higgins larva: Toe' + [423] 'Higgins larva: Toe: Shape' + [424] 'Higgins larva: Toe: Mucrones' + [425] 'Higgins larva: Toe: Ball and socket articulation' + ; + STATELABELS + 1 + 'absent' + 'present' + , + 2 + '[Transformational character]' + '<10' + '10-20' + '>20' + , + 3 + 'trunk cylindrical and undifferentiated on dorsoventral axis' + 'dorsal and/or ventral surface recognizable by shape, armature, or location of appendages' + , + 4 + 'absent' + 'present' + , + 5 + 'terminal' + 'subterminal' + 'in abdomen' + , + 6 + 'terminal' + 'ventral' + , + 7 + '[Transformational character: Inapplicable if mouth is not ventral]' + 'anterior' + 'ventral' + 'posterior' + , + 8 + 'absent' + 'present' + , + 9 + 'absent' + 'present' + , + 10 + 'introvert not present or not invaginable' + 'introvert invaginable' + , + 11 + '[Transformational character: Inapplicable if introvert not eversible]' + 'invaginable to part of Zone I or equivalent' + 'completely invaginable into the trunk (i.e. 
to the base of Zone I)' + , + 12 + 'absent' + 'present' + , + 13 + 'absent' + 'present' + , + 14 + '[Transformational character: Inapplicable if tricoscalids absent]' + 'constriction (as in loriciferans)' + 'insertion of muscles (as in kinorhynchs)' + , + 15 + '[Transformational character: Inapplicable if tricoscalids absent]' + 'six' + 'seven' + 'nine' + 'fourteen' + 'fifteen' + , + 16 + '[Transformational character: Inapplicable if trichoscalids absent]' + 'one' + 'two' + , + 17 + 'absent; trichoscalids attach directly to introvert' + 'trichoscalid plate present' + , + 18 + 'not articulated' + 'articulated' + , + 19 + '[Transformational character: Inapplicable if trichoscalids absent]' + 'simple, unornamented' + 'serrated' + 'with pectinate fringe' + , + 20 + 'absent' + 'present' + , + 21 + 'unarmed' + 'armed (whether in larva or adult)' + , + 22 + '[Transformational character: Inapplicable if Zone I lacks longitudinal rows of sclerites]' + 'no' + 'yes' + , + 23 + '[Transformational character: Inapplicable if introvert unarmoured or not eversible]' + 'concave surface directed anteriad when introvert is everted (or equivalent)' + 'concave surface directed posteriad when introvert is everted' + , + 24 + '[Transformational character: Inapplicable if no Zone I armature]' + 'single circlet' + 'multiple circlets' + , + 25 + '[Transformational character: Inapplicable if Zone I unarmed]' + 'elements as a single series, whether or not morphology differs' + 'elements organized into two or more transverse bands or series, possibly with different element morphologies within each series, but the sequence of morphologies being comparable between subsequent series' + , + 26 + '[Transformational character: Inapplicable if Zone I unarmed]' + 'not in rows' + 'in prominent rows (excepting transverse rows)' + , + 27 + '[Transformational character: Inapplicable if not in rows]' + 'discrete parallel longitudinal rows' + 'rows aligned diagonal to the anterior-posterior axis of the 
animal, possibly producing a quincunx' + , + 28 + '[Transformational character: Inapplicable if Zone I armature absent]' + 'continuous to end of introvert / Zone II elements' + 'gap between armature and end of introvert' + , + 29 + '[Transformational character: Inapplicable if Zone I unarmed]' + 'papillae only' + 'cuticularized spines, hooks or scalids' + , + 30 + '[Transformational character: Inapplicable if Zone I lacks sclerotized armature]' + 'elements hollow' + 'elements solid' + , + 31 + '[Transformational character: Inapplicable if Zone I unarmed]' + 'elements not elongate' + 'extreme elongation: elements more than 20 times longer than wide' + , + 32 + '[Transformational character: Inapplicable if Zone I unarmed]' + 'dead straight' + 'spinose/conical' + 'curved or hooked' + , + 33 + 'elements do not bifurcate' + 'bifurcating elements' + , + 34 + 'edentate' + 'dentate' + 'pectinate' + , + 35 + 'lacking articulation' + 'articulated joints' + , + 36 + 'setules absent' + 'setules present' + , + 37 + 'not telescopic' + 'telescopic' + , + 38 + 'elements lack hood' + 'elements with hood' + , + 39 + 'absent' + 'present' + , + 40 + 'not a multiple of five' + 'a multiple of five' + , + 41 + 'not a multiple of 25' + 'a multiple of 25' + , + 42 + 'not a multiple of six' + 'a multiple of six' + , + 43 + 'absent' + 'present' + , + 44 + 'absent' + 'present' + , + 45 + 'absent' + 'present' + , + 46 + 'pharynx (mouth cone) permanently inverted' + 'pharynx eversible' + , + 47 + '[Transformational character: Inapplicable if pharynx not eversible]' + 'pharynx eversible and invaginable' + 'pharynx permanently everted' + , + 48 + '[Transformational character: Inapplicable if neither pharynx nor introvert eversible]' + 'neither introvert nor pharynx involved in locomotion' + 'introvert or pharynx involved in locomotion' + , + 49 + '[Transformational character: Inapplicable if pharynx not eversible]' + 'complete' + 'incomplete (but beyond proximal teeth only)' + 'restricted (only 
as far as proximal teeth)' + , + 50 + '[Transformational character: Inapplicable if pharynx not eversible]' + 'diminutive (<2% of animal length)' + 'very large (>30% of animal length)' + , + 51 + '[Transformational character]' + 'invaginable' + 'distal region permanently everted' + 'proximal region forms non-invertible mouth cone' + , + 52 + 'round' + 'triradiate' + , + 53 + 'absent' + 'circumpharyngeal structures present' + , + 54 + '[Transformational character: Inapplicable if Zone II unarmed]' + 'small contact area (e.g. coronal spines)' + 'large contact area (e.g. Parapeytoia)' + , + 55 + '[Transformational character: Ambiguous if Zone II unarmed]' + 'continuous ring' + 'opposed bilateral series' + , + 56 + '[Transformational character: Inapplicable if circumoral structures, if present, are neither scalids nor plates]' + 'undifferentiated' + 'differentiated (e.g. Radiodonta – three or four enlarged plates)' + , + 57 + '[Transformational character: Inapplicable if no differentiated elements]' + '3 enlarged plates' + '4 enlarged plates' + , + 58 + 'absent' + 'present' + , + 59 + 'absent' + 'present' + , + 60 + 'absent' + 'present' + , + 61 + '[Transformational character: Inapplicable if radial circumpharyngeal structures absent]' + 'labile papillae or lamellae' + 'cuticularized scalids or plates' + , + 62 + '[Transformational character: Inapplicable if Zone II lacks armature]' + 'four' + 'six' + 'seven' + 'eight' + 'nine' + 'ten' + 'many' + , + 63 + '[Transformational character: Inapplicable if Zone II lacks armature]' + 'less than four times longer than wide' + 'elongate spines; at least ten times longer than wide' + , + 64 + '[Transformational character: Inapplicable if Zone II unarmed]' + 'monocuspate elements' + 'polycuspate elements' + , + 65 + 'absent' + 'present' + , + 66 + '[Transformational character: Inapplicable if spinose projections absent]' + 'proximal surface with single projection' + 'proximal surface with multiple spines' + , + 67 + 'unfused' + 
'fused to introvert' + , + 68 + 'not substantially (i.e. less than 2×) wider' + 'substantially (at least 2×) wider' + , + 69 + '[transformational character]' + 'teeth gap; pharyngeal teeth not directly adjacent ' + 'no teeth gap; pharyngeal teeth directly adjacent' + , + 70 + 'absent' + 'present' + , + 71 + 'absent' + 'present' + , + 72 + '[Transformational character: Inapplicable if oral ridges absent]' + 'six' + 'eight' + , + 73 + 'absent' + 'present' + , + 74 + 'undifferentiated' + 'differentiated' + , + 75 + 'absent' + 'present' + , + 76 + 'unarmed' + 'armed (whether in larvae or adults)' + , + 77 + '[Transformational character]' + 'no elaboration of tooth point; spinose/acicular' + 'each tooth has multiple cusps, perhaps expressed as denticles or serrations' + , + 78 + '[Transformational character: Inapplicable if Zone III unarmed]' + 'lost at metamorphosis, or primarily absent' + 'retained to adulthood' + , + 79 + '[Transformational character: Inapplicable if Zone III unarmed]' + 'composed exclusively of cuticle' + 'outer covering of cuticle with central cavity' + , + 80 + '[Transformational character: Ambiguous if Zone III unarmed]' + 'radial rings or whorls' + 'haphazard distribution around full circumference of pharynx' + 'bilaterally opposed series' + , + 81 + '[Transformational character: Inapplicable if Zone III unarmed]' + 'occupying most of circumference of pharynx, perhaps with modest gap between series' + 'few longitudinal rows or series with large gap between' + , + 82 + '[Transformational character: Inapplicable if Zone III lacks armature]' + 'one' + 'strictly four' + 'four to six' + 'strictly six' + 'many' + , + 83 + '[Transformational character: Inapplicable if Zone III does not follow this configuration]' + 'five' + 'six' + 'seven' + 'eight' + , + 84 + '[Transformational character: Inapplicable if none present, or Zone III does not follow this configuration]' + 'four' + 'multiple of five' + 'multiple of six' + 'multiple of eight' + , + 85 + 
'[Transformational character: Inapplicable if number of elements in proximal circlet is not a multiple of five]' + 'five' + 'ten' + , + 86 + 'not reduced' + 'reduced' + , + 87 + '[Transformational character: Inapplicable if Zone III unarmed]' + 'uniform size' + 'alternate elements large then small' + , + 88 + 'elements lack prominent central spine' + 'elements with prominent central spine' + , + 89 + '[Transformational character: Inapplicable if proximal circlet not morphologically differentiated; ambiguous if reduced]' + 'straight' + 'strongly recurved (hooked)' + 'appendicules' + , + 90 + 'absent' + 'present' + , + 91 + 'not articulated' + 'articulated' + , + 92 + 'not reduced' + 'reduced' + , + 93 + 'armature not differentiated' + 'armature of proximal circlet (or few proximal circlets) is morphologically differentiated from rest of Zone III armature' + , + 94 + 'absent' + 'present' + , + 95 + '[Transformational character: Inapplicable if Zone III lacks armature, or only has 1-4 circlets]' + 'not reduced' + 'reduced' + , + 96 + '[Transformational character: Inapplicable if middle circlets absent; ambiguous if reduced]' + 'papillae or simple cone (no spine, wider than tall)' + 'single spine' + 'multiple spines' + 'pectinate' + , + 97 + '[Transformational character: Inapplicable if Zone III unarmoured]' + 'distal circlets not differentiated, or only differentiated in size or aspect ratio' + 'teeth in distal armature field morphologically distinct from teeth in other circlets' + , + 98 + '[Transformational character: Inapplicable if distal circlets not morphologically differentiated; ambiguous if reduced]' + 'papillae (no spine, wider and longer than tall)' + 'single spine' + 'multiple spines' + 'pectinate' + 'wide lamella or plate' + 'chain-like elements' + , + 99 + '[Transformational character: Inapplicable if Zone III unarmed or insufficient distal circlets to assess]' + 'approximately equal' + 'decreasing distally (distalmost elements less than half the size of 
proximal)' + , + 100 + 'absent' + 'present' + , + 101 + 'absent' + 'present' + , + 102 + '[Transformational character: Inapplicable if placoids absent]' + 'single undivided macroplacoid' + 'divided macroplacoids' + , + 103 + 'absent' + 'present' + , + 104 + 'absent' + 'present' + , + 105 + 'absent' + 'present' + , + 106 + '[Transformational character: Inapplicable if no apophysis for the insertion of the stylet muscle]' + 'hook shaped' + 'ventral ridge' + 'ridge shaped' + , + 107 + 'absent' + 'present' + , + 108 + 'no segment-like ring' + 'neck forms segment-like ring' + , + 109 + 'absent' + 'present (placids or lips)' + , + 110 + 'absent' + 'present' + , + 111 + '[Transformational character: Inapplicable closing apparatus absent]' + 'radial' + 'bilateral' + , + 112 + '[Transformational character: Inapplicable if cuticular neck plates absent]' + 'six' + 'seven' + 'nine' + 'twelve' + 'fourteen' + 'sixteen' + , + 113 + '[Transformational character: Inapplicable if cuticular neck plates absent]' + 'straight' + 'rectangular with straight margin and angular corners' + 'tripartite' + 'spikes present on anterior margin of plate' + , + 114 + '[Transformational character: Inapplicable if cuticular neck plates absent]' + 'fused with first trunk segment' + 'articulated' + , + 115 + 'absent' + 'present' + , + 116 + '[Transformational character: Inapplicable if amphids absent]' + 'round' + 'slit-like' + , + 117 + 'absent' + 'present' + , + 118 + 'absent' + 'present' + , + 119 + '[transformational character]' + 'absent' + 'present' + , + 120 + '[transformational]' + 'dorsal' + 'anterior' + , + 121 + '[transformational]' + 'oval/rounded' + 'elongate' + , + 122 + 'absent' + 'present' + , + 123 + '[transformational character]' + 'broad attachment to cephalic region' + 'narrow attachment to anterior edge of cephalic region' + , + 124 + 'absent' + 'present' + , + 125 + '[transformational character]' + 'subcircular' + 'elongate' + , + 126 + 'absent' + 'present' + , + 127 + 
'orientation of mouth is fixed relative to main trunk' + 'flexible anterior trunk allowing mouth''s dorsal-ventral orientation to be independent of main trunk axis' + , + 128 + 'anteriormost trunk contiguous with posterior trunk; no swollen ‘head’' + 'anteriormost trunk elliptical, substantially wider than adjacent trunk' + , + 129 + 'absent' + 'present' + , + 130 + 'Frontal filaments not incorporated into lip papillae' + 'Incorporated into lip papillae' + , + 131 + 'no sensory field' + 'sensory field present' + , + 132 + '[Transformational character: Inapplicable if Cirri A absent]' + 'Mid-head' + 'Posterior part of the head' + 'First trunk segment' + , + 133 + 'absent' + 'present' + , + 134 + 'absent' + 'present' + , + 135 + '[Transformational character: Inapplicable if occular structures absent]' + 'two' + 'four' + , + 136 + 'absent' + 'present' + , + 137 + 'eye stalks absent' + 'eye stalks present' + , + 138 + '[transformational character]' + 'approximately dorsal to mouth' + 'significantly posterior of mouth' + , + 139 + 'not sclerotized' + 'sclerotized' + , + 140 + 'absent' + 'present' + , + 141 + 'pre-ocular limb pair absent or not differentiated from other limbs' + 'distinct pre-ocular limb pair' + , + 142 + 'absent' + 'present' + , + 143 + 'no material differentiation of podomeres' + 'strong differentiation of proximal from distal podomeres' + , + 144 + '[transformational character]' + 'Distal podomeres approximately uniform size' + 'Distal podomere diameter strongly reducing distally' + , + 145 + '[transformational character]' + 'lateral' + 'ventral' + 'within mouth cavity' + , + 146 + 'frontal appendages not shifted posteriorly' + 'frontal appendages shifted posteriorly' + , + 147 + '[transformational character]' + 'pre-ocular appendages not directly adjacent' + 'pre-ocular appendages adjacent to one another, with or without physical fusion' + , + 148 + '[transformational character]' + 'basally adjacent' + 'bases separated by physical gap' + , + 149 + 
'[transformational character]' + 'pre-ocular appendages adjacent but not mechanically fused' + 'pre-ocular appendages are mechanically fused to form a single element' + , + 150 + 'no loss of claws on differentiated protocerebral appendage' + 'differentiated protocerebral appendage claws lost' + , + 151 + 'absent' + 'present' + , + 152 + '[Transformational character: Inapplicable if ventral spine series absent]' + 'one row' + 'two rows' + 'more than two rows' + , + 153 + '[Transformational character: Inapplicable if ventral spine series absent]' + 'comparable size to shaft' + 'significantly larger than shaft' + , + 154 + 'absent' + 'present' + , + 155 + '[Transformational character: Inapplicable if accessory spines absent]' + 'accessory spines originate near base of main spine' + 'accessory spines regularly spaced along main spine' + , + 156 + 'no alternation in length' + 'alternation in length from each spine to the next' + , + 157 + '[Transformational character: Inapplicable if ventral spine series absent]' + 'comparable width of spine to podomere width' + 'spine width significantly narrower' + , + 158 + '[Transformational character: Inapplicable if spine series absent]' + 'no increase (e.g., Anomalocaris)' + 'increase (e.g., Hurdia)' + , + 159 + '[transformational character]' + 'spine series point to other appendage' + 'spine series point outwards' + , + 160 + 'absent' + 'present' + , + 161 + 'absent' + 'present' + , + 162 + 'absent' + 'present' + , + 163 + 'absent' + 'present' + , + 164 + 'absent' + 'present' + , + 165 + 'absent' + 'present' + , + 166 + 'arthrodial membranes absent' + 'arthrodial membranes present' + , + 167 + '[transformational character]' + 'cylindrical/subconical appendage' + 'laterally expanded swimming flap' + , + 168 + 'undifferentiated, or differentiated in size only' + 'structurally differentiated' + , + 169 + '[transformational character]' + 'antenniform with distinct podomeres' + 'short great-appendage' + , + 170 + '[transformational 
character]' + 'ambulatory' + 'sensorial' + 'masticatory, with sclerotized jaw' + , + 171 + 'absent' + 'present' + , + 172 + 'undifferentiated' + 'specialized papilla' + , + 173 + '[transformational character]' + 'ambulatory limb with distinct podomeres' + 'specialized post-antennal appendage' + , + 174 + 'absent' + 'present' + , + 175 + '[transformational character]' + 'homonomous' + 'heteronomous' + , + 176 + 'annulations continue unaltered for full length of anterior trunk' + 'annulations becoming indistinct anteriad' + , + 177 + '[Transformational character: Inapplicable if annular rings absent]' + 'unbranched' + 'branched' + , + 178 + 'absent' + 'present' + , + 179 + 'absent' + 'present' + , + 180 + 'absent' + 'present' + , + 181 + 'projections absent' + 'angular projections on anterolateral corners of first sternites' + , + 182 + '[Transformational character: Inapplicable if sternites absent]' + 'straight' + 'medially incised' + , + 183 + 'absent' + 'spinose midventral process' + , + 184 + '[Transformational character: Inapplicable if sternal plates absent]' + 'second segment an undivided ring' + 'second segment divided into sternites and tergites' + , + 185 + 'as in segments 7+' + 'differentiated' + , + 186 + '[Transformational character]' + 'one tergal plate with midventral articulation' + 'one tergal and two sternal plates' + , + 187 + 'as in segments 7+' + 'differentiated' + , + 188 + 'absent' + 'present' + , + 189 + 'not extended' + 'spinose process extending well beyond posterior segment margin' + , + 190 + 'absent' + 'lateral terminal spines present' + , + 191 + 'absent' + 'lateral terminal accessory spines present' + , + 192 + 'absent' + 'midterminal spine present' + , + 193 + 'absent' + 'present' + , + 194 + 'entire' + 'deep lateroventral notches, with or without spines' + , + 195 + 'setae absent on sternal plates' + 'setae on sternal plates' + , + 196 + 'absent' + 'present' + , + 197 + 'absent' + 'present' + , + 198 + 'absent' + 'reniform, 
submillimetric lamellar' + , + 199 + '[transformational character]' + 'broadly uniform trunk width' + 'substantial posteriad trend to narrower trunk' + , + 200 + 'trunk of uniform construction' + 'anterior trunk differentiated from posterior trunk by abrupt change in thickness, armature and appendage construction' + , + 201 + 'absent' + 'present at some point during ontogeny' + , + 202 + 'absent' + 'present' + , + 203 + '[Transformational character: Inapplicable if flosculi absent]' + 'flosculi, including N-flosculi and P-flosculi' + 'sensory spots' + , + 204 + 'no petals' + 'petals' + , + 205 + '[Transformational character: Inapplicable if petals absent]' + 'variable' + 'invariably eight' + , + 206 + '[transformational character]' + 'absent' + 'present' + , + 207 + 'absent' + 'two transverse rows of accentuated papillae present' + , + 208 + 'absent' + 'ring of cuticular elements post-introvert (i.e. girdling neck / cervical region) present at any point in ontogeny' + , + 209 + 'absent' + 'present' + , + 210 + 'absent' + 'present' + , + 211 + '[Transformational character: Inapplicable if lorica absent at all stages in ontogeny]' + 'no plates; lorica comprises plicae' + 'one series of plates or plicae' + 'two series (cf. Sirilorica)' + 'four series (cf. 
Shergoldana)' + , + 212 + '[Transformational character: Inapplicable if lorical plates absent]' + 'six' + 'seven' + 'eight' + 'ten' + 'twenty' + , + 213 + 'plates equant' + 'dorsal and ventral plates enlarged' + , + 214 + 'absent' + 'present' + , + 215 + 'absent' + 'present' + , + 216 + 'absent' + 'present' + , + 217 + 'no more than a trace of phosphorous' + 'principally phosphatic in composition' + , + 218 + '[Transformational character: Inapplicable if plates absent]' + 'essentially circular' + 'elongated parallel to body axis' + 'acutely pointed, extended perpendicular to body axis' + , + 219 + 'absent' + 'present' + , + 220 + '[Transformational character]' + 'single node' + 'single ring' + 'two rings' + , + 221 + '[Transformational character]' + 'variable within an individual' + 'constant number' + , + 222 + '[Transformational character]' + 'single central node' + 'three to six' + 'eight to ten' + , + 223 + '[Transformational character: Inapplicable if not three to six nodes]' + 'three' + 'four' + 'five' + , + 224 + 'no differentiated anterior region' + 'anterior trunk with differentiated spinose sclerites' + , + 225 + '[Transformational character: Inapplicable if trunk sclerites not arranged in transverse series]' + 'complete rings' + 'transverse rows of limited extent that do not surround trunk' + , + 226 + '[Transformational character: Inapplicable if integumentary trunk sclerites absent]' + 'irregularly disposed' + 'in transverse fields (''rows'')' + 'in longitudinal fields (''columns'')' + , + 227 + '[Transformational character: Inapplicable if plates disordered]' + 'sclerites distributed irregularly within each annulation' + 'single primary field (or row) of sclerites / large plates on each annulation' + 'two separate primary fields of large plates on each annulation, one on each margin' + , + 228 + '[Transformational character; inapplicable if plates disordered]' + 'single series of sclerites' + 'sclerites occur in pairs along each field' + 'three rows 
of sclerites within each field' + 'four rows of sclerites within each field' + , + 229 + '[Transformational character: Inapplicable if integumental trunk sclerites not arranged in rows]' + 'linear; each transverse row identical to last' + 'alternate transverse rows offset, so sclerites produce quincunx' + 'no exact correspondence between sclerites of one row to the next' + , + 230 + 'no differentiated class of smaller platelets' + 'large plates and smaller platelets' + , + 231 + '[Transformational character; inapplicable if platelets absent]' + 'gaps between trunk sclerites and platelets' + 'tessellate to cover entire surface of organism' + , + 232 + 'absent' + 'present' + , + 233 + 'absent' + 'present' + , + 234 + 'no separate class of diminutive sclerites' + 'standard trunk sclerites accompanied by smaller sclerites (or tumuli)' + , + 235 + 'absent' + 'radial buttresses, giving stellate appearance' + , + 236 + 'absent' + 'present' + , + 237 + '[Transformational character: Inapplicable if enlarged sclerites absent]' + 'irregular distribution' + 'arranged in regular configuration' + , + 238 + '[Transformational character: Inapplicable if transverse bands not present]' + 'one' + 'two' + 'three' + 'four' + 'five' + 'six' + 'seven' + 'fourteen' + '20 to 25' + , + 239 + '[Transformational character: Inapplicable if not regularly spaced]' + 'Occur on every annulation' + 'Occur at lower frequency' + , + 240 + '[Transformational character: Inapplicable if irregular distribution]' + 'regular' + 'variable' + , + 241 + 'absent' + 'present' + , + 242 + '[Transformational character; Inapplicable if not multiple transverse bands of sclerites]' + 'each group of dorsal elements of equivalent size' + 'size of dorsal elements varies between groups' + , + 243 + 'absent' + 'present' + , + 244 + '[Transformational character: Inapplicable if enlarged sclerites absent]' + 'wider than tall (e.g. nodes or plates)' + 'taller than wide (e.g. 
spines)' + , + 245 + 'absent' + 'present' + , + 246 + '[Transformational character: Inapplicable if epidermal evaginations absent or lack an acute distal terminus]' + 'absent' + 'present' + , + 247 + '[Transformational character: Inapplicable if enlarged sclerites absent]' + 'round' + 'straight' + 'rectangular with straight margin and angular corners' + 'spikes present on anterior margin of plate' + , + 248 + '[Transformational character: Inapplicable if enlarged sclerites absent]' + 'weak' + 'substantial' + , + 249 + 'absent' + 'present' + , + 250 + '[Transformational character: Inapplicable if enlarged sclerites absent]' + 'unornamented' + 'honeycomb surface ornament (cf. Nanaloricus)' + 'regular perforations (cf. Tabelliscolex)' + 'net-like holes (cf. Microdictyon)' + 'scaly' + 'tufted' + , + 251 + 'absent' + 'present' + , + 252 + 'not sclerotized' + 'sclerotized' + , + 253 + 'absent' + 'present' + , + 254 + 'absent' + 'present' + , + 255 + '[Transformational character: Inapplicable if trunk exites absent]' + 'lateral lobes' + 'setal blades' + 'simple oval paddle with marginal spines' + 'bipartite shaft with lamellar setae' + , + 256 + 'not fused' + 'fused' + , + 257 + 'absent' + 'present' + , + 258 + 'absent' + 'present' + , + 259 + '[transformational character]' + 'confined laterally' + 'present dorsally' + , + 260 + '[Transformational character]' + 'cylindrical (e.g. Hallucigenia sparsa)' + 'conical; significantly tapered (e.g. 
Aysheaia)' + , + 261 + 'absent' + 'present' + , + 262 + '[transformational character]' + 'spines/setae' + 'appendicules' + , + 263 + '[transformational character]' + 'arranged in rows' + 'one or two spines' + , + 264 + '[transformational character]' + 'short/equant' + 'needle-like' + , + 265 + 'absent' + 'present' + , + 266 + 'absent' + 'present' + , + 267 + 'spine absent' + 'spine present' + , + 268 + 'absent' + 'present' + , + 269 + 'absent' + 'present' + , + 270 + '[Transformational character: Inapplicable if claws absent]' + 'no enlarged base (e.g. Paucipodia''s claws)' + 'enlarged base (e.g. Onychophora claws)' + , + 271 + '[Transformational character]' + 'terminal' + 'sub-terminal' + , + 272 + 'absent' + 'present' + , + 273 + '[Transformational character: Inapplicable if branched claws absent]' + 'seperated' + 'fused' + , + 274 + '[transformational]' + 'Aysymmetrical (2121)' + 'Symmetrical (2112)' + , + 275 + '[Transformational character: Inapplicable if branched claws absent]' + 'rigid' + 'flexible' + , + 276 + '[transformational]' + 'Right-angled' + 'Curved' + , + 277 + '[Transformational character: Inapplicable if claws unbranched]' + 'not connected' + 'connected' + , + 278 + '[Transformational character: Inapplicable if claws unbranched]' + 'symmetrical' + 'asymmetrical' + , + 279 + 'direct' + 'with a flexible connection' + , + 280 + '[Transformational character: Inapplicable if claws unbranched]' + 'not perpendicular' + 'perpendicular' + , + 281 + 'undivided' + 'divided' + , + 282 + 'absent' + 'present' + , + 283 + '[Transformational character: Inapplicable if claws unbranched or base not extended]' + 'basal thickening' + 'pseudolunules' + 'lunules' + , + 284 + '[Transformational character: Inapplicable if claws unbranched]' + 'not connected' + 'connected' + , + 285 + '[Transformational character: Inapplicable if claws unbranched]' + 'symmetrical' + 'asymmetrical' + , + 286 + 'direct' + 'with a flexible connection' + , + 287 + '[Transformational 
character: Inapplicable if claws unbranched]' + 'not perpendicular' + 'perpendicular' + , + 288 + 'undivided' + 'divided' + , + 289 + 'absent' + 'present' + , + 290 + '[Transformational character: Inapplicable if base extension absent]' + 'basal thickening' + 'pseudolunules' + 'lunules' + , + 291 + '[transformational character]' + 'one' + 'two' + 'three' + 'four' + 'six' + 'seven' + , + 292 + '[transformational character]' + 'equal number of claws on all claw-bearing appendages' + 'variable number of claws' + , + 293 + '[transformational character]' + 'claws on single limb all identical' + 'claws on single limb differentiated' + , + 294 + 'absent' + 'present' + , + 295 + 'absent' + 'present' + , + 296 + 'absent' + 'present' + , + 297 + 'absent' + 'present' + , + 298 + '[transformational character]' + 'absent' + 'even body outline' + 'present' + 'pronounced decrease in lobe width posteriad' + , + 299 + 'first pair of trunk limbs comparable in size to subsequent pairs' + 'first pair of trunk limbs hypertrophied' + , + 300 + 'no reduction of anterior limbs' + 'anterior limbs reduced in size or absent' + , + 301 + 'absent' + 'present' + , + 302 + '[transformational character]' + 'two' + 'three' + 'five' + 'six' + , + 303 + '[transformational character]' + 'slender, simple' + 'cirrate' + , + 304 + '[transformational character]' + 'Fewer than 15 podomeres' + '15 or more podomeres' + , + 305 + 'absent' + 'present' + , + 306 + 'absent' + 'present: tubular portion of the body extends beyond the last observable appendage pair' + , + 307 + 'absent' + 'present; tubular portion of the body extends beyond the last observable appendage pair' + , + 308 + 'absent' + 'present' + , + 309 + '[transformational character]' + 'same direction as claws on other appendages' + 'rotated anteriad' + , + 310 + 'undifferentiated' + 'differentiated' + , + 311 + '[Transformational character: Inapplicable if posteriormost appendages not differentiated]' + 'appendicular tail' + 'partially 
fused/reduced walking legs' + , + 312 + '[Transformational character: Inapplicable if posterior trunk appendages do not form a differentiated tail]' + 'tail rami' + 'tail flaps' + , + 313 + '[Transformational character: Inapplicable if posterior trunk appendages do not form a differentiated tail]' + 'blade-like' + 'paddle-like' + 'elongate filament or spine' + , + 314 + 'absent' + 'present' + , + 315 + 'absent' + 'present' + , + 316 + 'not eversible' + 'eversible' + , + 317 + '[Transformational character: Inapplicable if caudal appendage absent]' + 'shorter than body' + 'longer than body' + , + 318 + '[Transformational character: Inapplicable if caudal appendage absent]' + 'undivided' + 'pseudo-segmented' + , + 319 + '[Transformational character: Inapplicable if caudal appendage absent]' + 'single' + 'bicaudal' + , + 320 + '[Transformational character: Inapplicable if caudal appendage absent]' + 'terminal' + 'dorso-medial' + , + 321 + '[Transformational character: Inapplicable if caudal appendage absent]' + 'smooth' + 'vesiculate' + 'bearing large warts' + , + 322 + 'absent' + 'present' + , + 323 + 'absent' + 'present' + , + 324 + '[Transformational character: Inapplicable if posterior projections absent]' + 'non-sclerotized tubulae' + 'sclerotized sclerites or setae' + , + 325 + '[Transformational character: Inapplicable if posterior projections absent]' + 'smaller' + 'larger' + , + 326 + '[Transformational character: Inapplicable if posterior projections absent]' + 'two' + 'three' + 'four' + 'six' + 'eight' + , + 327 + '[Transformational character: Inapplicable if single pair or no posterior projections]' + 'irregular' + 'bilateral arc' + 'radial ring' + , + 328 + 'absent' + 'present' + , + 329 + 'absent' + 'present' + , + 330 + 'absent' + 'present' + , + 331 + 'small' + 'large' + , + 332 + 'peripheral longitudinal and circular muscle' + 'metamerically arranged skeletal muscle' + , + 333 + 'absent' + 'present' + , + 334 + 'absent' + 'present' + , + 335 + 
'[Transformational character; Inapplicable if longitudinal musculature absent]' + 'anterior and posterior of trunk only' + 'successive attachment points along the body' + 'attached laterally, to chords of the epidermis' + , + 336 + '[Transformational character: Inapplicable if tegumental plates lacking]' + 'pachycycli at anterior segment margins' + 'anterior or central part of tegumental plates' + , + 337 + 'absent' + 'present' + , + 338 + '[Transformational character; Inapplicable if circular or longitudinal musculature absent]' + 'circular muscles inside longitudinal' + 'longitudinal muscles inside circular' + , + 339 + 'not reduced' + 'reduced in segment 1 only' + , + 340 + 'absent' + 'present' + , + 341 + 'absent' + 'present' + , + 342 + 'absent' + 'present' + , + 343 + '[Transformational character]' + 'intraepithelial' + 'basiepithelial' + , + 344 + 'unpaired' + 'paired' + , + 345 + 'no differentiation of nerve cords' + 'paired nerve cords differentiated in size or extent' + , + 346 + 'no fusion of nerve cords: unpaired or paired for full length' + 'merge caudally' + , + 347 + 'absent' + 'present' + , + 348 + '[Transformational character: Inapplicable if nerve cord unpaired]' + 'ventral (Alalcomenaeus, Fuxianhuia, Tardigrada)' + 'lateralized (Onychophora)' + , + 349 + 'medial interpedal commissures absent' + 'medial interpedal commissures present' + , + 350 + 'hemiganglia absent' + 'morphologically discrete condensed hemiganglia connected by medial commissures' + , + 351 + 'absent, or not occurring regularly along entire length of nerve cord' + 'present along entire length of nerve cord' + , + 352 + 'not orthogonally organized' + 'orthogonally organized' + , + 353 + 'ring commissures incomplete or absent' + 'complete ring commissures' + , + 354 + '[Transformational character: Inapplicable if leg nerves absent]' + 'not shifted anteriorly' + 'shifted anteriorly' + , + 355 + '[Transformational character: Inapplicable if leg nerves absent]' + 'unpaired' + 'paired' 
+ , + 356 + 'absent' + 'present' + , + 357 + 'absent' + 'present' + , + 358 + 'absent' + 'present' + , + 359 + 'absent' + 'present' + , + 360 + '[Transformational character: Inapplicable if dorsal condensed brain absent]' + 'one' + 'two' + 'three' + , + 361 + 'protocerebral innervation, or innervated by circumoral nerve ring' + 'deutocerebral innervation' + 'innervation from multiple neuromeres' + 'tritocerebral innervation' + , + 362 + 'absent' + 'present' + , + 363 + '[Transformational character: Inapplicable if dorsal nerve cord absent]' + 'unpaired' + 'paired' + , + 364 + 'equal distribution of perikarya' + 'brain consisting of perikarya-neuropil-perikarya' + , + 365 + '[Transformational character: Inapplicable if brain not of cycloneuralian pattern]' + 'apical perikarya lost' + 'apical perikarya retained' + , + 366 + 'absent' + 'present' + , + 367 + 'absent' + 'present' + , + 368 + 'absent' + 'present' + , + 369 + 'no' + 'yes' + , + 370 + 'absent' + 'present' + , + 371 + 'absent' + 'present (whether or not reduced)' + , + 372 + 'absent' + 'present' + , + 373 + 'short' + 'long' + , + 374 + 'distal bulb absent' + 'distal bulb present' + , + 375 + 'absent' + 'present' + , + 376 + 'anterior gut similar diameter to mid gut' + 'expanded anterior gut' + , + 377 + 'absent' + 'present' + , + 378 + 'fused with first trunk segment' + 'articulated' + , + 379 + 'not integrated into the gonad' + 'integrated into, or flow into, the gonad' + , + 380 + 'absent' + 'present' + , + 381 + 'absent' + 'present' + , + 382 + 'circumciliary microvilli absent' + 'circumciliary microvilli present' + , + 383 + 'absent' + 'tube composed of plant debris' + 'tube comprised of chitin' + , + 384 + 'spermatozoa lack flagellum' + 'spermatozoa with flagellum' + , + 385 + 'alpha-chitin' + 'collagen' + , + 386 + 'exocuticle (middle cuticle layer)' + 'endocuticle (lowermost cuticle layer)' + , + 387 + 'composition not distinct' + 'distinct composition' + , + 388 + 'membrane without nuclei or simply 
with ameobocytes in association with the surface' + 'membrane containing scattered nuclei' + , + 389 + 'absent' + 'present' + , + 390 + 'absent' + 'present' + , + 391 + 'absent' + 'present' + , + 392 + 'absent' + 'present' + , + 393 + 'unornamented; smooth' + 'ornamented' + , + 394 + 'direct' + 'biphasic (or multiphasic)' + , + 395 + 'absent' + 'present' + , + 396 + '[Transformational character: Inapplicable if direct development, or larva without defined neck]' + 'larval neck smooth' + 'larval neck crenulated' + , + 397 + 'absent' + 'present' + , + 398 + 'division not evident' + 'body divided' + , + 399 + 'absent' + 'present' + , + 400 + 'absent' + 'present' + , + 401 + 'absent' + 'present' + , + 402 + '[Transformational character: Inapplicable if no buccal canal]' + 'short, linear or sacculose' + 'elongate, curving' + , + 403 + 'not present in both sexes' + 'present in both sexes' + , + 404 + 'absent' + 'present' + , + 405 + '[Transformational character; inapplicable if no Higgins larva]' + 'trunk wider than head' + 'trunk and head same width' + 'head wider than trunk' + , + 406 + '[Transformational character]' + 'thorax shorter than abdomen' + 'thorax longer than addomen' + , + 407 + '[Transformational character]' + 'plates' + 'wrinkles' + , + 408 + '[Transformational character: Inapplicable if thorax not wrinkled]' + 'irregular wrinkles' + 'zigzag wrinkles' + , + 409 + '[Transformational character: Inapplicable if no Higgins larva]' + 'plates' + 'plicae' + , + 410 + 'absent' + 'present' + , + 411 + 'absent' + 'present' + , + 412 + 'absent' + 'present' + , + 413 + 'absent' + 'present' + , + 414 + 'absent' + 'present' + , + 415 + 'row missing' + 'six or seven scalids' + 'ten or more scalids' + , + 416 + 'absent' + 'present' + , + 417 + 'absent' + 'present' + , + 418 + 'absent' + 'present' + , + 419 + '[Transformational character]' + 'tripartite locomotory setae' + 'single unit, eventually branched' + , + 420 + 'absent' + 'present' + , + 421 + 'absent' + 'present' 
+ , + 422 + 'absent' + 'present' + , + 423 + '[Transformational character: Inapplicable if toe absent]' + 'spinous' + 'elongate with abrupt tapering' + 'stout' + , + 424 + 'absent' + 'present' + , + 425 + 'absent' + 'present' + + ; + MATRIX + 'Orstenoloricus shergoldii' ??0000-????????????????????????????????????????????????????????????????????????????????????????????????????000----??00---0-0-000000-?0-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-00000000000?210????101?02501000------0-----0-10000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0????????????????????????????????????????????????????????????????????????????????02?1?10??1?12210?????????000-00 + 'Gastrotricha' 01?00??1?0--0--?00-00-----------0000000???0????---?10----000----0-0--??????0---------0-----000-----00-00??0?00----0-00---0-0-00????-?1?00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000???0-0-?0000--?0000------0-----0-00000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0??1??000-????-1?0??0?1?00?????????0????12010?00?00-00?010???010-0??000?00??0?????0-----000-0-000-000-00 + 'Lineus' ???0????????????????????????----00??000???0?????????0----000----0-0?-??????0---------0-----000-----00-00????00----0-00---0-0-00????-?1?00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000????????0000--?0000------0-----0-00000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0?????????????-??0??0????????????????????????????0?????0?????01???????????????????0-----000-0-000-000-00 + 'Solenogastres' 
??10????????????????????????----00??000???0?????????0----000----0-0?-??????0---------0-----000-----00-00????00----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000?????????000--?1010------?-1---0-00000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0?????????????-??0??0????????????????????????????0?????0?????01???????????????????0-----000-0-000-000-00 + 'Nereis' ?31100-0???-0--?00-00-----------0000000???0?????-???0----000----0-0--??????0---------0-----000-----00-00??0?00----0-00---0-0-00????-?1?00-00??????????0--0-----?0????0?0?????11??10-0-0-0-000000000000???0-0-?0000--?0000------0-----0-00000----0-0-0---0-0?00-0?0??0---?0??0--0------0-00---0-00----??-0???0?????00-0???000-----0??1??0000????-??0?0??10001????????0?1-?0-0?-??0?00-0??010???01?????????10??0?????0-----000-0-000-000-00 + 'Ancalagon minor' ?20000-1?1??0--?00-01?2212212?120000000???0??11?3?1?0----000----0-00??????01?2???1-???11200?002-1-10???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000??0?????0?0????1010------?-1---0-?000??-????0-?-?-?-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----?00-????-??0????????????????????????????????0-0??0?????0?????????????????????????????????????????? + 'Fieldia lanceolata' ?30000-1012?0--?00-01-?212222?1?0000000???00?0-?--??1111-0??27110-00???????1?????1-???????0????????0????0-?000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-000000000000110?????0000--?1010------?12--20-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----01211-?00-????-???????????????????????????????????0-0??0?????0?????????????????????????????????????????? 
+ 'Scolecofurca rara' ???000-1????0--?00-01?221??12?120?0?000???00?1??????11??????2?21??00???????1?2??15-????????????????0??????1000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000??0?????0000--?1?10------?12--?0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0?????????????-???????????????????????????????????????0??????0?????????????????????????????????????????? + 'Markuelia lauriei' ?20000-1?12?0--?00-01-221--121120000000110001?1?????1??1-???27?1??0???????????2???????????0??0?????0???0???000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-00000000000??000-0-?0000--??0?0?0----?-1---0-??00??-????0-?---?-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---?00-----01224??00-????-???????????????????????????????????0-0??0????????????????001-0000-0?????????????????????? + 'Shergoldana australiensis' ?10000-1????????????1?12??????120000000001?????????????????????????????????????????????????????????0???????000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110200-0-0-0-00000000000??100-0-?01?04??1?10------?12--10-0??01241???011222010000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----012(1 2)1-000-????-???????????????????????????????????0-00???????0???????????02?1?01???????????????????????? + 'Xinliscolex intermedius' 111000-?????0--?00-0?????????????????????????11??(1 2)1????????????????????????11?2?15???????????0121-100-000-0???????0-00---0-0-000?????0-00-00000--0---00--0-----0000000-0--00-11?200-0-0-0-000000000000??0????10000--?1100------0-----0-?00012(2 3)2202011212010000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----00??????-???????????????????????????????????0-00???????0?????????????????????????????????????????? 
+ 'Shanscolex decorus' ???0?0-1?1??0--?00-01?2211-?21120000000001?????????????????????????????????????????????????????????0???????000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-00000000000??100-0-?0??????101?30----0122130-00001112???011112?10000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0?????????????-???????????????????????????????????0-00?????????????????????????????????????????????????? + 'Qinscolex spinosus' 1??0?0-1?1??0--?00-01?2211-12112000000000000?11???10??????????????????????011?2115????11?00?00121-?00-000-0000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-11?100-0-0-0-00000000000??000-0-10000--?111?30----?122130-?0001?(1 2)2?0?011212010000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0?????????????-???????????????????????????????????0-00?????????????????????????????????????????????????? + 'Zhongpingscolex qinensis' ?11000-1?1??0--?00-01??211-?21?20000000????????????????????????????????????????????????????????????0???????000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-00000000000??000-0-?0??????101?30----0122130-00001122???010-12?10000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-????-???????????????????????????????????0-00???????0?????????????????????????????????????????? + 'Eokinorhynchus rarus' ?11000-1?1??0--?00-0112212212?110000000???0??11?(1 2)1??0----000----0-00110-000112211(2 5 3 4)-4-01110000012???00-000-0000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-0000000000002100-0-10000--?101??0----????????10001222202011212010000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0122320000????-???????0000-00?????????????????????0-00???????0?????????????????????????????????????????? 
+ 'Eopriapulites sphinx' 110000-10???0--?00-01?2212112112000000000100?11?????1111-00027??0-002?????0112211(2 5 3)-3-01110000012???0???????000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110200-0-0-0-00000000000?2000-0-10000--?1000------0----?0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-????-???????0000-00?????????????????????0-00???????0?????????????????????????????????????????? + 'Eolorica deadwoodensis' ?10000-1?0-?1???????1?22????21210011000?????????????1??1-???2211??????????????????????????0??0?????0???????000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-?10?00-0-0-0-00000000000???0?????01102501000------0----10-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0??????00-????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Nanaloricus mysticus' 010000-1?0-1115110211122121121210011000001001111(2 3)?310----000----0-0011121?01?????????????????02-1-?0??????1000----0-00---0-0-00?000-00-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000210110-?01102111000------0----10-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----01211-000-0???-1?00-0?1000?????????100-012120?01?00-00?011???010110?100?10211111211111-11000010001011100 + 'Armorloricus elegans' ?10000-1?12111???0211?22121121????????????001111(2 3)?310----000----0-0?11121?0??????????????????02-1-?0??????1000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-00000000000021??????01102111000------0----?0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----01211-?00-0???-1?00?0??0????????????0?????????????????????????????????????????????1111-11000000001011100 + 'Spinoloricus turbatio' ?1?000-1?1211????0211?22121121?????????????0?111(2 
3)?310----000----0-0?1112010?12?112-3-0111000102-26100-000-1000----0-00---0-0-000000-0?????00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000210111?101102311?00------0----?0-??????????????????20000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0?????????????-??00?0?????????????????????????????????????????????????????????????1111-11000010001011100 + 'Rugiloricus carolinensis' ?1?000-1?1211151102011221211212????????????00111(2 3)?31????????????????1???0??????????????????????????0??????????????0-??????????00000-0?????00000--0---00--0-----0000000-0--00-?????0-0-0-0-00000000000?2100-0-?01101-00000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0???????10????-??00?0?????????????????????????????????????????????????????????????1111-20010000002101300 + 'Pliciloricus corvus' 010000-1?121115110211122121121210011000001000111(2 3)2310----000----0-0011??0?01122112-1-0111000102-26200-000-1000----0-00---0-0-000???-00-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000210110-?011?1-00000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----01211-00100???-1?00-0?1000?????????100-0??????01?00-00?011???010110?100?10211111??1111-20111020102101201 + 'Urnaloricus ibenae' ???????1??????????????????????????????????????????????????????????????????????????????????????????????????????????0-???????????????????????????????????????????????????????????????????????????????????????????1?01-0???0????????????????????????????????????????????????????????????????????????????????????????????????????????0???????????????????????????????????????????????????????????????????????102?1?111?1121-20111020102101111 + 'Wataloricus japonicus' 010000-1?121115110201122121121210000000101000111(2 
3)2310----000----0-001112000112211?-??????????02-22?00-000-1000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000210????101102-?0000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----01211-000?0???-??00-??????????????????????????????0??00??????0??????????102?1?112?1122120001120112101200 + 'Tenuiloricus shirayamai' ?1??00-???????????????????????????????????????????????????????????????????????????????????????????????????????????0-???????????????????????????????????????????????????????????????????????????????????????????1?0-?????0????????????????????????????????????????????????????????????????????????????????????????????????????????0???????????????????????????????????????????????????????????????????????102?1?11??1322220000121002001110 + 'Patuloricus tangaroa' ?1?000-1?121????????1?22121??1?????????????0?111(2 3)?31????????????????1?????????????????????????????????????????????0-??????????????????????00000--0---00--0-----0000000-0--00-??????????????????????????????????????????00------0----?0-????????????????????000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0?????????????-???0???????????????????????????????????????????????????????????????1322220000121112101200 + 'Scaberiloricus samba' ?1?000-1?121????????1?22121??1?????????????0?111(2 3)?31????????????????1?????????????????????????????????????????????0-??????????????????????00000--0---00--0-----0000000-0--00-??????????????????????????????????????????00------0----?0-????????????????????000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---????????0?????????????-???0???????????????????????????????????????????????????????????????1222220011120112101200 + 'Franciscideres kalenesos' 
021000-1???10--?00-0112211-?21210000000100000112313?1111-00026210-00100-0001122112-22111101010121-100-000-01001---0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-0-0-111010101100110010000200110--0000--?1000------0-----0-10000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100220-0001?10??1??000--0100-012120?00000-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Antygomonas paulae' 011000-1?121124?0???112211-?21210?????11000??11231301??1-???2??1??00??0-0001?2???2-???1???1??011??-1??????001126120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-11102020200011111011020?110-?0000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-1101?101?1??000--0100-012120?00?00-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Campyloderes cf vanhoeffeni' 011000-1?121124?1030112211-?2121021000?1000??11231301??1-???2??1??00??0-0001?2???2-???1???1??011??-???????001115120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-111010202000111?00110200110-??000--?1000------0-----0-?0??0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-?101?10??1??000--0100-012120?00?00-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Centroderes spinosus' 011000-1???1124?0???112211-?21210?????11000??1123130???1-??????1??00??0-000???????-???????1??0?????1??????001116120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-111011202000111?0011020?110-?0000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-?101?10??1??000--0100-012120?00?00-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Echinoderes dujardinii' 
011000-1?12112111?30112211-?212100100001000??11231300----000----0-00??0-0001?22??2-22111111010111--???????001116120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-111010102000110000010200110-?0000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-1101210111??000--0100-012120?00?00-000010?1?010010??00?001-0000-00-----000-0-000-000-00 + 'Zelinkaderes klepali' 021000-10121124?0010112211-12121020000110000011231301111-00026210-00100-0001122112-22111111010141--10-000-001016210-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-11101010100011111000020?110-?0000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100220-1101?101?1??000--0100-012120?00?00-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Cateria gerlachi' 021000-10??112410030112211-?21211?????01000??11231301111-00026210-001?0-0001122112-22111(1 2)01010121-?00-000-001014120-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-0-0-11101021100011101000020?110-?0000--?1000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100220-0001?10??1??000--0100-012120?00?00-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Dracoderes abei' 011000-1?121123?0030112211-?212100100011000??112313?1??2????2??1??00??0-0001?2???2-???1???11?011??-???????001123120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-111010202000100000010200110-?0000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-?001?10??1??000--0100-012120?00?00-000010???010010??00?001-0000-00-----000-0-000-000-00 + 'Paracentrophyes anurus' 
011000-1?121124?0???112211-?212100000001000??112313?0----000----0-00??0-0001?22???-22111111110????1???????001122120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-111110202110101?01100200110-??000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-?001?10??1??000--0100-012120?00?00-0000?????010010??00?001-0000-00-----000-0-000-000-00 + 'Pycnophyes zelinkaei' 011000-1???1124?0???112211-?212100000001000??1123131???1-??????1??00??0-0001?22???-???????0??0????1???????001121120-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-111110202010100001110200110-?0000--?1000------0-----0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-100210-0001?10111??000--0100-012120?00?00-0000101?0010010?100?001-0000-00-----000-0-000-000-00 + 'Chordodes' 130020-111200--?00-01?221??12?12100000000100?1111?210----000----0-0-100-0001112325?00011100000121--00-010-0000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000100120-?0000--?1010------?-1---0-10100----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-0????0-000021010-00??????110-011120110?00-00?00000-00110???11?10101101?10-----000-0-000-000-00 + 'Nectonema' 130000-1?1200--?00-01-22?--?211?0000000???0??1111?2?0----000----0-0-?00-0001?1????????????0??0?????00-010-0000----0-00---0-0-00?000-?0-00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-0000000000001000-0-?0000--?0000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-0???-0-000021010-00??????110-011120-10?00-00?00000-00110???11?1????1??210-----000-0-000-000-00 + 'Euchromadora' 
130010-10??0??12001?0-----------000000000110?0-1--1?0----000----0-00100-0001122112---?111000101122-00-000-0000----1200---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-0000000000002000-0-10000--??000------0----?0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----10----000?0103?0-?000?????????00-?010???????01????0???0????????1???0??????????????0-----000-0-000-000-00 + 'Odontophora' 130010-10??0??12001?11222--221120(0 1)00000001?000-1--1?0----000----0-0?-00-00?0---------0-----000-----?0-000-00?0----1100---0-0-000000-?0-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-00000000000?2000-0-10000--?1000------0----?0-10000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----10----000?0103?0-0?00????????000?-?1?????????????????000????????????0?????????????0-----000-0-000-000-00 + 'Kinonchulus' 130010-1012?1?12011?112212112?12000000000110?0-2--111111-00027210-00200-000112?113---010-0000011--?00-000-0000----1200---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-0000000000002000-0-10000--?0000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-0103-0-0000?1110-00000--0100-011120????????????????01???????????????????0-----000-0-000-000-00 + 'Anatonchus' 130000-100-01?12001?0-----------00000000011010-1--110----000----0-00-00-0000---------0-----000-----00-010-0000----1100---0-0-000???-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-0000000000002000-0-10000--?0000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----00----000-0103-0-000011110-00000--0100-011120100?00-00000000-00111?00(0 1)1?10100000??0-----000-0-000-000-00 + 'Acanthopriapulus horridus' 110020-101210--?00-01222121121110000000110000112?210??????????????0?100-0001222115?21011(1 
2)00??02-24100-000-0000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000100101?20?0????1010------?-1---0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0101212200----100-01??-120001?0000-00???--0100-012121?01??0-00?011???010??10??0???????????0-----000-0-000-000-00 + 'Halicryptus spinulosus' 110020-1?1210--?00-012221211211201000001100??1123(1 2)101??1-???1711??00?00-0001?22??5421020-00010131-20???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?000-0-?01012311010------?-1---0-?000??81???0-?-2-?20000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----01211-10100???-1?0?01?0000-00??????100-012121?01?10-00?011???010111??00?11111001100-----000-0-000-000-00 + 'Maccabeus' 110000-1?1210--?00-012121211211201000001100??1123(1 2)101??1-???24?2??00?00-0001?22??5321020-00010132210???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?011111?0000--?0000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0121?3010-0???-1?0?01?0000-00??????100-012121?01?10-00?011???110??0???0?11111000100-----000-0-000-000-00 + 'Meiopriapulus fijiensis' 110000-1?1210--?00-011221211211202000101100??1122(1 2)101??1-???2411??00?00-0001?22??5-4-010-00000141-10???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000?101112?0000--?1010------?-1---0-?11???-????0-?---?-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0121?3000-0???-1?0?01?0000-00??????100-012111?01?10-00?111???010??0?110?001-0000-00-----000-0-000-000-00 + 'Priapulopsis bicaudatus' 
110020-1?1210--?00-012122211211100001001100??11232101??1-???2411??01?00-0001?22??5121011110010131-20???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?0?0-0-?01012311010------?-1---0-?011??-????0-?---?20000-000--0---00000--0------0-00---0-00----0000-000---0000-0---010112-200----100-0???-1?0?01?0000-00??????100-012121?01?10-00?011???010??1?????11111?01??0-----000-0-000-000-00 + 'Priapulus caudatus' 110020-1?1210--?00-012222211211100000001100??11232101??1-???2411??01?00-0001?22??5321010-00000131-20???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?010-0-?01012311010------?-1---0-?000??81???0-?-2-?20000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0101212200----10110???-1?0?0110000-00??????100-012121?01?10-00?011???010011??00?11111001100-----000-0-000-000-00 + 'Tubiluchus lemburgi' 110020-101210--?00-0122212112112020000011000?1122(1 2)1011?1-00024111100100-0001222?15?21010-000101424100-00--0000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000110111220000--?1010------??3--?0-?1110----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0102111100----000?0101-12000110000-00111--0100-012111001011011?111???010??001?0?10111101100-----000-0-000-000-00 + 'Tubiluchus vanuatensis' 110020-1?1210--?00-012221211211202000001100??1121(1 2)101??1-???1711??00?00-0001?22??5-???1???0??0142410???0??0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000??01111?0100250?0?0------???--?0-?111??91???0-?-2-?10000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0101111100----000-0???-1?0?0110000-00??????100-012111?01?11????111???010??0?1?0?10111101100-----000-0-000-000-00 + 'Euperipatoides' 
12110120?0-00--?00-00-----------0000000???01?0-1--110----000----0-0?-00-0000---------0-----000-----00-00--0000----0-00---0-0-000110?01100-00100-101--10--0-----000000011-301-11?200-0-0-0-0000000000001000-0-20000--?1110------??2--30-10000----0-0-0---0-0000-000-20---10101210----??????????????21110-0-000---010010---000-----00----000?0111-110010?1000210111120101220-0?0100000?00?0000??01011?0?00?00100000?00-----000-0-000-000-00 + 'Plicatoperipatus' 12?1?12????00--?00-00-----------0000000???01?0-1--1?0----000----0-0?-00-0000---------0-----000-----00-00--??00----0-00---0-0-000110?01100-00100-101--10--0-----000000011-311-11?200-0-0-0-00000000000010?????2?000--?1110??????????????1???0----0-0-0---0-00?0-000-20---10101210----??????????????21110?0-00?---01?010---????????0?????????0111-110010?10?021011112010122?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Ooperipatellus' 12?1?12????00--?00-00-----------0000000???01?0-1--1?0----000----0-0?-00-00?0---------0-----000-----00-00--??00----0-00---0-0-000110?01100-00100-101--10--0-----000000011-311-11?200-0-0-0-00000000000010?????2?000--?1110??????????????1???0----0-0-0---0-00?0-000-20---10101210----??????????????21110?0-00?---01?010---????????0?????????0111-110010?10?021011112010122?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Archechiniscus bahamensis' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Batillipes pennaki' 111100-11???0--?00-00-----------0000000???00?0-1--1?111(1 
2)-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-????????0?0???????????????????????-?-?????0??0?0-000-2112???01??1????????????????????1?01?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Batillipes phreaticus' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-????????0?0???????????????????????-?-?????0??0?0-000-2112?0??1??1????????????????????1?00?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Coronarctus yurupari' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Coronarctus laubieri' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Dipodarctus susannae' 
111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41201?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Wingstrandarctus unsculptus' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????1??0?0-000-2112?0?001210----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Neoarctus primigenius' 111101?11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100311100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????0?-?????1??0?0-000-2112?0?001220----??????????????31101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Neostygarctus oceanopolis' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?-?????1??0?0-000-2112?0?001110----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Renaudarctus fossorius' 
111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?-?????1??0?0-000-2112?0?001210----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Mesostygarctus spiralis' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?-?????1??0?0-000-2112?0?001210----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Parastygarctus renaudae' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?-?????1??0?0-000-2112?0?001210----??????????????41201?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Raiarctus jesperi' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Styraconyx nanoqsunguak' 
111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Actinarctus neretinus' 111100-111?00--?00-00-----------000000000100?0-1--111111-0??2211?10-?00-00?1?21?1??--010-0000?--1--00-000-0000----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-0000000000001000-0--??00-??1000------0----?0-1???12???-?-10--2020000-000-2112?01001210----??????????????41101-0-000---0010212--000-----00----000-1012-0-01-0?100111111022?101100-0-00?0100-00?0000?-01011?1?00?00100000-00-----000-0-000-000-00 + 'Isoechiniscoides sifae' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-??0??0?0-000-2112?0?001210----??????????????51101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Neoechiniscoides aski' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-??0??0?0-000-2112?0?001210----??????????????61101?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Oreella chugachii' 
111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-00---0-0-000100111100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-2112?0?001210----??????????????41201?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??1??????????????????1??????????????0-----000-0-000-000-00 + 'Echiniscus testudo' 111100-11??00--?00-00-----------0000000???00?0-1--111111-0???????1??-00-0000---------0-----000-----00-000-??00----0-01---0-0-000100211100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?0?????0??0?0-000-2112?0?001210----??????????????41201?0-00?---10?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Multipseudechiniscus raneyi' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-01---0-0-000100211100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?1?????0??0?0-000-2112?0?001210----??????????????41201?0-00?---10?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Testechiniscus spitsbergensis' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-01---0-0-000100211100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?0?????0??0?0-000-2112?0?001210----??????????????41101?0-00?---10?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Pseudechiniscus suillus' 
111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-01---0-0-000100211100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?1?????0??0?0-000-2112?0?001210----??????????????41201?0-00?---10?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Cornechiniscus imperfectus' 111100-11???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????00-000-??00----0-01---0-0-000100211100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????1?1?????0??0?0-000-2112?0?001210----??????????????41101?0-00?---10?0212--????????0?????????1012-0-01-0?10?111111022?10110?????0??0??????????????????1??????????????0-----000-0-000-000-00 + 'Milnesium berladnicorum' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0?0?????1???00-00?????????????????????????00-000-??00----0-00---0-0-0001011(0 1)1100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0011111---??????????????41200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Milnesium swolenski' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0?0????0-???00-00?????????????????????????0???0????00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-????????0?0-000-20---0?001?111---??????????????41?00?0-00?---00?0212--????????0?????????????-??0??0?10?1??1??????10110???????????????????????????0??????????????0-----000-0-000-000-00 + 'Milnesium tardigradum' 
111100-10??00--?00-00-----------000000000100?0-1--111111-00021110-00-00-0010---------0-----000-----00-000-??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0011111---11?--1111---1141200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0???0??????????0-----000-0-000-000-00 + 'Austeruseus faeroensis' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-00126110-00100-0001121?12---0---00001--25-01??013??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?????0??0?0-000-20---0?0012112111??????????????21200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Mesocrista revelata' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????01??011??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112121??????????????21200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Hypsibius dujardini' 111100-10??00--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????0120011??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-?0-?20??0?0-000-20---0?00121121212211012221101221200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0???0??????????0-----000-0-000-000-00 + 'Beron leggi' 
111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0??????0-???00-00?????????????????????????????0????00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-10?-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-????????0?0-000-20---0?001?11????22110??221100-21?00?0-00?---00?0212--????????0?????????????-??0????10?1??1??????10110???????????????????????????0??????????????0-----000-0-000-000-00 + 'Calohypsibius ornatus' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????0120011??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112121220100-220100-21100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0???0??????????0-----000-0-000-000-00 + 'Fractonotus verrucosus' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????01??013??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112112??????????????21100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Cryoconicus kaczmareki' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????01??011??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112121??????????????21200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Haplomacrobiotus utahensis' 
111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????01??013??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112112??????????????21200?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Doryphoribius dawkinsi' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????0120013??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112112220200-220200-21100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0???0??????????0-----000-0-000-000-00 + 'Paradoryphoribius chronocaribbeus' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0???????1???00-00?????????????????????????0?100????00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-????????0?0-000-20---0?001?112112220200-220200-2??00?0-00?---00?0212--????????0?????????????-??0????10?1??1??????10110???????????????????????????0??????????????0-----000-0-000-000-00 + 'Halobiotus crispae' 111100-101??0--?00-00-----------000000000100?0-1--111111-0??2211?10-?00-00?112111??--010-0000?--1--01210130000----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-0000000000001000-0--?000--?1000------0----?0-1???0----0-0-0---0-0000-000-20---0100121121122202012220201221100-0-000---0010212--000-----00----000-1012-0-01-0?1001111110221101100-0-01?0000-00?0000?-01011?0?00000100000-00-----000-0-000-000-00 + 'Macrobiotus paulinae' 
111100-1011?0--?00-00-----------000000000100?0-1--1?1111-00126110-00100-0001121312---010-00001--25-01210120000----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?00121122112101113210111321100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0???1??????????0-----000-0-000-000-00 + 'Dactylobiotus ovimutans' 111100-1011?0--?00-00-----------0000000???00?0-1--1?1111-00126110-00100-00?1121315---010-00000--25-01??012??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?0012112211??????????????21100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0??????????????0-----000-0-000-000-00 + 'Richtersius coronifer' 111100-10???0--?00-00-----------0000000???00?0-1--1?1111-0012?110-00200-0001121?15---010-00000--1--0120012??00----0-00---0-0-000101101100-00100-301--00--0-----?0000?010-100-0-0-00-0-0-0-00000000000010?????-???????10?0??????????????1????????-?-10-?20??0?0-000-20---0?00121122112101113210111321100?0-00?---00?0212--????????0?????????1012-0-01-0?10?111111022110110?????1??0??????????????????0???1??????????0-----000-0-000-000-00 + 'Sicyophorus rarus' ?10000-1011?0--?00-0122212112?120?0?00??0?00?112(1 2)(1 2)??1111-0??27210-002?????01?22115????1???0?001?1-1?0-0?0-?000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-??0?00-0-0-0-000000000000110?????01102501?00------0----?0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---?00-----00----?00-????-??????????????????????????????????????????????0?????????????????????????????????????????? 
+ 'Sirilorica carlsbergi' ?10000-1????0--?00-0???????????????????0010?????????1??1-???2212???0?????????1-??----010-00000--1--???????0000----0-00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-(0 1)-0-00-0-0-0-000000000000??0?????01103201?00------0----?0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----0??????00-????-???????????????????????????????????0-0????????0??????????101???00???????????????????????? + 'Acosmia' ?200?0-1?0-?0--?00-0112111-?2113000000????00?0-1--110----000----0-0-1??????112212?---0-0-0??-0??????0-010-0000----0-00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-111100-0-0-0-0000000000001000-0-?0000--?101010----1-1---0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?0----000-????-??0????0000-00?????????????????????0-00?0?????0?????????????????????????????????????????? + 'Eximipriapulus globocaudata' ?10000-10???0--?00-01?2211-12?12000000????00?1123-1?1111-00027210-001?????01?2?115????1???00001(1 2 3)????0-000-0000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110200-0-0-0-0000000000001?00-0-10000--?101030----012(1 2)-30-1???0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---100-----?0----000-????-???????????????????????????????????0-0??0?????0?????????????????????????????????????????? + 'Laojieella thecata' ?20000-1?12?0--?00-01?2212212?11000000????0??11?1?1?0----000----0-00??????01?2???5-???????0??01???1????0??0000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000??0?????0000--??000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---110112-1?0----?00-????-???????????????????????????????????0-0??0?????0?????????????????????????????????????????? 
+ 'Ottoia prolifica' ?10000-1?12?0--?00-01?2212212112010000????0??11?1(1 2)1?1??1-?002711??01??0-0001?22??5-???11110?10132410???0??1000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?00?????0000--??000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---01111111?1215-?00-0???-??0????0000-00????????0-0?????????00-00?0?????0?????????????????????????????????????????? + 'Ottoia tricuspida' ?10000-1?12?0--?00-01?2212212112010000????0??11?1(1 2)1?1??1-?002711??01??0-0001?22??5-???11110?10132410???0??1000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?00?????0000--??000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---01111111?1215-?00-0???-??0????0000-00????????????????????00-00?0?????0?????????????????????????????????????????? + 'Paratubiluchus bicaudatus' ?10000-1????0--?00-012221211??????????????0??11??21?0----000----0-00???????1?2????????????0??01????????0???000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-0-0-00-0-0-0-000000000000?10?????0000--??000------0-----0-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---01?112-1?0----?00-????-????????????????????????????????????-0????????0?????????????????????????????????????????? + 'Priapulites konecniorum' ?100?0-1?1??0--?00-012?2?2112?12000000????00?11??(1 2)1?1??????????????????????????????????????????????????????000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000100?????0000--???00------0----?0-????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---01?11??1?0----?00-????-???????????????????????????????????????0??????0?????????????????????????????????????????? 
+ 'Selkirkia columbia' ?(1 2)0000-1?12?0--?00-01?2212212111010000????0??11?1(1 2)1?1??1-???2711??00??????01?22??5-???10-10?10131-1????0??0000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-?1??00-0-0-0-000000000000??0?????0?0????101030----?121-30-??????-????0-?-?-?-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0?0-1---?0----?00-0???-???????????????????????????????????0-0??0?????2?????????????????????????????????????????? + 'Paraselkirkia sinica' ?(1 2)0000-1????0--?00-01?22122121110?0000????0??11?1?1?1??1-???2?11??00??????01?22??5-???1???0??0????1????0??0000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-?1??00-0-0-0-000000000000?10???????0????101030----?121-30-??????-????0-?-?-?-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---0?0-1---?0----?00-0???-???????????????????????????????????0-0??0?????2?????????????????????????????????????????? + 'Xiaoheiqingella peculiaris' ?20000-1????0--?00-0122212112?11000000?1100??11???1?1??1-???2411??00???????1?22???????????0???1????????0???000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?00???????0??????00------0----?0-??????-????0-?-?-?-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---11?112-1?0----?00-????-???????0000-00?????????????????????0-0??0?????0?????????????????????????????????????????? + 'Xystoscolex boreogyrus' ?(1 2)00?0-1????0--?00-01?22122?2?11000000????0??11?1???1??1-???2????????0????01?2???????????????0?????????????000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-00000000000??00???????0????1?1?1?????012?????????0----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---000-----???????00-0???-1??????????????????????????????????0-0????????0?????????????????????????????????????????? 
+ 'Chalazoscolex pharkus' ?21000-1?1??0--?00-01?22????2?11000000????0?????(1 2)????????????????????0?????1?2?11?????1????????????????0???000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?00?????0?0????1?1?1?????11221312?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---01?1111????????00-0???-1??????????????????????????????????0-0????????0?????????????????????????????????????????? + 'Louisella pedunculata' ?21000-1?12?0--?00-01?2212211?110?0???????0??11?1(1 2)1?1??1-?002721??01?0????01?2???5-???10-00?001(2 3 4)??1????0??1000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?10?????1?0????101030----0122130-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---01111111?0----?00-0???-??????????????????????????????????00-00?0?????0?????????????????????????????????????????? + 'Corynetis brevis' ?(1 2)0000-1012?0--?00-00-----------0000000???0??1111(1 2)1?1111-00027210-0010????0112?215---010-00000111-1?0-000-0000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110?00-0-0-0-000000000000?00?????0?0????101?11311-012212???0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---01?11111?0----?00-0???-???????????????????????????????????0-00?0?????0?????????????????????????????????????????? + 'GUANDUSCOLEX minor' ?20000-10???0--?00-01??21?????????00??????00011?1(1 2)1????????????????0100-0001?2?115-???1?????001???1?0-000-0000----??00---0-0-000???-00-00-00000--0---00--0-----0000000-0--00-11?100-0-0-0-00000000000?100????10?0????1?1111312-01223212?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1211-?00-0???-???????????????????????????????????0-0??0?????0?????????????????????????????????????????? 
+ 'MAOTIANSHANIA cylindrica' ?30000-1????0--?00-01??2122???????????????0??11?1?1????????????????0?0????01?2????????10-0??00????????????0000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-000000000000?00?????0?0????1011112222012--???00000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1221-?00-0???-???????????????????????????????????0-00?0?????0?????????????????????????????????????????? + 'PALAEOSCOLEX piscatorum' ?30000-??????????????????????????????????????????????????????????????0????0?????????????????????????????????????????00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-00000000000???0?????0?00-??101121213-0123131200000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0010-0---000-----???????00-0???-???????????????????????????????????0-00?0?????0?1?????11????????????????????????????????? + 'SCHISTOSCOLEX umbilicatus' ??0000-??????????????????????????????????????????????????????????????0??????????????????????????????????????????????00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-00000000000???00-0-?0??????101111212-01231?1200000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0010-0---000-----?11233?00-0???-???????????????????????????????????0-00???????0?1???????????????????????????????????????? + 'SCATHASCOLEX minor' ?3?000-1????0--?00-01-211--22?12000000????0??1???(1 2)1?1111-?002711??00?0????01?2????-???10-00?0013???????0???000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000100?????0?00-??1011112?2?01232?0-00000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?12(1 2)32000-0???-???????????????????????????????????0-00???????0?????????????????????????????????????????? 
+ 'WRONASCOLEX antiquus' ?30000-1????0--?00-01-?11--?2?1?????00???????1???(1 2)???????????????????0?????????????????????????????????????000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-00000000000??00?????0?0????101111212-0122131200000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?12232?00-0???-???????????????????????????????????0-00???????0?????????????????????????????????????????? + 'WRONASCOLEX iacoborum' ?300?0-??????????????????????????????????????????????????????????????0?????????????????????????????????????000----??00---0-0-00????-?0-00-00000--0---00--0-----0000000-0--00-11??00-0-0-0-00000000000???0?????0?0????10111122230123131?00000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---00?0-0---0?????????????????0???-???????????????????????????????????0-00???????0?????????????????????????????????????????? + 'YUNNANOSCOLEX magnus' ?(1 2)100?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????00000--0---00--0-----0000000-0--00-11?100-0-0-0-000000000000??0????10?0????1?1111212-0122320-?0000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1211-?00-0???-???????????????????????????????????0-00???????0?????????????????????????????????????????? + 'MAFANGSCOLEX yunnanensis' ?31000-101??0--?00-01122122121120?0000?00100?11?1(1 2)??1111-00027110-0010????01222115-22010-100001(3 4)2(3 4)1?0-0?0-1000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-110100-0-0-0-000000000000100????10000--?1111111---012--(1 3)1210000----0-0-0---0-0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1221-000-0???-???????0000-00?????????????????????0-00?0?????0?????????????????????????????????????????? + 'Cricocosmia n. sp.' 
?31000-10???0--?00-01?221??12?12000000????00?11?1(1 2)101111-00027(1 2)10-0010??0?0122?115???????????0131-1?0-00(- 0)-?000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-11?100-0-0-0-0000000000001(0 1)0????10?00-??11111122220121-10-10001222101011112010000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1221-?00?0???-???????????????????????????????????0?00?0?????0?????????????????????????????????????????? + 'CRICOCOSMIA jinningensis' ?31000-10???0--?00-01?2212212112000000????00011?1(1 2)101111-00027(1 2)10-00?00-000122?115?3-01??????0131-1?0-000-1000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-111100-0-0-0-000000000000110????10?00-??1??1??????012??1??10001221101011112030000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1221-?00-0???-???????????????????????????????????0-00?0?????0?????????????????????????????????????????? + 'TABELLISCOLEX hexagonus' ?31000-10???0--?00-01?2212?12?12000000????00011?1(1 2)100----000----0-00100-0001?2?115?????0-?00?01?2(2 3 4)1?0-000-?000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-111100-0-0-0-000000000000110????10?0????1?11112?220121-???10001222101010-12030000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1221-?00-0???-???????????????????????????????????0-00?0?????0?????????????????????????????????????????? + 'Tylotites petiolaris' ?(2 3)(0 1)000-1?1??0--?00-01??212212?12000000??0000011?1(1 2)1????????????????0100-0001?2?115???????????01(2 3)???????00-1000----??00---0-0-000000-00-00-00000--0---00--0-----0000000-0--00-111100-0-0-0-000000000000110?????0?0????1?????????012??3???????(- 2)(- 8)?(- 1)(- 0)(- 1)0(- 2)?(- 2)(- 1)(- 2)0(- 1)0000-000--0---00000--0------0-00---0-00----0000-000---0000-0---000-----?1221-?00-0???-???????????????????????????????????0-0??0?????0?????????????????????????????????????????? 
+ 'Xenusion' ?(2 3)11??????????????????????????????????????0????????????1-???-????????0??????????????????????????????0-0????000----??00---0-0-000????00-00-00000-1?1--?0--0-----000000010-100-110100-0-0-0-00000000000?100????10000--?1??0??????????????????1222101011?12010000-000-2121-10000--0------0-00---0-00----00-0-000---0100-0---000-----?0----????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Hadranax' ?(2 3)11?????????????????????????????????????????????????????????????????0??????????????????????????????????????00----???????????????????????????????????????????????????01?-???-12?100-0-0-0-00000000000?1?0????20000--?1?103??????12??30-???0124210(1 2)010-(1 2)?0-0?00-000-20---10?00--0------0-00---0-00----00-0-??0---0??0-????0?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Aysheaia' ?21100-1011?1?11001?0-----------000000000100?111?11?1111-00012(1 2)10-00?0????????????????????????????????????0000----??00---0-0-000000-00-00-00100-101--10--0-----110000010-100-110100-0-0-0-000000000000100????20000--?101030----02221?0-1???0----0-0-0---0-0000-000-2112100001220----??????????????61100-0-000---0010212--000-----?0----?00-0???-???????????????????????????????????0-0010?????0?????????????????????????????????????????? + 'Siberion' ?(1 2)11?0-???-?????????0-----------0000000???00?0-?--1?1????0001????1???0??????????????????????????????0-0?--??00----??00---0-0-000????00-00-00100-102-1?????????11?0000010-100-110100-0-0-0-00000000000?10?????2?000--?1??0????????????0-???00----0-0-0---0-0000-000-?????0?00??????????????????????????0-0-000---0??0?????000-----??????????????-??????????????????????????????????????????????0?????????????????????????????????????????? 
+ 'Onychodictyon ferox' ?(1 2)1100-101??0--?00-00-----------0000000---00?0-1--1????--000----?-0-?0????????????????????0?????????0-0???0000----??00---0-0-000100?01100-00100-101--112-0-0--?10000?010-100-120100-0-0-0-000000000000100????21?0????1?1030----0?221?0-1?00122?1?10111?2041000-000-?121-0000121???????????????????21200-0-000---0010212--000-----?0----?0??????-???????????????????????????????????0-0010?????0?????????????????????????????????????????? + 'Diania' ?(2 3)11?0-?????0--?00-00-----------0000000???0??0-1--1??????????????????0?????????????????????????????????????000----??00---0-0-0??000-00-00-00000-101--00--0-----000000010-100-121100-0-0-0-0000000000001?0?????0000--?1?1030----01221(2 3)0-1000121?101010-11010000-000-10---00001110----??????????????-1100-0-000---010010---000-----?0----000-????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Paucipodia' ??11?0-0?0-?????????0-----------0000000???0??0-?--1????1-???-????????0?????????????????????????????????0--??00----??00---0-0-000000-0??00-00000-101--10--0-----?00000010-100-110100-0-0-0-00000000000010?????10000--?1?00------0-----0-10000----0-0-0---0-0000-000-10---000011?0----??????????????(1 2)2100-0-0?0---0100?0---0?????????????????????-???????????????????????????????????????0??????0?????????????????????????????????????????? + 'Cardiodictyon' 1311?0-?0???0--?00-00-----------0000000???0?00-(- 1)--1????(- 1)-??????(- 1)???0?00-000?????????????????????????0-00--?000----??01111010-001000-01100-00?00-101--10--0-----000000010-200-121100-0-0-0-000000000000110????10000--?1?00------0-----0-100012{1 2}2101010-(1 2)20(2 3 4)?000-000-10---00001110----??????????????21100-0-01121-010010---000-----?0----000?????-???????????????????????????????????????0??????0?????????????????????????????????????????? 
+ 'Microdictyon' ?21100-000-?0--?00-00-----------0000000---0??0----1????1-???---1???-?0???????????----010-0?00?--1--?0-00--0?00----??00---0-0-010000-0??00-00000-101--10--0-----?00000010-100-121100-0-0-0-000000000000100????10000--?1?00------0-----0-10001-22101010--2041000-000-10---00001110----??????????????2??00-0-000---010010---000-----?0----?00-????-???????????????????????????????????0-0?00?????0?????????????????????????????????????????? + 'Onychodictyon gracilis' ??11?0-?????????????0-----------0000000???00?0-?--1????1-???-????????0??????????????????????????????0-0?????00----??0????0?0-0???00?0????????????????????????????????01?-?00-1?0100-0-0-0-0000000000001??????2??0????1?103???????221?0-1??0122?101011??20(2 3 4 5 6)1?00-000-2????00001210----??????????????2??00-0-000??-0100?12--000-----??????????????-???????????????????????????????????????0??????0?????????????????????????????????????????? + 'Thanahita distos' 12?1????????????????0-----------0000000???0???????1?????????????????????????????????????????????????0-0?--??00----??????????????????0????????????????????????????????01??????0-0-00-0-0-0-00000000000011?????-???????1?00------0----?0-1???1?2?2010(1 2)0-?106???0-000-10---00001110----??????????????22100?0-01??1-01?010---??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Orstenotubulus' ??11??????????????????????????????????????????????1?????????????????????????????????????????????????????--??00----??????????????????0????????????????????????????????01?-???-12?200-0-0-0-00000000000?1??????2??0????1?00------0----?0-1???122???202????0?0?00-000-10---1?1???????????????????????????0-0-??0??-0??0?????0?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? 
+ 'Tritonychus phanerosarkus' ???1????????????????0-----------0000000???????????1?????????????????????????????????????????????????0-0?--??????????????????????????0????????????????????????????????01?-???-1??200-0-0-0-00000000000????????2???????1?00------0----?0-1??????????0??????????0-000-10---101?121???????????????????3?100?0-??????0??????????????????????????01??-???0????????????????????????????????????????????????????????????????????????????????????? + 'Carbotubulus' ?(1 2)?1????????????????0-----------0000000???0???????1?????????????????????????????????????????????????0-0?????00----??????????????????0????????????????????????????????01?-???-0-0-00-0-0-0-00000000000?11?????-?????????00------0----?0-???????????0??????????0-000-10---0?0???????????????????????????0?0-?????-00?0???--??????????????????0???-????????????????????????????????????????????????????????????????????????????????????????? + 'Hallucigenia sparsa' 131100-0?0-?0--?00-00-----------0000000---0??0----1?1111-0002721?10010?????112??2?????10-00?0012??1?0-00--0000----??00---0-0-011000-01100-00000-101--?0--0-----000000010-{1 2}00-0-0-00-0-0-0-0000000000001100-0--0000--?1100------0-----0-1000122?1010212-2050000-000-10---00001110----??????????????22100-0-01021-00?010---000-----?0----?00-????-???????????????????????????????????0-0?00?????0?????????????????????????????????????????? + 'Hallucigenia fortis' ?211?0-0?0-?????????0-----------0000000??????0-?--1????1-???-????????0??????????????????????????????0-0?--??00----??01111010-011000-01100-00000-101--?0--0-----000000010-{1 2}00-121?00-0-0-0-00000000000011?????10000--?1?00------0-----0-1000122?1010212-20(1 2 3 4 5 6)0000-000-10---000?1?10----??????????????2??00-0-01011-0?0010---0?????????????????????-???????????????????????????????????????0??????0?????????????????????????????????????????? 
+ 'Hallucigenia hongmeia' ??11????????????????0-----------0000000???????????1??????????????????0??????????????????????????????????--??00----??????????????????0????-???????????????????????????01?-??0-12??00-0-0-0-00000000000?1??????10000--?1?00------0-----0-1000122?1020212-20(2 4)0?00-000-10---00001110----??????????????11-00-0-??0??-0?00?0---0?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Facivermis yunnanicus' ?2?1?0-?????????????0-----------0000000??????0-?--1??????????????????0??????????????????????????????0-0?--??00----??00---0-0-0?0?00?01100-00000-101--?0--0-----?0000?010-{1 2}00-12?100-0-0-0-00000000000011?????2?000--?1?10??????0122130-1???0----0-0-0---0-00?0-000-1111210001?10----??????????????11-00?0-00?32-0110?0---??????????????????????-???????????????????????????????????????0????????????????????????????????????????????????? + 'Luolishania' ?211?0-?????????????0-----------0000000???0??0-?--1????1-???-????????0??????????????????????????????0-00--??00----??011??0-0-0?1100?01100-00000-101--10--0-----?0000?010-{1 2}00-121100-0-0-0-00000000000011?????2??0????1??0????????????0-1??0123?2?20212?20?0000-000-1111210001210----??????????????11-00-0-00132-0?0010---0?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Ovatiovermis cribratus' ?2?1?0-?0???????????0-----------0000000???0??0-?--1?????????????????10?????112???5---010-00000111-1?0-00--0000----??00---0-0-010?00?01100-00000-101--?0--0-----?0000?010-{1 2}00-12?100-0-0-0-00000000000011?????1???????1100------0----?0-1???1?2?2??021??20??0?0-000-1111200001210----??????????????11100?0-00?42-00?010---??????????????????????-???????????????????????????????????????0????????????????????????????????????????????????? 
+ 'Collinsium' ?2?1?0-?????????????0-----------0000000???0???????1??????????????????0??????????????????????????????0-00--??00----??011??0-0-010100?00-00-00000-101--10--0-----?0000?010-{1 2}00-12?100-0-0-0-00000000000011?????2???????1100------0----?0-10001?5?2?20212?20(2 4)?0?0-000-?111200001210----??????????????11-00?0-00?42-011010---??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Collinsovermis monstruosus' ?2?1????????0--?00-00-----------0000000???0??0-?--1??????????????????0??????????????????????????????0-00--??00----??011??0-0-0??100?0??00-00000-101--10--0-----??0000010-{1 2}00-12?100-0-0-0-00000000000011?????????????1100------0----?0-1???1?3?1?20212?20??0?0-000-21112???01210----??????????????11?00?0-00?42-0??0?0---??????????????????????-???????????????????????????????????????0????????????????????????????????????????????????? + 'Emu Bay Collins monster' ??11????????0--?00-00-----------0000000???????????1?????????????????????????????????????????????????0-0?--??00----??011?????????????0?????00?????????????????????????01?-{1 2}00-12?100-0-0-0-00000000000?11????????0????1?00------0----?0-1???123???20212?20?0?00-000-?111200001210----??????????????11-00?0-00132-0??0?????0?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Acinocricus' ???1????????0--?00-00-----------0000000???????????1?????????????????????????????????????????????????????????00----???????????????????????????????????????????????????010-{1 2}00-12?100-0-0-0-00000000000011?????????????111030----0122130-????1?6?1?20212?2?????0-000-?1112????????????????????????????????0-00?32-???0???????????????????????????-????????????????????????????????????????????????????????????????????????????????????????? 
+ 'Antennacanthopodia' 1211?1??????0--?00-0??????????????????????0??0-(- 1)--1????????????????????????????????????????????????????????000----??00---0-0-000??0-01100-00100-101--10--0-----000000011-?00-????00-0-0-0-000000000000100?????0000--?1?1030----01221?0-1???0----0-0-0---0-0000-000-10---?0101?10----??????????????11-00(- 0)0-000---0110?1113000-----?0----????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Helenodora' ??11????????0--?00-00-----------0000000?????????????????????????????????????????????????????????????0-0?--??00----??00---0-0-0?0????0?????00100-101--??-?0-0-???0000?01?-??1-1???00-0-0-0-0000000000001??????2?000--?1?10????????????0-1??00----0-0-0---0-0000-000-20---10?01?10----??????????????211?0?0-0?0??-0??0?????0?????????????????????-??????????????????????????????????????????????0?????1???????????????????????????????????? + 'Tertiapatus dominicanus' ?2?1?12?????????????0-----------0000000???01??????1?????????????????????????????????????????????????0-0?--??00----??00---0-0-000110?01100-00100-101--?0--0-----?0000?011-??1-11??00-0-0-0-00000000000?10?????2?000--????0??????????????????0----0-0-0---0-00?0-000-?0---?0?0??????????????????????????0?0-00?---01?0?0---??????????????????0???-????????????????????????????????????????????????????0???????????????????????????????????? + 'Siberian Orsten tardigrade' ?111?0-?0??????????????????????????????????0????????1??1-0???????1??????????????????????????????????????????00----??00---0-0-000????0??????0???-?????????????????????010-??0-0-0-00-0-0-0-00000000000?10?????-??0????1?00------0----?0-1?????-?---0?0-??0?0000-000-20---000012111?12??????????????21200?0-00?---0010?12--0?????????????????????-??????????????????????????????????????????????0?????1???????????????????????????????????? 
+ 'Youti yuanshi' 1?11?12????0??????????????????????????????00?0-?--1?????????????????-0?????0---------0-----000-----?0-0?--??00----??10---0-0-00010??011????01???(1 2)0???????????????????01?-(1 2)00-????00-0-0-0-000000000001?0???????000--???????????????????????0----0-0-0---0-00???000?20---0000????----????????????????????0-00?---???????????????????????????????-????1??????1????????10110??????????????0????????????????????????????????????????????????? + 'Megadictyon' 1111??????????????????????????????????????0??0-?--1?1???????????????2?????012???1????????????0??????0-0?--??00----??00---0-0-0?0100-0??00-00100-101--01??0-0--?100000010-100-11?100-0-0-0-00000000000110?????10000--?0000------0-----0-?0000----0-0-0---0-0000-000-?121-0000?21??????????????????????00-0-000---00?0?????0?????????????????????-???????????????????????????????????????1??????0?????????????????????????????????????????? + 'Jianshanopodia' ?111?0-?????0--?00-00-----------0000000---00?0-?--1?1?1?????2721??0020????0122??1?????10-0??0014????0-0?--0?00----??00---0-0-000???-0??00-00100-10????11?0-0--?100000010-100-12?100-0-0-0-00000000000110?????10000--?0000------0-----0-?0000----0-0-0---0-0000-000-2121-0000??1??????????????????????00-0-000---0010-11220?????????????????????-???????????????????????????????????????1??????0?????????????????????????????????????????? + 'Cucumericrus' ??11????????????????????????????????????????????????????????????????????????????????????????????????????????00----?????????????????-???????1?????????????????????????1(1 2)????????????????????????????????????????????????????????????????0??????????0????????1?110111?0---?0??0--0------0-00---0-00----00?1???????0???-??????????????????????????-????????????????????????????????????????????????????????????????????????????????????????? 
+ 'Kerygmachela' ?211?11?????0--?00-00-----------0000000---00?0----1?1????000?--1?10-?????????????-?--010-00000--1--?0-0?--0?00----??10---0-0-000100-?1110100100-101--11--0-0---110000010-100-120100-0-0-0-000000000001100????20000--?1000------0-----0-0000124-1??010--10-00111010120---00000--0------0-00---0-00----00101000---0010-1113000-----?0----000-01?1-1???1??10?????????????110??????????0-0??0?????0?????????????????????????????????????????? + 'Pambdelurion' ?111?12?????????????1?211??22?12010000????00?0----1?1?21-000271?11002?????0122?313---0111??000131--?0-00--??00----???0---0-0-000100-????0?00100-2022111--0-0---1?0000010-100-1???00-0-0-0-00000000000110???????000--?0000------0-----0-00000----0-0-0---0-00111010120---00000--0------0-00---0-00----00101000---0???-????0?????????????????1111-???0??????????????????????????????????????????0?????????????????????????????????????????? + 'Omnidens qiongqii' ??11????????0--?00-01-211??2?112010000????00?0-1--??1221-000271?12002?????01222313---?11110000131--?0-00--?000----????????????????????????00100-(1 2)???(- 1)?0--0-----000000???-?0?-????????????????????????????????????????????------???--?0-???????????0????????????????????????????????????????????????????????????????????????????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Parapeytoia' ??11??????????????????????????????????????00?0-?--??12?2(2 1)000????12??????????????????????????????????????????00----????????????00??????????111????????1???????????????11???0?????????????????????????????????????????????0??????????????0????????-?0?????0??1?110111-?????0??0--0------0-00---0-00----0????????-10???-??????????????????????????-????????????????????????????????????????????????????????????????????????????????????????? 
+ 'Kylinxia' ?111?13?????????????0-----------0000000???00?0-?--1?????????????????????????????????????????????????0-00--??00----??1112101?-?00?00-0121111111012122111211112120000001-0-100-????110??????????????????2??????-??????????0??????????????0???????---0?0-??0??1?1100?1-0---00000--0------0-00---0-00----00???01?--100?1-1122??????????????????????-???????????????????????????????????????0????????????????????????????????????????????????? + 'Isoxys' ?111?13?????????????0-----------0000000???00?0-?--1??????????????????0??????????????????????????????0-00--??00----??112???????00???-01?1111111??212211???????????????1-???00-?????0-0-0-0-00000000000??0???????????????00------0----?0-0????????-?0????????1?12???1-0---00000--0------0-00---0-00----00????????10???-??????????????????????????-???????????????????????????????????????0????????????????????????????????????????????????? + 'Stanleycaris' 1111?13?????????????0-----------0000000???00?0-?--1?12121000????12??????????????????????????????????0-0?--??00----??11121010-000100-011111111112202211112121221100011020-100-0-0-10-0-0-0-00000000000120?????-?000--?0000------0-----0-00000----0-0-0---0-00?120002-0---00000--0------0-00---0-00----00?1200?--?00?1-111???????????????????????-???????10?1???????????1????????????????0????????????????????????????????????????????????? + 'Opabinia' 1111?130?0-?????????0-----------0000000???00?0----1?12?1-0002???11???????????-??????????????????????0-00--??00----??11111010-000?00-0121110011??202111???0-?--1?0000?010-100-1?0?10-0-0-0-00000000000110???????000--?1000------0-----0-000012--1-?010--10-000110102?0---00000--0------0-00---0-00----00101000---01?1-11120?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? 
+ 'Utaurora' ??11?13?????????????0-----------0000000????0?0-?--1?????????????????????????????????????????????????0-0?--??00----???????0-0-?00???-01?111??1???20211?????????????0??0????0???????0-0-0-0-00000000000?10????????????????0??????????????0????????-?0?????0??0?110102?????00000--0------0-00---0-00----00?0100?--?0??1-1111??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Caryosyntrips camurus' ??11????????????????0-----------0000000???????????1?????????????????????????????????????????????????????????00----????????????????????????1?110?2021111??0-?--1100001?????0?????????????????????????????????????????????0??????????????0??????????0????????????????????????????????????????????????????????????????????????????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Amplectobelua symbrachiata' ?111?12?????????????0-----------0000000???00?0-?--1?1??22110?????(1 2)??????????????????????????????????0-00--??00----??111?10111?0????-01111?1111112?2211121111212011110020-100-?????0-0-0-0-00000000000?20?????-?000--??000------0----?0-0???0----0-0-0---0-00?11001?-0---00000--0------0-00---0-00----00?1201?--?00?1-1111??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Anomalocaris canadensis' ?111?120?0-?????????0-----------0000000???00?0----1?1212111027(1 2)1120??????????1??????????????????????0-00--??00----??11111011110??00-011111111101202211121111212010010020-100-0-0-00-0-0-0-00000000000120?????-?000--??000------0----?0-????0----0-0-0---0-000110002-?---0000?--0----??????????????---00112010--?0011-11110?????????????????1???-??????????????????????????????????????????????0?????????????????????????????????????????? 
+ 'Cambroraster falcatus' ?111?11?????????????0-----------0000000???00?0-?--1?12122000?????2??2?????012???1???????????????????0-0?--??00----??11112111200??00-01111211111220?2?11121201?1010010020-100-0-0-00-0-0-0-00000000000?20?????-?000--??000------0----?0-0???0----0-0-0---0-00?110002-0---00000--0------0-00---0-00----00??101?--?00?0-1121??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Hurdia victoria' 1111?1?0?0-?0--?00-00-----------0000000???00?0----1?1212200027(1 2)112002?????0122?113???010-1000013---?0-00--?000----??11112111200??00-01111211111220???1112120121010010020-100-0-0-?0-0-0-0-00000000000??0?????-?000--??000------0----?0-0???0----0-0-0---0-000110?02-0---000?0--0------0-00---0-00----001?1010--?0010-11220?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? + 'Cf. Peytoia' ??11????????????????0-----------0000000?????????????????????????????????????????????????????????????????????00----????????????????????????1111??2021111?2120121100011?????0????????????????????????0????????????????????0?????????????????????????0????????????????????????????????????????????????????????????????????????????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Peytoia nathorsti' ?111?120?0-?????????0-----------0000000???00?0-?--1?1212200027(1 2)1120??????????1???????????????0??????0-0?--??00----??11112011210??00-011112111111202?11112120121010010020-100-0-0-00-0-0-0-00000000000120?????-?000--??000------0----?0-0???0----0-0-0---0-000110?02-0---00000--0------0-00---0-00----00112010--?0010-0---0?????????????????????-??????????????????????????????????????????????0?????????????????????????????????????????? 
+ 'Aegirocassis benmoulai' ?111????????????????0-----------0000000???00??????1???1?????????????????????????????????????????????????--??00----??11112011??0????-??????11110?20???11121201?100?000020-100-0-0-10-0-0-0-00000000000??0?????-?000--??000------0----?0-0???0----0-0-0---0-00?110102-0---00000--0------0-00---0-00----00?120??--?00?0-0---??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Lyrarapax unguispinus' ?111?12?????????????0-----------0000000???00?0-?--1?12?2201?????12???????????????????????????0??????0-0?--??00----??11111011??0??00-?111111111?1202211111111212001110020-100-0-0-10-0-0-0-00000000000120?????-?000--??000------0----?0-0???0----0-0-0---0-00?110002-0---00000--0------0-00---0-00----00?1211?--?00?1-111???????????????????1???-???????10??1?1??????0?110???????????????????????????????????????????????????????????????? + 'Schinderhannes' ?111?1??????????????0-----------0000000???00?0-?--1?1?11-0???????????0??????????????????????????????0-0?--??00----?????????1??0????-0111111?11??202?111(1 2)2120??21??0??0??-100-??????????????????????0????????????????????0???????????????????????-?0????????0?????0?-??????????????????????????????????0???1??--?00?0-1111??????????????????????-????????????????????????????????????????????????????????????????????????????????????????? + 'Chengjiangocaris' ?111?130?0-?????????0-----------0000000???00?0----1?0----000----0-0?-0?????0---------0-----000-----?0-00--??00----??1112102???0????-01111110100-212?210--0-----?0?0??1-11-0?2??0?110????????0??????0?0?-?????-??0?????000------0----?0-0????1-?---0---?-0-010121001-0---?0?00--0------0-00---0-00----0000-0?0--20--0-11220?????????????????????-???????10?11??1????????????????????0-00???????0?????????????????????????????????????????? 
+ 'Fuxianhuia' ?111?130?0-0????????0-----------0000000???00?0----1?0----000----0-0?-0?????0---------0-----000-----?0-00--??00----??1112102???0?100-01111110100-212??10--0-----?0?0??1-11-0?2??0?110????????0??????0?0?-?????-??0?????000------0----?0-0????1-?---0---?-0-010121001-0---?0?00--0------0-00---0-00----0?00-0?0--20--0-112?0?????????????????10??-0-??1??10????1????????131??????????0-000??????0?????????????????????????????????????????? + 'Leanchoilia' ?111?130?0-?????????0-----------0000000???00?0----1?0----000----0-0?-0?????0---------0-----000-----?0-00--??00----??1122102??10????-012111101???212??10--0-----?0?0??1-1??0?1??0?11?????????0??????0?1?-?????-??0?????000------0----?0-0????1-?---0---?-0-010121011-0---?0?01110----??????????????{1 3}1?0000-0?0--10--0-0---0?????????????????????-???????????????????1???????????????0-000??????0?????0???????????????????????????????????? + 'Alalcomenaeus' ?111?130?0-?????????0-----------0000000???00?0----1?0----000----0-0?-0?????0---------0-----000-----?0-00--??00----??1122102???0????-012111??1???212??1???????-????0??1-1??0?1??0?11?????????0??????0?1?-?????-??0?????000------0----?0-0????1-?---0---?-0-010121011-0---?0?01110----??????????????11-0?00-0?0--10--0-0---0?????????????????????-???????10?11010--??1??131??????????0-000??????0?????0???????????????????????????????????? + 'Misszhouia longicaudata' ?111?130?0-?????????0-----------0000000???00?0----1?0----000----0-0?-0?????0---------0-----000-----?0-00--??00----??1122102???0????-00-00-10100-212?210--0-----?0?0??1-11-0?1??0?111????????0??????0?1?-?????-??0?????000------0----?0-0????1-?---0---?-0-010131011-0---?0?01110----??????????????11-0000-0?0--10--0-0---0?????????????????1012-0-????????????????????????????0??0?0-000??????0?????????????????????????????????????????? 
+ 'Kuamaia lata' ?111?130?0-?????????0-----------0000000???00?0----1?0----000----0-0?-0?????0---------0-----000-----?0-00--??00----??11221?2???0????-01111110100-212?210--0-----?0?0??1-11-0?1??0?111????????0??????0?1?-?????-??0?????000------0----?0-0????1-?---0---?-0-010131011-0---?0?01110----??????????????3120?00-0?0--10--0-0---0?????????????????1012-0-????????????????????????????0??0?0-00???????0?????????????????????????????????????????? + ; + ENDBLOCK; + + BEGIN NOTES; + [Taxon comments] + TEXT TAXON=9 TEXT='@Dong2010^n'; + TEXT TAXON=11 TEXT='@Zhang2022'; + TEXT TAXON=12 TEXT='@Liu2019'; + TEXT TAXON=13 TEXT='@Liu2019'; + TEXT TAXON=14 TEXT='@Shao2020'; + TEXT TAXON=15 TEXT='@Zhang2015'; + TEXT TAXON=16 TEXT='@Liu2014; @Shao2016; @Shao2020; @Wang2025'; + TEXT TAXON=22 TEXT='@Gad2005za'; + TEXT TAXON=23 TEXT='Adult Urnaloricus have not been found and may not exist [@Sørensen2025]'; + TEXT TAXON=24 TEXT='@Fujimoto2020mb'; + TEXT TAXON=25 TEXT='@Neves2014ode'; + TEXT TAXON=28 TEXT='@Rucci2020z'; + TEXT TAXON=34 TEXT='@Neuhaus2015z'; + TEXT TAXON=39 TEXT='@Kulikov1998rjn'; + TEXT TAXON=40 TEXT='@Inglis1969bbmnh - detailed line drawings of pharyngeal armature^n@Venekey2019z - Schematic of head; taxonomic diagnosis^n@Kulikov1998rjn - detailed description and illustration of E. robusta'; + TEXT TAXON=41 TEXT='@Leduc2016n'; + TEXT TAXON=42 TEXT='Kinonchulus Riemann, 1972^n^n= Pseudonchulus Altherr, 1972 syn. n.^n^n^n^nsee Holovachov et al., 2008'; + TEXT TAXON=95 TEXT='A senior synonym of Palaeopriapulites parvus [@Smith2015]'; + TEXT TAXON=128 TEXT='@Ou2012; @Liu2008'; + TEXT TAXON=170 TEXT='USNM 57490'; + TEXT TAXON=172 TEXT='Taxon name corrected using international commission on zoological nomenclature from A. benmoulae to A. benmoulai by Van Roy et al., 2015.^n^nTaxon name corrected using international commission on zoological nomenclature from A. benmoulae to A. 
benmoulai by Van Roy et al., 2015.'; + + [Character comments] + TEXT CHARACTER=1 TEXT='@Wills2012 (character 94) consider this to denote a priapulan synapomorphy. However, large primary body cavities occur in many ecdysozoan phyla.^n^nThe body cavity of both priapulids and nematomorphs represents a cleft in the extracellular matrix, and is thus defined as a primary body cavity, in contrast to a coelom (which would be lined with epithelia) [@SchmidtRhaesa2013].^n^nIn onychophorans a reduced coelom surrounds the gonads and protonephridia, but the perivisceral cavity is a primary body cavity, or strictly a mixocoel (resulting from the fusion of the primary body cavity with coelomic tissue during embryogenesis) [@Mayer2004az]. Tardigrades likewise exhibit a large primary body cavity; the tardigrade coelom is restricted to the gonads [@Dewel1998ar]. ^n^nThe body cavities of kinorhynchs and loriciferans are reduced [@SchmidtRhaesa2013]. ^n^n'; + TEXT CHARACTER=2 TEXT='WTS25. Taxa within 25% of the borderline between tokens are coded ambiguous for either token.^nDimensions for palaeoscolecids from (García-Bellido et al. 2013a). Eopriapulites follows (Shao et al. 2016). Xystoscolex measured from photographs (Conway Morris and Peel 2010) at close to 10; scored ambiguous (0, 1). Selkirkia around 7–10, depending on how much of tube the body occupies; scored as ambiguous (0, 1). Paraselkirkia , measured from photographs (Hou et al. 2017)almost exactly 10'; + TEXT CHARACTER=3 TEXT='This character distinguishes essentially cylindrical worms such as Palaeoscolex from taxa with clearly defined dorsal and ventral surfaces, whether by the presence of appendages (such as Louisella and lobopodians) or plates (such as Cricocosmia and Tabelliscolex) or by the differential expression of spinose armature (such as Tylotites). 
^n^nThis character addresses fundamental asymmetry in the trunk organization, and thus overlooks differentiation that is restricted to the proboscis or the posterior trunk, such as the location of the anus or presence of tail hooks or caudal appendages; and diminutive landmarks such as specifically-positioned setae that do not affect the overall trunk morphology.^n'; + TEXT CHARACTER=4 TEXT='Character 1 in @Smith2015 and @Yang2015.'; + TEXT CHARACTER=5 TEXT='WTS43.^nTerminal in Maccabeus (Por and Bromley 1974)^nTerminal in Onychophora and Tardigrada; not clear why coded as in abdomen in Wills et al. 2012^nSubterminal in many nematodes, which have a caudal filament or spinneret glands posterior of the anus.'; + TEXT CHARACTER=6 TEXT='This character identifies the hypothesized evolutionary event of a movement of the mouth position. In Euarthropods the mouth is in a ventral position. In certain lobopodians, including Collinsium, hallucigeniids and Luolishania, the terminal mouth can superficially appear ventral due to the flexure of the neck [@Ma2009; @Smith2015; @Yang2015].^n^nThis character captures the transformation envisaged by characters 23 and 24 in Yang et al. (2015): both these characters appear to code for the same event of movement of mouth position, leading to a rotation in the head area, with appendages moving to an anterior position relative to the mouth.^n'; + TEXT CHARACTER=7 TEXT='When the mouth is anterior and terminal, mouth orientation is fixed as it can only face anteriorly. However, if the mouth is in a ventral position, then it can face anteriorly [following the interpretation of Kerygmachela by @Park2018], ventrally (anomalocaridids) or posteriorly (crown euarthropods, opabiniids). 
This character is only applicable when mouth is ventral.^n^nAdapted from character 11 in @Zhang2016 [SC: 7; Y: 23]^n'; + TEXT CHARACTER=8 TEXT='The anterior trunk of Aysheaia and Onychodictyon ferox is differentiated into a stout ‘proboscis’, distinct from the trunk by virtue of its shape and its lack of annulations (Ou et al. 2012). This ‘proboscis’ is considered homologous to the cycloneuralian worm introvert (=armature Zone I). This region is reduced in taxa such as Hallucigenia (where it has become part of the buccal cavity) and Anomalocaris (where it has been reduced and is no longer evident). We consider the oral region of tardigrades as a potential homologue of the introvert, noting the similarity of oral papillae with similar features in Aysheaia. @Kihm2023 draw attention to the similarity in form and function of tardigrade oral papillae and buccal sclerites in priapulan larvae; and there is indeed a compelling resemblance with the introvert-borne buccal papillae of e.g. Halicryptus [@Merriman1981].^n^nIn nematodes [@SchmidtRhaesa2014] and priapulans [@SchmidtRhaesa2012], only the midgut has an endodermal origin; the foregut (including the pharynx) and hindgut are ectodermal and hence covered in cuticle. The pseudointestine of Nematomorphs is endodermal and hence homologous to the intestine [@SchmidtRhaesa2012].^n^nThe ''head seam'' marks the anterior limit of the nematode trunk [@Kulikov1998rjon].^nThe nematode mouth comprises a cheilostome and pharyngostome (which together form the buccal cavity, or stoma sensu lato), pharynx (oesophagus), and pharyngo-intestinal junction (cardia) [@SchmidtRhaesa2014]. As the mouth opening (including the lips) and cheilostome are covered with body exocuticle [@Kulikov1998rjon] and occasionally bear cuticular projections [@SchmidtRhaesa2014] (historically termed odontia [@Inglis1966plsl]), we treat these as equivalent to the introvert. The wedge ring, which delimits this region [@DeLey1995], corresponds to Zone II. 
The subsequent elements of the foregut bear pharyngeal cuticle [@SchmidtRhaesa2014] and thus correspond to Zone III [@ConwayMorris1977]. As such, the gymnostome (proximal pharyngostome) corresponds to the unarmoured region between Zones II and III, whereas the stegostome (distal pharyngostome), which often bears denticles or teeth [@SchmidtRhaesa2014] historically termed onchia [@Inglis1966plsl], corresponds to the armoured pharynx. The six (inner) labial sensilla are somatic; in apomorphic taxa they head up a series of sensillae that continues along the trunk. They thus do not form part of the introvert, even if they secondarily migrate onto the lips in some cases [@SchmidtRhaesa2014]. The six outer labial / inner cephalic sensilla [terminology differs; see @Meldal2004] and the four (outer) cephalic sensilla, which primitively occur upon the lips, have a distinct developmental origin [@SchmidtRhaesa2014] and are thus not homologous with the trunk sensilla.^n^nIn nematomorphs, the adult intestine is reduced; it includes an anterior region that bears teeth and rods, perhaps vestiges of larval armature, followed by a cuticular pharynx (=oesophagus) [@SchmidtRhaesa2012]. We treat the spines, which point backwards when the proboscis is everted [@SchmidtRhaesa2012], as introvert hooks. There are three rings of six hooks; the ventral hook on the outer ring (the first to be everted) is deeply cleft, giving the false appearance of a second hook [@SchmidtRhaesa2012].^n^n^nThe mouth cone of kinorhynchs occupies an equivalent position to that of loriciferans, but the cones in the two lineages exhibit distinct muscular, sensory and nervous configurations; they seem to have evolved independently [@Nebelsick1993]. 
The mouth cone ''represents the beginning of the alimentary canal'' [@SchmidtRhaesa2012] and may be regarded as equivalent to the priapulan foregut, in a permanently everted configuration [@Nebelsick1993]^n^nThe kinorhynch mouth cone exhibits (typically four) rings of stylets; the outer stylets sit posterior of the boundary between the mouth cone and the buccal epithelium [@Nebelsick1993]; they do not represent Zone II armature, contra @ConwayMorris1977. Their status as articulated spinose outgrowths makes it unlikely that they correspond to the cuticular thickenings of loriciferan oral ridges. Inner oral styles [@Nebelsick1993; @Neuhaus2002icb] occur on the buccal epithelium; unusually, their tips are directed toward the mouth opening. The styles are interpreted as Zone III armature; their position anterior to the pharyngeal bulb (cf. loriciferan placoids) suggests that they denote medial rather than distal rings.^n^nThe primary spinoscalids surround the base of the mouth cone, and thus correspond in position to Zone II elements. Their interpretation as distinct from the Zone I elements of the introvert is supported by their distinct structure, and their different spatial position: the primary spinoscalids define the boundaries between zones of introvert teeth [@SchmidtRhaesa2012; @Herranz2016za; @Neuhaus2002icb].^n^nIn loriciferans [@Neves2016za], the ring(s) of trichoscalids are considered homologous with those of kinorhynchs, and thus a feature of the neck rather than the introvert (Zone I) armature. The armature of the introvert (spinoscalids and clavoscalids) is considered to correspond to Zone I.^n^nThough taxa may display various reinforcements of the proximal mouth cone, none exhibits distinct sclerites; Zone II must be identified as unarmed. The oral ridges (sometimes termed oral stylets [@Neves2016za]) are thickenings of the mouth cone rather than sclerites. 
^n^nThe oral stylets and (in Higgins larvae) oral teeth are distal sclerotizations of the mouth cone that surround the mouth opening [@Gad2005za]. The longitudinal stylets are internal within the mouth cone, but are associated with a closing apparatus [@Gad2005za; @Sorensen2022za]; they are ''enwrapped by an apical ring at the tip of the mouth cone'' [@Sorensen2022za]. These are the clearest candidates for Zone II elements. ^n^nA weakly sclerotized pharyngeal tube connects the mouth opening to the pharyngeal bulb [@Gad2005za]. In Pliciloricus [@Gad2005za], the buccal tube bears three symmetrical ''bracelets'' of ''prepharyngeal'' armature, which supports the buccal tube [@Neves2013fz]. These are external to the buccal channel and are thus part of the mouth cone structure rather than scalids of the pharynx itself.^n^nThe inner surface of the pharyngeal bulb bears transverse rows (three in Pliciloricus [@Gad2005za]; five in Armorloricus [@Kristensen2004cbm]; multiple in larvae, including shira larvae, of Patuloricus [@Sorensen2022za]) of placoids, which surely correspond to Zone III.^n'; + TEXT CHARACTER=9 TEXT='Many heterotardigrades have a triangular proboscis, the proboscis being the combination of introvert and pharynx.^n^nCharacter 62 in @Kihm2023.'; + TEXT CHARACTER=10 TEXT='After transformation series 1 in @Wills2012. The introvert is the region of the trunk that corresponds to the Zone I armature zone in the scheme of @ConwayMorris1977. (Zones II and III are on the pharynx, which is often termed the ‘mouth cone’ in the priapulid and kinorhynch literature.)^nTaxa without such a region, or where the region is extremely short (as in Scathascolex and Wronascolex), do not have an invaginable introvert. This character cannot readily be applied to taxa with a non-terminal mouth.^nThe introvert of Eokinorhynchus is partly inverted in some specimens, fully everted in others; the maximum extent of its invagination is unknown (Zhang et al. 
2015).^nCoded as ambiguous in Sirilorica (Peel et al. 2013); introvert never seen invaginated but sample size insufficient to determine whether this was not biologically possible.^nAn introvert is not present in adult Chordodidae (Poinar Jr. and Doelman 1974).^nAmbiguous in Xiaoheiqingella as the introvert is not retracted in any specimen (Han et al. 2004; Huang et al. 2004b; Han and Hu 2006; HU et al. 2017) ^nSeemingly invaginable to some extent in Chalazoscolex; unclear in Xystoscolex (Conway Morris and Peel 2010)?^nInvaginable in tardigrades (Guidetti et al. 2013b)^nThe loriciferan introvert can be telescopically retracted inside the lorica, but not inverted (Kristensen 1983), so these are coded as not invaginable.^nPartially inversible in Aysheaia (Whittington 1978)'; + TEXT CHARACTER=11 TEXT='After transformation series 1 in Wills et al. (2012). It is not clear how Wills established that the introverts of Louisella and Selkirkia/Paraselkirkia could not be fully retracted; as such these taxa are left ambiguous.^nAmbiguous in Scathascolex (Smith 2015)'; + TEXT CHARACTER=12 TEXT='Proposed by Nielsen (2001, 2012) as a synapomorphy of kinorhynchs, loriciferans and extant priapulids.'; + TEXT CHARACTER=13 TEXT='Trichoscalids are scalids that occur posterior to the last spinoscalid ring of the introvert in kinorhynchs, distinguished from other scalids morphologically, by their ‘hairy’ appearance, and positionally, by the gap between them and the Zone I scalids (Neves et al. 2016). They are listed as features of the neck as trichoscalid plates, where present, are connected to the placids (Sørensen et al. 2015). Wills et al. 
(2012) considered these as a separate ring of the Zone I armature (see their transformation series 11), but they are here treated separately.^nThe fringed tips of sclerites in Eolorica (Harvey and Butterfield 2017) are taken to identify the presence of trichoscalids.^n^nKinorhynch trichoscalids are innervated by longitudinal introvert nerves that extend from the forebrain, with one nerve corresponding to each trichoscalid (and the subsequent introvert scalids) [@Nebelsick1993z]. There is an analogy here with the two rings of six mechanoreceptory labial papillae of the nematode introvert, which are also innervated by six longitudinal nerves that seem to emanate from a cluster of cells immediately anterior of the circumoral brain [@SchmidtRhaesa2016]. Labial papillae occur in two rings, rather than the single ring of trichoscalids typical of kinorhynchs and loriciferans; but some kinorhynchs exhibit a second ring of accessory trichoscalids [@Herranz2013za]. By extension, we also consider the labial papillae of Aysheaia as possible homologues.^n^n^nCoded ambiguous in Sicyophorus, as there is a hint of spine-like structures at the base of the introvert (fig. 3a of Maas et al. 2007c) that could conceivably represent trichoscalids.^n^nThe row of backward pointing spines in Markuelia (Haug and Maas 2009) occupies an equivalent position and is coded as homologous.'; + TEXT CHARACTER=14 TEXT='After transformation series 11 in Wills et al. (2012).'; + TEXT CHARACTER=15 TEXT='Following character 5 of (Sørensen et al. 2015). Loriciferans (Neves et al. 2016) have fifteen trichoscalids (seven of which are sometimes ‘double’, interpreted as basally bifurcating as they attach to the same trichoscalid plate).'; + TEXT CHARACTER=16 TEXT='One ring in most kinorhynchs and loriciferans; two rings in nematodes. 
The four cephalic sclerites typically at the level of the amphids have a different symmetry and presumably innervation, so are not considered as homologues.'; + TEXT CHARACTER=17 TEXT='Trichoscalid plates are large plates to which trichoscalids attach. They are always present in loriciferans (Neves et al. 2016), and occur in certain kinorhynchs, where they connect at their posterior margin to the placids; see character 6 of (Sørensen et al. 2015)'; + TEXT CHARACTER=18 TEXT='Characters 5-6 in @Meldal2004. Trichoscalids (= labial papillae) comprise multiple articulated units in certain nematodes'; + TEXT CHARACTER=20 TEXT='Certain loriciferans exhibit doubled trichoscalids that share a common base [e.g. @Gad2005ode]'; + TEXT CHARACTER=21 TEXT='After transformation series 5 in Wills et al. (2012).^nAmbiguous in Tabelliscolex due to low preservational fidelity (Han et al. 2003b).^n^nAmbiguous in Shergoldana and Orstenoloricus because these taxa are presumed to represent larvae, meaning that the adult situation is unknown.^nCoded as ambiguous in Palaeoscolex piscatorum (Whittard 1953; Conway Morris 1997); not clear on what basis Wills et al. (2012) coded introvert features.^nThe specimens of Cricocosmia figured in (Hou et al. 2017) clearly show that there is a single circlet of Zone I armature.^nMultiple circlets are evident in Tylotites (Han et al. 2007c)^nDetailed references: Nanaloricus (Kristensen et al. 2007); Pliciloricus, (Heiner and Kristensen 2005); Echinoderes (Sørensen and Pardos 2008; Herranz et al. 2014); Paracentrophyes, (Sørensen et al. 2010); Campyloderes, (Neuhaus and Sørensen 2013); Centroderes, (Neuhaus et al. 2014); Zelinkaderes (Sørensen et al. 2007; Altenburger et al. 2015)^n^n(~) inapplicable; introvert absent or armature not comparable to priapulid proboscis zones^nAfter transformation series 5 in Wills et al. (2012).^nAmbiguous in Tabelliscolex due to low preservational fidelity (Han et al. 2003b).^nAbsent in Corynetis (Hu et al. 
2012).^nAmbiguous in Shergoldana and Orstenoloricus because these taxa are presumed to represent larvae, meaning that the adult situation is unknown.^nCoded as ambiguous in Palaeoscolex piscatorum (Whittard 1953; Conway Morris 1997); not clear on what basis Wills et al. (2012) coded introvert features.^nThe specimens of Cricocosmia figured in (Hou et al. 2017) clearly shows that there is a single circlet of Zone I armature.^nMultiple circlets are evident in Tylotites (Han et al. 2007c)^nDetailed references: Nanaloricus (Kristensen et al. 2007); Pliciloricus, (Heiner and Kristensen 2005); Echinoderes (Sørensen and Pardos 2008; Herranz et al. 2014); Paracentrophyes, (Sørensen et al. 2010); Campyloderes, (Neuhaus and Sørensen 2013); CEntroderes, (Neuhaus et al. 2014); Zelinkaderes (Sørensen et al. 2007; Altenburger et al. 2015)'; + TEXT CHARACTER=22 TEXT='Although the first three circlets of Meiopriapulus contain 25 sclerites, these do not define longitudinal rows of sclerites or a 25-fold symmetry of the introvert (Adrianov and Malakhov 2001)^nThe circlets of Maccabeus contain 25 elements apiece; the first circlet is interpreted as representing an amalgamation of the first three circlets (Adrianov and Malakhov 2001)^nNot so in Kinorhynchs (Herranz et al. 2013), or Kinonchulus (where it is the first one row that defines the symmetry) (Liu et al. 2014)'; + TEXT CHARACTER=23 TEXT='Scalids and pharyngeal teeth have been distinguished based on their orientation on the pharynx (Nielsen 2001 p. 332)^nCoded ambiguous in Tylotites as descriptions are ambiguous on this point (Han et al. 2003a, 2007c)^nDirected anteriad in Maccabeus (Por and Bromley 1974)'; + TEXT CHARACTER=24 TEXT='Nematomorphs have two (Nectonema?) or three (Gordiida) circlets (Schmidt-Rhaesa 1996). Kinonchulus has around seven. Shergoldana possibly has more than one.'; + TEXT CHARACTER=25 TEXT='After transformation series 10 in Wills et al. 
(2012).'; + TEXT CHARACTER=26 TEXT='Zone I sclerites arranged ‘radially’, in longitudinal or diagonal lines; see transformation series 6 in Wills et al. (2012). The nature of the radial arrangement – pentaradial or hexaradial – is not independent of the number of longitudinal armature rows (trans. ser. 84) and is thus not coded separately here. A radial arrangement is not apparent in Eokinorhynchus (Zhang et al. 2015); this does not seem to represent preservation and is taken as authentic.^nNot in rows in Nematomorphs: they have a 6-6-7 arrangement (Schmidt-Rhaesa 1996)^nThe symmetric arrangement of kinorhynchs (Herranz et al. 2013) and loriciferans (Kristensen et al. 2007) does not qualify as neat rows; the character aims to capture the regimented organization of priapulids. The armature of Ottoia and Selkirkia is in prominent diagonal rows, producing a quincunx arrangement (Smith et al. 2015). Inapplicable in Markuelia as the three rings preserved seem to correspond to those that define the prominent rows in priapulids (Dong et al. 2010).'; + TEXT CHARACTER=27 TEXT='After transformation series 6 in Wills et al. (2012). Diagonal in Eokinorhynchus (Zhang et al. 2015). Parallel in Nanaloricus, Pliciloricus (Neves et al. 2016). The regular arrangement of scalids in Eolorica (Harvey and Butterfield 2017) is suggestive of a row-wise arrangement, though the orientation of such rows cannot be determined.^nDiagonal in Laojieella (Han et al. 2006)^nAmbiguous in Eopriapulites; although sclerites occur in more-or-less transverse rows, helical ridges suggest that the underlying organization may be spiral (Liu et al. 2014)'; + TEXT CHARACTER=29 TEXT='This and the following transformation series attempt to extract the full phylogenetic information implicit in transformation series 8 of Wills et al. (2012).^nThe spines of Scolecofurca appear to have originally been cuticularized, based on images taken by Jean-Bernard Caron (Caron 2011). 
These depict simple posterior-directed spines, though finer subsidiary morphology is possible.'; + TEXT CHARACTER=30 TEXT='The solid sclerites of nematomorphs differ from the structures borne by scalidophorans (Schmidt-Rhaesa 1996). The construction of Zone II and Zone III armature elements is typically the same as that of Zone I elements, so this character statement stands for elements of all three zones.'; + TEXT CHARACTER=31 TEXT='Maccabeus has long and short sclerites in Zone I (Por and Bromley 1974) so is coded as ambiguous.'; + TEXT CHARACTER=32 TEXT='Coded as spinose in loriciferans (Neves et al. 2016)^nLimited information is available from Nectonema (Schmidt-Rhaesa 1996)'; + TEXT CHARACTER=33 TEXT='In certain kinorhynch genera (here, Cateria), primary spinoscalids bifurcate at their base, giving the appearance that their number is twice its true figure (Sørensen et al. 2015).'; + TEXT CHARACTER=34 TEXT='Dentate elements bear secondary denticles; pectinate elements bear a fine comb-like fringe'; + TEXT CHARACTER=35 TEXT='Spinoscalids and clavoscalids of many loriciferans, including Eolorica, bear articulated joints (Neves et al. 2016; Harvey and Butterfield 2017)^nSclerites of Dracoderes are also articulated (Sørensen et al. 2012b)'; + TEXT CHARACTER=36 TEXT='Spinoscalids of many loriciferans, including Eolorica, bear small subsidiary setules (Neves et al. 2016; Harvey and Butterfield 2017)'; + TEXT CHARACTER=37 TEXT='The sclerites of Priapulopsis are telescopic (Storch et al. 1995).'; + TEXT CHARACTER=38 TEXT='The Zone I sclerites of Meiopriapulus bear a pectinate hood (Morse 1981)'; + TEXT CHARACTER=39 TEXT='The primary spinoscalids of certain kinorhynchs have intrinsic muscles [@Herranz2021z]'; + TEXT CHARACTER=40 TEXT='Modified from transformation series 9 in Wills et al. (2012). 
In order to capture homology due to a radial arrangement, the armature number is formulated a number of transformation series, each corresponding to a common factor and thus a potential homology of symmetry. Priapulids, with 25 rows, also exhibit pentaradial symmetry. (A taxon could conceivably exhibit 30-fold symmetry, which would have both pentaradial and hexaradial symmetry.) In priapulids, the symmetry of the pharynx is defined by the number of elements that comprise the first three circlets and, hence, defining the number of longitudinal rows of elements on the introvert.^nTaxa with 25 scalid rows: Recent Priapulidae, Halicryptidae, Tubiluchidae and Maccabeus (Adrianov and Malakhov 2001), Xiaoheiqingella, Yunnanpriapulus (Huang et al. 2004b), Markuelia (Dong et al. 2010)^nThe first three rows contain 8 + 9 + 8 = 25 scalids in Meiopriapulus even if they do not define the symmetry (Adrianov and Malakhov 2001)^nEximipriapulus: “More than 30” (Ma et al. 2014b)^nfewer than 25 – c. 10 on each side – in Sicyophorus and Palaeopriapulites (Maas et al. 2007c)^n6+6+7=19 in nematomorphs (Conway Morris 1977a) – but does this reflect an underlying 6-fold symmetry (see below)?^nShergoldana’s armature comprises a ring of cushion-like folds, each bearing a single tooth. Each fold is associated with two round humps, and a further round hump occurs between each pair of folds (Maas et al. 2007a). This arrangement suggests a six-fold symmetry.^nThe armature of kinorhynchs is arranged in a pentaradial fashion (Sørensen et al. 2008)^nA six-fold symmetry is observed in Chordodes, Shergoldana (Maas et al. 2007a), larval nematodes (despite lack of Zone I armature), and loriciferans (Sørensen et al. 2008). In Halicryptus, the hatching larva has seven-fold symmetry, becoming eightfold in the Higgins larva (Storch and Higgins 1991; Janssen et al. 
2009)^nThe six oral papillae of Aysheaia (Whittington 1978) and Tardigrada (Urban 2013) are taken to indicate a 6-fold pharyngeal symmetry, reflected by the six oral plates of Actinarctus (Boesgaard and Kristensen 2001) and Halobiotus (Biserova and Kuznetsova 2012). The six denticles of Sirilorica (Peel 2010; Peel et al. 2013) are interpreted in the same way'; + TEXT CHARACTER=41 TEXT='Treated as a neomorphic character, contingent on five-fold symmetry, analogous to an ordered character (not five-fold / five-fold / twentyfive-fold) reflecting progressively higher degrees of organization.'; + TEXT CHARACTER=43 TEXT='The dorsal stylet (large dorsal tooth of Kinonchulus) arises outside the pharynx, as revealed during moulting and by the possession of its own set of musculature (p 191 Bird and Bird 1991); it is not considered part of the pharyngeal armature. Its dorsal position indicates that it is not homologous with the (ventral) tardigrade stylet.'; + TEXT CHARACTER=44 TEXT='We define a pre-oral (‘buccal’) chamber as a region enclosing the mouth and formed by the outgrowth of surrounding body tissue – as observed in modern onychophorans [@Martin2014]. The character is coded as absent where the oral region is clearly displayed externally, as in Tardigrada, euarthropods from Kerygmachela crownwards (where the position of the mouth is marked by the expression of an ‘oral cone’), Jianshanopodia [@Vannier2014], Aysheaia [@Whittington1978], and Siberion [@Dzik2011]. It is coded as ambiguous in other taxa, as the location of the original mouth is unclear.^n^nCharacter 8 in @Smith2015.'; + TEXT CHARACTER=45 TEXT='The buccal tube of Nanaloricidae, part of the pharynx, bears annulations; these are absent in Pliciloricidae [@Neves2016za]'; + TEXT CHARACTER=46 TEXT='After transformation series 4 in Wills et al. 
(2012).^nScathascolex and Wronascolex are coded as ambiguous as available material is insufficient to determine the invagibility of the pharynx.^nA pharynx, permanently inverted, is present in Chordodes; it is degenerate in adults and there is pharyngeal armature is not recorded (though it exists in larval stages) (Bolek et al. 2010).^nEversible in Xystoscolex; and seemingly Chalazoscolex (Conway Morris and Peel 2010)^nAn everted pharynx can be observed in some specimens of Aysheaia (e.g. USNM 58655; Whittington 1978)^n^n---^nThe pharynx of priapulans and palaeoscolecids can be everted [@ConwayMorris1977], whereas the panarthropod foregut is permanently inverted. No lobopodians exhibit complete eversion of the pharynx, even if some taxa display a certain degree of flexibility: as perhaps evident in the proboscis of Onychodictyon ferox [@Ou2012] or the presumed suction-feeding mouthparts of anomalocaridids [@Daley2012]. Taxa whose mouth region is unknown or is known from a limited sample size are coded ambiguous to reflect the possibility that eversion was possible but not displayed in the available material.^n^nCharacter 12 in @Smith2015.'; + TEXT CHARACTER=47 TEXT='After transformation series 4 in Wills et al. (2012).^nScathascolex and Wronascolex are coded as ambiguous as available material is insufficient to determine the invagibility of the pharynx. Scolecofurca is viewed as having its narrow pharynx in a minimally everted position.^nLoriciferans are coded as permanently everted as the mouth cone remains everted even when the introvert is retracted (Neves et al. 2013)'; + TEXT CHARACTER=48 TEXT='Cf. WTS85.^nCoded as absent in Corynetis following the interpretation of (Huang et al. 2004a). Movement of pharynx is interpreted as having a role in locomotion in Kinonchulus (Riemann 1972), but it is not clear whether this employs peristalsis.. Both Kinorhyncha (Neuhaus and Higgins 2002) and Loricifera employ their introvert in locomotion (Sørensen et al. 
2008), though seemingly through the movement of individual scalids rather than peristaltic contraction of the entire introvert.'; + TEXT CHARACTER=49 TEXT='WTS22^nCoded ambiguous in Scathascolex as there are insufficient specimens to determine whether the pharynx is preserved in its fully everted position. Coded as complete or incomplete in Eokinorhynchus as a specimen is preserved with eversion beyond proximal teeth (Zhang et al. 2015)'; + TEXT CHARACTER=50 TEXT='Cf. WTS83. ^nThe large size of the pharynx underlies the proposed ''Megintroverta'' clade uniting Acanthopriapulus with Priapulus and Priapulopsis [@Lemburg1999; @SchmidtRhaesa2022za]. This formulation attempts to avoid attaching undue significance to subtle variations in introvert length. It is phrased as ‘foregut’ rather than ‘introvert’ to recognize the proposed homology between the panarthropod foregut and the cycloneuralian introvert. Scored as ambiguous (0, 1) where neither category applies.'; + TEXT CHARACTER=51 TEXT='The distal pharynx of nematomorphs can be retracted into the body, but not inverted [@SchmidtRhaesa2012]. The mouth cone of kinorhynchs and loriciferans corresponds to the anterior pharynx; it can be withdrawn, but not inverted.'; + TEXT CHARACTER=52 TEXT='WTS86. A triradiate introvert is a putative synapomorphy of Loricifera, Priapulida + Kinorhyncha (Yamasaki et al. 2015). The pharynx is also triradiate in most nematodes (Altun and Hall 2017), including Kinonchulus (Riemann 1972) and in some larval onychophorans (Schmidt-Rhaesa et al. 1998)^nTriradiate in Pycnophyes and Kinorhynchus, but round in Cyclorhagida (Neuhaus and Higgins 2002)'; + TEXT CHARACTER=53 TEXT='See transformation series 12 in Wills et al. (2012).^n^nZone II is considered to represent the base of the pharynx and the position of the stomodeum. 
As such, the proximal circlet of Zone II sclerites represent circumpharyngeal structures, which are coded as homologous with the radial mouthparts of Hallucigenia [@Smith2015] and the circumoral apparatus of anomalocaridids [@Smith2015; @Li2024].^n^nThe grasping denticles of Sirilorica (Peel 2010; Peel et al. 2013) are interpreted as circumpharyngeal.^n^nThe six (?) peri-oral structures of Eolorica (Harvey and Butterfield 2017) are interpreted as robust oral ridges, preserved where the accompanying mouth cone has decayed.^nCoded as ambiguous in Tylotites (Han et al. 2003a, 2007c) as it is possible that the distinct and forward-oriented ring of introvert hooks corresponds to the spines at the base of Zone II.^nA single ring of elongate spines appears to gird the base of Zone II in Cricocosmia (Hou and Bergström 1994; Han et al. 2007b)^nThe buccal papillae of Halicryptus are assumed to correspond to the Zone II elements; these are not sclerotized and are irregularly distributed (Merriman 1981; Storch et al. 1990; Adrianov and Malakhov 2001). A similar condition occurs in Tubiluchus (Calloway 1975)^nThe ‘double tentacles’ of Maccabeus are described as surrounding the mouth, but the eight trigger spines sit directly on the circumoral nerve ring (Por and Bromley 1974). On this basis, the latter are homologized with the Zone II elements; the tentacles are considered to represent modified Zone I spines. ^nScolecofurca appears to have elongate Zone II elements visible at its anterior margin (Caron 2011)^nCoded ambiguous for Fieldia; just a hint of some form of structure around the base of the everted Zone II (Pers Obs of ROM 93-1678A)^nNot described in Guanduscolex (Hu et al. 2008), though it is possible that these have been overlooked on account of preservation (cf. 
the faint preservation in Mafangscolex mannus).^nXystoscolex has prominent ridge-like features at the boundary of the introvert and the pharynx (Conway Morris and Peel 2010); these presumably correspond to Zone II armature, though their detailed morphology remains ambiguous^n^n---^nTODO Review in Aysheaia and Siberion whether the oral papillae might correspond to Zone I^n^n---^nThe mouth of many ecdysozoans is surrounded by radially-arranged cuticular or sclerotized structures. In priapulans and other cycloneuralians these are typically conical spines, arranged centripetally when the pharynx is contracted and centrifugally when the pharynx is everted [@ConwayMorris1977]. In basal panarthropods such as Aysheaia, Kerygmachela, Hallucigenia and Jianshanopodia, the structures are regular non-sclerotized lamellae, aciculae or plates [@Whittington1978; @Budd1998trse; @Vannier2014; @Smith2015]; among anomalocaridids the plates are sclerotized and differentiated [@Daley2012; @Daley2013p; @Daley2013jsp]. (Although the three or four prominent plates in the anomalocaridid oral cone are differentiated to give rise to bilateral symmetry [@Daley2012], the underlying radial arrangement of these plates is clear: some plates straddle the midline, and if rotated by 90° the smaller plates are equivalent to their counterparts.) Tardigrades bear circumoral lamellae [@Budd2001za; @Guidetti2012; @Mayer2013po]. Other lobopodians are coded as ambiguous; euarthropods and onychophorans are coded as absent [discussed in @Smith2015, char. 9].^n^nCharacter 9 in @Smith2015; character 25 in @Yang2015.'; + TEXT CHARACTER=54 TEXT='This character differentiates circumoral structures with a small contact area with the body (e.g. coronal spines of priapulomorph worms) from the plate-like circumoral structures that have a large surface area in contact with the body - found commonly in stem euarthropods (e.g. 
Parapeytoia, Hurdia)'; + TEXT CHARACTER=55 TEXT='Zone II sclerites form a ring around the mouth of e.g. priapulans, but are paired bilateral series in Omnidens. As the underlying symmetry mechanism is assumed to be conserved even in the absence of armature, taxa without Zone II sclerites are coded ambiguous.'; + TEXT CHARACTER=56 TEXT='This character distinguishes the simple organization of the mouth apparatus in Hallucigenia [@Smith2015] and Pambdelurion [@Budd1998ar; @Vinther2016] from the more complex mouthparts of anomalocaridids [@Daley2009; @Daley2012; @Daley2014]. We score Megadictyon, Schinderhannes and Jianshanopodia as uncertain to reflect their mouthparts'' poor preservation [@Liu2006; @Liu2007az; @Kuhl2009]. The character is treated as neomorphic to reflect the increasing gradient of complexity reflected by differentiation.^n^nCharacter 10 in @Smith2015.'; + TEXT CHARACTER=59 TEXT='Node-bearing plates are considered by @Liu2018nsr to represent a synapomorphy of Anomalocarididae + Amplectobeluidae. @Kihm2023 interpret the first band of teeth in the tardigrade oral cavity as homologous to these nodes. Because they occur on the inner rather than the outer surface of the plates, we prefer to ascribe such nodes to a separate transformation series.'; + TEXT CHARACTER=60 TEXT='@Dewel2006 notes that the first band of teeth in the tardigrade oral cavity occur on the circumoral plates, rather than forming part of the buccal tube.^n@Kihm2023 interpret the teeth as homologous with the nodes on anomalocaridid + amplectobeluid circumoral plates, which we consider unlikely as these occur on the outer face of the plates, whereas tardigrade elements occur on the inner face.'; + TEXT CHARACTER=61 TEXT='Panarthropods express a considerable diversity of circumoral structures, which represent a symplesiomorphic feature of Ecdysozoa as a whole (e.g. Edgecombe 2009). Various lobopodians bear oral papillae/lamellae (e.g. 
Aysheaia (Whittington 1978); Kerygmachela (Budd 1993, 1998a); Opabinia (Whittington 1975)); a similar feature occurs in the oral cone of Tardigrada (Dewel and Eibye-Jacobsen 2006; Guidetti et al. 2012). Pambdelurion (Budd 1998b) and anomalocaridids (e.g. Daley et al. 2009; Daley and Edgecombe 2014) exhibit radially arranged plates that together form a mouth apparatus (Daley and Bergström 2012). We code the nature of the circumoral structures in Megadictyon and Jianshanopodia (Liu et al. 2006, 2007; Vannier et al. 2014) as uncertain; in the former case, the type material does not unequivocally exhibit a plate-like nature; in the latter, the documentation of the plates is inconclusive. The transformation series is scored as inapplicable in Onychophora because the bilaterally symmetrical lip papillae are demonstrably not homologous with the radially symmetrical structures of other taxa (Eriksson and Budd 2000; Martin and Mayer 2014)'; + TEXT CHARACTER=62 TEXT='After transformation series 12 in Wills et al. (2012).^nThe single circlet of large denticles in Sirilorica (Peel et al. 2013) is interpreted as the proximal circlet of Zone II. There are at least six denticles; a seventh may be obscured by incomplete preservation (Peel et al. 2013). Nanaloricidae, and most species of Pliciloricus, bear eight oral ridges (though ranging from six to twelve in Pliciloricidae) (Neves et al. 2016). The six peri-oral sclerites preserved in Eolorica (Harvey and Butterfield 2017) are taken to represent the full complement. Six in Actinarctus (Boesgaard and Kristensen 2001), Halobiotus (Biserova and Kuznetsova 2012), Aysheaia. Twelve liplets corresponding to six lobes in Anatonchus (Borgonie et al. 1995), so coded as homologous to six as this is the underlying organization. c. Eighteen in Eopriapulites (Liu et al. 2014)'; + TEXT CHARACTER=63 TEXT='After transformation series 13 in Wills et al. 
(2012).^nMaccabeus is coded ambiguous; the spines have a length:width ratio of 4:1 if the width is measured at its maximum in the basal region, but 12:1 if measured at the base of the elongate projection (Por and Bromley 1974).'; + TEXT CHARACTER=64 TEXT='The spines of Sirilorica seem to have multiple cusps (Peel et al. 2013), as do the trigger spines of Maccabeus (Por and Bromley 1974).^nShergoldana is coded as ambiguous as it is not clear whether the ‘cushion-like folds’ (Maas et al. 2007a) are part of the spine or represent soft tissue.'; + TEXT CHARACTER=65 TEXT='The inner face of plates of e.g. Omnidens bear inward-directed spines.'; + TEXT CHARACTER=66 TEXT='The inner surface of the circumoral plates of Anomalocaris, Peytoia and Hurdia bear multiple spinose projections [@Daley2012; @Daley2013jsp], but the equivalent face of plates in priapulans and Hallucigenia sparsa lacks projections [@Smith2015].^n^nCharacter 16 in @Zhang2016.'; + TEXT CHARACTER=67 TEXT='Where present, the oral ridges of loriciferans (Neves et al. 2016) are interpreted as Zone II sclerites fused to the introvert. In other taxa, only the proximal part of the sclerites is attached to the trunk. ^nIn certain kinorhynchs (here, Campyloderes) the outer oral stylets (= Zone III sclerites) lie flat against the introvert, to which they are fused (Sørensen et al. 2015).'; + TEXT CHARACTER=68 TEXT='WTS20^nSlightly narrower in Corynetis (Huang et al. 2004a; Hu et al. 2012)^nNot substantially wider in Cricocosmia (Hou and Bergström 1994)^nNot wider in Halicryptus (Merriman 1981; Shirley and Storch 1999; Adrianov and Malakhov 2001)'; + TEXT CHARACTER=69 TEXT='Pharyngeal teeth in Cricocosmia do not extend to the circumoral spines of the introvert [e.g. @Cong2017]. Similarly in Hallucigenia there is an absence of pharyngeal teeth in proximity to the circumoral structures [@Smith2015]. In tardigrades [e.g. 
@Hansen2002], Hurdia [@Daley2009] and Cambroraster [@Moysiuk2019], the pharyngeal teeth are directly adjacent to the radially arranged circumoral structures.'; + TEXT CHARACTER=70 TEXT='The proximal pharynx of Spinoloricus forms a short pleated ring [@Heiner2007; @Neves2016za]. A similar reinforced region occurs in Armorloricus, referred to as ''basal plate row of mouth cone base'' [@Kristensen2004cbm].'; + TEXT CHARACTER=71 TEXT='The loriciferan mouth cone bears typically six to twelve external (outer) oral folds or ridges, which are thickenings of the cuticle [@Neves2016za]'; + TEXT CHARACTER=72 TEXT='Number of oral ridges present in adults'; + TEXT CHARACTER=73 TEXT='The oral ridge bifurcates in Nanaloricidae [@Neves2016za]'; + TEXT CHARACTER=74 TEXT='In certain taxa, alternating ridges are differentiated into two lengths'; + TEXT CHARACTER=75 TEXT='In certain tardigrades, "the mouth ring appears to be demarcated from the buccal tube by a zone of attenuated or fenestrated cuticle" [@Dewel2006, fig. 11]'; + TEXT CHARACTER=76 TEXT='The Zone III armature trans. series has been reformulated from Wills et al. (2012, trans. ser. 15–19) to better capture possible homologies between similar structures, and to avoid treating different character states as homologous.^n^nMany tardigrades exhibit three rows of oral teeth in their buccal apparatus [@Michalczyk2003]. We follow @Kihm2023 in treating the first row of teeth at nodes of the circumoral plates, even if their equivalence with potential homologues in anomalocaridids is difficult to demonstrate conclusively. ^n@Kihm2023 contend that the second and third rows of oral teeth cannot correspond to Zone III elements in other ecdysozoans, because the buccal tube is not part of the pharynx. 
Whereas the buccopharyngeal apparatus is clearly divided into a buccal tube and a pharyngeal bulb, both components of the apparatus derive from the stomodeum [@Massa2024], and are thus together homologous with the pharynx of other ecdysozoan worms. Treating the circumoral lamellae as equivalent to Zone II teeth implicitly accepts the homology between these features. We thus treat the buccal tube as a modification of the component of the proximal region of Zone III, which corresponds to the proximal pharynx in other ecdysozoan worms; and the tardigrade pharyngeal bulb as equivalent to the distal region of Zone III.^n^n^nAfter transformation series 3 in Wills et al. (2012). Priapulids’ Zone III sclerites, and the microspines (proventricular acanthae) of the arthropod foregut, are included here. The presence of comparable features in the foregut of Jianshanopodia (Vannier et al. 2014) demonstrates that comparable features can be observed in Cambrian lobopodians given suitable preservation; lobopodians are thus coded as ambiguous for this transformation series. The features are absent in Onychophora (Elzinga 1998); electron-dense thickenings in the tardigrade pharynx (Dewel and Clark 1973b) are tentatively considered to represent armature that has been reduced in size due to the miniaturization of the tardigrade body.^nCorresponds to the inner rows of teeth in Hurdia (Transformation series 9 in Daley et al. (2009). ^nOral stylets are occasionally present (three, four or six) in Pliciloricus (Neves et al. 2016); eight are present in Nanaloricus mysticus (Kristensen 1983).^nPresence of armature in Priapulites (Schram 1973) indicated by robust preservation of pharyngeal region.^nThe adult condition is used, except where elements are present in larvae only.^nSimple spinose elements within the oral circlet of Eopriapulites (Liu et al. 
2014) are interpreted as elements of Zone III.^n^nIn nematomorphs, the proboscis contains cuticular spines (=hooks, interpreted as introvert elements); and a ''stylet'' [@SchmidtRhaesa2012]. The stylet comprises three dense cuticular rods; the surface contains series of individual cuticularized teeth [@SchmidtRhaesa2012] which we treat as Zone III elements. The opening in the centre of the stylet corresponds to a gland duct; @SchmidtRhaesa2012 consider it unlikely to correspond to a mouth. As no other mouth position is identified, we interpret this as the likely position of the mouth.^n^n---^nIn many taxa the pharynx is lined with cuticular sclerites or "pharyngeal teeth". Priapulans have an eversible pharynx lined with pharyngeal teeth [@vanderLand1970; @ConwayMorris1977]. Hallucigenia sparsa has a structurally differentiated (narrower) pharynx lined with acicular teeth [@Smith2015]. Jianshanopodia bears a series of pharyngeal teeth with multiple cusps [@Liu2006; @Vannier2014]. This fossil, perhaps alongside Omnidens [@Hou2006], resembles a longitudinally extended Hurdia mouthpart; the inner rows of teeth in Hurdia are correspondingly interpreted as pharyngeal teeth [@Daley2009], 2013a). Sclerotized teeth have also been reported in the foregut of Paucipodia [@Hou2004], although the nature and distribution of the teeth is not clear from the fossil material. Onychophorans bear a differentiated pharynx with an oesophageal constriction, but this is unornamented [@Elzinga1998]. ^n^nMost tardigrade taxa exhibit two to five rows of teeth (= mucrones) caudally to their circumoral lamellae [@Pilato1972; @Schuster1980; @Hansen2002; @Dastych2003; @Guidetti2012]; some have a further row of sclerotized transverse ridges (= baffles). Following @Khim2023 we consider these oral teeth to be a separate innovation, and thus not homologous to pharyngeal teeth. 
Coded as ambiguous in all other taxa due to inadequate preservation.^n^nCharacter 13 in @Smith2015.'; + TEXT CHARACTER=77 TEXT='Priapulans'' pharyngeal teeth exhibit a range of morphologies but always bear multiple cusps [@vanderLand1970; @Smith2015p]. Hallucigenia sparsa has acicular teeth that come to a single point [@Smith2015]. The teeth of Hurdia and Jianshanopodia have multiple cusps [@Daley2009; @Daley2013jsp; @Vannier2014]; tardigrade teeth do not [@Pilato1972; @Hansen2002; @Schuster1980; @Dastych2003].^n^nCharacter 14 in @Smith2015.'; + TEXT CHARACTER=78 TEXT='The situation in Zone III is assumed also to apply to Zone I armature, thus this character statement stands for both.'; + TEXT CHARACTER=79 TEXT='After transformation series 3 in Wills et al. (2012).'; + TEXT CHARACTER=80 TEXT='Zone III sclerites form rings or whorls around the pharynx in e.g. priapulans, but are paired bilateral series in Omnidens [@Li2024] and some tardigrades [@Michalczyk2003]. They surround the pharynx in a haphazard arrangement in taxa such as Corynetis [@Hu2012]. As the underlying symmetry mechanism is assumed to be conserved even in the absence of armature, taxa without Zone III sclerites are coded ambiguous.'; + TEXT CHARACTER=81 TEXT='In contrast to the uniform distribution of sclerites in priapulans and total-group euarthropods, the pharyngeal teeth in Hallucigenia sparsa seem to occupy one or two longitudinal rows and do not cover the entire surface of the pharynx [@Smith2015].^n^nCharacter 15 in @Smith2015.'; + TEXT CHARACTER=82 TEXT='After transformation series 14 in Wills et al. (2012). ^nThe oral stylets of loriciferans, where present, occur in a single circlet (Neves et al. 2016).^nTwo bands of teeth occur in tardigrades, plus Band I, interpreted herein as representing nodes of the circumoral lamellae (after @Kihm2023). Band II often contains irregularly distributed elements.^n^nAnatonchus (Nematoda) has four circlets (Borgonie et al. 
1995), in addition to three teeth (coded as distal elements)^nMaotianshania has around 12 circlets (Hou and Bergström 1994)^nEokinorhynchus has at least two circlets; the full number is unknown as the pharynx may not be fully everted (Zhang et al. 2015).^nKinorhynchs have four circlets within Zone III: one ring of outer oral stylets, two rings of inner oral styles, and the helioscalids (Sørensen and Pardos 2008)^nOttoia has c. 40 (Smith et al. 2015)^nCorynetis has c. 60, if the armature is arranged in circlets rather than irregularly (Hu et al. 2012)^nHallucigenia is coded as ?; its sclerites are not strictly arranged in circlets, and occur in a large number of rows (Smith and Caron 2015).^nHalicryptus has multiple circlets, with the number increasing during growth (Adrianov and Malakhov 2001)^nMany circlets in Maccabeus (Por and Bromley 1974)^nc. 50 in Priapulus (Adrianov and Malakhov 2001)^nAround 12 clear circlets in Tubiluchus, with further more distally (Kirsteuer 1976)^nAt least thirteen in Priapulopsis (van der Land 1970)^nVariable numbers reported in Meiopriapulus, possibly changing with ontogeny (Sørensen et al. 2012a)^nc. 20 in Laojieella (Han et al. 2007a)^nSeemingly a single circlet in Ancalagon and Fieldia (Conway Morris 1977a) (and Pers. Obs)'; + TEXT CHARACTER=83 TEXT='A distinct pentaradial symmetry is evident “only in the [first] eight circlets” in Halicryptus (Adrianov and Malakhov 2001)^nSeven (including the proximal circlet) in Maccabeus (Por and Bromley 1974)^nWTS19'; + TEXT CHARACTER=84 TEXT='WTS18^nEight in Meiopriapulus (Sørensen et al. 2012a)^nFive in Tubiluchus lemburgi (Schmidt-Rhaesa et al. 2013), reflected by the five basal papillae (which represent a distinct circlet from the remainder of the pharyngeal teeth).^nSix in Namnaloricus, Pliciloricus (Neves et al. 
2016), Kinonchulus ^nMaotianshania has approximately 15 (Hou and Bergström 1994); coded as ?^nCoded ambiguous in Cricocosmia (Hou and Bergström 1994)?^nSixteen in Eokinorhynchus (Zhang et al. 2015)^nOttoia approx. 24–30; probably not a constant number between specimens (Smith et al. 2015); coded ambiguous^nThe dorsal outer oral stylet in Kinorhyncha is understood to be secondarily reduced; the decaradial arrangement of the elements being clearly evident (Nebelsick 1993); these taxa are thus coded as ten.^nTwelve reported in Kinonchulus, though six drawn; possibly six are the Zone II ‘hooks’.^nAnatonchus > 40 (Borgonie et al. 1995), coded ambiguous ^nFive in Halicryptus (Adrianov and Malakhov 2001) and Priapulopsis (van der Land 1970)^nc. Sixteen in Fieldia? (pers. obs)'; + TEXT CHARACTER=85 TEXT='The proximal circlet in kinorhynchs has an obvious relationship to the subsequent circlets. This character distinguishes this arrangement, whilst still recognizing the underlying five-fold symmetry shared with other taxa.'; + TEXT CHARACTER=87 TEXT='Following Sørensen et al. 2015 character 1 (Sørensen et al. 2015), the outer oral stylets of neocentrophyid and dracoderid kinorhynchs alternate between prominent and less well-developed sizes; see for example Paracentrophyes (Sørensen et al. 2010)'; + TEXT CHARACTER=88 TEXT='This and the following transformation series have been modified from trans. ser. 15 in Wills et al. 2012 to better capture possible homologies between sclerite morphologies.'; + TEXT CHARACTER=90 TEXT='Pectinate projections can occur from the fringe of a central cone (cf. trans. ser. 15, state 4 in Wills et al. 2012) or can occur along the margin of a scalid that lacks a central spine (cf. trans. ser. 15, state 6 in Wills et al. 2012). There is a gradation from a pectinate fringe (cf. Ottoia) via a multispinose situation (cf. Selkirkia Type A) through multiple spines (cf. 
Priapulopsis); as such, all of these are coded in a single character statement.^nA basal fringe is present in the outer oral stylets of certain kinorhynchs – Paracentrophyes (Sørensen et al. 2010), Dracoderes (Sørensen et al. 2012b)'; + TEXT CHARACTER=91 TEXT='The outer oral stylets of many kinorhynchs (though not certain Pycnophyidae) consist of two to three rigid articulating units (Sørensen et al. 2015)'; + TEXT CHARACTER=92 TEXT='Reduction relates to size; this transformation series therefore applies whether or not the *morphology* of the proximal circlet is differentiated'; + TEXT CHARACTER=93 TEXT='After transformation series 18 in Wills et al. (2012).^nDifferentiated circlets are depicted in the drawn reconstruction of Corynetis (Huang et al. 2004a), but not formally described or depicted; this taxon is coded as ambiguous.^nDifferentiated in Halicryptus, Priapulopsis (Conway Morris 1977a).^nScored as inapplicable in loriciferans (Neves et al. 2016), whose single circlet cannot be meaningfully coded as ‘proximal’, ‘medial’ or ‘distal’.^nScored as differentiated in Ancalagon and Fieldia as the medial Zone III armature is massively reduced or absent in these taxa (Conway Morris 1977a)^nThe circlet of five sclerotized trabeculae in Maccabeus (Por and Bromley 1974) are interpreted as a reduced circlet of Zone III teeth.^nIn Priapulopsis bicaudatus, the first ring of teeth feature a reduced central spine (van der Land 1970)^nDifferentiated in Ottoia and Selkirkia (Type A teeth) (Conway Morris 1977a; Smith et al. 2015)'; + TEXT CHARACTER=94 TEXT='A raised band lies proximal to bands II and III of the tardigrade oral cavity armature [@Michalczyk2003]'; + TEXT CHARACTER=95 TEXT='Fieldia and Ancalagon only possess a single ring of Zone III teeth (Conway Morris 1977a), which I consider to represent a differentiated proximal circlet; the middle circlets are perhaps reduced or indistinct in the fossil material. 
The situation in loriciferans is taken to be the same: the Zone III armature comprises a single ring (typically) of oral ridges (i.e. a proximal circlet) and a single ring (where present) of oral stylets (a single distal circlet) (Neves et al. 2016).^nKinorhynchs have three rings of simple spinose styles (Sørensen and Pardos 2008; Herranz et al. 2014)'; + TEXT CHARACTER=96 TEXT='Multiple spines in Ottoia and Selkirkia (Smith et al. 2015). Simple spines in Eokinorhynchus (Zhang et al. 2015)^nSingle spines with pectinate fringe in Antygomonas (Bauer-Nebelsick 1996) and Centroderes (Neuhaus et al. 2014) coded as single spines; not clear that pectinate fringe is always clear enough to be unambiguously observed in other taxa.'; + TEXT CHARACTER=97 TEXT='In taxa such as Ottoia, the distal teeth in Zone III are morphologically and constitutionally distinct from the more proximal Zone III teeth (Smith et al. 2015). Scored as present if sclerites in the distal region of Zone III are morphologically distinct from those in the medial region, which are typically more robustly cuticularized. Sclerites that arm the upstanding eversible ‘mouth cone’ of priapulids are not included as part of the Zone III armature. Ambiguous in Louisella (Conway Morris 1977a). ^nDifferentiated ‘curved, scimitar-shaped’ teeth at entrance to stomach in Maccabeus (Por and Bromley 1974)'; + TEXT CHARACTER=98 TEXT='Pectinate in Ottoia (Smith et al. 2015).^nMorphologically distinct, though still pectinate, in Tubiluchus (Kirsteuer and Ruetzler 1973)'; + TEXT CHARACTER=99 TEXT='WTS23. If the proximal ring of elements is morphologically distinct, they are not included in this consideration.^nThe Zone III elements in Ottoia and Selkirkia do not change in size, just in angle of preservation (Smith et al. 
2015).^nNo change in size is evident in Scathascolex.'; + TEXT CHARACTER=100 TEXT='Certain kinorhynchs exhibit small muscles that allow each outer style to be moved individually [@Herranz2021z]'; + TEXT CHARACTER=101 TEXT='Placoids are thickenings of the pharyngeal cuticle related to the attachment of the buccal tube.^n^nCharacter 64 from @Shi2021, 76 from @Khim2023, 32 in @Mapalo2024cb.'; + TEXT CHARACTER=102 TEXT='Character 33 in @Mapalo2024cb'; + TEXT CHARACTER=103 TEXT='Character 35 in @Mapalo2024cb. Present in certain tardigrades.'; + TEXT CHARACTER=104 TEXT='The ''stylets'' of nematomorphs comprise thickenings of the pharyngeal cuticle (and are thus not obvious homologues with structures termed ''stylets'' in other taxa)'; + TEXT CHARACTER=105 TEXT='In parachelan tardigrades, the anterior part of the buccal tube has hooks or ridges for the insertion of stylet musculature. ^n^nCharacter 74 from @Khim2023, 28 from @Mapalo2024cb.'; + TEXT CHARACTER=106 TEXT='Character 75 from @Khim2023; characters 20 and 30 in @Mapalo2024cb'; + TEXT CHARACTER=107 TEXT='WTS21.^nThe fully everted pharynx of Ottoia, Sirilorica and Louisella expresses a marked increase in width; this bulb-like feature is armed in Louisella (Peel et al. 2013)(Conway Morris 1977a).^nCoded ambiguous in Scathascolex as there are insufficient specimens to determine whether the pharynx is preserved in its fully everted position.^nCoded ambiguous in Cricocosmia (Hou and Bergström 1994); the material on which the reconstruction of (Han et al. 2007b) is based is not figured.^nCoded present in nematodes as pharynx (though not eversible) bears bulbs (Altun and Hall 2017).^nCoded ambiguous in Paratubiluchus as the ‘bulb’ may represent gut contents (Han et al. 2004)'; + TEXT CHARACTER=108 TEXT='A feature of certain kinorhynchs, e.g. 
Franciscideres; see character 7 in @Sorensen2015'; + TEXT CHARACTER=109 TEXT='Character 8 in @Sorensen2015.^nPlacids are a ring of plates in the neck region of most kinorhynchs, posterior to the introvert. Coded ambiguous in Sicyophorus, as there is a hint of spine-like structures at the base of the introvert (fig. 3a Maas et al. 2007c) that could conceivably correspond to placids or lips'; + TEXT CHARACTER=110 TEXT='In most cases, placids form a closing mechanism when the head is retracted into the trunk; see character 8 in (Sørensen et al. 2015). Nematode ‘lips’ also serve to close the front of the trunk (Borgonie et al. 1995; Altun and Hall 2017)'; + TEXT CHARACTER=111 TEXT='In certain kinorhynch taxa, the arrangement of placids incorporates gaps that give rise to a bilaterally symmetric character; see character 13 in (Sørensen et al. 2015).'; + TEXT CHARACTER=112 TEXT='See character 9 in (Sørensen et al. 2015).^nSix in Franciscideres; seven in Paracentrophyes; nine in Dracoderes; fourteen in Campyloderes; sixteen in Antygomonas, Centroderes , Echinoderes, Zelinkaderes (Sørensen et al. 2015).'; + TEXT CHARACTER=113 TEXT='See character 11 in (Sørensen et al. 2015).^nSirilorica and Nanaloricus bear spikes on the anterior margins of their loricae (Peel et al. 2013; Neves et al. 2016); Pliciloricus and Eolorica do not (Neves et al. 2016; Harvey and Butterfield 2017)'; + TEXT CHARACTER=114 TEXT='See character 12 in (Sørensen et al. 2015).'; + TEXT CHARACTER=115 TEXT='See characters 14-17 in @Meldal2004. Amphids are lateral sensory organs in nematodes, typically comprising a round or slit-like opening and an inner pocket.'; + TEXT CHARACTER=116 TEXT='The opening of the amphids may be round or slit-like'; + TEXT CHARACTER=117 TEXT='The head of Kerygmachela has a dorsal protruding lobe that contains neural tissue [@Park2018], presumed homologous to the projection of YKLP 12387. 
This is distinct from the "swelling" of the anterior trunk in certain hallucigeniid lobopodians, which gives rise to a bulbous "head" region. In higher euarthropods, the anterior lobe may be covered by a dorsal sclerite.'; + TEXT CHARACTER=118 TEXT='Numerous lobopodians have been considered to have cephalic sclerites [see @Ma2014jsp, char. 37], but in some cases this interpretation requires revision or confirmation through new material. Following @Liu2014ppp, we score this character as absent in Hallucigenia fortis [contra @Hou1995zjls], Onychodictyon ferox [contra @Ou2012] and Cardiodictyon [see @Hou1995zjls]. It is coded as ambiguous in Onychodictyon gracilis [@Liu2008app] and Hallucigenia hongmeia [@Steiner2012], as well as Luolishania [following @Smith2014]. Fossil taxa with an incomplete anterior region are coded as uncertain.^n^nCharacter 2 in @Smith2015 and @Yang2015.^n^n---^nNumerous lobopodians have been considered to have cephalic sclerites (Ma et al. 2014a), but in some cases this interpretation requires revision or confirmation through new material. Following recent data presented by Liu and Dunlop (2014), we score this transformation series as absent in Hallucigenia fortis (contra Hou and Bergström 1995), Onychodictyon ferox (contra Ou et al. 2012) and Cardiodictyon (see Hou and Bergström 1995). We code it as uncertain where the anterior region is ambiguously preserved, as in Onychodictyon gracilis (Liu et al. 2008) and Hallucigenia hongmeia (Steiner et al. 2012). An uncertain coding is also applied to Luolishania, as their apparent presence is only documented by a single specimen (Ma et al. 2009) whose ‘sclerites’ worryingly resemble features in other lobopodians whose original interpretation as sclerites has since been overthrown. 
Taxa with an incomplete anterior region are coded as uncertain.'; + TEXT CHARACTER=119 TEXT='We score this character as absent for fuxianhuiids, because the cephalic shield is not derived from fused segments [@Chen1995s; @Waloszek2005; @Bergstrom2008; @Yang2013], and in anomalocaridids, because the carapace-like structure on the head seems not to cover multiple cephalic segments [e.g. @Daley2009; @Daley2014].^n^nCharacter 3 in @Smith2015 and @Yang2015.^n'; + TEXT CHARACTER=120 TEXT='This character represents the hypothetical change in position of the anterior sclerite of the upper-stem euarthropods and the dorsal head sclerite of the anomalocaridids that are associated with protocerebral structures [see @Budd2021].'; + TEXT CHARACTER=121 TEXT='Character adapted from 59 in @VanRoy2015. Character 5 in @Yang2015.'; + TEXT CHARACTER=122 TEXT='The head sclerites of certain hurdiids exhibit a conspicuous reticulate ornamentation.^n^nCharacter 26 in @Moysiuk2019.^n'; + TEXT CHARACTER=123 TEXT='The dorsal sclerite is attached broadly in Radiodonta [@Daley2009; @Daley2012; @Cong2014; @Daley2014; @VanRoy2015], whereas the euarthropod anterior sclerite is only narrowly attached to the anterior end of the body in upper-stem and crown-group euarthropods [@Edgecombe1999; @Budd2008; @Yang2013; @Ortega2015].^n^nCharacter 6 in @Yang2015.^n'; + TEXT CHARACTER=124 TEXT='This character refers to the lateral "P" elements that typify the anterior scleritome of hurdiid radiodontans [@Daley2009; @Daley2012; @VanRoy2015].^n^nCharacter 7 in @Yang2015.'; + TEXT CHARACTER=125 TEXT='Character 30 in @Moysiuk2019. The lateral sclerites of certain hurdiids are elongate, whereas those of anomalocaridids are more circular in aspect and shape.'; + TEXT CHARACTER=126 TEXT='Character formulated from possible homology of ventral sclerites in §7 of @Budd2021.'; + TEXT CHARACTER=127 TEXT='The terminal mouths of Hallucigenia sparsa [@Smith2015], H. 
fortis [@Liu2014ppp], Collinsium [@Yang2015], Microdictyon and Cardiodictyon [@Chen1995bnmns; @Liu2014ppp] are consistently oriented ventrally, perpendicular to the main trunk axis; the anteriormost trunk (or, colloquially, ‘head’) can be manoeuvred independently of the main trunk. In other taxa (e.g. priapulans), the orientation of the mouth is fixed relative to the main trunk.^n^nCharacter 5 in @Smith2015.'; + TEXT CHARACTER=128 TEXT='Certain lobopodians (Cardiodictyon, Hallucigenia fortis, Luolishania) have a differentiated anteriormost trunk that forms a wide ellipse or "head" [@Liu2014ppp]. In Hallucigenia sparsa, the "head" is denoted by a slight increase in the width of the anteriormost trunk, which is most prominent in smaller specimens [@Smith2015]. In other taxa (Aysheaia, Onychodictyon ferox, Megadictyon, Jianshanopodia, Ilyodes, Collinsium), the anteriormost trunk is not differentiated in this way [@Thompson1980; @Ou2012; @Vannier2014; @Yang2015]. Coded as ambiguous in euarthropods, where the "trunk" has been replaced by sclerotized segments. ^n^nCharacter 6 in @Smith2015'; + TEXT CHARACTER=129 TEXT='@Budd2021 argues that anterior projections in certain lobopodians and tardigrades are homologous to euarthropod frontal filaments. We additionally interpret the anterior projections of Pambdelurion and Megadictyon as potential homologues.^n^nPotential homologues to the frontal filaments – the frontal processes, which migrate to become the anteriormost pair of lip papillae in adults – are present in crown group Onychophora [@Ortega2016asd]. On this view, we interpret the dorsal, apparently non-appendicular, antenniform appendages of Collinsovermis and Luolishania as potential homologues to the frontal filaments.^n^nAdapted from character 95 in @Yang2016.^n'; + TEXT CHARACTER=130 TEXT='Potential homologues to the frontal filaments – the frontal processes – migrate to become the anteriormost pair of lip papillae in adult Onychophora [@Ortega2016asd]. 
As the migration is a derived state, this character is treated as neomorphic.'; + TEXT CHARACTER=131 TEXT='Eutardigrades have a sensory field in the same region where cirri A occurs in heterotardigrades, which is considered a remnant of cirri A. A reduced cirrus A/sensory field is difficult to establish in fossils. We consider this reduction to be a synapomorphy of heterotardigrades; as such, fossil lobopodians are coded as absent until further evidence is provided. Since the reduction (sensory field) is the derived state, we consider this character neomorphic.^n^nAdapted from character 15 in @Khim2023.'; + TEXT CHARACTER=132 TEXT='Cirri A in most arthrotardigrades are on the head segment, and on the posterior part of the head in echiniscoideans. Neoarctus has cirri A on the first trunk segment. We conservatively code lobopodians with filamentous structures as uncertain, as their homology to tardigrade cirri A is not clear.^n^nAdapted from Character 16 from @Khim2023.'; + TEXT CHARACTER=133 TEXT='This character refers to the suite of cirri and clavae characteristic of heterotardigrades. ^n^nAdapted from @Khim2023 character 17.'; + TEXT CHARACTER=134 TEXT='@Daley2009 (char. 10), @Ma2014jsp (chars. 25, 27) and @Lan2021 implicitly treat compound eyes and ocelli as homologous structures. We uphold the case for deep homology between these organs. Modified ocelli can resemble a single ommatidium of a compound eye [@Land2012, pp. 125-126] and compound eyes can be de-differentiated into ocelli during metamorphosis [@Bitsch2005, §3.1]. This implies a deep homology in fossils of ocelli and compound eyes despite notable differences in certain aspects, such as visual pigments, in ocelli and compound eyes in extant euarthropods [@Henze2012]. Paleontological support for this homology is reviewed by @Schoenemann2023. 
^n^nAdapted from characters: 16 and 18 in @Smith2015; 29 and 31 in @Yang2015.'; + TEXT CHARACTER=135 TEXT='Number of discrete visual units, whether compound eyes or ocelli. Despite differences in visual pigmentation and innervation, we hypothesize that all visual units – whether compound or singular – share a deep homology.'; + TEXT CHARACTER=136 TEXT='Treated as a separate organ from ocelli; see parent character for discussion.^n^nAdapted from characters: 16 and 18 in @Smith2015; 29 and 31 in @Yang2015.^n^nTreated as a separate organ from ocelli; see parent character for discussion.^n^nAdapted from characters: 16 and 18 in @Smith2015; 29 and 31 in @Yang2015.'; + TEXT CHARACTER=137 TEXT='Treated as neomorphic as a stalk represents an additional morphological structure.^n^nCharacter 26 in @Ma2014jsp; character 17 in @Smith2015; and character 30 in @Yang2015. Character 4 in @Smith2015 and @Yang2016 is redundant to this character, so is not included in the present matrix.'; + TEXT CHARACTER=138 TEXT='After character 15 in @Moysiuk2019.^nThe eyes of certain hurdiid radiodonts are dislocated to an extremely posterior location.'; + TEXT CHARACTER=139 TEXT='This new character reflects the hypothesis that sclerotization originated in the protocerebral (preocular) appendages, before being co-opted in trunk appendages. Modified from @Yang2015 character "Cephalic/anterior appendages: Protocerebral limb pair sclerotized"(character 9; also character 21 in @Smith2015).^n^nWe code this character as present in any taxon with sclerotized pre-ocular (protocerebral) limbs, including the podomeres in anomalocaridid "great appendages" [@Daley2014] and the hypostome that covers the euarthropod labrum [e.g. @Edgecombe1999; @Yang2013]. We score this character as uncertain in taxa where the presence of a hypostome is suggested, but not verified (e.g. Alalcomenaeus). 
The sclerotized stylets and stylet supports of tardigrades are likely modified claws [@Mobjerg2018], hence no appendage sclerotization (or arthrodial membranes) are present.^nThe character is treated as neomorphic, as sclerotization represents a novel increase in the complexity of the appendage.^n'; + TEXT CHARACTER=140 TEXT='May be present only if protocerebral limbs are sclerotized.^n^nThis transformation series distinguishes the arthropodized ‘great appendages’ of anomalocaridids (Daley and Edgecombe 2014) from the hypostome of Euarthropoda (e.g. Edgecombe and Ramsköld 1999; Yang et al. 2013) and the stylet of Tardigrada (e.g. Halberg et al. 2009), both of which are sclerotized but lack soft arthrodial membranes.'; + TEXT CHARACTER=141 TEXT='In most panarthropods, the first pair of limbs is pre-ocular (at least developmentally), is associated with the protocerebral segment, and is structurally differentiated from other limb pairs. In hallucigeniids, however, the first limb pair is not structurally differentiated from its neighbour; moreover, the great distance between the head and the first limb pair in Hallucigenia sparsa [@Smith2015] argues against a pre-ocular or indeed cerebral identity of these appendages. Whether or not the first appendage pair truly corresponds to the pre-ocular appendage of other groups, the absence of a differentiated pre-ocular appendage characterizes a number of armoured lobopodians: Xenusion [@Dzik1989], Diania [@Ma2014jsp], Microdictyon [@Chen1995bnmns], Paucipodia [@Chen1995trse; @Hou2004], H. fortis [@Ramskold1998], and H. sparsa [@Smith2015]. A distinct structure is evident in onychophorans, Antennacanthopodia and Ilyodes (antennae); tardigrades (the stylet apparatus); anomalocaridids (great appendages) [@Cong2014]; Opabinia (proboscis) [@Dhungana2021]; and euarthropods and basal panarthropods (homologues of the labrum) [@Budd2021]. 
We differ from previous studies in homologizing the antenniform appendages of luolishaniids with frontal filaments, rather than appendages, reflecting their dorsal position and lack of obvious parallels with the differentiated trunk appendages. Coded as ambiguous in taxa where the head is not preserved (including Carbotubulus).^n^nCharacter 20 in @Smith2015.'; + TEXT CHARACTER=142 TEXT='This neomorphic character distinguishes the arthropodized "great appendages" of radiodontans [@Daley2014] from the hypostome of Euarthropoda [e.g. @Edgecombe1999].^nThe sclerotized stylets and stylet supports of tardigrades are likely modified claws [@Mobjerg2018]. No podomeres are present.^n^nAdapted from character 22 in @Smith2015 and character 10 in @Yang2015.'; + TEXT CHARACTER=143 TEXT='Character 33 in @Moysiuk2019. The segments of the first appendage pair are uniform in form (homonomous) along the length of the limb in Anomalocaris, whereas in Hurdia the segments of the distal and proximal sections are strongly distinct.^n^nThe peduncle and outer spines are not considered in this character. The character is treated as neomorphic, as differentiation is seen to reflect a greater degree of developmental and morphological specialization.'; + TEXT CHARACTER=144 TEXT='The distalmost podomeres of Caryosyntrips and Hurdia are differentiated and strongly reduce distally, resulting in "inward flexure" of these podomeres.^n^nTreated as applicable even when podomeres are homonomous, as differentiation in size need not depend on differentiation of podomere morphology. Inapplicable in taxa that lack sclerotized protocerebral appendages.^n^nCharacter 35 in @Moysiuk2019.^n'; + TEXT CHARACTER=145 TEXT='We score this character as ventral in Euarthropoda given that the reduced protocerebral appendage pair, transformed into the labrum, occupies a ventral position in association with the mouth [e.g. @Scholtz2006]. 
As the forward-facing stylet apparatus of tardigrades is internalized into the mouth cone [@Halberg2009], the position of the stylets is not independent of the mouth position; we therefore code this as an alternative character state. ^n^nCharacter 26 in @Smith2015 and character 16 in @Yang2015.'; + TEXT CHARACTER=146 TEXT='This character reflects the migration of the frontal appendages from an ancestrally anterior position, as in lobopodians (e.g. Kerygmachela, Jianshanopodia, Pambdelurion, Siberion), to a more posterior (e.g. Megacheirans and Leanchoiliids) and ultimately ventral position, as in the euarthropod labrum [@Budd2021].^n^nAs the direction of evolution is well attested by developmental data, we treat this character as neomorphic.'; + TEXT CHARACTER=147 TEXT='Modified from character 16 in @Ma2014jsp to reflect the posited homology between the anterior appendages of lobopodians and the euarthropod labrum [cf. @Eriksson2000; @Budd2002]: specifically, the euarthropod labrum is coded as a fused pair of appendages [@Scholtz2006; @Liu2009; @Liu2010; @Posnien2009]. The stylet apparatus of Tardigrada is not coded as fused, as each stylet within the buccal tube remains independent despite significant modification [@Dewel2006; @Halberg2009; @Guidetti2012].^n^nCharacter 27 in @Smith2015 and 17 in @Yang2015.'; + TEXT CHARACTER=148 TEXT='In Opabinia, Caryosyntrips and cf. Peytoia [@Moysiuk2021], the protocerebral appendages are adjacent to each other, without a gap; in radiodonts such as Anomalocaris canadensis, the protocerebral appendages are separated by a gap [e.g. @Daley2014, fig. 1]. The situation is unclear in many hurdiids due to limited preservation of appendage bases. 
The adjacency of bases is a prerequisite for the physical mechanical fusion of the protocerebral appendages.'; + TEXT CHARACTER=149 TEXT='In Kerygmachela, Pambdelurion and Siberion, the appendages have migrated into an adjacent position but are not mechanically connected [@Budd1993; @Budd1998ar; @Budd1998trse; @Dzik2011]; this also seems to be the case in radiodontans [@Daley2009; @Daley2014]. In euarthropods, the appendages exhibit a degree of fusion.^n^nCharacter 28 in @Smith2015, cf. character 17 in @Yang2015.^n'; + TEXT CHARACTER=150 TEXT='This neomorphic character represents the loss of claws on the (differentiated) protocerebral appendage as compared to the (undifferentiated) trunk appendages. By definition, taxa with undifferentiated protocerebral appendages have not undergone loss of claws on those appendages. Taxa without claws are coded as ambiguous as we cannot tell if a claw suppression mechanism acts silently in the protocerebral appendages; in other words, the gain or loss of claws on the trunk represents a separate neomorphic event and is thus independent of this character.^n'; + TEXT CHARACTER=151 TEXT='This neomorphic character refers to the spines/spinules present in the most anterior appendage pair of anomalocaridids [@Daley2009; @Daley2014], gilled lobopodians [Kerygmachela, see @Budd1993, @Budd1998trse; Pambdelurion, see @Budd1998ar; Opabinia, see @Budd1996] and certain lobopodians [e.g. Aysheaia, see @Whittington1978; Jianshanopodia, see @Liu2006; Megadictyon, see @Liu2007az; Onychodictyon ferox, see @Ou2012].^n^nWe treat the presence of lateral and ventral spine series as different characters. Certain taxa (e.g. Stanleycaris, cf. Peytoia) have both series present, whereas other taxa (e.g., Caryosyntrips) have only lateral spine series (see @Moysiuk2021). We extend this homology scheme to stem-euarthropods with lateral spine series such as Kerygmachela, Pambdelurion and Opabinia following @Dhungana2021. 
Lateral spine series (referred to as "gnathal" spines in @Moysiuk2021) in sclerotized appendages are often characterized by small asymmetric accessory spines that originate near the base of the main lateral spine [@Moysiuk2021].^n^nVentral spine series (endites) characterize most radiodonts (with the notable exception of Caryosyntrips). These ventral spines have regularly spaced accessory spines along their length in Hurdiids. Anomalocaris and Lyrarapax have symmetric accessory spines originating at the base of the main ventral spines. The similarity of Kylinxia"s "dorsal" spine series to Anomalocaris indicates possible homology [@Zeng2020], and rotation of the protocerebral appendages. See @Guo2019 for an overview of radiodont appendage morphology. ^n^nCharacter 42 of @Zhang2016 is redundant under this formulation, so has been removed from our matrix. Adapted from character 30 in @Smith2015 and 19 in @Yang2015.^n'; + TEXT CHARACTER=152 TEXT='This character pertains to the rows of ventral spines (endites). Amplectobeluidae and Anomalocarididae have two rows, Hurdiidae have one row [@Guo2019].^n^nPrevious formulation inspired by char. 31 in @Smith2015 and char. 20 in @Yang2015. These matrices did not separate the lateral from ventral spine series (see discussion in character description above: Protocerebral appendage pair: Spine series).^n'; + TEXT CHARACTER=153 TEXT='Hurdiids typically have very long main spines (endites) compared to the thickness of the shaft of the appendage. We treat the ventral spine series as distinct from lateral spine series [following @Dhungana2021], and limit this character to ventral spines. ^n^nAdapted from @Zeng2020 character 191; @Aria2019 character 90.'; + TEXT CHARACTER=154 TEXT='Accessory spines to the main ventral spines of the protocerebral spine series of many radiodonts.'; + TEXT CHARACTER=155 TEXT='Hurdiids have accessory spines arranged in a regular series along the main spine, whereas e.g. 
Anomalocaris canadensis has accessory spines originating near the base of the main spine, giving a multifurcate appearance.^n'; + TEXT CHARACTER=156 TEXT='Character 44 in @Vinther2014 and 41 in @Moysiuk2019. The endites of certain anomalocaridid appendages alternate in length from podomere to podomere. Treated as neomorphic as alternation represents additional complexity in developmental control.'; + TEXT CHARACTER=157 TEXT='Spine series can be comparable in width to the base of the podomere/annulation, or be significantly narrower.^n^nCharacter adapted from char. 192 in @Zeng2020; char. 108 in @Aria2019^n'; + TEXT CHARACTER=158 TEXT='The large ventral endites of radiodonts can increase in size from base to tip e.g., Hurdia, Peytoia, Stanleycaris. Treated as transformational. See table 1 in @Guo2019, and figure 2 in @Pates2019.'; + TEXT CHARACTER=159 TEXT='The orientation of spine series are independent of the position of the spine series. In Hurdiids, for example, the main enditic spines are in a ventral position, but spines curve such that the distal tips face the other appendage. In gilled lobopodians and Caryosyntrips, spine series point towards the other appendage. In Anomalocaris, the ventral spine series do not face the other appendage, but are straight and point outwards (ventrally).'; + TEXT CHARACTER=160 TEXT='@Moysiuk2021 suggest that the laterally located gnathal spine series in e.g. Caryosyntrips is independent of the ventral enditic spine series observed in many radiodonts. We treat the lateral spine series of e.g. 
Aysheaia and gilled lobopodians as equivalent.'; + TEXT CHARACTER=161 TEXT='This neomorphic character describes the multifurcate termination observed in the protocerebral appendages of dinocaridids [@Budd1996; @Daley2009; @Daley2010; @Budd2012; @Daley2014] and certain lobopodians -- such as Aysheaia [@Whittington1978], Megadictyon [@Liu2007az] and Kerygmachela [@Budd1993; @Budd1998trse] -- but absent in Onychodictyon ferox [@Ou2012].^nCoded as inapplicable in tardigrades due to the extreme modification of the pre-ocular appendage into a stylet apparatus, which poses challenges to the identification of homologues of appendicular features.^n^nCharacter 33 in @Smith2015 and 22 in @Yang2015.^n'; + TEXT CHARACTER=162 TEXT='In Amplectobelua, Lyrarapax and Anomalocaris saron, the distal appendage kinks outwards at a high angle relative to the appendage peduncle (shaft).^n^nCharacter 36 in @Moysiuk2019, following character 27 in @Vinther2014.^n'; + TEXT CHARACTER=163 TEXT='Character 35 in @Vinther2014 and 40 in @Moysiuk2019. In Amplectobelua and Lyrarapax, a proximal endite projects forwards to oppose the distal endites, forming a "pincer" or "claw".'; + TEXT CHARACTER=164 TEXT='Most radiodonts have outer spine series in addition to inner spine series on the protocerebral appendage [@Moysiuk2019], also referred to as "dorsal" spines [e.g. @Zeng2020]; typically the spines in this series are larger distal-ward. This spine series appears to be independent of the medial/ventral spine series [@Moysiuk2019].^n'; + TEXT CHARACTER=165 TEXT='Adapted from char. 55 in @Moysiuk2021, who note that auxiliary spines are present on the lateral spines/gnathites of Stanleycaris and cf. Peytoia.^n'; + TEXT CHARACTER=166 TEXT='Character atomized from previous formulation to reflect complexity in "arthropodization" of the post-ocular appendages. 
^n^nAdapted from character 19 in @Smith2015 and 8 in @Yang2015.'; + TEXT CHARACTER=167 TEXT='The cylindrical ambulacral lobopodous leg characteristic of lobopodians is also found in Opabinia [@Budd1996; @Budd2012], Kerygmachela [@Budd1993; @Budd1998trse], Pambdelurion [@Budd1998ar] and Aegirocassis [@VanRoy2015]. Coding for radiodontans follows @VanRoy2015.^n^nCharacter 23 from @Smith2015 and 11 in @Yang2015.'; + TEXT CHARACTER=168 TEXT='There are various taxa in which the deutocerebral appendage pair is morphologically differentiated from the rest of the trunk appendages [see references in @Liu2014ppp]. For example, Antennacanthopodia has a second set of antenna-like limbs that are morphologically distinct from the walking legs [@Ou2011]. The first pair of legs in Tardigrada is serially homologous with the deutocerebral segment of Euarthropoda [@Mayer2013po], and thus is not structurally different from the rest of the trunk appendages. The deutocerebral jaws of Onychophora are significantly modified relative to the rest of the appendages in the body [@Eriksson2010; @Oliveira2013]. In Euarthropoda, this morphological differentiation is generally expressed in the presence of an antenniform [e.g. @Edgecombe1999; @Ma2012n; @Yang2013] or raptorial [@Chen2004; @Haug2012p; @Tanaka2013] deutocerebral appendage. The second leg pair of hallucishaniid taxa are not differentiated from their neighbours [@Ramskold1998] and are therefore coded as undifferentiated; the trunk limbs are instead divided into two morphological zones.^n^nCharacter 24 in @Smith2015 and 14 in @Yang2015.'; + TEXT CHARACTER=169 TEXT='This character, adapted from char. 25 from @Smith2015 and char. 12 from @Yang2015, has been re-formulated into two separate characters on the basis that the arthropodization of the first post-ocular appendage is not independent from the arthropodization of the subsequent trunk appendages. 
This formulation makes it unnecessary to distinguish taxa with differentiated deutocerebral appendages (e.g., char. 24 @Smith2015).^n'; + TEXT CHARACTER=170 TEXT='The first post-ocular limb is not observable in Tertiapatus or Ilyodes [@Poinar2000; @Haug2012cb], and is thus scored as ambiguous. It is difficult to evaluate the role of the slender appendages of Hallucigenia [@Ramskold1998; @Smith2015] and the cirrate post-ocular appendages of Luolishania, Collinsium, Acinocricus and the Collins monsters [@Ma2009; @Garcia2013; @Yang2015; @Caron2020]; as such, these are coded as ambiguous for states "ambulatory" and "sensorial".^n^nCharacter 25 from @Smith2015 and 12 from @Yang2015.^n'; + TEXT CHARACTER=171 TEXT='See character 13 in @Yang2015.^nPresent in Peripatidae [@Oliveira2013], but absent in Euperipatoides [@Smith2014].'; + TEXT CHARACTER=172 TEXT='As with character relating to the nature of the deutocerebral appendages, this character is coded as a separate character in taxa with lobopodous and with arthropodized appendages. Ilyodes [@Haug2012cb], Tertiapatus [@Poinar2000] and extant onychophorans are interpreted as bearing paired oral papillae.^n^nAdapted from character 15 in @Yang2015.'; + TEXT CHARACTER=173 TEXT='The tritocerebral appendages of fuxianhuiids are reduced for a sweep-feeding function [@Yang2013].^n^nAdapted from character 15 in @Yang2015.'; + TEXT CHARACTER=174 TEXT='Annulations are repeated superficial integument rings.^n^nCharacter 26 in @Daley2009, 37 in @Smith2015 and 36 in @Yang2015. WTS27.^n^n^nAnnulations are repeated superficial integument rings. Coded as present in Eokinorhynchus, reflecting ring-like nature of epidermal ‘segments’ (Zhang et al. 2015). Present in Fieldia, reflected by transverse arrangement in spines in certain specimens (e.g. USNM57715, see (Caron 2011)) The cuticle of Anatonchus bears hints of fine annulations on its tip (Peneva et al. 1999; Choudhary et al. 
2009); the cuticle of Kinonchulus is ‘delicately annulated’ (Riemann 1972). Coded ambiguous in Selkirkia and Paraselkirkia as the trunk is concealed by the tube, and ambiguous in Palaeopriapulites and Sicyophorus as the ‘trunk’ is putatively concealed by a lorica (Hou et al. 2017).^n^nTaxa in which annulations are present on the appendages but not the trunk are coded ambiguous.'; + TEXT CHARACTER=175 TEXT='This character distinguishes between annulation patterns that are uniform along the length of the trunk (homonomous) from those which display serially repeated differentiated fields (heteronomous), usually associated with the location of limbs.^n^nCharacter: 29 in @Liu2011; 27 in @Daley2009; 40 in @Smith2015 and 38 in @Yang2015. WTS27.'; + TEXT CHARACTER=176 TEXT='The bulbous heads of Hallucigenia fortis, Microdictyon, Cardiodictyon and Luolishania lack annulations [@Chen1995bnmns; @Ma2009; @Ma2012asd; @Liu2014ppp]. In contrast, annulations continue to the tip of the head in Paucipodia, Onychodictyon gracilis, and Diania (whichever end of Diania is interpreted as anterior) [@Chen1995trse; @Hou2004; @Liu2008app; @Ma2014jsp].^n^nIn contrast to character 39 in @Smith2015, we do not interpret the introvert of priapulans or lobopodians as part of the trunk.'; + TEXT CHARACTER=177 TEXT='Unbranched in Aysheaia, Siberion, Onychodictyon, Diania, Xenusion, Paucipodia, Microdictyon, Luolishania, the Collins Monsters, Acinocricus, Jianshanopodia, Hadranax and Kerygmachela [@Whittington1978; @Caron2020; @ConwayMorris1988; @Dzik1989; @Chen1995bnmns; @Budd1998p; @Hou2004; @Liu2006; @Liu2008app; @Ma2009; @Ma2014jsp; @Dzik2011; @Ou2012; @Garcia2013; @Yang2015]; branched in Orstenotubulus, onychophorans (i.e. 
anastomosing plicae) and the Orsten-type lobopodian segment [@Maas2007csb; @Oliveira2014]; ambiguous in Megadictyon, Antennacanthopodia and Tertiapatus [@Poinar2000; @Liu2007az; @Ou2011].^n^nCharacter 51 in @Zhang2016.'; + TEXT CHARACTER=178 TEXT='Epidermal segmentation is a distinguishing feature of Euarthropoda [@Budd2001za; @Edgecombe2009]. Although the body of Onychophora and Tardigrada is metamerically organized, both at the level of segment polarity gene expression [@Gabriel2007; @Eriksson2009] and musculature [e.g. @Halberg2009; @Marchioro2013], this pattern is not expressed on the epidermis: we thus score it as absent in these phyla. Epidermal segmentation is not evident in most radiodontans [e.g. @Daley2014], which we score absent. Kinorhynchs and annelids also exhibit a segmented epidermis; though this presumably has an independent derivation from the segmentation of arthropods, the lack of a clear morphological basis for discrimination means separate character states cannot be assigned to these phyla.^n^nCharacter 25 in @Daley2009, 34 in @Smith2015 and 32 in @Yang2015.'; + TEXT CHARACTER=179 TEXT='The development of sclerotized tergal plates connected by arthrodial membranes is distinctive of body arthrodization, and thus exclusive to Euarthropoda [@Edgecombe1999; @Haug2012p; @Yang2013]. Given the morphological similarity between arthropod tergites and the articulated tergal and sternal plates of kinorhynchs [e.g. @Sorensen2008, @SchmidtRhaesa2012], we treat the latter using the same transformation series, though noting that the structures are almost certainly not homologous. Plates in kinorhynchs arise through progressive sclerotization of flexible cuticle through ontogeny [@SchmidtRhaesa2012], and are thus not considered to be equivalent to epidermal sclerites.^n^nAlthough some heterotardigrades possess dorsal plates (e.g. Nelson 2002; Marchioro et al. 2013; Persson et al. 
2014), these are not connected by arthrodial membranes and thus score the heterotardigrade terminal Actinarctus as absent for this transformation series.^n^nCharacter 35 in @Smith2015, 33 in @Yang2015.'; + TEXT CHARACTER=180 TEXT='Sternites – ventral sclerotized plates – are a key feature of most Euarthropoda, and are well documented in Artiopoda [e.g. @Whittington1993; @Edgecombe1999; @Ortega2012]. Sternites are notably absent in Fuxianhuiida [@Chen1995s; @Waloszek2005; @Bergstrom2008; @Yang2013], even though these taxa have a sclerotized dorsal exoskeleton. We code sternites as uncertain in leanchoiliids. Given the morphological similarity between arthropod sternites and the articulated sternal plates of kinorhynchs (e.g. Sørensen 2008), the latter are also scored as present.^n^nCharacter 36 in @Smith2015 and 34 in @Yang2015.'; + TEXT CHARACTER=181 TEXT='Present in Pycnophyidae and Neocentrophyidae (Kinorhyncha); see character 14 in (Sørensen et al. 2015)'; + TEXT CHARACTER=182 TEXT='See character 17 in @Sorensen2015'; + TEXT CHARACTER=183 TEXT='See character 23 in @Sorensen2015'; + TEXT CHARACTER=184 TEXT='See character 19 in @Sorensen2015'; + TEXT CHARACTER=185 TEXT='Modified from character 20 in @Sorensen2015; comparison with other plates avoids over-weighting the presence/absence of two sternal plates'; + TEXT CHARACTER=186 TEXT='See character 21 in @Sorensen2015'; + TEXT CHARACTER=187 TEXT='Modified from character 22 in @Sorensen2015; comparison with other plates avoids over-weighting the presence/absence of two sternal plates'; + TEXT CHARACTER=188 TEXT='See character 25 in @Sorensen2015'; + TEXT CHARACTER=189 TEXT='See character 26 in @Sorensen2015'; + TEXT CHARACTER=190 TEXT='See character 40 in @Sorensen2015^nMost kinorhynchs (though not, for example, Kinorhynchus, Neocentrophyes) exhibit prominent lateroterminal spines (Sørensen and Pardos 2008), clearly distinguished from palaeoscolecid/priapulid posterior hooks by their position and morphology.'; + 
TEXT CHARACTER=191 TEXT='Certain kinorhynchs (Antygomonas, Franciscideres, Cateria, Campyloderes, Centroderes, Echinoderes, Zelinkaderes) exhibit a secondary spine alongside their lateroterminal spine (Higgins 1968; Sørensen and Pardos 2008; Dal Zotto et al. 2013; Neuhaus and Sørensen 2013; Neuhaus et al. 2014; Altenburger et al. 2015; Landers and Sørensen 2016). Others (Pycnophyes, Dracoderes, Paracentrophyes) do not (Sørensen et al. 2010, 2012b; Herranz et al. 2014; Sánchez et al. 2016). See character 38 in (Sørensen et al. 2015).'; + TEXT CHARACTER=192 TEXT='See character 41 in @Sorensen2015; modified to present in Paracentrophyes (Sørensen et al. 2010)^nFollowing the coding of Sorensen where this contradicts the data from (Sørensen and Pardos 2008; Dal Zotto et al. 2013)'; + TEXT CHARACTER=194 TEXT='See character 24 in @Sorensen2015. Treated as neomorphic.'; + TEXT CHARACTER=195 TEXT='See character 27 in @Sorensen2015'; + TEXT CHARACTER=196 TEXT='Scales [@Neuhaus2013za; @SchmidtRhaesa2012] are cuticular, short, triangular to shingle-like processes or projections of the sternal plates, often found in the central region; they give the plates a ''bristled'' appearance.'; + TEXT CHARACTER=197 TEXT='A secondary fringe is a line of small cuticular processes (usually triangular scales) at the anterior margin of segmental plates [@SchmidtRhaesa2013]. More than one may be present. '; + TEXT CHARACTER=198 TEXT='The nature of the mid-gut glands of Megadictyon, Jianshanopodia, Pambdelurion and Opabinia is elucidated by @Vannier2014. Midgut glands were biologically, rather than taphonomically, absent in Ilyodes [@Haug2012cb], Hallucigenia sparsa [@Smith2015], Lyrarapax [@Cong2014], Acinocricus [@ConwayMorris1988] and Collinsium [@Yang2015].^n^nCharacter 42 in @Ma2014jsp; 16 in @Daley2009; 53 in @Smith2015 and 52 in @Yang2015.^n^n^n---^nCoded as uncertain in Antennacanthopodia (Ou et al. 
2011) because the dark infilling of the type material may represent decayed internal organs. The nature of the mid-gut glands of Megadictyon, Jianshanopodia, Pambdelurion and Opabinia is elucidated by Vannier et al. (2014).'; + TEXT CHARACTER=199 TEXT='Lobopodians have a relatively cylindrical trunk with a uniform width, whereas the trunk of anomalocaridids narrows markedly towards the posterior.^n^nCharacter 65 in @Moysiuk2019.^n'; + TEXT CHARACTER=200 TEXT='This character reflects the pronounced differentiation of the posterior and anterior trunk – not just the trunk appendages – in certain lobopodians. In Hallucigenia sparsa, the region of the trunk anterior of the third appendage pair is narrower, lacks dorsal armature, and expresses differentiated appendages [@Smith2015]. The short constricted region anterior of the first spine pair in H. fortis is associated with two differentiated appendage pairs [@Ramskold1998] and apparently corresponds with the ‘neck’ of H. sparsa. In luolishaniids, the anterior body bears elongate limbs with accentuated armature [@Ma2009; @Garcia2013]. The portion of the trunk in Carbotubulus corresponding to the first two or three leg pairs is substantially narrower than the posterior trunk and its associated appendages are narrower and less prominent than the posterior appendages, indicating trunk differentiation [@Haug2012cb]. Although the width of the trunk narrows gradually towards the front of Paucipodia, this tapering is gradual and does not correspond to the differentiation of the anterior trunk [@Chen1995trse; @Hou2004]. Coded ambiguous in Orstenotubulus, Hallucigenia hongmeia, and Ilyodes due to incomplete preservation [@Thompson1980; @Maas2007csb; @Steiner2012].^n^nCharacter 54 in @Smith2015 and 72 in @Yang2015.^n^n^n---^nAfter transformation series 54 in Smith & Caron (2015). 
The differentiation observed in lobopodians (see below) is also reflected in the organisation of Louisella and Tylotites (Conway Morris 1977a; Han et al. 2007c; Zhang et al. 2015), where the anterior trunk has a different annulation pattern to the posterior portion, with an abrupt change separating the two regions. The “neck” of Eokinorhynchus is not consistently distinguishable from the introvert, and is considered to represent part of that structure. The same is arguably true in Halicryptus higginsi, though not in H. spinulosus. The anterior annulations of H. higginsi are much more closely spaced and bear denser setae than the posterior annulations (Shirley and Storch 1999), but in the absence of a sharp distinction between the regions this character is scored as ambiguous. Coded absent in Cricocosmia (Hou et al. 2017); the diminution of annulations anteriad is reflected in a separate transformation series, and there is no clear morphological division of an anterior portion of the trunk.^nThis transformation series reflects the pronounced differentiation of the posterior and anterior trunk – not just the trunk appendages – in certain lobopodians. In Hallucigenia sparsa, the region of the trunk anterior of the third appendage pair is narrower, lacks dorsal armature, and expresses differentiated appendages (this study). The short constricted region anterior of the first spine pair in H. fortis is associated with two differentiated appendage pairs (Ramsköld and Chen 1998) and apparently corresponds with the ‘neck’ of H. sparsa. In luolishaniids, the anterior body bears elongate limbs with accentuated armature (Ma et al. 2009; García-Bellido et al. 2013). The portion of the trunk in Carbotubulus corresponding to the first two or three leg pairs is substantially narrower than the posterior trunk and its associated appendages are narrower and less prominent than the posterior appendages, indicating trunk differentiation (Haug et al. 2012c). 
Although the width of the trunk narrows gradually towards the front of Paucipodia, this tapering is gradual and does not correspond to the differentiation of the anterior trunk (Chen et al. 1995a; Hou et al. 2004). Coded ambiguous in Orstenotubulus, Hallucigenia hongmeia, and Ilyodes due to incomplete preservation (Thompson and Jones 1980; Maas et al. 2007; Steiner et al. 2012).^nCoded as absent in loriciferans (Neves et al. 2016), Sirilorica (Peel et al. 2013).^nThe anterior 5–8% of the trunk of Meiopriapulus bears trunk scalids and lacks the wrinkles, tubercles and other structures of the posterior trunk (Sørensen et al. 2012a)^nTubiluchus lemburgi has a distinctive anterior trunk marked by a change in diameter and surface ornament (Schmidt-Rhaesa et al. 2013)^nCoded as present in Paratubiluchus (Han et al. 2004).^nPresent in Paraselkirkia, where armature becomes enhanced (Hou et al. 2017); not evident in Selkirkia, even USNM 57624 in which the trunk is well extended; but coded ambiguous as posterior trunk unknown.^nCoded as present in Eximipriapulus (Ma et al. 2014b), though with the caveat that the differential appearance of the neck might conceivably be attributed to preservational factors^nCoded absent in Markuelia; the unannulated region in e.g. Dong et al. fig. 10D seems to correspond to the introvert, as suggested by the presence of a single row of spines (cf. trichoscalids, anterior head setae of nematodes); it is not demarked from the rest of the trunk by a change of thickness etc.'; + TEXT CHARACTER=201 TEXT='A proposed synapomorphy of Scalidophora, though absent in loriciferans, and not really codable as present in priapulids (Sørensen et al. 2008) – thus alternatively proposed as a synapomorphy of kinorhynchs and loriciferans (Neuhaus and Higgins 2002)^nThis said, the spines in kinorhynchs (e.g. 
Zelinkaderes, Neuhaus and Higgins 2002) are not restricted to the mid-trunk – more occur more posteriorly ^n## For a more careful description see Schmidt-Rhaesa 1997⁄98; Lemburg 1999; Neuhaus and Higgins 2002^nUnambiguously absent in Halicryptus and Tubiluchus Higgins larvae (Higgins and Storch 1989; Storch and Higgins 1991; Higgins et al. 1993), but present in Priapulus (van der Land 1970) and early larvae of Maccabeus (Por and Bromley 1974)'; + TEXT CHARACTER=202 TEXT='WTS36^nFlosculi have been proposed as a synapomorphy of Scalidophora (Lemburg 1995; Nielsen 2012). They are raised, flower-like structures with a central cilium [@SchmidtRhaesa2015].^nAmong loricifera, present only in Nanaloricus and Pliciloricus (Neves and Kristensen 2014).^nCoded ambiguous in Chordodes (Bolek et al. 2010) as it is unclear whether any of its areoles might be considered equivalent to flosculi.^nFlosculi are really tiny (Storch and Alberti 1985) and the chances of picking them out from cuticular ridges in Burgess Shale-type fossils are slim – such taxa are coded ambiguous accordingly. Coded absent in Schistoscolex as the preservation is of sufficient fidelity, and the posterior region is preserved (Duan et al. 2012)^n^nSensory spots are flat-lying regions of the cuticle, surrounding a ciliated pore, covered in small projections [@SchmidtRhaesa2015]'; + TEXT CHARACTER=203 TEXT='Flosculi in Maccabeus (Por and Bromley 1974), Meiopriapulus and Tubiluchus (Sørensen et al. 2012a), kinorhynchs and loriciferans [[Nematomorpha, Priapulida, Kinorhyncha, Loricifera edited by Andreas Schmidt-Rhaesa]]'; + TEXT CHARACTER=204 TEXT='Flosculi in priapulids have petal-like structures (typically eight) (Wills et al. 2012); in loriciferans they do not bear clear petals (Neves et al.
2016)'; + TEXT CHARACTER=205 TEXT='WTS36'; + TEXT CHARACTER=206 TEXT='Treated as transformational as it is not clear whether the absence of papillae on limbs represents a differentiation of the limbs (and the introduction of a separate developmental regime to pattern them independently from the trunk)^n^nCharacter 41 in @Ma2014jsp; character 50 in @Smith2015 and 51 in @Yang2015.^n'; + TEXT CHARACTER=207 TEXT='Louisella and Onychodictyon ferox bear transverse rows of ventral papillae (Conway Morris 1977a; Ou et al. 2012)'; + TEXT CHARACTER=208 TEXT='A lorica is inferred to be present in a larval stage of any taxon in which it is present in an adult, even if certain taxa also exhibit post-hatching, pre-loricate larval stages [@Janssen2009].^n^nThe placids of kinorhynchs and the lorical ring of loriciferans and larval priapulids form cuticular plates that surround the neck region of the respective organisms (Wennberg et al. 2009; Peel et al. 2013; Sørensen et al. 2015); whilst placids conceivably represent reduced lorical plates, they are not considered homologous and are treated as two separate transformation series. Coded absent in Shergoldana as the plates do not form clear rings and do not clearly girdle the neck (Maas et al. 2007a).^nCoded as present in Corynetis as a ring of robust plates seems to occur at the anterior margin of specimens with retracted introverts, though there is no indication that these could form a closing apparatus, their size being too large (Hu et al. 2012).^nCoded as ambiguous in Acanthopriapulus as larval stages are unknown (van der Land 1970; Higgins and Storch 1991)^nCf. WTS61.'; + TEXT CHARACTER=209 TEXT='Cf. WTS24.^n^nPlates not retained in Tubiluchus; no reference available to support retention in T. vanuatensis (Kirsteuer and Ruetzler 1973; Calloway 1975; Kirsteuer 1976)^nThere is a possibility that the ‘theca’ of Laojieella (Han et al. 2006) is homologous with loriciferan plates (cf. 
Priapulus Higgins larvae with a prominent dorsal and ventral plate, and Sirilorica with prominent regions of the trunk anterior and posterior of its lorica), but this cannot be substantiated, so Laojieella is coded as ambiguous.^nThe lorica of Sicyophorus is considered to represent the adult form due to the size of the organisms (Hou et al. 2017). I consider Palaeopriapulites to have a lorical too; a distinct anterior margin is evident in some specimens (Hou et al. 2017).?'; + TEXT CHARACTER=210 TEXT='Cf. WTS62^nPresent in Halicryptus (Storch and Higgins 1991)^nAbsent in Sicyophorus; may be present in Palaeopriapulites (coded ambiguous) (Maas et al. 2007c; Hou et al. 2017)'; + TEXT CHARACTER=211 TEXT='Coded as ambiguous in macrofossils that do not preserve loricae, as the early developmental stages are unknown.^n'; + TEXT CHARACTER=212 TEXT='Number of lorical plates in a single ring, when multiple series are present'; + TEXT CHARACTER=213 TEXT='In certain priapulans the dorsal and ventral plates are substantially larger than the slender lateral plates'; + TEXT CHARACTER=214 TEXT='Many ecdysozoans bear cuticular sclerites on their trunk (i.e. posterior of the neck or proboscis). We recognize three broad categories of sclerites: (i) integumentary trunk sclerites: densely arranged sclerites that cover the trunk; (ii) sparse specialized sclerites: sparsely arranged sclerites specialized for a specific purpose (e.g. sensory sclerites, claws); (iii) enlarged dorsal sclerites: often paired, reinforced or sculptured, and with a presumed defensive function. These elements are likely homologous as sclerites, yet each category may be controlled by a distinct genetic toolkit. The broad character of ''epidermal sclerites'' is therefore present in most taxa in this matrix, and is coded ambiguous in many fossil taxa given the often diminutive scale of sensory sclerites. 
Secondary characters, each neomorphic, record the existence of sclerites in each of the three categories.^n^nWe include the setae, tubes, spines and processes of Kinorhyncha and Loricifera as sclerites. Lorical plates seem to form through the thickening of cuticle and are not treated as sclerites.^n^n^n----^n^nTransformation series 41 in Ma et al. (Ma et al. 2014a) and 30 (and cf. 29) in Wills (2012). We code Orstenotubulus as uncertain as its papillae are not clearly observed throughout the trunk region (Maas et al. 2007b).^nSpine-like ornament of Tylotites (Han et al. 2007c)^nSpines in Louisella (Smith 2015),^nRings of papillae in certain lobopodians (e.g. Aysheaia, onychophorans)^nPresent in Eokinorhynchus (Zhang et al. 2015).^nOccur, seemingly in rings, in Markuelia (Haug and Maas 2009; Dong et al. 2010)^nAbsent in Cricocosmia and Tabelliscolex (Han et al. 2007b).^nDetails of Mafangscolex given by (Liu et al. 2016)^nDetail of Maotianshania mentioned in (Hu et al. 2012)^nAmbiguous in Shergoldana as adult state unknown.^nAmbiguous in Antennacanthopoda (Ou et al. 2011) as preservational quality insufficient to discern,^nAreoles in Chordodes [@Bolek2010] are treated as epidermal plates.^nRound non-mineralized plates adorn the posterior trunk of Ancalagon (pers. obs.)^nSpines adorn the surface of Fieldia (ROM 93-1678; @ConwayMorris1977) ^nSmall spines (setae) occur on Halicryptus and Priapulopsis (van der Land 1970). Somatic setae occur irregularly on Kinonchulus (Riemann 1972) and in other onchulids (Olovachov et al. 
2008)^nAbsent in Maccabeus (Por and Bromley 1974)^nRobustly-topped spines in Aysheaia (Whittington 1978)^nSpines are present at least in the anterior trunk of Selkirkia and Paraselkirkia (termed ‘Zone C of the proboscis’ by @ConwayMorris1977); this was variably emergent from the tube (see Caron 2011)^nNot reported in Kerygmachela (Budd 1998a)'; + TEXT CHARACTER=215 TEXT='This transformation series is coded as present in any taxon where sclerites comprise stacked constituent elements at all stages of growth (as in Hallucigenia sparsa and Euperipatoides, see main text), not just during ecdysis (as in Onychodictyon, see Topper et al. 2013). Where sclerites are not preserved in sufficient detail to assess their construction, this transformation series is coded as ambiguous.'; + TEXT CHARACTER=216 TEXT='This character describes sclerites that are broadly distributed across much of the trunk integument.^n^nIn taxa such as Hallucigenia, trunk sclerites are absent, leaving only the enlarged sclerites (dorsal spines and claws).^n^nThe tergal plates of kinorhynchs derive through thickening of the trunk cuticle, hence these do not represent trunk sclerites.^n^n---^n^nPhosphatized Hadimopanella-like plates characterize palaeoscolecids sensu lato (Harvey et al. 2010).^n^nPlates in Louisella have the same properties as non-mineralized cuticular structures.^nNematomorph areoles are cuticular, not mineralized (Bolek et al. 2010)^nSpines are heavily chitinised in Acanthopriapulus (van der Land 1970)^nSpines of Corynetis are not obviously mineralized (Huang et al. 2004a)^n'; + TEXT CHARACTER=217 TEXT='Palaeoscolecid plates are routinely preserved in three dimensions as phosphate. Traces of phosphorous, as occur in e.g. 
Hallucigenia spines in the Burgess Shale, are not taken to denote a heavy degree of original phosphatization, so taxa where a phosphatic composition is not robustly attested are coded as lacking heavy phosphatization.'; + TEXT CHARACTER=218 TEXT='Circular in Scathascolex, Wronascolex spp.^nElongated parallel to body axis in Palaeoscolex piscatorum^nEssentially circular in Chordodes (Bolek et al. 2010)^ncf. WTS30'; + TEXT CHARACTER=219 TEXT='Nodes are raised lumps, arranged in a series parallel to the plate margin^nBlackberry areoles in Chordodes have a similar construction, even if the nodes are irregularly distributed (Bolek et al. 2010) – but these areoles are perhaps better considered as equivalent to platelets, by comparison with priapulid tumuli.^nSchistoscolex has four nodes in an irregular ring (Müller and Hinz-Schallreuter 1993)'; + TEXT CHARACTER=221 TEXT='Palaeoscolex piscatorum has eight to ten nodes on its plates (Conway Morris 1997)^nScathascolex sometimes has five, perhaps sometimes has four as well?^nWronascolex antiquus has four to six ^nWronascolex iacoborum has five, always'; + TEXT CHARACTER=224 TEXT='In certain taxa the anterior and posterior trunk exhibit prominently distinct sclerite morphology, even if the trunk itself may not be differentiated'; + TEXT CHARACTER=226 TEXT='This character refers to integumentary trunk sclerites. Enlarged sclerites often exhibit a distinct distribution (as in Eokinorhynchus); if only enlarged sclerites are present (as in Hallucigenia, treating claws and spines as enlarged trunk sclerites), this character is inapplicable.^n^nPlates of Corynetis form clear transverse rows (Huang et al. 2004a)^nThose of Tubiluchus lemburgi form longitudinal rows that occasionally arise or pinch out (Schmidt-Rhaesa et al. 2013)^nTaxa with a differentiated fore-trunk (e.g. 
Eximipriapulus, Meiopriapulus) often show a more regular arrangement in their ‘neck’; the arrangement in the trunk (which is typically irregular) is what is coded here.^nSome ordering is apparent in Selkirkia, where the rows are clearly diagonal/quincuncial^nIn ventrolateral, bilaterally paired groups of one or more elements'; + TEXT CHARACTER=227 TEXT='Wronascolex antiquus has a single row of plates on each annulation. Scathascolex minor has a row of plates on each margin of each annulation; within each row, sclerites are longitudinally paired. I have interpreted this as two primary fields per annulation, each comprising two rows of sclerites. cf. WTS30^nAmbiguous (at best) in Louisella (Conway Morris 1977a, 1997; Smith 2015)^nProminently single in Tylotites (Han et al. 2007c)^nSeemingly single in Chalazoscolex (Conway Morris and Peel 2010)'; + TEXT CHARACTER=228 TEXT='This character primarily has in mind the regimented distribution of plates within each plate field of palaeoscolecid worms.'; + TEXT CHARACTER=229 TEXT='The plates of Corynetis form a quincuncial arrangement, a consequence of each subsequent transverse row being offset relative to the previous (Huang et al. 2004a).'; + TEXT CHARACTER=230 TEXT='Microplates are smaller than plates and platelets and are expressed as a patterning of the cuticle. [tbc]^nAmbiguous in Louisella and Tylotites as plates are not strongly preserved; preservational quality is inadequate to assess the presence of microplates'; + TEXT CHARACTER=232 TEXT='This character captures the differentiation of individual sclerites to specialized roles, including sensory and locomotory sclerites. The specific role is not specified, reflecting the fact that sclerites may serve multiple roles (for example, many priapulan scalids are sensory structures used in locomotion) and the possibility that the primary role of a structure may vary depending on context.
Moreover, the function of a sclerite is difficult to infer from fossil material.'; + TEXT CHARACTER=233 TEXT='WTS34^nTubuli are distinctive tube-like projections arising from the trunk in certain priapulids (at loricate and adult stages) [@SchmidtRhaesa2013] (e.g. Janssen et al. 2009). In Tubiluchus, these are adhesive organs with a bulbous base and a stiff tapering tube (Todaro and Shirley 2003).'; + TEXT CHARACTER=234 TEXT='WTS33^nTumuli are small papillae: round-topped cuticular wart-like structures [@SchmidtRhaesa2012]. In Tubiluchus they are supported at their periphery by cuticular ridges, giving them a star-shaped aspect (Todaro and Shirley 2003).^nThis character is applied inclusively to incorporate any case where small sclerites occur alongside regular sclerites, with an equivalent distribution and ornamentation.^n^nVariation in plate size in Chordodes is neither systematic nor substantive (Bolek et al. 2010); this taxon is coded as having plates of a single size.'; + TEXT CHARACTER=235 TEXT='Priapulid tumuli have a distinctively star-shaped appearance (Schmidt-Rhaesa et al. 2013)'; + TEXT CHARACTER=236 TEXT='Taxa such as Eokinorhynchus exhibit two size classes of sclerites: small sclerites borne on individual annulations, which typically cover much of the trunk; and individual sclerites that are prominently larger. These larger sclerites often include a prominent spine. There is a continuity in morphology between these spines and the dorsolateral specializations in cricocosmiids, in Microdictyon and Onychodictyon, and in hallucishaniids. We therefore consider these sclerites as potential homologues. In Shergoldana the enlarged sclerites form tessellating plates that encircle the trunk, corresponding to the position of plicae in loriciferan Higgins larvae [see e.g.
@Neves2019] – which in some cases also exhibit a broad base and a pointed apical projection.^n^nThe nodes, plates and spines of lobopodian taxa (TS32) represent epidermal evaginations; the paired sclerotized dorsal plates of Actinarctus (Heterotardigrada) are also interpreted as epidermal evaginations (e.g. Nelson 2002; Marchioro et al. 2013; Persson et al. 2014). The paired pits that serve as muscle attachment sites in Halobiotus (Eutardigrada) are not treated as homologous (Halberg et al. 2009; Marchioro et al. 2013). We code Paucipodia, Diania and Aysheaia as uncertain; their preservation is insufficient to establish whether the paired specializations are node-like evaginations or pit-like depressions (Chen et al. 1995a; Liu and Dunlop 2014; Ma et al. 2014a).^nShergoldana bears three rings of four epidermal evaginations (Maas et al. 2007a); we follow the model of Dzik and Krumbiegel (Dzik and Krumbiegel 1989) and code these in the same fashion as the trunk developments of certain palaeoscolecids (e.g. Cricocosmia) and lobopodians (e.g. Microdictyon).^n^n^n---^nThis character refers to the differentiated epidermal regions found on the dorsal side of most lobopodians. The epidermal specialization is usually conspicuous, as in the paired nodes of Xenusion [@Dzik1989], Hadranax [@Budd1998p] and Kerygmachela [@Budd1993; @Budd1998trse]; the sclerotized plates of Onychodictyon [@Zhang2007; @Ou2012]; and the spines of Hallucigenia [@Ramskold1992; @Hou1995zjls; @Steiner2012], luolishaniids [@Ma2009; @Yang2015] and Orstenotubulus [@Maas2007csb]. The transformation is also coded as present in the modern tardigrades, denoting the paired pit-like structures associated with each pair of legs. These have been described as sites for muscular attachment in the visceral side of the body wall [e.g. 
@Halberg2009; @Marchioro2013]; the epidermal specializations of lobopodians have also been interpreted as muscle attachment sites [@Budd2001ed; @Zhang2007].^n^n---^nCharacters 41-42 in @Smith2015 and 39-40 in @Yang2015.^n^nMODIFIED in regards to how tardigrades are treated: plates, but not depressions, are included here, returning to the formulation of e.g. @Nelson2002; @Marchioro2013; @Persson2014.^nThe epidermal depressions of Halobiotus (Eutardigrada), represented by the paired pits that serve as muscle attachment sites [@Halberg2009; @Marchioro2013], are therefore not included.'; + TEXT CHARACTER=237 TEXT='Given the possibility that lobopodian sclerites derived from the plates of palaeoscolecid worms (Dzik and Krumbiegel 1989), we have reformulated this transformation series from (Smith and Ortega-Hernández 2014) to encapsulate the ‘two longitudinal rows’ of sclerites envisioned by trans. ser. 31 in Wills et al. (2012). We still code these as present in tardigrades to represent the possible homology of their epidermal depressions with the epidermal evaginations of other lobopodians (Smith and Ortega-Hernández 2014). Aysheaia is coded as absent as its ‘plates’ (reported by Liu and Dunlop 2014) seem to represent the impressions of the opposite pair of legs (see Whittington 1978). Eokinorhynchus is coded as present as its spines are regularly paired; the seemingly ventral position of the first pair may represent relocation late in development, or deformation of the specimen during preservation. Chalazoscolex is coded present, with the “two to three” individual sclerites occupying the width of each segment (Conway Morris and Peel 2010) assumed to reflect sclerites of the dorsal zone. Loriciferans are scored as present, as their plicae form regular rings of plates around their lorical region [@Neves2016].'; + TEXT CHARACTER=238 TEXT='Cf. 
WTS32.^n^nWe score Cardiodictyon as having two epidermal specializations (token 1), following suggestions that the apparently single dorsal sclerite is formed by the fusion of a pair of elements (Liu and Dunlop 2014).^n^nIn Loricifera, this character denotes the number of plicae in each ring of the lorica.^n^nWe score Cardiodictyon as having two epidermal specializations, following suggestions that the apparently single dorsal sclerite is formed by the fusion of a pair of elements [@Liu2014ppp]. The plates of Cricocosmia occur in pairs [@Han2007app]. Collinsium bears five primary spines [@Yang2015]; Acinocricus bears seven [@ConwayMorris1988]. Tardigrades are coded as ambiguous in view of the complex integration of their dorsal plates.^n^nCharacter 49 in @Smith2015 and 47 in @Yang2015.^n'; + TEXT CHARACTER=239 TEXT='Enlarged sclerites may occur on every annulation (as in Cricocosmia jinningensis) or less frequently.^n^nThe plates of the lorica occur on every annulation of the lorica zone [@Neves2016].'; + TEXT CHARACTER=240 TEXT='In most lobopodian taxa, the epidermal specializations exhibit a regular spacing, even if the spacing of appendages varies along the body [@Smith2015]. Both Collinsium and Luolishania, by contrast, exhibit an extended spacing between spines in the medial portion of the trunk [@Ma2009; @Yang2015].^n^nCharacter 50 in @Yang2015.'; + TEXT CHARACTER=241 TEXT='Some heterotardigrades dorsally have plates between segmental plates. ^n^nCharacter 112 in @Khim2023. '; + TEXT CHARACTER=242 TEXT='In most armoured lobopodians, each group of dorsal spines or plates exhibits a similar size [e.g. @Smith2015]. 
In Collinsium, Hallucigenia hongmeia, Luolishania, Acinocricus and the Emu Bay Collins Monster, the size of spines varies between each group [@ConwayMorris1988; @Liu2007az; @Ma2009; @Steiner2012; @Garcia2013].^n^nCharacter 49 in @Yang2015.^n'; + TEXT CHARACTER=243 TEXT='Some echiniscoidean tardigrades have a dorsal segmental plate at the last trunk segment, with an additional plate which does not match to the trunk segment. ^n^nCharacter 111 in @Khim2023.'; + TEXT CHARACTER=244 TEXT='Lobopodians’ epidermal evaginations fall into two geometric categories: flat nodes or plates (token 1) and tall spines (token 2). Although the distal portions of the evaginations of Orstenotubulus are not preserved (Maas et al. 2007b), we infer a spine-like habit from the proportions of the spine stubs.^n^nCharacter 43 in @Smith2015 and 41 in @Yang2015.'; + TEXT CHARACTER=245 TEXT='This character refers solely to the shape of the trunk evaginations’ apices. It is independent from the evaginations’ proportions, as demonstrated by Onychodictyon ferox, where sclerites are wider than tall (i.e. plates) but display an acute distal termination [@Zhang2007; @Ou2012; @Topper2013].^n^nCharacter 44 in @Smith2015 and 42 in @Yang2015.'; + TEXT CHARACTER=246 TEXT='The spines of Hallucigenia fortis (Hou and Bergström 1995), H. hongmeia (Steiner et al. 2012), Luolishania (Ma et al. 2009) and the Emu Bay ‘Collins Monster’ (García-Bellido et al. 2013b) are distinctively curved, whereas those of H. sparsa (Conway Morris 1977b) and Onychodictyon ferox (Topper et al. 2013) are essentially straight.'; + TEXT CHARACTER=247 TEXT='See character 11 in (Sørensen et al. 2015).^nSirilorica and Nanaloricus bear spikes on the anterior margins of their loricae (Peel et al. 2013; Neves et al. 2016); Pliciloricus and Eolorica do not (Neves et al.
2016; Harvey and Butterfield 2017)'; + TEXT CHARACTER=248 TEXT='The epidermal evaginations of Cricocosmia and "armoured" lobopodians are substantially sclerotized [@Hou1995zjls; @Han2007app; @Steiner2012; @Caron2013], in contrast to those of Xenusion [@Dzik1989], Hadranax [@Budd1998p], Diania [@Ma2014jsp] and Kerygmachela [@Budd1993; @Budd1998trse].^n^nCharacter 46 in @Smith2015 and 44 in @Yang2015.^n^n---^n^nThe epidermal evaginations of ‘armoured’ lobopodians are substantially sclerotized (Hou and Bergström 1995; Steiner et al. 2012; Caron et al. 2013), in contrast to those of Xenusion (Dzik and Krumbiegel 1989), Hadranax (Budd and Peel 1998) and Kerygmachela (Budd 1993, 1998a).^nThe robust preservation and narrow spinose projections of the evaginations of Shergoldana (Maas et al. 2007a) suggest primary sclerotization.'; + TEXT CHARACTER=249 TEXT='The epidermal specializations of arthrotardigrades such as Wingstrandarctus and Raiarctus have a cuticular expansion.^nWe code this as neomorphic.^n^nAdapted from character 109 in @Khim2023. '; + TEXT CHARACTER=250 TEXT='We code this character as uncertain in taxa that are not well enough preserved for the ornament to be apparent. Hallucigenia sparsa has a scaly ornament [@Caron2013] whereas H. hongmeia and Collinsium bear a net-like pattern [@Steiner2012; @Yang2015] shared with Onychodictyon, Microdictyon and Cricocosmia [@Han2007app; @Topper2013]; Cardiodictyon specimens show a comparable ornament [@Liu2014ppp fig. 4f].
The ornament of Cricocosmia and Tabelliscolex has been compared to Microdictyon, but this is in fact quite distinct, with a much more regular pattern of equally sized perforations [@Shi2022]^n^nCharacter 47 in @Smith2015, 45 in @Yang2015, 5 in @Sorensen2023.'; + TEXT CHARACTER=251 TEXT='Microdictyon and Onychodictyon plates exhibit mushroom-like bosses at the junction of the net-like pattern.'; + TEXT CHARACTER=252 TEXT='Arthropodization is thought to happen first in pre-ocular appendages, then co-opted to the rest of the appendages [@Chipman2019]. This character reflects this hypothesised event. Sclerotization is thought to occur simultaneously in all trunk appendages (as they have been co-opted from the pre-ocular appendages), therefore we code this in one character. This is treated as a neomorphic character as the trunk appendages" co-option of sclerotization from pre-ocular appendages requires additional genetic control.^n'; + TEXT CHARACTER=253 TEXT='(~) inapplicable: lateral flaps (trans. ser. 55) not present^nTransformation series 38 in Daley et al. (2009).'; + TEXT CHARACTER=254 TEXT='Treated as neomorphic.^n^nTo summarise @Daley2009 and @VanRoy2015:^nExopods, the outer branch of a true biramous limb, are unique to Mandibulata. The outer appendage branch of chelicerates and many stem-group euarthropods is interpreted as an exite, a lateral flap which is not homologous to the mandibulate exopod [see also @Bruce2020]. ^n@Daley2009 treated this flap as homologous with the lateral flaps of anomalocaridids and gilled lobopodians, which often bear dorsal lanceolate blades (= setal blades). ^n@VanRoy2015 considered the setal blades themselves to represent the exite, homologizing the wrinkling on Kerygmachela and Pambdelurion flaps, the setal blades of Opabinia and anomalocaridids, and the exites of upper stem euarthropods.
Dorsal flaps are therefore not necessarily present in addition to the setal blades; indeed @VanRoy2015 code them as absent in Amplectobeluids and Anomalocaris (as well as euarthropods).^n^nAdapted from character 31 in @Daley2009. Character 55 in @Smith2015 and 53 in @Yang2015.^n'; + TEXT CHARACTER=255 TEXT='The description of Aegirocassis @VanRoy2015 clarifies the relationship of the dorsal lanceolate (setal) blades in gilled lobopodians and radiodontans, and establishes their homology with setae borne on the outer appendage branches (i.e. exites) of upper-stem Euarthropoda.^n^nAdapted from characters 51, 56 and 68 from @Smith2015 and 54 in @Yang2015.'; + TEXT CHARACTER=256 TEXT='The dorsal flaps of anomalocaridids and gilled lobopodians are considered as homologous with euarthropod exites.^nSee character 57 in @VanRoy2015. Treated as neomorphic.^n^nCharacter 57 in @Smith2015 and @Yang2015.'; + TEXT CHARACTER=257 TEXT='@VanRoy2015 consider the setal blades to represent exites. Dorsal flaps are not always present in addition to the setal blades: the (ventral) flaps of amplectobeluids and Anomalocaris correspond to the euarthropod endopod. Dorsal flaps are considered to represent an elaboration of the setal blades, and thus treated as a neomorphic character.^n^nModified from character 21 in @VanRoy2015. Character 67 in @Smith2015 and 55 in @Yang2015.'; + TEXT CHARACTER=258 TEXT='Gnathobasic appendages are absent in fuxianhuiids [@Chen1995s; @Waloszek2005; @Bergstrom2008; @Yang2013] but present in Artiopoda [@Edgecombe1999; @Ortega2013] and megacheirans [@Chen2004; @Haug2012bmceb; @Haug2012p].^n^nCharacter 8 of @Ma2014jsp; 35 in @Daley2009; 58 in @Smith2015 and @Yang2015.'; + TEXT CHARACTER=259 TEXT='Character 51 of @VanRoy2015, reflecting the continuation of setal blades in certain dinocaridids across the dorsal surface.^n^nCharacter 56 in @Yang2015.'; + TEXT CHARACTER=260 TEXT='Some lobopodians have cylindrical appendages (e.g. 
Microdictyon, Hallucigenia) whereas others have more conical or tapered lobopods.^nInapplicable when lobopodous limbs are absent.^n'; + TEXT CHARACTER=261 TEXT='Only structures that are distinct from trunk sclerites are considered here.^n^nModified from character 9 in @Ma2014jsp. 59 in @Smith2015 and @Yang2015.^n'; + TEXT CHARACTER=262 TEXT='Spines and setae taper to a sharp point, whereas appendicules have a uniform length and a flattened terminus.^n^nCharacter 60 in @Smith2015 and @Yang2015.'; + TEXT CHARACTER=263 TEXT='In Luolishaniids the secondary structures are arranged in rows, whereas in Aysheaia, there is only one or two on trunk limbs.'; + TEXT CHARACTER=264 TEXT='Luolishaniids have long setiform spines [@Caron2020; @ConwayMorris1988; @Ma2009; @Garcia2013; @Yang2015], which contrast with the short, more equant spines of Diania and Aysheaia [@Whittington1978; @Ma2014jsp].^n^nCharacter 61 in @Yang2015.'; + TEXT CHARACTER=265 TEXT='In contrast to appendicules and spines, papillae are short projections associated with the annulations. The preservation of papillae in Ilyodes indicates that the absence of papillae in Carbotubulus is not taphonomic [@Haug2012cb].
Ambiguous in euarthropods as sclerotization is considered to overprint and obscure any papillae that may have been present.^n^nCharacter 10 in @Ma2014jsp; 61 in @Smith2015 and 62 in @Yang2015.'; + TEXT CHARACTER=266 TEXT='The finger-like projections in the legs of tardigrades can bear sets of terminal claws or sucking discs [@Schuster1980; @Nelson2002].^n^nCharacter 62 in @Smith2015 and 63 in @Yang2015.'; + TEXT CHARACTER=267 TEXT='A cuticularized spine is borne by the papillae of the partial Orsten-type lobopodian and crown-group onychophorans.^n^nCharacter 77 in @Zhang2016.^n^nA cuticularized spine is borne by the papillae of the partial Orsten-type lobopodian and crown-group onychophorans.^n^nCharacter 77 in @Zhang2016.'; + TEXT CHARACTER=268 TEXT='Arthrotardigrade Batillipes has discs on the tip of its limbs. Coded as neomorphic. ^n^nModified from character 50 from @Khim2023.'; + TEXT CHARACTER=269 TEXT='From character 63 in @Smith2015 and 64 in @Yang2015. This character is contingent on the presence of specialized trunk sclerites.'; + TEXT CHARACTER=270 TEXT='The outer edge of e.g. onychophoran claws has a similar curvature along its length, whereas the inner edge has a distinct inflection/step in curvature along its length, forming an enlarged attachment base.'; + TEXT CHARACTER=271 TEXT='Whilst many lobopodians have terminal claws, Aysheaia"s claws are sub-terminal; lobopods extend beyond the claws [@Whittington1978].^nInapplicable when terminal or sub-terminal claws absent.'; + TEXT CHARACTER=272 TEXT='Present in Eutardigrada [@Schuster1980; @Nelson2002; @Halberg2009] and the Siberian Orsten-type tardigrade [@Maas2001]. Absent in heterotardigrades and Palaeozoic lobopodians, which express simple concavo-convex claws.^n^nCharacter 64 in @Smith2015 and 65 in @Yang2015. Similar to character 52 in @Khim2023.^n^n'; + TEXT CHARACTER=273 TEXT='In apochelans, the primary and secondary branches are separate, whereas in parachelans they are fused.
^n^nCharacter 53 from @Kihm2023.'; + TEXT CHARACTER=274 TEXT='Where 1 represents the primary branch, and 2 is the secondary branch. If claw is 2121, the sequence of claws on a limb is external claw secondary (2), external claw primary (1), internal claw secondary (2), internal claw primary (1). ^n^nCharacter 54 in @Kihm2023.'; + TEXT CHARACTER=275 TEXT='Character 55 of @Kihm2023'; + TEXT CHARACTER=276 TEXT='Character 56 of @Kihm2023'; + TEXT CHARACTER=277 TEXT='@Mapalo2024cb, character 4'; + TEXT CHARACTER=278 TEXT='@Mapalo2024cb, character 5'; + TEXT CHARACTER=279 TEXT='@Mapalo2024cb, character 8'; + TEXT CHARACTER=280 TEXT='@Mapalo2024cb, character 10'; + TEXT CHARACTER=281 TEXT='@Mapalo2024cb, characters 11 and 12 merged'; + TEXT CHARACTER=282 TEXT='@Mapalo2024cb, character 13'; + TEXT CHARACTER=283 TEXT='@Mapalo2024cb, character 14'; + TEXT CHARACTER=284 TEXT='@Mapalo2024cb, character 15'; + TEXT CHARACTER=285 TEXT='@Mapalo2024cb, character 16'; + TEXT CHARACTER=286 TEXT='@Mapalo2024cb, character 19'; + TEXT CHARACTER=287 TEXT='@Mapalo2024cb, character 21'; + TEXT CHARACTER=288 TEXT='@Mapalo2024cb, characters 22 and 23 merged'; + TEXT CHARACTER=289 TEXT='@Mapalo2024cb, character 24'; + TEXT CHARACTER=290 TEXT='@Mapalo2024cb, character 25'; + TEXT CHARACTER=291 TEXT='The differentiated anterior appendages of hallucishaniids do not bear unambiguous claws: structures interpreted as such (e.g. in Ovatiovermis, Luolishania) are not morphologically or compositionally distinct from co-occurring setae/spinules. As such, only the walking trunk limbs are considered.^n^nCharacter 65 in @Smith2015 and 66 in @Yang2015.'; + TEXT CHARACTER=292 TEXT='In many lobopodians, posterior trunk appendages bear fewer claws than anterior appendages.'; + TEXT CHARACTER=293 TEXT='All seven claws in Aysheaia are identical [@Whittington1978]. Euperipatoides claws are identical on trunk limbs, although the jaw elements are differentiated [@Smith2014].
Paucipodia claws are not visibly differentiated [@Hou2004]; neither are those of Hallucigenia sparsa [@Smith2015]. Onychodictyon ferox has a large and a small claw [@Steiner2012].^n^nCharacter 66 in @Smith2015; 6 and 17 in @Mapalo2024cb.'; + TEXT CHARACTER=294 TEXT='A movable foot is present in the Onychophoran crown group, but not in Tertiapatus [@Poinar2000].^n^nCharacter 67 in @Yang2015.^n'; + TEXT CHARACTER=295 TEXT='Certain heterotardigrades have partitioned, retractable limbs. We code this character as neomorphic. ^n^nCharacter 42 from @Kihm2023'; + TEXT CHARACTER=296 TEXT='Transformation series 31 in Ma et al. (Ma et al. 2014a); trans. ser. 36 in Daley et al. (2009). The definition has been slightly modified reflect the presence of two pairs of lateral flaps in Anomalocaridida (Van Roy et al. 2013).'; + TEXT CHARACTER=297 TEXT='Character 37 in @Daley2009; 69 in @Smith2015 and @Yang2015.^n'; + TEXT CHARACTER=298 TEXT='Character 40 in @Daley2009: "Posterior tapering of the width of the lateral lobes is pronounced in Anomalocaris and Laggania, while other lateral lobe-bearing taxa, including Hurdia, have a more even body outline."^n^nCharacter 70 in @Smith2015 and @Yang2015.^n'; + TEXT CHARACTER=299 TEXT='The first pair of body flaps (posterior of segments lacking flaps) are enlarged into "paddles" in Schinderhannes and Lyrarapax [@Kuhl2009; @Cong2014; @Cong2016]. Because the body flaps of these radiodontans are homologous with endopods and lobopods [@VanRoy2015], this character has been generalized from @Yang2016 in order to apply to all appendage-bearing taxa.^n^nCharacter 68 in @Yang2015.^n'; + TEXT CHARACTER=300 TEXT='In Lyrarapax, Hurdia, Peytoia and Anomalocaris, the flaps of the anterior region are reduced [@Daley2009; @Cong2014; @Daley2014], whereas in Opabinia, Kerygmachela and Pambdelurion, the equivalent flaps remain expressed [@Whittington1975; @Budd1998ar; @Budd1998trse]. 
The preservation of Aegirocassis and Schinderhannes is inadequate to resolve this feature.^nBecause the ventral body flaps of the radiodonts are homologous with endopods and lobopods [@VanRoy2015], this character has been generalized from @Yang2015 and @Yang2016 in order to apply to all limb-bearing taxa.^n^nCharacter 71 in @Yang2015.'; + TEXT CHARACTER=301 TEXT='Transformation series 38 in Ma et al. (Ma et al. 2014a).'; + TEXT CHARACTER=302 TEXT='Hallucigenia fortis has two pairs of elongate limbs [@Ma2012asd]; Hallucigenia sparsa has three [@Smith2015]; Luolishania, Facivermis, Acinocricus and the Emu Bay Collins Monster have five [@Ramskold1998; @Ma2009; @Garcia2013; @Howard2020]; Collinsium, Ovatiovermis and Collinsovermis bear six [@Caron2020; @Yang2015; @Caron2017].^n^nCharacter 73 in @Yang2015.'; + TEXT CHARACTER=303 TEXT='The anterior limbs of Hallucigenia sparsa are simple and lack cirri; the anterior limbs of luolishaniids bear multiple cirri. The trunk is not differentiated into distinct anterior and posterior components in any other taxon.^n^nCharacter 71 in @Yang2015.^n'; + TEXT CHARACTER=304 TEXT='The endopods of certain taxa in the euarthropod stem-group, such as fuxianhuiids, bear 15 or more podomeres and are considered "multipodomerous" [@Chen1995s; @Waloszek2005; @Bergstrom2008; @Yang2013].^n^nCharacter 72 in @Smith2015 and 74 in @Yang2015.^n'; + TEXT CHARACTER=305 TEXT='Some echiniscoideans have a small sclerotized plate on the last pair of limbs.^nWe code this character as neomorphic. ^n^nCharacter 118 from @Khim2023.'; + TEXT CHARACTER=306 TEXT='This character has been modified from that of previous analyses [e.g. character 34 in @Ma2014jsp] to reflect the fact that, in extant onychophorans, the posterior extension of the lobopodous trunk (i.e. anal cone) corresponds to a segment that has lost its appendage pair, as evinced by the prevalence of nephridia in this region [@Mayer2005].
As it is not possible to determine whether the posterior extension of the trunk in Palaeozoic lobopodians arises through the loss of the last appendage pair (as in Onychophora) or as an elongation of the trunk, we code this character as present in all taxa where the trunk extends posteriad of the last observable pair of limbs. Coded ambiguous where trunk appendages are absent.^n^nCharacter 73 in @Smith2015 and 75 in @Yang2015.'; + TEXT CHARACTER=307 TEXT='i.e. terminal limbs of lobopodians; lumps of palaeoscolecids. Distinguish from caudal appendages.^n^n^nThis transformation series has been modified by that of previous analyses (Ma et al. 2014a) to reflect the fact that, in extant Onychophorans, the posterior extension of the lobopodous trunk (i.e. anal cone) corresponds to a segment that has lost its appendage pair, as evinced by the prevalence of nephridia in this region (Mayer and Koch 2005). As it is not possible to determine whether the posterior extension of the trunk in Palaeozoic lobopodians arises through the loss of the last appendage pair (as in Onychophora) or as an elongation of the trunk, we code this transformation series as present in all taxa where the trunk extends posteriad of the last observable pair of limbs. We code this transformation series as absent in Kerygmachela (Budd 1993, 1998a), Jianshanopodia (Liu et al. 2006) and Anomalocaris (Daley and Edgecombe 2014) as their tails likely represent modified appendages (see transformation series 63 and 64). There is possible, but inconclusive, evidence for a small posterior extension in Opabinia (Whittington 1975; Budd 1996; Budd and Daley 2012), which is thus coded as uncertain. Siberion is scored as uncertain as it is difficult to distinguish the possible body termination from a posterior leg or pair of legs (Dzik 2011). Hallucigenia sparsa is also coded as uncertain; the posterior part of its body is poorly known (Ramsköld 1992). It is present in other species of Hallucigenia (e.g. 
Hou and Bergström 1995).'; + TEXT CHARACTER=308 TEXT='Character 42 in @Daley2009, 74 in @Smith2015 and 76 in @Yang2015.'; + TEXT CHARACTER=309 TEXT='The last pair of legs are rotated anteriad in tardigrades [e.g. @Marchioro2013], Aysheaia [@Whittington1978] and O. ferox [@Ou2012], but not in O. gracilis, Cardiodictyon, Hallucigenia fortis or Microdictyon [@Hou1995zjls].^n^nCharacter 78 in @Smith2015 and 80 in @Yang2015.'; + TEXT CHARACTER=310 TEXT='Character 75 in @Smith2015 and 77 in @Yang2015. See also character 35 in @Ma2014jsp.'; + TEXT CHARACTER=311 TEXT='In fuxianhuiids, the posteriormost appendage pair is modified into a tail fan or tail flukes [e.g. @Chen1995s; @Yang2013]; a similar condition is also observed in Opabinia [@Whittington1975; @Budd1996; @Budd2012], Anomalocaris [@Daley2014] and Hurdia [@Daley2009]. Partial fusion of the last pair of legs occurs in Aysheaia [@Whittington1978], Onychodictyon gracilis [@Liu2008app], O. ferox [@Ou2012] and Tardigrada [e.g. @Halberg2009; @Marchioro2013]; in these taxa, this characteristic is expressed as an incipient fusion of the medioproximal bases of the posteriormost appendage pair. ^n^nCharacter 76 in @Smith2015 and 78 in @Yang2015.'; + TEXT CHARACTER=312 TEXT='As noted by @Pates2022, many euarthropods have caudal rami. The rami of Kerygmachela may represent fused rami. ^n^nThis character distinguishes the long tail rami of Kerygmachela [@Budd1993; @Budd1998trse] from the flaps observed in Jianshanopodia [@Liu2006], anomalocaridids [@Daley2009; @Daley2014], and fuxianhuiids [e.g. @Yang2013].^n^nCharacter 77 in @Smith2015 and 79 in @Yang2015.'; + TEXT CHARACTER=313 TEXT='Opabiniids [@Budd2012; @Pates2022] and Anomalocaridids [@Daley2009] have differentiated posterior appendages that form a tail fan. Fuxianhuiids have similar modified appendicular tail flukes [e.g. @Yang2013].
Opabinia regalis has a paddle-like, more symmetric morphology to its tail appendages, whereas Anomalocaris and Utaurora appendages are more asymmetric, with a sharp anterior edge forming a blade-like morphology [@Pates2022].^nCoded as inapplicable when posterior tagma tail flaps are absent.^n^nCharacter 106 from @Pates2022.'; + TEXT CHARACTER=314 TEXT='Observed in Laojieella, Eximipriapulus, Xiaoheiqingella^nIncludes the lorical region of Palaeopriapulites and Sicyophorus. (Hou et al. 2017)'; + TEXT CHARACTER=315 TEXT='Cf. WTS45.^nBy comparison with Corynetis (Huang et al. 2004a; Hu et al. 2012), the posterior trunk region of Louisella (Conway Morris 1977a) is coded as a caudal appendage.^nThe posterior lobes of nematomorphs are not considered to represent separate appendages or organs, so caudal appendages are coded as absent in this taxon.^nJust a hint of some form of caudal appendage in Fieldia (ROM 93-1509)^nCoded ambiguous in Selkirkia as the posterior trunk is not known; the posterior of the tube was apparently open.'; + TEXT CHARACTER=316 TEXT='WTS41.^nGiven the difficulty of distinguishing the ‘bursa’ in fossil worms such as Ottoia and Louisella (Conway Morris 1977a) from a caudal appendage, or indeed of clearly defining a distinction, a ‘bursa’ is coded as a caudal appendage; this transformation series refers to the eversibility of this appendage.^nThe identity of the posterior extension in Chalazascolex as a bursa is speculative (Conway Morris and Peel 2010); this taxon is coded ambiguous.'; + TEXT CHARACTER=317 TEXT='Cf. WTS45. The caudal appendage of Tubiluchus lemburgi is distinctly longer than the body'; + TEXT CHARACTER=318 TEXT='WTS46.'; + TEXT CHARACTER=319 TEXT='Cf. WTS47.'; + TEXT CHARACTER=320 TEXT='Cf WTS47.'; + TEXT CHARACTER=321 TEXT='WTS48.'; + TEXT CHARACTER=322 TEXT='See characters 176-129 in @Meldal2004. 
Caudal glands open through a spinneret to secrete an adhesive that free-living nematodes use to attach to a substrate.'; + TEXT CHARACTER=323 TEXT='Cf. WTS39.^n^nThis character considers posterior spines, setae, and tubulae, but not posterior bifurcations of the trunk (as in adult nematomorphs), which are treated separately.^n^nThe ''toes'' of loriciferans are spines, used for locomotion and adhesion; they are reduced in adults [@Neves2016]. ^nLoriciferans bear posterior spines, interpreted as sensory setae (Neves and Kristensen 2014)^n^nPosterior projections in kinorhynchs [@Sorensen2008] that correspond to projections on other segments are not treated as posterior projections for the purposes of this character. Likewise, tergal extensions are extensions of the tergal plate [@Herranz2014], rather than distinct projections.'; + TEXT CHARACTER=324 TEXT='Cf. WTS44.'; + TEXT CHARACTER=325 TEXT='Cf. WTS49.^nCoded as ambiguous in Scathascolex and Eokinorhynchus as the tail hooks’ basal diameter is close to 20% of the trunk diameter; the preservation of the fossils makes it difficult to determine the exact diameter of the hooks.'; + TEXT CHARACTER=326 TEXT='Scathascolex and Eokinorhynchus have four hooks.^nWronascolex antiquus is scored as having four hooks; the hooks are occluded by adpression in the specimens figured in (García-Bellido et al. 2013a) but seem to occur in two pairs.^nMaccabeus has 40–65 hooks (Por and Bromley 1974)^nMeiopriapulus has 32–38 in a single ring (Sørensen et al. 2012a)^nSix in Markuelia (Dong et al. 2010)'; + TEXT CHARACTER=327 TEXT='Cf WTS39.^nSchistoscolex has four projections, two bilateral pairs; they encircle the entire posterior surface of the organism and are thus coded as being a radial ring.^nThe pairs in Eokinorhynchus form an open arc (Zhang et al. 2015)^nThe condition in Markuelia is unclear (Dong et al. 2010)^nThe condition in Acanthopriapulus is taken to be irregular (van der Land 1970)'; + TEXT CHARACTER=328 TEXT='WTS40. 
Ring papillae are small peg-like structures tipped with a seta; they occur on the annulus/annuli closest to the anus. They grade into abdominal setae, and are easily missed except with SEM analysis of living priapulids (Merriman 1981) and are thus coded ambiguous in fossil taxa except the exquisitely preserved Schistoscolex, where they are demonstrably absent. Corynetis and Xiaoheiqingella, coded as present in Wills et al. 2012, do not obviously express a ring of papillae that are distinct from abdominal spines.'; + TEXT CHARACTER=329 TEXT='WTS42.'; + TEXT CHARACTER=330 TEXT='Cf. WTS50.'; + TEXT CHARACTER=331 TEXT='Cf. WTS50.'; + TEXT CHARACTER=332 TEXT='@Budd2001za proposes the distribution of musculature as a key phylogenetic character. The musculature of tardigrades, Pambdelurion, Anomalocaris and more derived euarthropods is metamerically arranged and runs through the body cavity, whereas muscles in cycloneuralians, onychophorans and Kerygmachela are seemingly dominated by longitudinal and circular structures [@Carnevali1979; @Hoyle1980; @Budd1998l; @Budd2001za].^n^nCharacter 52 in @Smith2015.'; + TEXT CHARACTER=333 TEXT='Longitudinal muscles may exist in the peripheral region [see @Zhang2016] in addition to circular and/or metameric musculature. Present in priapulans and onychophorans [@Carnevali1979; @Hoyle1980]; absent in tardigrades and euarthropods [@Halberg2009], and presumed absent in Fuxianhuia.^n^nCharacter 113 in @Zhang2016.^n^nIn nematodes, longitudinal somatic musculature lies directly underneath the epidermis [@SchmidtRhaesa2014]'; + TEXT CHARACTER=334 TEXT='Observed in Pambdelurion, tardigrades and onychophorans [@Young2017].'; + TEXT CHARACTER=335 TEXT='In tardigrades, longitudinal muscles attach at successive points along the body; in onychophorans and gilled lobopodians, they attach only at the anterior and posterior end of the trunk [@Young2017].
Inapplicable if longitudinal muscles are absent.^n^nIn nematodes, the muscles attach at their edges to lateral, dorsal and ventral chords that protrude inwards from the epidermis [@SchmidtRhaesa2014]'; + TEXT CHARACTER=336 TEXT='In most kinorhynchs, longitudinal muscles attach to the pachycycli situated at the anterior segment margins; in ''aberrant'' kinorhynchs, they attach more posteriorly to the anteriormost part, or the central part, of each tegumental plate. [Paraphrased from @Herranz2021z]'; + TEXT CHARACTER=337 TEXT='WTS87.^n^nPresent in priapulans and onychophorans [@Carnevali1979; @Hoyle1980]; absent in tardigrades and euarthropods [@Halberg2009], and presumed absent in Fuxianhuia. ^nNanaloricus bears circular muscles around the neck (Neves et al. 2013)^nCircular muscles are reduced in adult Nematoids (Sørensen et al. 2008)^n^nCharacter 114 in @Zhang2016.'; + TEXT CHARACTER=338 TEXT='Longitudinal muscles occur inside circular muscles in priapulans and onychophorans [@Carnevali1979; @Hoyle1980].^n^nCharacter 115 in @Zhang2016.'; + TEXT CHARACTER=339 TEXT='Dorsoventral muscles are absent in segment 1 of certain kinorhynchs [@Herranz2021z]. Treated as a neomorphic character denoting the specialization of segment 1, hence coded as absent in taxa without segmented dorsoventral musculature.'; + TEXT CHARACTER=340 TEXT='Metamerically arranged dorsoventral and oblique muscles connecting the lateral and ventral muscle groups are present in tardigrades and euarthropods, resulting in a "box-truss trunk musculature system" [@Young2017]'; + TEXT CHARACTER=341 TEXT='Character(s) 84 in @Smith2015, 86 and 81 in @Yang2015.^n^nMa et al. (Ma et al. 2014a) described a dorsal heart in Fuxianhuia; all other fossil taxa are scored as ambiguous. 
Budd (2001b) discussed the difficulty of interpreting the absence of a circulatory system in Tardigrada as ancestral or derived, given that a circulatory system is unnecessary in a miniaturized organism; he concluded that the most methodologically sound way to address this issue in a cladistic context is to score the character as inapplicable.'; + TEXT CHARACTER=342 TEXT='Pharynx protractor muscles connect the base of the mouth cone to the posterior end of the pharynx in priapulans and kinorhynchs [@Neuhaus2002icb; @Altenburger2016ed]'; + TEXT CHARACTER=343 TEXT='The nervous system of priapulids is intraepithelial; neurites are basiepithelial in nematomorphs'; + TEXT CHARACTER=344 TEXT='WTS52.^n^n“Living priapulids possess unpaired ventral nerve cords, whereas gastrotrichs, onychophorans and loriciferans possess ventral nerve cords that are paired throughout their length, and the ventral nerve cords of nematomorphs and nematodes divide at points along their length [@SchmidtRhaesa1997]; the situation in kinorhynchs is unresolved (paired according to Kristensen and Higgins, 1991; unpaired according to Neuhaus 1994). The condition in Ottoia is common to extant priapulids (Conway Morris, 1977).”^nSee also (Martín-Durán et al. 2016)^nIn kinorhynchs there are seven to twelve nerve cords; the ventral nerve cord is unpaired in Echinoderes (Neuhaus and Higgins 2002)^n^nThis neomorphic character codes the transformation from a single ventral nerve cord (e.g., priapulans) to a pair (e.g., extant panarthropods, Chengjiangocaris). ^n^nTreated as paired in nematomorphs, a paired configuration can be observed in e.g. Paragordius [@SchmidtRhaesa2014], and its vestiges can be observed throughout the phylum [@SchmidtRhaesa1997]^n^nCharacter 85 in @Yang2016.'; + TEXT CHARACTER=345 TEXT='In many nematodes, the central cord exhibits a primary right branch and a subsidiary left branch, which may merge back into the primary cord terminally [@SchmidtRhaesa2014]. 
In other taxa, paired cords are equivalent in size [@SchmidtRhaesa2012].^n'; + TEXT CHARACTER=346 TEXT='WTS53.^nIn many nematodes, the central cord exhibits a primary right branch and a subsidiary left branch, which may merge back into the primary cord terminally [@SchmidtRhaesa2014]. The paired cords of certain nematomorphs also merge caudally [@SchmidtRhaesa2012]. Coded present also in cases where the nematomorph nerve cord is fully merged.^n'; + TEXT CHARACTER=347 TEXT='Character 2 in @Tanaka2013, 79 in @Smith2015 and 81 in @Yang2015.^n^nTardigrada and Euarthropoda have a ganglionated ventral nerve cord [@Schulze2014], in contrast to the ladder-like ventral nerve cord in Onychophora [@Mayer2013bmceb]. Priapulida have an unpaired nerve cord associated with a net-like system of neural connectives [@Storch1991; @Rothe2010].^n'; + TEXT CHARACTER=348 TEXT='Character 1 in @Tanaka2013; revised by @Yang2016 to apply only to paired nerve cords. This character distinguishes the organization of the ventral nerve cord in Onychophora [e.g. @Mayer2013bmceb] from that in other phyla.^n^nCharacter 83 in @Smith2015, 85 in @Yang2015 and 87 in @Yang2016.^n^n---^nTransformation series 1 in Tanaka et al. (2013). This transformation series distinguishes the organization of the ventral nerve cord in Onychophora (e.g. Mayer et al. 2013a) from that in other phyla.'; + TEXT CHARACTER=349 TEXT='Present in Onychophora and Tardigrada; absent in Euarthropoda, including Alalcomenaeus. Ambiguous in Lyrarapax and Chengjiangocaris. See @Yang2016 for further discussion.^n^nCharacter 88 in @Yang2016.'; + TEXT CHARACTER=350 TEXT='Neural concentrations (ganglia) along the ventral nerve cord give a "rope ladder-like" appearance in tardigrades and euarthropods, in contrast to a ladder-like VNC, found in onychophorans [@Yang2016]. 
The presence of transverse commissures likely are fundamentally linked neurological features [@Yang2016].^n^nCharacter 86 in @Yang2016.^n'; + TEXT CHARACTER=351 TEXT='Present in Priapulida, Onychophora, Tardigrada and Chengjiangocaris; absent in Euarthropoda and Alalcomenaeus. See @Yang2016 for further discussion.^n^nCharacter 93 in @Yang2016.^n'; + TEXT CHARACTER=352 TEXT='Orthogonal organization of several ring-like commissures and peripheral nerves that intersect longitudinal dorsal and lateral nerve strands to form a reticulate pattern. Present in Priapulida, Onychophora and Tardigrada. Uncertain in Chengjiangocaris; absent in Alalcomenaeus and crown Euarthropoda. See @Yang2016 for further discussion. Contra @Yang2016, we score this character as inapplicable in taxa where the regularly spaced peripheral nerves that constitute the transverse component of the orthogonal organization are not present.^n^nCharacter 89 in @Yang2016.'; + TEXT CHARACTER=353 TEXT='Complete in Priapulida and Onychophora; incomplete in Tardigrada. Inapplicable in Euarthropoda. See @Yang2016 for further discussion.^n^nCharacter 90 in @Yang2016.'; + TEXT CHARACTER=354 TEXT='Anteriorly displaced in Tardigrada and Euarthropoda; not in Onychophora. Ambiguous in fossil taxa. See @Yang2016 for further discussion.^n^nCharacter 91 in @Yang2016.^n'; + TEXT CHARACTER=355 TEXT='Two nerves innervate each leg in Onychophora and Eutardigrada, but a single nerve innervates each Euarthropod leg. The configuration is ambiguous in fossil material. See @Yang2016 for further discussion.^n^nCharacter 92 in @Yang2016.^n'; + TEXT CHARACTER=356 TEXT='Present in Eutardigrada and Euarthropoda; uncertain in Heterotardigrada; absent in Onychophora and Priapulida. 
See @Yang2016 for further discussion.^n^nCharacter 64 in @Yang2016.'; + TEXT CHARACTER=357 TEXT='WTS55.^n^nProposed as a synapomorphy of Cycloneuralia (Nielsen 2012), though also present in Panarthropoda; Euperipatoides has been scored as present based on the homology of the supraoesophageal ganglion with the circumpharyngeal brain, as argued by @Eriksson2000. Present in Nematomorpha despite the absence of a pharynx [@Henne2017; @SchmidtRhaesa2012]^n^nCircumpharyngeal nerve rings are found in the nematode brain [@White1997; @Henne2017] and the anterior nervous systems of extant tardigrades [@Mayer2013bmceb; @Smith2017]. They are likely precursors of the dorsal condensed brain [@Smith2024]'; + TEXT CHARACTER=358 TEXT='A synapomorphy of Nematomorpha [@SchmidtRhaesa1996; @SchmidtRhaesa1997]'; + TEXT CHARACTER=359 TEXT='Whereas typical cycloneuralians have a circumoesophageal nerve ring [e.g. @Storch1991; @Telford2008; @Edgecombe2009; @Rothe2010], Panarthropoda is characterized by dorsal condensed brain neuromeres [@Eriksson2003; @Mittmann2003; @Harzsch2005asd; @Mayer2010; @Mayer2013po]. A dorsal condensed brain has been described in Fuxianhuia [@Ma2012n] and Alalcomenaeus [@Tanaka2013].^n^nCharacter 80 in @Smith2015 and 82 in @Yang2015.^n'; + TEXT CHARACTER=360 TEXT='Number of neuromeres integrated into the dorsal condensed brain. See the introductory statements for char. 81 in @Smith2015 and char. 83 in @Yang2015.^n'; + TEXT CHARACTER=361 TEXT='Recent fossil data suggest a likely deutocerebral innervation for the mouth in Fuxianhuia and Alalcomenaeus based on the position of the oesophageal foramen relative to the brain [@Ma2012n; @Tanaka2013], which is congruent with the organization found in phylogenetically basal extant euarthropods such as Chelicerata and Myriapoda [@Mittmann2003; @Harzsch2005asd; @Scholtz2005; @Scholtz2006]. 
Tritocerebral innervation is observed in Pancrustacea, but not among the taxa included in this study.^n^nThe circumoral nerve ring is treated as homologous with the protocerebrum, per @Smith2024.^n^nCharacter 82 in @Smith2015 and 84 in @Yang2015.^n'; + TEXT CHARACTER=362 TEXT='Absent in panarthropods (Martin et al. 2017)'; + TEXT CHARACTER=363 TEXT='Cf. WTS54.^nUnpaired dorsal nerve cords are seen as a synapomorphy of Nematoida (Sørensen et al. 2008)'; + TEXT CHARACTER=364 TEXT='WTS56. The cycloneuralian brain comprises three distinct regions: an anterior aggregation of somata from neurons (perikarya), followed by a central neuropil, followed posteriorly by a further region of perikarya (Rothe and Schmidt-Rhaesa 2010). This has been proposed as a synapomorphy of the cycloneuralians (Lemburg 1999): as the brains of panarthropods are arranged differently (Martin et al. 2017). However, the perikarya has an equal distribution in nematomorpha (Schmidt-Rhaesa 1997a)'; + TEXT CHARACTER=365 TEXT='Modified from Wills et al. (2012) character statement 57, which is understood to refer to a modification of the cycloneuralian brain in Tubiluchus and Meiopriapulus; the revised character is thus scored inapplicable in taxa without a cycloneuralian brain arrangement. This avoids the difficulty in deciding which bit of the onychophoran brain, which contains abundant perikarya (Martin et al. 2017), is ‘apical’.'; + TEXT CHARACTER=366 TEXT='WTS58. Lemburg (1999) recognizes the presence of this character as a synapomorphy of (extant) Priapulida'; + TEXT CHARACTER=367 TEXT='Eutardigrades have a cloaca (combined opening of gonopore and anus). ^n^nCharacter 91 in @Khim2023.'; + TEXT CHARACTER=368 TEXT='WTS66.^nThe presence of a cloaca in both sexes is seen as a synapomorphy of Nematoida (Sørensen et al. 
2008)'; + TEXT CHARACTER=369 TEXT='WTS68.'; + TEXT CHARACTER=370 TEXT='In some heterotardigrades, the duct of the seminal receptacle (sperm storage pocket) extends to the external part of the body.^n^nAdapted from character 92 in @Khim2023.'; + TEXT CHARACTER=371 TEXT='Cf. WTS72.^nPerigenital setae comprising a ventral shaft and distal spine occur close to the urogenital pores in the anterior trunk of certain priapulans [@Land1970]'; + TEXT CHARACTER=372 TEXT='‘Mushroom shaped’ structures present in the genital region of Tubiluchus (Priapulida). WTS37'; + TEXT CHARACTER=373 TEXT='The clavula of Tubiluchus lemburgi has a short stalk and moderately sized distal bulb (Schmidt-Rhaesa et al. 2013).^nT. corallicola, T. australiensis have a short-stalked clavula (Van Der Land 1982; Schmidt-Rhaesa et al. 2013)^nT. remanei has a long-stalked clavula (Van Der Land 1982)'; + TEXT CHARACTER=374 TEXT='The clavula of Tubiluchus lemburgi has a short stalk and moderately sized distal bulb (Schmidt-Rhaesa et al. 2013).^nThe clavulae of T. remanei and T. corallicola are club-shaped with a distal bulb (van der Land 1982)^n^nFrom Schmidt-Rhaesa et al. 2013:^nIn T. remanei (see van der Land, 1982),^nthere is a row of perigenital setae of varying shape and size and^nvery long stalked clavula on each side next to the cloacal opening.^nIn T. corallicola (see van der Land, 1970), the urogenital pore and^nthe anus are very small and almost invisible. Close to each pore is a^nclavula, posterior are two large setae and anterior is a row of small^nperigenital setae. This row leads to a broad ventral region, in which^nnormal setae, large perigenital setae and tubuli are present; most^nprominent is a group of large “normal” setae anterior to the row^nof small perigenital setae. In T.
remanei(see van der Land, 1982),^nthere is a row of perigenital setae of varying shape and size and^nvery long stalked clavula on each side next to the cloacal opening.^nA comb-like series of cuticular ridges is described, but not figured.^nTubiluchus australensis (see van der Land, 1985) has a clavula with a^nlarge spherical distal end and a short stalk. Additionally, only a row^nof eight perigenital setae of varying shape and size is present on^neach side of the animal. Whereas the urogenital opening is almost^ninvisible in all previous species, it is quite large and funnel-shaped^ninT. philippinensis (see van der Land, 1985). With a length of about^n25µm the clavulae are very large, and their distal ends are clubshaped. Some setae are present close to each clavula, but the most^nconspicuous structures are a dense group of small perigenital setae^nanterior to each urogenital opening. InT. troglodytes(seeTodaro^nand Shirley, 2003), there are circular cuticular ridges, which in total^nhave the form of an “8”. Eight to 10 setae are present along the ridge^non each side, and a clavula and two setae are present in the anterior^nregion anterolateral of the ridges on each side. Anterior of these^nstructures is a dense group of up to 70 setae. The genital structures^nofT. arcticusandT. vanuatuensiscould not be included here (they^nare, e.g. not mentioned in the English summary of the species in^nAdrianov and Malakhov, 1996).'; + TEXT CHARACTER=375 TEXT='Bullulae are small hemispherical elevations present in the genital region of certain priapulids, including Tubiluchus lemburgi (Schmidt-Rhaesa et al. 2013).^nPresent in T. corallicola (Van Der Land 1982)'; + TEXT CHARACTER=376 TEXT='Onychodictyon ferox''s gut expands anteriad forming a cone shape [see @Vannier2017], whilst some other lobopodian guts do not expand significantly in the anterior region, with the anterior end of a gut with a similar diameter to the mid-gut. 
We code the taxa with an eversible pharynx as ambiguous as it is unclear how these should be coded. ^n^n^nThis replaces the invariant character 17 from @Zhang2016, "Pharynx differentiated from midgut" [SC: 11].^n'; + TEXT CHARACTER=377 TEXT='WTS51.^nThe polythyridium is a muscular component of the gut surrounding the entrance to the intestine, adorned with circlets of cuticular plates (valvulae) [@Rothe2010]. It is interpreted as an autapomorphy of Tubiluchidae [@Kirsteuer1970].'; + TEXT CHARACTER=378 TEXT='Cf. WTS67.^nThe reduction of protonephridia is seen as a possible nematoid synapomorphy (Sørensen et al. 2008)'; + TEXT CHARACTER=379 TEXT='Cf. WTS67.'; + TEXT CHARACTER=380 TEXT='A possible synapomorphy of Scalidophora (Sørensen et al. 2008)^nPresent in Pycnophyes (Neuhaus 1988)^nCheck Neuhaus 1994, Ultrastructure of alimentary canal and body cavity, ground pattern, and phylogenetic relationships of the Kinorhyncha , Microfauna marina for Zelinkaderes details.'; + TEXT CHARACTER=381 TEXT='Homologous to cuticularized tubes of Pycnophyes (and Kinorhyncha) (Neuhaus 1988). In species of Echinoderidae, the protonephridial openings form two fairly conspicuous sieve plates, and due to their distinct appearance in LM as well as SEM, they are often reported in systematic and taxonomic studies. However, in non-echinoderid species there are no sieve plates and the nephridial pores are much more inconspicuous.'; + TEXT CHARACTER=382 TEXT='The absence of such microvilli is a possible synapomorphy of Priapulida + Kinorhyncha (Neuhaus and Higgins 2002)'; + TEXT CHARACTER=383 TEXT='WTS38. These states are retained in a single transformation series as states 1 and 2 are mutually exclusive but are unlikely to be homologous.'; + TEXT CHARACTER=384 TEXT='WTS69.^nThe reduction of the flagellum is seen as a possible synapomorphy of Nematoids (Sørensen et al. 2008), though a flagelliform tail is found in Kinonchulus (Riemann 1972). 
A flagellum has been reported in Gordius, but this is probably a misinterpretation (Schmidt-Rhaesa 1997b) , so Chordodes is coded as lacking a flagellum.'; + TEXT CHARACTER=385 TEXT='WTS73.^nSee (Bereiter-Hahn et al. 1984). Nematodes and Nematomorphs have principally replaced chitin with collagen as the principle component of their cuticle, though vestiges of chitin remain (Nielsen 2012). Coded as present in Palaeoscolex as chambers in the cuticle are believed to correspond to collagen fibres (Kraft and Mergl 1989)'; + TEXT CHARACTER=386 TEXT='Cf. WTS74.'; + TEXT CHARACTER=387 TEXT='WTS76.'; + TEXT CHARACTER=388 TEXT='WTS88'; + TEXT CHARACTER=389 TEXT='Present in heterotardigrades. Some eutardigrades also show a pillar-like structure in their epicuticle. Character 4 from @Khim2023. '; + TEXT CHARACTER=390 TEXT='Special types of glial/epidermal cells with characteristic bundles of tonofilaments, interpreted as a scalidophoran synapomorphy (though absent in certain Echinoderes species) (Nebelsick 1993). Coded, following the references in Nebelsick, as present in Tubiluchus, Meiopriapulus, Pycnophyes and Loricifera, ambiguous in Echinoderes dujardinii, and absent in Nematoda.'; + TEXT CHARACTER=391 TEXT='WTS78.'; + TEXT CHARACTER=392 TEXT='Cf. 
WTS79.^n^nTo add^nIn the tardigrade Echiniscus viridis, the central cuticle comprises:^n- An outer portion of alternating dense and transparent layers, with a much denser band proximally^n- Within these, a region made up of hexagons (looking striated in transverse section), with a complex dense outer layer and a less dense inner one^n- Proximal to that, an electron transparent zone containing dense rods^n- Within that, and innermost, transversely oriented fibres^n- The structure of the ventral cuticle is […] virtually identical to that described by Wright and Hope (1968) for the cuticle of the marine nematode, Acanthonchus (duplicatus Wieser, 1959 and quite similar to that described by Inglis (1964) and Watson (1965) for cuticles of certain other nematodes, including Elichromadora sp. Moreover, the striated layer found in the cuticle of E. viridis appears to be a nearly universal characteristic of nematode cuticles (cf. Lee 1966, Wisse and Daems 1968) – (Crowe et al. 1970)'; + TEXT CHARACTER=393 TEXT='Character 36 in @Mapalo2024cb'; + TEXT CHARACTER=394 TEXT='WTS60. Biphasic encompasses the multiple phases of priapulid larvae (e.g. Wennberg et al. 2009), also documented in Sirilorica (Peel et al. 2013)^nThe nematode larva is morphologically similar to the adult, but lacks reproductive functions.^nLarvae of Priapulopsis are poorly known but are understood to be similar to Priapulus (van der Land 1970), and are coded equivalently herein.'; + TEXT CHARACTER=395 TEXT='Cf. WTS62'; + TEXT CHARACTER=396 TEXT='Crenulated in the Higgins larva of loriciferans (Neves et al. 2016)^nNot in Halicryptus (Storch and Higgins 1991; Janssen et al. 2009)'; + TEXT CHARACTER=397 TEXT='WTS63. Lemburg (1999) recognised the presence of this character as a synapomorphy of (extant) Priapulida. However, it has since been demonstrated that the larvae of nematomorphs also possess six pharyngeal retractor muscles (Kristensen 2003; Müller et al. 2004).
Long pharynx retractor muscles are also present in loriciferans (at least within the Nanaloricidae) (Neves et al. 2013).'; + TEXT CHARACTER=398 TEXT='WTS65.^nA proposed synapomorphy of Scalidophora, but also present in nematomorphs.^nPresent in Priapulus caudatus, Tubiluchus corallicola; absent in Kinorhyncha, Loricifera; probably absent in Nectonema larvae yet present in adults (Schmidt-Rhaesa 1997a)'; + TEXT CHARACTER=399 TEXT='Proposed as a synapomorphy between nematomorph larvae and loriciferan adults [@Kristensen1983], but not seemingly ^nNot reported in Halicryptus, Maccabeus or Priapulus (van der Land 1970; Por and Bromley 1974; Storch and Higgins 1991), but present in Tubiluchus (Higgins and Storch 1989)'; + TEXT CHARACTER=400 TEXT='(~) inapplicable: proboscis and abdomen undivided^nThe larvae of Orstenoloricus (Maas and Waloszek 2009) possess a pair of spines at the anterior of the trunk. These may correspond to the anteroventral setae of Tenuiloricus (Neves and Kristensen 2014). Similar spines are present in Nanaloricidae and Pliciloricidae (Neves et al. 2016)^nThe tubuli present in the Halicryptus hatching larva are not paired, and disappear in the Higgins larval stage; paired spines are coded as absent in this taxon (Storch and Higgins 1991; Janssen et al. 2009); similar structures are present in Tubiluchus (Kirsteuer 1976)'; + TEXT CHARACTER=401 TEXT='(~) inapplicable^nPresent in the Higgins larva of many loriciferans (Neves et al. 2016) and Shergoldana (Maas et al. 2007a).^nCoded as absent in Orstenoloricus (Maas and Waloszek 2009) as most specimens unambiguously lack them; only a single specimen has putative structures that are not unequivocally spines or appendages.^nPosterior protuberances occur at the posterior of the Halicryptus lorica (van der Land 1970); these probably ought to be coded in a separate transformation series but are included here for now. Similar features (‘tubuli’) are present in Priapulus (Higgins et al.
1993)'; + TEXT CHARACTER=402 TEXT='A similarity between the Nectonema and Nanaloricus larvae (Kristensen 1983)^nSac-like guts with single ‘fold’ in larvae of e.g. Tubiluchus (Higgins and Storch 1989)'; + TEXT CHARACTER=403 TEXT='Large mesenchyme cells in the larva are a similarity between nematomorph and loriciferan larvae (Kristensen 1983)'; + TEXT CHARACTER=404 TEXT='The Higgins larva is a component of the loriciferan lifecycle with a distinctive morphology'; + TEXT CHARACTER=405 TEXT='@Sorensen2023, character 1'; + TEXT CHARACTER=406 TEXT='@Sorensen2023, character 2'; + TEXT CHARACTER=407 TEXT='After @Sorensen2023, character 3; nature of wrinkles set as additional character'; + TEXT CHARACTER=408 TEXT='After character 3 in @Sorensen2023'; + TEXT CHARACTER=409 TEXT='@Sorensen2023, character 4'; + TEXT CHARACTER=410 TEXT='@Sorensen2023, character 6'; + TEXT CHARACTER=411 TEXT='@Sorensen2023, character 7'; + TEXT CHARACTER=412 TEXT='@Sorensen2023, character 8'; + TEXT CHARACTER=413 TEXT='@Sorensen2023, character 9'; + TEXT CHARACTER=414 TEXT='@Sorensen2023, character 12'; + TEXT CHARACTER=415 TEXT='@Sorensen2023, character 13'; + TEXT CHARACTER=416 TEXT='@Sorensen2023, character 14'; + TEXT CHARACTER=417 TEXT='@Sorensen2023, character 15'; + TEXT CHARACTER=418 TEXT='@Sorensen2023, character 17'; + TEXT CHARACTER=419 TEXT='@Sorensen2023, character 18'; + TEXT CHARACTER=420 TEXT='@Sorensen2023, character 19'; + TEXT CHARACTER=421 TEXT='@Sorensen2023, character 21'; + TEXT CHARACTER=423 TEXT='@Sorensen2023, character 22: modified to make the presence of mucrones a separate character'; + TEXT CHARACTER=424 TEXT='Modified from @Sorensen2023, character 22'; + TEXT CHARACTER=425 TEXT='@Sorensen2023, character 24'; + + [Attribute comments] + TEXT CHARACTER= 1 TAXON=11 TEXT='Indicated by CT sections [@Zhang2022]'; + TEXT CHARACTER= 1 TAXON=13 TEXT='Large internal spaces [@Liu2019]'; + TEXT CHARACTER= 1 TAXON=16 TEXT='Seemingly present [@Shao2016]'; + TEXT CHARACTER= 1 
TAXON=38 TEXT='Exhibits a large body cavity under certain conditions (possibly reproductive maturity?), even if some specimens lack one entirely [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 1 TAXON=39 TEXT='Exhibits a large body cavity under certain conditions (possibly reproductive maturity?), even if some specimens lack one entirely [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 1 TAXON=42 TEXT='Nematodes exhibit a primary body cavity that surrounds the organs and occupies much of the trunk [@SchmidtRhaesa2014, §1.7 and fig. 1.12]'; + TEXT CHARACTER= 1 TAXON=43 TEXT='Nematodes exhibit a primary body cavity that surrounds the organs and occupies much of the trunk [@SchmidtRhaesa2014, §1.7 and fig. 1.12]'; + TEXT CHARACTER= 1 TAXON=47 TEXT='Meiopriapulus is the only priapulan to exhibit a coelom: a small coelom surrounds the foregut [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 1 TAXON=52 TEXT='Large mixocoel is apparently homologous with the primary body cavity, though it fuses with the coelom during development [@Mayer2004az]'; + TEXT CHARACTER= 1 TAXON=53 TEXT='Large mixocoel is apparently homologous with the primary body cavity, though it fuses with the coelom during development [@Mayer2004az]'; + TEXT CHARACTER= 1 TAXON=54 TEXT='Large mixocoel is apparently homologous with the primary body cavity, though it fuses with the coelom during development [@Mayer2004az]'; + TEXT CHARACTER= 1 TAXON=131 TEXT='Coded as present, following @Smith2024'; + TEXT CHARACTER= 1 TAXON=134 TEXT='The ambiguous internal structure is interpreted as a primary body cavity, following @Smith2024'; + TEXT CHARACTER= 1 TAXON=138 TEXT='We interpret the dark stain within the cuticle as corresponding to the primary body cavity, after @Smith2024'; + TEXT CHARACTER= 1 TAXON=148 TEXT='We interpret the dark stain within the cuticle as corresponding to the primary body cavity, after @Smith2024'; + TEXT CHARACTER= 1 TAXON=152 TEXT='Large perivisceral cavity [@Smith2024]'; + TEXT CHARACTER= 1 TAXON=153 TEXT='Coded 
as present, following @Smith2024'; + TEXT CHARACTER= 1 TAXON=162 TEXT='Tonguelettes are interpreted as extensions of the primary body cavity [@Smith2024]'; + TEXT CHARACTER= 1 TAXON=163 TEXT='We interpret the dark stain within the cuticle as corresponding to the primary body cavity, after @Smith2024'; + TEXT CHARACTER= 1 TAXON=169 TEXT='Tonguelettes are interpreted as extensions of the primary body cavity [@Smith2024]'; + TEXT CHARACTER= 2 TAXON=7 TEXT='The preserved section of the incomplete NMNH198597 is 20 times longer than wide [@ConwayMorris1977]'; + TEXT CHARACTER= 2 TAXON=11 TEXT='The anterior is somewhat incomplete, but the layout of the gut demonstrates a short body [@Zhang2022]'; + TEXT CHARACTER= 2 TAXON=14 TEXT='Four [@Shao2020]'; + TEXT CHARACTER= 2 TAXON=16 TEXT='Estimated to range between 6 and 10 [@Shao2016]'; + TEXT CHARACTER= 2 TAXON=28 TEXT='Above 10:1, and notably longer than in other kinorhynchs -- interpreted as an adaptation to interstitial habitats [@Herranz2021z]'; + TEXT CHARACTER= 2 TAXON=33 TEXT='Above 10:1, and notably longer than in other kinorhynchs -- interpreted as an adaptation to interstitial habitats [@Herranz2021z]'; + TEXT CHARACTER= 2 TAXON=34 TEXT='Above 10:1, and notably longer than in other kinorhynchs -- interpreted as an adaptation to interstitial habitats [@Herranz2021z]'; + TEXT CHARACTER= 2 TAXON=97 TEXT='Incomplete specimens close to ten times longer than wide. Listed dimensions are 8 mm width and up to 100 mm length [@Howard2020].'; + TEXT CHARACTER= 2 TAXON=111 TEXT='~16 measured from @Hu2008, though the dimensions given in the text give a ratio closer to 12-13.'; + TEXT CHARACTER= 2 TAXON=119 TEXT='30-50 times longer than wide [@Yang2020]'; + TEXT CHARACTER= 2 TAXON=123 TEXT='"The ratio of width to length is ca. 
1/20" [@Han2007pr]'; + TEXT CHARACTER= 2 TAXON=124 TEXT='Preserved component >10× longer than wide'; + TEXT CHARACTER= 2 TAXON=125 TEXT='At least 10× longer than wide [@Budd1998p]'; + TEXT CHARACTER= 2 TAXON=129 TEXT='Close to 20, measured from YKLP11313 [@Ma2014]'; + TEXT CHARACTER= 2 TAXON=131 TEXT='>20 [@Strausfeld2022]'; + TEXT CHARACTER= 2 TAXON=137 TEXT='~10 [@Haug2012]'; + TEXT CHARACTER= 2 TAXON=148 TEXT='~12-14 [@Ou2011]'; + TEXT CHARACTER= 3 TAXON=10 TEXT='Absent as not clear that plate distribution follows dorsal-ventral axis [@Maas2007].'; + TEXT CHARACTER= 3 TAXON=11 TEXT='Consistent orientation of expanded plates [@Zhang2022]'; + TEXT CHARACTER= 3 TAXON=12 TEXT='Uncertain; not enough of the trunk is preserved to determine whether sclerites indicate a dorsoventral polarity; @Liu2019 do not articulate their basis for identifying the dorsoventral orientation'; + TEXT CHARACTER= 3 TAXON=13 TEXT='Uncertain; not enough of the trunk is preserved to determine whether sclerites indicate a dorsoventral polarity'; + TEXT CHARACTER= 3 TAXON=14 TEXT='[@Shao2020]'; + TEXT CHARACTER= 3 TAXON=16 TEXT='Location of nerve cord [@Wang2025] not considered to differentiate trunk'; + TEXT CHARACTER= 3 TAXON=45 TEXT='Present: both Halicryptus species bear a ventral groove [@Shirley1999]'; + TEXT CHARACTER= 3 TAXON=108 TEXT='Present, reflected by the three lateral zones [@ConwayMorris2010]'; + TEXT CHARACTER= 3 TAXON=118 TEXT='Ventral trunk bears enlarged plates, termed protuberances [@Hu2012]'; + TEXT CHARACTER= 3 TAXON=119 TEXT='Ventral surface distinguished by presence of sclerites [@Han2007; @Shi2022; @ThisStudy]'; + TEXT CHARACTER= 3 TAXON=123 TEXT='Dorsal spines longer than ventral spines [@Han2007pr] – but no prominent differentiation of trunk, so coded ambiguous.'; + TEXT CHARACTER= 3 TAXON=126 TEXT='Appendages, and dorsal extent of sclerites [@Whittington1975]'; + TEXT CHARACTER= 4 TAXON=120 TEXT='Ventral projections are treated as potential homologues to paired
appendages [@Dhungana2023]'; + TEXT CHARACTER= 5 TAXON=39 TEXT='Intestine terminates at the posterior end of the larva [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 5 TAXON=42 TEXT='Anus not terminal (Riemann 1972)'; + TEXT CHARACTER= 5 TAXON=103 TEXT='Although coded as abdominal by @Wills2012, it is not clear that this can be supported based on described fossil material [@Schram1973; @ConwayMorris1977]'; + TEXT CHARACTER= 6 TAXON=73 TEXT='The mouth appears to be at a terminal position, but due to the curvature of the trunk region, it faces anterio-ventrally. Since this character codes for a change in the position of the mouth, which is not observed, we code as terminal. '; + TEXT CHARACTER= 6 TAXON=74 TEXT='The mouth appears to be at a terminal position, but due to the curvature of the trunk region, it faces anterio-ventrally. Since this character codes for a change in the position of the mouth, which is not observed, we code as terminal. '; + TEXT CHARACTER= 6 TAXON=75 TEXT='The mouth appears to be at a terminal position, but due to the curvature of the trunk region, it faces anterio-ventrally. Since this character codes for a change in the position of the mouth, which is not observed, we code as terminal. '; + TEXT CHARACTER= 6 TAXON=76 TEXT='The mouth appears to be at a terminal position, but due to the curvature of the trunk region, it faces anterio-ventrally. Since this character codes for a change in the position of the mouth, which is not observed, we code as terminal. '; + TEXT CHARACTER= 6 TAXON=77 TEXT='The mouth appears to be at a terminal position, but due to the curvature of the trunk region, it faces anterio-ventrally. Since this character codes for a change in the position of the mouth, which is not observed, we code as terminal. 
'; + TEXT CHARACTER= 6 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 6 TAXON=126 TEXT='Terminal [@Whittington1978]'; + TEXT CHARACTER= 6 TAXON=128 TEXT='Following @Ou2012'; + TEXT CHARACTER= 6 TAXON=131 TEXT='Terminal mouth [@Strausfeld2022]'; + TEXT CHARACTER= 6 TAXON=141 TEXT='Terminal [@Howard2020]'; + TEXT CHARACTER= 6 TAXON=148 TEXT='A terminal mouth is incompatible with the extent of the preserved body cavity [@Ou2011]'; + TEXT CHARACTER= 6 TAXON=153 TEXT='@Liu2007az suggest a ventral location, although this could be due to compaction, therefore we code this as uncertain.'; + TEXT CHARACTER= 6 TAXON=156 TEXT='@Park2018 interpret a ventral position, contra @Budd1993; @Budd1998trse'; + TEXT CHARACTER= 6 TAXON=157 TEXT='The mouth opening is ventrally oriented in Pambdelurion [@Budd1998ar].'; + TEXT CHARACTER= 6 TAXON=166 TEXT='Ventral [@Cong2017]'; + TEXT CHARACTER= 7 TAXON=2 TEXT='The mouth of Gastrotrich is anterior, in some species terminal, in others is sub-terminal'; + TEXT CHARACTER= 7 TAXON=62 TEXT='Unclear if ventrally or anteriorly facing based on @Grimaldi1992 drawings.'; + TEXT CHARACTER= 7 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 7 TAXON=148 TEXT='Impossible to determine as obscured by head [@Ou2011]'; + TEXT CHARACTER= 7 TAXON=153 TEXT='Unclear mouth position, however mouth clearly not rotated to posteriad [@Liu2007az]'; + TEXT CHARACTER= 7 TAXON=156 TEXT='@Park2018 interpret the mouth has moved to a ventral position, yet faces anteriad, therefore has not been rotated to point posteriad.'; + TEXT CHARACTER= 7 TAXON=168 TEXT='Coded as anterior: figure 2J from @Moysiuk2019 shows that the mouth faces anteriorly, in contrast to the ventral facing mouth of e.g., Anomalocaris [see figures 5 and 8 from @Daley2014]. 
Therefore we code as anterior.'; + TEXT CHARACTER= 7 TAXON=169 TEXT='Hurdia specimens are often markedly disarticulated, and therefore difficult to code orientation of circumoral elements [@Daley2013jsp]. We conservatively code Hurdia ambiguous as there are no specimens in a ventral position to determine mouth orientation [supplementary data in @Daley2009]'; + TEXT CHARACTER= 7 TAXON=171 TEXT='Coded as ventral, per the two specimens in figure 3 of @Budd2021'; + TEXT CHARACTER= 7 TAXON=173 TEXT='@Cong2014 interpret a ventral-facing mouth'; + TEXT CHARACTER= 7 TAXON=174 TEXT='We code as ambiguous, as the mouthpart orientation is not clear from the fossil evidence given in figure 1F,H of @Kuhl2009'; + TEXT CHARACTER= 8 TAXON=7 TEXT='Contra @ConwayMorris1977, we interpret the anterior trunk as a differentiated anterior trunk; there is a gradual gradation between the anterior and posterior trunk, rather than a clear delineation. We interpret the narrow, seemingly unarmed (or perhaps lightly armed? Preservation does not allow the preclusion of diminutive armature) region between the trunk and the single ring of spines as the introvert, with the spines therefore corresponding to Zone II circumoral spines.'; + TEXT CHARACTER= 8 TAXON=10 TEXT='Denoted by ring of cushion-like folds'; + TEXT CHARACTER= 8 TAXON=11 TEXT='Anteriormost trunk missing [@Zhang2022]'; + TEXT CHARACTER= 8 TAXON=97 TEXT='The ''anterior proboscis'', which is ornamented with conical papillae [@Howard2020], is interpreted as a differentiated anterior trunk. The circumoral elements are interpreted as denoting the Zone I armature. No Zone II armature is evident.'; + TEXT CHARACTER= 8 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 8 TAXON=119 TEXT='Following Yang et al., 2020'; + TEXT CHARACTER= 8 TAXON=122 TEXT='Present [@Shi2022]'; + TEXT CHARACTER= 8 TAXON=129 TEXT='The narrow end of the complete specimen YKLP11314 [@Ma2014] is not dissimilar from the introvert of O.
ferox.^nWe interpret ELEL-SJ102058 [@Ou2018] to be folded back upon itself, accounting for the juxtaposition of its appendages on two adjacent layers within the sediment. On this view, the ''head'' of this specimen represents a cross-section through the trunk as it folds out of the plane of the specimen.'; + TEXT CHARACTER= 8 TAXON=131 TEXT='Uncertain; a small round disk seems to extend beyond the anterior head in @Strausfeld2022 fig. 3D, with a similar (denticulated?) circular structure in @Liu2014 fig. 4D. The identity of this structure requires further investigation.'; + TEXT CHARACTER= 8 TAXON=148 TEXT='Existence of a ventral structure cannot be ruled out [@Ou2011]'; + TEXT CHARACTER= 8 TAXON=156 TEXT='Coded ambiguous, reflecting possibility that Pambdelurion auxiliary plates correspond to introvert scalids [@Kihm2023]'; + TEXT CHARACTER= 8 TAXON=157 TEXT='Coded ambiguous, reflecting possibility that Pambdelurion auxiliary plates correspond to introvert scalids [@Kihm2023]'; + TEXT CHARACTER= 8 TAXON=158 TEXT='Coded ambiguous, reflecting possibility that Pambdelurion auxiliary plates correspond to introvert scalids [@Kihm2023]'; + TEXT CHARACTER= 9 TAXON=38 TEXT='Created by the lateral extension [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 9 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 9 TAXON=131 TEXT='Seemingly round [@Strausfeld2022]'; + TEXT CHARACTER= 10 TAXON=16 TEXT='Probably not invaginable, based on preservation of type material [@Liu2014]; but difficult to demonstrate conclusively'; + TEXT CHARACTER= 10 TAXON=19 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=20 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=21 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=22 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 
10 TAXON=23 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=24 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=26 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=27 TEXT='The anterior end can be inverted into the lorica [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=33 TEXT='Can be fully retracted [@Neuhaus2002icb]'; + TEXT CHARACTER= 10 TAXON=38 TEXT='Invaginable [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 10 TAXON=97 TEXT='Same width as trunk and never invaginated; interpreted as not invaginable [@Howard2020]'; + TEXT CHARACTER= 10 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 11 TAXON=7 TEXT='Invaginable (see NMNH83939)'; + TEXT CHARACTER= 11 TAXON=39 TEXT='See e.g. @Kakui2021'; + TEXT CHARACTER= 11 TAXON=95 TEXT='Always dumbbell shaped (?) [@Maas2007ppp]'; + TEXT CHARACTER= 11 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 12 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 13 TAXON=9 TEXT='Absent [@Dong2010]'; + TEXT CHARACTER= 13 TAXON=17 TEXT='Seemingly present [@Harvey2017nee], but difficult to establish number or morphology; elements in SEM perhaps bear subtle hints of serration, but are not unequivocally trichoscalids.'; + TEXT CHARACTER= 13 TAXON=24 TEXT='15 single trichoscalids [@Fujimoto2020mb]'; + TEXT CHARACTER= 13 TAXON=28 TEXT='Absent [@Rucci2020z; @Herranz2021z]'; + TEXT CHARACTER= 13 TAXON=32 TEXT='Introvert stylets are innervated from ten longitudinal introvert nerves that extend from the ventrally open forebrain, which comprises ten lobes of perikarya [@Nebelsick1993z]'; + TEXT CHARACTER= 13 TAXON=42 TEXT='Two rings of articulate labial papillae [@Riemann1972] '; + TEXT CHARACTER= 13 TAXON=43 TEXT='"Labial region offset by a constriction. Papillae prominent."
[@Peneva1999n]'; + TEXT CHARACTER= 13 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 13 TAXON=126 TEXT='The oral papillae are treated as potential homologues, by comparison with nematodes'; + TEXT CHARACTER= 14 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 15 TAXON=21 TEXT='Fifteen [@Gad2005mbr]'; + TEXT CHARACTER= 15 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 17 TAXON=24 TEXT='Basal plates present on neck alongside alternate trichoscalids, though absent on thorax [@Fujimoto2020mb]'; + TEXT CHARACTER= 17 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 18 TAXON=19 TEXT='@Heiner2004hmr'; + TEXT CHARACTER= 18 TAXON=20 TEXT='@Heiner2007hmr'; + TEXT CHARACTER= 18 TAXON=22 TEXT='@Gad2005ode'; + TEXT CHARACTER= 19 TAXON=21 TEXT='Serrated [@Heiner2008sb]'; + TEXT CHARACTER= 19 TAXON=24 TEXT='[@Fujimoto2020mb]'; + TEXT CHARACTER= 19 TAXON=30 TEXT='Fringed [@Neuhaus2012za]'; + TEXT CHARACTER= 19 TAXON=33 TEXT='Simple spines [@Herranz2016za]'; + TEXT CHARACTER= 19 TAXON=34 TEXT='Covered with long hairs [@Neuhaus2015z]'; + TEXT CHARACTER= 19 TAXON=35 TEXT='Seemingly fringed [@Sorensen2012mbrm fig. 5b]'; + TEXT CHARACTER= 20 TAXON=18 TEXT='Seven double and eight single [@Kristensen2007ib]'; + TEXT CHARACTER= 20 TAXON=19 TEXT='Eight single, seven double [@Heiner2004hmr]'; + TEXT CHARACTER= 20 TAXON=20 TEXT='Eight single, seven double [@Heiner2007hmr]'; + TEXT CHARACTER= 20 TAXON=21 TEXT='Single [@Heiner2008sb]'; + TEXT CHARACTER= 20 TAXON=22 TEXT='Eight single, seven double [@Gad2005ode]'; + TEXT CHARACTER= 20 TAXON=24 TEXT='Single [@Fujimoto2020mb]'; + TEXT CHARACTER= 21 TAXON=1 TEXT='By comparison with the Higgins larva [e.g. 
Sorensen2023ode], it is possible that an introvert existed anterior to the preserved lorica and neck, but is not preserved due to distinct preservation [@Maas2009] or involution in the available material.'; + TEXT CHARACTER= 21 TAXON=7 TEXT='Four rows of diminutive scalids at base of armature (ROM 93-1678), with gap before circumoral scalids (see e.g. NMNH198597, 198605)'; + TEXT CHARACTER= 21 TAXON=9 TEXT='@Dong2010'; + TEXT CHARACTER= 21 TAXON=40 TEXT='Lobe-like outgrowths of the stoma with small denticles on their edges [@Kulikov1998rjn] are treated as elements of the pharyngostome, per @Venekey2019z and @Inglis1969bbmnh; hence there is no introvert armature'; + TEXT CHARACTER= 21 TAXON=41 TEXT='Six odontia in anterior portion of buccal cavity (i.e. cheilostome), with accessory structures in between [@Leduc2016n]'; + TEXT CHARACTER= 21 TAXON=42 TEXT='A lip papilla occurs immediately adjacent to the articulated head seta; ahead of this is a double row of sclerotized spines [@Riemann1972].'; + TEXT CHARACTER= 21 TAXON=43 TEXT='Unarmed [@Borgonie1995]'; + TEXT CHARACTER= 21 TAXON=97 TEXT='A ring of diminutive conical elements (''oral spines'') surrounds the larger plates (RCCBYU10233; YKLP 11410) [@Howard2020]'; + TEXT CHARACTER= 21 TAXON=110 TEXT='Corynetis seems to have an unarmoured introvert leading to a ring of elongate circumoral spines [@Huang2004; @Hu2012; @Chen2012]'; + TEXT CHARACTER= 21 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 21 TAXON=121 TEXT='Specimens with well-preserved introverts and pharynxes are figured by @Maas2007ppp, @Hou1994; @Vannier2017'; + TEXT CHARACTER= 21 TAXON=131 TEXT='If the short oral projections visible in Fig.
4D are not taphonomic artefacts, they are most likely to correspond to Zone II elements.'; + TEXT CHARACTER= 21 TAXON=157 TEXT='We score the ovate plates as equivalent to Zone I armature, following @Kihm2023'; + TEXT CHARACTER= 21 TAXON=158 TEXT='We score the ovate plates as equivalent to Zone I armature, following @Kihm2023'; + TEXT CHARACTER= 22 TAXON=9 TEXT='Presumably yes; the three circlets comprise 8+8+9 sclerites [@Dong2010]'; + TEXT CHARACTER= 22 TAXON=16 TEXT='Ambiguous: Defined by first two rows. 12 rows of 9 scalids each offset to produce 18 rows [@Shao2020]'; + TEXT CHARACTER= 22 TAXON=38 TEXT='Only three circlets present^n'; + TEXT CHARACTER= 22 TAXON=44 TEXT='25 rows less regimented than in other priapulans [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 22 TAXON=97 TEXT='Single ring of elements. (See discussion of introvert for identity of zonal elements.)'; + TEXT CHARACTER= 22 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 22 TAXON=119 TEXT='Defined by first circlet only [@Yang2020]'; + TEXT CHARACTER= 23 TAXON=40 TEXT='Not figured in sufficient detail [@Kulikov1998rjn]'; + TEXT CHARACTER= 23 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 23 TAXON=157 TEXT='Seemingly directed as in Omnidens [@Vinther2016]'; + TEXT CHARACTER= 23 TAXON=158 TEXT='Apices directed away from mouth, thus posteriad [@Li2024]'; + TEXT CHARACTER= 24 TAXON=9 TEXT='Three [@Dong2010]'; + TEXT CHARACTER= 24 TAXON=39 TEXT='Two in adults [@Poinar2001]'; + TEXT CHARACTER= 24 TAXON=41 TEXT='Odontia and accessory buccal structures [@Leduc2016n] treated as separate circlets in close proximity'; + TEXT CHARACTER= 24 TAXON=97 TEXT='Seemingly a single circlet of oral teeth [@Howard2020]'; + TEXT CHARACTER= 24 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 24 TAXON=120 TEXT='Five circlets [@ThisStudy]'; + TEXT CHARACTER= 24 TAXON=121 TEXT='Multiple circlets most obvious in @Vannier2017'; + TEXT CHARACTER= 25 
TAXON=103 TEXT='Reported as two transverse bands by @Wills1998, without evidence; this is not evident in figured material [@Schram1973; @ConwayMorris1977], so is scored as ambiguous'; + TEXT CHARACTER= 25 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 26 TAXON=38 TEXT='Only three rows; hence conservatively coded as ambiguous'; + TEXT CHARACTER= 26 TAXON=98 TEXT='Chaotically scattered [@Ma2014; @Yang2021]'; + TEXT CHARACTER= 26 TAXON=103 TEXT='Prominent rows [@ConwayMorris1977]'; + TEXT CHARACTER= 26 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 26 TAXON=120 TEXT='Parallel longitudinal rows [@ThisStudy]'; + TEXT CHARACTER= 26 TAXON=121 TEXT='Forming quincunx, possibly with gap between anterior and posterior region [@Vannier2017]'; + TEXT CHARACTER= 27 TAXON=16 TEXT='Parallel rows [@Liu2014]'; + TEXT CHARACTER= 27 TAXON=18 TEXT='Parallel rows [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 27 TAXON=38 TEXT='Only three rows; hence conservatively coded as ambiguous'; + TEXT CHARACTER= 27 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 27 TAXON=122 TEXT='Seemingly longitudinal [@Shi2022], but difficult to be certain '; + TEXT CHARACTER= 27 TAXON=123 TEXT='Quincunx [@Han2007pr]'; + TEXT CHARACTER= 28 TAXON=8 TEXT='GSC 45331'; + TEXT CHARACTER= 28 TAXON=42 TEXT='Continuous to the base of the ''pricks'' [@Riemann1972], which we interpret as Zone II elements'; + TEXT CHARACTER= 28 TAXON=97 TEXT='Position of Zone II unclear.'; + TEXT CHARACTER= 28 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 29 TAXON=16 TEXT='Hollow cuticular spines [@Liu2014]'; + TEXT CHARACTER= 29 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 30 TAXON=38 TEXT='Stated as solid, but likely hollow as in Nectonema.'; + TEXT CHARACTER= 30 TAXON=39 TEXT='Hollow [@SchmidtRhaesa1996]'; + TEXT CHARACTER= 30 TAXON=41 TEXT='Central cavity evident in odontia [@Leduc2016n]'; + TEXT
CHARACTER= 30 TAXON=97 TEXT='Preservation suggests hollow'; + TEXT CHARACTER= 30 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 30 TAXON=121 TEXT='Preservation suggests a central cavity [@Vannier2017]'; + TEXT CHARACTER= 31 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 32 TAXON=7 TEXT='Too short to evaluate'; + TEXT CHARACTER= 32 TAXON=8 TEXT='Simple cones (GSC 45331)'; + TEXT CHARACTER= 32 TAXON=24 TEXT='No innate curvature evident, though flexible [@Fujimoto2020mb]'; + TEXT CHARACTER= 32 TAXON=97 TEXT='Slightly curved posteriad [@Howard2020]'; + TEXT CHARACTER= 32 TAXON=103 TEXT='"Apparently simple cones" [@ConwayMorris1977]'; + TEXT CHARACTER= 32 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 33 TAXON=28 TEXT='No evidence of bifurcation [@Rucci2020z]'; + TEXT CHARACTER= 33 TAXON=38 TEXT='Bifurcation of single ventral sclerite'; + TEXT CHARACTER= 33 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 34 TAXON=8 TEXT='Cannot rule out presence of fine denticles'; + TEXT CHARACTER= 34 TAXON=33 TEXT='Basal component with pectinate fringe [@BauerNebelsick1995]'; + TEXT CHARACTER= 34 TAXON=41 TEXT='Accessory elements bear knobs [@Leduc2016n]'; + TEXT CHARACTER= 34 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 35 TAXON=33 TEXT='Bipartite, with a broad base and an elongate tip, but not obviously articulated [@BauerNebelsick1995]'; + TEXT CHARACTER= 35 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 36 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 37 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 38 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 39 TAXON=28 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 39 TAXON=29 TEXT='Presence in genera reported by @Herranz2021z'; + TEXT CHARACTER= 39 TAXON=30 TEXT='No data
available [@Herranz2021z]'; + TEXT CHARACTER= 39 TAXON=31 TEXT='Presence in genera reported by @Herranz2021z'; + TEXT CHARACTER= 39 TAXON=32 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 39 TAXON=33 TEXT='Presence in genera reported by @Herranz2021z'; + TEXT CHARACTER= 39 TAXON=34 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 39 TAXON=35 TEXT='Present [@Herranz2021z]'; + TEXT CHARACTER= 39 TAXON=36 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 39 TAXON=37 TEXT='Absent; within Allomalorhagida, intrinsic muscles in the primary spinoscalids are only present in Dracoderes [@Herranz2021z]'; + TEXT CHARACTER= 40 TAXON=12 TEXT='Alternating rows of 12 sclerites [@Liu2019]'; + TEXT CHARACTER= 40 TAXON=13 TEXT='11 introvert rows; see media [@ThisStudy]'; + TEXT CHARACTER= 40 TAXON=16 TEXT='18 rows [@Shao2016]'; + TEXT CHARACTER= 40 TAXON=24 TEXT='Thirty elements per row [@Fujimoto2020mb]'; + TEXT CHARACTER= 40 TAXON=92 TEXT='Ten buccal lamellae [@Michalczyk2003], but these do not necessarily correspond to the symmetry of the introvert.'; + TEXT CHARACTER= 40 TAXON=103 TEXT='At least twenty, and possibly twenty five, rows – but exact number uncertain [@ConwayMorris1977]'; + TEXT CHARACTER= 40 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 40 TAXON=123 TEXT='Likely pentaradial: four or five scalids in each ring, offset to produce 8-10 longitudinal rows [@Han2007]'; + TEXT CHARACTER= 41 TAXON=9 TEXT='8+8+9 [@Dong2010]'; + TEXT CHARACTER= 41 TAXON=95 TEXT='Ten rows visible [@Maas2007ppp]; total could conceivably be 18, 19, or 20.'; + TEXT CHARACTER= 41 TAXON=111 TEXT='Possibly around 25 elements [@Hu2008], but preservation too poor to confirm'; + TEXT CHARACTER= 41 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 42 TAXON=12 TEXT='Alternating rows of 12 sclerites [@Liu2019]'; + TEXT CHARACTER= 42 TAXON=16 TEXT='18 [@Shao2016]'; + TEXT CHARACTER= 42 TAXON=38 TEXT='Six teeth per row'; + TEXT CHARACTER= 42
TAXON=41 TEXT='6 + 6'; + TEXT CHARACTER= 42 TAXON=80 TEXT='Six, defined by oral papillae [@Dewel2006]'; + TEXT CHARACTER= 42 TAXON=91 TEXT='@Kristensen1982'; + TEXT CHARACTER= 42 TAXON=92 TEXT='Six buccal lamellae [@Kihm2023]'; + TEXT CHARACTER= 42 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 42 TAXON=123 TEXT='Likely pentaradial: four or five scalids in each ring, offset to produce 8-10 longitudinal rows [@Han2007pr]'; + TEXT CHARACTER= 43 TAXON=40 TEXT='Large solid onchium, usually ''bent about its mid-length'' [@Inglis1969bbmnh]'; + TEXT CHARACTER= 43 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 44 TAXON=38 TEXT='The "buccal cavity" corresponds to the inverted introvert, rather than a separate chamber [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 44 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 44 TAXON=131 TEXT='Possibly represented by the ''buccal tube'' of @Strausfeld2022'; + TEXT CHARACTER= 44 TAXON=138 TEXT='Ambiguous: the apparently internal position of the circumoral plates could denote post mortem retraction of the pharyngeal apparatus, as observed in tardigrades [@Kihm2023].^n'; + TEXT CHARACTER= 45 TAXON=24 TEXT='No annulations evident [@Fujimoto2020mb]'; + TEXT CHARACTER= 45 TAXON=41 TEXT='Not evident in light micrographs [@Leduc2016n]'; + TEXT CHARACTER= 45 TAXON=43 TEXT='Annulations below bifurcated lobes in buccal cavity [@Borgonie1995fan]'; + TEXT CHARACTER= 45 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 46 TAXON=7 TEXT='Never preserved everted [@ConwayMorris1977]; consistent position, extent and shape in NMNH198605, NMNH198597, ROM93-1678'; + TEXT CHARACTER= 46 TAXON=8 TEXT='Partly everted in GSC 45331'; + TEXT CHARACTER= 46 TAXON=11 TEXT='Inferred from bulb-like shape [@Zhang2022]'; + TEXT CHARACTER= 46 TAXON=38 TEXT='The stylet is treated as an eversible pharynx'; + TEXT CHARACTER= 46 TAXON=39 TEXT='The stylet is
treated as an eversible pharynx'; + TEXT CHARACTER= 46 TAXON=97 TEXT='No indication of eversibility [@Howard2020]'; + TEXT CHARACTER= 46 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 46 TAXON=126 TEXT='NMNH83942a exhibits a small pyramidal extension of the pharynx; NMNH57655 displays an everted pharynx, narrower than the introvert and conceivably tipped with triangular teeth, evident in reflected light but obscured by a dark stain in polarized light [@Whittington1975, figs 10–11]. Further investigation is necessary to establish the nature of this structure.'; + TEXT CHARACTER= 46 TAXON=156 TEXT='Anterior position in certain specimens is attributed to post-mortem processes [@Park2018]'; + TEXT CHARACTER= 46 TAXON=157 TEXT='Oral cone eversible, but pharynx is not [@Vinther2016]'; + TEXT CHARACTER= 47 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 48 TAXON=28 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=29 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=30 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=31 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=32 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=33 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=34 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=35 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=36 TEXT='The kinorhynch introvert is a locomotory and sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=37 TEXT='The kinorhynch introvert is a locomotory and
sensory organ [@Herranz2021z]'; + TEXT CHARACTER= 48 TAXON=42 TEXT='Employed in locomotion [@Reiman1972]'; + TEXT CHARACTER= 48 TAXON=95 TEXT='Interpreted as locomotory introvert [@Maas2007ppp]'; + TEXT CHARACTER= 48 TAXON=98 TEXT='Introvert interpreted as locomotory [@Ma2014]'; + TEXT CHARACTER= 48 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 48 TAXON=129 TEXT='Locomotion presumably employed the appendages'; + TEXT CHARACTER= 49 TAXON=44 TEXT='Fully everted pharynx not observed in the six available specimens [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 49 TAXON=95 TEXT='Substantial eversion evident [@Maas2007ppp]'; + TEXT CHARACTER= 49 TAXON=98 TEXT='Not everted beyond proximal teeth in any known specimen [@Ma2014; @Yang2021]'; + TEXT CHARACTER= 49 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 50 TAXON=22 TEXT='Elongate mouth cone possibly an apomorphy of this species [@Higgins1986scz]'; + TEXT CHARACTER= 50 TAXON=24 TEXT='Elongate mouth tube [@Fujimoto2020mb]'; + TEXT CHARACTER= 50 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 50 TAXON=121 TEXT='Neither'; + TEXT CHARACTER= 51 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 52 TAXON=33 TEXT='Round [@Neuhaus2002icb]'; + TEXT CHARACTER= 52 TAXON=38 TEXT='Triradial oesophagus [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 52 TAXON=69 TEXT='Triradiate in A. doryphorus [@EibyeJacobsen2001jzser]'; + TEXT CHARACTER= 52 TAXON=73 TEXT='Triradiate in E. 
viridissimus [@EibyeJacobsen2001jzser]'; + TEXT CHARACTER= 52 TAXON=80 TEXT='Triradiate [@EibyeJacobsen2001jzser]'; + TEXT CHARACTER= 52 TAXON=91 TEXT='Triradiate [@EibyeJacobsen2001jzser]'; + TEXT CHARACTER= 52 TAXON=97 TEXT='Three prominent robust elements'; + TEXT CHARACTER= 52 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 53 TAXON=8 TEXT='Circumpharyngeal spines evident in GSC 45331, left and right of partly everted pharynx'; + TEXT CHARACTER= 53 TAXON=11 TEXT='Associated region not preserved [@Zhang2022]'; + TEXT CHARACTER= 53 TAXON=15 TEXT='The ‘wrinkles’ at the base of the Eokinorhynchus pharynx [@Zhang2015, fig. 1f] seem to be cuticular structures rather than spines lying flat against the pharynx surface'; + TEXT CHARACTER= 53 TAXON=20 TEXT='The base of the mouth cone is marked by a ring of pleats that intriguingly resemble circumoral plates [@Heiner2007hmr], but in fact represent ridges of the cuticle [@Neves2016za].'; + TEXT CHARACTER= 53 TAXON=28 TEXT='Spinose processes (cf. those in Cateria?) occur just inside the primary spinoscalids [@Rucci2020z]'; + TEXT CHARACTER= 53 TAXON=34 TEXT='Some specimens of Cateria exhibit a ring of cuticular spines anterior to the primary spinoscalids; these spines are sometimes joined by a sheet of cuticle, becoming distinct only distally [@Neuhaus2015z, fig. 6A, 10E, 13G]. These elements are indistinct and poorly known; if they represent Zone II elements this may prompt the primary spinoscalids to be reconsidered as elements of Zone I. 
We treat the primary spinoscalids as Zone II elements here, leaving the nature of the cuticular spine-sheet open.'; + TEXT CHARACTER= 53 TAXON=42 TEXT='The twelve pricks [@Reiman1972] are interpreted as Zone II elements based on their position and morphology'; + TEXT CHARACTER= 53 TAXON=44 TEXT='Not obvious in SEM or µCT images [@SchmidtRhaesa2022za], but absence difficult to determine'; + TEXT CHARACTER= 53 TAXON=94 TEXT='The velum is considered to represent fused lamellae [@Guidetti2012]'; + TEXT CHARACTER= 53 TAXON=95 TEXT='Spines [@Maas2007ppp, fig 7a]'; + TEXT CHARACTER= 53 TAXON=98 TEXT='Elongate spines [@Yang2021]'; + TEXT CHARACTER= 53 TAXON=103 TEXT='Peribuccal collar preserved [@ConwayMorris1977]'; + TEXT CHARACTER= 53 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 53 TAXON=121 TEXT='Faint ring of elongate circumoral spines [@Maas2007ppp; @Vannier2017]'; + TEXT CHARACTER= 53 TAXON=122 TEXT='Seemingly absent. The "collar spines" of @Shi2022 are taphonomic features reflecting flaking of the cuticle; they do not have a consistent shape and do not recur around the pharynx. The margins of the pharynx are smooth, and thus prominently unarmed. The possible presence of coronal spines is harder to discount with certainty, but we see no candidates; spines if present must be diminutive. We score as absent.'; + TEXT CHARACTER= 53 TAXON=123 TEXT='Anterior scalids are distinct from others on the introvert and can point anteriad or posteriad [@Han2007pr], and are likely coronal spines – but better material is required for confident designation.'; + TEXT CHARACTER= 53 TAXON=126 TEXT='Mouth surrounded by six slim papillae [@Whittington1975]'; + TEXT CHARACTER= 53 TAXON=129 TEXT='Armature not preserved, but impossible to rule out absence, particularly given the elusive nature of equivalent structures in e.g. 
Hallucigenia [@Smith2015].'; + TEXT CHARACTER= 53 TAXON=131 TEXT='Potentially represented by the radial structures that surround the mouth in @Liu2014, fig. 4D'; + TEXT CHARACTER= 53 TAXON=132 TEXT='The cuticular ring reported in the head of Microdictyon [@Liu2014ppp] requires detailed study before its interpretation can be considered secure. '; + TEXT CHARACTER= 53 TAXON=141 TEXT='Preservation inadequate to evaluate [@Howard2020]'; + TEXT CHARACTER= 53 TAXON=143 TEXT='Detailed arrangement of tooth-like structures compatible with arrangement in Hallucigenia [@Smith2015], but inadequately preserved to evaluate [@Caron2017]. Coded as ambiguous.'; + TEXT CHARACTER= 53 TAXON=148 TEXT='Antennacanthopodia [@Ou2011] is coded as ambiguous as there is no direct evidence for the location of the mouth.'; + TEXT CHARACTER= 53 TAXON=152 TEXT='Uncertain, as oral surface is incompletely preserved [@Smith2023n], and it is possible that such structures would become more prominent in adults'; + TEXT CHARACTER= 53 TAXON=153 TEXT='Present [@Vannier2014, supplementary figure 6]'; + TEXT CHARACTER= 53 TAXON=163 TEXT='Radial structures around the mouth drawn by @Whittington1975 are interpreted by @Dhungana2021 as circumoral plates.'; + TEXT CHARACTER= 53 TAXON=166 TEXT='Smooth and tuberculate plates are interpreted as elements of an Anomalocaris-like oral cone [@Cong2017]'; + TEXT CHARACTER= 53 TAXON=173 TEXT='Not described in original reports [@Cong2014; @Cong2016; @Cong2017], but documented in a juvenile by @Liu2018nsr, who propose that the absence in larger specimens is taphonomic.'; + TEXT CHARACTER= 54 TAXON=7 TEXT='Erect triangular spines [@ConwayMorris1977]'; + TEXT CHARACTER= 54 TAXON=20 TEXT='We interpret the plicae as erect spines, as in cases they seem to .'; + TEXT CHARACTER= 54 TAXON=55 TEXT='We code tardigrades as having a small contact area as the peribuccal lamellae are only basally attached to the body [e.g. 
@Guidetti2013, figure 3B]'; + TEXT CHARACTER= 54 TAXON=97 TEXT='Semi-erect plates [@Howard2020, RCCBYU 10233]'; + TEXT CHARACTER= 54 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 54 TAXON=126 TEXT='Erect [@Whittington1975]'; + TEXT CHARACTER= 54 TAXON=153 TEXT='Due to the limited material we code this as ambiguous'; + TEXT CHARACTER= 54 TAXON=157 TEXT='Ambiguous. Although @Vinther2016 (e.g. Figure 3D) reconstruct only the basal parts of the ''triangular plates'' as in contact with the body, comparison with Omnidens suggests a more complete attachment.'; + TEXT CHARACTER= 54 TAXON=158 TEXT='The flat surfaces of the plate are interpreted as in contact with the body, with the inner spines protruding'; + TEXT CHARACTER= 55 TAXON=8 TEXT='Presumably continuous ring but only evident at sides (GSC 45331)'; + TEXT CHARACTER= 55 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 55 TAXON=154 TEXT='Traditionally interpreted as radial. @Li2024 suggest the possibility of a biserial configuration, but @Vannier2014 clearly show both Zone II and Zone III elements occurring along the midline of the specimen, seemingly corroborating a radial configuration.'; + TEXT CHARACTER= 55 TAXON=157 TEXT='@Li2024'; + TEXT CHARACTER= 55 TAXON=159 TEXT='Traditionally interpreted as radial, but plausibly bilateral [@Li2024]'; + TEXT CHARACTER= 55 TAXON=166 TEXT='Traditionally interpreted as radial, but plausibly bilateral [@Li2024]'; + TEXT CHARACTER= 55 TAXON=173 TEXT='Traditionally interpreted as radial, but plausibly bilateral [@Li2024]'; + TEXT CHARACTER= 56 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 56 TAXON=158 TEXT='Some size differences but no prominent differentiation [@Li2024]'; + TEXT CHARACTER= 56 TAXON=159 TEXT='Differentiated, following @Budd2021'; + TEXT CHARACTER= 56 TAXON=163 TEXT='Undifferentiated elongate plates [@Dhungana2021]'; + TEXT CHARACTER= 56 TAXON=166 TEXT='Three or possibly four 
tuberculate plates [@Cong2017]'; + TEXT CHARACTER= 56 TAXON=168 TEXT='@Moysiuk2019'; + TEXT CHARACTER= 56 TAXON=173 TEXT='Four enlarged plates [@Liu2018nsr]'; + TEXT CHARACTER= 56 TAXON=174 TEXT='Figure 1H of @Kuhl2009 has no indication that any of the plates are enlarged, therefore we code Schinderhannes as having undifferentiated circumoral sclerites. The reconstruction of @Kuhl2009 implicitly implies a lack of differentiation'; + TEXT CHARACTER= 57 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 58 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 59 TAXON=92 TEXT='On inner face only [@Michalczyk2003]'; + TEXT CHARACTER= 59 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 59 TAXON=166 TEXT='Tubercles [@Cong2017]'; + TEXT CHARACTER= 59 TAXON=168 TEXT='Not evident or interpreted as present by @Moysiuk2019'; + TEXT CHARACTER= 59 TAXON=173 TEXT='Present [@Liu2018nsr]'; + TEXT CHARACTER= 60 TAXON=78 TEXT='Only present in parachelan species [@Guidetti2012]'; + TEXT CHARACTER= 60 TAXON=79 TEXT='Only present in parachelan species [@Guidetti2012]'; + TEXT CHARACTER= 60 TAXON=80 TEXT='Absent [@Dewel2006]'; + TEXT CHARACTER= 60 TAXON=81 TEXT='In Bertolanius volubilis (Eohypsibiidae) [@Guidetti2015]'; + TEXT CHARACTER= 60 TAXON=92 TEXT='On inner face only [@Michalczyk2003]'; + TEXT CHARACTER= 60 TAXON=93 TEXT='The ''anterior tooth row'' of @Kihm20203, fig. 
1F'; + TEXT CHARACTER= 60 TAXON=94 TEXT='Corresponding to anterior band of buccal armature [@Guidetti2012]'; + TEXT CHARACTER= 60 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 61 TAXON=97 TEXT='Strong three-dimensional relief [@Howard2020] implies robust original construction '; + TEXT CHARACTER= 61 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 62 TAXON=16 TEXT='''Possibly'' twelve [@Shao2020; @Liu2014]'; + TEXT CHARACTER= 62 TAXON=42 TEXT='Six pairs [@Reiman1972]'; + TEXT CHARACTER= 62 TAXON=80 TEXT='Four [@Dewel2006]'; + TEXT CHARACTER= 62 TAXON=81 TEXT='In Bertolanius volubilis (Eohypsibiidae) [@Guidetti2015]'; + TEXT CHARACTER= 62 TAXON=92 TEXT='Ten [@Guidetti2012]'; + TEXT CHARACTER= 62 TAXON=93 TEXT='Ten [@Guidetti2012]'; + TEXT CHARACTER= 62 TAXON=94 TEXT='Single fused element? [@Guidetti2012]'; + TEXT CHARACTER= 62 TAXON=97 TEXT='Three visible in lateral view, indicating six in original circlet'; + TEXT CHARACTER= 62 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 63 TAXON=8 TEXT='Elongate spines (GSC 45331)'; + TEXT CHARACTER= 63 TAXON=16 TEXT='Only bases preserved [@Liu2014]'; + TEXT CHARACTER= 63 TAXON=95 TEXT='Just visible in @Maas2007ppp, figs 3b, 7a'; + TEXT CHARACTER= 63 TAXON=98 TEXT='Elongate [@Yang2021]'; + TEXT CHARACTER= 63 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 63 TAXON=126 TEXT='Around five times longer than wide [@Whittington1975]'; + TEXT CHARACTER= 64 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 65 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 65 TAXON=163 TEXT='Single projection [@Dhungana2021]'; + TEXT CHARACTER= 66 TAXON=33 TEXT='Secondary setae and pectinate projections [@BauerNebelsick1995]'; + TEXT CHARACTER= 66 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 66 TAXON=158 TEXT='Accessory spines on certain plates [@Li2024]'; 
+ TEXT CHARACTER= 66 TAXON=163 TEXT='Single projection [@Dhungana2021]'; + TEXT CHARACTER= 66 TAXON=166 TEXT='@Cong2017 identify two spinose projections on one tuberculate plate (fig. 8e), but only one is unambiguously evident. We thus code this character as ambiguous.'; + TEXT CHARACTER= 66 TAXON=168 TEXT='Multiple spines [@Moysiuk2019]'; + TEXT CHARACTER= 66 TAXON=173 TEXT='Multiple spines [@Liu2018nsr]'; + TEXT CHARACTER= 67 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 68 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 68 TAXON=122 TEXT='Narrower [@Shi2022]'; + TEXT CHARACTER= 69 TAXON=15 TEXT='Short gap of wrinkled cuticle [@Zhang2015, fig. 1]'; + TEXT CHARACTER= 69 TAXON=16 TEXT='Negligible gap [@Liu2014]'; + TEXT CHARACTER= 69 TAXON=38 TEXT='Unarmed region present [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 69 TAXON=44 TEXT='Seemingly a gap based on µCT data [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 69 TAXON=50 TEXT='Gap [@Schmidt2017, fig 2A]'; + TEXT CHARACTER= 69 TAXON=92 TEXT='Gap [@Michalczyk2003]'; + TEXT CHARACTER= 69 TAXON=93 TEXT='Gap, best seen towards bottom of @Kihm20203, fig. 
1F'; + TEXT CHARACTER= 69 TAXON=94 TEXT='Without prominent gap [@Guidetti2012]'; + TEXT CHARACTER= 69 TAXON=95 TEXT='No gap [@Maas2007ppp]'; + TEXT CHARACTER= 69 TAXON=98 TEXT='Gap [@Ma2014, fig 5.4]'; + TEXT CHARACTER= 69 TAXON=111 TEXT='Apparent teeth gap [@Hu2008]'; + TEXT CHARACTER= 69 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 69 TAXON=120 TEXT='Prominent gap [@ThisStudy]'; + TEXT CHARACTER= 69 TAXON=121 TEXT='Gap minimal, if it exists at all [@Maas2007ppp; @Vannier2017]'; + TEXT CHARACTER= 69 TAXON=122 TEXT='Prominent teeth gap (see notes on Zone II sclerites)'; + TEXT CHARACTER= 69 TAXON=123 TEXT='Unarmed ''collar'' [@Han2007pr]'; + TEXT CHARACTER= 69 TAXON=154 TEXT='No gap [@Liu2006]'; + TEXT CHARACTER= 69 TAXON=157 TEXT='Directly adjacent [@Vinther2016]'; + TEXT CHARACTER= 69 TAXON=158 TEXT='No separation [@Li2024; @Li2025]'; + TEXT CHARACTER= 70 TAXON=15 TEXT='The wrinkles [@Zhang2015, fig. 1] are not dissimilar to the pleats of certain loriciferans, so are interpreted as denoting cuticular reinforcement'; + TEXT CHARACTER= 70 TAXON=18 TEXT='Cuticularized bars occur on the proximal mouth cone, preceding each oral furca [@Neves2021po].'; + TEXT CHARACTER= 70 TAXON=22 TEXT='The flexible cuticle of the base of the mouth cone is divided into eight plates [@Gad2005za]'; + TEXT CHARACTER= 70 TAXON=24 TEXT='Well-developed ruff: "a cuticular ring with fibres arising from eight points" [@Fujimoto2020mb]'; + TEXT CHARACTER= 70 TAXON=34 TEXT='Conceivably represented by the cuticular sheath [@Neuhaus2015z], whose ridges are akin to those observed in the basal ring of loriciferans.'; + TEXT CHARACTER= 70 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 71 TAXON=18 TEXT='Eight in Nanaloricidae [@Neves2016za]'; + TEXT CHARACTER= 71 TAXON=19 TEXT='Eight in Nanaloricidae [@Neves2016za]'; + TEXT CHARACTER= 71 TAXON=20 TEXT='Eight in Nanaloricidae [@Neves2016za]'; + TEXT CHARACTER= 71 TAXON=24 TEXT='Eight 
[@Fujimoto2020mb]'; + TEXT CHARACTER= 71 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 71 TAXON=120 TEXT='Cuticular folds akin to oral ridges [@ThisStudy] are likely taphonomic. Coded ambiguous.'; + TEXT CHARACTER= 72 TAXON=18 TEXT='Eight in Nanaloricidae [@Neves2016za]'; + TEXT CHARACTER= 72 TAXON=19 TEXT='Eight in Nanaloricidae [@Neves2016za]'; + TEXT CHARACTER= 72 TAXON=20 TEXT='Eight [@Heiner2007hmr]'; + TEXT CHARACTER= 72 TAXON=24 TEXT='Eight [@Fujimoto2020mb]'; + TEXT CHARACTER= 72 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 73 TAXON=18 TEXT='Present in Nanaloricidae only [@Neves2016za]'; + TEXT CHARACTER= 73 TAXON=19 TEXT='Present in Nanaloricidae only [@Neves2016za]'; + TEXT CHARACTER= 73 TAXON=20 TEXT='No sclerotized furcae [@Heiner2007hmr]'; + TEXT CHARACTER= 73 TAXON=21 TEXT='Present in Nanaloricidae only [@Neves2016za]'; + TEXT CHARACTER= 73 TAXON=22 TEXT='Present in Nanaloricidae only [@Neves2016za]'; + TEXT CHARACTER= 73 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 74 TAXON=20 TEXT='Two different lengths [@Heiner2007hmr]'; + TEXT CHARACTER= 74 TAXON=24 TEXT='Anterior tips of ridges are not attached to mouth cone [@Fujimoto2020mb]'; + TEXT CHARACTER= 74 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 75 TAXON=80 TEXT='Fenestrated cuticle [@EibyeJacobsen2001za, fig. 12; @Dewel2006, fig. 
11]'; + TEXT CHARACTER= 75 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 76 TAXON=7 TEXT='No armature visible [@ConwayMorris1977], perhaps due to non-eversion of pharynx'; + TEXT CHARACTER= 76 TAXON=8 TEXT='Carbon-rich preservation in everted component of pharynx in GSC 45331 suggests armature; inverted component seemingly armed too, but difficult to determine with confidence'; + TEXT CHARACTER= 76 TAXON=11 TEXT='Present on inverted pharynx [@Zhang2022]^n'; + TEXT CHARACTER= 76 TAXON=22 TEXT='The distal mouth cone contains four oral stylets; three rows of placoids adorn the pharyngeal bulb [@Gad2005za]'; + TEXT CHARACTER= 76 TAXON=24 TEXT='Internal armature present in adults; presence of stylets equivocal [@Fujimoto2020mb]'; + TEXT CHARACTER= 76 TAXON=41 TEXT='Unarmed [@Keppner1988tams; @Leduc2016n]'; + TEXT CHARACTER= 76 TAXON=42 TEXT='Not visible in drawings of @Reiman1972, but present in close relative Onchulus [@Swart1993]'; + TEXT CHARACTER= 76 TAXON=43 TEXT='Coded as unarmed.
The teeth and denticles within the buccal cavity [@Borgonie1995] are outgrowths of three plates, one of which corresponds to the dorsal tooth; hence these seem not to represent equivalents of the Zone III pharyngeal teeth.'; + TEXT CHARACTER= 76 TAXON=73 TEXT='@Dewel2006'; + TEXT CHARACTER= 76 TAXON=80 TEXT='@Dewel2006'; + TEXT CHARACTER= 76 TAXON=93 TEXT='Posterior band of small teeth [@Guidetti2012]'; + TEXT CHARACTER= 76 TAXON=94 TEXT='Posterior band of small teeth [@Guidetti2012]'; + TEXT CHARACTER= 76 TAXON=103 TEXT='No armature preserved [@ConwayMorris1977]'; + TEXT CHARACTER= 76 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 76 TAXON=126 TEXT='Not displayed, but preservation of figured material [@Whittington1975] inadequate to exclude internal pharyngeal structures'; + TEXT CHARACTER= 76 TAXON=129 TEXT='Impossible to rule out the presence of pharyngeal teeth based on available material'; + TEXT CHARACTER= 76 TAXON=131 TEXT='Not possible to rule out the presence of hallucigeniid-like aciculae based on available material'; + TEXT CHARACTER= 76 TAXON=143 TEXT='Present [@Caron2017]'; + TEXT CHARACTER= 76 TAXON=157 TEXT='Present [@Vinther2016]'; + TEXT CHARACTER= 76 TAXON=166 TEXT='Preservation insufficient to evaluate [@Cong2017]'; + TEXT CHARACTER= 76 TAXON=168 TEXT='Present [@Moysiuk2019]'; + TEXT CHARACTER= 77 TAXON=40 TEXT='Most denticles are expressed as individual cusps expressed on outgrowths of the pharynx [@Kulikov1998rjn]'; + TEXT CHARACTER= 77 TAXON=44 TEXT='Multiple cusps certainly in distal teeth, if possibly not in proximal teeth [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 77 TAXON=110 TEXT='Seemingly simple scalids [@Hu2012]'; + TEXT CHARACTER= 77 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 77 TAXON=119 TEXT='Seemingly multicuspate [@Yang2020], cf. 
Selkirkia'; + TEXT CHARACTER= 77 TAXON=143 TEXT='Short spines [@Caron2017]'; + TEXT CHARACTER= 77 TAXON=153 TEXT='From @Vannier2014 supplementary figure 6c, the pharyngeal teeth appear multicuspate, although only few are preserved well.'; + TEXT CHARACTER= 77 TAXON=157 TEXT='Multiple cusps inferred based on similarity to Omnidens [@Vinther2016]'; + TEXT CHARACTER= 77 TAXON=168 TEXT='Multiple cusps [@Moysiuk2019]'; + TEXT CHARACTER= 78 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 79 TAXON=15 TEXT='Hollow elements Short gap of wrinkled cuticle [@Zhang2015, fig. 1]'; + TEXT CHARACTER= 79 TAXON=38 TEXT='Small cavity present in Paragordius [@Jochmann2007]'; + TEXT CHARACTER= 79 TAXON=97 TEXT='Seemingly hollow [@Howard2022, fig. 1c]'; + TEXT CHARACTER= 79 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 79 TAXON=143 TEXT='No evidence of void, but preservation consistent with central cavity [@Caron2017]'; + TEXT CHARACTER= 79 TAXON=158 TEXT='Cavity seems likely based on sediment-like infilling [@Li2024]'; + TEXT CHARACTER= 80 TAXON=38 TEXT='Two bilateral series, with diminutive third on lateral extension'; + TEXT CHARACTER= 80 TAXON=40 TEXT='Four approximate series [@Inglis1999bbmnh]'; + TEXT CHARACTER= 80 TAXON=91 TEXT='@Kristensen1982'; + TEXT CHARACTER= 80 TAXON=92 TEXT='Strong bilateral symmetry, particularly in row III, with a gap between bilateral series [@Michalczyk2003]'; + TEXT CHARACTER= 80 TAXON=93 TEXT='Not prominent in ''Row II'', but clearly present in ''Row III''; @Kihm20203, fig.
1F'; + TEXT CHARACTER= 80 TAXON=97 TEXT='Triradial disposition [@Howard2020]'; + TEXT CHARACTER= 80 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 80 TAXON=143 TEXT='Disordered [@Caron2017]'; + TEXT CHARACTER= 81 TAXON=97 TEXT='Three series'; + TEXT CHARACTER= 81 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 81 TAXON=143 TEXT='Distribution unclear; possibly uniform around pharynx [@Caron2017], but coded as ambiguous'; + TEXT CHARACTER= 81 TAXON=157 TEXT='Uniformly distributed [@Vinther2016]'; + TEXT CHARACTER= 81 TAXON=168 TEXT='Four series [@Moysiuk2019]'; + TEXT CHARACTER= 82 TAXON=15 TEXT='At least two [@Zhang2015]'; + TEXT CHARACTER= 82 TAXON=16 TEXT='More than one'; + TEXT CHARACTER= 82 TAXON=20 TEXT='Six oral stylets [@Neves2016zab]'; + TEXT CHARACTER= 82 TAXON=22 TEXT='Oral stylets + three rows of placoids in bulb [@Gad2005za]'; + TEXT CHARACTER= 82 TAXON=40 TEXT='Four circlets identified [@Inglis1969bbmnh]'; + TEXT CHARACTER= 82 TAXON=81 TEXT='Three in Band II, plus transverse crests (= Band III), in Bertolanius volubilis (Eohypsibiidae) [@Guidetti2015]'; + TEXT CHARACTER= 82 TAXON=92 TEXT='Three circlets (?) in Band II; one in Band III [@Michalczyk2003, fig 57a]'; + TEXT CHARACTER= 82 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 82 TAXON=158 TEXT='State ''four to six'' to denote limited number of rows, potentially variable based on specimen size, in contrast to strict number of four observed in other taxa.'; + TEXT CHARACTER= 83 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 84 TAXON=16 TEXT='Eighteen [@Liu2014]'; + TEXT CHARACTER= 84 TAXON=20 TEXT='Six oral stylets in S. 
neuhausi [@Neves2016zab]'; + TEXT CHARACTER= 84 TAXON=24 TEXT='Six in Higgins larva; undetermined in adults [@Fujimoto2020mb]'; + TEXT CHARACTER= 84 TAXON=44 TEXT='Five [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 84 TAXON=110 TEXT='Haphazard distribution [@Hu2012]'; + TEXT CHARACTER= 84 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 84 TAXON=119 TEXT='Seemingly 10 [@Yang2020]'; + TEXT CHARACTER= 84 TAXON=121 TEXT='Each circlet in CWM360 [@Maas2007ppp fig. 5B] and ELI-000-1402 [@Vannier2017, fig. 3d] contains six visible elements for a total of twelve.'; + TEXT CHARACTER= 85 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 85 TAXON=119 TEXT='Five visible on upper surface, for a total of ten [@Yang2020, fig. 2h]'; + TEXT CHARACTER= 86 TAXON=33 TEXT='Dorsal style reduced [@BauerNebelsick1995]'; + TEXT CHARACTER= 86 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 87 TAXON=108 TEXT='Inferred from seemingly quincunxial distribution [@ConwayMorris2010]'; + TEXT CHARACTER= 87 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 88 TAXON=15 TEXT='Single spine [@Zhang2015]'; + TEXT CHARACTER= 88 TAXON=38 TEXT='@Bolek2010; @Szmygiel2014'; + TEXT CHARACTER= 88 TAXON=44 TEXT='Cuspidate teeth [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 88 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 88 TAXON=119 TEXT='Broad triangle, probably with secondary elements, but no prominent central spine [@Yang2020]'; + TEXT CHARACTER= 88 TAXON=122 TEXT='Seemingly a denticulate triangular arch, resembling Selkirkia teeth [@Smith2015]'; + TEXT CHARACTER= 88 TAXON=169 TEXT='Multiple cusps [@Daley2013]'; + TEXT CHARACTER= 89 TAXON=34 TEXT='Somewhat recurved [@Neuhaus2015z, fig.
13C]'; + TEXT CHARACTER= 89 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 90 TAXON=16 TEXT='Grooved, but not seemingly producing additional spines [@Liu2014]'; + TEXT CHARACTER= 90 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 90 TAXON=143 TEXT='Simple rods or spines [@Caron2017]'; + TEXT CHARACTER= 91 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 92 TAXON=33 TEXT='Large [@BauerNebelsick1995]'; + TEXT CHARACTER= 92 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 93 TAXON=40 TEXT='Two large denticles, without smaller denticles [@Inglis1969bbmnh]'; + TEXT CHARACTER= 93 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 94 TAXON=81 TEXT='Modest ring fold in Bertolanius volubilis (Eohypsibiidae) [@Guidetti2015]'; + TEXT CHARACTER= 94 TAXON=93 TEXT='Not evident [@Kihm20203, fig. 1F]'; + TEXT CHARACTER= 94 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 95 TAXON=22 TEXT='The extended gap between the oral stylets and the placoids is interpreted as denoting the absence of the middle circlets.'; + TEXT CHARACTER= 95 TAXON=44 TEXT='We score the apparent gap between the two regions of teeth [@SchmidtRhaesa2022za] as denoting the reduction of the middle circlets'; + TEXT CHARACTER= 95 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 96 TAXON=28 TEXT='Elongate spine [@Rucci2020z]'; + TEXT CHARACTER= 96 TAXON=33 TEXT='With pectinate fringe [@BauerNebelsick1995]'; + TEXT CHARACTER= 96 TAXON=110 TEXT='Seemingly simple [@Hu2012]'; + TEXT CHARACTER= 96 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 96 TAXON=121 TEXT='Prominent central spine with additional lateral elements indicated by footprint in @Vannier2017, fig. 
3d'; + TEXT CHARACTER= 97 TAXON=18 TEXT='No placoids in '; + TEXT CHARACTER= 97 TAXON=40 TEXT='larger and basally fused [@Inglis1969bbmnh]'; + TEXT CHARACTER= 97 TAXON=81 TEXT='In Bertolanius volubilis (Eohypsibiidae) [@Guidetti2015]'; + TEXT CHARACTER= 97 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 97 TAXON=120 TEXT='No indication of variability in morphology [@ThisStudy]'; + TEXT CHARACTER= 97 TAXON=121 TEXT='Possibly larger, but no clear differentiated field'; + TEXT CHARACTER= 97 TAXON=122 TEXT='Distal teeth more conical and elongate [@Shi2022, fig. 3a]'; + TEXT CHARACTER= 97 TAXON=123 TEXT='Distal region likely not exposed in available material [@Han2007pr]'; + TEXT CHARACTER= 98 TAXON=24 TEXT='Seemingly acicular [@Fujimoto2020mb]'; + TEXT CHARACTER= 98 TAXON=81 TEXT='In Bertolanius volubilis (Eohypsibiidae) [@Guidetti2015]'; + TEXT CHARACTER= 98 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 99 TAXON=22 TEXT='Placoids of first row larger than subsequent rows [@Gad2005za]'; + TEXT CHARACTER= 99 TAXON=42 TEXT='Insufficient circlets (in Onchulus) to discriminate a dorsal region'; + TEXT CHARACTER= 99 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 100 TAXON=28 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 100 TAXON=29 TEXT='Present [@Herranz2021z]'; + TEXT CHARACTER= 100 TAXON=31 TEXT='Present [@Herranz2021z]'; + TEXT CHARACTER= 100 TAXON=33 TEXT='Present [@Herranz2021z]'; + TEXT CHARACTER= 100 TAXON=34 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 101 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 102 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 102 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 102 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 102 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 102 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 102 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT 
CHARACTER= 102 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 103 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 104 TAXON=40 TEXT='Not prominently reinforced [@Kulikov1998rjn]'; + TEXT CHARACTER= 104 TAXON=43 TEXT='The three plates in the buccal cavity [@Borgone1995] are treated as possible developments of reinforced pharyngeal cuticle'; + TEXT CHARACTER= 104 TAXON=97 TEXT='Three reinforced ridges [@Howard2020]'; + TEXT CHARACTER= 104 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 105 TAXON=89 TEXT='After @Kihm2023, noting that @Mapalo2024cb score D. macrodon as lacking a dorsal apophysis'; + TEXT CHARACTER= 105 TAXON=92 TEXT='After @Kihm2023, noting that @Mapalo2024cb score M. 
hufelandi as lacking a dorsal apophysis'; + TEXT CHARACTER= 105 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 106 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 107 TAXON=8 TEXT='Seemingly present in inverted pharynx of GSC 45331'; + TEXT CHARACTER= 107 TAXON=18 TEXT='A large pharyngeal bulb characterizes the Nanaloricidae [@Kristensen2004cbm]'; + TEXT CHARACTER= 107 TAXON=19 TEXT='A large pharyngeal bulb characterizes the Nanaloricidae [@Kristensen2004cbm]'; + TEXT CHARACTER= 107 TAXON=20 TEXT='Present [@Neves2016za]'; + TEXT CHARACTER= 107 TAXON=22 TEXT='A small pharyngeal bulb occurs within the mouth cone [@Gad2005az]'; + TEXT CHARACTER= 107 TAXON=43 TEXT='Absent [@Borgonie1995]'; + TEXT CHARACTER= 107 TAXON=95 TEXT='No clear evidence for terminal bulb [@Maas2007ppp]'; + TEXT CHARACTER= 107 TAXON=108 TEXT='No obvious evidence pertaining to the presence of this structure [@ConwayMorris2010]'; + TEXT CHARACTER= 107 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 107 TAXON=121 TEXT='Visible in CWM360 [@Maas2007ppp fig. 5b]'; + TEXT CHARACTER= 107 TAXON=131 TEXT='Depends on interpretation of pharyngeal bulb [@Strausfeld2022]'; + TEXT CHARACTER= 108 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 109 TAXON=28 TEXT='Cylindrical neck without placids [@Herranz2021z]'; + TEXT CHARACTER= 109 TAXON=34 TEXT='Cateria gerlachi does, contra @Sorensen2015, exhibit a neck with 12 placids [@Neuhaus2015z]; a neck and closing apparatus is absent in C. 
styx [@Herranz2019]'; + TEXT CHARACTER= 109 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 110 TAXON=33 TEXT='As the spinoscalids are too long to be fully withdrawn, the neck does not function as a closing apparatus [@Herranz2021z]'; + TEXT CHARACTER= 110 TAXON=34 TEXT='Cateria gerlachi does, contra @Sorensen2015, exhibit a neck with 12 placids [@Neuhaus2015z]; a neck and closing apparatus is absent in C. styx [@Herranz2019]. Nonetheless, as the spinoscalids are too long to be fully withdrawn, the neck does not function as a closing apparatus [@Herranz2021z]'; + TEXT CHARACTER= 110 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 111 TAXON=28 TEXT='Radial closing apparatus [@Herranz2021z]'; + TEXT CHARACTER= 111 TAXON=33 TEXT='Radial [@Herranz2021z]'; + TEXT CHARACTER= 111 TAXON=34 TEXT='Slight bilateral symmetry produced by narrow dorsal placid [@Neuhaus2015z], but considered radial [@Herranz2021z]'; + TEXT CHARACTER= 111 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 112 TAXON=34 TEXT='Twelve [@Neuhaus2015z]'; + TEXT CHARACTER= 112 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 113 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 114 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 116 TAXON=40 TEXT='Slit-like [@Kulikov1998rjn]'; + TEXT CHARACTER= 116 TAXON=41 TEXT='Round [@Leduc2016n]'; + TEXT CHARACTER= 116 TAXON=42 TEXT='Broad, pocket-like [@Riemann1972]'; + TEXT CHARACTER= 116 TAXON=43 TEXT='Amphids ''cup-shaped'' [@Peneva1999n]'; + TEXT CHARACTER= 117 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 117 TAXON=152 TEXT='Present [@Smith2023n]'; + TEXT CHARACTER= 117 TAXON=156 TEXT='Present [@Park2018]'; + TEXT CHARACTER= 117 TAXON=157 TEXT='Coded ambiguous as the character of the anterior protrusion between the appendages of Pambdelurion [e.g. @Vinther2016, fig. 
1] is uncertain: this may be a manifestation of the oral apparatus, or may be a Kerygmachela-like lobe, as perhaps suggested by the anterior-directed filaments [@Vinther2016, fig. 3], which conceivably correspond to dorsal cirri.'; + TEXT CHARACTER= 117 TAXON=160 TEXT='We interpret an anterior lobe as underlying the medial sclerite of Kylinxia [@Dhungana2021]'; + TEXT CHARACTER= 117 TAXON=173 TEXT='Covered by dorsal sclerite'; + TEXT CHARACTER= 118 TAXON=69 TEXT='Coded as absent as a single dorsal sclerite covers the entire body; this structure does not seem to correspond directly to the anterior sclerites of other taxa [@Boesgaard2001]'; + TEXT CHARACTER= 118 TAXON=73 TEXT='We code this as '; + TEXT CHARACTER= 118 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 118 TAXON=131 TEXT='The rigid, ridged ''carapace'' [@Strausfeld2022] seemingly denotes a sclerotization of the dorsal head region; its margin displays relief and a consistent shape across specimens [@Liu2014]'; + TEXT CHARACTER= 118 TAXON=133 TEXT='@Liu2008app, in fig 2A4-5, suggest that the anterior region is sclerotized, although preservation shows irregular margins therefore more specimens are needed to confirm presence.'; + TEXT CHARACTER= 118 TAXON=139 TEXT='The head region of H. fortis displays a similar shape, medial ridge and doublure to that of Cardiodictyon; it is notably darker (= more heavily sclerotized?) in some specimens [e.g. 
ELI-JS0013; @Liu2014ppp]'; + TEXT CHARACTER= 118 TAXON=141 TEXT='Head sclerite absent [@Howard2020]'; + TEXT CHARACTER= 118 TAXON=143 TEXT='Absent [@Caron2017]'; + TEXT CHARACTER= 118 TAXON=152 TEXT='No evidence of incipient sclerotization [@Smith2023n]'; + TEXT CHARACTER= 118 TAXON=158 TEXT='Likely, but not certain [@Li2024]'; + TEXT CHARACTER= 118 TAXON=163 TEXT='Present [@Dhungana2021]'; + TEXT CHARACTER= 118 TAXON=166 TEXT='Central oval head shield present [@Cong2017]'; + TEXT CHARACTER= 119 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 120 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 120 TAXON=162 TEXT='Prominently anterior [@Moysiuk2022]'; + TEXT CHARACTER= 120 TAXON=163 TEXT='Dorsal sclerite present [@Dhungana2021]'; + TEXT CHARACTER= 121 TAXON=73 TEXT='We do not code for the presence of the dorsal sclerite in certain heterotardigrades (contra @Kihm2023), as those cephalic sclerites are always present when trunk sclerites are present, and are therefore unlikely to be homologous to the euarthropod dorsal/anterior sclerite'; + TEXT CHARACTER= 121 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 121 TAXON=166 TEXT='Oval [@Cong2017]'; + TEXT CHARACTER= 122 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 122 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=172 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 122 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 123 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 124 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT 
CHARACTER= 124 TAXON=157 TEXT='No indication of lateral sclerites in @Young2017 or @Budd1998ar'; + TEXT CHARACTER= 124 TAXON=166 TEXT='Prominent ovoid structures adjacent to the frontal appendage are interpreted as P-elements, connected by a rod [@Cong2017]'; + TEXT CHARACTER= 124 TAXON=167 TEXT='Present [@Moysiuk2019]'; + TEXT CHARACTER= 124 TAXON=171 TEXT='Present [@Moysiuk2019]'; + TEXT CHARACTER= 124 TAXON=173 TEXT='Present [@Moysiuk2019]'; + TEXT CHARACTER= 124 TAXON=174 TEXT='@Moysiuk2019 interpret the ventrolateral plate-like elements as lateral sclerites'; + TEXT CHARACTER= 125 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 125 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 125 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 125 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 125 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 125 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 125 TAXON=172 TEXT='The dorsal sclerite of Aegirocassis [@VanRoy2015] neither resembles the sub-circular Anomalocaris-type sclerite nor is as elongate as that of e.g. Hurdia. 
Therefore we code this character as ambiguous.'; + TEXT CHARACTER= 125 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 125 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 126 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 126 TAXON=166 TEXT='Following @Budd2021'; + TEXT CHARACTER= 126 TAXON=167 TEXT='Following @Budd2021'; + TEXT CHARACTER= 126 TAXON=168 TEXT='No intermediate plate between p-elements observed [@Moysiuk2019]'; + TEXT CHARACTER= 126 TAXON=169 TEXT='Following @Budd2021'; + TEXT CHARACTER= 126 TAXON=171 TEXT='Following @Budd2021'; + TEXT CHARACTER= 126 TAXON=177 TEXT='Following @Budd2021'; + TEXT CHARACTER= 127 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 127 TAXON=143 TEXT='Interpreted as flexible [@Caron2017]'; + TEXT CHARACTER= 128 TAXON=95 TEXT='Expanded introvert, giving dumbbell shaped appearance [@Maas2007ppp], is not treated as equivalent to the condition described in lobopodians.'; + TEXT CHARACTER= 128 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 128 TAXON=129 TEXT='The ''head'' of @Ou2018 is interpreted as a cross-section through the folded trunk; see comments on introvert.'; + TEXT CHARACTER= 128 TAXON=141 TEXT='No swelling [@Howard2020]'; + TEXT CHARACTER= 129 TAXON=55 TEXT='Coded as present based on innervation data that suggests that heterotardigrade anterior cephalic structures are homologous to sensory fields in eutardigrades [@Gross2021]'; + TEXT CHARACTER= 129 TAXON=78 TEXT='Coded as present based on innervation data that suggests that heterotardigrade anterior cephalic structures are homologous to sensory fields in eutardigrades [@Gross2021]'; + TEXT CHARACTER= 129 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 129 TAXON=124 TEXT='Inadequately preserved for confident scoring'; + TEXT CHARACTER= 129 TAXON=126 TEXT='There are no obvious equivalents to the anterior paired projections 
[@Whittington1975]'; + TEXT CHARACTER= 129 TAXON=127 TEXT='Inadequately preserved for confident scoring'; + TEXT CHARACTER= 129 TAXON=128 TEXT='Unannulated, narrow antenniform structures [@Liu2008app, fig. 3D] are interpreted as potential homologues.'; + TEXT CHARACTER= 129 TAXON=129 TEXT='No obvious candidates present in the complete specimen [@Ma2014]'; + TEXT CHARACTER= 129 TAXON=131 TEXT='By analogy with Hallucigenia and Microdictyon, the anterior appendages [@Strausfeld2022] are all treated as trunk appendages; this is consistent with their uniform position and shape'; + TEXT CHARACTER= 129 TAXON=133 TEXT='Ambiguous: although described as absent by @Liu2008app, we do not consider the limited available material sufficient to definitively rule out the presence of these features.'; + TEXT CHARACTER= 129 TAXON=141 TEXT='Ambiguous [@Howard2020]'; + TEXT CHARACTER= 129 TAXON=143 TEXT='Figure 1H from @Caron2017 shows a possible anterior projection. More detailed head anatomy is needed to be certain of this feature.'; + TEXT CHARACTER= 129 TAXON=145 TEXT='Present [@Caron2020]'; + TEXT CHARACTER= 129 TAXON=148 TEXT='The possibility that one set of antenniform appendages corresponds to enlarged frontal filaments is enticing but difficult to test.'; + TEXT CHARACTER= 129 TAXON=152 TEXT='Dorsal filaments treated as potential homologues [@Smith2023n]'; + TEXT CHARACTER= 129 TAXON=153 TEXT='See Figure 1e from @Vannier2014'; + TEXT CHARACTER= 129 TAXON=154 TEXT='Inadequately preserved for confident scoring'; + TEXT CHARACTER= 129 TAXON=156 TEXT='Interpreted as present by @Ortega2016asd. See rostral spines in supplementary figure 8 from @Park2018.'; + TEXT CHARACTER= 129 TAXON=157 TEXT='Interpreted as present by @Ortega2016asd'; + TEXT CHARACTER= 129 TAXON=160 TEXT='Difficult to demonstrate absence based on available material [@Zeng2020]'; + TEXT CHARACTER= 129 TAXON=162 TEXT='We consider the structures interpreted as "filament-like anterior nerves" [@Moysiuk2022, e.g. fig. 
3a] as potential homologues of the frontal filaments '; + TEXT CHARACTER= 129 TAXON=167 TEXT='Coded ambiguous; although the head is known from many articulated specimens [@Daley2014], the disposition of large sclerotized head elements leaves the absence of cirri difficult to conclusively demonstrate.'; + TEXT CHARACTER= 129 TAXON=168 TEXT='Ambiguous; head obscured by carapaces [@Moysiuk2019]'; + TEXT CHARACTER= 129 TAXON=169 TEXT='Ambiguous; head obscured by carapaces [@Daley2013jsp]'; + TEXT CHARACTER= 129 TAXON=171 TEXT='Ambiguities in head region [@Budd2021] mean the absence of these features cannot be determined with confidence'; + TEXT CHARACTER= 129 TAXON=173 TEXT='Considered ambiguous due to position of head sclerite and difficulty in interpreting head outline in available material [@Cong2014; @Cong2016; @Liu2018nsr]'; + TEXT CHARACTER= 129 TAXON=175 TEXT='Coded ambiguous, as anterior region comprises sclerotized segments.'; + TEXT CHARACTER= 129 TAXON=176 TEXT='Frontal filaments present [@Budd2021]'; + TEXT CHARACTER= 129 TAXON=177 TEXT='Considered ambiguous in megacheirans by @Ortega2016asd'; + TEXT CHARACTER= 129 TAXON=178 TEXT='Considered ambiguous in megacheirans by @Ortega2016asd'; + TEXT CHARACTER= 129 TAXON=179 TEXT='Coded ambiguous, as anterior region comprises sclerotized segments.'; + TEXT CHARACTER= 129 TAXON=180 TEXT='Coded ambiguous, as anterior region comprises sclerotized segments.'; + TEXT CHARACTER= 130 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 131 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 132 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 133 TAXON=1 TEXT='Preservation and larval status inadequate to establish'; + TEXT CHARACTER= 133 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 134 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 134 TAXON=128 TEXT='Present [cf. 
@Ou2012]'; + TEXT CHARACTER= 134 TAXON=131 TEXT='Bears a single pair of eyespots [@Liu2014ppp]'; + TEXT CHARACTER= 134 TAXON=132 TEXT='Coded ambiguous. A dark structure occurs in a location equivalent to the ocellus of Hallucigenia sparsa in ELRC 30060 [@Chen1995bnmns, pl. 6 fig. 2]; reexamination of fossil material is necessary before the absence of ocelli can be categorically confirmed.'; + TEXT CHARACTER= 134 TAXON=138 TEXT='Ocelli [@Smith2015]'; + TEXT CHARACTER= 134 TAXON=139 TEXT='We follow @Liu2014ppp in recognizing a single pair of eyespots. The various carbonaceous regions and pigmented patches [@Ma2012asd] likely represent a degraded but originally continuous carbon film.'; + TEXT CHARACTER= 134 TAXON=141 TEXT='Pair of simple ocellus-like eyes [@Howard2020]'; + TEXT CHARACTER= 134 TAXON=142 TEXT='Pit-type eyes [per @Smith2015, char. 18]'; + TEXT CHARACTER= 134 TAXON=143 TEXT='Sessile ocellus-type eyes [@Caron2017]'; + TEXT CHARACTER= 134 TAXON=156 TEXT='Compound, following @Park2018'; + TEXT CHARACTER= 134 TAXON=157 TEXT='Coded ambiguous: the dorsal surface of Pambdelurion is poorly known [@Budd1998ar]'; + TEXT CHARACTER= 134 TAXON=166 TEXT='Stalked eyes present [@Cong2017]'; + TEXT CHARACTER= 134 TAXON=168 TEXT='@Moysiuk2019'; + TEXT CHARACTER= 134 TAXON=179 TEXT='Reduced [@Mayers2019]'; + TEXT CHARACTER= 135 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 135 TAXON=138 TEXT='Two [@Smith2015]'; + TEXT CHARACTER= 135 TAXON=139 TEXT='We follow @Liu2014ppp in recognizing a single pair of eyespots. 
In our view the various carbonaceous regions and pigmented patches [@Ma2012asd] likely represent a degraded but originally continuous carbon film.'; + TEXT CHARACTER= 135 TAXON=141 TEXT='Pair of simple ocellus-like eyes [@Howard2020]'; + TEXT CHARACTER= 135 TAXON=160 TEXT='Four [@Dhungana2021]'; + TEXT CHARACTER= 135 TAXON=161 TEXT='A large pair of compound eyes is present [@Schoenemann2011; @Fu2011], but we consider it possible that a small pair of medial ocelli, if present, would be impossible to recognize in the preserved fossil material, so we conservatively code this taxon as ambiguous.'; + TEXT CHARACTER= 135 TAXON=162 TEXT='Two. We consider the "third eye" of @Moysiuk2022 to correspond to nervous tissue in an anterior lobe.'; + TEXT CHARACTER= 135 TAXON=163 TEXT='Four [@Dhungana2021]'; + TEXT CHARACTER= 135 TAXON=164 TEXT='Ambiguous. Possible eyes interpreted by @Pates2022'; + TEXT CHARACTER= 135 TAXON=166 TEXT='Structures interpreted as eyes are not, so this remains ambiguous [@Cong2017]'; + TEXT CHARACTER= 135 TAXON=175 TEXT='Two. 
Only two eyes have been described [@Yang2013]; we have been unable to substantiate the view of @Lan2021 that fuxianhuiids exhibit medial ocelli in addition to their lateral compound eyes.'; + TEXT CHARACTER= 135 TAXON=176 TEXT='Though @Lan2021 contend that fuxianhuiids exhibit medial ocelli in addition to their lateral compound eyes, @Ma2012n interpret putative medial eyes as lateral extensions of the rostrum.'; + TEXT CHARACTER= 135 TAXON=177 TEXT='Sideward pair and forward pair [@Lan2021]'; + TEXT CHARACTER= 135 TAXON=178 TEXT='Sideward pair and forward pair [@Lan2021]'; + TEXT CHARACTER= 135 TAXON=179 TEXT='Eyes are secondarily lost in Misszhouia and other naraoiids [@Mayers2019]'; + TEXT CHARACTER= 136 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 136 TAXON=156 TEXT='Present, following @Park2018'; + TEXT CHARACTER= 136 TAXON=157 TEXT='Coded ambiguous: the dorsal surface of Pambdelurion is poorly known [@Budd1998ar]'; + TEXT CHARACTER= 136 TAXON=166 TEXT='Stalked eyes presumed compound [@Cong2017]'; + TEXT CHARACTER= 136 TAXON=168 TEXT='@Moysiuk2019'; + TEXT CHARACTER= 136 TAXON=179 TEXT='Eyes are secondarily lost in Misszhouia and other naraoiids [@Mayers2019]'; + TEXT CHARACTER= 137 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 137 TAXON=138 TEXT='Sessile [@Smith2015]'; + TEXT CHARACTER= 137 TAXON=156 TEXT='Sessile, following @Park2018'; + TEXT CHARACTER= 137 TAXON=157 TEXT='Whether or not eyes are present, available specimens clearly demonstrate the absence of an eye stalk [@Budd1998ar; @Young2017]'; + TEXT CHARACTER= 137 TAXON=179 TEXT='Eyes are secondarily lost in Misszhouia and other naraoiids [@Mayers2019]'; + TEXT CHARACTER= 138 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 138 TAXON=156 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=163 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 
TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=175 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=176 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=177 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=178 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 138 TAXON=179 TEXT='Eyes are secondarily lost in Misszhouia and other naraoiids [@Mayers2019]'; + TEXT CHARACTER= 138 TAXON=180 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 139 TAXON=55 TEXT='The sclerotized stylets and stylet supports of tardigrades are likely modified claws [see @Mobjerg2018]; since the appendages have been reduced, we code this character as ambiguous. '; + TEXT CHARACTER= 139 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 139 TAXON=126 TEXT='The spines of Aysheaia are not likely to be sclerotized given they preserve similarly to the trunk cuticle, and are not enriched in carbon (darker) like the claws. '; + TEXT CHARACTER= 139 TAXON=151 TEXT='The protocerebral appendage pair (assuming its modification to a stylet, as in modern tardigrades) cannot be directly observed.'; + TEXT CHARACTER= 139 TAXON=152 TEXT='Probably not sclerotized [@Smith2023n] - but coded conservatively'; + TEXT CHARACTER= 139 TAXON=163 TEXT='Opabinia''s protocerebral appendages are more robust than fully lobopodous appendages, and may have a single terminal sclerotized segment. We code as uncertain to allow for the possibility that this kind of hardened tip of the appendages is a precursor to the sclerotized appendages of radiodonts. 
'; + TEXT CHARACTER= 139 TAXON=178 TEXT='The presence of a hypostome is suggested, but not verified'; + TEXT CHARACTER= 140 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 140 TAXON=168 TEXT='Following @DeVivo2021'; + TEXT CHARACTER= 140 TAXON=169 TEXT='Following @DeVivo2021'; + TEXT CHARACTER= 140 TAXON=171 TEXT='Following @DeVivo2021'; + TEXT CHARACTER= 140 TAXON=175 TEXT='Absent, presumably secondarily, in the reduced labrum'; + TEXT CHARACTER= 140 TAXON=176 TEXT='Absent, presumably secondarily, in the reduced labrum'; + TEXT CHARACTER= 140 TAXON=177 TEXT='Absent, presumably secondarily, in the reduced labrum'; + TEXT CHARACTER= 140 TAXON=179 TEXT='Absent, presumably secondarily, in the reduced labrum'; + TEXT CHARACTER= 140 TAXON=180 TEXT='Absent, presumably secondarily, in the reduced labrum'; + TEXT CHARACTER= 141 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 141 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 141 TAXON=129 TEXT='No evidence of appendage differentiation [@Ma2014]; the candidate appendages presented by @Ou2018 are interpreted to be folds associated with flexure of the trunk.'; + TEXT CHARACTER= 141 TAXON=131 TEXT='Coded as ambiguous, as the detailed morphology of the head is unclear'; + TEXT CHARACTER= 141 TAXON=141 TEXT='Not differentiated [@Howard2020]'; + TEXT CHARACTER= 141 TAXON=142 TEXT='We interpret the antenniform structures [@Ma2009] as possible homologues to the frontal filaments rather than appendages. '; + TEXT CHARACTER= 141 TAXON=143 TEXT='Not evident [@Caron2017]'; + TEXT CHARACTER= 141 TAXON=144 TEXT='We code the anterior antennae-like structures [@Yang2015] as possible homologous of the frontal filaments. Hence the first pair of limbs are coded as undifferentiated.'; + TEXT CHARACTER= 141 TAXON=145 TEXT='The first pair of appendages are not differentiated [@Caron2020]. 
We code the anterior antennae-like structures as possible homologues of the frontal filaments. '; + TEXT CHARACTER= 141 TAXON=148 TEXT='The two anterior appendages may correspond to (i) the protocerebral trunk appendage plus an enlarged anterior filament; or (ii) the protocerebral and deuterocerebral trunk appendages. Under either interpretation, the protocerebral limb pair is distinct from the trunk appendages.'; + TEXT CHARACTER= 141 TAXON=158 TEXT='No evidence of trunk appendages, indicating distinct form (lobopodous?) if present [@Li2024]'; + TEXT CHARACTER= 142 TAXON=55 TEXT='The sclerotized stylets and stylet supports of tardigrades are likely modified claws [see @Mobjerg2018]; since the appendages have been reduced, we code this character as ambiguous. '; + TEXT CHARACTER= 142 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 142 TAXON=152 TEXT='Conservatively coded as ambiguous to reflect possibility of later development of podomeres'; + TEXT CHARACTER= 142 TAXON=163 TEXT='We interpret the claws of Opabinia’s protocerebral appendage as podomerous [see @Whittington1975, figs 75 and 79], and thus the protocerebral appendage as sclerotized'; + TEXT CHARACTER= 143 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 143 TAXON=152 TEXT='Conservatively coded as ambiguous to reflect possibility of later development of podomeres'; + TEXT CHARACTER= 143 TAXON=163 TEXT='The basal podomeres are poorly preserved [@Dhungana2021] hence we code as ambiguous. 
'; + TEXT CHARACTER= 143 TAXON=165 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=172 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 143 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 144 TAXON=152 TEXT='Uncertain as adult morphology unknown; observed tapering [@Smith2023n] may be a developmental phenomenon.'; + TEXT CHARACTER= 144 TAXON=163 TEXT='We interpret the claws of Opabinia''s protocerebral appendage as podomerous [see @Whittington1975, figs 75, 79]. The distal three podomeres are differentiated, and could be homologous to the differentiation of distal podomeres of certain hurdiids, however, given that hurdiid distal podomeres taper in diameter, and Opabinia''s terminal podomere is the largest, we code as uncertain for this character.'; + TEXT CHARACTER= 144 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=167 TEXT='No significant change.'; + TEXT CHARACTER= 144 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=172 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 144 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 145 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 145 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 145 TAXON=149 TEXT='The first pair of appendages in Ilyodes are lateral [@Thompson1980; 
@Haug2012cb]'; + TEXT CHARACTER= 145 TAXON=152 TEXT='Ventrolateral – adult position uncertain'; + TEXT CHARACTER= 145 TAXON=157 TEXT='Ventral [@Budd1998ar]'; + TEXT CHARACTER= 146 TAXON=55 TEXT='As the mouth is terminal, and the appendages have been assumed to be incorporated into the mouth, we code that the frontal appendages have not shifted posteriorly.'; + TEXT CHARACTER= 146 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 147 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 147 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 147 TAXON=149 TEXT='The first pair of appendages in Ilyodes are not directly adjacent [@Thompson1980; @Haug2012cb]'; + TEXT CHARACTER= 147 TAXON=152 TEXT='Coded as uncertain as protocerebral appendages are known to migrate during development [@Budd2021]'; + TEXT CHARACTER= 147 TAXON=153 TEXT='The first pair of appendages in Megadictyon are not directly adjacent [@Liu2007az]^n'; + TEXT CHARACTER= 147 TAXON=154 TEXT='Jianshanopodia is coded uncertain due to unclear preservation [@Liu2006; @Liu2007az]'; + TEXT CHARACTER= 147 TAXON=156 TEXT='Not directly adjacent, but separated by anterior lobe [@Park2018]'; + TEXT CHARACTER= 147 TAXON=166 TEXT='Adjacent in better-articulated material, and thus presumably in life [@Cong2017]'; + TEXT CHARACTER= 147 TAXON=169 TEXT='Coded as ambiguous, as the well-developed dorsal cephalic plate in Hurdia and Aegirocassis obscures the base of the appendages [@Daley2009; @VanRoy2015].'; + TEXT CHARACTER= 147 TAXON=172 TEXT='Coded as ambiguous, as the well-developed dorsal cephalic plate in Hurdia and Aegirocassis obscures the base of the appendages [@Daley2009; @VanRoy2015].'; + TEXT CHARACTER= 148 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 148 TAXON=152 TEXT='Coded as uncertain as protocerebral appendages are known to migrate during development [@Budd2021]'; + TEXT CHARACTER= 148 TAXON=168 TEXT='Figure 2J in 
@Moysiuk2019 shows a prominent gap between appendages'; + TEXT CHARACTER= 149 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 149 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 149 TAXON=152 TEXT='Coded as uncertain as protocerebral appendages are known to migrate during development [@Budd2021]'; + TEXT CHARACTER= 150 TAXON=55 TEXT='In tardigrades, the presence of stylet glands, responsible for the moulting and production of stylet and stylet supports are likely transformed claw glands [@Mobjerg2018]. As such stylets and stylet supports are interpreted as modified claws [@Halberg2009; @Nielsen2001]. This homology is supported by the presence of microtubules in the epidermal cell attachments of exclusively the retractor muscles of claws and stylets in tardigrades [@Halberg2009, J. of Morphology].'; + TEXT CHARACTER= 150 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 150 TAXON=124 TEXT='Although protocerebral appendage is undifferentiated, claws are not preserved [@Dzik1989] and as such this cell is coded as ambiguous.'; + TEXT CHARACTER= 150 TAXON=126 TEXT='No claws on differentiated protocerebral appendages [see @Whittington1978]'; + TEXT CHARACTER= 150 TAXON=127 TEXT='Potential claws not preserved, therefore ambiguous [@Dzik2011]'; + TEXT CHARACTER= 150 TAXON=138 TEXT='Claws are absent in multiple anterior appendages, therefore coded as ambiguous (although posterior appendages are clawed).'; + TEXT CHARACTER= 150 TAXON=141 TEXT='Claws absent in multiple anterior appendages [@Howard2020], therefore coded as ambiguous (although posterior appendages are clawed) '; + TEXT CHARACTER= 150 TAXON=153 TEXT='@Liu2007az suggest claws present on differentiated protocerebral appendages of Megadictyon; these are figured by @Vannier2014'; + TEXT CHARACTER= 150 TAXON=154 TEXT='Not evident from incompletely preserved available material [@Liu2006; @Vannier2014]'; + TEXT CHARACTER= 150 TAXON=156 TEXT='Claws absent 
on protocerebral appendages [@Park2018, supplementary figure 3]'; + TEXT CHARACTER= 150 TAXON=157 TEXT='Following @Vinther2016 we code terminal claws on protocerebral appendages to be absent. The terminal structures are not well differentiated from the pointed outgrowths along the inner edge of the appendages and no terminal claw can be readily distinguished (see @Vinther2016, fig. 1; contra @Vannier2014).'; + TEXT CHARACTER= 150 TAXON=158 TEXT='Unclear whether spines are modified claws or separate elaborations, hence coded ambiguous, though presumably lost per Pambdelurion'; + TEXT CHARACTER= 150 TAXON=163 TEXT='Opabinia''s protocerebral spines are not homologous to lobopodian-style claws.'; + TEXT CHARACTER= 151 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 151 TAXON=128 TEXT='Spine series present on differentiated protocerebral appendage, therefore coded as present.'; + TEXT CHARACTER= 151 TAXON=149 TEXT='Coded as absent [@Haug2012cb]'; + TEXT CHARACTER= 151 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 151 TAXON=156 TEXT='Present [@Budd1993; @Budd1998trse]'; + TEXT CHARACTER= 151 TAXON=157 TEXT='Present [@Budd1998ar]'; + TEXT CHARACTER= 151 TAXON=158 TEXT='Absent [@Li2024]'; + TEXT CHARACTER= 151 TAXON=163 TEXT='We code this as uncertain, as the present material on Opabinia''s frontal appendages does not allow for a clear assessment if lateral spines are present on the frontal appendages. 
'; + TEXT CHARACTER= 152 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 152 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 152 TAXON=154 TEXT='@Li2024'; + TEXT CHARACTER= 152 TAXON=156 TEXT='@Li2024'; + TEXT CHARACTER= 152 TAXON=157 TEXT='@Li2024'; + TEXT CHARACTER= 152 TAXON=160 TEXT='Following @Zeng2020'; + TEXT CHARACTER= 152 TAXON=166 TEXT='Paired ventral endites are present on podomeres 2-9 only [@Daley2010]'; + TEXT CHARACTER= 152 TAXON=169 TEXT='Hurdiidae have one row [@Guo2019]'; + TEXT CHARACTER= 152 TAXON=171 TEXT='Hurdiidae have one row [@Guo2019]'; + TEXT CHARACTER= 152 TAXON=174 TEXT='Though previously coded and reconstructed as having two rows, we code to allow the possibility that Schinderhannes may only have one row [only one row clear in @Kuhl2009, Supplementary Fig S1A].'; + TEXT CHARACTER= 153 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 154 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 154 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 155 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 155 TAXON=159 TEXT='The serrated margins of the main spines of Parapeytoia [@Hou1995gff] have been compared to megacheiran appendages (see @Budd2021). 
We conservatively code this character as ambiguous as the potential homology to accessory endite spines in radiodonts is unclear.'; + TEXT CHARACTER= 156 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 156 TAXON=153 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=154 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=156 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=157 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=160 TEXT='Alternating [@Zeng2020]'; + TEXT CHARACTER= 156 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=172 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 156 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 157 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 157 TAXON=160 TEXT='Narrower than podomere: Kylinxia is closer to the condition in anomalocaridids than in hurdiids [@Zeng2020]'; + TEXT CHARACTER= 157 TAXON=165 TEXT='As lateral spine (gnathal) series are not homologous to the ventral spine series of radiodonts [@Moysiuk2021], we code taxa with lateral spine series only as inapplicable.'; + TEXT CHARACTER= 158 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 158 TAXON=126 TEXT='Uniform length [@Whittington1975]'; + TEXT CHARACTER= 158 TAXON=166 TEXT='No increase'; + TEXT CHARACTER= 158 TAXON=167 TEXT='No increase'; + TEXT CHARACTER= 159 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 159 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT 
CHARACTER= 159 TAXON=174 TEXT='Schinderhannes possibly has straight endites [@Moysiuk2019], although this is difficult to ascertain from the original material [@Kuhl2009], hence conservatively we code ambiguously.'; + TEXT CHARACTER= 160 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 160 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 160 TAXON=128 TEXT='Two, one on each side of the appendage.'; + TEXT CHARACTER= 160 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 160 TAXON=163 TEXT='Uncertain if spine series are present.'; + TEXT CHARACTER= 160 TAXON=174 TEXT='Present [following @Moysiuk2021]'; + TEXT CHARACTER= 161 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 161 TAXON=129 TEXT='Terminal spines are spines, rather than modifications of the appendages [@Ma2014]'; + TEXT CHARACTER= 161 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 161 TAXON=153 TEXT='Megadictyon protocerebral appendages end in a single claw [e.g. @Vannier2014], therefore have a single rather than multifurcate termination.'; + TEXT CHARACTER= 161 TAXON=157 TEXT='@Vannier2014 suggest Pambdelurion''s protocerebral appendage terminates in a single claw, however, this "claw" could be a taphonomic artefact. We code as uncertain. '; + TEXT CHARACTER= 161 TAXON=159 TEXT='The affinity of the anterior appendages of Parapeytoia is unclear therefore we code this character ambiguously, although there is no indication that the distalmost podomere is multifurcate [e.g. @Hou1995gff, fig. 12]'; + TEXT CHARACTER= 161 TAXON=163 TEXT='We interpret the claws of Opabinia''s protocerebral appendage as podomerous [see @Whittington1975, figs 75, 79]. The distalmost podomere terminates in a single point [e.g. 
@Whittington1975, fig. 79], therefore we code the multifurcate termination as absent.'; + TEXT CHARACTER= 161 TAXON=170 TEXT='@Moysiuk2021 interpret the tip of the appendages to have outer spine series ("os" in their figure 6F) with a single terminal stub without a multifurcate termination'; + TEXT CHARACTER= 161 TAXON=172 TEXT='"Terminal podomere stout, with pointed tip." [@VanRoy2015]'; + TEXT CHARACTER= 161 TAXON=173 TEXT='@Liu2018nsr shows that the Lyrarapax appendage terminates in a distal claw, and does not have a multifurcate distal termination.'; + TEXT CHARACTER= 162 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 162 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 162 TAXON=167 TEXT='Unkinked [@Daley2014], though kink present in A. saron. ^nOriginally coded as kinked by @Vinther2014; updated to not kinked by @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 163 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 163 TAXON=159 TEXT='The affinity of the frontal appendages of Parapeytoia is unclear, hence we code this character as ambiguous although the ''pincer'' of Parapeytoia is formed by distal endite with opposing curvature [@Hou1995gff], rather than the proximal endite (such as in Lyrarapax). 
'; + TEXT CHARACTER= 163 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=172 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 163 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 164 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 164 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 164 TAXON=172 TEXT='Following @Moysiuk2021'; + TEXT CHARACTER= 165 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 165 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 165 TAXON=152 TEXT='Coded ambiguous as protocerebral appendages appear to be in an early developmental stage [@Smith2023n]; adult morphology is uncertain.'; + TEXT CHARACTER= 166 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 166 TAXON=129 TEXT='Although @Liu2011 described the appendages of the lobopodian Diania as having an arthropodized organization, a recent revision of this taxon [@Ma2014jsp] concluded that the podomere-like structures on the legs represent taphonomic features on lobopodous appendages.'; + TEXT CHARACTER= 166 TAXON=166 TEXT='Flaps are not sclerotized [@Chen1994]'; + TEXT CHARACTER= 166 TAXON=174 TEXT='Schinderhannes [@Kuhl2009] is coded as having lobopodous post-protocerebral appendages based on the presence of a pair of enlarged lateral body flaps resembling those of Lyrarapax [@Cong2014].^n'; + TEXT CHARACTER= 167 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 167 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT 
CHARACTER= 167 TAXON=174 TEXT='Schinderhannes is coded uncertain in view of its ambiguous morphology [@Kuhl2009; @Ortega2016br]'; + TEXT CHARACTER= 168 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 168 TAXON=131 TEXT='The second appendage pair of Cardiodictyon does not seem to be differentiated [@Liu2008app]'; + TEXT CHARACTER= 168 TAXON=137 TEXT='Because the head of Carbotubulus is not preserved [@Haug2012cb], the identity of the limbs is unclear and this character is coded as ambiguous.'; + TEXT CHARACTER= 168 TAXON=148 TEXT='Coded ambiguous to reflect uncertainty as to whether the two anterior appendage pairs represent (i) the protocerebral appendage and a dorsal projection; (ii) the protocerebral and deutocerebral appendages'; + TEXT CHARACTER= 168 TAXON=152 TEXT='Preservation insufficient to evaluate potential differentiation in adult; and appendages may be in an early developmental stage, with differentiation occurring late in development [@Smith2023n].'; + TEXT CHARACTER= 168 TAXON=166 TEXT='The first three flaps are reduced, but the deutocerebral appendage is not morphologically distinct [@Cong2017]. 
The gnathobase-like structures [@Cong2017] are captured in a separate character.'; + TEXT CHARACTER= 168 TAXON=167 TEXT='@Daley2014 reported the presence of a smaller set of flaps in proximity with the putative head region of Anomalocaris canadensis; given that this differentiation is expressed in size, rather than structural identity, we score the deutocerebral limbs as undifferentiated in Anomalocaris.'; + TEXT CHARACTER= 168 TAXON=174 TEXT='The nature of the second appendage in Schinderhannes is unclear due to poor preservation [@Kuhl2009].'; + TEXT CHARACTER= 169 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 170 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 170 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 170 TAXON=131 TEXT='Short and without claws, indicating non-ambulatory function [@Strausfeld2022]'; + TEXT CHARACTER= 170 TAXON=148 TEXT='The ‘second antenna’ of Antennacanthopodia [@Ou2011] is interpreted as a sensorial appendage.'; + TEXT CHARACTER= 170 TAXON=174 TEXT='Schinderhannes is scored as having an ambulatory limb based on the structure of the enlarged body flap, which is the first observable post-ocular appendage [@Kuhl2009].'; + TEXT CHARACTER= 171 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 172 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 172 TAXON=175 TEXT='Inapplicable in upper stem euarthropods with arthropodized second appendage; coded with ambiguous token as character is neomorphic.'; + TEXT CHARACTER= 172 TAXON=176 TEXT='Inapplicable in upper stem euarthropods with arthropodized second appendage; coded with ambiguous token as character is neomorphic.'; + TEXT CHARACTER= 172 TAXON=177 TEXT='Inapplicable in upper stem euarthropods with arthropodized second appendage; coded with ambiguous token as character is neomorphic.'; + TEXT CHARACTER= 172 TAXON=178 TEXT='Inapplicable in upper stem euarthropods with
arthropodized second appendage; coded with ambiguous token as character is neomorphic.'; + TEXT CHARACTER= 172 TAXON=179 TEXT='Inapplicable in upper stem euarthropods with arthropodized second appendage; coded with ambiguous token as character is neomorphic.'; + TEXT CHARACTER= 172 TAXON=180 TEXT='Inapplicable in upper stem euarthropods with arthropodized second appendage; coded with ambiguous token as character is neomorphic.'; + TEXT CHARACTER= 173 TAXON=118 TEXT='Anterior region not preserved [@Hu2012]'; + TEXT CHARACTER= 174 TAXON=1 TEXT='Neck with 9-11 annular folds [@Maas2009]'; + TEXT CHARACTER= 174 TAXON=7 TEXT='Present, and implied by distribution of trunk spines, particularly in posterior trunk, though the trunk cuticle is often indistinct'; + TEXT CHARACTER= 174 TAXON=10 TEXT='Present, if imperfectly, in thorax'; + TEXT CHARACTER= 174 TAXON=11 TEXT='Annuli present, but weakly developed and in places seem to pinch out [@Zhang2022]'; + TEXT CHARACTER= 174 TAXON=17 TEXT='Presumed present in thorax of Higgins larva, but not possible to establish'; + TEXT CHARACTER= 174 TAXON=18 TEXT='Prominent in thorax of Higgins larva [@Neves2019]'; + TEXT CHARACTER= 174 TAXON=19 TEXT='Prominent in thorax of Higgins larva [@Neves2019]'; + TEXT CHARACTER= 174 TAXON=20 TEXT='Degree of annulation on neck [@Heiner2007hmr]'; + TEXT CHARACTER= 174 TAXON=22 TEXT='Prominent in thorax of Higgins larva [@Neves2019]'; + TEXT CHARACTER= 174 TAXON=23 TEXT='Prominent in thorax [@Neves2019]'; + TEXT CHARACTER= 174 TAXON=24 TEXT='No annulations evident in adult or larva [@Fujimoto2020mb]'; + TEXT CHARACTER= 174 TAXON=25 TEXT='Prominent throughout Shira larval trunk [@Neves2014ode]'; + TEXT CHARACTER= 174 TAXON=28 TEXT='Absent; segmentation instead'; + TEXT CHARACTER= 174 TAXON=38 TEXT='Annulations in larval trunk only [@Bolek2013]'; + TEXT CHARACTER= 174 TAXON=95 TEXT='Preservation insufficient to evaluate [@Maas2007ppp]'; + TEXT CHARACTER= 174 TAXON=96 TEXT='Inferred as present based on 
distribution of denticles, even if not evident from preservation of cuticle'; + TEXT CHARACTER= 174 TAXON=137 TEXT='A taphonomic absence can be discounted because annulations are preserved in co-occurring specimens of Ilyodes [@Haug2012cb]^n'; + TEXT CHARACTER= 174 TAXON=138 TEXT='Absent [@Smith2015]'; + TEXT CHARACTER= 174 TAXON=141 TEXT='Annulated trunk and limbs [@Howard2020]'; + TEXT CHARACTER= 174 TAXON=143 TEXT='Fine epidermal annuli between limb pairs [@Caron2017]'; + TEXT CHARACTER= 174 TAXON=148 TEXT='Present on limbs; it is unclear whether the trunk was annulated, due to effaced preservation [@Ou2011].'; + TEXT CHARACTER= 174 TAXON=152 TEXT='Coded ambiguous, as larval stages may lack evidence of annulations that are present in adults [e.g. in Onychophora; @Walker2004]'; + TEXT CHARACTER= 174 TAXON=155 TEXT='Lobopodous limb appears annulated [@Hou1995gff]'; + TEXT CHARACTER= 174 TAXON=157 TEXT='Present on limbs; it is unclear whether the trunk was annulated, due to effaced preservation [@Budd1998ar].'; + TEXT CHARACTER= 174 TAXON=175 TEXT='Coded ambiguous as sclerotization of trunk assumed to overprint evidence of annulation'; + TEXT CHARACTER= 174 TAXON=176 TEXT='Coded ambiguous as sclerotization of trunk assumed to overprint evidence of annulation'; + TEXT CHARACTER= 174 TAXON=177 TEXT='Coded ambiguous as sclerotization of trunk assumed to overprint evidence of annulation'; + TEXT CHARACTER= 174 TAXON=178 TEXT='Coded ambiguous as sclerotization of trunk assumed to overprint evidence of annulation'; + TEXT CHARACTER= 174 TAXON=179 TEXT='Coded ambiguous as sclerotization of trunk assumed to overprint evidence of annulation'; + TEXT CHARACTER= 174 TAXON=180 TEXT='Coded ambiguous as sclerotization of trunk assumed to overprint evidence of annulation'; + TEXT CHARACTER= 175 TAXON=14 TEXT='Annulations uneven in size, but not systematically differentiated [@Shao2020]'; + TEXT CHARACTER= 175 TAXON=124 TEXT='Homonomous annulation, despite presence of appendages 
[@Dzik1989; @Jaeger2010]'; + TEXT CHARACTER= 175 TAXON=153 TEXT='Annulations in Megadictyon appear regular [@Liu2007az], so this taxon is coded as homonomous.'; + TEXT CHARACTER= 175 TAXON=156 TEXT='Jianshanopodia exhibits regions of narrower annulations between appendages [@Liu2006], so is coded as heteronomous.'; + TEXT CHARACTER= 175 TAXON=157 TEXT='We code Pambdelurion as uncertain, as the trunk is not adequately preserved to make a confident assignation [@Budd1998ar; @Young2017]'; + TEXT CHARACTER= 176 TAXON=15 TEXT='Coded absent in Eokinorhynchus as the ‘neck’ region is considered part of the introvert [@Zhang2015]'; + TEXT CHARACTER= 176 TAXON=45 TEXT='Coded as continuing to front in Halicryptus. A deep groove separates the region that is adorned with Zone I armature, but this area seems to bear faint annulations (that are not associated with the armature) [@Shirley1999].'; + TEXT CHARACTER= 176 TAXON=49 TEXT='Continuing to the front [@Hammond1970]'; + TEXT CHARACTER= 176 TAXON=97 TEXT='Indistinct in anterior trunk (see notes on Introvert for delineation of trunk and introvert) [@Howard2020]'; + TEXT CHARACTER= 176 TAXON=102 TEXT='Coded ambiguous in Paratubiluchus [@Han2004] as annulations are not clearly enough preserved to evaluate their distribution in the neck area.^n'; + TEXT CHARACTER= 176 TAXON=111 TEXT='Coded ambiguous in Guanduscolex [@Hu2008] as the apparent absence of anterior annulations may be preservational.'; + TEXT CHARACTER= 176 TAXON=119 TEXT='Consistent annulation to base of introvert [@Yang2020]'; + TEXT CHARACTER= 176 TAXON=122 TEXT='As with Cricocosmia, the anterior trunk is less prominently annulated and lacks prominent dorsal sclerites [@Shi2022]'; + TEXT CHARACTER= 176 TAXON=123 TEXT='Anterior region with indistinct annulations and reduction of dorsal armature'; + TEXT CHARACTER= 176 TAXON=126 TEXT='The introvert is not treated as part of the trunk'; + TEXT CHARACTER= 176 TAXON=128 TEXT='The introvert is not treated as part of the 
trunk'; + TEXT CHARACTER= 176 TAXON=129 TEXT='Indistinct near narrow end of trunk [@Ma2014]'; + TEXT CHARACTER= 176 TAXON=153 TEXT='Annulations are not clearly preserved in the anterior region [@Liu2006; @Liu2007], making this character difficult to score with confidence.'; + TEXT CHARACTER= 176 TAXON=154 TEXT='Annulations are not clearly preserved in the anterior region [@Liu2006; @Liu2007], making this character difficult to score with confidence.'; + TEXT CHARACTER= 176 TAXON=156 TEXT='Annulations in the pharynx of Kerygmachela continue to the terminal mouth [@Budd1998trse]; given the position of the prominent annulated appendages, it seems likely that the head also expressed external annulations.'; + TEXT CHARACTER= 177 TAXON=10 TEXT='Branching present [@Maas2007]'; + TEXT CHARACTER= 177 TAXON=11 TEXT='Branching and pinching out evident [@Zhang2022]'; + TEXT CHARACTER= 177 TAXON=13 TEXT='First ten annulae unbranched [@Liu2018]'; + TEXT CHARACTER= 177 TAXON=15 TEXT='Strictly unbranched [@Zhang2015]'; + TEXT CHARACTER= 177 TAXON=16 TEXT='Apparent branching [@Shao2020]'; + TEXT CHARACTER= 177 TAXON=98 TEXT='Apparent branching / overlapping [@Ma2014, fig. 
3.3]'; + TEXT CHARACTER= 177 TAXON=141 TEXT='No branching observed [@Howard2020]'; + TEXT CHARACTER= 177 TAXON=143 TEXT='No evidence of branching [@Caron2017]'; + TEXT CHARACTER= 177 TAXON=153 TEXT='No indication of branching in @Ramskold1998, fig 3.8C '; + TEXT CHARACTER= 178 TAXON=15 TEXT='The repeated elements of Eokinorhynchus are coded as annulations with serially iterated sclerites; this taxon is not coded as segmented.'; + TEXT CHARACTER= 178 TAXON=162 TEXT='@Moysiuk2022 observe segmental boundaries (arguably implying arthrodization) in the dorsal trunk cuticle, though these are not apparent on the ventral surface; this recalls the ventrally flexible configuration of Opabinia.'; + TEXT CHARACTER= 178 TAXON=163 TEXT='Coded as present since it has discrete body segments separated by furrows [@Budd1996; @Zhang2007; @Budd2012]'; + TEXT CHARACTER= 178 TAXON=169 TEXT='The single complete specimen does not conclusively establish the presence or absence of epidermal segmentation [@Daley2009]'; + TEXT CHARACTER= 178 TAXON=172 TEXT='Interpreted as present by @Moysiuk2022'; + TEXT CHARACTER= 178 TAXON=173 TEXT='Interpreted as present by @Moysiuk2022'; + TEXT CHARACTER= 178 TAXON=174 TEXT='Although interpreted as present by @Moysiuk2022, we do not consider the single available specimen [@Kuhl2009] to definitively establish the presence or absence of epidermal segmentation'; + TEXT CHARACTER= 179 TAXON=69 TEXT='Although some heterotardigrades possess dorsal plates [e.g. @Nelson2002; @Marchioro2013], these are not connected by arthrodial membranes.
We thus score Actinarctus as absent for this character.'; + TEXT CHARACTER= 179 TAXON=129 TEXT='The dorsal oval elements [@Liu2011; @Ma2014] are interpreted as modified trunk sclerites'; + TEXT CHARACTER= 179 TAXON=160 TEXT='Following @Zeng2020'; + TEXT CHARACTER= 179 TAXON=161 TEXT='Trunk not arthrodized [@Zhang2023]'; + TEXT CHARACTER= 180 TAXON=160 TEXT='No arthrodial membranes [@Zeng2020]'; + TEXT CHARACTER= 180 TAXON=161 TEXT='Trunk not arthrodized [@Zhang2023]'; + TEXT CHARACTER= 180 TAXON=176 TEXT='Absent'; + TEXT CHARACTER= 192 TAXON=28 TEXT='Reported as present by @DalZotto2013, but considered absent by @Herranz2021z, who note instead the presence of a long mid-dorsal spine on segment 11, not associated with musculature'; + TEXT CHARACTER= 193 TAXON=28 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 193 TAXON=29 TEXT='Present [@Herranz2021z]'; + TEXT CHARACTER= 193 TAXON=33 TEXT='Present [@Herranz2021z]'; + TEXT CHARACTER= 193 TAXON=34 TEXT='Absent [@Herranz2021z]'; + TEXT CHARACTER= 196 TAXON=28 TEXT='Absent [@DalZotto2013sb]'; + TEXT CHARACTER= 196 TAXON=29 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=30 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=31 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=32 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=33 TEXT='Present in Antygomonas, Campyloderes, 
Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=34 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=35 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=36 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 196 TAXON=37 TEXT='Present in Antygomonas, Campyloderes, Centroderes, Dracoderes, Echinoderes, Meristoderes, Semnoderes, Sphenoderes, Tubulideres, Kinorhynchus and Pycnophyes [@SchmidtRheasa2013]'; + TEXT CHARACTER= 197 TAXON=28 TEXT='Absent [@DalZotto2013sb]'; + TEXT CHARACTER= 198 TAXON=141 TEXT='Absent [@Howard2020]'; + TEXT CHARACTER= 198 TAXON=148 TEXT='Not evident, despite some preservation of internal tissue [@Ou2011]'; + TEXT CHARACTER= 198 TAXON=152 TEXT='Present [@Smith2023n]'; + TEXT CHARACTER= 198 TAXON=167 TEXT='Present [@Briggs1984; @Daley2014]'; + TEXT CHARACTER= 198 TAXON=173 TEXT='Present in L. trilobus [@Cong2016]; reported absence in L. 
unguispinus [@Cong2014] attributed to non-preservation.'; + TEXT CHARACTER= 199 TAXON=16 TEXT='Posterior narrowing [@Shao2020]'; + TEXT CHARACTER= 199 TAXON=42 TEXT='Uniform for most of length, before narrowing to caudal filament that comprises a third of the body [@Reiman1972]'; + TEXT CHARACTER= 199 TAXON=43 TEXT='Narrow post-anal caudal extension of the trunk'; + TEXT CHARACTER= 199 TAXON=152 TEXT='Unknown whether narrowing [@Smith2023n] is developmental or would be retained to adulthood.'; + TEXT CHARACTER= 199 TAXON=156 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=157 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=160 TEXT='@Zeng2020 supplementary info clarifies narrowing trend posteriad'; + TEXT CHARACTER= 199 TAXON=163 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=166 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=167 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=168 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=169 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=171 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=172 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=173 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 199 TAXON=174 TEXT='Following @Moysiuk2019'; + TEXT CHARACTER= 200 TAXON=7 TEXT='Primarily manifested in differentiation of armature type, distribution and density [@ConwayMorris1977]. See notes on introvert for interpretation of the anterior trunk vs. 
introvert.'; + TEXT CHARACTER= 200 TAXON=10 TEXT='Thorax differentiated from plate-bearing region and anal field'; + TEXT CHARACTER= 200 TAXON=12 TEXT='Anterior four annulations are narrower and exhibit distinct sclerite shape [@Liu2019]'; + TEXT CHARACTER= 200 TAXON=14 TEXT='Some variation in annular expression between first five and subsequent annulations [@Shao2020], but not prominent enough to denote a distinct subdivision of the trunk'; + TEXT CHARACTER= 200 TAXON=15 TEXT='Differentiated ''neck'' region [@Zhang2015] resembles anterior trunk of Acosmia [@Howard2020]'; + TEXT CHARACTER= 200 TAXON=16 TEXT='Not differentiated beyond introvert [@Zhao2016]'; + TEXT CHARACTER= 200 TAXON=17 TEXT='Unclear in adult but likely prominent in larva'; + TEXT CHARACTER= 200 TAXON=95 TEXT='Dumbbell shape hints at anterior differentiation [@Maas2007ppp]'; + TEXT CHARACTER= 200 TAXON=96 TEXT='Unclear in adult but likely prominent in larva'; + TEXT CHARACTER= 200 TAXON=97 TEXT='Differentiated only by armature and diminished annulations [@Howard2020], both of which are acknowledged in separate characters. '; + TEXT CHARACTER= 200 TAXON=98 TEXT='Ambiguous: Smooth anterior trunk in YKLP 11333 [@Ma2014] but the introvert of this specimen is not obviously equivalent to that in Eximipriapulus and no other specimen shows this differentiation so clearly. '; + TEXT CHARACTER= 200 TAXON=120 TEXT='Plausible differentiation, by analogy with C. 
jinningensis [@ThisStudy]'; + TEXT CHARACTER= 200 TAXON=121 TEXT='Present: the anterior trunk of many specimens lacks plates, and annulations are more closely spaced or absent [@Hou1994; @Maas2007ppp; @Vannier2017]'; + TEXT CHARACTER= 200 TAXON=122 TEXT='Anterior region with diminished annulation and absence of dorsal plates'; + TEXT CHARACTER= 200 TAXON=129 TEXT='Ambiguous; depends on whether the narrow end [@Ma2014] is interpreted as corresponding to an introvert.'; + TEXT CHARACTER= 200 TAXON=131 TEXT='Three differentiated appendages associated with sclerotized region [@Strausfeld2022]'; + TEXT CHARACTER= 200 TAXON=134 TEXT='Change in appendage construction, and possibly thickness [@Siveter2018]'; + TEXT CHARACTER= 200 TAXON=141 TEXT='Differentiated: posterior trunk lacks appendages [@Howard2020]'; + TEXT CHARACTER= 200 TAXON=143 TEXT='Short posterior trunk comprising three appendage pairs [@Caron2017]'; + TEXT CHARACTER= 202 TAXON=1 TEXT='May not be evident if epicuticle is not preserved [@Maas2009]'; + TEXT CHARACTER= 202 TAXON=21 TEXT='Absent [@Heiner2008sb]'; + TEXT CHARACTER= 202 TAXON=38 TEXT='Crowned areoles [@Bolek2013] have some resemblance to sensory spots in other taxa'; + TEXT CHARACTER= 202 TAXON=44 TEXT='Sensory structures ringed with tube-like elements, and ''ring papillae'' [@SchmidtRhaesa2022za], both recall these sensory structures'; + TEXT CHARACTER= 204 TAXON=44 TEXT='Petal-like configuration [@SchmidtRhaesa2022za, fig. 
6c]'; + TEXT CHARACTER= 206 TAXON=8 TEXT='Unclear, but cuticular elements present'; + TEXT CHARACTER= 206 TAXON=44 TEXT='Present [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 206 TAXON=103 TEXT='''may have borne surface ornamentation'' [@ConwayMorris1977]'; + TEXT CHARACTER= 206 TAXON=124 TEXT='No indication of papillae on annulations [@Dzik1989; @Jaeger2010]'; + TEXT CHARACTER= 206 TAXON=129 TEXT='Ambiguous, though trunk spines are present [@Ou2018].'; + TEXT CHARACTER= 206 TAXON=135 TEXT='Present [@Maas2007csb]'; + TEXT CHARACTER= 206 TAXON=141 TEXT='Present [@Howard2020]'; + TEXT CHARACTER= 206 TAXON=148 TEXT='Coded as ambiguous in Antennacanthopodia [@Ou2011] as its trunk annulations are not clearly apparent.'; + TEXT CHARACTER= 207 TAXON=127 TEXT='Possibly represented by the row of ''tubercles'' [@Dzik2011]'; + TEXT CHARACTER= 208 TAXON=10 TEXT='The tessellating plates [@Maas2007] satisfy the morphological criteria for inclusion as a lorica'; + TEXT CHARACTER= 208 TAXON=25 TEXT='Present [@Neves2014ode]'; + TEXT CHARACTER= 209 TAXON=95 TEXT='Specimens reach consistent large size [@Maas2007ppp] so are assumed adult'; + TEXT CHARACTER= 211 TAXON=20 TEXT='One series, interspersed with intercalar plicae [@Heiner2007hmr]'; + TEXT CHARACTER= 211 TAXON=21 TEXT='30–60 plicae [@SchmidtRhasea2013]'; + TEXT CHARACTER= 211 TAXON=95 TEXT='Single series [@Maas2007ppp]'; + TEXT CHARACTER= 212 TAXON=17 TEXT='Twenty [@Harvey2017]'; + TEXT CHARACTER= 212 TAXON=18 TEXT='Six in adults [@SchmidtRhasea2013]^n22 plicae in N. mysticus Higgins larva; 20-25 in genus [@Neves2016]'; + TEXT CHARACTER= 212 TAXON=19 TEXT='Six [@SchmidtRhasea2013]'; + TEXT CHARACTER= 212 TAXON=20 TEXT='Eight [@SchmidtRhasea2013]'; + TEXT CHARACTER= 212 TAXON=21 TEXT='30 to 60 longitudinal folds [@Fujimoto2020mb]'; + TEXT CHARACTER= 212 TAXON=22 TEXT='Twenty plicae in Higgins larva of P. orphanus, P.
gracilis [@Neves2016]; 22 or 24 in other species^n^nTwenty-two plicae in adults [generalizes @SchmidtRhasea2013]'; + TEXT CHARACTER= 212 TAXON=23 TEXT='Thirty plicae'; + TEXT CHARACTER= 212 TAXON=24 TEXT='About 46 longitudinal folds [@Fujimoto2020mb]'; + TEXT CHARACTER= 212 TAXON=45 TEXT='Large dorsal and ventral with six slender accordion-like lateral plates [@Storch1991jm]'; + TEXT CHARACTER= 212 TAXON=48 TEXT='Large dorsal and ventral plates plus six slender lateral plates [@SchmidtRhaesa2023za]'; + TEXT CHARACTER= 212 TAXON=49 TEXT='Eight in first (and second?) lorica larva [@Wennberg2009ib]'; + TEXT CHARACTER= 212 TAXON=51 TEXT='Twenty [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 212 TAXON=95 TEXT='~20; ten on visible surface [@Maas2007ppp]'; + TEXT CHARACTER= 212 TAXON=96 TEXT='Seven [@Peel2013]'; + TEXT CHARACTER= 213 TAXON=20 TEXT='Differentiated and somewhat enlarged'; + TEXT CHARACTER= 213 TAXON=49 TEXT='Somewhat distinct [@Wennberg2009ib, fig. 6B]'; + TEXT CHARACTER= 214 TAXON=8 TEXT='Triangular/conical elements evident on each annulation in GSC 45331'; + TEXT CHARACTER= 214 TAXON=10 TEXT='Papillae extending into spines or setae [@Maas2007]'; + TEXT CHARACTER= 214 TAXON=16 TEXT='The ''pits'' [@Shao2020] are interpreted as the (broken?) bases of sclerites'; + TEXT CHARACTER= 214 TAXON=21 TEXT='Occasional setae, but no lorical plates or obvious sclerites [e.g.
@Heiner2006sb]'; + TEXT CHARACTER= 214 TAXON=22 TEXT='Cuticle is folded but lacks sclerotized fields [@Gad2005za]'; + TEXT CHARACTER= 214 TAXON=24 TEXT='No sclerites posterior of the trichoscalids, borne on the neck.'; + TEXT CHARACTER= 214 TAXON=41 TEXT='Copulatory spicules and other setae [@Keppner1988tams]'; + TEXT CHARACTER= 214 TAXON=42 TEXT='Absent in Onchulus [@Swart1993] (Description of two new species of the genera Onchulus and Limonchulus from Southern Africa (Nematoda: Enoplida, Onchulinae))'; + TEXT CHARACTER= 214 TAXON=44 TEXT='Prominent on caudal appendage [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 214 TAXON=96 TEXT='Evident elements occur on the posterior introvert [@Peel2013], but the status of the trunk is uncertain; sensory setae are possibly present but not preserved'; + TEXT CHARACTER= 214 TAXON=106 TEXT='Annulations are observed, but sclerites are not [@Hu2017]; the preservation is inadequate to evaluate the possible presence of specialized sclerites'; + TEXT CHARACTER= 214 TAXON=107 TEXT='Xystoscolex clearly displays bands of plates, and sclerites on its extensive introvert [@ConwayMorris2010]; though there is no direct evidence, it is hard to rule out the possibility that diminutive sclerites occur along the trunk'; + TEXT CHARACTER= 214 TAXON=111 TEXT='No evidence of sclerites among plates [@Hu2012], but material inadequately known to exclude the possibility of e.g. 
ventral spines'; + TEXT CHARACTER= 214 TAXON=112 TEXT='Not reported or visible; only plates present [@Hou1994]'; + TEXT CHARACTER= 214 TAXON=114 TEXT='''Ruptures'' in cuticle [@Duan2012] conceivably denote sensory structures, but no evidence of robust sclerites in phosphatized specimens'; + TEXT CHARACTER= 214 TAXON=116 TEXT='No sclerites preserved despite high fidelity preservation of plates [@GarciaBellido2013]'; + TEXT CHARACTER= 214 TAXON=117 TEXT='No sclerites preserved despite high fidelity preservation of plates [@GarciaBellido2013]'; + TEXT CHARACTER= 214 TAXON=118 TEXT='No evidence of sclerites among plates [@Hu2012], but material inadequately known to exclude the possibility of e.g. ventral spines'; + TEXT CHARACTER= 214 TAXON=119 TEXT='Paired spines present [@Shi2022]'; + TEXT CHARACTER= 214 TAXON=121 TEXT='Paired dorsal sclerites [@Shi2022]'; + TEXT CHARACTER= 214 TAXON=126 TEXT='@Whittington1975 describes rows of seven ''tubercles'' with a triangular lateral profile and which bore an apical spine. They exhibit slight relief and are carbonized. We homologize these with trunk sclerites.'; + TEXT CHARACTER= 214 TAXON=127 TEXT='Difficult to interpret the single longitudinal series of ''tubercles'' [@Dzik2011], which could correspond to a row of ventral papillae (cf. 
Onychodictyon)'; + TEXT CHARACTER= 214 TAXON=130 TEXT='Claws present (but no other sclerites)'; + TEXT CHARACTER= 214 TAXON=147 TEXT='The smaller spines between the enlarged spines [@ConwayMorris1988] are treated as ''standard'' trunk sclerites that have been incorporated into the sclerotized rings, rather than a separate character as in @Yang2015 (character 48).^n'; + TEXT CHARACTER= 214 TAXON=158 TEXT='Presence of nodes plausible but impossible to establish'; + TEXT CHARACTER= 215 TAXON=11 TEXT='Seemingly present in enlarged sclerites [@ThisStudy]'; + TEXT CHARACTER= 215 TAXON=12 TEXT='Hollow, without internal elements [@Liu2019]'; + TEXT CHARACTER= 215 TAXON=14 TEXT='Hollow sclerites; suggestion of laminar construction in enlarged sclerites on annulus 9 [@Shao2020] is presumed taphonomic'; + TEXT CHARACTER= 215 TAXON=95 TEXT='Not obviously apparent [@Maas2007ppp], but quality of preservation insufficient to determine with confidence'; + TEXT CHARACTER= 215 TAXON=119 TEXT='Stacked elements [@ThisStudy]'; + TEXT CHARACTER= 215 TAXON=120 TEXT='Evident on trunk sclerites and tail spines [@ThisStudy]'; + TEXT CHARACTER= 215 TAXON=121 TEXT='Not reported [@Shi2022], presumably reflecting inadequate preservation'; + TEXT CHARACTER= 215 TAXON=126 TEXT='Aysheaia claws do not have stacked elements [@Smith2014].'; + TEXT CHARACTER= 215 TAXON=128 TEXT='Some form of internal structure evidence in @Vannier2007 fig. 5, but unclear whether this corresponds to nested elements.'; + TEXT CHARACTER= 215 TAXON=129 TEXT='Ambiguous: some elements hint at a stacked construction [@Ma2014], but it is not possible to account for preservation in the published figures.'; + TEXT CHARACTER= 215 TAXON=130 TEXT='Long, slender claws make structure difficult to determine; a plausible hint of an outer element exists in one specimen [@Vannier2017, fig. 
5e] but this interpretation is at best ambiguous.'; + TEXT CHARACTER= 215 TAXON=138 TEXT='Present [@Caron2013; @Smith2014]'; + TEXT CHARACTER= 215 TAXON=141 TEXT='Spines not described in adequate detail to evaluate presence of stacked elements [@Howard2020]'; + TEXT CHARACTER= 215 TAXON=143 TEXT='Claws comprise stacked elements [@Caron2017]'; + TEXT CHARACTER= 215 TAXON=144 TEXT='Yes, in dorsal spines [@Yang2015]'; + TEXT CHARACTER= 215 TAXON=145 TEXT='Present; see discussion in @Caron2020'; + TEXT CHARACTER= 215 TAXON=147 TEXT='Stacked elements present in spines [@Caron2020]'; + TEXT CHARACTER= 216 TAXON=4 TEXT='Sclerites are often mineralized, but are considered homologous to lophotrochozoan chaetae and thus not treated as homologous here'; + TEXT CHARACTER= 216 TAXON=11 TEXT='Enlarged and specialized sclerites only [@Zhang2022]'; + TEXT CHARACTER= 216 TAXON=16 TEXT='Specialized elements only [@Liu2014; @Shao2020]'; + TEXT CHARACTER= 216 TAXON=97 TEXT='The anterior papillae have a conical shape; the disc-like posterior papillae are likely equivalent, perhaps with a lower profile [@Howard2020]. There is no evidence of mineralization. All papillae are therefore treated as papillae rather than plates.'; + TEXT CHARACTER= 216 TAXON=106 TEXT='Annulations are observed, but sclerites are not [@Hu2017]'; + TEXT CHARACTER= 216 TAXON=107 TEXT='More or less circular; no microstructure visible [@ConwayMorris2010]'; + TEXT CHARACTER= 216 TAXON=122 TEXT='Present, if difficult to discern from e.g. @Shi2022, fig. 
4b'; + TEXT CHARACTER= 216 TAXON=124 TEXT='At least some annulae have a pustulose appearance suggestive of sclerite presence [@Dzik1989]'; + TEXT CHARACTER= 216 TAXON=125 TEXT='The angular nature of the papillae on annulations [@Budd1998p] suggests their identification as sclerotized elements'; + TEXT CHARACTER= 216 TAXON=126 TEXT='We treat the sclerites as non-enlarged given their diminutive size; the subtle nature of their preservation argues against the robust structure that often characterizes enlarged sclerites.'; + TEXT CHARACTER= 216 TAXON=127 TEXT='Difficult to interpret the single longitudinal series of ''tubercles'' [@Dzik2011], which conceivably correspond to trunk sclerites'; + TEXT CHARACTER= 216 TAXON=128 TEXT='"Finger-like papillae" [@Ou2012] have a spine-shaped outline and likely correspond to trunk sclerites in other taxa'; + TEXT CHARACTER= 216 TAXON=133 TEXT='Probably represented by ''tubercles'' [@Liu2008]'; + TEXT CHARACTER= 216 TAXON=134 TEXT='Seemingly absent '; + TEXT CHARACTER= 216 TAXON=136 TEXT='Specialized elements only [@Zhang2016bl]'; + TEXT CHARACTER= 216 TAXON=142 TEXT='Insufficiently preserved to evaluate [@Ma2009; @Ma2012]'; + TEXT CHARACTER= 216 TAXON=143 TEXT='Absent, except for spines on anterior appendages [@Caron2017].'; + TEXT CHARACTER= 216 TAXON=144 TEXT='Scored as absent: the papillae and hair-like setae are sparsely distributed [@Yang2015] and so coded as specialized.'; + TEXT CHARACTER= 216 TAXON=145 TEXT='Annulae seemingly without sclerites, which may nonetheless be present on anterior appendages [@Caron2020]'; + TEXT CHARACTER= 216 TAXON=147 TEXT='Spinose sclerites present on annulae [@Caron2020, fig. 
6b]'; + TEXT CHARACTER= 216 TAXON=148 TEXT='Small conical spines evident in rings [@Ou2011]'; + TEXT CHARACTER= 217 TAXON=107 TEXT='No clear evidence of phosphatization but difficult to evaluate from preservational mode'; + TEXT CHARACTER= 217 TAXON=110 TEXT='Prominent three-dimensional relief and dark colouration [@huang] hints at an originally phosphatic composition '; + TEXT CHARACTER= 217 TAXON=123 TEXT='Though not reported, the surface exhibits a plate-like texture [@Han2007pr, fig. 1.8] and in regions seems to preserve with relief [@Han2007pr, fig. 1.7], hinting at the possible presence of trunk sclerites'; + TEXT CHARACTER= 218 TAXON=98 TEXT='Subtriangular elements [@Ma2014jp]'; + TEXT CHARACTER= 218 TAXON=104 TEXT='Elongate [@Smith2015p]'; + TEXT CHARACTER= 218 TAXON=105 TEXT='Triangular [@Yang2021]'; + TEXT CHARACTER= 218 TAXON=109 TEXT='Triangular projections'; + TEXT CHARACTER= 218 TAXON=110 TEXT='Reconstructed as triangular, but preserved sclerites are flat discs [@Huang2004, fig. 3c]'; + TEXT CHARACTER= 218 TAXON=125 TEXT='Seemingly conical [@Budd1998p]'; + TEXT CHARACTER= 218 TAXON=126 TEXT='Triangular [@Whittington1978]'; + TEXT CHARACTER= 218 TAXON=133 TEXT='Seemingly triangular in profile [see pair in top left corner of @Liu2008, fig. 2A6]'; + TEXT CHARACTER= 219 TAXON=107 TEXT='Potential nodes on sclerites (sets of four) in @ConwayMorris2010, fig. 5B, but these are not mentioned in text and inadequately figured to support a decisive scoring.'; + TEXT CHARACTER= 219 TAXON=108 TEXT='Some surface texture [@ConwayMorris2010]; unclear whether this corresponds to nodes.'; + TEXT CHARACTER= 219 TAXON=110 TEXT='Five to ten nodes around a central node [@Huang2004, fig. 1c]'; + TEXT CHARACTER= 219 TAXON=119 TEXT='Plates with single node [@Yang2020]'; + TEXT CHARACTER= 219 TAXON=122 TEXT='Four nodes in single ring, where this can be determined from @Shi2022, fig. 
4'; + TEXT CHARACTER= 220 TAXON=114 TEXT='Prominent single central boss with three to four nodes [@Duan2012]'; + TEXT CHARACTER= 220 TAXON=118 TEXT='Single ring of four to six nodes [@Hu2012]'; + TEXT CHARACTER= 220 TAXON=119 TEXT='Single node [@Yang2020]'; + TEXT CHARACTER= 221 TAXON=122 TEXT='No cases that obviously don''t have four, but images in @Shi2022 insufficient to determine with confidence.'; + TEXT CHARACTER= 223 TAXON=121 TEXT='Four nodes [@ThisStudy]'; + TEXT CHARACTER= 224 TAXON=98 TEXT='Possible distinction of posterior band [@Ma2014jp] is not considered equivalent, if this is indeed not a taphonomic feature.'; + TEXT CHARACTER= 224 TAXON=104 TEXT='Posterior not visible'; + TEXT CHARACTER= 224 TAXON=105 TEXT='Posterior not visible'; + TEXT CHARACTER= 224 TAXON=108 TEXT='Spinose anterior region [@ConwayMorris2010] interpreted as anterior trunk rather than introvert, by comparison with Acosmia [@Howard2020]'; + TEXT CHARACTER= 224 TAXON=125 TEXT='Anterior missing'; + TEXT CHARACTER= 225 TAXON=126 TEXT='Seemingly restricted to dorsal surface [@Whittington1978]'; + TEXT CHARACTER= 225 TAXON=128 TEXT='Ventral disposition unknown'; + TEXT CHARACTER= 225 TAXON=129 TEXT='Seemingly complete; certainly spanning the width of the body [@Ou2018]'; + TEXT CHARACTER= 225 TAXON=148 TEXT='Completely encircling appendages [@Ou2011]'; + TEXT CHARACTER= 226 TAXON=7 TEXT='Regular series in quincunx'; + TEXT CHARACTER= 226 TAXON=11 TEXT='Tubules and enlarged plates'; + TEXT CHARACTER= 226 TAXON=98 TEXT='Transverse fields in at least the posterior region of the trunk [@Ma2014jp]'; + TEXT CHARACTER= 226 TAXON=107 TEXT='Transverse rows evident in posterior trunk [@ConwayMorris2010, fig. 
5B]'; + TEXT CHARACTER= 226 TAXON=122 TEXT='Following @Shi2022, under the interpretation presented by @ThisStudy'; + TEXT CHARACTER= 226 TAXON=126 TEXT='Regular transverse rows [@Whittington1978]'; + TEXT CHARACTER= 226 TAXON=141 TEXT='Along annular rings [@Howard2020cb]'; + TEXT CHARACTER= 226 TAXON=148 TEXT='On appendages only [@Ou2011]'; + TEXT CHARACTER= 227 TAXON=111 TEXT='One field comprising three rows of plates per annulation [@Hu2008]'; + TEXT CHARACTER= 227 TAXON=122 TEXT='Irregular [@Shi2022, fig. 4]'; + TEXT CHARACTER= 228 TAXON=111 TEXT='One field comprising three rows of plates per annulation [@Hu2008]'; + TEXT CHARACTER= 229 TAXON=12 TEXT='Disorderly [@Liu2019]'; + TEXT CHARACTER= 229 TAXON=14 TEXT='Number of sclerites increases in line with trunk circumference [@Shao2020]'; + TEXT CHARACTER= 229 TAXON=15 TEXT='We interpret the flat subrectangular elements as an expression of cuticular structure, rather than distinct sclerites. Distinct sclerites display an inexact correspondence between subsequent rows in the type material [@Zhang2015], and in other material ascribed to the genus [@Wang2025, fig. 2F]'; + TEXT CHARACTER= 229 TAXON=108 TEXT='No evidence of correspondence [@ConwayMorris2010]'; + TEXT CHARACTER= 229 TAXON=109 TEXT='No evidence of correspondence [@Smith2015]'; + TEXT CHARACTER= 229 TAXON=110 TEXT='Prominent quincunx [@Huang2004]'; + TEXT CHARACTER= 229 TAXON=123 TEXT='Alignment similar between rings but number of spines not consistent, so not forming rows along the trunk [@Han2007]'; + TEXT CHARACTER= 229 TAXON=126 TEXT='Difficult to evaluate'; + TEXT CHARACTER= 230 TAXON=108 TEXT='Seemingly represented by polygonal texture [e.g. 
@ConwayMorris2010 fig. 6d]'; + TEXT CHARACTER= 230 TAXON=110 TEXT='Seemingly absent but SEM required to verify'; + TEXT CHARACTER= 230 TAXON=111 TEXT='Platelets considered absent [@Hu2008], but do seem to be evident (subtly) in figures; we attribute their diminished prominence to the manner of preservation.'; + TEXT CHARACTER= 230 TAXON=117 TEXT='Smaller plates irregularly dispersed [@GarciaBellido2013]'; + TEXT CHARACTER= 230 TAXON=118 TEXT='Platelets not preserved, in contrast to co-occurring Wudingscolex [@Hu2012]. Larger plates (''protuberances'') are present [@Hu2012].'; + TEXT CHARACTER= 230 TAXON=119 TEXT='Platelets present [@Yang2020]'; + TEXT CHARACTER= 232 TAXON=1 TEXT='Paired spines at anterior of lorica, plus pair at posterior in larger ?semaphront [@Maas2009aap]'; + TEXT CHARACTER= 232 TAXON=15 TEXT='''Small spines'' [@Zhang2015]'; + TEXT CHARACTER= 232 TAXON=28 TEXT='Various setae and tubes [@Rucci2020z]'; + TEXT CHARACTER= 232 TAXON=38 TEXT='''Thorns'' [@Bolek2013]'; + TEXT CHARACTER= 232 TAXON=95 TEXT='Not obviously apparent [@Maas2007ppp], but quality of preservation insufficient to determine with confidence'; + TEXT CHARACTER= 232 TAXON=97 TEXT='Ambiguous: not reported, but preservation does not exclude the presence of diminutive elements'; + TEXT CHARACTER= 232 TAXON=98 TEXT='Interpreted as present based on mid-trunk sclerites with setal traces [@Ma2014jp]'; + TEXT CHARACTER= 232 TAXON=135 TEXT='Specialized spines borne on papillae [@Maas2007csb]'; + TEXT CHARACTER= 232 TAXON=136 TEXT='Individual sclerites present on trunk [@Zhang2016]'; + TEXT CHARACTER= 232 TAXON=177 TEXT='Euarthropod claws are interpreted as specializations of the appendage sclerotization rather than homologues of epidermal sclerites'; + TEXT CHARACTER= 232 TAXON=178 TEXT='Euarthropod claws are interpreted as specializations of the appendage sclerotization rather than homologues of epidermal sclerites'; + TEXT CHARACTER= 232 TAXON=179 TEXT='Euarthropod claws are interpreted 
as specializations of the appendage sclerotization rather than homologues of epidermal sclerites'; + TEXT CHARACTER= 232 TAXON=180 TEXT='Euarthropod claws are interpreted as specializations of the appendage sclerotization rather than homologues of epidermal sclerites'; + TEXT CHARACTER= 233 TAXON=95 TEXT='Not obviously apparent [@Maas2007ppp], but quality of preservation insufficient to determine with confidence'; + TEXT CHARACTER= 234 TAXON=10 TEXT='Small structures could be setae or papillae [@Maas2007]'; + TEXT CHARACTER= 234 TAXON=95 TEXT='Not obviously apparent [@Maas2007ppp], but quality of preservation insufficient to determine with confidence'; + TEXT CHARACTER= 234 TAXON=117 TEXT='The isolated small sclerites [@GarciaBellido2013] are treated as microplates rather than sclerites.'; + TEXT CHARACTER= 236 TAXON=15 TEXT='Occurring in irregularly spaced bilateral pairs [@Zhang2015]'; + TEXT CHARACTER= 236 TAXON=25 TEXT='The lorica field does not comprise enlarged plicae [@Neves2014]'; + TEXT CHARACTER= 236 TAXON=121 TEXT='Present [@Han2007app; @Steiner2012]'; + TEXT CHARACTER= 236 TAXON=123 TEXT='Given the possible presence of palaeoscolecid-like plates between the spine rows [@Han2007pr, fig. 1.8], it is possible that the spines are best interpreted as enlarged sclerites. We code these as ambiguous pending further information on Tylotites.'; + TEXT CHARACTER= 236 TAXON=126 TEXT='Paucipodia [@Chen1995trse] and Aysheaia [@Liu2014ppp, fig. 1] have been reported to bear subtle sub-circular specializations, but these putative structures in fact represent flattened appendages [@Hou2004; @Yang2015].'; + TEXT CHARACTER= 236 TAXON=127 TEXT='Impressions of the dorsal and ventral surfaces are interpreted as evident on the single specimen; neither surface shows evidence of epidermal specializations [@Dzik2011]'; + TEXT CHARACTER= 236 TAXON=129 TEXT='Coded as present based on the shield-like specializations associated with each leg pair [@Ma2014jsp, fig. 
2].'; + TEXT CHARACTER= 236 TAXON=130 TEXT='Paucipodia [@Chen1995trse] and Aysheaia [@Liu2014ppp, fig. 1] have been reported to bear subtle sub-circular specializations, but these putative structures in fact represent flattened appendages [@Hou2004; @Yang2015].'; + TEXT CHARACTER= 236 TAXON=137 TEXT='Ambiguous, as the dorsal surface is not visible in the available material [@Haug2012cb]'; + TEXT CHARACTER= 236 TAXON=141 TEXT='Not evident [@Howard2020]'; + TEXT CHARACTER= 236 TAXON=143 TEXT='We interpret the ''gut diverticula'' described by @Caron2017 to be dorsal epidermal evaginations. As @Caron2017 point out, these features are located above limb pairs. Their additional file 1 shows these features are pointed dorsally and rounded ventrally, and exhibit more consistent shape. They also overprint the annulations in their additional file 4 panel c, consistent with being an external feature - and where they do so they exhibit a well-defined gut margin. These features have a paired appearance in Additional file 2 panel b.^n^nFurthermore, elemental mapping in their fig 1C shows no hint of a gut characterization in the posteriormost element (see also panel D, E in their additional file 6). Instead, they are associated with elevated concentrations of carbon, as are the claws; see the carbon distribution in additional file 6 panel A. 
They extend beyond the body wall (additional figure 9), indicating an external feature.'; + TEXT CHARACTER= 236 TAXON=157 TEXT='Not evident in well-preserved specimens of @Budd1998ar or @Young2017.'; + TEXT CHARACTER= 237 TAXON=11 TEXT='Single medial row (albeit with seemingly irregular spacing) [@Zhang2022]'; + TEXT CHARACTER= 237 TAXON=13 TEXT='Only a single large sclerite is known; the opposite side of the trunk is missing [@Liu2019]'; + TEXT CHARACTER= 237 TAXON=15 TEXT='Large sclerites occurs in pairs, with front of large sclerites aligned with annulations 1, 5, 10, 15, 19; pair two consistently more ventral than other pairs [@Zhang2015]'; + TEXT CHARACTER= 238 TAXON=11 TEXT='The ''caudal'' sclerites seem to occur on the dorsal surface (as defined by the central columns of enlarged sclerites) and do not obviously surround the anus [@Zhang2022]. They are thus treated as belonging to dorsal bands of sclerites. They seem to be slightly offset from the central sclerites; hence this state is coded ambiguous to denote two sclerites if the lateral sclerites form separate bands, or three sclerites per row if the sclerites are lateral to the medial sclerite in a single band.'; + TEXT CHARACTER= 238 TAXON=13 TEXT='Only a single large sclerite is known; as the opposite side of the trunk is missing, it is possible that a second sclerite is present [@Liu2019]'; + TEXT CHARACTER= 238 TAXON=91 TEXT='Halobiotus (Eutardigrada) has paired epidermal specialisations (depressions), represented by pits that serve as muscle attachment sites [@Halberg2009; @Marchioro2013]'; + TEXT CHARACTER= 238 TAXON=131 TEXT='Single element, potentially representing two fused elements [@Strausfeld2022]'; + TEXT CHARACTER= 238 TAXON=134 TEXT='Two papillae reported per leg pair, with additional in between leg pairs [@Siveter2018]'; + TEXT CHARACTER= 238 TAXON=145 TEXT='Three [@Caron2020]'; + TEXT CHARACTER= 239 TAXON=14 TEXT='Symmetrical pairs of enlarged sclerites on annulae 7 and 9; single medial 
element on annulus 12 [@Shao2020]'; + TEXT CHARACTER= 239 TAXON=15 TEXT='Every three to five annulations'; + TEXT CHARACTER= 239 TAXON=121 TEXT='Every annulation [@Shi2022]'; + TEXT CHARACTER= 239 TAXON=122 TEXT='On every other annulation [@Shi2022]'; + TEXT CHARACTER= 240 TAXON=11 TEXT='First sclerite widely separated from later bands [@Zhang2022]'; + TEXT CHARACTER= 240 TAXON=120 TEXT='Sub-regular [@ThisStudy]'; + TEXT CHARACTER= 240 TAXON=122 TEXT='Alternate annulations [@Shi2022]'; + TEXT CHARACTER= 242 TAXON=11 TEXT='Differing [@Zhang2022]'; + TEXT CHARACTER= 242 TAXON=125 TEXT='Not all dorsal specialisations present, as trunk incomplete in @Budd1998p, therefore coded as ambiguous (applicable).'; + TEXT CHARACTER= 242 TAXON=135 TEXT='Orstenotubulus has prominent spines and buttresses above some leg pairs, but these are profoundly diminished above others [@Maas2007csb].'; + TEXT CHARACTER= 244 TAXON=11 TEXT='Wider than tall (at least anteriorly) [@Zhang2022]'; + TEXT CHARACTER= 244 TAXON=134 TEXT='Equant [@Siveter2018], so coded ambiguous'; + TEXT CHARACTER= 245 TAXON=13 TEXT='Truncated but clearly evident originally [@Liu2019]'; + TEXT CHARACTER= 245 TAXON=121 TEXT='Present [@Shi2022]'; + TEXT CHARACTER= 245 TAXON=124 TEXT='Dorsal spine [@Jaeger2010]'; + TEXT CHARACTER= 245 TAXON=125 TEXT='Seemingly absent [@Budd1998p]; no central ''pore'' as in Xenusion [@Jaeger2010]'; + TEXT CHARACTER= 245 TAXON=131 TEXT='Interpreted as having a pointed apex [@Hou1991; @Liu2014], but whilst certain aspects of the spine have an angular silhouette, there is no distinct pointed apex [@Strausfeld2022]'; + TEXT CHARACTER= 245 TAXON=135 TEXT='Ambiguously preserved [@Maas2007]'; + TEXT CHARACTER= 246 TAXON=10 TEXT='Gently curved posteriad [@Maas2007]'; + TEXT CHARACTER= 246 TAXON=11 TEXT='Modest curvature [@Zhang2022]'; + TEXT CHARACTER= 246 TAXON=12 TEXT='Limited curvature, if any [@Liu2019]'; + TEXT CHARACTER= 246 TAXON=120 TEXT='Absent in Cricocosmia n. sp. 
[@ThisStudy]'; + TEXT CHARACTER= 246 TAXON=121 TEXT='Simple cones [@Shi2022]'; + TEXT CHARACTER= 246 TAXON=128 TEXT='Figs 1B.1, 2C in @Liu2008app show O. ferox with a straight distal termination in epidermal evagination.'; + TEXT CHARACTER= 246 TAXON=133 TEXT='@Liu2008app figures 3a, b depict a curved morphology; however, as fossil photographs do not convincingly demonstrate this interpretation, we code as ambiguous.'; + TEXT CHARACTER= 246 TAXON=138 TEXT='The spines of Hallucigenia sparsa are gently curved [@Smith2014; @Smith2015].'; + TEXT CHARACTER= 247 TAXON=17 TEXT='Anterior margin of lorica plate straight'; + TEXT CHARACTER= 247 TAXON=18 TEXT='Lorica plates rectangular with straight margin and round corners'; + TEXT CHARACTER= 247 TAXON=19 TEXT='Spike present on anterior margin of lorical plate'; + TEXT CHARACTER= 247 TAXON=95 TEXT='Spikes on anterior lorical plate [@Maas2007ppp]'; + TEXT CHARACTER= 247 TAXON=125 TEXT='Not angular [@Budd1998p]'; + TEXT CHARACTER= 248 TAXON=124 TEXT='Substantial relief, with spines'; + TEXT CHARACTER= 248 TAXON=134 TEXT='Not obviously sclerotized [@Siveter2018]'; + TEXT CHARACTER= 250 TAXON=10 TEXT='Spines/setae but no ornament [@Maas2007]'; + TEXT CHARACTER= 250 TAXON=18 TEXT='Honeycomb ornament on lorical plates [@Neves2016, fig. 17]'; + TEXT CHARACTER= 250 TAXON=19 TEXT='Honeycomb pattern in Higgins larva lorica plates [@Neves2016za, fig. 17] but unornamented in adult [@Neves2016za]'; + TEXT CHARACTER= 250 TAXON=69 TEXT='Actinarctus sclerites exhibit a polygonal ornament, but the indentations do not penetrate the sclerites [@Marchioro2013].'; + TEXT CHARACTER= 250 TAXON=95 TEXT='Seemingly unornamented [@Maas2007ppp]'; + TEXT CHARACTER= 250 TAXON=128 TEXT='@Liu2008app, fig. 2B, shows a net like texture of sclerite ornaments for O. ferox, similar to those described by @Topper2013 in Onychodictyon sp. 
plates'; + TEXT CHARACTER= 250 TAXON=131 TEXT='Regular polygonal pattern [@Liu2014], but undetermined whether these polygons penetrate the sclerites or whether their distribution corresponds to Microdictyon / Onychodictyon plates.'; + TEXT CHARACTER= 250 TAXON=133 TEXT='Coded ambiguous as the texture is difficult to discern from the figures of @Liu2008app'; + TEXT CHARACTER= 250 TAXON=134 TEXT='Thanahita exhibits a distinct tuft-like morphology [@Siveter2018]'; + TEXT CHARACTER= 250 TAXON=139 TEXT='Unclear from @Hou1995zjls, but clearly not inapplicable.'; + TEXT CHARACTER= 250 TAXON=140 TEXT='A honeycomb-like pattern that seems to be a surface ornament, but conceivably forms net-like holes [@Steiner2012]'; + TEXT CHARACTER= 250 TAXON=144 TEXT='"The dorsolateral spines of Collinsium have a distinctive punctate-like ornamentation similar to that of H. hongmeia" [@Yang2015]; hence coded per that taxon.^n'; + TEXT CHARACTER= 251 TAXON=69 TEXT='Absent [@Marchioro2013]'; + TEXT CHARACTER= 251 TAXON=121 TEXT='Absent [@Shi2022]'; + TEXT CHARACTER= 251 TAXON=128 TEXT='Likely evident; not obvious in articulated material [@Steiner2012, fig. 8], consistent with the diminutive stature of the feature in isolated plates [@Topper2013]'; + TEXT CHARACTER= 251 TAXON=133 TEXT='Presumed present based on presence in disarticulated Onychodictyon sp. [@Topper2013]'; + TEXT CHARACTER= 251 TAXON=138 TEXT='Whilst it is conceivable that the spinose ornament on H. fortis spines corresponds to bosses of an originally net-like sclerite, we do not consider there to be sufficient evidence to treat these as homologous here.'; + TEXT CHARACTER= 251 TAXON=140 TEXT='Seemingly evident as carbon-enriched spots in elemental maps [@Steiner2012, fig. 
7H]'; + TEXT CHARACTER= 254 TAXON=152 TEXT='Coded ambiguous: not present at larval stage [@Smith2023n], but dorsal extensions of the haemolymph system are plausible precursors of a feature that may be added in an adult stage with different metabolic requirements.'; + TEXT CHARACTER= 257 TAXON=152 TEXT='Coded as absent, reflecting the absence of any indication of flaps, despite expression of appendages [@Smith2023n] – though it remains possible that these structures were not expressed until a later instar.'; + TEXT CHARACTER= 257 TAXON=156 TEXT='Setal blades are expressed as wrinkles on the dorsal flaps of gilled lobopodians [@VanRoy2015]'; + TEXT CHARACTER= 257 TAXON=157 TEXT='Setal blades are expressed as wrinkles on the dorsal flaps of gilled lobopodians [@VanRoy2015]'; + TEXT CHARACTER= 257 TAXON=166 TEXT='Coded as absent by @VanRoy2015'; + TEXT CHARACTER= 257 TAXON=168 TEXT='Only a single series of lateral flaps is reconstructed [@Moysiuk2019]'; + TEXT CHARACTER= 257 TAXON=169 TEXT='Coded ambiguous: the presence of dorsal and ventral flaps is tentatively interpreted by @VanRoy2015, though @Moysiuk2019 consider them absent'; + TEXT CHARACTER= 257 TAXON=171 TEXT='@VanRoy2015 identify "clear evidence" of two sets of flaps, though @Moysiuk2019 consider the evidence equivocal. 
We thus take the conservative position of coding this taxon ambiguous.'; + TEXT CHARACTER= 258 TAXON=166 TEXT='Coded as present to reflect proposed homology of gnathobasic endites with those of euarthropods [@Cong2017]'; + TEXT CHARACTER= 259 TAXON=168 TEXT='Crossing the body [@Moysiuk2019]'; + TEXT CHARACTER= 260 TAXON=127 TEXT='Appendages not completely preserved [@Dzik2011], so coded ambiguous'; + TEXT CHARACTER= 260 TAXON=128 TEXT='Neither entirely slender and cylindrical or conical'; + TEXT CHARACTER= 260 TAXON=129 TEXT='Minimal tapering [@Ou2018]'; + TEXT CHARACTER= 260 TAXON=130 TEXT='Figure 5a in @Vannier2017 establishes that lobopods, when oriented parallel to bedding, are cylindrical.'; + TEXT CHARACTER= 260 TAXON=144 TEXT='Ambiguous [@Yang2015]'; + TEXT CHARACTER= 260 TAXON=153 TEXT='Preservation inadequate to distinguish [@Liu2007az]'; + TEXT CHARACTER= 261 TAXON=127 TEXT='Coded as uncertain because its limbs are poorly preserved [@Dzik2011]. '; + TEXT CHARACTER= 261 TAXON=129 TEXT='Spines are treated as equivalent to t'; + TEXT CHARACTER= 261 TAXON=133 TEXT='We code O. gracilis as uncertain as its longitudinal series of dot-like structures [@Liu2008csb fig. 2A6] could indicate an organization of appendicules similar to those of O. ferox [see @Ou2012, fig. 2a]. '; + TEXT CHARACTER= 261 TAXON=141 TEXT='Double series of Luolishania-like spines [@Howard2020]'; + TEXT CHARACTER= 261 TAXON=148 TEXT='Not distinct from possible trunk sclerites'; + TEXT CHARACTER= 261 TAXON=167 TEXT='Anomalocaris is treated as uncertain [@VanRoy2015]. '; + TEXT CHARACTER= 261 TAXON=169 TEXT='Absent [@VanRoy2015]'; + TEXT CHARACTER= 261 TAXON=171 TEXT='Absent [@VanRoy2015]'; + TEXT CHARACTER= 262 TAXON=55 TEXT='Heterotardigrades have a spine-like sensory organ on the trunk limbs. See character 36 in @Khim2023. 
'; + TEXT CHARACTER= 262 TAXON=143 TEXT='Two series of spines, arranged in chevrons [@Caron2017]'; + TEXT CHARACTER= 265 TAXON=124 TEXT='Appendage-parallel banding present [@Jaeger2010]'; + TEXT CHARACTER= 265 TAXON=126 TEXT='A small number of possible cases [@Whittington1975], but not convincingly demonstrated.'; + TEXT CHARACTER= 265 TAXON=141 TEXT='Seemingly present [@Howard2020]'; + TEXT CHARACTER= 265 TAXON=143 TEXT='Absent [@Caron2017]'; + TEXT CHARACTER= 265 TAXON=148 TEXT='Unclear whether spines borne on papillae'; + TEXT CHARACTER= 267 TAXON=125 TEXT='Ambiguous; sclerotized elements may account for the angular termination of the papillae'; + TEXT CHARACTER= 269 TAXON=120 TEXT='Coded as ambiguous: the potential homology between the pair of terminal hooks of Cricocosmia and the similarly-shaped claws on trunk appendages [@Steiner2012] is difficult to evaluate.'; + TEXT CHARACTER= 269 TAXON=124 TEXT='Seemingly absent [@Dzik1989; @Jaeger2010]'; + TEXT CHARACTER= 269 TAXON=126 TEXT='The lobopod claws of Aysheaia are sub-terminal; the lobopods extend beyond the claws [@Whittington1978]'; + TEXT CHARACTER= 269 TAXON=129 TEXT='The appendages terminate in sclerites of equivalent construction to those that adorn the rest of the appendage [@Liu2011; @Ma2014jsp; @Ou2018]. 
Because claws are likely homologous with trunk sclerites, we code this transformation series as present to reflect the possible homology with claws of other taxa.'; + TEXT CHARACTER= 269 TAXON=130 TEXT='Simple elongate claws [@Vannier2017]'; + TEXT CHARACTER= 269 TAXON=134 TEXT='Present [@Siveter2018]'; + TEXT CHARACTER= 269 TAXON=138 TEXT='Hallucigenia sparsa is coded with two claws as this is the state on most trunk limbs, even if a second claw is not evident on the posteriormost appendages [@Smith2015].'; + TEXT CHARACTER= 269 TAXON=141 TEXT='The "trunk spines" [@Howard2020] are interpreted as corresponding to terminal claws on vestigial trunk limbs'; + TEXT CHARACTER= 269 TAXON=142 TEXT='Coded as present (one claw) as this represents the state of its typical trunk limbs. Spinose elements on its anterior limbs do not exhibit a claw-like morphology and may represent cirri rather than claws.'; + TEXT CHARACTER= 269 TAXON=148 TEXT='The sclerotized ''pads'' [@Ou2011] are positionally and compositionally equivalent to claws in other taxa'; + TEXT CHARACTER= 269 TAXON=152 TEXT='As claws are not evident until a rather late stage of onychophoran development [@Walker2004], we cannot be confident that their absence in YKLP 12387 [@Smith2023n] reflects the adult condition.'; + TEXT CHARACTER= 269 TAXON=153 TEXT='Jianshanopodia [@Liu2006] and Megadictyon [@Liu2007az] are also coded as uncertain as the preservation of the type material does not allow the presence or absence of terminal claws to be confirmed. '; + TEXT CHARACTER= 269 TAXON=154 TEXT='Jianshanopodia [@Liu2006] and Megadictyon [@Liu2007az] are also coded as uncertain as the preservation of the type material does not allow the presence or absence of terminal claws to be confirmed. 
'; + TEXT CHARACTER= 269 TAXON=163 TEXT='Absent, following @Budd2012'; + TEXT CHARACTER= 269 TAXON=167 TEXT='Coded ambiguous, as there is no definitive information on the presence of lobopodous limbs or a second set of flaps [@VanRoy2015]. '; + TEXT CHARACTER= 269 TAXON=177 TEXT='Leanchoilia is coded as ambiguous for one or three claws to reflect the conflicting interpretations of @Garcia2007 and @Haug2012bmceb.'; + TEXT CHARACTER= 270 TAXON=91 TEXT='Eutardigrades have a two-branched claw with differing morphologies; however, the base of most claws appears enlarged [including Halobiotidae, Doryphoribiidae, Eohypsibiidae, Richtersiidae; see @Gasiorek2019], hence we code this as present for this taxon.'; + TEXT CHARACTER= 270 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 270 TAXON=126 TEXT='Enlarged base; figured in supplementary material of @Smith2014'; + TEXT CHARACTER= 270 TAXON=130 TEXT='Paucipodia''s claws do not have an enlarged base [@Vannier2017]'; + TEXT CHARACTER= 270 TAXON=131 TEXT='No enlarged base [@Ramskold1998]'; + TEXT CHARACTER= 270 TAXON=133 TEXT='The claws of Onychodictyon gracilis appear to have an enlarged base [see @Liu2008app, fig 2A6], although few other claws have been described.'; + TEXT CHARACTER= 270 TAXON=138 TEXT='Hallucigenia''s claws do not have an enlarged base, with similar curvature through the length of the claw [@Smith2014].'; + TEXT CHARACTER= 270 TAXON=142 TEXT='Enlarged base in the claws of posterior lobopods [@Ma2009, figure 10]'; + TEXT CHARACTER= 270 TAXON=146 TEXT='Enlarged base [@Garcia2013]'; + TEXT CHARACTER= 270 TAXON=177 TEXT='No enlarged base [@Garcia2007]'; + TEXT CHARACTER= 270 TAXON=178 TEXT='see @Briggs1999'; + TEXT CHARACTER= 271 TAXON=62 TEXT='Neoarctus has sub-terminal claws. See @Fontoura2017.'; + TEXT CHARACTER= 271 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 271 TAXON=130 TEXT='@Vannier2017 (fig. 
5a, 5b) indicate Paucipodia''s claws are sub-terminal; however, there is a possibility that this is taphonomic [cf. @Murdock2014], as the musculature attached to the claws may have shrunk relative to the cuticle, giving the false impression of sub-terminal claws. As previous studies describe the claws as terminal [@Hou2004], we code claw position as ambiguous.'; + TEXT CHARACTER= 272 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 277 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 277 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 278 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 279 TAXON=94 
TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 280 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 281 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 282 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 
TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 283 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 284 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 285 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 286 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT 
CHARACTER= 287 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 287 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 288 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 289 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=84 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=90 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 290 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 291 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 291 TAXON=128 TEXT='A pair of claws 
occurs on each appendage [@Liu2008app]'; + TEXT CHARACTER= 291 TAXON=129 TEXT='Inapplicable as claws not yet differentiated from appendage sclerites'; + TEXT CHARACTER= 291 TAXON=130 TEXT='@Hou2004 report that each lobopod carries two claws, whereas @Vannier2017 only observe a single claw on the two complete and exceptionally well preserved appendages of ELI-JS0001A. We interpret this discrepancy as representing variation in claw number between appendages.'; + TEXT CHARACTER= 291 TAXON=131 TEXT='Cardiodictyon unambiguously has two claws on each leg [@Ramskold1998]'; + TEXT CHARACTER= 291 TAXON=132 TEXT='Two claws. Though @Hou1995zjls observe a single claw, they leave open the possibility of a second; @Liu2008app interpret the presence of two claws.'; + TEXT CHARACTER= 291 TAXON=133 TEXT='A pair of claws is evident in one appendage [@Liu2008app], and taken to represent the typical number.'; + TEXT CHARACTER= 291 TAXON=134 TEXT='One on some appendages, two on others [@Siveter2018]'; + TEXT CHARACTER= 291 TAXON=138 TEXT='Two claws on anterior trunk appendages, one on posterior [@Smith2015]'; + TEXT CHARACTER= 291 TAXON=140 TEXT='Single claw [@Steiner2012]'; + TEXT CHARACTER= 291 TAXON=142 TEXT='Only one claw is observed on the unmodified (i.e. posterior) trunk limbs [@Ma2009]. Spinose elements on anterior limbs do not exhibit a claw-like morphology and may represent cirri rather than claws.'; + TEXT CHARACTER= 291 TAXON=143 TEXT='The two "claws" on anterior limbs have the same shape and elemental composition as cirri [@Caron2017], so are not treated as homologous with claws. 
The posterior appendages each bear a single claw [@Caron2017]'; + TEXT CHARACTER= 291 TAXON=144 TEXT='No claws on anterior appendages; single claw on each posterior appendage [@Yang2015]'; + TEXT CHARACTER= 291 TAXON=146 TEXT='Single claw present on posterior appendages; claws are not apparent on cirrate anterior appendages [@Garcia2013]'; + TEXT CHARACTER= 291 TAXON=177 TEXT='Coded as ambiguous (one or three claws) to reflect the conflicting interpretations of @Garcia2007 and @Haug2012bmceb'; + TEXT CHARACTER= 292 TAXON=120 TEXT='Equal number [@Dhungana2023]'; + TEXT CHARACTER= 292 TAXON=128 TEXT='A pair of claws occurs on each appendage [@Liu2008app]'; + TEXT CHARACTER= 292 TAXON=130 TEXT='@Hou2004 report that each lobopod carries two claws, whereas @Vannier2017 only observe a single claw on the two complete and exceptionally well preserved appendages of ELI-JS0001A. We interpret this discrepancy as representing variation in claw number between appendages.'; + TEXT CHARACTER= 292 TAXON=131 TEXT='Cardiodictyon unambiguously has two claws on each leg [@Ramskold1998]'; + TEXT CHARACTER= 292 TAXON=133 TEXT='Only one appendage is adequately preserved to identify claws [@Liu2008app]'; + TEXT CHARACTER= 292 TAXON=134 TEXT='One on some appendages, two on others [@Siveter2018]'; + TEXT CHARACTER= 292 TAXON=138 TEXT='Two claws on anterior trunk appendages, one on posterior [@Smith2015]'; + TEXT CHARACTER= 292 TAXON=143 TEXT='Two claws on anterior limbs, one on posterior [@Caron2017]'; + TEXT CHARACTER= 292 TAXON=146 TEXT='Single claw presumed on all posterior appendages'; + TEXT CHARACTER= 293 TAXON=52 TEXT='Euperipatoides claws are identical on trunk limbs, although the jaw elements are differentiated [@Smith2014].'; + TEXT CHARACTER= 293 TAXON=85 TEXT='Treated as similar, after @Mapalo2024cb [contra @Kihm2023]'; + TEXT CHARACTER= 293 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 293 TAXON=126 TEXT='All seven claws are identical [@Whittington1978].'; + TEXT 
CHARACTER= 293 TAXON=128 TEXT='Onychodictyon ferox has a large and a small claw [@Steiner2012, fig. 8] '; + TEXT CHARACTER= 293 TAXON=130 TEXT='Not visibly differentiated [@Hou2004]'; + TEXT CHARACTER= 293 TAXON=138 TEXT='Not visibly differentiated [@Smith2015]'; + TEXT CHARACTER= 294 TAXON=152 TEXT='Ambiguous as distal foot does not arise in Onychophora until a late stage in development [@Walker2004]'; + TEXT CHARACTER= 297 TAXON=166 TEXT='"Oblique veins" @Chen1994 interpreted as strengthening rays.'; + TEXT CHARACTER= 297 TAXON=169 TEXT='Treated as ambiguous by @Moysiuk2019'; + TEXT CHARACTER= 297 TAXON=173 TEXT='Present in L. trilobus [@Cong2016]; possibly reflected by striations in L. unguispinus?'; + TEXT CHARACTER= 298 TAXON=168 TEXT='Relatively even [@Moysiuk2019]'; + TEXT CHARACTER= 300 TAXON=131 TEXT='The first three limbs are diminutive [@Strausfeld2022]'; + TEXT CHARACTER= 300 TAXON=137 TEXT='Single anterior pair reduced in size [@Haug2012cb]; as it is unclear whether this represents a homologous reduction, we code as ambiguous'; + TEXT CHARACTER= 300 TAXON=168 TEXT='First three flaps reduced [@Moysiuk2019]'; + TEXT CHARACTER= 306 TAXON=120 TEXT='Present [@Dhungana2023]'; + TEXT CHARACTER= 306 TAXON=127 TEXT='Siberion is scored as uncertain as it is difficult to distinguish the possible body termination from a posterior leg or pair of legs [@Dzik2011].'; + TEXT CHARACTER= 306 TAXON=132 TEXT='Present [@Chen1995bnmns]'; + TEXT CHARACTER= 306 TAXON=134 TEXT='Conical extension present [@Siveter2018]'; + TEXT CHARACTER= 306 TAXON=137 TEXT='Absent [@Haug2012cb]'; + TEXT CHARACTER= 306 TAXON=138 TEXT='Absent [@Smith2015]'; + TEXT CHARACTER= 306 TAXON=139 TEXT='H. fortis and H. hongmeia are coded as ambiguous, as the preservation is insufficiently clear to determine whether possible "posterior extensions" correspond to the trunk or to legs [@Hou1995zjls; @Steiner2012; @Liu2014ppp]'; + TEXT CHARACTER= 306 TAXON=140 TEXT='H. fortis and H. 
hongmeia are coded as ambiguous, as the preservation is insufficiently clear to determine whether possible "posterior extensions" correspond to the trunk or to legs [@Hou1995zjls; @Steiner2012; @Liu2014ppp]'; + TEXT CHARACTER= 306 TAXON=141 TEXT='Pear-shaped posterior bulge [@Howard2020]'; + TEXT CHARACTER= 306 TAXON=142 TEXT='Although Luolishania is described as bearing a protruding posterior termination, this is not unambiguously evident in specimens or camera lucida images; this taxon is thus coded as ambiguous [@Liu2008csb; @Ma2009].'; + TEXT CHARACTER= 306 TAXON=143 TEXT='Absent [@Caron2017]'; + TEXT CHARACTER= 306 TAXON=152 TEXT='The posterior appendages are incompletely formed at this developmental stage [@Smith2023n], so it is impossible to evaluate their condition in mature individuals.'; + TEXT CHARACTER= 306 TAXON=154 TEXT='We code this character as absent in Kerygmachela [@Budd1993; @Budd1998trse], Jianshanopodia [@Liu2006] and Anomalocaris [@Daley2014] as their tails likely represent modified appendages.'; + TEXT CHARACTER= 306 TAXON=156 TEXT='We code this character as absent in Kerygmachela [@Budd1993; @Budd1998trse], Jianshanopodia [@Liu2006] and Anomalocaris [@Daley2014] as their tails likely represent modified appendages.'; + TEXT CHARACTER= 306 TAXON=157 TEXT='We score Pambdelurion as uncertain because its posterior trunk is poorly known [@Budd1998ar].'; + TEXT CHARACTER= 306 TAXON=163 TEXT='The trunk of Opabinia extends further than the lobopodous limbs [@DhunganaForthcoming].'; + TEXT CHARACTER= 306 TAXON=167 TEXT='We code this character as absent in Kerygmachela [@Budd1993; @Budd1998trse], Jianshanopodia [@Liu2006] and Anomalocaris [@Daley2014] as their tails likely represent modified appendages.'; + TEXT CHARACTER= 308 TAXON=157 TEXT='We score Pambdelurion as uncertain because its posterior trunk is poorly known [@Budd1998ar].'; + TEXT CHARACTER= 308 TAXON=166 TEXT='Amplectobelua "resembles Anomalocaris in the number of lateral flaps, the flap 
venation, tail fan, and long furcae" [@Chen1994]'; + TEXT CHARACTER= 309 TAXON=120 TEXT='Following @Dhungana2023'; + TEXT CHARACTER= 309 TAXON=133 TEXT='Uncertain [@Liu2008app]'; + TEXT CHARACTER= 309 TAXON=138 TEXT='The claws of Hallucigenia sparsa seem to be oriented in the same direction on all appendage pairs [@Smith2015].'; + TEXT CHARACTER= 309 TAXON=143 TEXT='Claw direction on posteriormost pair (appendage 9) matches that of adjacent appendages (7 and 8) [@Caron2017]. @Caron2017 assert that the posteriormost two or three claws of Hallucigenia and Collinsium are directed in a different direction to those of other trunk limbs, citing references that do not obviously support this assertion.'; + TEXT CHARACTER= 309 TAXON=151 TEXT='Uncertain [@Maas2007csb]'; + TEXT CHARACTER= 309 TAXON=157 TEXT='We score Pambdelurion as uncertain because its posterior trunk is poorly known [@Budd1998ar].'; + TEXT CHARACTER= 310 TAXON=52 TEXT='Onychophora are scored as undifferentiated, as the posteriormost appendages are lost, not structurally differentiated [@Mayer2005].'; + TEXT CHARACTER= 310 TAXON=53 TEXT='Onychophora are scored as undifferentiated, as the posteriormost appendages are lost, not structurally differentiated [@Mayer2005].'; + TEXT CHARACTER= 310 TAXON=54 TEXT='Onychophora are scored as undifferentiated, as the posteriormost appendages are lost, not structurally differentiated [@Mayer2005].'; + TEXT CHARACTER= 310 TAXON=120 TEXT='Undifferentiated [@Dhungana2023]'; + TEXT CHARACTER= 310 TAXON=148 TEXT='The posterior filaments [@Ou2011] are treated as modified appendages, by analogy with Kerygmachela'; + TEXT CHARACTER= 310 TAXON=149 TEXT='The preservation is inadequate to evaluate this feature.'; + TEXT CHARACTER= 310 TAXON=154 TEXT='We score Jianshanopodia [@Liu2006] as present because the lateral extensions of the tail fan likely correspond to a modified pair of appendages. 
'; + TEXT CHARACTER= 310 TAXON=157 TEXT='We score Pambdelurion as uncertain because its posterior trunk is poorly known [@Budd1998ar].'; + TEXT CHARACTER= 310 TAXON=166 TEXT='Amplectobelua "resembles Anomalocaris in the number of lateral flaps, the flap venation, tail fan, and long furcae" [@Chen1994]'; + TEXT CHARACTER= 310 TAXON=168 TEXT='Several pairs of lobes incorporated into tail fan [@Moysiuk2019]'; + TEXT CHARACTER= 310 TAXON=169 TEXT='Hurdia and Schinderhannes bear a single flap-like appendage on the posterior end [@Daley2009; @Kuhl2009].'; + TEXT CHARACTER= 310 TAXON=174 TEXT='Hurdia and Schinderhannes bear a single flap-like appendage on the posterior end [@Daley2009; @Kuhl2009].'; + TEXT CHARACTER= 311 TAXON=151 TEXT='The Siberian Orsten tardigrade is scored as having a reduced posteriormost appendage pair based on the vestigial rudiment present on its posteroventral body region [@Maas2001].'; + TEXT CHARACTER= 311 TAXON=154 TEXT='The last appendage pair of Jianshanopodia is modified into a set of lateral flaps, which form a tail fan together with the flattened terminal portion of the body [@Liu2006]'; + TEXT CHARACTER= 311 TAXON=156 TEXT='The paired tail rami of Kerygmachela [@Budd1993; @Budd1998trse] likely represent modified appendages. 
'; + TEXT CHARACTER= 311 TAXON=157 TEXT='We score Pambdelurion as uncertain because its posterior trunk is poorly known [@Budd1998ar].'; + TEXT CHARACTER= 311 TAXON=166 TEXT='Amplectobelua "resembles Anomalocaris in the number of lateral flaps, the flap venation, tail fan, and long furcae" [@Chen1994]'; + TEXT CHARACTER= 312 TAXON=157 TEXT='We score Pambdelurion as uncertain because its posterior trunk is poorly known [@Budd1998ar].'; + TEXT CHARACTER= 312 TAXON=163 TEXT='Rami [@Pates2022]'; + TEXT CHARACTER= 312 TAXON=164 TEXT='Tail fan composed of seven pairs of elongate blades, and a pair of caudal rami [@Pates2022]'; + TEXT CHARACTER= 312 TAXON=166 TEXT='Following @Pates2021'; + TEXT CHARACTER= 312 TAXON=173 TEXT='Following @Pates2021'; + TEXT CHARACTER= 312 TAXON=174 TEXT='Following @Pates2021'; + TEXT CHARACTER= 313 TAXON=175 TEXT='Tail flukes appear more paddle-like than blade like [@Yang2013, supplementary figure 4b]'; + TEXT CHARACTER= 314 TAXON=95 TEXT='Dumbbell shape [@Maas2007ppp] indicates presence of lorica, and is not obviously equivalent to other posterior bulbs; coded ambiguous in order to be conservative.^n'; + TEXT CHARACTER= 314 TAXON=98 TEXT='Bulbous posterior trunk [@Ma2014]'; + TEXT CHARACTER= 315 TAXON=108 TEXT='A 6 mm terminal extension beyond the segmented body is compared to the bursa of Ottoia [@ConwayMorris2010]'; + TEXT CHARACTER= 319 TAXON=103 TEXT='Most likely single, but specimens are indecisive [@ConwayMorris1977]'; + TEXT CHARACTER= 320 TAXON=103 TEXT='Scored as dorso-medial by @Wills2012, but unclear how this can be determined from available material [@Schram1973; @ConwayMorris1977]'; + TEXT CHARACTER= 321 TAXON=103 TEXT='Considered smooth by @ConwayMorris1977'; + TEXT CHARACTER= 322 TAXON=40 TEXT='Present [@Kulikov1998rjn]'; + TEXT CHARACTER= 322 TAXON=41 TEXT='Present [@Luduc2016n]'; + TEXT CHARACTER= 323 TAXON=1 TEXT='Ambiguous: absent in smaller form, but present in larger form, a possible semaphront [@Maas2009aap]; may also 
be present in unknown adult.'; + TEXT CHARACTER= 323 TAXON=7 TEXT='Pair? of extended straight spines in NMNH198604'; + TEXT CHARACTER= 323 TAXON=10 TEXT='The single pair of ventroterminal outgrowths extending into posterior spines or setae are conceivably homologous with loriciferan ''toes''.'; + TEXT CHARACTER= 323 TAXON=11 TEXT='The lateral spines are treated as dorsal; they are not constrained to the posteriormost end of the organism, but occur on the two posterior ''segments'' [@Zhang2022]'; + TEXT CHARACTER= 323 TAXON=13 TEXT='Specimen incomplete'; + TEXT CHARACTER= 323 TAXON=14 TEXT='Absent, presuming that tongue-like structure denotes end of body [@Shao2020]'; + TEXT CHARACTER= 323 TAXON=15 TEXT='Eokinorhynchus has two pairs of caudal spines, distinguishing them from the series of lateral spines on the dorsal trunk (Zhang et al. 2015).^n'; + TEXT CHARACTER= 323 TAXON=16 TEXT='No posterior structures present in Eopriapulites (Shao et al. 2016)'; + TEXT CHARACTER= 323 TAXON=38 TEXT='Scored as absent, despite presence in larvae [e.g. 
@Marek2010], to ensure consistent coding with fossil taxa'; + TEXT CHARACTER= 323 TAXON=44 TEXT='Acanthopriapulus is covered in a profusion of hooks [@Land1970]; tail hooks are not distinguished from other trunk hooks, so the character is scored as ambiguous.^n'; + TEXT CHARACTER= 323 TAXON=45 TEXT='Two present in Halicryptus [@Shirley1999]^n'; + TEXT CHARACTER= 323 TAXON=49 TEXT='Tail hooks are absent in Priapulus; it is possible that the posterior warts correspond to these structures, but I was unable to find any literature that documented their distribution.'; + TEXT CHARACTER= 323 TAXON=96 TEXT='Preservation inadequate to determine whether vestigial features may be present [@Peel2010]'; + TEXT CHARACTER= 323 TAXON=113 TEXT='We know of no specimens of Palaeoscolex piscatorum that document the posterior end; it is not clear how @Wills2012 coded hooks as present.'; + TEXT CHARACTER= 325 TAXON=10 TEXT='Approximately 20%'; + TEXT CHARACTER= 325 TAXON=111 TEXT='Not clearly figured or described, but sketch indicates small size [@Hu2008]'; + TEXT CHARACTER= 325 TAXON=118 TEXT='Narrow but elongated [@Hu2012]'; + TEXT CHARACTER= 328 TAXON=16 TEXT='Seemingly absent [@Shao2016]'; + TEXT CHARACTER= 328 TAXON=44 TEXT='Present [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 330 TAXON=21 TEXT='Four small warts [e.g. 
@Gad2005mbr]'; + TEXT CHARACTER= 330 TAXON=22 TEXT='Six posterior warts [@Gad2005za]'; + TEXT CHARACTER= 330 TAXON=44 TEXT='Absent [@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 332 TAXON=137 TEXT='A longitudinal arrangement of musculature is suggested by the longitudinal wrinkling [@Haug2012cb]'; + TEXT CHARACTER= 332 TAXON=167 TEXT='Present [@Daley2014]'; + TEXT CHARACTER= 332 TAXON=173 TEXT='Present [@Cong2014]'; + TEXT CHARACTER= 332 TAXON=176 TEXT='Coded as present in Fuxianhuia based on a probable fuxianhuiid with muscle tissue from Kaili [@Zhu2004]'; + TEXT CHARACTER= 332 TAXON=179 TEXT='The metameric distribution of musculature in artiopodans is inferred by comparison with Campanamuta [@Budd2011].'; + TEXT CHARACTER= 332 TAXON=180 TEXT='The metameric distribution of musculature in artiopodans is inferred by comparison with Campanamuta [@Budd2011]'; + TEXT CHARACTER= 333 TAXON=28 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=29 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=30 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=31 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=32 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=33 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=34 TEXT='One pair of bundles of 
ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=35 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=36 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=37 TEXT='One pair of bundles of ventral and dorsal longitudinal muscles extending between the pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 333 TAXON=156 TEXT='Well-developed longitudinal muscles "appear to sheath the entire body" [@Young2017]'; + TEXT CHARACTER= 333 TAXON=157 TEXT='Pambdelurion exhibits longitudinal peripheral musculature [@Budd1998l; @Young2017].'; + TEXT CHARACTER= 333 TAXON=179 TEXT='An axial distribution of longitudinal muscle is inferred in artiopodans by comparison with Campanamuta [@Young2017].'; + TEXT CHARACTER= 334 TAXON=50 TEXT='Priapulans exhibit undifferentiated longitudinal muscle bands [@Young2017]'; + TEXT CHARACTER= 334 TAXON=157 TEXT='Present [@Young2017]'; + TEXT CHARACTER= 334 TAXON=179 TEXT='Inferred in artiopodans by comparison with Kiisortoqia [@Young2017]'; + TEXT CHARACTER= 334 TAXON=180 TEXT='Inferred in artiopodans by comparison with Kiisortoqia [@Young2017]'; + TEXT CHARACTER= 335 TAXON=28 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=29 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=30 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=31 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=32 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT 
CHARACTER= 335 TAXON=33 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=34 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=35 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=36 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=37 TEXT='Attaching to pachycycli of subsequent segments [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 335 TAXON=179 TEXT='The successive attachment points in artiopodans are inferred by comparison with Campanamuta [@Young2017]'; + TEXT CHARACTER= 335 TAXON=180 TEXT='The successive attachment points in artiopodans are inferred by comparison with Campanamuta [@Young2017]'; + TEXT CHARACTER= 337 TAXON=18 TEXT='Musculature of adult described by @Neves2013^n'; + TEXT CHARACTER= 337 TAXON=19 TEXT='Musculature of Higgins larva described by @Neves2013'; + TEXT CHARACTER= 337 TAXON=28 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=29 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=30 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=31 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=32 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=33 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument 
[@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=34 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=35 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=36 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=37 TEXT='Circular muscles in certain places (bases of scalid rings 6 and 7; connecting placids) but not in integument [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 337 TAXON=156 TEXT='Circular muscle reported by multiple studies [@Budd1993; @Budd1998trse; @Young2017]'; + TEXT CHARACTER= 337 TAXON=157 TEXT='Pambdelurion exhibits longitudinal peripheral musculature; the presence of circular muscle is equivocal [@Budd1998l; @Young2017].'; + TEXT CHARACTER= 337 TAXON=179 TEXT='The absence of circular muscle in artiopodans is inferred by comparison with Campanamuta, in which no circular musculature is evident despite preservation of individual myofibrils [@Young2017]'; + TEXT CHARACTER= 337 TAXON=180 TEXT='The absence of circular muscle in artiopodans is inferred by comparison with Campanamuta, in which no circular musculature is evident despite preservation of individual myofibrils [@Young2017]'; + TEXT CHARACTER= 338 TAXON=44 TEXT='[@SchmidtRhaesa2022za]'; + TEXT CHARACTER= 339 TAXON=28 TEXT='Muscles present in all segments [@Herranz2021z]'; + TEXT CHARACTER= 339 TAXON=29 TEXT='Reduced in segment 1 [@Herranz2021z]'; + TEXT CHARACTER= 339 TAXON=32 TEXT='Reduced in segment 1 [@Herranz2021z]'; + TEXT CHARACTER= 339 TAXON=33 TEXT='Reduced in segment 1 [@Herranz2021z]'; + TEXT CHARACTER= 339 TAXON=34 TEXT='Muscles present in all segments [@Herranz2021z]'; + TEXT CHARACTER= 339 TAXON=37 TEXT='Muscles present in all segments 
[@Herranz2021z]'; + TEXT CHARACTER= 340 TAXON=29 TEXT='Oblique muscles present in Cyclorhagida only [@SchmidtRhaesa2013]; could broadly be said to mirror the box-truss system observed in Tactopoda '; + TEXT CHARACTER= 340 TAXON=30 TEXT='Oblique muscles present in Cyclorhagida only [@SchmidtRhaesa2013]; could broadly be said to mirror the box-truss system observed in Tactopoda '; + TEXT CHARACTER= 340 TAXON=31 TEXT='Oblique muscles present in Cyclorhagida only [@SchmidtRhaesa2013]; could broadly be said to mirror the box-truss system observed in Tactopoda '; + TEXT CHARACTER= 340 TAXON=32 TEXT='Oblique muscles present in Cyclorhagida only [@SchmidtRhaesa2013]; could broadly be said to mirror the box-truss system observed in Tactopoda '; + TEXT CHARACTER= 340 TAXON=33 TEXT='Oblique muscles present in Cyclorhagida only [@SchmidtRhaesa2013]; could broadly be said to mirror the box-truss system observed in Tactopoda '; + TEXT CHARACTER= 340 TAXON=136 TEXT='Oblique musculature, but no dorsoventral [@Zhang2016]'; + TEXT CHARACTER= 340 TAXON=156 TEXT='Oblique muscles are evident in the anterior, but there is no good evidence of dorsoventral muscles [@Young2017].'; + TEXT CHARACTER= 340 TAXON=157 TEXT='Dorsoventral muscles not reported; extent of oblique muscles disputed [@Budd1998l; @Young2017], and orientation does not match that of box-truss.'; + TEXT CHARACTER= 341 TAXON=156 TEXT='We suggest that the pericardial region represents the musculature of the heart.'; + TEXT CHARACTER= 341 TAXON=176 TEXT='Present [@Ma2014nc]'; + TEXT CHARACTER= 342 TAXON=5 TEXT='Code with care - an unreliable internet source attests to their presence^n'; + TEXT CHARACTER= 343 TAXON=32 TEXT='Basiepithelial mouth cone nerves, stomatogastric nerves, and circumoral brain, the latter situated between the first scalid ring and the base of the mouth cone [@Nebelsick1993z]'; + TEXT CHARACTER= 343 TAXON=38 TEXT='Basiepithelial neurites in the epidermis [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 343 
TAXON=39 TEXT='Basiepithelial neurites in the epidermis [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 343 TAXON=43 TEXT='Enclosed within a basal lamina shared with the epidermis [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 343 TAXON=49 TEXT='Intraepithelial [@Rothe2010]'; + TEXT CHARACTER= 343 TAXON=50 TEXT='Intraepithelial in T. troglodytes [@Rothe2010]'; + TEXT CHARACTER= 343 TAXON=51 TEXT='Intraepithelial in T. troglodytes [@Rothe2010]'; + TEXT CHARACTER= 344 TAXON=15 TEXT='Unpaired [@Wang2025]'; + TEXT CHARACTER= 344 TAXON=16 TEXT='Unpaired [@Wang2025]'; + TEXT CHARACTER= 344 TAXON=28 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=29 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=30 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=31 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=32 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=33 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=34 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=35 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 
TAXON=36 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=37 TEXT='The ventral nerve cord originates from the forebrain as two distinct strands, which fuse to one cord in certain taxa [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 344 TAXON=38 TEXT='Unpaired in Chordodes [@Son2009] and Gordius [@SchmidtRhaesa1996], though paired in Paragordius [@SchmidtRhaesa2014]; nematomorph ventral nerve cords retain vestiges of a paired origin [@SchmidtRhaesa1997]'; + TEXT CHARACTER= 344 TAXON=39 TEXT='Though visibly unpaired [@SchmidtRhaesa1996], nematomorph ventral nerve cords retain vestiges of a paired origin [@SchmidtRhaesa1997]'; + TEXT CHARACTER= 344 TAXON=42 TEXT='Paired, unequal [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 344 TAXON=43 TEXT='Paired, unequal [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 344 TAXON=50 TEXT='Unpaired see @Yang2016. '; + TEXT CHARACTER= 344 TAXON=52 TEXT='Paired [@Yang2016]'; + TEXT CHARACTER= 344 TAXON=53 TEXT='Paired [@Yang2016]'; + TEXT CHARACTER= 344 TAXON=54 TEXT='Paired [@Yang2016]'; + TEXT CHARACTER= 344 TAXON=55 TEXT='Paired [@Yang2016]'; + TEXT CHARACTER= 344 TAXON=97 TEXT='@Wang2025'; + TEXT CHARACTER= 344 TAXON=106 TEXT='@Wang2025'; + TEXT CHARACTER= 344 TAXON=119 TEXT='@Wang2025'; + TEXT CHARACTER= 344 TAXON=130 TEXT='Ambiguous. @Hou2004 report the presence of a ventral nerve cord, although it is not possible to discern if it is paired or not [@Yang2016]'; + TEXT CHARACTER= 344 TAXON=156 TEXT='Tentatively interpreted as paired, unfused [@Park2018]'; + TEXT CHARACTER= 344 TAXON=173 TEXT='Paired. 
Two descending tracts on the anterior trunk region [@Cong2014]'; + TEXT CHARACTER= 344 TAXON=175 TEXT='Paired [@Yang2016]'; + TEXT CHARACTER= 344 TAXON=178 TEXT='Paired [@Tanaka2013]'; + TEXT CHARACTER= 346 TAXON=29 TEXT='Fuse to one cord after leaving forebrain [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 346 TAXON=32 TEXT='Fuse to one cord after leaving forebrain [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 346 TAXON=33 TEXT='Fuse to one cord after leaving forebrain [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 346 TAXON=37 TEXT='One chord reported in some Pycnopyhes species [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 347 TAXON=15 TEXT='Absent [@Wang2025]'; + TEXT CHARACTER= 347 TAXON=32 TEXT='Paired ganglia [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 347 TAXON=37 TEXT='Paired ganglia [@SchmidtRhaesa2013]'; + TEXT CHARACTER= 347 TAXON=42 TEXT='Single ventral nerve cord terminates in single terminal ganglion [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 347 TAXON=130 TEXT='@Hou2004 (figs 2f, 4f) reported faint paired structures adjacent to the gut of Paucipodia, which were interpreted as potential nerve ganglia. 
We nevertheless code Paucipodia as ambiguous: the structures cannot be observed in the figured material, and are described as "faintly preserved with a pink colour" in contrast to the conspicuously dark colouration of unambiguous nervous tissue in Chengjiang-type fossils [see @Ma2012n; @Tanaka2013; @Yang2013].'; + TEXT CHARACTER= 347 TAXON=156 TEXT='We code this ambiguously as @Park2018 only implicitly reconstruct paired ganglia (in their figure 4); the ''nerve cords'' referred to in the text could represent the circumpharyngeal connectives that lead to the ventral nerve cord.'; + TEXT CHARACTER= 347 TAXON=173 TEXT='Ambiguous [@Cong2014]'; + TEXT CHARACTER= 347 TAXON=175 TEXT='Recent data on the neurological organization of stem-euarthropods indicate that paired ganglia are present in Chengjiangocaris [@Yang2013] and Alalcomenaeus [@Tanaka2013].'; + TEXT CHARACTER= 347 TAXON=178 TEXT='Recent data on the neurological organization of stem-euarthropods indicate that paired ganglia are present in Chengjiangocaris [@Yang2013] and Alalcomenaeus [@Tanaka2013]'; + TEXT CHARACTER= 348 TAXON=152 TEXT='The single medial sinus contrasts with the two lateral perineural sinuses of onychophorans [@Jahn2023]'; + TEXT CHARACTER= 348 TAXON=156 TEXT='Hints of a paired nerve cord in the anterior of Kerygmachela [@Park2018] are insufficient to establish their lateralization, though the positioning seems to correspond to that of Lyrarapax [@Cong2014]'; + TEXT CHARACTER= 348 TAXON=173 TEXT='Medial (Cong et al. 
2014)'; + TEXT CHARACTER= 356 TAXON=177 TEXT='Coded present by proxy as unambiguously present in crown-Euarthropoda [see @Budd2021].'; + TEXT CHARACTER= 356 TAXON=178 TEXT='Coded present by proxy as unambiguously present in crown-Euarthropoda [see @Budd2021].'; + TEXT CHARACTER= 357 TAXON=52 TEXT='Following @Martin2022, who argue that the circumpharyngeal connective represents the last vestiges of the circumoral nerve ring.'; + TEXT CHARACTER= 357 TAXON=53 TEXT='Following @Martin2022, who argue that the circumpharyngeal connective represents the last vestiges of the circumoral nerve ring.'; + TEXT CHARACTER= 357 TAXON=54 TEXT='Following @Martin2022, who argue that the circumpharyngeal connective represents the last vestiges of the circumoral nerve ring.'; + TEXT CHARACTER= 357 TAXON=156 TEXT='The ''nerve cords'' interpreted by @Park2018 could represent circumoral connectives (interpreted by @Martin2022 as homologous to the circumoral nerve ring).'; + TEXT CHARACTER= 357 TAXON=173 TEXT='Not interpreted as present [@Cong2014; @Park2018]'; + TEXT CHARACTER= 359 TAXON=42 TEXT='Ventral nerve ring without condensation [@SchmidtRhaesa2014]'; + TEXT CHARACTER= 359 TAXON=156 TEXT='Present [@Park2018]'; + TEXT CHARACTER= 360 TAXON=156 TEXT='The brain is protocerebral [@Park2018]'; + TEXT CHARACTER= 361 TAXON=52 TEXT='Onychophora are coded as innervated from multiple neuromeres to reflect their complex neurological organization: although the jaws have a deutocerebral segmental affinity and innervation, the lip papillae that delineate the oral opening are formed as epidermal derivatives of the three anteriormost body segments, and thus receive nervous terminals from the protocerebrum, deutocerebrum and part of the ventral nerve cord [@Eriksson2000; @Martin2014].'; + TEXT CHARACTER= 361 TAXON=53 TEXT='Onychophora are coded as innervated from multiple neuromeres to reflect their complex neurological organization: although the jaws have a deutocerebral segmental affinity and 
innervation, the lip papillae that delineate the oral opening are formed as epidermal derivatives of the three anteriormost body segments, and thus receive nervous terminals from the protocerebrum, deutocerebrum and part of the ventral nerve cord [@Eriksson2000; @Martin2014].'; + TEXT CHARACTER= 361 TAXON=54 TEXT='Onychophora are coded as innervated from multiple neuromeres to reflect their complex neurological organization: although the jaws have a deutocerebral segmental affinity and innervation, the lip papillae that delineate the oral opening are formed as epidermal derivatives of the three anteriormost body segments, and thus receive nervous terminals from the protocerebrum, deutocerebrum and part of the ventral nerve cord [@Eriksson2000; @Martin2014].'; + TEXT CHARACTER= 361 TAXON=55 TEXT='The tardigrade mouth cone is innervated from the protocerebrum [@Mayer2013po].'; + TEXT CHARACTER= 361 TAXON=173 TEXT='Lyrarapax has protocerebral mouth innervation [@Cong2014].'; + TEXT CHARACTER= 367 TAXON=38 TEXT='Single combined body opening'; + TEXT CHARACTER= 367 TAXON=39 TEXT='Inapplicable: intestine is incomplete and ends blindly [@SchmidtRhaesa2012]'; + TEXT CHARACTER= 367 TAXON=43 TEXT='Present in males; separate vulva and anus in females [@SchmidtRhaesa2024]'; + TEXT CHARACTER= 367 TAXON=179 TEXT='Absent by proxy for Euarthropoda crown.'; + TEXT CHARACTER= 367 TAXON=180 TEXT='Absent by proxy for Euarthropoda crown'; + TEXT CHARACTER= 368 TAXON=43 TEXT='Present in males; separate vulva and anus in females [@SchmidtRhaesa2024]'; + TEXT CHARACTER= 370 TAXON=179 TEXT='Absent by proxy for Euarthropod crown.'; + TEXT CHARACTER= 370 TAXON=180 TEXT='Absent by proxy for Euarthropod crown.'; + TEXT CHARACTER= 376 TAXON=126 TEXT='Anterior gut expanded [@Whittington1978, e.g. 
fig 43]'; + TEXT CHARACTER= 376 TAXON=130 TEXT='No indication of gut widening, whichever end is anterior [@Hou2004 / @Vannier2017]'; + TEXT CHARACTER= 376 TAXON=179 TEXT='@Chen1997'; + TEXT CHARACTER= 383 TAXON=11 TEXT='The associated Conotheca fragment is oppositely directed and hence not a dwelling tube of the organism [@Zhang2022]'; + TEXT CHARACTER= 384 TAXON=42 TEXT='Flagelliform tail present [@Reimann1972]'; + TEXT CHARACTER= 389 TAXON=177 TEXT='Coded present by proxy as absent in crown-Euarthropoda (see @Khim2023). If this matrix is to be used to investigate euarthropod relationships in future, this coding should be adjusted accordingly.'; + TEXT CHARACTER= 389 TAXON=178 TEXT='Coded present by proxy as absent in crown-Euarthropoda (see @Khim2023). If this matrix is to be used to investigate euarthropod relationships in future, this coding should be adjusted accordingly.'; + TEXT CHARACTER= 393 TAXON=80 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 393 TAXON=83 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 393 TAXON=85 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 393 TAXON=89 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 393 TAXON=91 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 393 TAXON=92 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 393 TAXON=94 TEXT='After @Mapalo2024cb'; + TEXT CHARACTER= 398 TAXON=24 TEXT='Wrinkled thorax, abdominal lorica [@Fujimoto2020mb]'; + TEXT CHARACTER= 400 TAXON=24 TEXT='Pair of anteroventral setae present, plus an anterolateral pair [@Fujimoto2020mb]'; + TEXT CHARACTER= 401 TAXON=24 TEXT='Posterodorsal and posterolateral setae present [@Fujimoto2020mb]'; + TEXT CHARACTER= 404 TAXON=1 TEXT='In view of the morphological arrangement, treated as a likely homologue of the Higgins larva.'; + TEXT CHARACTER= 405 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=21 TEXT='Following 
@Sorensen2023'; + TEXT CHARACTER= 405 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 405 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=1 TEXT='The thorax (crenulated region) is shorter than the abdomen (loricate region) in most specimens [@Maas2009aap], become equant in the larger specimen.'; + TEXT CHARACTER= 406 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 406 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 407 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=20 
TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 409 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 410 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 411 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT 
CHARACTER= 412 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 412 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 413 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 414 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=23 TEXT='Following 
@Sorensen2023'; + TEXT CHARACTER= 415 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 415 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 416 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 417 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=25 
TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 418 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 419 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 420 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 421 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT 
CHARACTER= 421 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=1 TEXT='Not evident [@Maas2009app]'; + TEXT CHARACTER= 422 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 422 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 423 TAXON=27 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 424 TAXON=27 TEXT='Following 
@Sorensen2023'; + TEXT CHARACTER= 425 TAXON=18 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=19 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=20 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=21 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=22 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=23 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=24 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=25 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=26 TEXT='Following @Sorensen2023'; + TEXT CHARACTER= 425 TAXON=27 TEXT='Following @Sorensen2023'; + ENDBLOCK; + BEGIN ASSUMPTIONS; + TYPESET * UNTITLED = unord: 1 - 425; + ENDBLOCK; + \ No newline at end of file diff --git a/dev/benchmarks/mbank_catalogue.csv b/dev/benchmarks/mbank_catalogue.csv new file mode 100644 index 000000000..9c82a3983 --- /dev/null +++ b/dev/benchmarks/mbank_catalogue.csv @@ -0,0 +1,805 @@ +"key","filename","project_id","matrix_idx","source_type","split","ntax","nchar","n_patterns","n_states","pct_missing","pct_inapp","parse_ok","error_message","dedup_drop" +"project1013","project1013.nex",1013,NA,"morphobank","training",112,174,172,9,23.3,0,TRUE,"",FALSE +"project1020","project1020.nex",1020,NA,"morphobank","validation",28,110,108,10,15.1,8.1,TRUE,"",FALSE +"project1024","project1024.nex",1024,NA,"morphobank","training",163,156,151,5,7.2,1,TRUE,"",FALSE +"project1035","project1035.nex",1035,NA,"morphobank","validation",58,185,183,6,17.4,2.9,TRUE,"",FALSE +"project1037_(1)","project1037 (1).nex",1037,1,"morphobank","training",62,71,71,4,13.7,12,TRUE,"",FALSE +"project1037_(2)","project1037 (2).nex",1037,2,"morphobank","training",62,69,69,4,14,12.4,TRUE,"",TRUE +"project1037_(3)","project1037 (3).nex",1037,3,"morphobank","training",64,69,69,4,14.7,12.4,TRUE,"",FALSE +"project104","project104.nex",104,NA,"morphobank","training",29,207,202,6,25,3.1,TRUE,"",FALSE 
+"project1045","project1045.nex",1045,NA,"morphobank","validation",13,37,35,4,8.4,0,TRUE,"",FALSE +"project1046","project1046.nex",1046,NA,"morphobank","training",34,291,290,5,36,0,TRUE,"",FALSE +"project1049","project1049.nex",1049,NA,"morphobank","training",41,145,133,2,3.2,0,TRUE,"",FALSE +"project1066","project1066.nex",1066,NA,"morphobank","training",32,92,78,4,5.9,4.2,TRUE,"",FALSE +"project1070","project1070.nex",1070,NA,"morphobank","validation",72,426,426,6,23.3,5.7,TRUE,"",FALSE +"project1076","project1076.nex",1076,NA,"morphobank","training",22,70,70,4,29.7,0,TRUE,"",FALSE +"project108","project108.nex",108,NA,"morphobank","training",29,207,202,6,25,3.1,TRUE,"",FALSE +"project1088","project1088.nex",1088,NA,"morphobank","training",11,44,38,2,25.4,3.8,TRUE,"",FALSE +"project1097","project1097.nex",1097,NA,"morphobank","training",66,1,1,5,1.5,0,TRUE,"",FALSE +"project1102","project1102.nex",1102,NA,"morphobank","training",61,143,143,4,27.7,5.8,TRUE,"WARNING: Could not parse character states; does each end with a ' or ;?.",FALSE +"project1104","project1104.nex",1104,NA,"morphobank","training",60,127,123,4,3,2.8,TRUE,"",FALSE +"project1105","project1105.nex",1105,NA,"morphobank","validation",22,57,57,4,15,0.7,TRUE,"",FALSE +"project1109","project1109.nex",1109,NA,"morphobank","training",11,65,56,3,26.9,2.1,TRUE,"",FALSE +"project1113","project1113.nex",1113,NA,"morphobank","training",38,42,40,4,1.9,5.1,TRUE,"",FALSE +"project1115","project1115.nex",1115,NA,"morphobank","validation",25,51,50,5,13.9,1.2,TRUE,"",FALSE +"project1118","project1118.nex",1118,NA,"morphobank","training",37,98,98,3,31.5,4.6,TRUE,"",FALSE +"project1119","project1119.nex",1119,NA,"morphobank","training",73,408,408,6,46.5,0,TRUE,"",FALSE +"project1120","project1120.nex",1120,NA,"morphobank","validation",33,85,84,4,27.4,0.3,TRUE,"",FALSE +"project1122","project1122.nex",1122,NA,"morphobank","training",32,63,58,5,10.9,22.1,TRUE,"",FALSE 
+"project1126","project1126.nex",1126,NA,"morphobank","training",132,560,560,5,59.8,0,TRUE,"",FALSE +"project1135","project1135.nex",1135,NA,"morphobank","validation",29,127,124,5,40,0,TRUE,"",FALSE +"project1138","project1138.nex",1138,NA,"morphobank","training",56,72,70,3,13.1,0.6,TRUE,"",FALSE +"project1144","project1144.nex",1144,NA,"morphobank","training",10,32,28,3,21.8,0,TRUE,"",FALSE +"project1150","project1150.nex",1150,NA,"morphobank","validation",62,111,111,7,40.7,7.6,TRUE,"",FALSE +"project1151","project1151.nex",1151,NA,"morphobank","training",12,16,16,3,6.2,0,TRUE,"",FALSE +"project1157","project1157.nex",1157,NA,"morphobank","training",110,205,156,3,0.1,2.8,TRUE,"",FALSE +"project1166","project1166.nex",1166,NA,"morphobank","training",71,141,140,10,23.4,3.7,TRUE,"",FALSE +"project1187","project1187.nex",1187,NA,"morphobank","training",32,81,81,3,20.5,2.9,TRUE,"",FALSE +"project1189","project1189.nex",1189,NA,"morphobank","training",26,72,72,4,14,5.3,TRUE,"",FALSE +"project1192","project1192.nex",1192,NA,"morphobank","training",80,51,51,6,35.2,18.4,TRUE,"",FALSE +"project1194","project1194.nex",1194,NA,"morphobank","training",49,175,172,7,37.5,0,TRUE,"",FALSE +"project1197","project1197.nex",1197,NA,"morphobank","training",28,83,66,5,7.1,8.2,TRUE,"",FALSE +"project1207","project1207.nex",1207,NA,"morphobank","training",53,208,207,7,23.7,1.6,TRUE,"",FALSE +"project1209","project1209.nex",1209,NA,"morphobank","training",46,125,91,5,0,0,TRUE,"WARNING: Could not parse character states; does each end with a ' or ;?.",FALSE +"project1210","project1210.nex",1210,NA,"morphobank","validation",86,36,17,3,4,0,TRUE,"",FALSE +"project1213","project1213.nex",1213,NA,"morphobank","training",29,139,136,5,41,0,TRUE,"",FALSE +"project1214","project1214.nex",1214,NA,"morphobank","training",12,42,40,4,6.9,5.2,TRUE,"",FALSE +"project1220_(1)","project1220 (1).nex",1220,1,"morphobank","validation",4,24,20,5,8.8,1.2,TRUE,"",FALSE +"project1220_(2)","project1220 
(2).nex",1220,2,"morphobank","validation",5,61,45,4,5.3,5.3,TRUE,"",FALSE +"project1221","project1221.nex",1221,NA,"morphobank","training",150,252,251,8,31.9,15.3,TRUE,"",FALSE +"project1223","project1223.nex",1223,NA,"morphobank","training",30,78,77,4,27.1,0.9,TRUE,"",FALSE +"project1228","project1228.nex",1228,NA,"morphobank","training",18,20,19,2,20.2,0,TRUE,"",FALSE +"project1271","project1271.nex",1271,NA,"morphobank","training",25,33,32,25,24,0,TRUE,"",FALSE +"project1278","project1278.nex",1278,NA,"morphobank","training",23,60,60,8,38.8,6.2,TRUE,"",FALSE +"project157","project157.nex",157,NA,"morphobank","training",69,408,408,6,40.2,4.6,TRUE,"",FALSE +"project161","project161.nex",161,NA,"morphobank","training",21,173,165,4,28.5,0,TRUE,"",FALSE +"project171","project171.nex",171,NA,"morphobank","training",68,228,222,5,14.6,3.2,TRUE,"",FALSE +"project175","project175.nex",175,NA,"morphobank","validation",165,71,71,6,12.9,0,TRUE,"",FALSE +"project181","project181.nex",181,NA,"morphobank","training",24,119,116,6,29.7,0,TRUE,"",FALSE +"project182","project182.nex",182,NA,"morphobank","training",50,115,108,6,46.9,0,TRUE,"",FALSE +"project194","project194.nex",194,NA,"morphobank","training",72,207,163,8,13.9,0,TRUE,"",FALSE +"project198","project198.nex",198,NA,"morphobank","training",83,412,412,5,37,3.3,TRUE,"",FALSE +"project199","project199.nex",199,NA,"morphobank","training",NA,NA,NA,NA,NA,NA,FALSE,"WARNING: Missing character state definition for: 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173 ; ERROR: missing value where TRUE/FALSE needed",FALSE 
+"project200","project200.nex",200,NA,"morphobank","validation",40,123,121,4,28.6,0,TRUE,"",FALSE +"project205","project205.nex",205,NA,"morphobank","validation",41,315,315,11,37.7,0.1,TRUE,"",FALSE +"project2084_(1)","project2084 (1).nex",2084,1,"morphobank","training",86,3660,3601,10,20.9,24.9,TRUE,"",FALSE +"project2084_(2)","project2084 (2).nex",2084,2,"morphobank","training",68,146,146,4,28.5,5.3,TRUE,"",FALSE +"project2086","project2086.nex",2086,NA,"morphobank","training",91,453,453,8,45.3,15.1,TRUE,"",FALSE +"project2099_(1)","project2099 (1).nex",2099,1,"morphobank","training",114,555,555,7,57.3,2.3,TRUE,"",FALSE +"project2099_(2)","project2099 (2).nex",2099,2,"morphobank","training",114,555,555,7,57.4,2.3,TRUE,"",FALSE +"project2106","project2106.nex",2106,NA,"morphobank","training",62,90,90,4,32.3,2,TRUE,"",FALSE +"project2116","project2116.nex",2116,NA,"morphobank","training",74,158,158,10,27.7,6,TRUE,"",FALSE +"project2124","project2124.nex",2124,NA,"morphobank","training",81,477,477,5,65.2,0,TRUE,"",FALSE +"project2131","project2131.nex",2131,NA,"morphobank","training",32,55,54,6,36.5,7.9,TRUE,"",FALSE +"project2144","project2144.nex",2144,NA,"morphobank","training",109,124,123,4,48.9,2.7,TRUE,"",FALSE +"project2151","project2151.nex",2151,NA,"morphobank","training",55,56,56,5,8.5,2.2,TRUE,"",FALSE +"project216","project216.nex",216,NA,"morphobank","training",51,129,105,8,2.3,1.8,TRUE,"",FALSE +"project2167","project2167.nex",2167,NA,"morphobank","training",81,421,419,10,47.5,2.3,TRUE,"",FALSE +"project2183","project2183.nex",2183,NA,"morphobank","training",318,535,533,5,22.5,0,TRUE,"",FALSE +"project2184","project2184.nex",2184,NA,"morphobank","training",114,205,168,3,1.7,2.5,TRUE,"",FALSE +"project2189","project2189.nex",2189,NA,"morphobank","training",73,777,743,9,39.4,0,TRUE,"",FALSE +"project2191","project2191.nex",2191,NA,"morphobank","training",105,216,215,8,23.6,12.8,TRUE,"",FALSE 
+"project2193","project2193.nex",2193,NA,"morphobank","training",38,364,363,5,33.3,1.5,TRUE,"",FALSE +"project2194_(1)","project2194 (1).nex",2194,1,"morphobank","training",16,57,50,3,32.4,0.6,TRUE,"",FALSE +"project2194_(2)","project2194 (2).nex",2194,2,"morphobank","training",16,1,1,3,6.2,0,TRUE,"",FALSE +"project2196","project2196.nex",2196,NA,"morphobank","training",99,339,339,9,34.9,6.8,TRUE,"",FALSE +"project2197","project2197.nex",2197,NA,"morphobank","training",61,178,178,5,19.1,2.3,TRUE,"",FALSE +"project2209","project2209.nex",2209,NA,"morphobank","training",1,6,5,4,0,0,TRUE,"WARNING: Could not parse character states; does each end with a ' or ;?.",FALSE +"project2215","project2215.nex",2215,NA,"morphobank","validation",15,64,60,3,27.7,3.4,TRUE,"",FALSE +"project2216","project2216.nex",2216,NA,"morphobank","training",61,232,227,7,29,18.4,TRUE,"",FALSE +"project2218","project2218.nex",2218,NA,"morphobank","training",54,164,164,4,16.7,1.7,TRUE,"",FALSE +"project2219","project2219.nex",2219,NA,"morphobank","training",17,35,33,8,42.4,0.7,TRUE,"",FALSE +"project222","project222.nex",222,NA,"morphobank","training",63,28,27,9,3.5,0,TRUE,"",FALSE +"project2220_(1)","project2220 (1).nex",2220,1,"morphobank","validation",92,17,17,6,6,0.5,TRUE,"",FALSE +"project2220_(2)","project2220 (2).nex",2220,2,"morphobank","validation",267,17,17,6,41.4,0.6,TRUE,"",FALSE +"project2225","project2225.nex",2225,NA,"morphobank","validation",3,272,37,4,31.5,6.3,TRUE,"",FALSE +"project2238","project2238.nex",2238,NA,"morphobank","training",1,272,7,5,14.3,14.3,TRUE,"",FALSE +"project2244","project2244.nex",2244,NA,"morphobank","training",54,194,173,9,11,23.6,TRUE,"",FALSE +"project2261","project2261.nex",2261,NA,"morphobank","training",42,83,82,5,26.1,0,TRUE,"",FALSE +"project2285","project2285.nex",2285,NA,"morphobank","validation",19,73,71,6,18.7,1,TRUE,"",FALSE +"project2286","project2286.nex",2286,NA,"morphobank","training",134,232,194,3,1.4,1.8,TRUE,"",FALSE 
+"project2289","project2289.nex",2289,NA,"morphobank","training",73,220,211,3,15.3,0.9,TRUE,"",FALSE +"project2291","project2291.nex",2291,NA,"morphobank","training",79,132,130,7,53.9,0,TRUE,"",FALSE +"project2292","project2292.nex",2292,NA,"morphobank","training",114,497,493,6,47.4,0,TRUE,"",FALSE +"project2320","project2320.nex",2320,NA,"morphobank","validation",66,382,382,6,48.8,0,TRUE,"",FALSE +"project2329","project2329.nex",2329,NA,"morphobank","training",51,65,65,4,20.5,2.8,TRUE,"",FALSE +"project2331","project2331.nex",2331,NA,"morphobank","training",94,272,271,5,39,4.3,TRUE,"",FALSE +"project2332","project2332.nex",2332,NA,"morphobank","training",49,71,70,8,1.5,0,TRUE,"",FALSE +"project2334","project2334.nex",2334,NA,"morphobank","training",12,85,82,8,8.7,1.1,TRUE,"",FALSE +"project2335","project2335.nex",2335,NA,"morphobank","validation",45,60,60,3,17.9,1.3,TRUE,"",FALSE +"project2340_(1)","project2340 (1).nex",2340,1,"morphobank","validation",40,43,43,9,5.1,3.5,TRUE,"",FALSE +"project2340_(2)","project2340 (2).nex",2340,2,"morphobank","validation",40,46,46,8,5.1,4.3,TRUE,"",FALSE +"project2340_(3)","project2340 (3).nex",2340,3,"morphobank","validation",31,46,44,6,6.6,4.3,TRUE,"",TRUE +"project2341","project2341.nex",2341,NA,"morphobank","training",64,47,43,5,1.6,0,TRUE,"",FALSE +"project2342","project2342.nex",2342,NA,"morphobank","training",30,234,229,6,43.3,0,TRUE,"",FALSE +"project2346","project2346.nex",2346,NA,"morphobank","training",23,144,141,4,18,28.5,TRUE,"",FALSE +"project2348","project2348.nex",2348,NA,"morphobank","training",39,93,92,8,15.2,2.4,TRUE,"",FALSE +"project2349","project2349.nex",2349,NA,"morphobank","training",26,66,64,4,7.8,3.9,TRUE,"",FALSE +"project2359","project2359.nex",2359,NA,"morphobank","training",42,111,111,7,3.3,26.3,TRUE,"",FALSE +"project2368","project2368.nex",2368,NA,"morphobank","training",62,351,350,8,60.3,0.2,TRUE,"",FALSE 
+"project2384","project2384.nex",2384,NA,"morphobank","training",150,226,226,10,37.6,7.9,TRUE,"",FALSE +"project2387","project2387.nex",2387,NA,"morphobank","training",28,22,22,4,14.6,0.2,TRUE,"",FALSE +"project2399","project2399.nex",2399,NA,"morphobank","training",111,439,439,6,55.2,0,TRUE,"",FALSE +"project240","project240.nex",240,NA,"morphobank","validation",21,245,230,7,20.8,2,TRUE,"",FALSE +"project2403","project2403.nex",2403,NA,"morphobank","training",23,66,66,6,8,0.8,TRUE,"",FALSE +"project2405","project2405.nex",2405,NA,"morphobank","validation",5,197,62,4,0.3,0,TRUE,"",FALSE +"project2406_(1)","project2406 (1).nex",2406,1,"morphobank","training",6,65,49,3,0,0,TRUE,"",FALSE +"project2406_(2)","project2406 (2).nex",2406,2,"morphobank","training",6,65,52,3,0,0,TRUE,"",FALSE +"project2406_(3)","project2406 (3).nex",2406,3,"morphobank","training",6,65,58,3,0,0,TRUE,"",FALSE +"project2409","project2409.nex",2409,NA,"morphobank","training",19,14,13,3,0.8,11.3,TRUE,"",FALSE +"project2411","project2411.nex",2411,NA,"morphobank","training",69,75,75,5,9.6,10.5,TRUE,"",FALSE +"project2416","project2416.nex",2416,NA,"morphobank","training",102,600,600,6,56,3.3,TRUE,"",FALSE +"project2436_(1)","project2436 (1).nex",2436,1,"morphobank","training",41,273,271,6,24.9,1.9,TRUE,"",FALSE +"project2436_(2)","project2436 (2).nex",2436,2,"morphobank","training",41,273,271,6,25.3,1.9,TRUE,"",TRUE +"project2439","project2439.nex",2439,NA,"morphobank","training",32,101,101,5,35.9,0,TRUE,"",FALSE +"project2442","project2442.nex",2442,NA,"morphobank","training",53,206,204,10,26.3,11.7,TRUE,"",FALSE +"project2448","project2448.nex",2448,NA,"morphobank","training",13,13,13,7,0,4.7,TRUE,"",FALSE +"project2449","project2449.nex",2449,NA,"morphobank","training",176,463,292,8,58.1,0,TRUE,"",FALSE +"project2450","project2450.nex",2450,NA,"morphobank","validation",24,391,378,6,48.7,0,TRUE,"",FALSE 
+"project2451","project2451.nex",2451,NA,"morphobank","training",24,380,367,6,54.5,0,TRUE,"",FALSE +"project2452","project2452.nex",2452,NA,"morphobank","training",94,272,271,5,38.3,4.4,TRUE,"",FALSE +"project246","project246.nex",246,NA,"morphobank","training",35,204,199,6,27.6,2.6,TRUE,"",FALSE +"project2463","project2463.nex",2463,NA,"morphobank","training",20,5,5,6,2,0,TRUE,"",FALSE +"project2473","project2473.nex",2473,NA,"morphobank","training",8,24,23,3,18.5,0,TRUE,"",FALSE +"project2477","project2477.nex",2477,NA,"morphobank","training",213,387,386,4,4.6,4,TRUE,"",FALSE +"project2482_(1)","project2482 (1).nex",2482,1,"morphobank","training",15,78,59,3,15.6,0.9,TRUE,"",FALSE +"project2482_(2)","project2482 (2).nex",2482,2,"morphobank","training",15,78,59,3,15.6,0.9,TRUE,"",FALSE +"project2490","project2490.nex",2490,NA,"morphobank","validation",13,8,8,4,1,0,TRUE,"",FALSE +"project2495","project2495.nex",2495,NA,"morphobank","validation",20,75,75,4,32.3,1.2,TRUE,"",FALSE +"project2501_(1)","project2501 (1).nex",2501,1,"morphobank","training",57,97,96,5,34.6,0,TRUE,"",FALSE +"project2501_(2)","project2501 (2).nex",2501,2,"morphobank","training",57,97,96,5,34.6,0,TRUE,"",TRUE +"project2506","project2506.nex",2506,NA,"morphobank","training",25,30,30,8,5.9,0.5,TRUE,"",FALSE +"project2525","project2525.nex",2525,NA,"morphobank","validation",134,44,43,8,6,0,TRUE,"",FALSE +"project2527","project2527.nex",2527,NA,"morphobank","training",32,247,247,4,29.4,3.8,TRUE,"",FALSE +"project2532","project2532.nex",2532,NA,"morphobank","training",133,561,561,5,60.1,0,TRUE,"",FALSE +"project2533","project2533.nex",2533,NA,"morphobank","training",8,63,36,6,6.9,4.5,TRUE,"",FALSE +"project2537_(1)","project2537 (1).nex",2537,1,"morphobank","training",58,77,77,8,9.4,24.7,TRUE,"",FALSE +"project2537_(2)","project2537 (2).nex",2537,2,"morphobank","training",48,63,63,8,9.5,15.9,TRUE,"",FALSE +"project2544","project2544.nex",2544,NA,"morphobank","training",8,12,10,2,0,0,TRUE,"",FALSE 
+"project2545","project2545.nex",2545,NA,"morphobank","validation",13,50,39,3,43,0,TRUE,"",FALSE +"project2546","project2546.nex",2546,NA,"morphobank","training",14,75,67,3,39.7,0,TRUE,"",FALSE +"project2547","project2547.nex",2547,NA,"morphobank","training",69,119,118,6,34.3,0,TRUE,"",FALSE +"project2551","project2551.nex",2551,NA,"morphobank","training",42,131,131,6,35.7,0.8,TRUE,"WARNING: Could not parse character states; does each end with a ' or ;?.",FALSE +"project2553","project2553.nex",2553,NA,"morphobank","training",37,145,145,4,35.3,7,TRUE,"",FALSE +"project2554","project2554.nex",2554,NA,"morphobank","training",43,282,281,6,25.6,4.1,TRUE,"",FALSE +"project2576","project2576.nex",2576,NA,"morphobank","training",83,125,115,5,16.2,0,TRUE,"",FALSE +"project2577","project2577.nex",2577,NA,"morphobank","training",6,20,17,4,0,0,TRUE,"",FALSE +"project2579","project2579.nex",2579,NA,"morphobank","training",31,78,77,4,27.9,0.8,TRUE,"",FALSE +"project2600","project2600.nex",2600,NA,"morphobank","validation",42,58,58,8,4.6,5.9,TRUE,"",FALSE +"project2604","project2604.nex",2604,NA,"morphobank","training",43,307,306,5,45.9,1.5,TRUE,"",FALSE +"project2606","project2606.nex",2606,NA,"morphobank","training",153,256,255,8,31.6,15.3,TRUE,"",FALSE +"project2607_(1)","project2607 (1).nex",2607,1,"morphobank","training",72,321,318,7,29,2.7,TRUE,"",TRUE +"project2607_(2)","project2607 (2).nex",2607,2,"morphobank","training",74,321,318,7,30.4,2.7,TRUE,"",FALSE +"project2610","project2610.nex",2610,NA,"morphobank","validation",17,53,51,5,6.2,0,TRUE,"WARNING: Could not parse character states; does each end with a ' or ;?.",FALSE +"project2615","project2615.nex",2615,NA,"morphobank","validation",18,41,40,6,28.8,3.6,TRUE,"",FALSE +"project262","project262.nex",262,NA,"morphobank","training",61,111,111,7,40.4,7.7,TRUE,"",FALSE +"project2621","project2621.nex",2621,NA,"morphobank","training",48,32,32,6,17,0,TRUE,"",FALSE 
+"project2626","project2626.nex",2626,NA,"morphobank","training",97,568,568,7,53.7,2.6,TRUE,"",FALSE +"project2627","project2627.nex",2627,NA,"morphobank","training",72,169,168,5,37.4,2.3,TRUE,"",FALSE +"project264","project264.nex",264,NA,"morphobank","training",63,150,146,7,20.2,9,TRUE,"",FALSE +"project2648","project2648.nex",2648,NA,"morphobank","training",95,272,271,5,39.1,4.3,TRUE,"",FALSE +"project265","project265.nex",265,NA,"morphobank","validation",30,208,203,6,22.9,2.4,TRUE,"",FALSE +"project2650","project2650.nex",2650,NA,"morphobank","validation",32,101,101,5,35.9,0,TRUE,"",FALSE +"project2653","project2653.nex",2653,NA,"morphobank","training",12,16,13,3,9,0,TRUE,"",FALSE +"project2655","project2655.nex",2655,NA,"morphobank","validation",38,272,270,5,26.4,4.4,TRUE,"",FALSE +"project2657","project2657.nex",2657,NA,"morphobank","training",20,22,22,6,21.8,0,TRUE,"",FALSE +"project266","project266.nex",266,NA,"morphobank","training",17,209,187,7,19,1.1,TRUE,"",FALSE +"project2668","project2668.nex",2668,NA,"morphobank","training",196,1227,1140,6,49.1,3.2,TRUE,"",FALSE +"project2669","project2669.nex",2669,NA,"morphobank","training",96,270,270,7,52.5,0,TRUE,"",FALSE +"project2691","project2691.nex",2691,NA,"morphobank","training",17,41,38,5,8.4,4.8,TRUE,"",FALSE +"project2694","project2694.nex",2694,NA,"morphobank","training",31,31,30,6,3.8,1.2,TRUE,"",FALSE +"project2702","project2702.nex",2702,NA,"morphobank","training",29,40,40,7,4.7,0,TRUE,"",FALSE +"project2707","project2707.nex",2707,NA,"morphobank","training",64,35,35,12,23,0,TRUE,"",FALSE +"project2713","project2713.nex",2713,NA,"morphobank","training",14,16,16,3,23.7,0,TRUE,"",FALSE +"project2722","project2722.nex",2722,NA,"morphobank","training",385,520,519,4,30.9,3.1,TRUE,"",FALSE +"project2723","project2723.nex",2723,NA,"morphobank","training",56,97,95,6,12,1.6,TRUE,"",FALSE +"project2726","project2726.nex",2726,NA,"morphobank","training",24,71,54,5,6,0,TRUE,"",FALSE 
+"project2749","project2749.nex",2749,NA,"morphobank","training",64,163,163,4,42.1,0,TRUE,"",FALSE +"project2762","project2762.nex",2762,NA,"morphobank","training",29,187,177,5,34.8,17.1,TRUE,"",FALSE +"project2769","project2769.nex",2769,NA,"morphobank","training",102,219,218,5,10.9,3.5,TRUE,"",FALSE +"project277","project277.nex",277,NA,"morphobank","training",12,40,39,4,9.2,0,TRUE,"",FALSE +"project2770","project2770.nex",2770,NA,"morphobank","validation",55,307,307,4,37,2.7,TRUE,"",FALSE +"project2771","project2771.nex",2771,NA,"morphobank","training",94,124,123,8,1,30,TRUE,"",FALSE +"project2776","project2776.nex",2776,NA,"morphobank","training",96,270,270,7,52.5,0,TRUE,"",FALSE +"project2781","project2781.nex",2781,NA,"morphobank","training",58,202,199,6,40,0,TRUE,"",FALSE +"project2788_(1)","project2788 (1).nex",2788,1,"morphobank","training",33,106,106,3,40.4,0,TRUE,"",TRUE +"project2788_(2)","project2788 (2).nex",2788,2,"morphobank","training",34,106,106,3,40.1,0,TRUE,"",FALSE +"project2789","project2789.nex",2789,NA,"morphobank","training",72,75,75,4,13.7,7.1,TRUE,"",FALSE +"project2792","project2792.nex",2792,NA,"morphobank","training",93,230,219,10,19.8,13.7,TRUE,"",FALSE +"project2794","project2794.nex",2794,NA,"morphobank","training",113,170,170,5,39.5,2.2,TRUE,"",FALSE +"project2798__Ungulate_dental","project2798__Ungulate_dental.nex",2798,NA,"morphobank","training",76,92,91,4,20.3,0,TRUE,"",FALSE +"project2798_Gheerbrant_et_al._(2016)","project2798_Gheerbrant et al. (2016).nex",2798,2016,"morphobank","training",28,184,182,6,25.3,0,TRUE,"",FALSE +"project2798_Muizon_et_al._(2015)","project2798_Muizon et al. (2015).nex",2798,2015,"morphobank","training",73,426,426,6,29.9,0,TRUE,"",FALSE +"project2798_Tabuce_et_al._(2011)","project2798_Tabuce et al. 
(2011).nex",2798,2011,"morphobank","training",38,65,64,5,16.9,0,TRUE,"",FALSE +"project2799","project2799.nex",2799,NA,"morphobank","training",64,401,400,6,51.6,0,TRUE,"",FALSE +"project2800","project2800.nex",2800,NA,"morphobank","validation",54,225,217,5,48.7,9.7,TRUE,"",FALSE +"project2804","project2804.nex",2804,NA,"morphobank","training",86,76,74,4,5.3,9.8,TRUE,"",FALSE +"project2806_(1)","project2806 (1).nex",2806,1,"morphobank","training",37,6,6,10,27.9,0,TRUE,"",FALSE +"project2806_(2)","project2806 (2).nex",2806,2,"morphobank","training",37,165,162,7,5,20,TRUE,"",FALSE +"project2816","project2816.nex",2816,NA,"morphobank","training",57,323,323,4,37.9,2.7,TRUE,"",FALSE +"project291","project291.nex",291,NA,"morphobank","training",17,395,386,6,20.6,0,TRUE,"",FALSE +"project295","project295.nex",295,NA,"morphobank","validation",31,145,141,5,19.2,2.1,TRUE,"",FALSE +"project299","project299.nex",299,NA,"morphobank","training",30,144,129,9,7,4.6,TRUE,"",FALSE +"project3151","project3151.nex",3151,NA,"morphobank","training",20,107,107,6,18.8,0,TRUE,"",FALSE +"project3154","project3154.nex",3154,NA,"morphobank","training",33,209,193,6,35.6,4.2,TRUE,"",FALSE +"project316","project316.nex",316,NA,"morphobank","training",69,408,408,6,40.2,4.6,TRUE,"",FALSE +"project3165","project3165.nex",3165,NA,"morphobank","validation",28,59,59,5,41.2,0,TRUE,"",FALSE +"project3167","project3167.nex",3167,NA,"morphobank","training",13,47,46,11,35.1,0,TRUE,"",FALSE +"project3168","project3168.nex",3168,NA,"morphobank","training",90,415,415,6,45,1.3,TRUE,"",FALSE +"project3172","project3172.nex",3172,NA,"morphobank","training",43,227,223,4,49.4,0,TRUE,"",FALSE +"project3173","project3173.nex",3173,NA,"morphobank","training",95,419,419,5,43.9,0.2,TRUE,"",FALSE +"project3184","project3184.nex",3184,NA,"morphobank","training",39,52,52,5,16.9,2.6,TRUE,"",FALSE +"project3187","project3187.nex",3187,NA,"morphobank","training",25,101,84,4,14.8,0,TRUE,"",FALSE 
+"project3188","project3188.nex",3188,NA,"morphobank","training",30,77,76,4,27.8,0.8,TRUE,"",FALSE +"project3189","project3189.nex",3189,NA,"morphobank","training",47,211,206,7,23,14.4,TRUE,"",FALSE +"project3199","project3199.nex",3199,NA,"morphobank","training",88,168,138,3,0,1,TRUE,"",FALSE +"project3200","project3200.nex",3200,NA,"morphobank","validation",138,113,111,5,4.3,5.7,TRUE,"",FALSE +"project3203","project3203.nex",3203,NA,"morphobank","training",61,337,337,4,37.4,2.6,TRUE,"",FALSE +"project321","project321.nex",321,NA,"morphobank","training",81,661,656,6,38.6,0,TRUE,"",FALSE +"project3210","project3210.nex",3210,NA,"morphobank","validation",37,70,69,6,22.3,1,TRUE,"",FALSE +"project3211","project3211.nex",3211,NA,"morphobank","training",50,192,191,15,31.4,4.2,TRUE,"",FALSE +"project3212","project3212.nex",3212,NA,"morphobank","training",146,10,10,9,0.5,0,TRUE,"",FALSE +"project3216","project3216.nex",3216,NA,"morphobank","training",19,98,88,6,16.1,0.5,TRUE,"",FALSE +"project3234","project3234.nex",3234,NA,"morphobank","training",13,45,45,4,16.6,0.5,TRUE,"",FALSE +"project3239","project3239.nex",3239,NA,"morphobank","training",49,18,18,2,5.8,22.6,TRUE,"",FALSE +"project3244","project3244.nex",3244,NA,"morphobank","training",26,34,30,4,7.6,5.4,TRUE,"",FALSE +"project3249","project3249.nex",3249,NA,"morphobank","training",89,413,413,5,42.2,0,TRUE,"",FALSE +"project3253","project3253.nex",3253,NA,"morphobank","training",125,394,393,7,49,1.8,TRUE,"",FALSE +"project3260","project3260.nex",3260,NA,"morphobank","validation",18,74,74,8,26.9,3.4,TRUE,"",FALSE +"project3264","project3264.nex",3264,NA,"morphobank","training",66,303,301,9,41.4,1.6,TRUE,"",FALSE +"project3267","project3267.nex",3267,NA,"morphobank","training",68,355,352,5,61.3,0.3,TRUE,"",FALSE +"project3285","project3285.nex",3285,NA,"morphobank","validation",391,520,519,4,29.2,3.1,TRUE,"",FALSE 
+"project3287_Cassidulidae_complete","project3287_Cassidulidae_complete.nex",3287,NA,"morphobank","training",66,98,97,7,8.5,0.3,TRUE,"",FALSE +"project3287_Cassidulidae_without_partial_uncertainties","project3287_Cassidulidae_without partial uncertainties.nex",3287,NA,"morphobank","training",66,98,97,7,9.1,0.3,TRUE,"",FALSE +"project3293","project3293.nex",3293,NA,"morphobank","training",32,111,111,8,27.3,0,TRUE,"",FALSE +"project332","project332.nex",332,NA,"morphobank","training",22,107,105,3,25.7,0.3,TRUE,"",FALSE +"project3335","project3335.nex",3335,NA,"morphobank","validation",13,36,35,5,9.5,0,TRUE,"",FALSE +"project3345","project3345.nex",3345,NA,"morphobank","validation",44,77,70,8,1.5,11.2,TRUE,"",FALSE +"project3351","project3351.nex",3351,NA,"morphobank","training",34,143,138,7,23,0,TRUE,"",FALSE +"project3354_(1)","project3354 (1).nex",3354,1,"morphobank","training",78,18,18,8,11.8,10.3,TRUE,"",FALSE +"project3354_(2)","project3354 (2).nex",3354,2,"morphobank","training",78,121,120,8,25,4.6,TRUE,"",FALSE +"project3380","project3380.nex",3380,NA,"morphobank","validation",33,121,120,6,45.8,0.4,TRUE,"",FALSE +"project3381","project3381.nex",3381,NA,"morphobank","training",34,93,92,4,23.7,0.8,TRUE,"",FALSE +"project3384","project3384.nex",3384,NA,"morphobank","training",45,352,352,5,39.3,0,TRUE,"",FALSE +"project3385_(1)","project3385 (1).nex",3385,1,"morphobank","validation",96,555,454,5,55.8,21.3,TRUE,"",FALSE +"project3385_(2)","project3385 (2).nex",3385,2,"morphobank","validation",55,634,563,5,47,29,TRUE,"",FALSE +"project3392_(1)","project3392 (1).nex",3392,1,"morphobank","training",47,132,131,7,2.2,10.5,TRUE,"",TRUE +"project3392_(2)","project3392 (2).nex",3392,2,"morphobank","training",49,132,132,7,2.8,10.3,TRUE,"",FALSE +"project3392","project3392.nex",3392,NA,"morphobank","training",47,132,131,7,2.2,10.5,TRUE,"",FALSE +"project3400","project3400.nex",3400,NA,"morphobank","validation",24,38,36,5,11.6,0,TRUE,"WARNING: Could not parse character 
states; does each end with a ' or ;?.",FALSE +"project3405","project3405.nex",3405,NA,"morphobank","validation",100,324,321,3,8.4,13.6,TRUE,"",FALSE +"project3408","project3408.nex",3408,NA,"morphobank","training",19,30,24,4,19.5,0,TRUE,"",FALSE +"project3411","project3411.nex",3411,NA,"morphobank","training",84,530,528,8,39.4,16.9,TRUE,"",FALSE +"project3419","project3419.nex",3419,NA,"morphobank","training",40,368,343,7,5.4,0,TRUE,"",FALSE +"project3422","project3422.nex",3422,NA,"morphobank","training",110,278,277,6,41.2,4.4,TRUE,"",FALSE +"project3436","project3436.nex",3436,NA,"morphobank","training",99,245,245,7,31.3,10,TRUE,"",FALSE +"project3437","project3437.nex",3437,NA,"morphobank","training",64,89,89,7,17.4,19.1,TRUE,"",FALSE +"project3445","project3445.nex",3445,NA,"morphobank","validation",30,34,33,4,10.1,4.5,TRUE,"",FALSE +"project3448","project3448.nex",3448,NA,"morphobank","training",135,81,79,6,1.3,2.7,TRUE,"",FALSE +"project3456","project3456.nex",3456,NA,"morphobank","training",36,102,101,4,15.5,1.7,TRUE,"",FALSE +"project3466","project3466.nex",3466,NA,"morphobank","training",20,170,149,5,29.3,1.6,TRUE,"",FALSE +"project3470","project3470.nex",3470,NA,"morphobank","validation",55,303,299,9,35,1.7,TRUE,"",FALSE +"project3477","project3477.nex",3477,NA,"morphobank","training",21,109,109,6,16.5,0,TRUE,"",FALSE +"project3480","project3480.nex",3480,NA,"morphobank","validation",42,202,202,4,38.3,2.6,TRUE,"",FALSE +"project3489","project3489.nex",3489,NA,"morphobank","training",50,115,115,5,39.5,0.2,TRUE,"",FALSE +"project3497","project3497.nex",3497,NA,"morphobank","training",160,259,258,8,33,15.3,TRUE,"",FALSE +"project3501","project3501.nex",3501,NA,"morphobank","training",102,270,270,7,53.5,0,TRUE,"",FALSE +"project3508","project3508.nex",3508,NA,"morphobank","training",39,294,292,5,42.4,0,TRUE,"",FALSE +"project3509","project3509.nex",3509,NA,"morphobank","training",15,51,42,4,12.9,0.3,TRUE,"",FALSE +"project3512_(1)","project3512 
(1).nex",3512,1,"morphobank","training",75,45,45,16,3.1,3.3,TRUE,"",FALSE +"project3512_(2)","project3512 (2).nex",3512,2,"morphobank","training",72,54,54,20,2,4.5,TRUE,"",FALSE +"project3512_(3)","project3512 (3).nex",3512,3,"morphobank","training",63,77,76,22,8,3.6,TRUE,"",FALSE +"project352_(1)","project352 (1).nex",352,1,"morphobank","training",61,88,88,5,2.7,1.2,TRUE,"",FALSE +"project352_(2)","project352 (2).nex",352,2,"morphobank","training",59,27,27,4,17.3,4,TRUE,"",FALSE +"project352_(3)","project352 (3).nex",352,3,"morphobank","training",59,19,19,5,1.8,0,TRUE,"",FALSE +"project3520","project3520.nex",3520,NA,"morphobank","validation",102,324,324,10,28.2,6.3,TRUE,"",FALSE +"project3521","project3521.nex",3521,NA,"morphobank","training",93,156,156,4,47.7,0,TRUE,"",FALSE +"project3533","project3533.nex",3533,NA,"morphobank","training",42,85,84,2,18.9,4.4,TRUE,"",FALSE +"project3538","project3538.nex",3538,NA,"morphobank","training",99,138,121,8,4.3,9.7,TRUE,"",FALSE +"project3541_(1)","project3541 (1).nex",3541,1,"morphobank","training",24,74,74,4,26.3,0.3,TRUE,"",FALSE +"project3541_(2)","project3541 (2).nex",3541,2,"morphobank","training",22,74,74,4,28,0.3,TRUE,"",TRUE +"project3544","project3544.nex",3544,NA,"morphobank","training",34,120,115,4,27.1,0,TRUE,"",FALSE +"project3558","project3558.nex",3558,NA,"morphobank","training",86,59,59,5,19,10.2,TRUE,"",FALSE +"project3561","project3561.nex",3561,NA,"morphobank","training",36,110,109,5,4,3.2,TRUE,"",FALSE +"project3569","project3569.nex",3569,NA,"morphobank","training",43,97,96,5,24.2,0,TRUE,"",FALSE +"project3575","project3575.nex",3575,NA,"morphobank","validation",21,37,35,3,24.1,0,TRUE,"",FALSE +"project3581","project3581.nex",3581,NA,"morphobank","training",56,63,61,4,16.5,18,TRUE,"",FALSE +"project3587","project3587.nex",3587,NA,"morphobank","training",106,194,193,7,29.8,7.1,TRUE,"",FALSE +"project3592","project3592.nex",3592,NA,"morphobank","training",41,99,95,6,3.8,1.7,TRUE,"",FALSE 
+"project3597","project3597.nex",3597,NA,"morphobank","training",62,2,2,2,43.5,0,TRUE,"",FALSE +"project3599","project3599.nex",3599,NA,"morphobank","training",54,128,121,5,2.7,3.7,TRUE,"",FALSE +"project360","project360.nex",360,NA,"morphobank","validation",38,34,34,4,5.4,0,TRUE,"",FALSE +"project3601","project3601.nex",3601,NA,"morphobank","training",24,52,51,4,27.7,0,TRUE,"",FALSE +"project3602","project3602.nex",3602,NA,"morphobank","training",105,197,196,7,30,7,TRUE,"",FALSE +"project3603","project3603.nex",3603,NA,"morphobank","training",70,14,14,5,1.7,0.9,TRUE,"",FALSE +"project3613","project3613.nex",3613,NA,"morphobank","training",49,63,62,4,10.5,17.2,TRUE,"",FALSE +"project3617","project3617.nex",3617,NA,"morphobank","training",65,361,361,7,32.4,3.9,TRUE,"",FALSE +"project3619","project3619.nex",3619,NA,"morphobank","training",21,57,52,6,1.8,0.4,TRUE,"",FALSE +"project3621","project3621.nex",3621,NA,"morphobank","training",62,245,245,5,52.4,0,TRUE,"",FALSE +"project3625","project3625.nex",3625,NA,"morphobank","validation",27,57,55,4,1.6,0,TRUE,"",FALSE +"project3626","project3626.nex",3626,NA,"morphobank","training",22,57,57,7,18.6,0,TRUE,"",FALSE +"project3627","project3627.nex",3627,NA,"morphobank","training",22,74,74,5,28.6,0.3,TRUE,"",FALSE +"project3637","project3637.nex",3637,NA,"morphobank","training",86,530,528,8,39.1,17.1,TRUE,"",FALSE +"project3646","project3646.nex",3646,NA,"morphobank","training",69,202,193,6,0.9,9.1,TRUE,"",FALSE +"project365","project365.nex",365,NA,"morphobank","validation",22,75,75,6,23.2,2,TRUE,"",FALSE +"project3655","project3655.nex",3655,NA,"morphobank","validation",45,77,72,5,28.9,4.7,TRUE,"",FALSE +"project3656","project3656.nex",3656,NA,"morphobank","training",61,339,339,4,37.1,2.7,TRUE,"",FALSE +"project3664","project3664.nex",3664,NA,"morphobank","training",20,26,25,6,5.4,0,TRUE,"",FALSE +"project3665","project3665.nex",3665,NA,"morphobank","validation",39,297,297,5,15.3,3,TRUE,"",FALSE 
+"project367","project367.nex",367,NA,"morphobank","training",51,216,215,4,38.3,0,TRUE,"",FALSE +"project3670","project3670.nex",3670,NA,"morphobank","validation",62,120,115,8,10,9.1,TRUE,"",FALSE +"project3672_(1)","project3672 (1).nex",3672,1,"morphobank","training",11,54,35,6,7.8,0,TRUE,"",FALSE +"project3672_(2)","project3672 (2).nex",3672,2,"morphobank","training",11,54,35,6,8.8,0,TRUE,"",FALSE +"project3677","project3677.nex",3677,NA,"morphobank","training",50,98,97,4,19.4,0,TRUE,"",FALSE +"project3684","project3684.nex",3684,NA,"morphobank","training",36,248,244,6,33.9,1.5,TRUE,"",FALSE +"project3685","project3685.nex",3685,NA,"morphobank","validation",37,257,257,6,49.4,0,TRUE,"",FALSE +"project3687","project3687.nex",3687,NA,"morphobank","training",43,252,252,6,54.3,0,TRUE,"",FALSE +"project3688","project3688.nex",3688,NA,"morphobank","training",60,245,245,7,57.3,0,TRUE,"",FALSE +"project3695","project3695.nex",3695,NA,"morphobank","validation",40,45,44,4,8.6,0.2,TRUE,"",FALSE +"project3696","project3696.nex",3696,NA,"morphobank","training",22,22,21,4,5.8,0.2,TRUE,"",FALSE +"project3698","project3698.nex",3698,NA,"morphobank","training",20,62,52,4,14,0,TRUE,"",FALSE +"project3701","project3701.nex",3701,NA,"morphobank","training",146,324,324,10,30.9,15.1,TRUE,"",FALSE +"project3705","project3705.nex",3705,NA,"morphobank","validation",27,193,185,7,7.7,2.7,TRUE,"",FALSE +"project3707","project3707.nex",3707,NA,"morphobank","training",151,131,121,10,0.6,25.9,TRUE,"",FALSE +"project3708","project3708.nex",3708,NA,"morphobank","training",69,254,254,8,38.9,0,TRUE,"",FALSE +"project3709","project3709.nex",3709,NA,"morphobank","training",42,65,39,2,0,0,TRUE,"",FALSE +"project3710","project3710.nex",3710,NA,"morphobank","validation",115,65,39,2,0,0,TRUE,"",FALSE +"project3711","project3711.nex",3711,NA,"morphobank","training",79,132,130,4,3.6,13.7,TRUE,"",FALSE +"project3725","project3725.nex",3725,NA,"morphobank","validation",90,189,187,6,50.4,2.3,TRUE,"",FALSE 
+"project3726","project3726.nex",3726,NA,"morphobank","training",76,146,146,4,29.7,4.8,TRUE,"",FALSE +"project3730","project3730.nex",3730,NA,"morphobank","validation",21,107,107,6,17.8,0,TRUE,"",FALSE +"project3733","project3733.nex",3733,NA,"morphobank","training",157,853,841,8,65.5,0,TRUE,"",FALSE +"project3740","project3740.nex",3740,NA,"morphobank","validation",66,39,39,5,2.7,24.1,TRUE,"",FALSE +"project3741","project3741.nex",3741,NA,"morphobank","training",86,110,107,6,4.3,14.7,TRUE,"",FALSE +"project3742","project3742.nex",3742,NA,"morphobank","training",9,16,13,4,9.4,0.9,TRUE,"",FALSE +"project3755","project3755.nex",3755,NA,"morphobank","validation",46,201,201,5,8.4,4.4,TRUE,"",FALSE +"project3756","project3756.nex",3756,NA,"morphobank","training",34,69,69,4,36.3,0.6,TRUE,"",FALSE +"project3757","project3757.nex",3757,NA,"morphobank","training",59,61,60,7,6,3.6,TRUE,"",FALSE +"project3760_(1)","project3760 (1).nex",3760,1,"morphobank","validation",130,509,506,7,49.7,0,TRUE,"",FALSE +"project3760_(2)","project3760 (2).nex",3760,2,"morphobank","validation",130,509,506,12,49.7,0,TRUE,"",TRUE +"project3763","project3763.nex",3763,NA,"morphobank","training",205,105,103,6,10.8,8.5,TRUE,"",FALSE +"project3766","project3766.nex",3766,NA,"morphobank","training",89,286,283,8,31.7,1.5,TRUE,"",FALSE +"project3768","project3768.nex",3768,NA,"morphobank","training",79,214,213,7,29.6,1.6,TRUE,"",FALSE +"project3769","project3769.nex",3769,NA,"morphobank","training",76,123,120,6,0.3,1.8,TRUE,"",FALSE +"project3773","project3773.nex",3773,NA,"morphobank","training",194,823,812,6,59.4,4.3,TRUE,"",FALSE +"project3782","project3782.nex",3782,NA,"morphobank","training",83,163,163,6,33.9,0,TRUE,"",FALSE +"project3785","project3785.nex",3785,NA,"morphobank","validation",21,27,26,4,29.9,3.3,TRUE,"",FALSE +"project3794","project3794.nex",3794,NA,"morphobank","training",24,65,65,8,18.9,3.8,TRUE,"",FALSE 
+"project380","project380.nex",380,NA,"morphobank","validation",17,164,152,5,17.8,1,TRUE,"",FALSE +"project3804","project3804.nex",3804,NA,"morphobank","training",54,117,113,7,11.6,19.3,TRUE,"",FALSE +"project3806","project3806.nex",3806,NA,"morphobank","training",202,746,746,7,72.3,1.2,TRUE,"",FALSE +"project3807","project3807.nex",3807,NA,"morphobank","training",96,83,81,10,7.8,5.6,TRUE,"",FALSE +"project3812","project3812.nex",3812,NA,"morphobank","training",98,568,568,7,55.9,0,TRUE,"",FALSE +"project3818","project3818.nex",3818,NA,"morphobank","training",49,206,206,4,41.2,2.7,TRUE,"",FALSE +"project3825","project3825.nex",3825,NA,"morphobank","validation",136,37,18,10,12.2,43.4,TRUE,"",FALSE +"project383","project383.nex",383,NA,"morphobank","training",27,84,84,3,10.9,1.6,TRUE,"",FALSE +"project3831","project3831.nex",3831,NA,"morphobank","training",46,134,128,4,27.1,0,TRUE,"",FALSE +"project3832","project3832.nex",3832,NA,"morphobank","training",10,27,17,3,2.4,0.6,TRUE,"",FALSE +"project3833","project3833.nex",3833,NA,"morphobank","training",48,69,68,5,3.2,1.7,TRUE,"",FALSE +"project3854","project3854.nex",3854,NA,"morphobank","training",89,188,186,9,50.4,2.3,TRUE,"",FALSE +"project386","project386.nex",386,NA,"morphobank","training",10,21,19,3,3.7,0,TRUE,"",FALSE +"project3868","project3868.nex",3868,NA,"morphobank","training",20,42,34,3,10.6,1.9,TRUE,"",FALSE +"project3874","project3874.nex",3874,NA,"morphobank","training",54,125,125,6,16.5,45.5,TRUE,"",FALSE +"project3887_(1)","project3887 (1).nex",3887,1,"morphobank","training",55,275,272,8,43.6,14.2,TRUE,"",FALSE +"project3887_(2)","project3887 (2).nex",3887,2,"morphobank","training",196,823,823,6,58.8,4.2,TRUE,"",FALSE +"project3894","project3894.nex",3894,NA,"morphobank","training",58,148,128,8,4.6,0,TRUE,"",FALSE +"project3896","project3896.nex",3896,NA,"morphobank","training",72,207,201,4,2.5,8.6,TRUE,"",FALSE 
+"project3898","project3898.nex",3898,NA,"morphobank","training",85,143,143,8,11.9,0,TRUE,"",FALSE +"project3906","project3906.nex",3906,NA,"morphobank","training",54,58,58,8,18.6,5.3,TRUE,"",FALSE +"project3908","project3908.nex",3908,NA,"morphobank","training",51,364,364,5,40.2,1.3,TRUE,"",FALSE +"project3910","project3910.nex",3910,NA,"morphobank","validation",135,28,28,2,13,0,TRUE,"",FALSE +"project3914","project3914.nex",3914,NA,"morphobank","training",13,86,76,5,15.9,0,TRUE,"",FALSE +"project3916","project3916.nex",3916,NA,"morphobank","training",25,140,70,5,2.1,0.7,TRUE,"",FALSE +"project3927","project3927.nex",3927,NA,"morphobank","training",63,154,154,7,39.6,15.4,TRUE,"",FALSE +"project3929","project3929.nex",3929,NA,"morphobank","training",40,130,122,5,14,0.8,TRUE,"",FALSE +"project3930","project3930.nex",3930,NA,"morphobank","validation",32,84,84,5,12.1,6,TRUE,"",FALSE +"project3931","project3931.nex",3931,NA,"morphobank","training",115,287,287,4,49.9,2.3,TRUE,"",FALSE +"project3932","project3932.nex",3932,NA,"morphobank","training",72,170,169,5,37.1,2.3,TRUE,"",FALSE +"project3933","project3933.nex",3933,NA,"morphobank","training",21,43,42,4,44.2,0.2,TRUE,"",FALSE +"project3934","project3934.nex",3934,NA,"morphobank","training",85,418,418,5,42.6,0,TRUE,"",FALSE +"project3935","project3935.nex",3935,NA,"morphobank","validation",10,36,30,5,27.7,0,TRUE,"",FALSE +"project3936","project3936.nex",3936,NA,"morphobank","training",42,170,166,5,29.8,2,TRUE,"",FALSE +"project3938","project3938.nex",3938,NA,"morphobank","training",119,677,677,6,52.6,4.3,TRUE,"",FALSE +"project3939","project3939.nex",3939,NA,"morphobank","training",32,57,57,7,35.4,0,TRUE,"",FALSE +"project3941","project3941.nex",3941,NA,"morphobank","training",80,600,600,6,45.1,4.2,TRUE,"",FALSE +"project3942","project3942.nex",3942,NA,"morphobank","training",33,102,94,4,16.4,3.9,TRUE,"",FALSE +"project3943","project3943.nex",3943,NA,"morphobank","training",121,551,548,7,52.6,0,TRUE,"",FALSE 
+"project3951_(1)","project3951 (1).nex",3951,1,"morphobank","training",41,107,106,7,34.3,1.3,TRUE,"",FALSE +"project3951_(2)","project3951 (2).nex",3951,2,"morphobank","training",1,1,1,0,0,0,TRUE,"",FALSE +"project3951_(3)","project3951 (3).nex",3951,3,"morphobank","training",1,1,1,0,0,0,TRUE,"",FALSE +"project3955","project3955.nex",3955,NA,"morphobank","validation",76,395,394,8,60.5,0.5,TRUE,"",FALSE +"project3958","project3958.nex",3958,NA,"morphobank","training",79,284,268,8,42.3,8.4,TRUE,"",FALSE +"project3964","project3964.nex",3964,NA,"morphobank","training",79,419,419,6,52.3,1.6,TRUE,"",FALSE +"project3970","project3970.nex",3970,NA,"morphobank","validation",68,339,339,4,39.5,2.4,TRUE,"",FALSE +"project3978","project3978.nex",3978,NA,"morphobank","training",58,164,164,4,18.1,1.6,TRUE,"",FALSE +"project3989","project3989.nex",3989,NA,"morphobank","training",25,181,170,6,13.9,6.8,TRUE,"",FALSE +"project4010_(1)","project4010 (1).nex",4010,1,"morphobank","validation",28,112,112,3,2.5,0,TRUE,"",TRUE +"project4010_(2)","project4010 (2).nex",4010,2,"morphobank","validation",40,112,112,3,28.2,0,TRUE,"",FALSE +"project4010_(3)","project4010 (3).nex",4010,3,"morphobank","validation",28,112,112,3,3.2,0,TRUE,"",TRUE +"project402","project402.nex",402,NA,"morphobank","training",32,80,80,6,0.2,18,TRUE,"",FALSE +"project4034","project4034.nex",4034,NA,"morphobank","training",37,218,214,3,63.5,0.1,TRUE,"",FALSE +"project4044","project4044.nex",4044,NA,"morphobank","training",30,93,83,2,8.7,4.5,TRUE,"",FALSE +"project4049","project4049.nex",4049,NA,"morphobank","training",60,721,719,5,22.2,0,TRUE,"",FALSE +"project4056","project4056.nex",4056,NA,"morphobank","training",16,568,560,11,37.2,4.2,TRUE,"",FALSE +"project4066","project4066.nex",4066,NA,"morphobank","training",26,27,26,5,4.7,1.5,TRUE,"",FALSE +"project407","project407.nex",407,NA,"morphobank","training",23,25,25,4,6.1,4.2,TRUE,"",FALSE 
+"project4074","project4074.nex",4074,NA,"morphobank","training",12,26,24,3,12.8,0.7,TRUE,"",FALSE +"project4077","project4077.nex",4077,NA,"morphobank","training",52,101,100,9,9.9,0,TRUE,"",FALSE +"project4078","project4078.nex",4078,NA,"morphobank","training",80,192,190,5,17.3,22.4,TRUE,"",FALSE +"project408","project408.nex",408,NA,"morphobank","training",27,77,75,3,7.7,0,TRUE,"",FALSE +"project4085","project4085.nex",4085,NA,"morphobank","validation",164,716,716,7,58.2,4,TRUE,"",FALSE +"project4087","project4087.nex",4087,NA,"morphobank","training",27,71,60,5,9,0,TRUE,"",FALSE +"project4091","project4091.nex",4091,NA,"morphobank","training",7,25,20,3,22.1,2.9,TRUE,"",FALSE +"project4095","project4095.nex",4095,NA,"morphobank","validation",21,26,26,4,11,0.2,TRUE,"",FALSE +"project4103","project4103.nex",4103,NA,"morphobank","training",144,159,152,6,1.3,6.1,TRUE,"",FALSE +"project4104","project4104.nex",4104,NA,"morphobank","training",64,92,88,5,52.9,1.1,TRUE,"",FALSE +"project4111","project4111.nex",4111,NA,"morphobank","training",74,102,100,5,0,7.1,TRUE,"",FALSE +"project4112","project4112.nex",4112,NA,"morphobank","training",30,100,92,5,19.1,20.6,TRUE,"",FALSE +"project4119","project4119.nex",4119,NA,"morphobank","training",32,69,66,4,41.6,1.7,TRUE,"",FALSE +"project4123_(1)","project4123 (1).nex",4123,1,"morphobank","training",39,187,187,3,32.1,5.8,TRUE,"",FALSE +"project4123_(2)","project4123 (2).nex",4123,2,"morphobank","training",39,173,173,4,33.7,5.3,TRUE,"",FALSE +"project4125","project4125.nex",4125,NA,"morphobank","validation",59,156,155,5,51.3,0,TRUE,"",FALSE +"project4126","project4126.nex",4126,NA,"morphobank","training",30,106,101,3,32,0,TRUE,"",FALSE +"project4133","project4133.nex",4133,NA,"morphobank","training",131,349,349,5,31.3,6,TRUE,"",FALSE +"project4135","project4135.nex",4135,NA,"morphobank","validation",29,78,77,4,20.9,0.5,TRUE,"",FALSE +"project4138","project4138.nex",4138,NA,"morphobank","training",131,45,45,3,20.2,0,TRUE,"",FALSE 
+"project4146_(1)","project4146 (1).nex",4146,1,"morphobank","training",57,129,129,6,17.1,46,TRUE,"",FALSE +"project4146_(2)","project4146 (2).nex",4146,2,"morphobank","training",56,129,129,6,16.5,46.2,TRUE,"",TRUE +"project4146_(3)","project4146 (3).nex",4146,3,"morphobank","training",59,130,130,7,18.1,45.6,TRUE,"",FALSE +"project4146_(4)","project4146 (4).nex",4146,4,"morphobank","training",59,130,130,7,18.4,45.6,TRUE,"",TRUE +"project4146_(5)","project4146 (5).nex",4146,5,"morphobank","training",56,131,130,6,16.3,46.1,TRUE,"",FALSE +"project4146_(6)","project4146 (6).nex",4146,6,"morphobank","training",56,130,130,6,16.2,46.1,TRUE,"",FALSE +"project4146_(7)","project4146 (7).nex",4146,7,"morphobank","training",56,130,129,6,16.5,46.3,TRUE,"",TRUE +"project4146_(8)","project4146 (8).nex",4146,8,"morphobank","training",56,129,129,6,16.5,46.2,TRUE,"",TRUE +"project4147","project4147.nex",4147,NA,"morphobank","training",71,153,150,7,36.5,14.4,TRUE,"",FALSE +"project4149","project4149.nex",4149,NA,"morphobank","training",40,178,178,4,17.4,1.1,TRUE,"",FALSE +"project4163","project4163.nex",4163,NA,"morphobank","training",33,72,72,4,44.7,0,TRUE,"",FALSE +"project4166","project4166.nex",4166,NA,"morphobank","training",63,355,355,4,23.4,5.6,TRUE,"",FALSE +"project4168","project4168.nex",4168,NA,"morphobank","training",43,46,45,5,6.8,0,TRUE,"",FALSE +"project4169","project4169.nex",4169,NA,"morphobank","training",34,88,88,4,45.9,0.3,TRUE,"",FALSE +"project417","project417.nex",417,NA,"morphobank","training",12,39,37,4,20.7,1.4,TRUE,"",FALSE +"project4171","project4171.nex",4171,NA,"morphobank","training",13,39,38,4,31.6,1.8,TRUE,"",FALSE +"project4173","project4173.nex",4173,NA,"morphobank","training",81,155,136,8,0.3,13.2,TRUE,"",FALSE +"project4174","project4174.nex",4174,NA,"morphobank","training",13,30,22,4,7.3,0,TRUE,"",FALSE +"project4176","project4176.nex",4176,NA,"morphobank","training",148,22,22,5,20.6,0,TRUE,"",FALSE 
+"project4181","project4181.nex",4181,NA,"morphobank","training",54,219,219,4,17.1,5.8,TRUE,"",FALSE +"project4182","project4182.nex",4182,NA,"morphobank","training",22,29,29,4,9.1,6.1,TRUE,"",FALSE +"project4183","project4183.nex",4183,NA,"morphobank","training",36,20,14,4,0,7.5,TRUE,"",FALSE +"project4184","project4184.nex",4184,NA,"morphobank","training",106,435,435,9,61.4,0.9,TRUE,"",FALSE +"project4185","project4185.nex",4185,NA,"morphobank","validation",41,88,85,4,49.8,1.1,TRUE,"",FALSE +"project4186","project4186.nex",4186,NA,"morphobank","training",48,33,33,8,4,0,TRUE,"",FALSE +"project4187","project4187.nex",4187,NA,"morphobank","training",10,7,7,3,1.4,1.4,TRUE,"",FALSE +"project4190","project4190.nex",4190,NA,"morphobank","validation",50,89,82,3,2.9,0,TRUE,"",FALSE +"project4192_(1)","project4192 (1).nex",4192,1,"morphobank","training",42,104,101,5,16.1,3.7,TRUE,"",FALSE +"project4192_(2)","project4192 (2).nex",4192,2,"morphobank","training",42,104,101,5,16.1,3.7,TRUE,"",TRUE +"project4204","project4204.nex",4204,NA,"morphobank","training",163,37,37,2,3.7,0.8,TRUE,"",FALSE +"project4210","project4210.nex",4210,NA,"morphobank","validation",43,235,234,3,65.9,0,TRUE,"",FALSE +"project4220","project4220.nex",4220,NA,"morphobank","validation",47,48,45,4,3.4,9.6,TRUE,"",FALSE +"project423","project423.nex",423,NA,"morphobank","training",60,253,219,5,12.2,15.4,TRUE,"",FALSE +"project4230","project4230.nex",4230,NA,"morphobank","validation",125,302,302,4,29.4,8,TRUE,"",FALSE +"project4235","project4235.nex",4235,NA,"morphobank","validation",13,93,90,3,16.3,0,TRUE,"",FALSE +"project4255","project4255.nex",4255,NA,"morphobank","validation",24,106,106,5,28.3,0,TRUE,"",FALSE +"project4263","project4263.nex",4263,NA,"morphobank","training",4,35,7,3,7.1,0,TRUE,"",FALSE +"project4264_(1)","project4264 (1).nex",4264,1,"morphobank","training",112,441,441,6,66.2,0.3,TRUE,"",FALSE +"project4264_(2)","project4264 
(2).nex",4264,2,"morphobank","training",104,394,394,10,61.5,1.1,TRUE,"",FALSE +"project4265","project4265.nex",4265,NA,"morphobank","validation",13,82,72,4,13.5,1,TRUE,"",FALSE +"project427","project427.nex",427,NA,"morphobank","training",223,364,364,10,41.6,3.9,TRUE,"",FALSE +"project4271_Modified_Herrera_et_al._(2021)","project4271_Modified Herrera et al. (2021).nex",4271,2021,"morphobank","training",169,519,519,7,52.3,3.6,TRUE,"",FALSE +"project4271_Modified_Wilberg_et_al._2019","project4271_Modified Wilberg et al. 2019.nex",4271,NA,"morphobank","training",105,410,410,6,40.7,4.1,TRUE,"",FALSE +"project4278","project4278.nex",4278,NA,"morphobank","training",78,214,213,7,31.1,0,TRUE,"",FALSE +"project4281","project4281.nex",4281,NA,"morphobank","training",61,146,145,8,43.8,3.8,TRUE,"",FALSE +"project4284","project4284.nex",4284,NA,"morphobank","training",4062,27,27,5,82.9,2.6,TRUE,"",FALSE +"project4285","project4285.nex",4285,NA,"morphobank","validation",81,155,136,8,0.3,13.2,TRUE,"",FALSE +"project4286","project4286.nex",4286,NA,"morphobank","training",63,135,135,7,18.7,46.7,TRUE,"",FALSE +"project4288","project4288.nex",4288,NA,"morphobank","training",14,37,35,4,15.7,0,TRUE,"",FALSE +"project429","project429.nex",429,NA,"morphobank","training",36,65,49,4,0.5,11.4,TRUE,"",FALSE +"project4291_(1)","project4291 (1).nex",4291,1,"morphobank","training",63,246,246,4,40.6,1.5,TRUE,"",TRUE +"project4291_(2)","project4291 (2).nex",4291,2,"morphobank","training",66,246,246,4,43.8,1.5,TRUE,"",FALSE +"project4291_(3)","project4291 (3).nex",4291,3,"morphobank","training",78,246,246,4,51,1.2,TRUE,"",FALSE +"project4291","project4291.nex",4291,NA,"morphobank","training",78,246,246,4,51,1.2,TRUE,"",FALSE +"project4299_(1)","project4299 (1).nex",4299,1,"morphobank","training",15,34,31,4,19.8,0,TRUE,"",FALSE +"project4299_(2)","project4299 (2).nex",4299,2,"morphobank","training",16,34,33,4,22.9,0,TRUE,"",FALSE +"project4299_(3)","project4299 
(3).nex",4299,3,"morphobank","training",18,33,32,4,20.3,0,TRUE,"",FALSE +"project4299_(4)","project4299 (4).nex",4299,4,"morphobank","training",24,33,33,4,28.4,0,TRUE,"",FALSE +"project4300","project4300.nex",4300,NA,"morphobank","validation",158,717,717,7,57.9,3.9,TRUE,"",FALSE +"project4304","project4304.nex",4304,NA,"morphobank","training",29,91,91,5,19.6,0,TRUE,"",FALSE +"project4305","project4305.nex",4305,NA,"morphobank","validation",36,65,62,4,10.2,0.3,TRUE,"",FALSE +"project4306","project4306.nex",4306,NA,"morphobank","training",73,244,233,8,41.1,10.2,TRUE,"",FALSE +"project4307_(1)","project4307 (1).nex",4307,1,"morphobank","training",71,246,237,7,41.2,9.6,TRUE,"",TRUE +"project4307_(2)","project4307 (2).nex",4307,2,"morphobank","training",72,246,237,7,41.7,9.6,TRUE,"",FALSE +"project4308","project4308.nex",4308,NA,"morphobank","training",27,68,65,6,22.6,0,TRUE,"",FALSE +"project4309","project4309.nex",4309,NA,"morphobank","training",16,68,65,6,11.8,0,TRUE,"",FALSE +"project431","project431.nex",431,NA,"morphobank","training",64,141,141,4,30,5.7,TRUE,"",FALSE +"project4310","project4310.nex",4310,NA,"morphobank","validation",48,46,40,5,19,2.7,TRUE,"",FALSE +"project4311","project4311.nex",4311,NA,"morphobank","training",4,35,7,3,7.1,0,TRUE,"",FALSE +"project4313","project4313.nex",4313,NA,"morphobank","training",41,125,124,6,27.7,0,TRUE,"",FALSE +"project4315","project4315.nex",4315,NA,"morphobank","validation",55,74,74,8,27,2.7,TRUE,"",FALSE +"project4317","project4317.nex",4317,NA,"morphobank","training",98,284,275,8,50.7,5.8,TRUE,"",FALSE +"project4318","project4318.nex",4318,NA,"morphobank","training",15,27,23,4,24.1,0,TRUE,"",FALSE +"project4319","project4319.nex",4319,NA,"morphobank","training",65,52,46,4,4.3,0,TRUE,"",FALSE +"project4326","project4326.nex",4326,NA,"morphobank","training",25,57,47,3,8.2,0,TRUE,"",FALSE +"project4327","project4327.nex",4327,NA,"morphobank","training",197,823,823,6,58.3,4.3,TRUE,"",FALSE 
+"project4328","project4328.nex",4328,NA,"morphobank","training",27,60,57,4,23.7,0,TRUE,"",FALSE +"project4329","project4329.nex",4329,NA,"morphobank","training",47,4,4,4,0,0,TRUE,"",FALSE +"project4332","project4332.nex",4332,NA,"morphobank","training",47,359,359,5,41.4,0,TRUE,"",FALSE +"project4333","project4333.nex",4333,NA,"morphobank","training",58,223,222,4,38.6,0,TRUE,"",FALSE +"project4335","project4335.nex",4335,NA,"morphobank","validation",49,359,359,5,40.8,0,TRUE,"",FALSE +"project4340","project4340.nex",4340,NA,"morphobank","validation",78,214,213,7,31.1,0,TRUE,"",FALSE +"project4348","project4348.nex",4348,NA,"morphobank","training",87,142,141,4,20.8,6.3,TRUE,"",FALSE +"project4356","project4356.nex",4356,NA,"morphobank","training",18,43,42,5,10.6,1.7,TRUE,"",FALSE +"project4358","project4358.nex",4358,NA,"morphobank","training",104,140,134,5,5.6,0.3,TRUE,"",FALSE +"project4359","project4359.nex",4359,NA,"morphobank","training",71,245,146,7,83.8,3.1,TRUE,"",FALSE +"project4363","project4363.nex",4363,NA,"morphobank","training",36,76,71,5,3.1,1.8,TRUE,"",FALSE +"project4364","project4364.nex",4364,NA,"morphobank","training",21,40,40,4,10.4,0.6,TRUE,"",FALSE +"project4372","project4372.nex",4372,NA,"morphobank","training",25,57,57,7,22.7,0,TRUE,"",FALSE +"project4376","project4376.nex",4376,NA,"morphobank","training",17,31,22,3,14.4,0,TRUE,"",FALSE +"project4377","project4377.nex",4377,NA,"morphobank","training",160,182,153,4,0.7,5.6,TRUE,"",FALSE +"project4390","project4390.nex",4390,NA,"morphobank","validation",27,109,108,6,15.4,0,TRUE,"",FALSE +"project4392","project4392.nex",4392,NA,"morphobank","training",55,265,261,6,50.6,1.2,TRUE,"",FALSE +"project4396","project4396.nex",4396,NA,"morphobank","training",19,48,42,4,23.2,0,TRUE,"",FALSE +"project4397","project4397.nex",4397,NA,"morphobank","training",75,223,222,4,32.3,4.6,TRUE,"",FALSE +"project44","project44.nex",44,NA,"morphobank","training",27,46,45,7,7.2,0,TRUE,"",FALSE 
+"project4400","project4400.nex",4400,NA,"morphobank","validation",99,419,419,5,44.8,0.2,TRUE,"",FALSE +"project4405","project4405.nex",4405,NA,"morphobank","validation",74,215,214,4,33.9,4.5,TRUE,"",FALSE +"project4406","project4406.nex",4406,NA,"morphobank","training",42,115,108,4,8.9,23.1,TRUE,"",FALSE +"project441","project441.nex",441,NA,"morphobank","training",61,231,227,6,10.1,10.4,TRUE,"",FALSE +"project4411","project4411.nex",4411,NA,"morphobank","training",121,443,443,6,56.5,0,TRUE,"",FALSE +"project4415","project4415.nex",4415,NA,"morphobank","validation",28,87,87,4,30.5,2.3,TRUE,"",FALSE +"project4416","project4416.nex",4416,NA,"morphobank","training",7,9,9,2,9.5,0,TRUE,"",FALSE +"project4417","project4417.nex",4417,NA,"morphobank","training",24,63,63,7,6.6,0,TRUE,"",FALSE +"project4420","project4420.nex",4420,NA,"morphobank","validation",68,61,60,5,4.6,9.9,TRUE,"",FALSE +"project4421","project4421.nex",4421,NA,"morphobank","training",34,86,77,4,21.8,6.5,TRUE,"",FALSE +"project4422","project4422.nex",4422,NA,"morphobank","training",67,93,93,7,18.3,19.7,TRUE,"",FALSE +"project4430_(1)","project4430 (1).nex",4430,1,"morphobank","validation",121,176,169,8,3.9,5,TRUE,"",FALSE +"project4431","project4431.nex",4431,NA,"morphobank","training",40,123,122,6,25.7,0,TRUE,"",FALSE +"project4434","project4434.nex",4434,NA,"morphobank","training",110,130,130,7,11.1,12.2,TRUE,"",FALSE +"project4445","project4445.nex",4445,NA,"morphobank","validation",104,268,268,6,44.7,1.6,TRUE,"",FALSE +"project4446_(1)","project4446 (1).nex",4446,1,"morphobank","training",199,1773,1742,2,79.9,0,TRUE,"",FALSE +"project4446_(2)","project4446 (2).nex",4446,2,"morphobank","training",153,860,859,8,65.6,0,TRUE,"",FALSE +"project4449","project4449.nex",4449,NA,"morphobank","training",105,268,267,6,44.3,1.8,TRUE,"",FALSE +"project4458","project4458.nex",4458,NA,"morphobank","training",25,81,81,4,25,0.1,TRUE,"",FALSE 
+"project4460","project4460.nex",4460,NA,"morphobank","validation",61,167,167,6,43.5,8.6,TRUE,"",FALSE +"project4461","project4461.nex",4461,NA,"morphobank","training",44,95,95,5,34.5,0,TRUE,"",FALSE +"project4467","project4467.nex",4467,NA,"morphobank","training",47,48,45,4,3.4,9.6,TRUE,"",FALSE +"project4469","project4469.nex",4469,NA,"morphobank","training",110,287,287,4,46.5,2.8,TRUE,"",FALSE +"project4473","project4473.nex",4473,NA,"morphobank","training",37,87,87,5,17.5,0,TRUE,"",FALSE +"project449","project449.nex",449,NA,"morphobank","training",24,43,43,4,25.9,0,TRUE,"",FALSE +"project4495","project4495.nex",4495,NA,"morphobank","validation",22,61,61,4,27.2,2.5,TRUE,"",FALSE +"project4496","project4496.nex",4496,NA,"morphobank","training",28,83,83,4,33.3,0,TRUE,"",FALSE +"project4499","project4499.nex",4499,NA,"morphobank","training",66,96,91,6,12.3,2,TRUE,"",FALSE +"project45","project45.nex",45,NA,"morphobank","validation",40,65,58,4,23.9,2.9,TRUE,"",FALSE +"project450","project450.nex",450,NA,"morphobank","validation",40,14,14,4,0.5,0,TRUE,"",FALSE +"project4501","project4501.nex",4501,NA,"morphobank","training",24,42,41,6,4.1,13.1,TRUE,"",FALSE +"project4516","project4516.nex",4516,NA,"morphobank","training",70,41,41,6,0.6,0,TRUE,"",FALSE +"project4517","project4517.nex",4517,NA,"morphobank","training",99,285,282,8,33.8,1.4,TRUE,"",FALSE +"project4531","project4531.nex",4531,NA,"morphobank","training",71,256,252,8,40.8,0,TRUE,"",FALSE +"project4532_(1)","project4532 (1).nex",4532,1,"morphobank","training",33,74,72,4,21.1,12.1,TRUE,"",FALSE +"project4532_(2)","project4532 (2).nex",4532,2,"morphobank","training",33,72,71,4,21.3,11.2,TRUE,"",FALSE +"project4532_(3)","project4532 (3).nex",4532,3,"morphobank","training",32,74,72,4,19.7,12.5,TRUE,"",TRUE +"project4532_(4)","project4532 (4).nex",4532,4,"morphobank","training",33,138,101,4,19.1,18.1,TRUE,"",FALSE +"project4532_(5)","project4532 
(5).nex",4532,5,"morphobank","training",33,74,72,6,21.3,12,TRUE,"",TRUE +"project4532_(6)","project4532 (6).nex",4532,6,"morphobank","training",33,138,101,4,19.1,18.1,TRUE,"",TRUE +"project4533","project4533.nex",4533,NA,"morphobank","training",50,95,93,9,11.8,6.6,TRUE,"",FALSE +"project4542","project4542.nex",4542,NA,"morphobank","training",20,27,27,3,31.7,0,TRUE,"",FALSE +"project4545","project4545.nex",4545,NA,"morphobank","validation",26,31,31,3,16.6,0,TRUE,"",FALSE +"project4550","project4550.nex",4550,NA,"morphobank","validation",230,889,889,8,60.2,4.1,TRUE,"",FALSE +"project4553","project4553.nex",4553,NA,"morphobank","training",72,244,244,8,38.3,0,TRUE,"",FALSE +"project456","project456.nex",456,NA,"morphobank","training",148,146,144,18,16.1,21.3,TRUE,"",FALSE +"project4580","project4580.nex",4580,NA,"morphobank","validation",109,676,676,6,49,4.6,TRUE,"",FALSE +"project4581","project4581.nex",4581,NA,"morphobank","training",72,323,323,5,51,2.8,TRUE,"",FALSE +"project4596","project4596.nex",4596,NA,"morphobank","training",98,35,35,5,23.4,7.1,TRUE,"",FALSE +"project4598","project4598.nex",4598,NA,"morphobank","training",74,103,70,2,2.3,1.4,TRUE,"",FALSE +"project46","project46.nex",46,NA,"morphobank","training",80,368,315,7,33.5,0,TRUE,"",FALSE +"project4614","project4614.nex",4614,NA,"morphobank","training",112,287,287,4,46.6,2.9,TRUE,"",FALSE +"project4620","project4620.nex",4620,NA,"morphobank","validation",19,37,35,3,21.4,0,TRUE,"",FALSE +"project4622","project4622.nex",4622,NA,"morphobank","training",11,16,11,3,3.3,0,TRUE,"",FALSE +"project4624","project4624.nex",4624,NA,"morphobank","training",76,510,510,8,38.1,2.7,TRUE,"",FALSE +"project4626","project4626.nex",4626,NA,"morphobank","training",63,33,16,10,66.3,9,TRUE,"",FALSE +"project463","project463.nex",463,NA,"morphobank","training",60,227,227,4,21.3,5.7,TRUE,"",FALSE +"project4634","project4634.nex",4634,NA,"morphobank","training",41,92,90,6,33.5,2.6,TRUE,"",FALSE 
+"project4637","project4637.nex",4637,NA,"morphobank","training",106,90,90,8,43.3,0.8,TRUE,"",FALSE +"project4649","project4649.nex",4649,NA,"morphobank","training",82,127,119,6,18,0.7,TRUE,"",FALSE +"project466_(1)","project466 (1).nex",466,1,"morphobank","training",7,151,118,8,6.9,0,TRUE,"",FALSE +"project466_(2)","project466 (2).nex",466,2,"morphobank","training",7,151,119,9,7.8,0,TRUE,"",FALSE +"project466_(3)","project466 (3).nex",466,3,"morphobank","training",7,151,120,10,6.4,0,TRUE,"",FALSE +"project466_(4)","project466 (4).nex",466,4,"morphobank","training",7,151,118,8,5.6,0,TRUE,"",FALSE +"project466_(5)","project466 (5).nex",466,5,"morphobank","training",7,151,113,9,6.2,0,TRUE,"",FALSE +"project466_(6)","project466 (6).nex",466,6,"morphobank","training",7,151,122,10,5.3,0,TRUE,"",FALSE +"project4661","project4661.nex",4661,NA,"morphobank","training",101,230,228,8,58.3,7.7,TRUE,"",FALSE +"project4671","project4671.nex",4671,NA,"morphobank","training",62,83,83,6,24,0,TRUE,"",FALSE +"project4672","project4672.nex",4672,NA,"morphobank","training",22,27,27,3,23.1,3,TRUE,"",FALSE +"project4675","project4675.nex",4675,NA,"morphobank","validation",48,105,105,4,37.5,0.3,TRUE,"",FALSE +"project4680","project4680.nex",4680,NA,"morphobank","validation",80,180,179,8,47.5,8.5,TRUE,"",FALSE +"project470","project470.nex",470,NA,"morphobank","validation",14,48,47,4,6.2,0,TRUE,"",FALSE +"project4712","project4712.nex",4712,NA,"morphobank","training",27,110,107,4,24,0,TRUE,"",FALSE +"project4747","project4747.nex",4747,NA,"morphobank","training",25,15,15,4,12,0,TRUE,"",FALSE +"project4761","project4761.nex",4761,NA,"morphobank","training",58,370,369,6,29.7,4.5,TRUE,"",FALSE +"project4789","project4789.nex",4789,NA,"morphobank","training",13,12,10,4,5.4,0,TRUE,"",FALSE +"project4790","project4790.nex",4790,NA,"morphobank","validation",16,32,32,4,12.7,0,TRUE,"",FALSE +"project48","project48.nex",48,NA,"morphobank","training",80,690,658,6,29.3,9.3,TRUE,"",FALSE 
+"project4817","project4817.nex",4817,NA,"morphobank","training",101,267,264,7,14.9,38.8,TRUE,"",FALSE +"project482","project482.nex",482,NA,"morphobank","training",44,69,69,4,19.6,0.9,TRUE,"",FALSE +"project484","project484.nex",484,NA,"morphobank","training",20,50,50,4,22.2,0,TRUE,"",FALSE +"project485","project485.nex",485,NA,"morphobank","validation",82,413,413,5,37.8,3.3,TRUE,"",FALSE +"project4867","project4867.nex",4867,NA,"morphobank","training",60,138,138,3,40.3,9.3,TRUE,"",FALSE +"project488","project488.nex",488,NA,"morphobank","training",38,75,75,10,23,0,TRUE,"",FALSE +"project489","project489.nex",489,NA,"morphobank","training",46,243,243,8,13.2,39.4,TRUE,"",FALSE +"project4910","project4910.nex",4910,NA,"morphobank","validation",26,160,156,12,32.6,0,TRUE,"",FALSE +"project493","project493.nex",493,NA,"morphobank","training",35,290,289,5,36.3,0,TRUE,"",FALSE +"project495","project495.nex",495,NA,"morphobank","validation",19,66,66,3,13,0,TRUE,"",FALSE +"project496","project496.nex",496,NA,"morphobank","training",74,408,408,6,47.3,0,TRUE,"",FALSE +"project497.1","project497.1.nex",497,NA,"morphobank","training",NA,NA,NA,NA,NA,NA,FALSE,"WARNING: no non-missing arguments to max; returning -Inf ; ERROR: values must be type 'integer', + but FUN(X[[1]]) result is type 'double'",FALSE +"project497.2","project497.2.nex",497,NA,"morphobank","training",NA,NA,NA,NA,NA,NA,FALSE,"WARNING: no non-missing arguments to max; returning -Inf ; ERROR: values must be type 'integer', + but FUN(X[[1]]) result is type 'double'",FALSE +"project506","project506.nex",506,NA,"morphobank","training",30,137,133,5,12.3,0,TRUE,"",FALSE +"project5099","project5099.nex",5099,NA,"morphobank","training",53,15,15,4,2.4,1.5,TRUE,"",FALSE +"project510","project510.nex",510,NA,"morphobank","validation",188,2954,2857,12,22.1,0,TRUE,"",FALSE +"project5186","project5186.nex",5186,NA,"morphobank","training",43,41,40,4,0.9,0,TRUE,"",FALSE 
+"project5201","project5201.nex",5201,NA,"morphobank","training",86,71,71,5,14.2,27.6,TRUE,"",FALSE +"project5228","project5228.nex",5228,NA,"morphobank","training",59,146,126,8,0.8,0,TRUE,"",FALSE +"project5230","project5230.nex",5230,NA,"morphobank","validation",71,40,40,6,0.4,3.8,TRUE,"",FALSE +"project5255","project5255.nex",5255,NA,"morphobank","validation",13,9,9,3,6,0,TRUE,"",FALSE +"project5268","project5268.nex",5268,NA,"morphobank","training",30,46,45,3,13.6,0,TRUE,"",FALSE +"project528","project528.nex",528,NA,"morphobank","training",44,99,98,4,17.3,0,TRUE,"",FALSE +"project529","project529.nex",529,NA,"morphobank","training",27,107,106,5,19.9,0,TRUE,"",FALSE +"project530_(1)","project530 (1).nex",530,1,"morphobank","validation",20,39,38,4,6.6,0,TRUE,"",FALSE +"project530_(2)","project530 (2).nex",530,2,"morphobank","validation",23,90,89,5,18.9,0,TRUE,"",FALSE +"project532","project532.nex",532,NA,"morphobank","training",21,674,427,9,15.9,2,TRUE,"",FALSE +"project5327","project5327.nex",5327,NA,"morphobank","training",55,135,133,7,24.7,4.5,TRUE,"",FALSE +"project537","project537.nex",537,NA,"morphobank","training",30,58,58,3,25.9,5.1,TRUE,"",FALSE +"project538","project538.nex",538,NA,"morphobank","training",11,19,19,4,10.5,0,TRUE,"",FALSE +"project539","project539.nex",539,NA,"morphobank","training",22,51,50,5,7.9,1.4,TRUE,"",FALSE +"project540","project540.nex",540,NA,"morphobank","validation",55,114,113,6,15.9,12.4,TRUE,"",FALSE +"project541","project541.nex",541,NA,"morphobank","training",33,71,71,3,24.6,4,TRUE,"",FALSE +"project542","project542.nex",542,NA,"morphobank","training",24,43,43,4,13.4,3.4,TRUE,"",FALSE +"project549","project549.nex",549,NA,"morphobank","training",84,395,384,9,28.2,23.4,TRUE,"",FALSE +"project553","project553.nex",553,NA,"morphobank","training",NA,NA,NA,NA,NA,NA,FALSE,"WARNING: no non-missing arguments to max; returning -Inf ; ERROR: values must be type 'integer', + but FUN(X[[1]]) result is type 'double'",FALSE 
+"project561","project561.nex",561,NA,"morphobank","training",34,356,329,6,5,9.6,TRUE,"",FALSE +"project563","project563.nex",563,NA,"morphobank","training",82,50,49,6,19.9,4.3,TRUE,"",FALSE +"project567","project567.nex",567,NA,"morphobank","training",24,86,84,5,11.9,0,TRUE,"",FALSE +"project568","project568.nex",568,NA,"morphobank","training",45,81,80,10,18.4,3.5,TRUE,"",FALSE +"project569","project569.nex",569,NA,"morphobank","training",22,60,58,5,11.5,0.9,TRUE,"",FALSE +"project571","project571.nex",571,NA,"morphobank","training",42,125,125,5,16.8,4.2,TRUE,"",FALSE +"project574","project574.nex",574,NA,"morphobank","training",19,97,97,14,27,0.2,TRUE,"",FALSE +"project578","project578.nex",578,NA,"morphobank","training",23,166,163,5,25.5,2.7,TRUE,"",FALSE +"project581","project581.nex",581,NA,"morphobank","training",85,301,301,5,39.3,0,TRUE,"",FALSE +"project586","project586.nex",586,NA,"morphobank","training",36,80,80,3,26.4,5.6,TRUE,"",FALSE +"project589","project589.nex",589,NA,"morphobank","training",69,135,124,8,2.2,18.5,TRUE,"",FALSE +"project599","project599.nex",599,NA,"morphobank","training",18,60,51,5,0,0.8,TRUE,"",FALSE +"project600","project600.nex",600,NA,"morphobank","validation",21,60,51,5,0,0.7,TRUE,"",FALSE +"project608","project608.nex",608,NA,"morphobank","training",97,313,259,10,52.7,0,TRUE,"",FALSE +"project610","project610.nex",610,NA,"morphobank","validation",47,69,66,9,8.6,1.4,TRUE,"",FALSE +"project611","project611.nex",611,NA,"morphobank","training",23,66,65,4,28.2,0,TRUE,"",FALSE +"project618","project618.nex",618,NA,"morphobank","training",17,42,42,10,2.9,26.9,TRUE,"",FALSE +"project619","project619.nex",619,NA,"morphobank","training",41,89,78,7,3.9,9,TRUE,"",FALSE +"project622","project622.nex",622,NA,"morphobank","training",29,65,54,7,4.1,2.1,TRUE,"",FALSE +"project623","project623.nex",623,NA,"morphobank","training",37,84,73,7,3.9,7.2,TRUE,"",FALSE 
+"project624","project624.nex",624,NA,"morphobank","training",34,80,69,7,3.8,7.6,TRUE,"",FALSE +"project625","project625.nex",625,NA,"morphobank","validation",106,258,236,8,18.7,15.3,TRUE,"",FALSE +"project628","project628.nex",628,NA,"morphobank","training",15,50,50,3,31.1,0,TRUE,"",FALSE +"project631","project631.nex",631,NA,"morphobank","training",44,253,155,4,14.7,12.4,TRUE,"",FALSE +"project632_(1)","project632 (1).nex",632,1,"morphobank","training",42,34,32,8,15.7,0,TRUE,"",FALSE +"project632_(2)","project632 (2).nex",632,2,"morphobank","training",52,54,54,5,7.9,2.4,TRUE,"",FALSE +"project633","project633.nex",633,NA,"morphobank","training",12,41,30,3,1.7,0,TRUE,"",FALSE +"project635","project635.nex",635,NA,"morphobank","validation",19,20,15,3,0,0,TRUE,"",FALSE +"project638","project638.nex",638,NA,"morphobank","training",71,115,102,5,0.4,3.7,TRUE,"",FALSE +"project640","project640.nex",640,NA,"morphobank","validation",27,53,52,3,1.4,1.6,TRUE,"",FALSE +"project641","project641.nex",641,NA,"morphobank","training",31,95,81,5,0.8,3,TRUE,"",FALSE +"project643","project643.nex",643,NA,"morphobank","training",11,28,23,4,2.8,0,TRUE,"",FALSE +"project647","project647.nex",647,NA,"morphobank","training",15,56,49,4,1.1,3.1,TRUE,"",FALSE +"project648","project648.nex",648,NA,"morphobank","training",21,19,17,5,0.6,0.3,TRUE,"",FALSE +"project652","project652.nex",652,NA,"morphobank","training",56,224,224,4,49.9,0,TRUE,"",FALSE +"project657","project657.nex",657,NA,"morphobank","training",54,99,95,5,10.9,0,TRUE,"",FALSE +"project660","project660.nex",660,NA,"morphobank","validation",117,477,477,8,56,2.9,TRUE,"",FALSE +"project667","project667.nex",667,NA,"morphobank","training",65,259,254,4,41.1,3.2,TRUE,"",FALSE +"project674","project674.nex",674,NA,"morphobank","training",18,54,54,4,20.1,0.5,TRUE,"",FALSE +"project675","project675.nex",675,NA,"morphobank","validation",16,52,52,4,0.1,2.9,TRUE,"",FALSE 
+"project676","project676.nex",676,NA,"morphobank","training",27,59,57,4,19.9,1.8,TRUE,"",FALSE +"project681","project681.nex",681,NA,"morphobank","training",22,50,40,4,2.3,0,TRUE,"",FALSE +"project682","project682.nex",682,NA,"morphobank","training",94,78,78,4,28.7,0,TRUE,"",FALSE +"project683","project683.nex",683,NA,"morphobank","training",19,71,69,5,25.3,1.7,TRUE,"",FALSE +"project684","project684.nex",684,NA,"morphobank","training",52,303,298,9,33.1,1.8,TRUE,"",FALSE +"project687","project687.nex",687,NA,"morphobank","training",90,272,271,5,37.8,4.4,TRUE,"",FALSE +"project689_(1)","project689 (1).nex",689,1,"morphobank","training",76,183,173,8,37.9,11.9,TRUE,"",TRUE +"project689_(2)","project689 (2).nex",689,2,"morphobank","training",109,183,173,8,37.4,12.3,TRUE,"",FALSE +"project691","project691.nex",691,NA,"morphobank","training",103,446,443,6,43.4,0,TRUE,"",FALSE +"project692","project692.nex",692,NA,"morphobank","training",71,408,408,6,40.7,4.5,TRUE,"",FALSE +"project694","project694.nex",694,NA,"morphobank","training",46,286,286,9,17.5,3.6,TRUE,"",FALSE +"project696","project696.nex",696,NA,"morphobank","training",34,35,35,7,0,4.5,TRUE,"",FALSE +"project699","project699.nex",699,NA,"morphobank","training",47,175,170,7,37.6,0,TRUE,"",FALSE +"project701","project701.nex",701,NA,"morphobank","training",35,12,12,4,1,0,TRUE,"",FALSE +"project706","project706.nex",706,NA,"morphobank","training",9,114,85,4,8.5,2.6,TRUE,"",FALSE +"project709","project709.nex",709,NA,"morphobank","training",31,38,38,4,20.1,6.3,TRUE,"",FALSE +"project713","project713.nex",713,NA,"morphobank","training",32,334,333,7,43.1,0.3,TRUE,"",FALSE +"project715","project715.nex",715,NA,"morphobank","validation",23,68,68,5,10.4,0,TRUE,"",FALSE +"project717","project717.nex",717,NA,"morphobank","training",29,101,100,5,30.4,0,TRUE,"",FALSE +"project721","project721.nex",721,NA,"morphobank","training",19,68,68,4,29.3,0,TRUE,"",FALSE 
+"project723","project723.nex",723,NA,"morphobank","training",22,72,65,5,34.3,0.1,TRUE,"",FALSE +"project724","project724.nex",724,NA,"morphobank","training",37,114,114,8,27.9,8.9,TRUE,"",FALSE +"project727","project727.nex",727,NA,"morphobank","training",15,56,52,3,14.6,0,TRUE,"",FALSE +"project728","project728.nex",728,NA,"morphobank","training",59,98,97,5,11.6,0,TRUE,"",FALSE +"project730","project730.nex",730,NA,"morphobank","validation",27,77,75,3,12.1,1.9,TRUE,"",FALSE +"project735","project735.nex",735,NA,"morphobank","validation",37,90,89,5,17,0,TRUE,"",FALSE +"project739","project739.nex",739,NA,"morphobank","training",38,261,258,6,23.7,2.1,TRUE,"",FALSE +"project740","project740.nex",740,NA,"morphobank","validation",89,78,78,6,0.3,0.1,TRUE,"",FALSE +"project741","project741.nex",741,NA,"morphobank","training",27,206,199,3,58.6,0.1,TRUE,"",FALSE +"project742","project742.nex",742,NA,"morphobank","training",46,71,70,6,0.2,0.2,TRUE,"",FALSE +"project743","project743.nex",743,NA,"morphobank","training",23,43,43,18,23.5,0.7,TRUE,"",FALSE +"project746","project746.nex",746,NA,"morphobank","training",77,348,348,5,58.2,0,TRUE,"",FALSE +"project748","project748.nex",748,NA,"morphobank","training",60,138,138,3,40.3,9.3,TRUE,"",FALSE +"project749","project749.nex",749,NA,"morphobank","training",25,53,53,12,5.3,1.9,TRUE,"",FALSE +"project750","project750.nex",750,NA,"morphobank","validation",34,240,240,5,21.8,0,TRUE,"",FALSE +"project751","project751.nex",751,NA,"morphobank","training",52,193,192,5,15.3,1.9,TRUE,"",FALSE +"project758","project758.nex",758,NA,"morphobank","training",28,74,71,5,11.7,0.9,TRUE,"",FALSE +"project776","project776.nex",776,NA,"morphobank","training",69,232,231,5,35.6,5.2,TRUE,"",FALSE +"project779","project779.nex",779,NA,"morphobank","training",51,118,117,9,24.9,0,TRUE,"",FALSE +"project780_(1)","project780 (1).nex",780,1,"morphobank","validation",63,104,103,11,24.1,4.5,TRUE,"",TRUE +"project780_(2)","project780 
(2).nex",780,2,"morphobank","validation",66,104,101,8,27.1,4.4,TRUE,"",FALSE +"project784","project784.nex",784,NA,"morphobank","training",188,2,2,9,0,5.6,TRUE,"",FALSE +"project790","project790.nex",790,NA,"morphobank","validation",108,210,208,20,16.8,16.5,TRUE,"",FALSE +"project793","project793.nex",793,NA,"morphobank","training",51,253,179,5,18.2,11.7,TRUE,"",FALSE +"project794","project794.nex",794,NA,"morphobank","training",47,213,204,9,5.6,10.1,TRUE,"",FALSE +"project798","project798.nex",798,NA,"morphobank","training",73,282,278,8,24.4,1.7,TRUE,"",FALSE +"project802","project802.nex",802,NA,"morphobank","training",26,73,71,4,39.9,0,TRUE,"",FALSE +"project804","project804.nex",804,NA,"morphobank","training",173,589,569,10,32.8,30.9,TRUE,"",FALSE +"project805","project805.nex",805,NA,"morphobank","validation",7,16,10,2,10,5.7,TRUE,"",FALSE +"project806","project806.nex",806,NA,"morphobank","training",58,82,82,8,14.3,16.1,TRUE,"",FALSE +"project809","project809.nex",809,NA,"morphobank","training",41,90,81,4,3.9,2.6,TRUE,"",FALSE +"project810","project810.nex",810,NA,"morphobank","validation",16,40,33,9,10,0,TRUE,"",FALSE +"project811","project811.nex",811,NA,"morphobank","training",64,97,89,17,16.1,0,TRUE,"",FALSE +"project816","project816.nex",816,NA,"morphobank","training",23,35,34,5,4.2,3.2,TRUE,"",FALSE +"project825","project825.nex",825,NA,"morphobank","validation",33,131,129,6,21.6,0.4,TRUE,"",FALSE +"project826","project826.nex",826,NA,"morphobank","training",33,218,213,3,61.7,0.1,TRUE,"",FALSE +"project831","project831.nex",831,NA,"morphobank","training",21,49,41,6,12.8,2.4,TRUE,"",FALSE +"project833","project833.nex",833,NA,"morphobank","training",36,6,6,3,0,0,TRUE,"",FALSE +"project84","project84.nex",84,NA,"morphobank","training",14,39,38,4,20.5,3.9,TRUE,"",FALSE +"project847","project847.nex",847,NA,"morphobank","training",38,126,123,6,11.2,15.7,TRUE,"",FALSE 
+"project849","project849.nex",849,NA,"morphobank","training",22,47,46,5,17.6,0,TRUE,"",FALSE +"project854","project854.nex",854,NA,"morphobank","training",33,201,200,4,41.4,3,TRUE,"",FALSE +"project858_(1)","project858 (1).nex",858,1,"morphobank","training",30,115,115,4,10.8,0.6,TRUE,"",FALSE +"project858_(2)","project858 (2).nex",858,2,"morphobank","training",56,58,57,5,22,1.5,TRUE,"",FALSE +"project861","project861.nex",861,NA,"morphobank","training",141,32,32,4,0.7,0,TRUE,"",FALSE +"project869","project869.nex",869,NA,"morphobank","training",47,175,170,7,37.8,0,TRUE,"WARNING: Could not parse character states; does each end with a ' or ;?.",FALSE +"project870","project870.nex",870,NA,"morphobank","validation",37,74,73,5,23.1,0,TRUE,"",FALSE +"project871","project871.nex",871,NA,"morphobank","training",28,111,102,7,22.6,0,TRUE,"",FALSE +"project876","project876.nex",876,NA,"morphobank","training",44,137,132,5,36.3,0,TRUE,"",FALSE +"project896","project896.nex",896,NA,"morphobank","training",27,22,22,4,6.9,4.7,TRUE,"",FALSE +"project906","project906.nex",906,NA,"morphobank","training",24,177,164,6,8.9,7,TRUE,"",FALSE +"project908","project908.nex",908,NA,"morphobank","training",30,177,174,6,16.6,6.3,TRUE,"",FALSE +"project912","project912.nex",912,NA,"morphobank","training",173,74,74,9,19,3.7,TRUE,"",FALSE +"project922","project922.nex",922,NA,"morphobank","training",40,94,86,7,14.7,5.7,TRUE,"",FALSE +"project923","project923.nex",923,NA,"morphobank","training",28,46,1,0,0,0,TRUE,"",FALSE +"project929_(1)","project929 (1).nex",929,1,"morphobank","training",38,258,256,6,23.3,1.7,TRUE,"",FALSE +"project929_(2)","project929 (2).nex",929,2,"morphobank","training",38,258,256,6,22.9,1.7,TRUE,"",TRUE +"project931","project931.nex",931,NA,"morphobank","training",13,23,22,3,9.8,0,TRUE,"",FALSE +"project936","project936.nex",936,NA,"morphobank","training",23,33,33,7,2.1,4,TRUE,"",FALSE 
+"project937","project937.nex",937,NA,"morphobank","training",30,83,81,5,24.1,0,TRUE,"",FALSE +"project938","project938.nex",938,NA,"morphobank","training",35,83,81,5,18.4,0.5,TRUE,"",FALSE +"project944","project944.nex",944,NA,"morphobank","training",25,72,72,4,17.2,0.9,TRUE,"",FALSE +"project945","project945.nex",945,NA,"morphobank","validation",64,102,99,5,5,6.9,TRUE,"",FALSE +"project947","project947.nex",947,NA,"morphobank","training",80,220,220,7,28.6,0,TRUE,"",FALSE +"project950","project950.nex",950,NA,"morphobank","validation",12,9,9,3,3.7,1.9,TRUE,"",FALSE +"project954","project954.nex",954,NA,"morphobank","training",83,75,75,5,15.4,1.7,TRUE,"",FALSE +"project955","project955.nex",955,NA,"morphobank","validation",26,66,66,3,37.6,0,TRUE,"",FALSE +"project960","project960.nex",960,NA,"morphobank","validation",21,37,37,4,20.6,0,TRUE,"",FALSE +"project961","project961.nex",961,NA,"morphobank","training",24,33,28,3,5.4,0,TRUE,"",FALSE +"project964","project964.nex",964,NA,"morphobank","training",24,98,90,4,29.1,8,TRUE,"",FALSE +"project970","project970.nex",970,NA,"morphobank","validation",157,1844,1346,6,52.3,2.8,TRUE,"",FALSE +"project971_(1)","project971 (1).nex",971,1,"morphobank","training",26,101,73,6,53.3,0.5,TRUE,"",FALSE +"project971_(2)","project971 (2).nex",971,2,"morphobank","training",26,47,38,5,43.7,0.9,TRUE,"",FALSE +"project977_(1)","project977 (1).nex",977,1,"morphobank","training",14,234,212,6,38.4,0,TRUE,"",FALSE +"project977_(2)","project977 (2).nex",977,2,"morphobank","training",14,234,212,6,38.4,0,TRUE,"",FALSE +"project979","project979.nex",979,NA,"morphobank","training",119,477,419,8,58.2,3.2,TRUE,"",FALSE +"project984","project984.nex",984,NA,"morphobank","training",28,205,203,3,55.6,0.1,TRUE,"",FALSE +"project987","project987.nex",987,NA,"morphobank","training",108,122,117,7,21.5,6.5,TRUE,"",FALSE +"project996","project996.nex",996,NA,"morphobank","training",53,70,70,5,39.9,0,TRUE,"",FALSE 
+"project997","project997.nex",997,NA,"morphobank","training",66,80,76,4,0.9,3,TRUE,"",FALSE +"syab07200","syab07200.nex",NA,NA,"syab","training",39,297,297,5,15.3,3,TRUE,"",FALSE +"syab07201","syab07201.nex",NA,NA,"syab","training",125,2954,2813,10,28.3,0,TRUE,"",FALSE +"syab07202","syab07202.nex",NA,NA,"syab","training",111,360,359,7,45.1,2,TRUE,"",FALSE +"syab07203","syab07203.nex",NA,NA,"syab","training",50,196,191,5,5.1,4.6,TRUE,"",FALSE +"syab07204","syab07204.nex",NA,NA,"syab","training",225,748,748,2,53,4.5,TRUE,"",FALSE +"syab07205","syab07205.nex",NA,NA,"syab","training",206,748,748,2,52.4,4.1,TRUE,"",FALSE +"syab07206","syab07206.nex",NA,NA,"syab","training",117,538,535,6,52,0,TRUE,"",FALSE diff --git a/dev/benchmarks/memory_profile_results.md b/dev/benchmarks/memory_profile_results.md new file mode 100644 index 000000000..100338b5f --- /dev/null +++ b/dev/benchmarks/memory_profile_results.md @@ -0,0 +1,189 @@ +# Phase 3D: Memory Layout Profiling Results + +Date: 2026-03-16 +Platform: Windows, R 4.5.2, GCC 14.2.0 +CPU: Intel (L1 32 KB, L2 256 KB typical) + +## 1. Baseline Measurements + +### TBR pass phase breakdown + +All timings in microseconds (μs), averaged over 3 random trees per dataset. 
+ +| Dataset | Tips | Blocks | Words | Clips | Candidates | Clip+Incr (μs) | Indirect (μs) | Unclip (μs) | +|---------|------|--------|-------|-------|------------|-----------------|----------------|-------------| +| Vinther2008 | 23 | 6 | 28 | 38 | 3,585 | 789 | 286 | 268 | +| Agnarsson2004 | 62 | 8 | 59 | 112 | 56,501 | 2,948 | 5,175 | 856 | +| synth_20 | 20 | 4 | 11 | 34 | 2,535 | 271 | 65 | 93 | +| synth_50 | 50 | 4 | 12 | 91 | 32,776 | 1,021 | 989 | 314 | +| synth_100 | 100 | 4 | 12 | 190 | 237,536 | 3,880 | 7,999 | 1,013 | +| synth_200 | 200 | 4 | 12 | 377 | 1,090,533 | 11,238 | 35,930 | 2,695 | + +### Time fraction breakdown + +| Dataset | Tips | % Clip+Incr | % Indirect | % Unclip | +|---------|------|-------------|------------|----------| +| synth_20 | 20 | 63.2 | 15.1 | 21.7 | +| synth_50 | 50 | 43.9 | 42.6 | 13.5 | +| synth_100 | 100 | 30.1 | 62.0 | 7.9 | +| synth_200 | 200 | 22.5 | 72.1 | 5.4 | + +**Conclusion:** Indirect scoring dominates at scale (72% at 200 tips). The clip+incremental +phase dominates at small scales because the incremental downpass is O(depth) ≈ O(n) for +small trees (depth ≈ n), while indirect evaluation is O(n²). + +### Per-candidate indirect timing + +| Dataset | Tips | total_words | Candidates | ns/candidate | +|---------|------|-------------|------------|--------------| +| Vinther2008 | 23 | 28 | 3,585 | 79.9 | +| Agnarsson2004 | 62 | 59 | 56,501 | 91.6 | +| synth_20 | 20 | 11 | 2,535 | 25.6 | +| synth_50 | 50 | 12 | 32,776 | 30.2 | +| synth_100 | 100 | 12 | 237,536 | 33.7 | +| synth_200 | 200 | 12 | 1,090,533 | 32.9 | + +**Conclusion:** Per-candidate cost is stable across tree sizes (~33 ns for `total_words=12`), +confirming that cache effects are not increasing per-candidate cost. The cost scales linearly +with `total_words` (28 words → 80 ns, 59 words → 92 ns). 
+ +### Scaling analysis + +- Indirect time scaling exponent: **2.78** (vs expected 2.0 for O(n²)) +- Candidate count scaling exponent: **2.66** +- The super-quadratic scaling is primarily from candidate count growth (2.66), + not from per-candidate cost degradation (stable at ~33 ns). +- The extra 0.12 exponent may come from TBR rerooting generating O(k) sub-edges + per clip, where k is subtree size. + +### Snapshot overhead + +| Tips | Save (μs) | Restore (μs) | Size (KB) | +|------|-----------|---------------|-----------| +| 20 | 0.3 | 0.3 | 14.6 | +| 50 | 1.1 | 1.1 | 40.2 | +| 100 | 2.5 | 2.3 | 80.8 | +| 200 | 5.4 | 5.0 | 162.1 | + +**Conclusion:** Snapshot save/restore is negligible — 5 μs per operation at 200 tips, +compared to 36 ms for indirect evaluation. StateSnapshot optimization (Step 6) is not +worth pursuing. + +## 2. Steps Investigated and Decisions + +### Step 3: Postorder node renumbering — SKIPPED + +Analysis of node-ID strides during postorder traversal (50-tip tree): +- Mean stride: 34.6 node IDs (~52 cache lines at `total_words=12`) +- Max stride: 93 node IDs (~140 cache lines) + +However, the downpass is **not the hot path** — it's only 22% of time at 200 tips. The +state arrays fit comfortably in L2 (prelim for 200 tips = 37 KB; total state data ≈ 162 +KB). Since the bottleneck is indirect scoring (which uses vroot_cache with linear access), +postorder renumbering would not improve the hot path. + +**Decision:** Not implemented. Cost/benefit ratio unfavorable. + +### Step 4: Binary-character specialization — SKIPPED + +Block `n_states` values for typical datasets: +- Vinther2008: 4, 4, 5, 5, 5, 5 (total_words=28) +- Agnarsson2004: 7, 7, 7, 7, 7, 8, 8, 8 (total_words=59) +- synth_200 (binary+NA): 3, 3, 3, 3 (total_words=12) + +`n_states` per block is determined by the **total number of applicable states in the +contrast matrix**, not by individual character state coverage. All standard blocks share +the same `n_states`. 
Binary characters contribute to blocks with the full `n_states` +because `state_remap` assigns globally consecutive indices. + +**Decision:** Per-block unrolling for binary characters is not possible with the current +block structure. Changing this would require per-block state counts, which is a deep +architectural change. Not worth it for Phase 3D. + +Verified: all inner loops correctly iterate `blk.n_states` (not `total_words`). No bug. + +### Step 5: Block-major layout — SKIPPED + +The vroot_cache (Phase 2B) already provides linear access for the indirect scoring hot +path. Per-candidate cost is stable across tree sizes, confirming no cache pressure issue. +State arrays for 200 tips fit in L2 (162 KB total). + +For morphological data (the target use case), `total_words` is small (12-59) and trees +rarely exceed 500 tips. Block-major layout would add complexity without measurable benefit. + +**Decision:** Not implemented. Experiment not justified by profiling data. + +### Step 6: StateSnapshot reduction — SKIPPED + +Snapshot overhead is <0.01% of TBR pass time at scale. Not worth optimizing. + +## 3. Optimizations Applied + +### Postorder save/restore in TBR (ts_tbr.cpp) + +After `spr_unclip()`, the tree topology is identical to before `spr_clip()`, so the +postorder traversal is the same. Previously, `build_postorder()` (O(n) DFS with vector +allocations) was called to reconstruct it. Now the pre-clip postorder is saved and +restored via `assign()` (O(n) memcpy, no allocation). + +Similarly, after `state_snap.restore()` on rejection, the postorder is already restored +by the snapshot's memcpy. The redundant `build_postorder()` calls were removed. + +**Changes:** +- Save `tree.postorder` before `spr_clip()`, restore after `spr_unclip()` +- Remove 2 redundant `build_postorder()` calls after `state_snap.restore()` + +**Impact:** Eliminates ~377 `build_postorder()` calls per TBR pass at 200 tips. Each call +saves O(n) DFS traversal plus 2 vector allocations. 
Estimated savings: 1-3% of the +unclip phase. The benefit is modest because unclip is only 5% of total TBR pass time; +the real bottleneck (indirect scoring at 72%) is addressed by Phase 3E (SIMD). + +## 4. Implications for Future Phases + +### Phase 3E (SIMD) — highest priority + +The profiling clearly shows that the **indirect scoring inner loop** is the primary target +for optimization. At 200 tips, it consumes 72% of TBR pass time. The inner loop is: + +```cpp +for (int b = 0; b < ds.n_blocks; ++b) { + uint64_t any_hit = 0; + for (int s = 0; s < blk.n_states; ++s) { + any_hit |= (clip_prelim[offset+s] & vroot[offset+s]); + } + uint64_t needs_step = ~any_hit & blk.active_mask; + extra_steps += blk.weight * popcount64(needs_step); +} +``` + +This is a textbook SIMD target: independent AND/OR operations over contiguous uint64_t +arrays. SSE2 can process 2 words per instruction, AVX2 can process 4. With `n_states` +typically 3-8 per block, even 2× throughput from SSE2 would be significant. + +### Algorithmic improvements + +The candidate count scaling exponent (2.66 > 2.0) suggests that TBR rerooting generates +more candidates than pure SPR. Reducing the candidate set (e.g., tighter bounds on which +rerootings to try) could reduce the constant factor. + +## 5. 
Files Created/Modified + +### Created: +- `dev/benchmarks/bench_memory.R` — profiling harness +- `dev/benchmarks/memory_profile_results.md` — this file +- `tests/testthat/test-ts-memory-layout.R` — 32 regression tests + +### Modified: +- `src/ts_rcpp.cpp` — added `ts_bench_tbr_phases` diagnostic (append only), plus + two new `#include` directives +- `src/TreeSearch-init.c` — registered `ts_bench_tbr_phases` (7 args) +- `src/ts_tbr.cpp` — postorder save/restore optimization (3 changes) +- `R/RcppExports.R` — regenerated via `Rcpp::compileAttributes()` +- `src/RcppExports.cpp` — regenerated + +### Test status: +- memory-layout: 32/32 passing +- driven: 53/53 passing +- tbr-bench: 26/26 passing +- fuse: 16/16 passing (1 skip) +- sector: 32/32 passing diff --git a/dev/benchmarks/nblocks_cost_bench.csv b/dev/benchmarks/nblocks_cost_bench.csv new file mode 100644 index 000000000..95fc7e0f2 --- /dev/null +++ b/dev/benchmarks/nblocks_cost_bench.csv @@ -0,0 +1,46 @@ +"file","ntax","nchar","n_blocks","total_words","seed","n_candidates","ns_per_cand","time_indirect_us","time_clip_us","time_rescore_us" +"project2144.nex",109,123,3,16,1,282145,17.8029027627638,5023,2168,25 +"project2144.nex",109,123,3,16,2,322536,17.5205248406379,5651,2286,21 +"project2144.nex",109,123,3,16,3,250546,18.1204249918179,4540,2431,22 +"project2144.nex",109,123,3,16,4,332963,17.5214663491139,5834,2251,22 +"project2144.nex",109,123,3,16,5,283110,17.8905725689661,5065,2408,22 +"project987.nex",108,114,4,30,1,269474,23.8316126973289,6422,2409,30 +"project987.nex",108,114,4,30,2,314216,23.436107645696,7364,2298,29 +"project987.nex",108,114,4,30,3,245212,25.6961323263136,6301,2772,29 +"project987.nex",108,114,4,30,4,315884,23.7840473085057,7513,2570,29 +"project987.nex",108,114,4,30,5,276910,24.275035209996,6722,2830,29 +"project2191.nex",105,215,5,46,1,190882,33.4761790006391,6390,3983,55 +"project2191.nex",105,215,5,46,2,290576,32.098315070756,9327,4224,54 
+"project2191.nex",105,215,5,46,3,228263,33.5534011206372,7659,4121,54 +"project2191.nex",105,215,5,46,4,249097,31.7587124694396,7911,4068,54 +"project2191.nex",105,215,5,46,5,262366,32.035400928474,8405,4013,53 +"project3422.nex",110,277,6,42,1,288184,34.3010021375232,9885,4499,56 +"project3422.nex",110,277,6,42,2,329304,33.528289969147,11041,5005,55 +"project3422.nex",110,277,6,42,3,265616,34.534817179688,9173,5471,55 +"project3422.nex",110,277,6,42,4,368991,33.2447132856899,12267,4762,55 +"project3422.nex",110,277,6,42,5,290950,34.3289224952741,9988,4668,56 +"project4264 (1).nex",112,441,7,50,1,372418,39.0179851672046,14531,6786,68 +"project4264 (1).nex",112,441,7,50,2,344072,39.2475993396731,13504,6548,66 +"project4264 (1).nex",112,441,7,50,3,278578,40.8216011314605,11372,6177,66 +"project4264 (1).nex",112,441,7,50,4,402180,38.3808245064399,15436,6373,66 +"project4264 (1).nex",112,441,7,50,5,307630,39.492247180054,12149,6334,64 +"project1157.nex",110,138,8,26,1,288184,36.1158148960386,10408,2032,29 +"project1157.nex",110,138,8,26,2,329304,35.9272890702816,11831,2073,28 +"project1157.nex",110,138,8,26,3,265616,36.4134690681284,9672,2014,28 +"project1157.nex",110,138,8,26,4,368991,35.7271586569862,13183,2126,28 +"project1157.nex",110,138,8,26,5,290950,36.1780374634817,10526,1993,28 +"project691.nex",103,443,9,64,1,164942,57.1716118393132,9430,5012,77 +"project691.nex",103,443,9,64,2,271032,51.2411818530653,13888,5584,78 +"project691.nex",103,443,9,64,3,228027,53.3621018563591,12168,5739,78 +"project691.nex",103,443,9,64,4,223032,53.2210624484379,11870,5366,78 +"project691.nex",103,443,9,64,5,245470,52.7559375891148,12950,5665,77 +"project625.nex",106,236,10,86,1,205087,61.076518745703,12526,6110,87 +"project625.nex",106,236,10,86,2,298504,56.8836598504543,16980,6861,87 +"project625.nex",106,236,10,86,3,232494,61.0639414350478,14197,7257,87 +"project625.nex",106,236,10,86,4,269753,60.3255570837025,16273,6858,88 
+"project625.nex",106,236,10,86,5,267310,60.117466611799,16070,5887,88 +"project2292.nex",114,493,11,76,1,385477,59.7960449002145,23050,8509,100 +"project2292.nex",114,493,11,76,2,365468,59.7945647772172,21853,7772,104 +"project2292.nex",114,493,11,76,3,302948,61.7366676789416,18703,7130,100 +"project2292.nex",114,493,11,76,4,381935,61.9398588765104,23657,8153,100 +"project2292.nex",114,493,11,76,5,319570,62.7843664924743,20064,7902,103 diff --git a/dev/benchmarks/nblocks_cost_findings.md b/dev/benchmarks/nblocks_cost_findings.md new file mode 100644 index 000000000..ac8345e31 --- /dev/null +++ b/dev/benchmarks/nblocks_cost_findings.md @@ -0,0 +1,80 @@ +# Per-Candidate Cost vs Number of Character Blocks + +**Task:** T-075 +**Date:** 2026-03-18 +**Agent:** A + +## Setup + +- 9 neotrans matrices selected from the 100–130 tip range +- All have inapplicable characters (NA-aware scoring) +- 5 random tree seeds per matrix +- Measured via `ts_bench_tbr_phases()` (one full TBR clip–evaluate–unclip pass) + +## Key finding + +Per-candidate indirect scoring cost is **linear** in both `n_blocks` and +`total_words`, with no significant nonlinearity (quadratic term p = 0.41). + +### Model: `ns_per_cand ~ n_blocks + total_words` + +| Term | Coefficient | SE | Interpretation | +|------|------------|-----|----------------| +| intercept | 2.4 ns | 0.7 | Base overhead per candidate | +| n_blocks | 3.3 ns | 0.2 | Per-block overhead (loop, function call) | +| total_words | 0.29 ns | 0.02 | Per-word cost (bit-parallel ops) | + +R² = 0.990 (45 observations from 9 datasets × 5 seeds) + +### Predicted cost at range extremes + +| n_blocks | total_words | Predicted ns/candidate | Observed mean | +|----------|-------------|----------------------|---------------| +| 3 | 16 | 17.1 | 17.8 | +| 11 | 76 | 61.2 | 61.2 | + +Ratio: 3.6× cost increase from simplest to most complex dataset. 
+ +### Standalone models + +- `n_blocks` alone: R² = 0.931, slope ≈ 5.4 ns/block +- `total_words` alone: R² = 0.885, slope ≈ 0.62 ns/word + +## Practical implications + +1. **No threshold effect**: Cost scales linearly — there's no critical + n_blocks value after which performance degrades sharply. + +2. **Block overhead dominates**: At typical total_words (30–80), the per-block + overhead (3.3 ns × n_blocks) contributes more than per-word cost + (0.29 ns × total_words) for datasets with many state-count groups. + +3. **Optimisation opportunity**: Merging blocks with adjacent state counts + (e.g., 5-state and 6-state characters into a single padded block) could + reduce n_blocks by 2–4, saving ~7–13 ns/candidate. At ~300k candidates + per TBR pass (the scale measured here), this would save ~2–4 ms per pass, + or ~100–200 ms across 50 TBR passes. Meaningful for large datasets but not + critical — this is a low-priority micro-optimisation. + +4. **For strategy selection**: n_blocks can be computed cheaply at dataset + load time. Datasets with n_blocks ≥ 10 will have ~3× higher per-candidate + cost than datasets with n_blocks ≤ 4, which affects expected search + duration. This could inform time estimates in the Shiny app. 
+ +## Data + +Raw results: `nblocks_cost_bench.csv` (45 rows: 9 datasets × 5 seeds) + +### Datasets used + +| File | n_tips | n_char | n_blocks | total_words | Mean ns/cand | +|------|--------|--------|----------|-------------|-------------| +| project2144.nex | 109 | 123 | 3 | 16 | 17.8 | +| project987.nex | 108 | 114 | 4 | 30 | 24.2 | +| project2191.nex | 105 | 215 | 5 | 46 | 32.6 | +| project3422.nex | 110 | 277 | 6 | 42 | 34.0 | +| project4264 (1).nex | 112 | 441 | 7 | 50 | 39.4 | +| project1157.nex | 110 | 138 | 8 | 26 | 36.1 | +| project691.nex | 103 | 443 | 9 | 64 | 53.6 | +| project625.nex | 106 | 236 | 10 | 86 | 59.9 | +| project2292.nex | 114 | 493 | 11 | 76 | 61.2 | diff --git a/dev/benchmarks/neotrans_baselines.csv b/dev/benchmarks/neotrans_baselines.csv new file mode 100644 index 000000000..85bff8a87 --- /dev/null +++ b/dev/benchmarks/neotrans_baselines.csv @@ -0,0 +1,11 @@ +"file","ntax","nchar","nlevels","inapplicable","reps","score","time_s","status" +"project265.nex",30,203,7,TRUE,5,690,0.75,"OK" +"project463.nex",60,227,5,TRUE,5,1193,4.15,"OK" +"project692.nex",71,408,7,TRUE,5,2469,9.59,"OK" +"project3199.nex",88,138,4,TRUE,5,424,3.52,"OK" +"syab07206.nex",117,535,7,TRUE,3,2788,33.07,"OK" +"syab07201.nex",125,2813,10,FALSE,3,15528,89.36,"OK" +"project3200.nex",138,111,6,TRUE,3,818,19.13,"OK" +"project175.nex",165,71,6,FALSE,2,426,2.88999999999999,"OK" +"project3763.nex",205,103,7,TRUE,2,1503,38.34,"OK" +"syab07204.nex",225,748,3,TRUE,2,11960,156.89,"OK" diff --git a/dev/benchmarks/neotrans_strategy_comparison.csv b/dev/benchmarks/neotrans_strategy_comparison.csv new file mode 100644 index 000000000..45eed92b1 --- /dev/null +++ b/dev/benchmarks/neotrans_strategy_comparison.csv @@ -0,0 +1,16 @@ +"file","ntax","nchar","ratio","default_med","default_min","default_time","thorough_med","thorough_min","thorough_time","improvement","slowdown" +"project4626.nex",63,16,0.253968253968254,35,35,1.35,35,34,4.45,0,3.2962962962963 
+"project3437.nex",64,89,1.390625,278,277,1.58,276,275,7.04000000000001,2,4.45569620253165 +"project3617.nex",65,361,5.55384615384615,2899,2885,9.63,2885,2856,24.17,14,2.50986500519211 +"project4420.nex",68,60,0.882352941176471,188,188,1.75,189,187,4.90000000000003,-1,2.80000000000002 +"project3970.nex",68,339,4.98529411764706,1345,1324,7.5,1312,1304,22.83,33,3.044 +"project4147.nex",71,150,2.11267605633803,532,525,6.5,530,527,18.6600000000001,2,2.87076923076924 +"project3896.nex",72,201,2.79166666666667,868,867,8.29999999999995,869,867,29.55,-1,3.56024096385544 +"project4553.nex",72,244,3.38888888888889,1035,1025,3.38999999999999,1017,1002,8.24000000000001,18,2.43067846607671 +"project4306.nex",73,233,3.19178082191781,655,649,12.9200000000001,645,640,35.98,10,2.78482972136221 +"project689 (1).nex",76,173,2.27631578947368,505,504,12.3099999999999,501,496,31.4400000000001,4,2.55402112103982 +"project563.nex",82,49,0.597560975609756,156,154,2.22000000000003,156,154,7.26999999999998,0,3.27477477477473 +"project549.nex",84,384,4.57142857142857,910,906,20.8999999999999,903,901,61.5599999999999,7,2.94545454545456 +"project1210.nex",86,17,0.197674418604651,45,45,0.440000000000055,45,45,1.48000000000002,0,3.36363636363599 +"project3558.nex",86,59,0.686046511627907,198,195,1.68999999999983,196,194,5.45000000000005,2,3.22485207100627 +"project3637.nex",86,528,6.13953488372093,2560,2544,24.1200000000001,2486,2460,90.2999999999997,74,3.74378109452733 diff --git a/dev/benchmarks/pgo_recipe.md b/dev/benchmarks/pgo_recipe.md new file mode 100644 index 000000000..c055d8d65 --- /dev/null +++ b/dev/benchmarks/pgo_recipe.md @@ -0,0 +1,118 @@ +# PGO (Profile-Guided Optimization) Build Recipe + +## Overview + +PGO lets GCC optimize branch prediction, function layout, and inlining +decisions based on actual runtime behavior. Requires two compilation +passes: one instrumented build to gather profile data, then a second +build that uses that data for optimization. 
+ +## Results (2026-03-16, GCC 13 / rtools45, Windows x86_64) + +| Benchmark | Baseline (s) | PGO (s) | Speedup | +|-----------|-------------|---------|---------| +| Vinther EW (23 tips) | 0.240 | 0.240 | 0% | +| Vinther IW (23 tips) | 0.170 | 0.190 | -12% | +| Zhu EW (75 tips) | 4.010 | 3.790 | 5% | +| Zhu IW (75 tips) | 5.340 | 4.990 | 7% | +| Agnarsson EW (62 tips) | 2.200 | 2.080 | 5% | + +PGO provides a modest ~5-7% speedup on medium-sized datasets where the +C++ hot path dominates. On small datasets, R overhead and startup time +swamp any C++ improvement. Scores are identical (correctness verified: +53/53 driven search tests pass). + +## Build Steps + +All steps run from the package root directory. + +### Step 1: Baseline build (no PGO) + +Ensure no `src/Makevars.win` exists: + +```bash +rm -f src/Makevars.win src/*.o src/*.dll +R CMD INSTALL --library=.agent-pgo . +``` + +### Step 2: Instrumented build + +Create `src/Makevars.win`: + +```makefile +PROFILE_DIR = C:/Users/pjjg18/GitHub/TreeSearch/.pgo-data +PKG_CXXFLAGS = -fprofile-generate=$(PROFILE_DIR) +PKG_CFLAGS = -fprofile-generate=$(PROFILE_DIR) +PKG_LIBS = -fprofile-generate +``` + +Build and install: + +```bash +rm -rf .pgo-data && mkdir .pgo-data +rm -f src/*.o src/*.dll +R CMD INSTALL --library=.agent-pgo-gen . 
+``` + +### Step 3: Training workload + +Load the instrumented build and exercise all major code paths: + +```r +library(TreeSearch, lib.loc = ".agent-pgo-gen") +data(inapplicable.phyData, package = "TreeSearch") + +# EW + IW on small and medium datasets +MaximizeParsimony(inapplicable.phyData[["Vinther2008"]], + maxReplicates = 5L, targetHits = 3L, verbosity = 0L) +MaximizeParsimony(inapplicable.phyData[["Vinther2008"]], concavity = 10, + maxReplicates = 5L, targetHits = 3L, verbosity = 0L) +MaximizeParsimony(inapplicable.phyData[["Zhu2013"]], + maxReplicates = 3L, targetHits = 2L, verbosity = 0L) +MaximizeParsimony(inapplicable.phyData[["Zhu2013"]], concavity = 10, + maxReplicates = 3L, targetHits = 2L, verbosity = 0L) +MaximizeParsimony(inapplicable.phyData[["Agnarsson2004"]], + maxReplicates = 3L, targetHits = 2L, verbosity = 0L) +``` + +The `.gcda` files appear under `.pgo-data/C~/Users/.../src/`. + +### Step 4: PGO-use build + +Replace `src/Makevars.win`: + +```makefile +PROFILE_DIR = C:/Users/pjjg18/GitHub/TreeSearch/.pgo-data +PKG_CXXFLAGS = -fprofile-use=$(PROFILE_DIR) -fprofile-correction +PKG_CFLAGS = -fprofile-use=$(PROFILE_DIR) -fprofile-correction +PKG_LIBS = -fprofile-use +``` + +Build (**note: takes 3-5 minutes**, much longer than normal): + +```bash +rm -f src/*.o src/*.dll +R CMD INSTALL --library=.agent-pgo-use . +``` + +### Step 5: Clean up + +**Always remove `src/Makevars.win` after PGO builds** — leaving PGO +flags in place will cause segfaults (instrumented build) or broken +builds (PGO-use without matching `.gcda` files): + +```bash +rm -f src/Makevars.win src/*.o src/*.dll +``` + +## Notes + +- `-fprofile-correction` tells GCC to heuristically smooth out inconsistent + profile counters (e.g. missed updates in multi-threaded runs) rather than + erroring; changed source files instead trigger `-Wcoverage-mismatch`. +- The `.pgo-data/` directory contains machine-specific binary data. + Do not commit to version control. +- PGO-use compilation is 2-5× slower than normal. 
Allow 5 minutes for + a full rebuild (30+ source files). +- GCC on Windows (rtools45) nests `.gcda` files under a path encoding + like `C~/Users/...`. This is expected behavior. diff --git a/dev/benchmarks/results_analysis.md b/dev/benchmarks/results_analysis.md new file mode 100644 index 000000000..6f7ab28be --- /dev/null +++ b/dev/benchmarks/results_analysis.md @@ -0,0 +1,52 @@ +# Benchmark Results Analysis (Agent A, T-005) + +## Dataset + +8 datasets × 6 strategies × 3 reps = 144 planned runs. +Only 55/144 succeeded (38%); the rest crashed with the T-025 optimization-dependent UB segfault. +Aria2015 (35 tips) and Dikow2009 (88 tips) had the highest crash rates. + +## Key Findings + +### 1. All strategies find optimal on small datasets (≤43 tips) +- Longrich (20 tips), Vinther (23 tips), Griswold (43 tips): 100% optimal +- Strategy choice doesn't matter much for small datasets + +### 2. Thorough and ratchet_heavy win on large datasets +- Zhu2013 (75 tips): `thorough` found best-known (649), `sprint` failed (652) +- Giles2015 (78 tips): `ratchet_heavy` found best (714), others 716-720 +- Dikow2009 (88 tips): `ratchet_heavy` and `drift_heavy` both found 1612 (vs best-known 1614) + +### 3. Sprint is fastest but loses quality at scale +- Sprint uses 3 ratchet cycles, no drift, minimal sectorial +- At ≤43 tips: optimal quality, 2-10× faster wall time +- At 75+ tips: fails to find optimal within 20s timeout + +### 4. Phase time distribution depends strongly on strategy +| Strategy | TBR | Ratchet | Drift | Sectorial | Fuse | +|----------|-----|---------|-------|-----------|------| +| sprint | 43% | 42% | 0% | 9% | 1% | +| default | 11% | 37% | 39% | 11% | 0% | +| ratchet_heavy | 6% | 87% | 5% | 1% | 0% | +| sectorial_heavy | 13% | 20% | 21% | 38% | 7% | +| drift_heavy | 7% | 12% | 74% | 4% | 3% | + +### 5. 
Replicates-to-convergence varies by strategy +- Sprint: 16-43 reps (many cheap reps) +- Thorough: 6-10 reps (few expensive reps) +- At 20s timeout, sprint completes 35-100 reps; thorough completes 6-10 + +## Recommendations for Adaptive Strategy + +1. **Size-based switching**: Use sprint for ≤30 tips, default for 31-60, + thorough or ratchet_heavy for >60. +2. **Phase timing feedback**: If ratchet/drift phases dominate but scores + aren't improving, switch to more replicates with lighter per-replicate effort. +3. **Time budget**: With short timeouts, sprint covers more replicates. + With longer timeouts, thorough explores deeper per replicate. + +## Limitations + +- Only 38% of runs succeeded; the rest crashed due to the T-025 bug +- 20s timeout limits large-dataset exploration +- No IW or profile parsimony benchmarks (EW only) diff --git a/dev/benchmarks/results_drift_mpt_120s.csv b/dev/benchmarks/results_drift_mpt_120s.csv new file mode 100644 index 000000000..45b2d6b19 --- /dev/null +++ b/dev/benchmarks/results_drift_mpt_120s.csv @@ -0,0 +1,19 @@ +"dataset","n_tips","budget_s","drift_cycles","seed","best_score","n_trees","n_topologies","replicates","wall_s","drift_ms","total_ms","drift_pct","mean_rf","median_rf" +"Wortley2006",37,120,0,1,483,3,3,19,5.27,0,5258.8291,0,24,34 +"Wortley2006",37,120,0,2,482,1,1,22,7.56,0,7402.2598,0,NA,NA +"Wortley2006",37,120,0,3,484,74,74,15,4.84,0,3893.9118,0,21.7778600518327,24 +"Wortley2006",37,120,2,1,484,6,6,6,1.72,200.3704,1611.3042,12.4,4.4,4 +"Wortley2006",37,120,2,2,483,5,5,11,3.73,417.7614,3086.4603,13.5,9.2,14 +"Wortley2006",37,120,2,3,483,6,6,10,2.69,359.331,2666.7322,13.5,22.9333333333333,22 +"Zhu2013",75,120,0,1,638,100,100,20,24.86,0,24391.2249,0,9.42545454545455,10 +"Zhu2013",75,120,0,2,638,100,100,14,20.3,0,19788.4594,0,8.82141414141414,8 +"Zhu2013",75,120,0,3,639,100,100,8,8.38,0,8068.8115,0,8.38747474747475,8 +"Zhu2013",75,120,2,1,638,80,80,26,40.85,6750.1543,37658.7576,17.9,13.8329113924051,12 
+"Zhu2013",75,120,2,2,638,100,100,26,32.33,5607.5592,32091.1807,17.5,9.45212121212121,8 +"Zhu2013",75,120,2,3,638,44,44,6,6.88,671.2499,4907.1112,13.7,16.8964059196617,6 +"Geisler2001",68,120,0,1,1298,100,100,9,6.54,0,6259.6277,0,9.44323232323232,8 +"Geisler2001",68,120,0,2,1295,100,100,19,17.65,0,17364.6849,0,8.51434343434343,8 +"Geisler2001",68,120,0,3,1296,100,100,16,14.7,0,14335.6088,0,9.81737373737374,8 +"Geisler2001",68,120,2,1,1296,100,100,8,7.83,1242.2633,7675.4627,16.2,8.44525252525253,8 +"Geisler2001",68,120,2,2,1295,100,100,22,29,5418.9427,28501.9577,19,8.02020202020202,8 +"Geisler2001",68,120,2,3,1297,100,100,10,9.32,1448.9778,9118.779,15.9,8.85777777777778,8 diff --git a/dev/benchmarks/results_drift_mpt_30s.csv b/dev/benchmarks/results_drift_mpt_30s.csv new file mode 100644 index 000000000..5dd8bcbbc --- /dev/null +++ b/dev/benchmarks/results_drift_mpt_30s.csv @@ -0,0 +1,19 @@ +"dataset","n_tips","budget_s","drift_cycles","seed","best_score","n_trees","n_topologies","replicates","wall_s","drift_ms","total_ms","drift_pct","mean_rf","median_rf" +"Wortley2006",37,30,0,1,483,3,3,12,2.33,0,2312.2469,0,25.3333333333333,30 +"Wortley2006",37,30,0,2,484,46,46,14,3.59,0,3293.46,0,21.5478260869565,24 +"Wortley2006",37,30,0,3,482,2,2,52,14.47,0,14469.3116,0,10,10 +"Wortley2006",37,30,2,1,484,5,5,19,5.24,763.1592,5231.0175,14.6,16.6,24 +"Wortley2006",37,30,2,2,482,3,3,47,15.44,2364.2745,15437.9649,15.3,8,10 +"Wortley2006",37,30,2,3,485,72,72,7,2.3,138.8317,1218.1715,11.4,12.5915492957746,8 +"Zhu2013",75,30,0,1,638,100,100,25,27.41,0,27001.8771,0,10.3668686868687,10 +"Zhu2013",75,30,0,2,639,100,100,17,27,0,26340.9309,0,11.9260606060606,10 +"Zhu2013",75,30,0,3,638,100,100,10,16.76,0,15910.4713,0,8.12848484848485,8 +"Zhu2013",75,30,2,1,639,24,24,11,28.41,4420.0451,27002.2213,16.4,6.55072463768116,6 +"Zhu2013",75,30,2,2,639,100,100,15,24.68,3575.011,24043.2459,14.9,10.3781818181818,6 +"Zhu2013",75,30,2,3,639,100,100,13,16.3,2397.1034,15842.008,15.1,10.2448484848485,8 
+"Geisler2001",68,30,0,1,1295,100,100,26,26.64,0,26346.3173,0,7.65252525252525,8 +"Geisler2001",68,30,0,2,1295,100,100,13,10.97,0,10790.435,0,7.30060606060606,8 +"Geisler2001",68,30,0,3,1297,100,100,17,14.51,0,14270.2228,0,8.12929292929293,8 +"Geisler2001",68,30,2,1,1295,100,100,22,27.3,5069.8475,27009.3889,18.8,6.39555555555556,6 +"Geisler2001",68,30,2,2,1295,100,100,19,27.5,4803.201,27008.6251,17.8,7.92040404040404,8 +"Geisler2001",68,30,2,3,1295,100,100,16,27.29,5096.0644,26999.3252,18.9,6.85131313131313,6 diff --git a/dev/benchmarks/results_drift_mpt_30s_nostop.csv b/dev/benchmarks/results_drift_mpt_30s_nostop.csv new file mode 100644 index 000000000..d4c5886d0 --- /dev/null +++ b/dev/benchmarks/results_drift_mpt_30s_nostop.csv @@ -0,0 +1,19 @@ +"dataset","n_tips","budget_s","drift_cycles","seed","best_score","n_trees","n_topologies","replicates","wall_s","drift_ms","total_ms","drift_pct","mean_rf","median_rf" +"Wortley2006",37,30,0,1,482,4,4,74,27.03,0,26999.6914,0,18.3333333333333,19 +"Wortley2006",37,30,0,2,482,4,4,75,25.41,0,25392.1515,0,17.3333333333333,24 +"Wortley2006",37,30,0,3,482,4,4,79,27.01,0,26996.0762,0,17.3333333333333,24 +"Wortley2006",37,30,2,1,482,2,2,58,27.02,4054.7815,26998.224,15,2,2 +"Wortley2006",37,30,2,2,482,1,1,63,27,4105.1979,26999.9565,15.2,NA,NA +"Wortley2006",37,30,2,3,482,3,3,62,27,4293.3852,26999.6641,15.9,18,26 +"Zhu2013",75,30,0,1,638,100,100,26,27.2,0,26997.0477,0,26.9260606060606,37 +"Zhu2013",75,30,0,2,639,100,100,29,27.32,0,27003.2961,0,11.6056565656566,8 +"Zhu2013",75,30,0,3,638,47,47,21,30,0,26991.8496,0,5.80203515263645,6 +"Zhu2013",75,30,2,1,639,100,100,19,27.38,4862.418,27004.7242,18,10.1882828282828,8 +"Zhu2013",75,30,2,2,638,100,100,21,27.6,4386.1555,27003.6628,16.2,16.8638383838384,8 +"Zhu2013",75,30,2,3,638,100,100,19,27.36,4630.2351,27003.3931,17.1,8.0210101010101,8 +"Geisler2001",68,30,0,1,1295,100,100,27,27.5,0,27008.9305,0,8.39919191919192,8 
+"Geisler2001",68,30,0,2,1295,100,100,26,27.52,0,27004.1687,0,7.28525252525252,8 +"Geisler2001",68,30,0,3,1295,100,100,28,27.4,0,27000.9511,0,6.9179797979798,6 +"Geisler2001",68,30,2,1,1295,100,100,23,27.24,5153.3365,27003.0785,19.1,7.29131313131313,8 +"Geisler2001",68,30,2,2,1295,100,100,25,27.44,4788.7047,27000.6298,17.7,7.45292929292929,8 +"Geisler2001",68,30,2,3,1295,100,100,25,27.28,4788.3551,27001.2777,17.7,7.41454545454545,8 diff --git a/dev/benchmarks/results_grid.csv b/dev/benchmarks/results_grid.csv new file mode 100644 index 000000000..5feb5baf9 --- /dev/null +++ b/dev/benchmarks/results_grid.csv @@ -0,0 +1,56 @@ +"dataset","strategy","seed","n_taxa","best_score","replicates","hits_to_best","pool_size","timed_out","wall_s","wagner_ms","tbr_ms","xss_ms","rss_ms","css_ms","ratchet_ms","drift_ms","final_tbr_ms","fuse_ms" +"Longrich2010","sprint",7156,20,131,16,10,10,FALSE,0.35,5.5612,168.9216,33.5792,0,0,136.0137,0,17.6695,4.3639 +"Longrich2010","default",7177,20,131,12,10,9,FALSE,0.79,3.5726,97.9913,73.731,19.9502,35.3677,281.7302,248.3973,11.462,4.9795 +"Longrich2010","thorough",7191,20,131,10,10,10,FALSE,1.43,10.8893,83.8347,107.9262,53.7111,69.9601,595.777,425.8947,14.7263,60.7966 +"Longrich2010","ratchet_heavy",7212,20,131,10,10,10,FALSE,0.87,2.9456,80.6327,16.5024,0,0,705.3218,54.506,10.1509,3.8626 +"Longrich2010","sectorial_heavy",7226,20,131,14,10,9,FALSE,0.92,4.3215,117.3891,207.5194,74.8896,95.5932,178.7793,162.2722,14.6907,68.3265 +"Longrich2010","sectorial_heavy",7240,20,131,12,10,9,FALSE,0.84,3.7353,112.879,216.303,65.1713,81.3997,134.3745,157.5637,12.6482,61.7056 +"Longrich2010","drift_heavy",7261,20,131,10,10,10,FALSE,1.22,3.6682,125.7203,53.9082,17.2822,0,162.2324,788.4821,14.5726,42.4944 +"Vinther2008","sprint",7282,23,79,43,10,9,FALSE,0.96,13.4144,305.9908,101.5532,0,0,469.8212,0,56.8494,12.0851 +"Vinther2008","default",7303,23,79,15,10,8,FALSE,1.29,4.4285,147.0399,120.6356,26.9947,49.2964,511.2222,386.3871,22.1471,9.2218 
+"Vinther2008","ratchet_heavy",7338,23,79,13,10,9,FALSE,1.75,4.183,79.8243,29.7423,0,0,1518.565,93.1282,18.5196,7.6951 +"Vinther2008","ratchet_heavy",7345,23,79,12,10,6,FALSE,1.58,4.3809,88.3588,25.7994,0,0,1318.619,116.2221,23.6332,5.2536 +"Vinther2008","drift_heavy",7373,23,79,15,10,8,FALSE,1.89,4.4506,155.6572,68.4049,31.5383,0,287.7199,1229.335,23.0958,96.2598 +"Griswold1999","sprint",7527,43,407,100,1,1,FALSE,8.95,86.3985,3142.07,898.5605,0,0,4334.641,0,472.8122,17.423 +"Griswold1999","sprint",7534,43,407,100,1,1,FALSE,8.33,72.3996,3019.879,829.1007,0,0,3972.065,0,419.8558,5.3561 +"Griswold1999","default",7541,43,407,60,5,4,TRUE,20.01,51.3871,2028.548,1608.473,517.8287,646.6741,7913.14,6882.725,304.1913,54.717 +"Griswold1999","default",7555,43,407,52,4,3,TRUE,20.14,48.1219,1989.576,1628.201,551.7708,647.1327,8067.253,6908.469,285.1488,23.3511 +"Griswold1999","thorough",7569,43,407,26,7,4,TRUE,20.28,69.5974,883.0762,1322.144,665.1719,620.8039,9653.046,6315.852,128.0589,608.7557 +"Griswold1999","sectorial_heavy",7604,43,409,33,10,10,FALSE,10.99,25.9571,1050.931,2538.971,959.7211,1151.893,2304.769,2019.774,160.1463,778.1662 +"Griswold1999","sectorial_heavy",7611,43,407,59,4,2,TRUE,20.03,50.699,1937.855,4619.779,1815.873,2144.137,4311.689,3853.577,296.9927,1001.677 +"Griswold1999","sectorial_heavy",7618,43,408,54,5,5,TRUE,20,53.296,2058.464,4475.212,1792.645,2162.046,4179.014,3912.437,295.7644,1082.892 +"Griswold1999","drift_heavy",7632,43,407,29,1,1,TRUE,20.31,29.243,1345.69,789.8366,342.0521,0,2829.767,14532.42,180.9324,260.5181 +"Griswold1999","drift_heavy",7639,43,407,35,2,2,TRUE,20.4,28.5549,1216.641,692.5993,348.2278,0,2660.212,14800.87,181.3329,468.1162 +"Agnarsson2004","sprint",7646,62,778,25,12,4,FALSE,7.64,38.8319,3306.102,664.5492,0,0,3240.233,0,355.2857,36.4514 +"Agnarsson2004","sprint",7653,62,778,16,12,4,FALSE,4.84,24.0395,1993.6,434.3031,0,0,2112.969,0,248.6172,23.9967 
+"Agnarsson2004","sprint",7660,62,778,28,12,4,FALSE,9.03,46.9422,4055.725,707.0973,0,0,3766.75,0,412.585,43.3126 +"Agnarsson2004","default",7681,62,778,13,12,4,FALSE,14.88,20.086,1636.263,1010.528,253.8344,416.1035,5540.763,5752.729,200.2798,46.9966 +"Agnarsson2004","thorough",7695,62,778,6,7,3,TRUE,20.52,38.019,978.6282,1304.621,495.5814,712.8048,9477.131,6990.946,106.7311,409.6798 +"Agnarsson2004","ratchet_heavy",7709,62,778,7,8,3,TRUE,20.57,17.9828,1367.128,284.0427,0,0,17716.98,1049.291,110.6061,16.2788 +"Agnarsson2004","ratchet_heavy",7716,62,778,8,9,4,TRUE,20.4,15.0823,1281.816,243.2763,0,0,17582.67,1088.114,167.5827,17.3994 +"Agnarsson2004","ratchet_heavy",7723,62,778,8,9,3,TRUE,20.52,13.2234,1090.884,264.887,0,0,18161.67,851.4318,118.6747,12.781 +"Agnarsson2004","sectorial_heavy",7730,62,778,14,12,4,FALSE,16.49,20.8428,2065.099,3273.523,1150.482,1610.231,3425.814,3582.109,234.0559,1118.618 +"Agnarsson2004","sectorial_heavy",7744,62,778,13,12,4,FALSE,15.71,21.851,2042.292,3228.474,1304.259,1763.248,2981.633,3185.584,208.5059,979.7699 +"Agnarsson2004","drift_heavy",7758,62,778,9,10,3,TRUE,20.03,14.879,1375.563,591.4157,212.5581,0,2459.886,14734.41,135.1524,506.9411 +"Zhu2013","sprint",7772,75,651,39,1,1,TRUE,20.02,72.9111,13737.71,1183.313,0,0,4580.738,0,440.0807,0 +"Zhu2013","sprint",7779,75,653,40,1,1,TRUE,20.06,75.0919,13781.53,1404.004,0,0,4354.869,0,446.4723,0 +"Zhu2013","default",7800,75,648,26,0,1,TRUE,20.28,68.5746,9550.869,1557.155,383.2678,647.4293,3937.54,3835.627,272.6035,38.4683 +"Zhu2013","default",7807,75,652,27,1,1,TRUE,20,43.4461,9116.992,1808.787,423.0211,706.6058,3813.093,3831.995,263.5211,0 +"Zhu2013","thorough",7821,75,644,10,1,1,TRUE,20.29,60.7229,3234.679,1154.143,452.1932,679.1188,6656.512,7966.357,97.8088,0 +"Zhu2013","ratchet_heavy",7835,75,647,10,2,2,TRUE,20.1,21.6947,4091.166,338.5121,0,0,13942.89,1542.329,162.059,0 
+"Zhu2013","sectorial_heavy",7863,75,646,18,1,1,TRUE,20.34,30.4038,6684.249,3763.609,1155.983,1669.957,2394.022,4458.786,188.8624,0 +"Zhu2013","sectorial_heavy",7870,75,654,14,1,1,TRUE,20.13,28.2669,6638.929,3353.36,1181.068,1522.333,2687.835,4516.656,191.1578,0 +"Giles2015","sprint",7898,78,716,31,0,1,TRUE,20.02,67.1701,13977.79,905.5851,0,0,4606.327,0,431.2986,22.5306 +"Giles2015","sprint",7912,78,720,35,1,1,TRUE,20.08,66.9398,14428.12,932.5643,0,0,4150.108,0,463.9039,26.9601 +"Giles2015","default",7919,78,717,20,0,1,TRUE,20.1,40.6125,9100.392,1475.09,348.6732,593.0079,4175.914,4029.259,263.322,80.5659 +"Giles2015","default",7926,78,724,21,1,1,TRUE,20.03,38.5791,8895.455,1497.378,372.198,614.7384,4047.391,4313.701,255.574,0 +"Giles2015","default",7933,78,719,22,1,1,TRUE,20.19,39.7737,9231.238,1406.215,337.7801,601.8482,4118.125,4177.309,277.7193,0 +"Giles2015","thorough",7940,78,718,8,1,1,TRUE,20.47,41.1324,3145.117,865.2061,495.5709,516.7613,6827.516,8496.909,76.0479,0 +"Giles2015","ratchet_heavy",7975,78,714,11,2,2,TRUE,21.1,17.4333,3992.947,333.8161,0,0,14876.64,1729.617,113.2207,28.6116 +"Giles2015","sectorial_heavy",7982,78,719,17,1,1,TRUE,20.09,26.083,6699.72,3134.947,1041.038,1372.883,2912.329,4542.178,170.6497,199.7884 +"Giles2015","sectorial_heavy",7989,78,716,14,1,1,TRUE,20.19,30.4455,6425.667,3180.292,1006.807,1690.058,3088.124,4532.7,221.133,0 +"Giles2015","sectorial_heavy",7996,78,720,18,1,1,TRUE,20.34,41.4848,6531.305,2608.641,956.5212,1391.038,3147.44,5209.824,196.5575,274.792 +"Giles2015","drift_heavy",8017,78,716,7,1,1,TRUE,21.13,13.0351,3371.816,429.5716,149.9113,0,1706.396,15358.03,85.6689,0 +"Dikow2009","ratchet_heavy",8101,88,1612,3,1,1,TRUE,20.29,8.1673,1104.885,161.8035,0,0,17875.2,1046.379,78.6071,0 +"Dikow2009","sectorial_heavy",8108,88,1614,9,2,2,TRUE,20.16,26.449,3291.174,4484.637,1531.966,2196.717,3844.513,4295.55,233.0998,251.5741 
+"Dikow2009","drift_heavy",8136,88,1612,4,1,1,TRUE,21.61,13.7694,1500.282,559.3438,242.1143,0,2671.889,16521.6,106.8854,0 diff --git a/dev/benchmarks/results_large_preset.csv b/dev/benchmarks/results_large_preset.csv new file mode 100644 index 000000000..7e8e1fbca --- /dev/null +++ b/dev/benchmarks/results_large_preset.csv @@ -0,0 +1,20 @@ +"condition","seed","best_score","replicates","budget_s","notes" +"large_v2",2847,1271,1,60,"T-179: lean_c design (ratch12,drift4,nniP0,outer1)" +"large_v2",7193,1255,1,60,"T-179: lean_c design" +"large_v2",4561,1237,1,60,"T-179: lean_c design" +"large_v2",1031,1219,1,60,"T-179: lean_c design (round 3 partial)" +"thorough",2847,1263,0,60,"source_large baseline (ratch20,drift12,nniP5,outer2,as=T)" +"thorough",7193,1247,0,60,"source_large baseline" +"thorough",4561,1257,0,60,"source_large baseline" +"large_v2",2847,1250,2,120,"lean_c at 120s budget" +"large_v2",7193,1243,2,120,"lean_c at 120s budget" +"large_v2",4561,1253,2,120,"lean_c at 120s budget" +"thorough",2847,1250,1,120,"thorough at 120s budget" +"thorough",7193,1233,0,120,"thorough at 120s budget" +"thorough",4561,1252,1,120,"thorough at 120s budget" +"large_v2",2847,1276,0,30,"lean_c at 30s budget" +"large_v2",7193,1274,0,30,"lean_c at 30s budget" +"large_v2",4561,1292,0,30,"lean_c at 30s budget" +"thorough",2847,1283,0,30,"thorough at 30s budget" +"thorough",7193,1277,0,30,"thorough at 30s budget" +"thorough",4561,1316,0,30,"thorough at 30s budget" diff --git a/dev/benchmarks/results_outer_cycles.csv b/dev/benchmarks/results_outer_cycles.csv new file mode 100644 index 000000000..0a584db5e --- /dev/null +++ b/dev/benchmarks/results_outer_cycles.csv @@ -0,0 +1,85 @@ +"dataset","condition","seed","n_taxa","best_score","replicates","hits_to_best","wall_s" +"Longrich2010","thorough_1",1031,20,131,6,6,1.14 +"Longrich2010","thorough_1",2847,20,131,8,8,1.43 +"Longrich2010","thorough_1",7193,20,131,8,8,1.01 +"Longrich2010","thorough_2",1031,20,131,6,6,0.91 
+"Longrich2010","thorough_2",2847,20,131,7,7,1.25 +"Longrich2010","thorough_2",7193,20,131,8,8,1.15 +"Vinther2008","thorough_1",1031,23,79,7,5,1.72 +"Vinther2008","thorough_1",2847,23,79,8,7,2.11 +"Vinther2008","thorough_1",7193,23,79,5,5,1.46 +"Vinther2008","thorough_2",1031,23,79,5,5,1.48 +"Vinther2008","thorough_2",2847,23,79,6,4,1.7 +"Vinther2008","thorough_2",7193,23,79,5,5,1.44 +"Sansom2010","thorough_1",1031,23,189,10,7,2.05 +"Sansom2010","thorough_1",2847,23,189,10,9,1.84 +"Sansom2010","thorough_1",7193,23,189,7,4,1.33 +"Sansom2010","thorough_2",1031,23,189,11,7,2.3 +"Sansom2010","thorough_2",2847,23,189,12,9,2.62 +"Sansom2010","thorough_2",7193,23,189,14,9,3.11 +"DeAssis2011","thorough_1",1031,33,64,5,5,1.14 +"DeAssis2011","thorough_1",2847,33,64,8,8,1.47 +"DeAssis2011","thorough_1",7193,33,64,7,7,1.27 +"DeAssis2011","thorough_2",1031,33,64,6,6,1.17 +"DeAssis2011","thorough_2",2847,33,64,7,7,1.37 +"DeAssis2011","thorough_2",7193,33,64,5,5,1.04 +"Aria2015","thorough_1",1031,35,143,12,2,2.39 +"Aria2015","thorough_1",2847,35,143,9,2,2.5 +"Aria2015","thorough_1",7193,35,143,16,3,3.67 +"Aria2015","thorough_2",1031,35,143,8,4,2.73 +"Aria2015","thorough_2",2847,35,143,11,2,3.81 +"Aria2015","thorough_2",7193,35,143,19,4,3.8 +"Wortley2006","thorough_1",1031,37,490,46,2,19.19 +"Wortley2006","thorough_1",2847,37,490,17,2,7.7 +"Wortley2006","thorough_1",7193,37,487,43,1,20.02 +"Wortley2006","thorough_2",1031,37,490,6,2,3.48 +"Wortley2006","thorough_2",2847,37,488,12,1,6.86 +"Wortley2006","thorough_2",7193,37,487,37,1,20 +"Griswold1999","thorough_1",1031,43,407,23,5,18.72 +"Griswold1999","thorough_1",2847,43,407,9,3,6.44 +"Griswold1999","thorough_1",7193,43,407,21,4,13.83 +"Griswold1999","thorough_2",1031,43,407,10,2,7.62 +"Griswold1999","thorough_2",2847,43,407,11,2,8.30000000000001 +"Griswold1999","thorough_2",7193,43,407,14,3,10.22 +"Schulze2007","thorough_1",1031,52,164,10,2,3.85999999999999 +"Schulze2007","thorough_1",2847,52,164,15,4,5.31 
+"Schulze2007","thorough_1",7193,52,164,12,2,4.05000000000001 +"Schulze2007","thorough_2",1031,52,164,12,3,4.78 +"Schulze2007","thorough_2",2847,52,164,27,2,11.98 +"Schulze2007","thorough_2",7193,52,164,16,2,7.05000000000001 +"Eklund2004","thorough_1",1031,54,441,18,3,18.39 +"Eklund2004","thorough_1",2847,54,440,9,2,10.26 +"Eklund2004","thorough_1",7193,54,441,12,5,15.88 +"Eklund2004","thorough_2",1031,54,441,9,3,12.64 +"Eklund2004","thorough_2",2847,54,441,16,1,20.02 +"Eklund2004","thorough_2",7193,54,441,7,2,10.35 +"Agnarsson2004","thorough_1",1031,62,778,7,7,18.6 +"Agnarsson2004","thorough_1",2847,62,778,7,7,17.06 +"Agnarsson2004","thorough_1",7193,62,778,6,6,16.92 +"Agnarsson2004","thorough_2",1031,62,778,6,7,20 +"Agnarsson2004","thorough_2",2847,62,778,5,5,15.8 +"Agnarsson2004","thorough_2",7193,62,778,6,6,16.81 +"Zanol2014","thorough_1",1031,74,1322,5,1,20.02 +"Zanol2014","thorough_1",2847,74,1326,5,1,20 +"Zanol2014","thorough_1",7193,74,1324,4,1,20 +"Zanol2014","thorough_2",1031,74,1321,5,1,20.03 +"Zanol2014","thorough_2",2847,74,1325,5,1,20.02 +"Zanol2014","thorough_2",7193,74,1322,5,1,20 +"Zhu2013","thorough_1",1031,75,641,8,1,20.01 +"Zhu2013","thorough_1",2847,75,642,7,1,20.02 +"Zhu2013","thorough_1",7193,75,645,7,1,20 +"Zhu2013","thorough_2",1031,75,643,7,2,20.01 +"Zhu2013","thorough_2",2847,75,643,7,1,20 +"Zhu2013","thorough_2",7193,75,646,6,1,20 +"Giles2015","thorough_1",1031,78,714,6,1,20.0200000000001 +"Giles2015","thorough_1",2847,78,713,6,2,20 +"Giles2015","thorough_1",7193,78,714,7,1,20 +"Giles2015","thorough_2",1031,78,712,6,2,20.01 +"Giles2015","thorough_2",2847,78,713,6,1,20.02 +"Giles2015","thorough_2",7193,78,717,5,2,20.02 +"Dikow2009","thorough_1",1031,88,1611,4,1,20.01 +"Dikow2009","thorough_1",2847,88,1611,3,1,20.03 +"Dikow2009","thorough_1",7193,88,1611,3,1,20 +"Dikow2009","thorough_2",1031,88,1615,3,1,20.0500000000001 +"Dikow2009","thorough_2",2847,88,1614,4,1,20.0899999999999 +"Dikow2009","thorough_2",7193,88,1612,4,2,20.0200000000001 
diff --git a/dev/benchmarks/results_t274_nni_perturb.csv b/dev/benchmarks/results_t274_nni_perturb.csv new file mode 100644 index 000000000..b5a124045 --- /dev/null +++ b/dev/benchmarks/results_t274_nni_perturb.csv @@ -0,0 +1,121 @@ +"dataset","n_taxa","nni_cycles","seed","best_score","wall_s" +"Zhu2013",75,0,69788,645,2.06 +"Zhu2013",75,0,8923,638,10.82 +"Zhu2013",75,0,79376,640,2.42 +"Zhu2013",75,0,16815,643,2.8 +"Zhu2013",75,0,19686,639,2.9 +"Zhu2013",75,0,63005,642,4.28 +"Zhu2013",75,0,84922,640,2.58 +"Zhu2013",75,0,43596,640,3.17 +"Zhu2013",75,0,40810,644,1.78 +"Zhu2013",75,0,24478,641,2.1 +"Zhu2013",75,0,26571,638,3.06 +"Zhu2013",75,0,69494,639,2.53 +"Zhu2013",75,0,91340,639,2.24 +"Zhu2013",75,0,50693,640,1.45 +"Zhu2013",75,0,23811,645,1.67 +"Zhu2013",75,0,75529,640,2.05 +"Zhu2013",75,0,11851,644,1.55 +"Zhu2013",75,0,34949,638,2.04 +"Zhu2013",75,0,65380,639,2.44 +"Zhu2013",75,0,73338,641,1.75 +"Zhu2013",75,5,69788,645,2.19 +"Zhu2013",75,5,8923,638,8.53999999999999 +"Zhu2013",75,5,79376,640,3.61 +"Zhu2013",75,5,16815,643,3.93000000000001 +"Zhu2013",75,5,19686,641,3.92 +"Zhu2013",75,5,63005,642,3.89999999999999 +"Zhu2013",75,5,84922,638,3.22000000000001 +"Zhu2013",75,5,43596,640,4.14 +"Zhu2013",75,5,40810,641,2.73999999999999 +"Zhu2013",75,5,24478,639,4.36 +"Zhu2013",75,5,26571,638,7.09 +"Zhu2013",75,5,69494,640,3.92 +"Zhu2013",75,5,91340,638,3.86 +"Zhu2013",75,5,50693,638,4.34 +"Zhu2013",75,5,23811,645,2.91 +"Zhu2013",75,5,75529,639,4.33 +"Zhu2013",75,5,11851,644,2.19 +"Zhu2013",75,5,34949,640,2.99999999999999 +"Zhu2013",75,5,65380,640,2.98000000000002 +"Zhu2013",75,5,73338,641,2.01999999999998 +"Giles2015",78,0,69788,714,2.09 +"Giles2015",78,0,8923,711,2.05000000000001 +"Giles2015",78,0,79376,710,2.85999999999999 +"Giles2015",78,0,16815,712,1.98000000000002 +"Giles2015",78,0,19686,712,2.63 +"Giles2015",78,0,63005,710,1.97 +"Giles2015",78,0,84922,711,2.40000000000001 +"Giles2015",78,0,43596,710,2.16999999999999 +"Giles2015",78,0,40810,713,2.84999999999999 
+"Giles2015",78,0,24478,713,1.94 +"Giles2015",78,0,26571,711,2.14000000000001 +"Giles2015",78,0,69494,712,1.88999999999999 +"Giles2015",78,0,91340,710,2.92000000000002 +"Giles2015",78,0,50693,712,1.91 +"Giles2015",78,0,23811,711,2.84999999999999 +"Giles2015",78,0,75529,712,3.08000000000001 +"Giles2015",78,0,11851,715,3.48999999999998 +"Giles2015",78,0,34949,713,2.04000000000002 +"Giles2015",78,0,65380,712,3.23999999999998 +"Giles2015",78,0,73338,712,2.26000000000002 +"Giles2015",78,5,69788,711,3.53999999999999 +"Giles2015",78,5,8923,711,3.59999999999999 +"Giles2015",78,5,79376,711,3.88 +"Giles2015",78,5,16815,712,2.95000000000002 +"Giles2015",78,5,19686,711,3.63999999999999 +"Giles2015",78,5,63005,710,2.75 +"Giles2015",78,5,84922,711,3.5 +"Giles2015",78,5,43596,710,2.5 +"Giles2015",78,5,40810,712,6.44 +"Giles2015",78,5,24478,710,4.63 +"Giles2015",78,5,26571,711,2.54000000000002 +"Giles2015",78,5,69494,714,3.06999999999999 +"Giles2015",78,5,91340,711,4.09999999999999 +"Giles2015",78,5,50693,712,2.69 +"Giles2015",78,5,23811,710,3.88 +"Giles2015",78,5,75529,713,3.06 +"Giles2015",78,5,11851,712,4.27000000000001 +"Giles2015",78,5,34949,711,3.41999999999999 +"Giles2015",78,5,65380,712,2.92000000000002 +"Giles2015",78,5,73338,710,2.86000000000001 +"Dikow2009",88,0,69788,1612,5.43999999999997 +"Dikow2009",88,0,8923,1615,4.20000000000005 +"Dikow2009",88,0,79376,1621,3.94 +"Dikow2009",88,0,16815,1620,4.15999999999997 +"Dikow2009",88,0,19686,1616,3.25999999999999 +"Dikow2009",88,0,63005,1616,3.36000000000001 +"Dikow2009",88,0,84922,1614,3.05000000000001 +"Dikow2009",88,0,43596,1611,5.62 +"Dikow2009",88,0,40810,1615,5.38 +"Dikow2009",88,0,24478,1611,7.53000000000003 +"Dikow2009",88,0,26571,1615,10.79 +"Dikow2009",88,0,69494,1611,4.05000000000001 +"Dikow2009",88,0,91340,1616,12.66 +"Dikow2009",88,0,50693,1617,3.16999999999996 +"Dikow2009",88,0,23811,1612,5.58000000000004 +"Dikow2009",88,0,75529,1611,4.35999999999996 +"Dikow2009",88,0,11851,1614,8.98000000000002 
+"Dikow2009",88,0,34949,1613,3.86000000000001 +"Dikow2009",88,0,65380,1613,4.27999999999997 +"Dikow2009",88,0,73338,1613,3.77000000000004 +"Dikow2009",88,5,69788,1612,7.13999999999999 +"Dikow2009",88,5,8923,1612,7.13999999999999 +"Dikow2009",88,5,79376,1611,7.87 +"Dikow2009",88,5,16815,1611,7.22000000000003 +"Dikow2009",88,5,19686,1615,8.38 +"Dikow2009",88,5,63005,1611,8.92000000000002 +"Dikow2009",88,5,84922,1614,4.88 +"Dikow2009",88,5,43596,1614,5.52999999999997 +"Dikow2009",88,5,40810,1615,5.86000000000001 +"Dikow2009",88,5,24478,1611,10.22 +"Dikow2009",88,5,26571,1613,7.58000000000004 +"Dikow2009",88,5,69494,1611,6 +"Dikow2009",88,5,91340,1617,3.94999999999999 +"Dikow2009",88,5,50693,1612,4.17000000000002 +"Dikow2009",88,5,23811,1611,8.89999999999998 +"Dikow2009",88,5,75529,1611,8.75 +"Dikow2009",88,5,11851,1612,6.81999999999999 +"Dikow2009",88,5,34949,1612,3.65000000000003 +"Dikow2009",88,5,65380,1615,6.15999999999997 +"Dikow2009",88,5,73338,1613,3.97000000000003 diff --git a/dev/benchmarks/strategies.md b/dev/benchmarks/strategies.md new file mode 100644 index 000000000..1a41780a9 --- /dev/null +++ b/dev/benchmarks/strategies.md @@ -0,0 +1,497 @@ +# Driven Search Strategy Space + +Last updated: 2026-03-17 + +This document defines all tunable parameters of the C++ driven search +engine (`MaximizeParsimony()`) and proposes named strategy presets for +benchmarking (Phase 6D) and adaptive search (Phase 6F). + +## Pipeline Overview + +Each replicate executes this fixed phase sequence: + +``` +Wagner → TBR → XSS → RSS → CSS → Ratchet → Drift → Final TBR +``` + +Phases may be skipped by setting their cycle/round counts to 0. +Sectorial phases (XSS, RSS, CSS) only run when the tree has +≥ 2 × `sectorMinSize` tips. + +Between replicates, the pool collects the best tree(s) and tree +fusing may run (every `fuseInterval` replicates). + +--- + +## Parameter Categories + +### A. 
Strategy Parameters (per-replicate search behavior) + +These control how each replicate explores tree space. They are the +primary targets for strategy tuning in Phase 6D. + +#### A1. Wagner Start + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `wagnerStarts` | `wagner_starts` | 1 | Random Wagner trees built per replicate; best-scoring one used as TBR starting point. Higher values improve starting topology at low cost for small datasets. | + +#### A2. TBR + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `tbrMaxHits` | `tbr_max_hits` | 1 | Equal-score hits before TBR declares convergence. Higher values explore the plateau more thoroughly. | +| `tabuSize` | `tabu_size` | 100 | Tabu list capacity for TBR. Prevents revisiting recently-explored topologies on plateaus. 0 = disabled. | + +#### A3. Ratchet + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `ratchetCycles` | `ratchet_cycles` | 10 | Perturbation-then-search cycles per replicate. Primary knob for ratchet intensity. 0 = skip ratchet. | +| `ratchetPerturbProb` | `ratchet_perturb_prob` | 0.04 | Per-character probability of perturbation. Higher = more disruptive. | +| `ratchetPerturbMode` | `ratchet_perturb_mode` | 0 | 0 = zero (silence characters), 1 = upweight (double weight), 2 = mixed (zero some, double others). | +| `ratchetPerturbMaxMoves` | `ratchet_perturb_max_moves` | 0 (auto) | Max TBR moves during perturbation phase. 0 = `max(20, min(200, n_tip/8))`. | +| `ratchetAdaptive` | `ratchet_adaptive` | FALSE | Auto-tune `perturbProb` to target a ~30% escape rate. | + +#### A4. Drift + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `driftCycles` | `drift_cycles` | 6 | Suboptimal-exploration cycles per replicate. 0 = skip drift. 
| +| `driftAfdLimit` | `drift_afd_limit` | 3 | Max absolute fit difference (steps) for accepting suboptimal moves. | +| `driftRfdLimit` | `drift_rfd_limit` | 0.1 | Max relative fit difference for accepting suboptimal moves. | + +#### A5. Sectorial Search + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `xssRounds` | `xss_rounds` | 3 | Exclusive Sectorial Search (systematic partition) rounds. 0 = skip XSS. | +| `xssPartitions` | `xss_partitions` | 4 | Number of non-overlapping sectors per XSS round. | +| `rssRounds` | `rss_rounds` | 1 | Random Sectorial Search rounds after XSS. 0 = skip RSS. | +| `cssRounds` | `css_rounds` | 1 | Constrained Sectorial Search (full-tree exact scoring) rounds. 0 = skip CSS. | +| `cssPartitions` | `css_partitions` | 4 | Partitions for CSS. | +| `sectorMinSize` | `sector_min_size` | 6 | Minimum sector clade size (tips). | +| `sectorMaxSize` | `sector_max_size` | 50 | Maximum sector clade size (tips). | + +#### A6. Tree Fusing + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `fuseInterval` | `fuse_interval` | 3 | Fuse best tree against pool every N replicates. | +| `fuseAcceptEqual` | `fuse_accept_equal` | FALSE | Accept equal-score fusions (increases pool diversity). | + +### B. Convergence Parameters (when to stop) + +These control total search effort across replicates. Independent of +per-replicate strategy — benchmarking should generally fix these. + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `maxReplicates` | `max_replicates` | 100 | Hard cap on replicates. | +| `targetHits` | `target_hits` | `max(10, n_tip/5)` | Stop after this many independent hits to the best score. | + +### C. 
Pool Parameters + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `poolMaxSize` | `pool_max_size` | 100 | Maximum trees retained in the pool. | +| `poolSuboptimal` | `pool_suboptimal` | 0.0 | Score tolerance for retaining suboptimal trees. | + +### D. Infrastructure Parameters (not strategy-relevant) + +| R parameter | C++ field | Default | Description | +|-------------|-----------|---------|-------------| +| `concavity` | — | Inf | Scoring mode: Inf = EW, finite = IW, "profile" = profile parsimony. | +| `nThreads` | — | 1 | Worker threads. | +| `verbosity` | `verbosity` | 1 | 0 = silent, 1 = per-replicate, 2 = per-phase. | +| `progressCallback` | — | NULL (auto) | Custom progress reporting function. | +| `constraint` | — | (none) | Topology constraint (splits). | +| — | `max_seconds` | 0 | Timeout in seconds (available in C++ bridge, not exposed in R-level `MaximizeParsimony`). | + +### E. Not Yet Implemented (noted in production plan) + +| Parameter | Description | Status | +|-----------|-------------|--------| +| SPR vs TBR phase choice | Use SPR first, escalate to TBR only where SPR plateaus | Not implemented (T-012) | +| NNI pre-pass | Quick NNI before TBR | Not implemented | + +--- + +## Strategy Vector + +For Phase 6D benchmarking, the **strategy vector** consists of the 20 +Category A parameters. Each preset specifies values for all 20. + +--- + +## Named Strategy Presets + +### 1. `sprint` + +Minimal effort for fast interactive exploration. Skips expensive phases. +Suitable as a quick-look default or for very small datasets where a +single TBR pass is often sufficient. 
+ +``` +wagnerStarts = 1 +tbrMaxHits = 1 +tabuSize = 0 +ratchetCycles = 3 +ratchetPerturbProb = 0.04 +ratchetPerturbMode = 0 +ratchetPerturbMaxMoves = 0 +ratchetAdaptive = FALSE +driftCycles = 0 # skip drift +driftAfdLimit = 3 +driftRfdLimit = 0.1 +xssRounds = 1 +xssPartitions = 4 +rssRounds = 0 # skip RSS +cssRounds = 0 # skip CSS +cssPartitions = 4 +sectorMinSize = 6 +sectorMaxSize = 50 +fuseInterval = 5 +fuseAcceptEqual = FALSE +``` + +**Rationale**: 3 ratchet cycles (vs 10) provides some escape from local +optima without large time cost. No drift (most expensive phase per cycle). +Minimal sectorial (1 XSS round, no RSS/CSS). No tabu (saves memory and +TBR overhead for quick passes). + +### 2. `default` + +Current production defaults. Balanced for general use. + +``` +wagnerStarts = 1 +tbrMaxHits = 1 +tabuSize = 100 +ratchetCycles = 5 +ratchetPerturbProb = 0.04 +ratchetPerturbMode = 0 +ratchetPerturbMaxMoves = 0 +ratchetAdaptive = FALSE +driftCycles = 2 +driftAfdLimit = 3 +driftRfdLimit = 0.1 +xssRounds = 3 +xssPartitions = 4 +rssRounds = 1 +cssRounds = 0 +cssPartitions = 4 +sectorMinSize = 6 +sectorMaxSize = 50 +fuseInterval = 3 +fuseAcceptEqual = FALSE +``` + +### 3. `thorough` + +More exhaustive exploration. More cycles of everything, adaptive ratchet, +multiple Wagner starts, wider plateau exploration. + +``` +wagnerStarts = 3 +tbrMaxHits = 3 +tabuSize = 200 +ratchetCycles = 20 +ratchetPerturbProb = 0.04 +ratchetPerturbMode = 2 # mixed +ratchetPerturbMaxMoves = 0 +ratchetAdaptive = TRUE +driftCycles = 12 +driftAfdLimit = 5 +driftRfdLimit = 0.15 +xssRounds = 5 +xssPartitions = 6 +rssRounds = 3 +cssRounds = 2 +cssPartitions = 6 +sectorMinSize = 6 +sectorMaxSize = 80 +fuseInterval = 2 +fuseAcceptEqual = TRUE +``` + +**Rationale**: Doubles most cycle counts. Adaptive ratchet tunes perturbation +intensity automatically. Mixed perturbation mode (zero + upweight) provides +more diverse perturbation landscapes. More Wagner starts improve starting +point quality. 
Higher `tbrMaxHits` + `tabuSize` explore plateaus better. +`fuseAcceptEqual` increases pool diversity for fusing. + +### 4. `ratchet_heavy` + +Emphasize ratchet perturbation for escaping deep local optima. Useful +when the fitness landscape has many local optima separated by large +barriers (common in large datasets with many inapplicable characters). + +``` +wagnerStarts = 1 +tbrMaxHits = 1 +tabuSize = 100 +ratchetCycles = 30 +ratchetPerturbProb = 0.08 +ratchetPerturbMode = 2 # mixed +ratchetPerturbMaxMoves = 0 +ratchetAdaptive = TRUE +driftCycles = 2 # reduced +driftAfdLimit = 3 +driftRfdLimit = 0.1 +xssRounds = 1 # reduced +xssPartitions = 4 +rssRounds = 0 # skip +cssRounds = 0 # skip +cssPartitions = 4 +sectorMinSize = 6 +sectorMaxSize = 50 +fuseInterval = 3 +fuseAcceptEqual = FALSE +``` + +**Rationale**: 3× ratchet cycles, 2× perturbation probability, adaptive +tuning + mixed mode. Drift and sectorial reduced to leave time budget +for ratchet. Most time goes to perturbation-escape cycles. + +### 5. `sectorial_heavy` + +Emphasize sectorial search for large trees where full-tree TBR is +expensive. Decompose the problem into cheaper subproblems. + +``` +wagnerStarts = 1 +tbrMaxHits = 1 +tabuSize = 100 +ratchetCycles = 5 # reduced +ratchetPerturbProb = 0.04 +ratchetPerturbMode = 0 +ratchetPerturbMaxMoves = 0 +ratchetAdaptive = FALSE +driftCycles = 3 # reduced +driftAfdLimit = 3 +driftRfdLimit = 0.1 +xssRounds = 8 # increased +xssPartitions = 6 # more partitions +rssRounds = 4 # increased +cssRounds = 3 # increased +cssPartitions = 6 +sectorMinSize = 6 +sectorMaxSize = 80 # larger sectors +fuseInterval = 2 +fuseAcceptEqual = TRUE +``` + +**Rationale**: Heavy sectorial search (XSS + RSS + CSS) with more +partitions and larger max sector size. Ratchet and drift reduced. +For large trees (60+ tips), sectorial search per-step cost is lower +than full-tree TBR, so more sectorial rounds may yield better +time-to-optimal. + +### 6. 
`drift_heavy` + +Emphasize tree drifting for exploring the near-optimal landscape. +Useful when the fitness landscape has broad plateaus or many +near-optimal trees. + +``` +wagnerStarts = 1 +tbrMaxHits = 1 +tabuSize = 100 +ratchetCycles = 5 # reduced +ratchetPerturbProb = 0.04 +ratchetPerturbMode = 0 +ratchetPerturbMaxMoves = 0 +ratchetAdaptive = FALSE +driftCycles = 20 # increased +driftAfdLimit = 5 # wider +driftRfdLimit = 0.2 # wider +xssRounds = 2 # reduced +xssPartitions = 4 +rssRounds = 1 +cssRounds = 0 # skip +cssPartitions = 4 +sectorMinSize = 6 +sectorMaxSize = 50 +fuseInterval = 3 +fuseAcceptEqual = TRUE +``` + +**Rationale**: 3× drift cycles with relaxed acceptance criteria +(AFD 5, RFD 0.2) allow the search to wander farther from local +optima via incremental suboptimal moves. Ratchet and sectorial +reduced. `fuseAcceptEqual` helps propagate diverse drifted topologies. + +--- + +## Preset Summary Table + +| Preset | Wagner | TBR hits | Ratchet | Drift | XSS | RSS | CSS | Fuse int | +|--------|--------|----------|---------|-------|-----|-----|-----|----------| +| sprint | 1 | 1 | 3 cyc | off | 1 rnd | off | off | 5 | +| default | 1 | 1 | 5 cyc | 2 cyc | 3 rnd | 1 rnd | off | 3 | +| thorough | 3 | 3 | 20 cyc adaptive | 12 cyc | 5 rnd | 3 rnd | 2 rnd | 2 | +| ratchet_heavy | 1 | 1 | 30 cyc adaptive | 2 cyc | 1 rnd | off | off | 3 | +| sectorial_heavy | 1 | 1 | 5 cyc | 3 cyc | 8 rnd | 4 rnd | 3 rnd | 2 | +| drift_heavy | 1 | 1 | 5 cyc | 20 cyc | 2 rnd | 1 rnd | off | 3 | + +--- + +## Usage in Benchmarking (Phase 6D) + +The benchmarking framework should: + +1. Fix convergence parameters (`maxReplicates`, `targetHits`) identically + across presets to make wall-clock comparisons fair. +2. For each benchmark dataset × preset combination, measure: + - Time to find the best-known score (from `datasets.md`) + - Total time for convergence or timeout + - Number of replicates to convergence + - Phase-level timing breakdown (from `timings` attribute) +3.
The results matrix (datasets × presets → metrics) feeds Phase 6E + (predictive model) and Phase 6F (adaptive search). + +## Usage in Adaptive Search (Phase 6F) + +The warmup-then-switch approach: +1. Run 2–3 replicates with `default` preset while collecting phase timings. +2. Compute dataset features + phase yield metrics (e.g., "ratchet improved + score in 80% of cycles" → ratchet-heavy might help). +3. Select the best preset for remaining replicates. + +Alternatively, online adaptation could smoothly interpolate between presets +based on per-phase improvement rates. + +--- + +## R Helper Function + +The `dev/benchmarks/bench_datasets.R` benchmark utility can use a +`get_strategy(name)` helper. Example: + +```r +get_strategy <- function(name = c("sprint", "default", "thorough", + "ratchet_heavy", "sectorial_heavy", + "drift_heavy")) { + name <- match.arg(name) + strategies <- list( + sprint = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 0L, + ratchetCycles = 3L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 0L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 1L, xssPartitions = 4L, rssRounds = 0L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 5L, fuseAcceptEqual = FALSE + ), + default = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 5L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 2L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 3L, xssPartitions = 4L, rssRounds = 1L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 3L, fuseAcceptEqual = FALSE + ), + thorough = list( + wagnerStarts = 3L, tbrMaxHits = 3L, tabuSize = 200L, + ratchetCycles = 20L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = 
TRUE, + driftCycles = 12L, driftAfdLimit = 5L, driftRfdLimit = 0.15, + xssRounds = 5L, xssPartitions = 6L, rssRounds = 3L, + cssRounds = 2L, cssPartitions = 6L, + sectorMinSize = 6L, sectorMaxSize = 80L, + fuseInterval = 2L, fuseAcceptEqual = TRUE + ), + ratchet_heavy = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 30L, ratchetPerturbProb = 0.08, + ratchetPerturbMode = 2L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = TRUE, + driftCycles = 2L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 1L, xssPartitions = 4L, rssRounds = 0L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 3L, fuseAcceptEqual = FALSE + ), + sectorial_heavy = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 5L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 3L, driftAfdLimit = 3L, driftRfdLimit = 0.1, + xssRounds = 8L, xssPartitions = 6L, rssRounds = 4L, + cssRounds = 3L, cssPartitions = 6L, + sectorMinSize = 6L, sectorMaxSize = 80L, + fuseInterval = 2L, fuseAcceptEqual = TRUE + ), + drift_heavy = list( + wagnerStarts = 1L, tbrMaxHits = 1L, tabuSize = 100L, + ratchetCycles = 5L, ratchetPerturbProb = 0.04, + ratchetPerturbMode = 0L, ratchetPerturbMaxMoves = 0L, + ratchetAdaptive = FALSE, + driftCycles = 20L, driftAfdLimit = 5L, driftRfdLimit = 0.2, + xssRounds = 2L, xssPartitions = 4L, rssRounds = 1L, + cssRounds = 0L, cssPartitions = 4L, + sectorMinSize = 6L, sectorMaxSize = 50L, + fuseInterval = 3L, fuseAcceptEqual = TRUE + ) + ) + strategies[[name]] +} +``` + +This helper will be formalized in the benchmarking framework (T-004). + +--- + +## External Benchmark Datasets (MorphoBank corpus) + +### Train/validation split + +The `neotrans/inst/matrices/` directory contains ~800 MorphoBank phylogenetic +matrices. 
These supplement the 14 bundled datasets for broader, less +overfitting-prone benchmarking. + +**Split rule:** A matrix belongs to the **validation** set if its MorphoBank +project number is divisible by 5 (i.e., `project_id %% 5 == 0`); all others +are **training**. The 7 `syab*` files (non-MorphoBank) are always training. + +After filtering (ntax ≥ 20, parse OK, dedup): 535 training, 124 validation. + +**Usage rules:** +- **Training** matrices may be used freely during development and tuning. +- **Validation** matrices are a **one-way door**: run once to confirm that + improvements generalize. Results must **never** inform strategy tuning. +- If validation is ever used for tuning, the split is compromised and must + be rebuilt with a new rule. + +### Dedup + +Multi-file projects (same MorphoBank project, separate `.nex` files) often +contain the same character matrix with minor taxon-sampling variations. These +are flagged as `dedup_drop = TRUE` in the catalogue. The dedup uses pairwise +character identity ≥ 95% on shared taxa (requiring ≥ 80% taxon overlap), +keeping the largest matrix per redundancy cluster. + +24 near-duplicates are excluded, leaving 659 usable matrices. + +### Fixed 25-matrix training sample + +For routine benchmarking, a fixed sample of 25 matrices is used +(`MBANK_FIXED_SAMPLE` in `bench_datasets.R`). 
Selected via max-min distance +on standardized (ntax, nchar, pct_missing, pct_inapp) within each tier: + +| Tier | Count | Keys | +|------|-------|------| +| Small (20–30) | 7 | project532, project2346, project2451, project4501, project944, project971_(1), project2762 | +| Medium (31–60) | 7 | project826, project561, project571, project4146_(3), project3688, project4049, project423 | +| Large (61–120) | 7 | project4286, project4359, project4397, project2084_(1), project2771, project2184, project3938 | +| XLarge (121+) | 4 | syab07201, project4133, project804, project4284 | + +**Do not modify this list.** Benchmark comparisons require the same sample. diff --git a/dev/benchmarks/stress_large_findings.md b/dev/benchmarks/stress_large_findings.md new file mode 100644 index 000000000..23d5dcdb0 --- /dev/null +++ b/dev/benchmarks/stress_large_findings.md @@ -0,0 +1,83 @@ +# T-069 Stress Test Findings — 150–225 taxa +Agent F, 2026-03-18 + +## Datasets + +| File | Taxa | Chars | NA blocks | Inapplicable | +|------|------|-------|-----------|--------------| +| project175.nex | 165 | 71 | 2 | 0% | +| project3763.nex | 205 | 103 | 3 | 50.1% | +| syab07204.nex | 225 | 748 | 12 | 25.1% | + +## Key Findings + +### 1. Scaling exponents (synthetic series, n=20–225) + +| Metric | Exponent | Expected | +|--------|----------|---------| +| `n_candidates` | **n^2.86** | O(n^2) = 2.0 | +| `indirect_us` | **n^2.73** | — | +| `clip_incr_us` | **n^1.50** | — | + +Candidate count scales super-quadratically, approaching cubic rather than the expected O(n^2) (larger pruned subtrees give more valid regraft positions). Indirect scoring time tracks the candidate count closely. Clip/incremental time grows sub-linearly relative to the candidate count — incremental state amortises well. + +The `n_candidates` and `indirect_us` exponents are both consistent with the existing AGENTS.md note (~n^2.8 TBR cost). + +### 2.
NA block count drives per-candidate cost + +| Dataset | n_tips | n_blocks | ns/candidate | +|---------|--------|----------|--------------| +| project175 | 165 | 2 | 12.6 ns | +| project3763 | 205 | 3 | 19.2 ns | +| syab07204 | 225 | 12 | **57.5 ns** | + +syab07204's 12 NA character blocks cause ~4.6× higher per-candidate cost than the 2-block case, and ~3× higher than the 3-block case. The NA three-pass scoring cost scales with n_blocks, not just n_tips. This is a real bottleneck for large, character-rich matrices with many inapplicable characters. + +The existing baseline in AGENTS.md (`~23 ns at 75 tips`) was measured on small inapplicable.phyData sets. Large real matrices with many NA blocks can be 2–3× slower per candidate. + +### 3. TBR fraction surpasses ratchet+drift at 200+ taxa + +| Dataset | TBR% | Ratchet% | Drift% | +|---------|------|----------|--------| +| project175 (165t, thorough) | 17% | 38% | 42% | +| project3763 (205t, default) | **57%** | 13% | 28% | +| syab07204 (225t, default) | **49%** | 13% | 27% | + +At ≤100 taxa, ratchet+drift dominate (~65–70%). At 200+ taxa, TBR itself becomes the largest single cost (49–57%). This crossover happens around 150–175 taxa. The phase distribution shift is driven by the super-quadratic TBR cost overwhelming the approximately-linear perturbation overhead. + +### 4. Pool collapse at large n with many characters + +syab07204 (225t, 748 chars) produced pool sizes of **8 and 2** across the 2 seeded runs (2 reps each, nThreads=2). In contrast, project3763 (205t, 103 chars) filled the 100-tree pool even from 2 reps. + +The near-empty pool for syab07204 means: +- Tree fusing has almost no material to work with +- MPT enumeration from the pool will be from very few seeds +- Users may get poor solutions without many more replicates + +This is expected behaviour (each TBR pass takes ~150ms, so a 2-rep run completes very few TBR iterations), but it highlights that **recommended replicates should scale with taxa × chars**.
At 225t / 748 chars, users need 10–20+ replicates for reliable results. + +### 5. Score variability at large n + +| Dataset | Score seed1 | Score seed2 | Δ | +|---------|------------|------------|---| +| project175 | 419 | 424 | 5 (1.2%) | +| project3763 | 1643 | 1513 | 130 (7.9%) | +| syab07204 | 11785 | 11933 | 148 (1.3%) | + +project3763 shows high variability (7.9%) despite only 205 taxa — likely because the 50% inapplicable data creates a very complex landscape. High inapplicable fractions interact with the NA three-pass scoring to create many near-equal plateau trees. + +### 6. Memory (snapshot bytes per TBR pass) + +| Dataset | Snapshot KB | +|---------|------------| +| project175 (165t, 2 blocks) | 66.8 KB | +| project3763 (205t, 3 blocks) | 290.8 KB | +| syab07204 (225t, 12 blocks) | **547.2 KB** | + +Snapshot memory is manageable (well under 1 MB per pass), but the 547 KB for syab07204 means that with nThreads=2 the two threads together hold ~1.1 MB of snapshot state. Not a memory problem, but the resulting cache pressure probably contributes to the elevated per-candidate cost. + +## Suggested Follow-up Tasks + +- **T-073 (potential)**: Benchmark per-candidate cost as a function of `n_blocks` (hold n_tips fixed). Determine whether there's a block-count threshold beyond which a different NA scoring strategy would help. +- **T-074 (potential)**: Auto-scale `maxReplicates` recommendation in `SearchControl()` based on n_tips × n_chars × n_blocks. +- Revisit `thorough` strategy for large char-dense matrices: at 225t/748 chars, the ratchet+drift overhead is proportionally small (40%), so increasing ratchet/drift cycles is cheap relative to per-pass TBR cost.
diff --git a/dev/benchmarks/stress_large_results.csv b/dev/benchmarks/stress_large_results.csv new file mode 100644 index 000000000..cbfee5f20 --- /dev/null +++ b/dev/benchmarks/stress_large_results.csv @@ -0,0 +1,4 @@ +"file","n_tips","n_chars","strategy","score1","score2","time1","time2","pool1","reps1" +"project175.nex",165,71,"thorough",419,424,1.86,1.85,100,1 +"project3763.nex",205,103,"default",1643,1513,14.58,17.88,100,1 +"syab07204.nex",225,748,"default",11785,11933,41.83,35.32,8,1 diff --git a/dev/benchmarks/t252_hamilton.sh b/dev/benchmarks/t252_hamilton.sh new file mode 100644 index 000000000..2413bc62f --- /dev/null +++ b/dev/benchmarks/t252_hamilton.sh @@ -0,0 +1,70 @@ +#!/bin/bash +#SBATCH --job-name=t252-mbank +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=8G +#SBATCH --time=8:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t252_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t252_%j.err + +# T-252: MorphoBank training-set baseline benchmark +# 25 matrices x 3 budgets (30/60/120s) x 5 seeds = 375 runs +# Estimated: ~5 hours + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t252_results + +mkdir -p "$LIB" +mkdir -p "$OUTDIR" +mkdir -p /nobackup/$USER/TreeSearch/logs + +echo "=== T-252 MorphoBank Training-Set Benchmark ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Build and install from latest cpp-search +cd "$REPO" || exit 1 +git pull --ff-only origin cpp-search 2>/dev/null || true +echo "Git HEAD: $(git log --oneline -1)" +echo "" + +rm -f src/*.o src/*.so +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? 
+echo "Install exit code: $rc" +rm -f TreeSearch_*.tar.gz + +if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 +fi + +# Verify neotrans corpus is available +NEOTRANS=/nobackup/$USER/neotrans/inst/matrices +if [ ! -d "$NEOTRANS" ]; then + echo "FATAL: neotrans matrices not found at $NEOTRANS" + echo "Clone with: cd /nobackup/$USER && git clone " + exit 1 +fi +echo "Neotrans matrices: $(ls $NEOTRANS | wc -l) files" +echo "" + +# Run benchmark +cd "$REPO" +export R_LIBS_USER="$LIB" +Rscript dev/benchmarks/bench_t252_mbank_training.R "$OUTDIR" 2>&1 + +echo "" +echo "Completed: $(date)" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t252_*.csv 2>/dev/null diff --git a/dev/benchmarks/t252_mbank_120s_20260327_1317.csv b/dev/benchmarks/t252_mbank_120s_20260327_1317.csv new file mode 100644 index 000000000..b6c1174a7 --- /dev/null +++ b/dev/benchmarks/t252_mbank_120s_20260327_1317.csv @@ -0,0 +1,126 @@ +"dataset","strategy","replicate","seed","n_taxa","best_score","replicates","hits_to_best","pool_size","timed_out","wall_s","time_to_best_s","wagner_ms","tbr_ms","xss_ms","rss_ms","css_ms","ratchet_ms","drift_ms","final_tbr_ms","fuse_ms","budget_s","source" +"project532","default",1,3847,21,1139,7,5,4,FALSE,0.597999999999956,0.174283273,148.503215,14.327887,24.690415,17.636236,0,303.184831,62.553178,12.901541,2.343453,120,"mbank_training" +"project532","default",2,3848,21,1139,6,4,3,FALSE,0.597999999999956,0.123916778,121.97986,33.485211,22.381566,18.933991,0,320.406746,55.284499,12.856154,1.757961,120,"mbank_training" +"project532","default",3,3849,21,1139,6,5,4,FALSE,0.579999999999927,0.139597412,132.649526,22.214793,26.952043,15.646982,0,300.59517,53.942751,12.362817,1.812183,120,"mbank_training" +"project532","default",4,3850,21,1139,7,3,3,FALSE,0.750999999999294,0.197873706,141.871555,42.623983,26.681093,20.747777,0,423.874721,68.928898,15.491427,0.949228,120,"mbank_training" 
+"project532","default",5,3851,21,1139,6,3,2,FALSE,0.592999999999847,0.129039236,125.361359,30.279261,26.760433,19.136244,0,319.94664,49.821137,13.127436,2.038109,120,"mbank_training" +"project2346","default",1,3847,23,316,18,2,4,FALSE,0.829999999999927,0.608430951,71.029184,35.473094,32.825955,19.374163,0,519.261742,131.533754,13.02882,0.765322,120,"mbank_training" +"project2346","default",2,3848,23,318,8,2,20,FALSE,0.304000000000087,0.117708173,33.520126,9.281411,16.761981,7.827445,0,158.257356,36.152855,5.366508,0.311627,120,"mbank_training" +"project2346","default",3,3849,23,320,12,2,4,FALSE,0.46599999999944,0.124930457,48.132288,18.466171,27.800715,12.278035,0,276.894882,67.959639,8.181343,0.703515,120,"mbank_training" +"project2346","default",4,3850,23,317,17,2,13,FALSE,0.795000000000073,0.344254344,68.922419,28.551914,29.73234,17.731987,0,483.753854,123.08423,12.051296,0.341163,120,"mbank_training" +"project2346","default",5,3851,23,314,78,1,4,FALSE,3.67699999999968,1.435090677,304.810463,132.69613,141.883327,73.51865,0,2356.842515,605.479236,51.597378,1.447145,120,"mbank_training" +"project2451","default",1,3847,24,735,12,2,4,FALSE,0.427999999999884,0.221843663,66.205828,13.396343,18.763832,10.832563,0,227.105905,79.846971,6.955762,0.330172,120,"mbank_training" +"project2451","default",2,3848,24,732,67,1,1,FALSE,3.22999999999956,0.823948679,346.979458,82.764533,119.244198,52.943228,0,1902.570635,683.326202,38.894812,0,120,"mbank_training" +"project2451","default",3,3849,24,731,7,2,6,FALSE,0.244999999999891,0.092966703,41.354672,8.912597,12.506295,4.7361,0,123.690091,42.28259,4.282265,0.193916,120,"mbank_training" +"project2451","default",4,3850,24,731,74,1,3,FALSE,3.45600000000013,1.132576511,383.106166,87.869532,126.007845,56.212429,0,2044.100132,703.889984,42.021767,1.663113,120,"mbank_training" 
+"project2451","default",5,3851,24,730,56,1,2,FALSE,2.67200000000048,0.277758048,280.567088,85.41411,100.316807,46.314025,0,1563.836515,557.562805,33.588465,0,120,"mbank_training" +"project4501","default",1,3847,24,118,11,4,63,FALSE,0.449999999999818,0.049543826,20.627493,11.198614,11.815055,7.776755,0,138.45961,22.195465,5.140892,14.184838,120,"mbank_training" +"project4501","default",2,3848,24,118,8,6,93,FALSE,0.360999999999876,0.02131701,15.809006,7.847642,11.572414,6.5989,0,69.68908,14.638501,3.76466,0.836356,120,"mbank_training" +"project4501","default",3,3849,24,118,11,8,63,FALSE,0.387000000000626,0.025869516,17.988182,13.159864,11.846536,7.174633,0,103.145235,18.32269,5.53868,1.248622,120,"mbank_training" +"project4501","default",4,3850,24,118,11,6,63,FALSE,0.428999999999178,0.05354915,18.767008,14.497546,12.386349,6.770313,0,134.033431,20.862917,5.271509,0.836095,120,"mbank_training" +"project4501","default",5,3851,24,118,9,4,93,FALSE,0.377000000000407,0.018355794,15.849192,12.337627,7.080306,5.553369,0,101.60287,17.539517,3.805357,0.62106,120,"mbank_training" +"project944","default",1,3847,25,128,7,7,60,FALSE,0.349999999999454,0.018372965,13.749598,8.665922,6.577338,4.388015,0,50.700024,9.316056,2.852453,0.884296,120,"mbank_training" +"project944","default",2,3848,25,128,6,6,60,FALSE,0.305999999999585,0.026536602,16.762202,6.242929,6.315995,4.568325,0,59.400532,10.678432,3.202963,1.232052,120,"mbank_training" +"project944","default",3,3849,25,128,6,6,60,FALSE,0.353000000000065,0.023519519,12.339231,7.414637,6.222741,4.003219,0,49.831448,10.604764,2.731975,1.047022,120,"mbank_training" +"project944","default",4,3850,25,128,8,8,60,FALSE,0.373000000000502,0.018053484,16.38878,8.637668,6.467942,5.269536,0,61.244825,13.507091,3.300045,0.928819,120,"mbank_training" +"project944","default",5,3851,25,128,9,9,60,FALSE,0.289999999999964,0.019552678,18.095653,12.151427,7.457316,5.676282,0,68.236955,11.678938,3.661214,1.465672,120,"mbank_training" 
+"project971_(1)","default",1,3847,26,157,7,5,100,FALSE,0.199999999999818,0.034288362,19.540965,8.624554,13.504326,6.719836,0,112.82168,17.875589,5.259305,0.877813,120,"mbank_training" +"project971_(1)","default",2,3848,26,157,7,3,100,FALSE,0.216000000000349,0.03481252,16.035452,7.674916,8.319572,7.100072,0,124.029589,18.743483,5.235702,0.826979,120,"mbank_training" +"project971_(1)","default",3,3849,26,157,14,9,100,FALSE,0.42200000000048,0.07753632,34.000391,20.956551,31.22286,14.428183,0,240.754111,33.668735,10.389131,1.483253,120,"mbank_training" +"project971_(1)","default",4,3850,26,157,9,5,100,FALSE,0.268000000000029,0.034529356,20.521152,11.128321,12.392041,9.051989,0,167.722094,26.988611,6.584964,0.792242,120,"mbank_training" +"project971_(1)","default",5,3851,26,157,8,5,100,FALSE,0.269000000000233,0.033144458,22.034311,10.560942,19.018461,9.771033,0,142.472617,25.789647,7.242129,1.522097,120,"mbank_training" +"project2762","default",1,3847,29,259,9,5,100,FALSE,0.552000000000589,0.076698963,102.552619,31.991234,32.017848,18.777667,0,255.924423,60.232298,12.068471,3.722681,120,"mbank_training" +"project2762","default",2,3848,29,259,9,5,100,FALSE,0.559000000000196,0.124296864,110.021915,23.549355,22.25614,22.257053,0,265.42186,58.979949,11.944967,3.162605,120,"mbank_training" +"project2762","default",3,3849,29,259,14,7,100,FALSE,0.896999999999935,0.271071243,182.034881,44.101888,34.36301,33.517622,0,439.83955,83.552751,18.319235,3.683558,120,"mbank_training" +"project2762","default",4,3850,29,259,7,2,100,FALSE,0.552000000000589,0.072439469,103.33374,22.591772,28.818399,19.640743,0,260.223182,55.45098,10.463058,0.79617,120,"mbank_training" +"project2762","default",5,3851,29,259,9,2,100,FALSE,0.640000000000327,0.196973069,111.81434,30.079626,29.628034,21.70403,0,335.326449,74.873774,12.244683,1.187246,120,"mbank_training" 
+"project826","default",1,3847,33,431,10,5,100,FALSE,0.731000000000677,0.076276196,121.741229,108.597777,43.75939,21.104991,0,309.097116,60.566575,12.983497,2.531488,120,"mbank_training" +"project826","default",2,3848,33,431,14,8,100,FALSE,0.932999999999993,0.072963527,190.042457,114.346752,47.944522,34.721809,0,421.781289,70.243541,16.782441,3.219924,120,"mbank_training" +"project826","default",3,3849,33,431,9,5,100,FALSE,0.604999999999563,0.14725746,118.219918,80.220355,40.605209,16.564079,0,268.776122,41.222762,11.147686,2.454232,120,"mbank_training" +"project826","default",4,3850,33,431,13,7,100,FALSE,0.806999999999789,0.069525721,173.788642,93.184784,51.796127,23.401488,0,341.117535,62.517173,15.345382,4.454339,120,"mbank_training" +"project826","default",5,3851,33,431,10,7,100,FALSE,0.626999999999498,0.132048574,127.710431,85.490192,32.522076,19.169466,0,269.264299,52.301932,12.045337,2.735471,120,"mbank_training" +"project561","default",1,3847,34,1169,5,4,2,FALSE,0.766999999999825,0.200921647,104.203848,28.18669,31.8137,34.553072,0,434.189226,104.452797,18.555571,1.495537,120,"mbank_training" +"project561","default",2,3848,34,1169,10,8,2,FALSE,1.57900000000063,0.235814397,184.091378,57.688625,102.624785,76.210602,0,897.399635,209.365029,40.577297,1.872447,120,"mbank_training" +"project561","default",3,3849,34,1169,6,4,2,FALSE,1.03800000000047,0.215172027,112.760827,32.396947,72.031643,37.331976,0,617.902089,130.587572,22.017981,3.256114,120,"mbank_training" +"project561","default",4,3850,34,1169,7,6,2,FALSE,1.09799999999996,0.255119559,141.429342,58.494966,51.773246,51.428364,0,604.051173,149.422167,28.95167,1.973197,120,"mbank_training" +"project561","default",5,3851,34,1169,13,10,2,FALSE,1.8779999999997,0.227611827,254.913113,90.500066,104.908815,86.761706,0,1057.262797,223.664192,49.85961,0,120,"mbank_training" 
+"project571","default",1,3847,42,634,8,3,12,FALSE,0.894999999999527,0.199396634,135.515565,66.212952,42.147855,27.872039,0,445.262381,81.952351,16.597809,1.538257,120,"mbank_training" +"project571","default",2,3848,42,635,7,6,28,FALSE,0.715000000000146,0.099789584,112.405676,51.24409,31.78723,22.885687,0,258.887033,54.158859,11.817098,2.125303,120,"mbank_training" +"project571","default",3,3849,42,634,14,5,12,FALSE,1.57200000000012,0.225831785,228.209175,86.461079,80.530839,48.792823,0,858.462695,170.886235,26.904718,3.584772,120,"mbank_training" +"project571","default",4,3850,42,634,6,3,12,FALSE,0.603000000000065,0.137255932,117.591152,49.656388,31.269116,20.712042,0,260.278734,57.636379,11.03258,2.350998,120,"mbank_training" +"project571","default",5,3851,42,634,7,4,12,FALSE,0.618999999999687,0.103445409,115.114959,37.63192,24.581629,23.032833,0,293.901454,58.725278,11.897841,2.274955,120,"mbank_training" +"project4146_(3)","default",1,3847,59,260,88,2,100,FALSE,34.116,1.413533364,2671.985035,1807.940198,1630.073094,728.712218,0,19646.876166,7051.046702,440.119674,2.154328,120,"mbank_training" +"project4146_(3)","default",2,3848,59,262,16,2,100,FALSE,4.8080000000009,3.259231522,437.061873,332.329626,260.812709,136.426056,0,2658.735654,754.976978,76.161495,3.929552,120,"mbank_training" +"project4146_(3)","default",3,3849,59,263,11,2,100,FALSE,3.04700000000048,1.879511765,306.395457,247.90622,145.666599,77.723276,0,1638.1709,496.971015,51.236215,1.810841,120,"mbank_training" +"project4146_(3)","default",4,3850,59,261,43,1,100,FALSE,15.3180000000002,2.374788278,1320.469164,894.874744,714.758367,346.257565,0,8599.051059,3133.391372,210.801877,10.461404,120,"mbank_training" +"project4146_(3)","default",5,3851,59,261,31,2,100,FALSE,12.2269999999999,0.741778749,1014.974118,655.035662,685.970374,273.578485,0,6815.550667,2422.397126,162.815185,2.274815,120,"mbank_training" 
+"project3688","default",1,3847,60,854,17,2,100,FALSE,5.02500000000055,5.021771877,510.37382,427.643124,259.063646,112.165339,0,2383.360876,1192.373307,62.203885,1.168781,120,"mbank_training" +"project3688","default",2,3848,60,852,56,2,100,FALSE,17.5849999999991,17.581586221,1508.551866,1422.340587,842.489003,302.981617,0,8563.456329,4566.251877,183.361074,8.76499,120,"mbank_training" +"project3688","default",3,3849,60,850,86,2,100,FALSE,23.5720000000001,7.411819695,2115.873042,1577.791567,1209.467271,451.31191,0,11671.087764,6083.894379,257.34136,1.490678,120,"mbank_training" +"project3688","default",4,3850,60,845,100,0,100,FALSE,28.1569999999992,24.162367597,2261.042949,2157.45028,1376.283966,483.92103,0,14079.414317,7325.004318,307.627242,8.029794,120,"mbank_training" +"project3688","default",5,3851,60,851,100,2,100,FALSE,31.8050000000003,31.802672613,2618.332145,2409.378973,1494.269097,570.378077,0,15695.913627,8333.697,340.037484,0,120,"mbank_training" +"project4049","default",1,3847,60,5237,58,2,69,FALSE,80.0329999999994,32.052243607,16809.550981,5256.228341,2110.483353,1030.435606,0,35258.662297,11422.279982,653.04131,4.22619,120,"mbank_training" +"project4049","default",2,3848,60,5241,14,2,100,FALSE,16.0689999999995,7.870476592,4173.924444,1414.553628,544.295207,261.063861,0,6683.463902,1911.022872,169.10444,3.227579,120,"mbank_training" +"project4049","default",3,3849,60,5237,86,0,69,TRUE,120.012,29.14747189,24915.557783,8555.080917,2826.307074,1585.849986,0,52725.984647,16420.480107,960.753632,19.357873,120,"mbank_training" +"project4049","default",4,3850,60,5237,82,1,67,TRUE,113.106000000001,87.538582491,24380.17746,8460.16691,2744.356645,1587.127748,0,53779.48615,16087.417783,947.49759,0,120,"mbank_training" +"project4049","default",5,3851,60,5238,48,2,100,FALSE,61.625,42.237513028,14343.858343,4557.954165,1591.975844,975.424709,0,29679.922169,9072.33144,551.917905,6.537494,120,"mbank_training" 
+"project423","default",1,3847,60,495,8,4,100,FALSE,3.40499999999975,0.526153467,566.898634,190.75694,215.289551,136.966522,0,1626.799883,386.053694,70.778868,9.020881,120,"mbank_training" +"project423","default",2,3848,60,495,10,4,100,FALSE,3.90000000000055,0.411584624,588.627592,245.185435,238.855369,144.338626,0,1998.711399,474.040828,90.219949,16.501401,120,"mbank_training" +"project423","default",3,3849,60,495,9,4,100,FALSE,3.39900000000034,0.46492677,604.06468,177.265199,213.227558,110.902656,0,1578.009632,412.734359,77.942664,17.989694,120,"mbank_training" +"project423","default",4,3850,60,495,9,3,100,FALSE,3.30899999999929,0.925018402,492.497982,155.53686,151.105736,102.833498,0,1678.795371,457.312226,70.475237,10.101998,120,"mbank_training" +"project423","default",5,3851,60,495,7,3,100,FALSE,2.92799999999988,0.693696482,413.570945,112.361907,210.636516,84.170433,0,1356.68924,368.077685,66.544864,10.714993,120,"mbank_training" +"project4286","default",1,3847,63,283,29,1,100,FALSE,11.7159999999994,4.100318091,1046.296071,768.137171,567.789875,282.419437,0,6540.196924,2232.430133,165.583054,11.269518,120,"mbank_training" +"project4286","default",2,3848,63,286,17,1,100,FALSE,6.14800000000014,2.731416157,577.453715,449.078541,267.206664,154.448919,0,3413.660776,1114.497786,88.825288,12.203679,120,"mbank_training" +"project4286","default",3,3849,63,282,100,0,100,FALSE,44.9789999999994,5.96331056,3683.094082,2517.868007,1813.204677,950.494329,0,25889.045977,9131.184162,567.907615,10.793273,120,"mbank_training" +"project4286","default",4,3850,63,282,36,2,100,FALSE,15.5619999999999,7.152473633,1275.286219,836.323187,605.933541,328.329446,0,8875.831635,3180.293214,206.313307,4.975995,120,"mbank_training" +"project4286","default",5,3851,63,281,100,0,100,FALSE,46.5769999999993,35.041412631,3699.824056,2689.559526,2089.605703,946.042036,0,26985.775764,9107.103506,618.261001,11.396668,120,"mbank_training" 
+"project4359","default",1,3847,71,183,44,14,100,FALSE,41.9520000000002,3.203920721,8767.766222,2006.351704,1513.433996,1382.957629,0,23498.199682,3646.497845,799.066168,169.565444,120,"mbank_training" +"project4359","default",2,3848,71,183,30,14,100,FALSE,21.7539999999999,0.915047912,5225.794822,1438.006894,960.455597,825.817399,0,10896.586175,1663.655235,533.446058,88.870922,120,"mbank_training" +"project4359","default",3,3849,71,183,48,14,100,FALSE,40.0810000000001,4.272051181,8738.055745,2383.24364,1527.197327,1300.041735,0,21769.124441,3284.361575,822.207967,151.216495,120,"mbank_training" +"project4359","default",4,3850,71,183,39,14,100,FALSE,30.3309999999992,4.925332616,7050.165321,1939.795687,1343.31804,1102.898121,0,15901.077905,2095.50652,655.182761,120.679789,120,"mbank_training" +"project4359","default",5,3851,71,184,5,4,100,FALSE,3.22699999999986,0.863578676,733.995855,246.411799,130.749929,134.986509,0,1477.365187,265.119975,95.768599,7.753395,120,"mbank_training" +"project4397","default",1,3847,75,1645,100,1,80,FALSE,92.9229999999998,14.423068514,13204.176944,13809.390556,2675.089632,1261.189782,0,43065.316764,14214.117253,874.400732,0,120,"mbank_training" +"project4397","default",2,3848,75,1647,44,2,72,FALSE,40.518,40.516149152,5462.808026,6069.197401,1228.633206,540.398184,0,17364.29475,5740.716472,370.847129,5.18697,120,"mbank_training" +"project4397","default",3,3849,75,1649,22,2,100,FALSE,18.2539999999999,18.252467134,2903.384257,3034.807995,547.422862,272.134946,0,8130.583929,2706.994697,187.14634,2.482476,120,"mbank_training" +"project4397","default",4,3850,75,1646,100,0,100,FALSE,89.067,62.749311293,12699.873477,14195.273694,2819.594899,1371.308273,0,42354.846242,13338.153642,863.920671,23.747802,120,"mbank_training" +"project4397","default",5,3851,75,1646,100,1,43,FALSE,88.2970000000005,8.63749485,12756.82406,14581.059802,2551.350852,1244.568156,0,41346.437513,13615.086309,876.091028,0,120,"mbank_training" 
+"project2084_(1)","default",1,3847,86,28962,3,1,1,TRUE,108.528,39.357658916,23627.712028,24029.377952,5464.093052,1094.520444,0,35104.689634,17833.210932,1040.792512,0,120,"mbank_training" +"project2084_(1)","default",2,3848,86,28206,2,1,2,TRUE,110.605,43.882252328,21477.758523,22676.801355,8385.164939,1728.20278,0,33082.151982,20161.210155,644.290514,0,120,"mbank_training" +"project2084_(1)","default",3,3849,86,28303,3,1,1,TRUE,108.306,42.637740632,30964.641428,23462.698032,5351.663578,1468.335058,0,32030.349028,13873.619593,887.377121,0,120,"mbank_training" +"project2084_(1)","default",4,3850,86,28724,3,0,1,TRUE,108.268,79.500306167,21209.336022,16545.544527,6915.74316,1698.567909,0,40609.952976,18209.011495,999.977684,1818.540659,120,"mbank_training" +"project2084_(1)","default",5,3851,86,29024,4,1,1,TRUE,108.461,78.301270838,25734.337567,17627.057703,7778.349417,1409.668754,0,38256.493533,16373.841466,1007.174092,0,120,"mbank_training" +"project2771","default",1,3847,94,1042,65,1,16,TRUE,109.469999999999,90.147688908,7955.523496,3364.064349,5118.545353,1685.906249,0,59102.753807,29649.557642,1129.086726,4.686277,120,"mbank_training" +"project2771","default",2,3848,94,1049,65,1,10,TRUE,109.496,109.495686787,8276.423813,3896.479947,5070.025056,1554.401001,0,60209.181891,27813.96109,1150.255409,25.301502,120,"mbank_training" +"project2771","default",3,3849,94,1055,65,1,10,TRUE,108.414,108.413357524,7932.479499,4160.669826,6047.767008,1677.958011,0,59065.792055,27961.877695,1152.576054,0,120,"mbank_training" +"project2771","default",4,3850,94,1046,66,0,6,TRUE,108.206,20.66770794,8170.151603,3690.397114,5343.117597,1847.35924,0,60061.381446,27735.459754,1123.077726,38.827043,120,"mbank_training" +"project2771","default",5,3851,94,1059,65,1,1,TRUE,108.043000000001,44.486758246,8124.186883,3750.441696,4893.377873,1690.985489,0,59411.915119,29030.299076,1117.712092,0,120,"mbank_training" 
+"project2184","default",1,3847,114,565,13,1,100,FALSE,16.5100000000002,10.270284347,1901.604448,4187.802544,481.297502,267.875047,0,7162.991061,1958.33887,236.763264,35.048523,120,"mbank_training" +"project2184","default",2,3848,114,564,8,2,100,FALSE,9.17799999999988,4.400413194,1068.136565,2525.129151,342.888574,182.385329,0,3728.285364,981.185821,141.535775,6.532746,120,"mbank_training" +"project2184","default",3,3849,114,565,18,2,100,FALSE,23.433,14.00867502,2492.973307,5244.857721,716.101013,419.309974,0,10828.214684,3285.003272,326.780788,18.981366,120,"mbank_training" +"project2184","default",4,3850,114,563,73,0,100,TRUE,108.129000000001,15.334493816,9759.896311,23116.913797,2974.664553,1610.739116,0,53197.148769,15938.358971,1339.551383,64.034474,120,"mbank_training" +"project2184","default",5,3851,114,564,18,2,100,FALSE,24.5619999999999,1.094052073,2382.013724,5658.68694,772.047049,388.245695,0,11377.309218,3381.536146,324.950291,12.937261,120,"mbank_training" +"project3938","default",1,3847,119,3417,9,2,100,FALSE,44.5619999999999,19.075341196,11413.369643,15029.814616,2202.384628,586.820894,0,10863.89477,3556.837676,388.669586,26.65143,120,"mbank_training" +"project3938","default",2,3848,119,3408,18,1,100,TRUE,108.633,20.195418378,26861.257588,33098.946527,4015.335393,1157.836335,0,29753.222391,12283.646294,828.075363,0,120,"mbank_training" +"project3938","default",3,3849,119,3413,9,2,100,FALSE,46.6349999999993,25.918489915,12043.938351,14090.206009,1790.431927,539.632353,0,12164.741083,4564.794154,398.036473,187.663721,120,"mbank_training" +"project3938","default",4,3850,119,3408,18,0,100,TRUE,108.391,83.647681499,24710.571406,34061.871319,4085.051645,1121.98776,0,30625.896483,12543.803153,802.23311,62.598762,120,"mbank_training" +"project3938","default",5,3851,119,3405,18,1,100,TRUE,108.735000000001,108.725656538,27374.72291,33415.734207,2665.163714,1469.101328,0,31037.667179,11162.01504,797.550904,62.214628,120,"mbank_training" 
+"syab07201","default",1,3847,125,14933,12,1,3,TRUE,108.728,12.592821236,13570.480664,8180.130151,3577.144371,1630.401141,0,51213.95676,28665.633721,1232.812355,0,120,"mbank_training" +"syab07201","default",2,3848,125,14931,12,1,1,TRUE,108.170999999999,89.087282798,13846.124014,8503.522116,3033.045252,1743.023891,0,56010.682657,23629.967986,1254.266463,0,120,"mbank_training" +"syab07201","default",3,3849,125,14932,12,1,4,TRUE,109.077,82.250482396,14298.112856,7593.533046,3447.583924,1667.188132,0,53409.384596,26372.219385,1269.450238,0,120,"mbank_training" +"syab07201","default",4,3850,125,14948,13,1,3,TRUE,108.708,58.209723111,14448.190969,8478.906994,3301.745774,1760.348587,0,52382.540065,26330.066606,1350.91075,0,120,"mbank_training" +"syab07201","default",5,3851,125,14926,9,1,2,TRUE,108.34,38.305426931,13591.55147,10687.585468,2838.502225,2031.87457,0,52607.030956,24885.943975,1424.417762,0,120,"mbank_training" +"project4133","default",1,3847,131,2371,28,1,100,TRUE,109.487999999999,109.485362629,18339.628663,29653.665094,3505.219405,1074.854075,0,37083.883069,17530.824584,823.678609,0,120,"mbank_training" +"project4133","default",2,3848,131,2379,28,1,100,TRUE,109.438,12.828640628,18298.553113,32054.720869,3214.405102,1234.216956,0,35374.579163,17020.561662,822.438309,0,120,"mbank_training" +"project4133","default",3,3849,131,2378,28,1,100,TRUE,109.021000000001,16.15741946,19005.412995,29148.022626,4032.380029,1146.585333,0,37006.380772,16787.657354,879.708429,0,120,"mbank_training" +"project4133","default",4,3850,131,2372,25,1,100,TRUE,109.190000000001,109.18586107,18321.807637,32304.089003,3145.022262,1071.913018,0,36400.932524,15981.659703,761.298839,0,120,"mbank_training" +"project4133","default",5,3851,131,2376,27,1,100,TRUE,108.601000000001,108.597659452,19342.257079,29311.053978,3138.549354,1310.424968,0,37076.326113,17006.773101,825.497051,0,120,"mbank_training" 
+"project804","default",1,3847,173,1361,5,1,100,TRUE,119.700000000001,119.687429526,18892.753258,22370.05734,6848.462855,1344.700911,0,38968.418347,18451.002179,1120.532486,0,120,"mbank_training" +"project804","default",2,3848,173,1361,6,1,77,TRUE,120.030999999999,120.019861675,19661.854592,29304.260402,6104.284431,1238.988987,0,35207.71569,15866.336486,1093.321497,0,120,"mbank_training" +"project804","default",3,3849,173,1374,6,1,37,TRUE,120.075000000001,120.066968555,20123.993779,21883.447515,4358.387938,1306.7305,0,35922.962296,22978.080261,1861.523875,0,120,"mbank_training" +"project804","default",4,3850,173,1363,5,1,100,TRUE,115.026,115.013226373,19002.039202,30904.950987,10687.222416,1399.585322,0,33172.285515,12033.102396,860.626358,0,120,"mbank_training" +"project804","default",5,3851,173,1363,7,1,100,TRUE,115.235999999999,115.223057538,22282.004866,20105.230448,6611.483638,1417.110018,0,39375.434188,17047.971526,1247.914574,0,120,"mbank_training" +"project4284","default",1,3847,4062,1072,0,1,100,TRUE,349.493,349.46794991,103451.107481,13308.954479,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",2,3848,4062,1322,0,1,100,TRUE,462.067999999999,461.989119379,101258.802999,13033.694143,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",3,3849,4062,1193,0,1,1,TRUE,120.931999999999,120.925018555,108451.106187,12471.801843,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",4,3850,4062,1040,0,1,100,TRUE,333.196,333.186980399,95453.13475,13018.854749,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",5,3851,4062,1220,0,1,100,TRUE,279.598,279.591312384,98085.089315,17653.911914,0,0,0,0,0,0,0,120,"mbank_training" diff --git a/dev/benchmarks/t252_mbank_30s_20260327_1044.csv b/dev/benchmarks/t252_mbank_30s_20260327_1044.csv new file mode 100644 index 000000000..b05455aac --- /dev/null +++ b/dev/benchmarks/t252_mbank_30s_20260327_1044.csv @@ -0,0 +1,126 @@ 
+"dataset","strategy","replicate","seed","n_taxa","best_score","replicates","hits_to_best","pool_size","timed_out","wall_s","time_to_best_s","wagner_ms","tbr_ms","xss_ms","rss_ms","css_ms","ratchet_ms","drift_ms","final_tbr_ms","fuse_ms","budget_s","source" +"project532","default",1,3847,21,1139,7,5,4,FALSE,0.542,0.159055272,135.619212,13.04573,22.272809,15.919137,0,276.407229,56.168931,11.423204,2.049818,30,"mbank_training" +"project532","default",2,3848,21,1139,6,4,3,FALSE,0.454999999999999,0.094952955,92.239999,25.374239,16.837307,14.426449,0,245.246002,42.116067,9.629668,1.382472,30,"mbank_training" +"project532","default",3,3849,21,1139,6,5,4,FALSE,0.475000000000001,0.115830358,107.415156,18.13517,22.390323,12.792332,0,248.020245,43.648871,9.998143,1.511787,30,"mbank_training" +"project532","default",4,3850,21,1139,7,3,3,FALSE,0.583,0.158395851,110.868678,33.604432,20.556298,16.136819,0,327.756102,53.823888,12.069131,0.705788,30,"mbank_training" +"project532","default",5,3851,21,1139,6,3,2,FALSE,0.443000000000001,0.095074113,94.240283,22.942882,19.845399,14.211134,0,237.886468,36.744354,9.808866,1.523488,30,"mbank_training" +"project2346","default",1,3847,23,316,18,2,4,FALSE,0.706,0.483245432,59.526585,29.639617,27.097784,15.987328,0,445.189137,111.983257,10.955062,0.615469,30,"mbank_training" +"project2346","default",2,3848,23,318,8,2,20,FALSE,0.272,0.107900119,30.632818,8.388472,14.761471,6.777368,0,142.381512,32.831868,4.816267,0.208482,30,"mbank_training" +"project2346","default",3,3849,23,320,12,2,4,FALSE,0.401999999999999,0.107059556,42.565554,15.985884,22.632968,10.643807,0,238.422067,59.348087,7.143847,0.571676,30,"mbank_training" +"project2346","default",4,3850,23,317,17,2,13,FALSE,0.695,0.302030974,61.210236,24.433465,24.273697,15.448662,0,427.301713,107.544701,10.352768,0.259899,30,"mbank_training" 
+"project2346","default",5,3851,23,314,78,1,4,FALSE,3.36,1.270754501,279.259769,120.36549,125.623277,66.597026,0,2160.387218,553.280991,47.067397,1.0782,30,"mbank_training" +"project2451","default",1,3847,24,735,12,2,4,FALSE,0.391,0.201359523,57.55009,12.458124,17.440524,10.013999,0,208.719441,73.761881,6.301613,0.352573,30,"mbank_training" +"project2451","default",2,3848,24,732,67,1,1,FALSE,2.958,0.754287847,321.511476,75.820549,109.558619,48.42972,0,1739.621241,624.549235,35.606072,0,30,"mbank_training" +"project2451","default",3,3849,24,731,7,2,6,FALSE,0.221999999999998,0.083935985,38.627521,8.011942,11.150373,4.288323,0,111.132705,37.786174,3.844417,0.195688,30,"mbank_training" +"project2451","default",4,3850,24,731,74,1,3,FALSE,3.226,1.072942617,382.702166,78.858534,117.675578,52.337654,0,1893.503165,653.858183,39.018316,1.668751,30,"mbank_training" +"project2451","default",5,3851,24,730,56,1,2,FALSE,2.475,0.259627507,272.084943,78.715156,92.241695,42.774133,0,1441.742098,513.345106,30.965669,0,30,"mbank_training" +"project4501","default",1,3847,24,118,11,4,63,FALSE,0.402000000000001,0.045659888,19.456467,10.321312,11.249257,7.273704,0,127.024403,20.262034,4.693674,0.592876,30,"mbank_training" +"project4501","default",2,3848,24,118,8,6,93,FALSE,0.331000000000003,0.019307548,14.526607,7.223871,10.667763,6.112406,0,63.857366,13.417429,3.468459,0.840874,30,"mbank_training" +"project4501","default",3,3849,24,118,11,8,63,FALSE,0.353999999999999,0.024017324,16.544606,12.054514,10.874954,6.541294,0,94.229752,16.766104,5.012416,1.208605,30,"mbank_training" +"project4501","default",4,3850,24,118,11,6,63,FALSE,0.399999999999999,0.049086008,17.238983,13.324074,11.375175,6.189715,0,122.860359,19.102503,4.840804,0.788614,30,"mbank_training" +"project4501","default",5,3851,24,118,9,4,93,FALSE,0.378,0.018508023,15.81334,12.376079,7.116948,5.60964,0,102.823873,17.718845,3.840052,0.661435,30,"mbank_training" 
+"project944","default",1,3847,25,128,7,7,60,FALSE,0.306999999999999,0.018572714,13.799929,8.682096,6.631403,4.331374,0,50.309772,9.260855,2.764333,0.884165,30,"mbank_training" +"project944","default",2,3848,25,128,6,6,60,FALSE,0.233000000000001,0.019885466,12.479662,4.603999,4.628432,3.361849,0,44.265201,7.877521,2.368148,0.873353,30,"mbank_training" +"project944","default",3,3849,25,128,6,6,60,FALSE,0.314,0.020729454,10.93727,6.439242,5.535302,3.476875,0,43.362833,9.229105,2.367437,0.930562,30,"mbank_training" +"project944","default",4,3850,25,128,8,8,60,FALSE,0.356999999999999,0.018798128,16.221829,8.35613,6.25217,5.100312,0,59.507827,13.044878,3.160491,0.944067,30,"mbank_training" +"project944","default",5,3851,25,128,9,9,60,FALSE,0.276999999999997,0.018958742,17.363908,11.685671,7.165347,5.454358,0,65.433255,11.16655,3.531729,1.479684,30,"mbank_training" +"project971_(1)","default",1,3847,26,157,7,5,100,FALSE,0.193000000000001,0.032616012,18.605586,8.255522,13.045921,6.466022,0,108.291193,17.142443,5.032654,0.93516,30,"mbank_training" +"project971_(1)","default",2,3848,26,157,7,3,100,FALSE,0.209,0.033034109,15.300445,7.423335,8.066606,6.895672,0,120.329215,17.930359,5.086765,0.809634,30,"mbank_training" +"project971_(1)","default",3,3849,26,157,14,9,100,FALSE,0.411000000000001,0.073843174,32.755924,20.344981,29.768159,14.029603,0,232.580428,32.34325,10.062332,1.52913,30,"mbank_training" +"project971_(1)","default",4,3850,26,157,9,5,100,FALSE,0.266000000000002,0.034971856,20.398181,11.072927,12.306986,9.028718,0,166.869904,26.803601,6.538127,0.79671,30,"mbank_training" +"project971_(1)","default",5,3851,26,157,8,5,100,FALSE,0.220000000000002,0.030588355,18.153386,8.61647,15.762383,8.048763,0,118.243181,20.804155,5.864089,1.237399,30,"mbank_training" +"project2762","default",1,3847,29,259,9,5,100,FALSE,0.477,0.064657482,89.005679,28.625208,27.015199,16.271522,0,222.30784,53.258133,10.549338,2.931178,30,"mbank_training" 
+"project2762","default",2,3848,29,259,9,5,100,FALSE,0.500999999999998,0.107338091,97.623433,21.280192,19.892007,19.848566,0,237.460345,52.994316,10.82002,2.92123,30,"mbank_training" +"project2762","default",3,3849,29,259,14,7,100,FALSE,0.824000000000002,0.248402003,167.013281,40.401138,31.5735,30.962567,0,404.788011,76.685877,16.82874,3.46784,30,"mbank_training" +"project2762","default",4,3850,29,259,7,2,100,FALSE,0.442,0.060617737,82.572548,17.849112,22.881476,15.476914,0,210.443998,44.501366,8.239273,0.595401,30,"mbank_training" +"project2762","default",5,3851,29,259,9,2,100,FALSE,0.549999999999997,0.156776252,96.810243,25.176484,24.499914,18.026146,0,289.442735,65.42589,10.713507,1.121812,30,"mbank_training" +"project826","default",1,3847,33,431,10,5,100,FALSE,0.585000000000001,0.058192461,97.322235,84.950343,36.014361,16.87607,0,243.540672,48.527945,10.402926,2.289268,30,"mbank_training" +"project826","default",2,3848,33,431,14,8,100,FALSE,0.815000000000001,0.064265745,164.329903,100.342802,40.22171,29.986892,0,362.511776,60.62095,14.329798,2.752712,30,"mbank_training" +"project826","default",3,3849,33,431,9,5,100,FALSE,0.506,0.112018993,97.964377,66.801407,33.044869,13.567502,0,224.709019,34.377508,9.366282,2.14059,30,"mbank_training" +"project826","default",4,3850,33,431,13,7,100,FALSE,0.698,0.062873383,154.074276,81.270968,45.052406,19.77621,0,295.134169,54.289724,13.291651,3.70243,30,"mbank_training" +"project826","default",5,3851,33,431,10,7,100,FALSE,0.533000000000001,0.1020999,107.928141,72.157363,27.216947,15.95665,0,228.789413,45.677109,10.315738,2.329386,30,"mbank_training" +"project561","default",1,3847,34,1169,5,4,2,FALSE,0.659999999999997,0.181347639,92.846159,25.059065,28.207322,29.981203,0,370.359995,88.552929,15.979782,1.446704,30,"mbank_training" +"project561","default",2,3848,34,1169,10,8,2,FALSE,1.232,0.18098681,142.576079,44.374205,78.560658,59.124055,0,702.572509,163.908009,31.657059,1.392722,30,"mbank_training" 
+"project561","default",3,3849,34,1169,6,4,2,FALSE,0.892000000000003,0.193806133,98.658642,28.550289,63.174951,32.545117,0,527.190542,111.328013,20.332498,2.808147,30,"mbank_training" +"project561","default",4,3850,34,1169,7,6,2,FALSE,0.850000000000001,0.196609952,112.865658,45.375572,39.933657,39.492137,0,466.18721,115.350405,22.395089,1.381331,30,"mbank_training" +"project561","default",5,3851,34,1169,13,10,2,FALSE,1.577,0.176815928,214.233348,75.676926,87.389599,73.393768,0,887.255078,189.841499,42.006711,0,30,"mbank_training" +"project571","default",1,3847,42,634,8,3,12,FALSE,0.694000000000003,0.164095501,106.218011,51.333049,32.312008,21.394067,0,342.296416,63.619378,12.651639,1.096264,30,"mbank_training" +"project571","default",2,3848,42,635,7,6,28,FALSE,0.639000000000003,0.088679535,100.785177,45.952629,28.371058,20.286232,0,230.646768,48.366994,10.52306,1.873838,30,"mbank_training" +"project571","default",3,3849,42,634,14,5,12,FALSE,1.271,0.205375594,186.209453,70.512624,64.876695,39.745794,0,696.746744,136.711756,21.663235,2.899548,30,"mbank_training" +"project571","default",4,3850,42,634,6,3,12,FALSE,0.503999999999998,0.106289547,97.153147,40.469948,25.720071,17.059675,0,215.716522,48.979746,9.500026,2.09783,30,"mbank_training" +"project571","default",5,3851,42,634,7,4,12,FALSE,0.562000000000005,0.094291991,104.624002,34.374362,22.712617,21.010924,0,265.769841,52.724237,10.747413,2.061411,30,"mbank_training" +"project4146_(3)","default",1,3847,59,260,79,1,100,TRUE,27.184,1.159421069,2161.951899,1431.436231,1369.737579,584.938795,0,15612.088541,5495.045514,342.372656,0,30,"mbank_training" +"project4146_(3)","default",2,3848,59,262,16,2,100,FALSE,4.43300000000001,3.006056794,398.978693,305.709476,242.038821,126.156641,0,2449.890491,699.98229,70.565506,3.637498,30,"mbank_training" +"project4146_(3)","default",3,3849,59,263,11,2,100,FALSE,2.911,1.75921419,292.432282,236.70639,138.23963,74.826225,0,1569.189041,477.95843,49.093865,1.822251,30,"mbank_training" 
+"project4146_(3)","default",4,3850,59,261,43,1,100,FALSE,13.781,2.134428645,1195.778958,805.816844,652.279548,310.548343,0,7750.977861,2798.563687,188.875727,9.304356,30,"mbank_training" +"project4146_(3)","default",5,3851,59,261,31,2,100,FALSE,10.048,0.622684871,834.240637,543.209237,558.784365,222.447677,0,5626.288589,1986.738636,134.143195,1.677448,30,"mbank_training" +"project3688","default",1,3847,60,854,17,2,100,FALSE,4.04899999999999,4.046715439,408.411651,338.435934,212.952407,88.730993,0,1919.553444,962.191821,49.845781,0.793184,30,"mbank_training" +"project3688","default",2,3848,60,852,56,2,100,FALSE,15.176,15.173541808,1302.515289,1226.872642,710.476017,259.35943,0,7396.027813,3937.753164,158.084343,7.912235,30,"mbank_training" +"project3688","default",3,3849,60,850,86,2,100,FALSE,22.625,7.183552468,2072.997198,1511.265434,1160.574252,431.875451,0,11185.916283,5821.436102,245.879774,1.441333,30,"mbank_training" +"project3688","default",4,3850,60,845,100,0,100,FALSE,26.996,23.186208158,2348.914977,2049.502682,1311.093182,459.592175,0,13408.741898,6965.596352,294.003592,7.587966,30,"mbank_training" +"project3688","default",5,3851,60,846,99,2,11,TRUE,27.29,27.28811545,2406.499645,2060.393789,1264.02288,485.95709,0,13418.549013,7075.465949,287.115493,0,30,"mbank_training" +"project4049","default",1,3847,60,5240,26,1,100,TRUE,27.317,5.573148428,6703.751431,2176.123735,825.960021,396.113765,0,12768.91305,3880.674858,253.963364,0,30,"mbank_training" +"project4049","default",2,3848,60,5241,14,2,100,FALSE,13.655,7.027287931,3626.086034,1198.572419,456.608051,220.963331,0,5704.360758,1589.957546,142.553627,3.045504,30,"mbank_training" +"project4049","default",3,3849,60,5237,25,0,69,TRUE,30.007,24.853505039,6743.164871,2194.709548,693.107501,428.940065,0,12666.958798,4006.547911,248.048225,20.524267,30,"mbank_training" 
+"project4049","default",4,3850,60,5238,25,1,42,TRUE,30.004,0.932033283,6776.515792,2319.610345,684.456542,434.411469,0,12776.817935,3766.735394,249.499943,0,30,"mbank_training" +"project4049","default",5,3851,60,5239,25,1,100,TRUE,27.486,3.676459796,6617.98495,2189.03596,703.300419,449.992373,0,12985.569232,3805.079674,248.978194,0,30,"mbank_training" +"project423","default",1,3847,60,495,8,4,100,FALSE,2.75400000000002,0.419124173,456.884432,154.343943,179.912009,110.096176,0,1290.272654,315.113496,59.03124,8.115354,30,"mbank_training" +"project423","default",2,3848,60,495,10,4,100,FALSE,3.13900000000001,0.370053092,496.594593,197.924011,185.722131,115.789679,0,1590.611775,385.967567,74.139425,14.305423,30,"mbank_training" +"project423","default",3,3849,60,495,9,4,100,FALSE,2.88499999999999,0.385641831,500.870846,145.446436,177.377975,93.716956,0,1323.701525,355.745644,66.297395,15.357013,30,"mbank_training" +"project423","default",4,3850,60,495,9,3,100,FALSE,3.17099999999999,0.913398284,477.038761,151.875211,145.583234,99.231705,0,1607.287212,441.466933,67.048899,9.279487,30,"mbank_training" +"project423","default",5,3851,60,495,7,3,100,FALSE,2.30000000000001,0.587509882,327.409848,89.159324,165.845635,66.60812,0,1078.061213,286.696336,50.646708,8.055533,30,"mbank_training" +"project4286","default",1,3847,63,283,29,1,100,FALSE,10.679,3.568361219,949.758854,695.981939,507.727922,255.192069,0,5968.563535,2042.300499,151.240761,10.412181,30,"mbank_training" +"project4286","default",2,3848,63,286,17,1,100,FALSE,5.93299999999999,2.493855935,547.902595,427.93154,253.562543,148.533339,0,3300.116545,1089.177478,84.825461,12.413171,30,"mbank_training" +"project4286","default",3,3849,63,282,69,0,100,TRUE,27.162,5.203139408,2360.989445,1512.558401,1088.458169,571.79875,0,15666.679256,5442.170076,346.111242,8.470814,30,"mbank_training" 
+"project4286","default",4,3850,63,282,36,2,100,FALSE,13.267,6.209710639,1092.563694,721.114448,524.582188,278.114164,0,7555.478801,2692.809987,176.91012,4.096113,30,"mbank_training" +"project4286","default",5,3851,63,283,70,1,100,TRUE,27.169,1.557344311,2287.207347,1582.845371,1211.075903,547.191397,0,15688.016025,5332.93518,347.612348,0,30,"mbank_training" +"project4359","default",1,3847,71,183,41,13,100,TRUE,27.097,2.261897091,5771.353119,1305.011338,1006.712715,912.447879,0,15026.53261,2340.667863,522.209948,117.847876,30,"mbank_training" +"project4359","default",2,3848,71,183,30,14,100,FALSE,16.129,0.665570597,3928.068751,1053.953565,706.859134,611.186316,0,8062.325089,1227.655533,389.882129,64.610764,30,"mbank_training" +"project4359","default",3,3849,71,183,43,13,100,TRUE,27.09,3.114505389,6132.139298,1549.203645,1017.246691,869.207429,0,14564.715459,2203.509034,547.569478,96.465078,30,"mbank_training" +"project4359","default",4,3850,71,183,39,14,100,FALSE,23.331,3.776490508,5602.272217,1470.537157,1043.605592,841.264513,0,12109.540516,1584.418474,494.61431,93.977932,30,"mbank_training" +"project4359","default",5,3851,71,184,5,4,100,FALSE,2.51499999999999,0.625239003,581.307694,181.271259,99.713473,106.22802,0,1151.351682,219.65755,60.431613,5.864753,30,"mbank_training" +"project4397","default",1,3847,75,1645,39,1,84,TRUE,29.898,11.652682336,4059.293835,4611.243157,859.16792,378.941198,0,12614.467637,4211.561709,269.027083,0,30,"mbank_training" +"project4397","default",2,3848,75,1648,40,1,11,TRUE,27.222,16.275268683,4083.297886,4499.993494,922.733936,399.403983,0,12638.777186,4181.619232,271.886856,1.900378,30,"mbank_training" +"project4397","default",3,3849,75,1649,22,2,100,FALSE,14.671,14.669488113,2326.940955,2439.062281,438.000839,215.523963,0,6522.444608,2178.359279,147.225135,1.934383,30,"mbank_training" 
+"project4397","default",4,3850,75,1648,40,1,100,TRUE,27.3430000000001,10.737418532,4263.486964,4441.437048,998.363541,456.623285,0,12534.551377,4024.735858,269.57219,10.386157,30,"mbank_training" +"project4397","default",5,3851,75,1646,39,1,42,TRUE,28.025,6.965588083,4076.471754,4722.078731,843.218866,405.213474,0,12542.741242,4122.793808,285.67242,0,30,"mbank_training" +"project2084_(1)","default",1,3847,86,28962,0,1,1,TRUE,27.2950000000001,27.284871037,6087.152752,7601.569896,1703.323901,292.649511,0,10154.252001,1190.248423,0,0,30,"mbank_training" +"project2084_(1)","default",2,3848,86,28206,0,1,2,TRUE,27.893,27.88159824,6449.215962,9863.875036,2809.440548,508.882523,0,7554.658108,0,0,0,30,"mbank_training" +"project2084_(1)","default",3,3849,86,28303,0,1,1,TRUE,27.3520000000001,27.341604691,5532.892173,4659.700576,2729.695298,369.901689,0,10917.922654,2879.327597,0,0,30,"mbank_training" +"project2084_(1)","default",4,3850,86,29263,0,1,1,TRUE,27.337,27.32739407,5136.496018,4624.353351,2476.729143,305.33289,0,12907.034261,1626.271363,0,0,30,"mbank_training" +"project2084_(1)","default",5,3851,86,29028,0,1,1,TRUE,27.277,27.265355976,6452.248698,4808.151115,2003.218686,324.815519,0,10753.100827,2544.773109,129.969162,0,30,"mbank_training" +"project2771","default",1,3847,94,1061,18,1,2,TRUE,27.628,27.627539078,2381.315256,1179.557903,1473.537564,476.722015,0,14814.327106,6359.277975,317.095031,4.795324,30,"mbank_training" +"project2771","default",2,3848,94,1051,18,1,2,TRUE,27.056,4.598189444,2329.759595,1058.876302,1523.15815,437.623557,0,14578.550452,6707.850826,347.023291,25.508291,30,"mbank_training" +"project2771","default",3,3849,94,1055,18,1,25,TRUE,28.596,28.595855406,2208.668083,1237.955741,1885.302937,461.560814,0,14342.634134,6721.423525,318.360851,0,30,"mbank_training" +"project2771","default",4,3850,94,1046,18,0,6,TRUE,27.212,20.807814846,2244.376253,996.635736,1387.986264,635.028035,0,14958.245088,6429.43522,307.637888,39.061989,30,"mbank_training" 
+"project2771","default",5,3851,94,1061,18,1,83,TRUE,30.005,30.00408473,2175.970571,1277.59992,1288.181195,517.295524,0,14521.506037,6925.085047,312.43618,0,30,"mbank_training" +"project2184","default",1,3847,114,565,13,1,100,FALSE,16.5749999999999,10.308158943,1901.553612,4214.328218,485.080766,269.267477,0,7183.460842,1969.620003,238.019628,35.13861,30,"mbank_training" +"project2184","default",2,3848,114,564,8,2,100,FALSE,9.25199999999995,4.431088219,1068.876064,2542.898349,345.815564,183.719232,0,3766.096167,990.361813,142.478229,6.554553,30,"mbank_training" +"project2184","default",3,3849,114,565,18,2,100,FALSE,23.623,14.129842318,2496.19712,5294.070284,723.003063,423.243246,0,10922.706838,3311.749457,329.429608,19.212075,30,"mbank_training" +"project2184","default",4,3850,114,563,19,0,100,TRUE,27.1590000000001,15.437353718,2783.61853,6176.676207,832.030467,454.22667,0,12537.971451,3609.087835,353.928239,64.443171,30,"mbank_training" +"project2184","default",5,3851,114,564,18,2,100,FALSE,24.5440000000001,1.10036677,2383.790706,5703.843109,776.865951,391.596843,0,11368.827702,3408.154639,327.715989,13.345019,30,"mbank_training" +"project3938","default",1,3847,119,3416,5,1,100,TRUE,28.0219999999999,28.003850298,7657.059517,10286.657825,1092.717385,263.907318,0,5528.088936,1944.685173,217.01893,0,30,"mbank_training" +"project3938","default",2,3848,119,3408,5,1,100,TRUE,27.7280000000001,20.346678436,8009.709849,9193.549534,847.319192,396.677981,0,5963.455621,2388.873139,226.921023,0,30,"mbank_training" +"project3938","default",3,3849,119,3413,5,1,100,TRUE,27.654,25.92062548,7597.568873,8397.953192,1192.278244,267.524903,0,6370.474937,2821.517517,219.073277,160.334703,30,"mbank_training" +"project3938","default",4,3850,119,3418,5,1,100,TRUE,27.4750000000001,9.50999064,7772.234407,9643.990176,1475.966887,248.789132,0,5542.434776,2106.959977,212.515308,0,30,"mbank_training" 
+"project3938","default",5,3851,119,3422,5,1,100,TRUE,27.732,5.309066635,8970.183635,8855.738138,862.2458,303.566881,0,5511.693151,2297.473368,216.724037,0,30,"mbank_training" +"syab07201","default",1,3847,125,14933,3,1,3,TRUE,27.6960000000001,12.634024402,4209.831881,3971.282564,792.06249,485.70106,0,11643.179767,5614.675847,313.862167,0,30,"mbank_training" +"syab07201","default",2,3848,125,15033,4,1,1,TRUE,27.1510000000001,6.377969028,5490.40643,1351.931126,1062.927057,575.611519,0,11885.543952,6220.76002,417.368365,0,30,"mbank_training" +"syab07201","default",3,3849,125,14953,4,1,1,TRUE,27.2079999999999,19.582339775,4913.616435,2616.01053,1404.911755,502.94652,0,11247.894925,5943.98444,427.252739,0,30,"mbank_training" +"syab07201","default",4,3850,125,15017,4,1,1,TRUE,27.154,7.581652703,5078.897263,4206.671582,732.611115,532.396846,0,12075.781743,3956.822031,428.170321,0,30,"mbank_training" +"syab07201","default",5,3851,125,14926,4,1,2,TRUE,27.3429999999998,26.157566299,4773.575904,2310.009118,810.267198,599.034403,0,12631.506286,5520.778049,411.987076,0,30,"mbank_training" +"project4133","default",1,3847,131,2386,7,1,100,TRUE,27.915,15.584865867,5481.31431,8081.440573,1366.990872,340.315942,0,8473.379962,3062.261999,200.053899,0,30,"mbank_training" +"project4133","default",2,3848,131,2375,7,1,100,TRUE,29.7460000000001,29.74417613,5614.65215,9794.68452,805.954015,307.043522,0,7098.080922,3172.412796,211.646549,0,30,"mbank_training" +"project4133","default",3,3849,131,2377,8,1,100,TRUE,29.912,29.909055691,5418.41755,8613.7662,1026.893078,312.755544,0,8189.025034,3218.032499,236.792295,0,30,"mbank_training" +"project4133","default",4,3850,131,2374,7,1,100,TRUE,28.2819999999999,28.278871211,5472.009347,9335.30634,1118.641229,294.130262,0,7872.79282,2702.84137,207.013293,0,30,"mbank_training" 
+"project4133","default",5,3851,131,2385,8,1,100,TRUE,27.2849999999999,15.627101633,5979.695623,8990.18192,1047.259017,274.950811,0,7223.982727,3261.511037,237.368799,0,30,"mbank_training" +"project804","default",1,3847,173,1375,1,1,3,TRUE,27.7629999999999,17.248265978,5083.03061,7686.516173,873.982178,424.487131,0,10184.06465,2628.645841,184.614359,0,30,"mbank_training" +"project804","default",2,3848,173,1370,1,1,3,TRUE,30.0450000000001,30.041864633,4723.645469,9349.265054,812.927877,420.088117,0,8947.386937,2593.918773,183.881763,0,30,"mbank_training" +"project804","default",3,3849,173,1373,1,1,12,TRUE,30.056,30.051604437,6251.779509,6250.564102,1728.904108,435.905993,0,9042.788334,3203.429055,272.356204,0,30,"mbank_training" +"project804","default",4,3850,173,1387,1,1,100,TRUE,30.0989999999999,18.068643362,7075.131855,6976.566617,2091.847773,385.917478,0,7139.858341,3216.192954,174.730846,0,30,"mbank_training" +"project804","default",5,3851,173,1372,1,1,99,TRUE,30.1019999999999,15.872184522,5765.983551,5838.745921,1365.292695,412.16547,0,9596.695161,3903.152447,177.915998,0,30,"mbank_training" +"project4284","default",1,3847,4062,1268,0,1,1,TRUE,42.9269999999999,42.89428889,27450.181719,15441.009559,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",2,3848,4062,1411,0,1,1,TRUE,40.9389999999999,40.934490269,27438.224699,13493.849822,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",3,3849,4062,1193,0,1,1,TRUE,39.7939999999999,39.789130405,27403.434015,12382.574941,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",4,3850,4062,1107,0,1,1,TRUE,40.596,40.592123674,27251.044937,13338.608947,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",5,3851,4062,1360,0,1,1,TRUE,42.2569999999998,42.252403973,27459.165573,14790.786965,0,0,0,0,0,0,0,30,"mbank_training" diff --git a/dev/benchmarks/t252_mbank_60s_20260327_1135.csv b/dev/benchmarks/t252_mbank_60s_20260327_1135.csv new file mode 100644 index 000000000..da622ea80 --- /dev/null +++ 
b/dev/benchmarks/t252_mbank_60s_20260327_1135.csv @@ -0,0 +1,126 @@ +"dataset","strategy","replicate","seed","n_taxa","best_score","replicates","hits_to_best","pool_size","timed_out","wall_s","time_to_best_s","wagner_ms","tbr_ms","xss_ms","rss_ms","css_ms","ratchet_ms","drift_ms","final_tbr_ms","fuse_ms","budget_s","source" +"project532","default",1,3847,21,1139,7,5,4,FALSE,0.531999999999925,0.155157045,132.94095,12.896659,22.605166,15.817748,0,270.798128,54.609435,11.237556,2.086117,60,"mbank_training" +"project532","default",2,3848,21,1139,6,4,3,FALSE,0.446999999999889,0.093571746,91.262179,25.037604,16.547562,14.29948,0,240.200325,41.112086,9.448097,1.32286,60,"mbank_training" +"project532","default",3,3849,21,1139,6,5,4,FALSE,0.451999999999998,0.107800963,103.294437,17.516446,21.010614,12.231377,0,234.851555,42.029894,9.646431,1.373085,60,"mbank_training" +"project532","default",4,3850,21,1139,7,3,3,FALSE,0.560000000000173,0.151373582,105.851452,32.081296,19.716629,15.418977,0,315.724865,51.357515,11.456677,0.675031,60,"mbank_training" +"project532","default",5,3851,21,1139,6,3,2,FALSE,0.424999999999955,0.09220405,90.138613,21.852509,18.915639,13.605615,0,229.299615,35.483317,9.40638,1.447665,60,"mbank_training" +"project2346","default",1,3847,23,316,18,2,4,FALSE,0.674999999999955,0.462164759,56.807774,28.543874,26.025564,15.40238,0,425.377913,107.322736,10.523358,0.588648,60,"mbank_training" +"project2346","default",2,3848,23,318,8,2,20,FALSE,0.261999999999944,0.104624604,29.655097,8.09496,14.298628,6.572934,0,136.799606,31.645155,4.640075,0.204004,60,"mbank_training" +"project2346","default",3,3849,23,320,12,2,4,FALSE,0.388999999999896,0.104599907,41.047939,15.545544,21.653463,10.344817,0,231.190393,57.622497,6.943413,0.455207,60,"mbank_training" +"project2346","default",4,3850,23,317,17,2,13,FALSE,0.672000000000025,0.2921479,59.06303,23.600541,23.237525,14.67714,0,413.671557,104.475003,10.015255,0.229012,60,"mbank_training" 
+"project2346","default",5,3851,23,314,78,1,4,FALSE,3.25399999999991,1.237738072,270.848388,116.721824,120.649286,64.301568,0,2091.267241,537.485683,45.54476,1.042463,60,"mbank_training" +"project2451","default",1,3847,24,735,12,2,4,FALSE,0.380000000000109,0.195803825,58.437761,11.924619,16.795548,9.706935,0,201.186919,70.898262,6.095423,0.310965,60,"mbank_training" +"project2451","default",2,3848,24,732,67,1,1,FALSE,2.86299999999983,0.739887091,309.339849,73.561174,106.35613,47.104094,0,1682.872919,606.382824,34.560083,0,60,"mbank_training" +"project2451","default",3,3849,24,731,7,2,6,FALSE,0.213999999999942,0.08147899,36.285649,7.782122,10.81444,4.176962,0,107.266687,36.934762,3.742436,0.176081,60,"mbank_training" +"project2451","default",4,3850,24,731,74,1,3,FALSE,3.10799999999995,1.03302151,354.13203,76.234593,112.742415,50.471643,0,1835.061126,633.554558,37.608168,1.54608,60,"mbank_training" +"project2451","default",5,3851,24,730,56,1,2,FALSE,2.38400000000001,0.248315494,251.682816,76.48362,89.400481,41.476819,0,1392.188884,499.804545,30.093098,0,60,"mbank_training" +"project4501","default",1,3847,24,118,11,4,63,FALSE,0.388000000000147,0.044144614,18.206382,9.968826,10.619233,6.948279,0,123.442226,19.781661,4.620336,0.453224,60,"mbank_training" +"project4501","default",2,3848,24,118,8,6,93,FALSE,0.31899999999996,0.01872961,14.026555,6.975631,10.285114,5.881491,0,61.758493,12.974875,3.354795,0.776,60,"mbank_training" +"project4501","default",3,3849,24,118,11,8,63,FALSE,0.342000000000098,0.022986874,15.960925,11.566373,10.484639,6.290182,0,90.623341,16.135268,4.844278,1.13743,60,"mbank_training" +"project4501","default",4,3850,24,118,11,6,63,FALSE,0.385999999999967,0.047210546,16.637923,12.774526,11.017282,5.998301,0,118.138331,18.401079,4.671276,0.787301,60,"mbank_training" +"project4501","default",5,3851,24,118,9,4,93,FALSE,0.366999999999962,0.018027006,15.285145,12.046139,6.928463,5.445069,0,99.51878,17.209297,3.704213,0.634866,60,"mbank_training" 
+"project944","default",1,3847,25,128,7,7,60,FALSE,0.294999999999845,0.018088743,13.30662,8.38728,6.301262,4.208281,0,48.776002,9.005236,2.673461,0.830533,60,"mbank_training" +"project944","default",2,3848,25,128,6,6,60,FALSE,0.224999999999909,0.019102592,12.06331,4.438717,4.448804,3.289853,0,42.937782,7.673997,2.300673,0.821297,60,"mbank_training" +"project944","default",3,3849,25,128,6,6,60,FALSE,0.300999999999931,0.020199055,10.4648,6.232242,5.257136,3.33605,0,41.69849,8.874975,2.280816,0.85007,60,"mbank_training" +"project944","default",4,3850,25,128,8,8,60,FALSE,0.346000000000004,0.017898024,15.590799,8.054094,6.008188,4.86531,0,57.379981,12.570734,3.056505,0.871962,60,"mbank_training" +"project944","default",5,3851,25,128,9,9,60,FALSE,0.271999999999935,0.018105484,16.873965,11.336702,6.987282,5.283617,0,63.617765,10.896273,3.429023,1.41757,60,"mbank_training" +"project971_(1)","default",1,3847,26,157,7,5,100,FALSE,0.188000000000102,0.0321425,18.231222,8.017143,12.649642,6.286026,0,106.37865,16.725717,4.923929,0.854858,60,"mbank_training" +"project971_(1)","default",2,3848,26,157,7,3,100,FALSE,0.202999999999975,0.032296189,14.838464,7.174025,7.827495,6.616547,0,116.535101,17.333633,4.903698,0.776372,60,"mbank_training" +"project971_(1)","default",3,3849,26,157,14,9,100,FALSE,0.396999999999935,0.071620829,31.668768,19.663817,28.913962,13.593501,0,225.426793,31.191852,9.765443,1.439651,60,"mbank_training" +"project971_(1)","default",4,3850,26,157,9,5,100,FALSE,0.256000000000085,0.033776476,19.610728,10.628288,11.828968,8.692054,0,160.535044,25.838523,6.279481,0.742718,60,"mbank_training" +"project971_(1)","default",5,3851,26,157,8,5,100,FALSE,0.212999999999965,0.029701174,17.588101,8.384524,15.25026,7.746113,0,114.389611,20.183938,5.732391,1.160304,60,"mbank_training" +"project2762","default",1,3847,29,259,9,5,100,FALSE,0.456000000000131,0.061945926,85.191185,27.493606,26.010027,15.582262,0,213.117243,51.055173,10.07161,2.744155,60,"mbank_training" 
+"project2762","default",2,3848,29,259,9,5,100,FALSE,0.478000000000065,0.102143639,93.133407,20.179709,18.970153,19.004044,0,226.906961,50.594025,10.297777,2.769936,60,"mbank_training" +"project2762","default",3,3849,29,259,14,7,100,FALSE,0.793999999999869,0.238375902,161.098384,38.927341,30.230702,29.649027,0,389.337059,73.892207,16.263747,3.270378,60,"mbank_training" +"project2762","default",4,3850,29,259,7,2,100,FALSE,0.428000000000111,0.058983519,80.459811,17.423549,22.272229,15.008232,0,203.642406,42.868811,8.013146,0.555866,60,"mbank_training" +"project2762","default",5,3851,29,259,9,2,100,FALSE,0.535000000000082,0.151931487,94.256053,24.5204,23.889673,17.526596,0,281.778727,63.415303,10.419676,1.102776,60,"mbank_training" +"project826","default",1,3847,33,431,10,5,100,FALSE,0.562999999999874,0.056729766,94.04372,82.409339,32.56211,15.907697,0,235.778602,47.145061,10.064618,1.889146,60,"mbank_training" +"project826","default",2,3848,33,431,14,8,100,FALSE,0.773000000000138,0.06226196,158.0213,96.919785,38.896136,28.937695,0,349.497322,58.555463,13.803087,2.599543,60,"mbank_training" +"project826","default",3,3849,33,431,9,5,100,FALSE,0.48700000000008,0.107898042,94.132256,64.66845,31.710938,13.12036,0,216.448743,33.174753,9.034518,1.991478,60,"mbank_training" +"project826","default",4,3850,33,431,13,7,100,FALSE,0.675999999999931,0.06100803,148.574968,78.863774,43.65249,19.242854,0,286.236231,52.802974,12.941453,3.579557,60,"mbank_training" +"project826","default",5,3851,33,431,10,7,100,FALSE,0.51299999999992,0.098033702,103.843431,69.345303,26.127607,15.397387,0,220.027508,44.046429,9.939384,2.271867,60,"mbank_training" +"project561","default",1,3847,34,1169,5,4,2,FALSE,0.641999999999825,0.17492852,90.052497,24.187263,27.040114,28.734684,0,360.911661,86.894653,15.497404,1.369959,60,"mbank_training" 
+"project561","default",2,3848,34,1169,10,8,2,FALSE,1.2030000000002,0.177173586,138.669793,43.757383,76.882535,58.167253,0,685.538141,159.881894,30.825721,1.339312,60,"mbank_training" +"project561","default",3,3849,34,1169,6,4,2,FALSE,0.869999999999891,0.18790584,95.455449,27.965055,61.085397,31.719011,0,515.495499,109.62762,18.470351,2.695755,60,"mbank_training" +"project561","default",4,3850,34,1169,7,6,2,FALSE,0.826999999999998,0.192727156,109.19865,44.217522,38.954784,38.522289,0,453.311567,111.987842,21.865571,1.357856,60,"mbank_training" +"project561","default",5,3851,34,1169,13,10,2,FALSE,1.52800000000002,0.170371331,207.568654,73.128851,84.761045,71.265761,0,857.731735,184.962638,40.678768,0,60,"mbank_training" +"project571","default",1,3847,42,634,8,3,12,FALSE,0.671000000000049,0.157445137,102.114585,49.798648,31.506792,20.846217,0,331.649651,60.904718,12.152725,1.196162,60,"mbank_training" +"project571","default",2,3848,42,635,7,6,28,FALSE,0.619000000000142,0.086190797,97.06063,44.233453,27.674077,19.839528,0,223.541076,46.893869,10.235498,1.847647,60,"mbank_training" +"project571","default",3,3849,42,634,14,5,12,FALSE,1.22199999999998,0.196988458,179.846047,68.135679,62.309499,38.144598,0,669.29153,131.646387,20.805877,2.624811,60,"mbank_training" +"project571","default",4,3850,42,634,6,3,12,FALSE,0.484000000000151,0.102017813,93.340107,39.119923,24.874958,16.486808,0,207.705063,47.070943,9.135167,1.997872,60,"mbank_training" +"project571","default",5,3851,42,634,7,4,12,FALSE,0.546000000000049,0.091017033,101.386263,33.300991,21.79463,20.3546,0,258.555851,51.713282,10.502081,1.974336,60,"mbank_training" +"project4146_(3)","default",1,3847,59,260,88,2,100,FALSE,29.146,1.121571562,2298.172541,1541.883571,1388.448093,619.279036,0,16782.220998,6022.513703,373.627406,1.907832,60,"mbank_training" 
+"project4146_(3)","default",2,3848,59,262,16,2,100,FALSE,4.29300000000012,2.907800167,388.581579,296.941575,232.639383,121.909569,0,2373.682509,675.650357,68.198961,3.556906,60,"mbank_training" +"project4146_(3)","default",3,3849,59,263,11,2,100,FALSE,2.81899999999996,1.70537764,283.697704,229.458175,134.279144,72.100403,0,1519.530647,460.846893,47.369725,1.771184,60,"mbank_training" +"project4146_(3)","default",4,3850,59,261,43,1,100,FALSE,13.5260000000001,2.079365236,1173.373873,791.026839,639.12085,305.072936,0,7612.742871,2745.110794,185.134828,9.093941,60,"mbank_training" +"project4146_(3)","default",5,3851,59,261,31,2,100,FALSE,9.84099999999989,0.609325093,815.511969,534.029647,547.343539,218.158431,0,5510.730906,1946.230582,130.943191,1.776644,60,"mbank_training" +"project3688","default",1,3847,60,854,17,2,100,FALSE,3.92099999999982,3.919504781,385.501468,328.321857,206.991666,85.831002,0,1864.174767,936.303222,48.589011,0.777434,60,"mbank_training" +"project3688","default",2,3848,60,852,56,2,100,FALSE,14.6719999999998,14.670244516,1209.847499,1190.050694,689.647573,251.904609,0,7175.943495,3823.25799,153.520014,7.791968,60,"mbank_training" +"project3688","default",3,3849,60,850,86,2,100,FALSE,21.9349999999999,6.939731752,1927.898247,1467.977175,1126.604546,419.843857,0,10905.788095,5657.655153,238.271027,1.38597,60,"mbank_training" +"project3688","default",4,3850,60,845,100,0,100,FALSE,27.221,23.330009764,2294.310236,1989.385736,1275.489694,446.473982,0,13802.071859,6750.57814,283.637525,15.922755,60,"mbank_training" +"project3688","default",5,3851,60,851,100,2,100,FALSE,26.5239999999999,26.46343803,2229.919796,2008.345071,1232.32596,472.192149,0,13016.601459,6917.000715,281.497787,0,60,"mbank_training" +"project4049","default",1,3847,60,5237,50,1,24,TRUE,55.635,28.849031526,12812.355924,4022.935411,1534.967378,770.142303,0,26041.833005,8340.113622,482.876614,0,60,"mbank_training" 
+"project4049","default",2,3848,60,5241,14,2,100,FALSE,13.231,6.809624175,3518.238252,1155.261025,441.629723,213.68929,0,5531.891441,1538.969853,138.060614,2.902625,60,"mbank_training" +"project4049","default",3,3849,60,5237,51,0,69,TRUE,60.0050000000001,24.444532187,12896.301176,4401.928909,1452.051462,798.980615,0,25780.83288,8172.16566,480.915105,19.044584,60,"mbank_training" +"project4049","default",4,3850,60,5238,50,1,42,TRUE,57.2310000000002,0.927573556,12594.197084,4495.215782,1233.283096,798.486808,0,26471.88747,7921.979066,483.190351,0,60,"mbank_training" +"project4049","default",5,3851,60,5238,48,2,100,FALSE,51.5299999999997,35.234272169,12285.229797,3817.922871,1290.084708,807.233125,0,24705.508017,7500.001315,463.980469,5.626637,60,"mbank_training" +"project423","default",1,3847,60,495,8,4,100,FALSE,2.64699999999993,0.400347063,441.869488,149.306496,171.026613,105.804704,0,1238.875356,303.385543,56.319473,7.638977,60,"mbank_training" +"project423","default",2,3848,60,495,10,4,100,FALSE,3.02799999999979,0.356720675,484.076736,189.899313,179.44867,111.645103,0,1532.396363,369.926824,71.14467,13.962615,60,"mbank_training" +"project423","default",3,3849,60,495,9,4,100,FALSE,2.79700000000003,0.373323881,487.248487,141.088071,171.733752,90.220876,0,1283.917822,344.518464,64.250429,14.971223,60,"mbank_training" +"project423","default",4,3850,60,495,9,3,100,FALSE,3.04700000000003,0.878920563,461.192317,144.824947,139.270067,95.323255,0,1543.655094,421.459133,64.434844,9.116608,60,"mbank_training" +"project423","default",5,3851,60,495,7,3,100,FALSE,2.23999999999978,0.566558355,320.509298,86.205063,162.281775,64.508463,0,1050.605463,277.976441,49.275418,7.849402,60,"mbank_training" +"project4286","default",1,3847,63,283,29,1,100,FALSE,11.241,3.589487481,989.545223,727.212385,542.731052,270.822659,0,6261.587648,2167.214292,158.508772,11.424284,60,"mbank_training" 
+"project4286","default",2,3848,63,286,17,1,100,FALSE,6.8130000000001,2.866909116,617.645772,485.539661,292.257845,172.845279,0,3823.49682,1229.65814,98.085378,13.692145,60,"mbank_training" +"project4286","default",3,3849,63,282,100,0,100,FALSE,42.7640000000001,6.010016617,3526.899076,2417.887392,1721.874089,909.951811,0,24570.01672,8672.056574,543.419217,9.878051,60,"mbank_training" +"project4286","default",4,3850,63,282,36,2,100,FALSE,14.7190000000001,6.805195528,1212.868133,790.710567,583.564096,308.686255,0,8340.113943,2979.273262,196.729748,5.521758,60,"mbank_training" +"project4286","default",5,3851,63,281,100,0,100,FALSE,43.7359999999999,33.033736523,3495.468488,2532.104488,2037.564548,902.984646,0,25233.885802,8661.81827,553.655857,9.87596,60,"mbank_training" +"project4359","default",1,3847,71,183,44,14,100,FALSE,32.0450000000001,2.284122122,6608.926961,1508.127227,1179.962765,1060.524953,0,18053.52329,2758.388991,624.412424,148.430653,60,"mbank_training" +"project4359","default",2,3848,71,183,30,14,100,FALSE,17.991,0.669618659,4264.572667,1193.658269,796.706378,688.173414,0,9061.52797,1384.455348,438.30725,72.17638,60,"mbank_training" +"project4359","default",3,3849,71,183,48,14,100,FALSE,34.951,3.286649126,7525.892458,2044.580377,1323.267198,1124.803309,0,19080.732472,2870.561837,717.551942,135.330823,60,"mbank_training" +"project4359","default",4,3850,71,183,39,14,100,FALSE,24.9720000000002,4.017413013,5956.408073,1609.563323,1125.355011,908.142088,0,12957.920369,1690.329368,530.747424,99.477889,60,"mbank_training" +"project4359","default",5,3851,71,184,5,4,100,FALSE,2.68499999999995,0.670274178,618.15515,193.851391,105.988098,112.845919,0,1234.846153,232.937716,64.465129,6.273569,60,"mbank_training" +"project4397","default",1,3847,75,1645,68,1,100,TRUE,56.2779999999998,12.170124323,7991.80297,8621.763032,1577.508321,731.455933,0,25971.726563,8575.839645,531.199288,0,60,"mbank_training" 
+"project4397","default",2,3848,75,1647,44,2,72,FALSE,37.058,37.056632143,4958.379407,5528.590199,1114.292194,481.745149,0,15757.734049,5206.777362,340.585754,4.460428,60,"mbank_training" +"project4397","default",3,3849,75,1649,22,2,100,FALSE,16.4540000000002,16.451864336,2598.100076,2728.514627,483.544724,241.513852,0,7331.81783,2463.322895,165.851744,2.134348,60,"mbank_training" +"project4397","default",4,3850,75,1647,69,1,100,TRUE,54.3110000000001,34.441545572,7951.753605,9080.744802,1793.795614,869.29426,0,25555.851696,8210.116374,524.851391,11.542171,60,"mbank_training" +"project4397","default",5,3851,75,1646,68,1,29,TRUE,54.7840000000001,7.434980652,7951.811871,8898.509066,1817.529788,778.936639,0,25542.609159,8458.239025,548.439589,0,60,"mbank_training" +"project2084_(1)","default",1,3847,86,28962,1,1,1,TRUE,54.4740000000002,36.027487568,14979.095008,14999.501691,2975.967362,696.11225,0,15425.894194,4695.655548,410.523317,0,60,"mbank_training" +"project2084_(1)","default",2,3848,86,28206,1,1,2,TRUE,56.5880000000002,43.132102468,13181.638643,16595.40308,5997.758415,594.90442,0,11696.330144,7423.078548,256.345525,0,60,"mbank_training" +"project2084_(1)","default",3,3849,86,28303,1,1,1,TRUE,54.8099999999999,33.587438404,13343.930947,12374.806132,4238.386208,917.861064,0,19324.586019,3811.831915,333.559326,0,60,"mbank_training" +"project2084_(1)","default",4,3850,86,29022,1,1,2,TRUE,54.9989999999998,34.283512361,11256.392787,12465.419556,6071.672261,804.994572,0,19199.613915,3854.822219,513.142007,0,60,"mbank_training" +"project2084_(1)","default",5,3851,86,29028,1,1,1,TRUE,54.3580000000002,32.795953363,13459.063726,7689.930694,7774.009206,647.933228,0,20775.360569,3475.924906,262.921936,0,60,"mbank_training" +"project2771","default",1,3847,94,1061,30,1,7,TRUE,54.5080000000003,54.50748361,4055.693235,2007.917633,2703.941653,867.315002,0,29991.013706,13797.074177,578.259028,5.170318,60,"mbank_training" 
+"project2771","default",2,3848,94,1051,30,1,2,TRUE,54.0459999999998,4.789783894,4271.827506,1819.564166,2938.795187,815.396822,0,30148.284471,13355.075623,621.537151,27.556894,60,"mbank_training" +"project2771","default",3,3849,94,1054,30,1,7,TRUE,56.1459999999997,56.145150433,4154.5174,2329.563524,3529.689896,867.569369,0,29018.037154,13612.762975,617.564404,0,60,"mbank_training" +"project2771","default",4,3850,94,1046,30,0,6,TRUE,54.2339999999999,23.20915686,4222.126455,2096.845122,2913.642794,1085.219592,0,29918.720473,13139.276886,579.73558,51.066233,60,"mbank_training" +"project2771","default",5,3851,94,1059,29,1,1,TRUE,54.0189999999998,50.911111792,3962.747493,2194.650357,2428.450395,882.999859,0,29559.448131,14387.393034,580.596246,0,60,"mbank_training" +"project2184","default",1,3847,114,565,13,1,100,FALSE,18.6109999999999,11.402577243,2131.156542,4725.997082,530.837764,293.965445,0,8043.019541,2221.036326,273.326637,44.186869,60,"mbank_training" +"project2184","default",2,3848,114,564,8,2,100,FALSE,11.154,5.345453234,1274.511283,3095.947372,411.255146,218.503153,0,4509.715032,1206.564447,175.035752,7.404348,60,"mbank_training" +"project2184","default",3,3849,114,565,18,2,100,FALSE,26.3409999999999,16.141419283,2771.799272,6012.279972,808.007389,471.532338,0,12121.375686,3661.194424,370.384895,20.829929,60,"mbank_training" +"project2184","default",4,3850,114,563,33,0,100,TRUE,54.2220000000002,16.851939064,4867.676228,11886.042139,1552.892566,850.359378,0,26578.906698,7519.570371,675.624246,70.321069,60,"mbank_training" +"project2184","default",5,3851,114,564,18,2,100,FALSE,28.5320000000002,1.17403699,2732.189783,6746.666943,896.459513,454.621275,0,13185.943663,3950.044113,370.900718,16.416084,60,"mbank_training" +"project3938","default",1,3847,119,3417,9,2,100,FALSE,51.2469999999998,21.493213813,13097.014635,17240.880122,2577.814748,685.203831,0,12611.715252,4025.969422,457.683269,30.401117,60,"mbank_training" 
+"project3938","default",2,3848,119,3408,8,1,100,TRUE,54.9359999999997,22.446101436,14280.167284,17485.494785,2503.527778,693.394689,0,13934.462712,4716.448387,413.765958,0,60,"mbank_training" +"project3938","default",3,3849,119,3413,9,2,100,FALSE,53.6620000000003,29.438755385,13646.266789,16273.772787,1957.851645,607.939928,0,14071.340355,5282.540219,453.269932,204.081741,60,"mbank_training" +"project3938","default",4,3850,119,3409,9,1,100,TRUE,54.4050000000002,32.279967283,14393.190366,16570.649742,3077.911077,580.782538,0,14041.999764,4926.950212,444.543922,0,60,"mbank_training" +"project3938","default",5,3851,119,3410,9,0,100,TRUE,55.8039999999996,54.032769174,15041.883919,18403.466552,1289.59538,761.434192,0,13465.906008,4532.486313,469.045411,66.780611,60,"mbank_training" +"syab07201","default",1,3847,125,14933,6,1,3,TRUE,54.5649999999996,13.463024632,8283.224612,5422.107356,2155.406468,975.963841,0,23496.45208,12990.29354,720.755871,0,60,"mbank_training" +"syab07201","default",2,3848,125,15033,6,1,1,TRUE,54.1619999999998,6.879759994,8631.661473,4556.603853,1981.947232,1083.600626,0,24835.027048,12258.982609,667.987072,0,60,"mbank_training" +"syab07201","default",3,3849,125,14953,6,1,1,TRUE,54.3450000000003,23.402911157,8807.088314,4685.148042,2305.728683,1070.541614,0,25444.06322,11129.258295,704.90096,0,60,"mbank_training" +"syab07201","default",4,3850,125,15017,6,1,2,TRUE,54.4230000000002,9.162811337,8130.311725,5679.011914,2400.763562,1039.644438,0,25123.454571,10902.237259,753.185253,0,60,"mbank_training" +"syab07201","default",5,3851,125,14926,6,1,2,TRUE,54.5190000000002,31.986010941,7983.21979,6412.495985,1702.238379,1285.062244,0,25779.764873,10200.97239,729.078123,0,60,"mbank_training" +"project4133","default",1,3847,131,2373,12,1,100,TRUE,55.0460000000003,50.763268092,9777.261701,16367.982627,2105.14162,590.469161,0,17587.748365,7183.47726,410.978192,0,60,"mbank_training" 
+"project4133","default",2,3848,131,2378,12,1,100,TRUE,56.9690000000001,56.966936068,10115.779173,17903.407013,1533.860302,741.545257,0,15572.04862,7743.139666,419.157453,0,60,"mbank_training" +"project4133","default",3,3849,131,2377,12,1,100,TRUE,54.7849999999999,54.782788502,10360.897605,15017.756517,2720.598684,644.129521,0,17636.426464,7206.39643,420.041532,0,60,"mbank_training" +"project4133","default",4,3850,131,2371,13,1,100,TRUE,56.2750000000001,56.271602815,10099.90732,17124.399453,1715.449085,535.285948,0,17256.680013,6864.383411,406.018243,0,60,"mbank_training" +"project4133","default",5,3851,131,2376,12,1,100,TRUE,55.3900000000003,55.386705398,10123.53186,16744.885155,2063.074556,809.123932,0,17128.859433,6715.284436,414.796312,0,60,"mbank_training" +"project804","default",1,3847,173,1375,3,1,47,TRUE,60.0190000000002,21.23623549,11228.226484,12303.850331,2430.559834,830.477995,0,17909.169516,8717.022799,708.078116,0,60,"mbank_training" +"project804","default",2,3848,173,1368,3,1,100,TRUE,60.1500000000005,60.080988944,10396.680621,17198.177502,1861.84538,753.302697,0,16601.247605,6676.313676,571.262477,0,60,"mbank_training" +"project804","default",3,3849,173,1373,2,1,86,TRUE,60.0889999999999,60.066671998,9195.650252,12590.645613,3033.072112,755.479002,0,20511.079276,7269.364064,711.281004,0,60,"mbank_training" +"project804","default",4,3850,173,1387,3,1,100,TRUE,60.0789999999997,22.113473518,12998.737773,12270.630953,4290.703251,689.04819,0,15290.848349,7909.329051,669.759235,0,60,"mbank_training" +"project804","default",5,3851,173,1370,3,1,7,TRUE,60.0300000000007,60.022688033,13480.410099,11992.159228,3052.961199,727.439951,0,16306.47525,8248.843774,634.589284,0,60,"mbank_training" +"project4284","default",1,3847,4062,1268,0,1,1,TRUE,75.3469999999998,75.339956447,54305.100757,21032.114597,0,0,0,0,0,0,0,60,"mbank_training" 
+"project4284","default",2,3848,4062,1407,0,1,1,TRUE,72.5429999999997,72.536462982,54305.789977,18227.270056,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",3,3849,4062,1193,0,1,1,TRUE,70.0289999999995,70.021119683,54540.924025,15477.382811,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",4,3850,4062,1107,0,1,1,TRUE,70.415,70.408575744,54275.210143,16129.911666,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",5,3851,4062,1355,0,1,1,TRUE,74.0500000000002,74.042071236,54236.199289,19803.278572,0,0,0,0,0,0,0,60,"mbank_training" diff --git a/dev/benchmarks/t252_mbank_all_20260327_1317.csv b/dev/benchmarks/t252_mbank_all_20260327_1317.csv new file mode 100644 index 000000000..5f5cc6c46 --- /dev/null +++ b/dev/benchmarks/t252_mbank_all_20260327_1317.csv @@ -0,0 +1,376 @@ +"dataset","strategy","replicate","seed","n_taxa","best_score","replicates","hits_to_best","pool_size","timed_out","wall_s","time_to_best_s","wagner_ms","tbr_ms","xss_ms","rss_ms","css_ms","ratchet_ms","drift_ms","final_tbr_ms","fuse_ms","budget_s","source" +"project532","default",1,3847,21,1139,7,5,4,FALSE,0.542,0.159055272,135.619212,13.04573,22.272809,15.919137,0,276.407229,56.168931,11.423204,2.049818,30,"mbank_training" +"project532","default",2,3848,21,1139,6,4,3,FALSE,0.454999999999999,0.094952955,92.239999,25.374239,16.837307,14.426449,0,245.246002,42.116067,9.629668,1.382472,30,"mbank_training" +"project532","default",3,3849,21,1139,6,5,4,FALSE,0.475000000000001,0.115830358,107.415156,18.13517,22.390323,12.792332,0,248.020245,43.648871,9.998143,1.511787,30,"mbank_training" +"project532","default",4,3850,21,1139,7,3,3,FALSE,0.583,0.158395851,110.868678,33.604432,20.556298,16.136819,0,327.756102,53.823888,12.069131,0.705788,30,"mbank_training" +"project532","default",5,3851,21,1139,6,3,2,FALSE,0.443000000000001,0.095074113,94.240283,22.942882,19.845399,14.211134,0,237.886468,36.744354,9.808866,1.523488,30,"mbank_training" 
+"project2346","default",1,3847,23,316,18,2,4,FALSE,0.706,0.483245432,59.526585,29.639617,27.097784,15.987328,0,445.189137,111.983257,10.955062,0.615469,30,"mbank_training" +"project2346","default",2,3848,23,318,8,2,20,FALSE,0.272,0.107900119,30.632818,8.388472,14.761471,6.777368,0,142.381512,32.831868,4.816267,0.208482,30,"mbank_training" +"project2346","default",3,3849,23,320,12,2,4,FALSE,0.401999999999999,0.107059556,42.565554,15.985884,22.632968,10.643807,0,238.422067,59.348087,7.143847,0.571676,30,"mbank_training" +"project2346","default",4,3850,23,317,17,2,13,FALSE,0.695,0.302030974,61.210236,24.433465,24.273697,15.448662,0,427.301713,107.544701,10.352768,0.259899,30,"mbank_training" +"project2346","default",5,3851,23,314,78,1,4,FALSE,3.36,1.270754501,279.259769,120.36549,125.623277,66.597026,0,2160.387218,553.280991,47.067397,1.0782,30,"mbank_training" +"project2451","default",1,3847,24,735,12,2,4,FALSE,0.391,0.201359523,57.55009,12.458124,17.440524,10.013999,0,208.719441,73.761881,6.301613,0.352573,30,"mbank_training" +"project2451","default",2,3848,24,732,67,1,1,FALSE,2.958,0.754287847,321.511476,75.820549,109.558619,48.42972,0,1739.621241,624.549235,35.606072,0,30,"mbank_training" +"project2451","default",3,3849,24,731,7,2,6,FALSE,0.221999999999998,0.083935985,38.627521,8.011942,11.150373,4.288323,0,111.132705,37.786174,3.844417,0.195688,30,"mbank_training" +"project2451","default",4,3850,24,731,74,1,3,FALSE,3.226,1.072942617,382.702166,78.858534,117.675578,52.337654,0,1893.503165,653.858183,39.018316,1.668751,30,"mbank_training" +"project2451","default",5,3851,24,730,56,1,2,FALSE,2.475,0.259627507,272.084943,78.715156,92.241695,42.774133,0,1441.742098,513.345106,30.965669,0,30,"mbank_training" +"project4501","default",1,3847,24,118,11,4,63,FALSE,0.402000000000001,0.045659888,19.456467,10.321312,11.249257,7.273704,0,127.024403,20.262034,4.693674,0.592876,30,"mbank_training" 
+"project4501","default",2,3848,24,118,8,6,93,FALSE,0.331000000000003,0.019307548,14.526607,7.223871,10.667763,6.112406,0,63.857366,13.417429,3.468459,0.840874,30,"mbank_training" +"project4501","default",3,3849,24,118,11,8,63,FALSE,0.353999999999999,0.024017324,16.544606,12.054514,10.874954,6.541294,0,94.229752,16.766104,5.012416,1.208605,30,"mbank_training" +"project4501","default",4,3850,24,118,11,6,63,FALSE,0.399999999999999,0.049086008,17.238983,13.324074,11.375175,6.189715,0,122.860359,19.102503,4.840804,0.788614,30,"mbank_training" +"project4501","default",5,3851,24,118,9,4,93,FALSE,0.378,0.018508023,15.81334,12.376079,7.116948,5.60964,0,102.823873,17.718845,3.840052,0.661435,30,"mbank_training" +"project944","default",1,3847,25,128,7,7,60,FALSE,0.306999999999999,0.018572714,13.799929,8.682096,6.631403,4.331374,0,50.309772,9.260855,2.764333,0.884165,30,"mbank_training" +"project944","default",2,3848,25,128,6,6,60,FALSE,0.233000000000001,0.019885466,12.479662,4.603999,4.628432,3.361849,0,44.265201,7.877521,2.368148,0.873353,30,"mbank_training" +"project944","default",3,3849,25,128,6,6,60,FALSE,0.314,0.020729454,10.93727,6.439242,5.535302,3.476875,0,43.362833,9.229105,2.367437,0.930562,30,"mbank_training" +"project944","default",4,3850,25,128,8,8,60,FALSE,0.356999999999999,0.018798128,16.221829,8.35613,6.25217,5.100312,0,59.507827,13.044878,3.160491,0.944067,30,"mbank_training" +"project944","default",5,3851,25,128,9,9,60,FALSE,0.276999999999997,0.018958742,17.363908,11.685671,7.165347,5.454358,0,65.433255,11.16655,3.531729,1.479684,30,"mbank_training" +"project971_(1)","default",1,3847,26,157,7,5,100,FALSE,0.193000000000001,0.032616012,18.605586,8.255522,13.045921,6.466022,0,108.291193,17.142443,5.032654,0.93516,30,"mbank_training" +"project971_(1)","default",2,3848,26,157,7,3,100,FALSE,0.209,0.033034109,15.300445,7.423335,8.066606,6.895672,0,120.329215,17.930359,5.086765,0.809634,30,"mbank_training" 
+"project971_(1)","default",3,3849,26,157,14,9,100,FALSE,0.411000000000001,0.073843174,32.755924,20.344981,29.768159,14.029603,0,232.580428,32.34325,10.062332,1.52913,30,"mbank_training" +"project971_(1)","default",4,3850,26,157,9,5,100,FALSE,0.266000000000002,0.034971856,20.398181,11.072927,12.306986,9.028718,0,166.869904,26.803601,6.538127,0.79671,30,"mbank_training" +"project971_(1)","default",5,3851,26,157,8,5,100,FALSE,0.220000000000002,0.030588355,18.153386,8.61647,15.762383,8.048763,0,118.243181,20.804155,5.864089,1.237399,30,"mbank_training" +"project2762","default",1,3847,29,259,9,5,100,FALSE,0.477,0.064657482,89.005679,28.625208,27.015199,16.271522,0,222.30784,53.258133,10.549338,2.931178,30,"mbank_training" +"project2762","default",2,3848,29,259,9,5,100,FALSE,0.500999999999998,0.107338091,97.623433,21.280192,19.892007,19.848566,0,237.460345,52.994316,10.82002,2.92123,30,"mbank_training" +"project2762","default",3,3849,29,259,14,7,100,FALSE,0.824000000000002,0.248402003,167.013281,40.401138,31.5735,30.962567,0,404.788011,76.685877,16.82874,3.46784,30,"mbank_training" +"project2762","default",4,3850,29,259,7,2,100,FALSE,0.442,0.060617737,82.572548,17.849112,22.881476,15.476914,0,210.443998,44.501366,8.239273,0.595401,30,"mbank_training" +"project2762","default",5,3851,29,259,9,2,100,FALSE,0.549999999999997,0.156776252,96.810243,25.176484,24.499914,18.026146,0,289.442735,65.42589,10.713507,1.121812,30,"mbank_training" +"project826","default",1,3847,33,431,10,5,100,FALSE,0.585000000000001,0.058192461,97.322235,84.950343,36.014361,16.87607,0,243.540672,48.527945,10.402926,2.289268,30,"mbank_training" +"project826","default",2,3848,33,431,14,8,100,FALSE,0.815000000000001,0.064265745,164.329903,100.342802,40.22171,29.986892,0,362.511776,60.62095,14.329798,2.752712,30,"mbank_training" +"project826","default",3,3849,33,431,9,5,100,FALSE,0.506,0.112018993,97.964377,66.801407,33.044869,13.567502,0,224.709019,34.377508,9.366282,2.14059,30,"mbank_training" 
+"project826","default",4,3850,33,431,13,7,100,FALSE,0.698,0.062873383,154.074276,81.270968,45.052406,19.77621,0,295.134169,54.289724,13.291651,3.70243,30,"mbank_training" +"project826","default",5,3851,33,431,10,7,100,FALSE,0.533000000000001,0.1020999,107.928141,72.157363,27.216947,15.95665,0,228.789413,45.677109,10.315738,2.329386,30,"mbank_training" +"project561","default",1,3847,34,1169,5,4,2,FALSE,0.659999999999997,0.181347639,92.846159,25.059065,28.207322,29.981203,0,370.359995,88.552929,15.979782,1.446704,30,"mbank_training" +"project561","default",2,3848,34,1169,10,8,2,FALSE,1.232,0.18098681,142.576079,44.374205,78.560658,59.124055,0,702.572509,163.908009,31.657059,1.392722,30,"mbank_training" +"project561","default",3,3849,34,1169,6,4,2,FALSE,0.892000000000003,0.193806133,98.658642,28.550289,63.174951,32.545117,0,527.190542,111.328013,20.332498,2.808147,30,"mbank_training" +"project561","default",4,3850,34,1169,7,6,2,FALSE,0.850000000000001,0.196609952,112.865658,45.375572,39.933657,39.492137,0,466.18721,115.350405,22.395089,1.381331,30,"mbank_training" +"project561","default",5,3851,34,1169,13,10,2,FALSE,1.577,0.176815928,214.233348,75.676926,87.389599,73.393768,0,887.255078,189.841499,42.006711,0,30,"mbank_training" +"project571","default",1,3847,42,634,8,3,12,FALSE,0.694000000000003,0.164095501,106.218011,51.333049,32.312008,21.394067,0,342.296416,63.619378,12.651639,1.096264,30,"mbank_training" +"project571","default",2,3848,42,635,7,6,28,FALSE,0.639000000000003,0.088679535,100.785177,45.952629,28.371058,20.286232,0,230.646768,48.366994,10.52306,1.873838,30,"mbank_training" +"project571","default",3,3849,42,634,14,5,12,FALSE,1.271,0.205375594,186.209453,70.512624,64.876695,39.745794,0,696.746744,136.711756,21.663235,2.899548,30,"mbank_training" +"project571","default",4,3850,42,634,6,3,12,FALSE,0.503999999999998,0.106289547,97.153147,40.469948,25.720071,17.059675,0,215.716522,48.979746,9.500026,2.09783,30,"mbank_training" 
+"project571","default",5,3851,42,634,7,4,12,FALSE,0.562000000000005,0.094291991,104.624002,34.374362,22.712617,21.010924,0,265.769841,52.724237,10.747413,2.061411,30,"mbank_training" +"project4146_(3)","default",1,3847,59,260,79,1,100,TRUE,27.184,1.159421069,2161.951899,1431.436231,1369.737579,584.938795,0,15612.088541,5495.045514,342.372656,0,30,"mbank_training" +"project4146_(3)","default",2,3848,59,262,16,2,100,FALSE,4.43300000000001,3.006056794,398.978693,305.709476,242.038821,126.156641,0,2449.890491,699.98229,70.565506,3.637498,30,"mbank_training" +"project4146_(3)","default",3,3849,59,263,11,2,100,FALSE,2.911,1.75921419,292.432282,236.70639,138.23963,74.826225,0,1569.189041,477.95843,49.093865,1.822251,30,"mbank_training" +"project4146_(3)","default",4,3850,59,261,43,1,100,FALSE,13.781,2.134428645,1195.778958,805.816844,652.279548,310.548343,0,7750.977861,2798.563687,188.875727,9.304356,30,"mbank_training" +"project4146_(3)","default",5,3851,59,261,31,2,100,FALSE,10.048,0.622684871,834.240637,543.209237,558.784365,222.447677,0,5626.288589,1986.738636,134.143195,1.677448,30,"mbank_training" +"project3688","default",1,3847,60,854,17,2,100,FALSE,4.04899999999999,4.046715439,408.411651,338.435934,212.952407,88.730993,0,1919.553444,962.191821,49.845781,0.793184,30,"mbank_training" +"project3688","default",2,3848,60,852,56,2,100,FALSE,15.176,15.173541808,1302.515289,1226.872642,710.476017,259.35943,0,7396.027813,3937.753164,158.084343,7.912235,30,"mbank_training" +"project3688","default",3,3849,60,850,86,2,100,FALSE,22.625,7.183552468,2072.997198,1511.265434,1160.574252,431.875451,0,11185.916283,5821.436102,245.879774,1.441333,30,"mbank_training" +"project3688","default",4,3850,60,845,100,0,100,FALSE,26.996,23.186208158,2348.914977,2049.502682,1311.093182,459.592175,0,13408.741898,6965.596352,294.003592,7.587966,30,"mbank_training" 
+"project3688","default",5,3851,60,846,99,2,11,TRUE,27.29,27.28811545,2406.499645,2060.393789,1264.02288,485.95709,0,13418.549013,7075.465949,287.115493,0,30,"mbank_training" +"project4049","default",1,3847,60,5240,26,1,100,TRUE,27.317,5.573148428,6703.751431,2176.123735,825.960021,396.113765,0,12768.91305,3880.674858,253.963364,0,30,"mbank_training" +"project4049","default",2,3848,60,5241,14,2,100,FALSE,13.655,7.027287931,3626.086034,1198.572419,456.608051,220.963331,0,5704.360758,1589.957546,142.553627,3.045504,30,"mbank_training" +"project4049","default",3,3849,60,5237,25,0,69,TRUE,30.007,24.853505039,6743.164871,2194.709548,693.107501,428.940065,0,12666.958798,4006.547911,248.048225,20.524267,30,"mbank_training" +"project4049","default",4,3850,60,5238,25,1,42,TRUE,30.004,0.932033283,6776.515792,2319.610345,684.456542,434.411469,0,12776.817935,3766.735394,249.499943,0,30,"mbank_training" +"project4049","default",5,3851,60,5239,25,1,100,TRUE,27.486,3.676459796,6617.98495,2189.03596,703.300419,449.992373,0,12985.569232,3805.079674,248.978194,0,30,"mbank_training" +"project423","default",1,3847,60,495,8,4,100,FALSE,2.75400000000002,0.419124173,456.884432,154.343943,179.912009,110.096176,0,1290.272654,315.113496,59.03124,8.115354,30,"mbank_training" +"project423","default",2,3848,60,495,10,4,100,FALSE,3.13900000000001,0.370053092,496.594593,197.924011,185.722131,115.789679,0,1590.611775,385.967567,74.139425,14.305423,30,"mbank_training" +"project423","default",3,3849,60,495,9,4,100,FALSE,2.88499999999999,0.385641831,500.870846,145.446436,177.377975,93.716956,0,1323.701525,355.745644,66.297395,15.357013,30,"mbank_training" +"project423","default",4,3850,60,495,9,3,100,FALSE,3.17099999999999,0.913398284,477.038761,151.875211,145.583234,99.231705,0,1607.287212,441.466933,67.048899,9.279487,30,"mbank_training" 
+"project423","default",5,3851,60,495,7,3,100,FALSE,2.30000000000001,0.587509882,327.409848,89.159324,165.845635,66.60812,0,1078.061213,286.696336,50.646708,8.055533,30,"mbank_training" +"project4286","default",1,3847,63,283,29,1,100,FALSE,10.679,3.568361219,949.758854,695.981939,507.727922,255.192069,0,5968.563535,2042.300499,151.240761,10.412181,30,"mbank_training" +"project4286","default",2,3848,63,286,17,1,100,FALSE,5.93299999999999,2.493855935,547.902595,427.93154,253.562543,148.533339,0,3300.116545,1089.177478,84.825461,12.413171,30,"mbank_training" +"project4286","default",3,3849,63,282,69,0,100,TRUE,27.162,5.203139408,2360.989445,1512.558401,1088.458169,571.79875,0,15666.679256,5442.170076,346.111242,8.470814,30,"mbank_training" +"project4286","default",4,3850,63,282,36,2,100,FALSE,13.267,6.209710639,1092.563694,721.114448,524.582188,278.114164,0,7555.478801,2692.809987,176.91012,4.096113,30,"mbank_training" +"project4286","default",5,3851,63,283,70,1,100,TRUE,27.169,1.557344311,2287.207347,1582.845371,1211.075903,547.191397,0,15688.016025,5332.93518,347.612348,0,30,"mbank_training" +"project4359","default",1,3847,71,183,41,13,100,TRUE,27.097,2.261897091,5771.353119,1305.011338,1006.712715,912.447879,0,15026.53261,2340.667863,522.209948,117.847876,30,"mbank_training" +"project4359","default",2,3848,71,183,30,14,100,FALSE,16.129,0.665570597,3928.068751,1053.953565,706.859134,611.186316,0,8062.325089,1227.655533,389.882129,64.610764,30,"mbank_training" +"project4359","default",3,3849,71,183,43,13,100,TRUE,27.09,3.114505389,6132.139298,1549.203645,1017.246691,869.207429,0,14564.715459,2203.509034,547.569478,96.465078,30,"mbank_training" +"project4359","default",4,3850,71,183,39,14,100,FALSE,23.331,3.776490508,5602.272217,1470.537157,1043.605592,841.264513,0,12109.540516,1584.418474,494.61431,93.977932,30,"mbank_training" 
+"project4359","default",5,3851,71,184,5,4,100,FALSE,2.51499999999999,0.625239003,581.307694,181.271259,99.713473,106.22802,0,1151.351682,219.65755,60.431613,5.864753,30,"mbank_training" +"project4397","default",1,3847,75,1645,39,1,84,TRUE,29.898,11.652682336,4059.293835,4611.243157,859.16792,378.941198,0,12614.467637,4211.561709,269.027083,0,30,"mbank_training" +"project4397","default",2,3848,75,1648,40,1,11,TRUE,27.222,16.275268683,4083.297886,4499.993494,922.733936,399.403983,0,12638.777186,4181.619232,271.886856,1.900378,30,"mbank_training" +"project4397","default",3,3849,75,1649,22,2,100,FALSE,14.671,14.669488113,2326.940955,2439.062281,438.000839,215.523963,0,6522.444608,2178.359279,147.225135,1.934383,30,"mbank_training" +"project4397","default",4,3850,75,1648,40,1,100,TRUE,27.3430000000001,10.737418532,4263.486964,4441.437048,998.363541,456.623285,0,12534.551377,4024.735858,269.57219,10.386157,30,"mbank_training" +"project4397","default",5,3851,75,1646,39,1,42,TRUE,28.025,6.965588083,4076.471754,4722.078731,843.218866,405.213474,0,12542.741242,4122.793808,285.67242,0,30,"mbank_training" +"project2084_(1)","default",1,3847,86,28962,0,1,1,TRUE,27.2950000000001,27.284871037,6087.152752,7601.569896,1703.323901,292.649511,0,10154.252001,1190.248423,0,0,30,"mbank_training" +"project2084_(1)","default",2,3848,86,28206,0,1,2,TRUE,27.893,27.88159824,6449.215962,9863.875036,2809.440548,508.882523,0,7554.658108,0,0,0,30,"mbank_training" +"project2084_(1)","default",3,3849,86,28303,0,1,1,TRUE,27.3520000000001,27.341604691,5532.892173,4659.700576,2729.695298,369.901689,0,10917.922654,2879.327597,0,0,30,"mbank_training" +"project2084_(1)","default",4,3850,86,29263,0,1,1,TRUE,27.337,27.32739407,5136.496018,4624.353351,2476.729143,305.33289,0,12907.034261,1626.271363,0,0,30,"mbank_training" +"project2084_(1)","default",5,3851,86,29028,0,1,1,TRUE,27.277,27.265355976,6452.248698,4808.151115,2003.218686,324.815519,0,10753.100827,2544.773109,129.969162,0,30,"mbank_training" 
+"project2771","default",1,3847,94,1061,18,1,2,TRUE,27.628,27.627539078,2381.315256,1179.557903,1473.537564,476.722015,0,14814.327106,6359.277975,317.095031,4.795324,30,"mbank_training" +"project2771","default",2,3848,94,1051,18,1,2,TRUE,27.056,4.598189444,2329.759595,1058.876302,1523.15815,437.623557,0,14578.550452,6707.850826,347.023291,25.508291,30,"mbank_training" +"project2771","default",3,3849,94,1055,18,1,25,TRUE,28.596,28.595855406,2208.668083,1237.955741,1885.302937,461.560814,0,14342.634134,6721.423525,318.360851,0,30,"mbank_training" +"project2771","default",4,3850,94,1046,18,0,6,TRUE,27.212,20.807814846,2244.376253,996.635736,1387.986264,635.028035,0,14958.245088,6429.43522,307.637888,39.061989,30,"mbank_training" +"project2771","default",5,3851,94,1061,18,1,83,TRUE,30.005,30.00408473,2175.970571,1277.59992,1288.181195,517.295524,0,14521.506037,6925.085047,312.43618,0,30,"mbank_training" +"project2184","default",1,3847,114,565,13,1,100,FALSE,16.5749999999999,10.308158943,1901.553612,4214.328218,485.080766,269.267477,0,7183.460842,1969.620003,238.019628,35.13861,30,"mbank_training" +"project2184","default",2,3848,114,564,8,2,100,FALSE,9.25199999999995,4.431088219,1068.876064,2542.898349,345.815564,183.719232,0,3766.096167,990.361813,142.478229,6.554553,30,"mbank_training" +"project2184","default",3,3849,114,565,18,2,100,FALSE,23.623,14.129842318,2496.19712,5294.070284,723.003063,423.243246,0,10922.706838,3311.749457,329.429608,19.212075,30,"mbank_training" +"project2184","default",4,3850,114,563,19,0,100,TRUE,27.1590000000001,15.437353718,2783.61853,6176.676207,832.030467,454.22667,0,12537.971451,3609.087835,353.928239,64.443171,30,"mbank_training" +"project2184","default",5,3851,114,564,18,2,100,FALSE,24.5440000000001,1.10036677,2383.790706,5703.843109,776.865951,391.596843,0,11368.827702,3408.154639,327.715989,13.345019,30,"mbank_training" 
+"project3938","default",1,3847,119,3416,5,1,100,TRUE,28.0219999999999,28.003850298,7657.059517,10286.657825,1092.717385,263.907318,0,5528.088936,1944.685173,217.01893,0,30,"mbank_training" +"project3938","default",2,3848,119,3408,5,1,100,TRUE,27.7280000000001,20.346678436,8009.709849,9193.549534,847.319192,396.677981,0,5963.455621,2388.873139,226.921023,0,30,"mbank_training" +"project3938","default",3,3849,119,3413,5,1,100,TRUE,27.654,25.92062548,7597.568873,8397.953192,1192.278244,267.524903,0,6370.474937,2821.517517,219.073277,160.334703,30,"mbank_training" +"project3938","default",4,3850,119,3418,5,1,100,TRUE,27.4750000000001,9.50999064,7772.234407,9643.990176,1475.966887,248.789132,0,5542.434776,2106.959977,212.515308,0,30,"mbank_training" +"project3938","default",5,3851,119,3422,5,1,100,TRUE,27.732,5.309066635,8970.183635,8855.738138,862.2458,303.566881,0,5511.693151,2297.473368,216.724037,0,30,"mbank_training" +"syab07201","default",1,3847,125,14933,3,1,3,TRUE,27.6960000000001,12.634024402,4209.831881,3971.282564,792.06249,485.70106,0,11643.179767,5614.675847,313.862167,0,30,"mbank_training" +"syab07201","default",2,3848,125,15033,4,1,1,TRUE,27.1510000000001,6.377969028,5490.40643,1351.931126,1062.927057,575.611519,0,11885.543952,6220.76002,417.368365,0,30,"mbank_training" +"syab07201","default",3,3849,125,14953,4,1,1,TRUE,27.2079999999999,19.582339775,4913.616435,2616.01053,1404.911755,502.94652,0,11247.894925,5943.98444,427.252739,0,30,"mbank_training" +"syab07201","default",4,3850,125,15017,4,1,1,TRUE,27.154,7.581652703,5078.897263,4206.671582,732.611115,532.396846,0,12075.781743,3956.822031,428.170321,0,30,"mbank_training" +"syab07201","default",5,3851,125,14926,4,1,2,TRUE,27.3429999999998,26.157566299,4773.575904,2310.009118,810.267198,599.034403,0,12631.506286,5520.778049,411.987076,0,30,"mbank_training" 
+"project4133","default",1,3847,131,2386,7,1,100,TRUE,27.915,15.584865867,5481.31431,8081.440573,1366.990872,340.315942,0,8473.379962,3062.261999,200.053899,0,30,"mbank_training" +"project4133","default",2,3848,131,2375,7,1,100,TRUE,29.7460000000001,29.74417613,5614.65215,9794.68452,805.954015,307.043522,0,7098.080922,3172.412796,211.646549,0,30,"mbank_training" +"project4133","default",3,3849,131,2377,8,1,100,TRUE,29.912,29.909055691,5418.41755,8613.7662,1026.893078,312.755544,0,8189.025034,3218.032499,236.792295,0,30,"mbank_training" +"project4133","default",4,3850,131,2374,7,1,100,TRUE,28.2819999999999,28.278871211,5472.009347,9335.30634,1118.641229,294.130262,0,7872.79282,2702.84137,207.013293,0,30,"mbank_training" +"project4133","default",5,3851,131,2385,8,1,100,TRUE,27.2849999999999,15.627101633,5979.695623,8990.18192,1047.259017,274.950811,0,7223.982727,3261.511037,237.368799,0,30,"mbank_training" +"project804","default",1,3847,173,1375,1,1,3,TRUE,27.7629999999999,17.248265978,5083.03061,7686.516173,873.982178,424.487131,0,10184.06465,2628.645841,184.614359,0,30,"mbank_training" +"project804","default",2,3848,173,1370,1,1,3,TRUE,30.0450000000001,30.041864633,4723.645469,9349.265054,812.927877,420.088117,0,8947.386937,2593.918773,183.881763,0,30,"mbank_training" +"project804","default",3,3849,173,1373,1,1,12,TRUE,30.056,30.051604437,6251.779509,6250.564102,1728.904108,435.905993,0,9042.788334,3203.429055,272.356204,0,30,"mbank_training" +"project804","default",4,3850,173,1387,1,1,100,TRUE,30.0989999999999,18.068643362,7075.131855,6976.566617,2091.847773,385.917478,0,7139.858341,3216.192954,174.730846,0,30,"mbank_training" +"project804","default",5,3851,173,1372,1,1,99,TRUE,30.1019999999999,15.872184522,5765.983551,5838.745921,1365.292695,412.16547,0,9596.695161,3903.152447,177.915998,0,30,"mbank_training" +"project4284","default",1,3847,4062,1268,0,1,1,TRUE,42.9269999999999,42.89428889,27450.181719,15441.009559,0,0,0,0,0,0,0,30,"mbank_training" 
+"project4284","default",2,3848,4062,1411,0,1,1,TRUE,40.9389999999999,40.934490269,27438.224699,13493.849822,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",3,3849,4062,1193,0,1,1,TRUE,39.7939999999999,39.789130405,27403.434015,12382.574941,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",4,3850,4062,1107,0,1,1,TRUE,40.596,40.592123674,27251.044937,13338.608947,0,0,0,0,0,0,0,30,"mbank_training" +"project4284","default",5,3851,4062,1360,0,1,1,TRUE,42.2569999999998,42.252403973,27459.165573,14790.786965,0,0,0,0,0,0,0,30,"mbank_training" +"project532","default",1,3847,21,1139,7,5,4,FALSE,0.531999999999925,0.155157045,132.94095,12.896659,22.605166,15.817748,0,270.798128,54.609435,11.237556,2.086117,60,"mbank_training" +"project532","default",2,3848,21,1139,6,4,3,FALSE,0.446999999999889,0.093571746,91.262179,25.037604,16.547562,14.29948,0,240.200325,41.112086,9.448097,1.32286,60,"mbank_training" +"project532","default",3,3849,21,1139,6,5,4,FALSE,0.451999999999998,0.107800963,103.294437,17.516446,21.010614,12.231377,0,234.851555,42.029894,9.646431,1.373085,60,"mbank_training" +"project532","default",4,3850,21,1139,7,3,3,FALSE,0.560000000000173,0.151373582,105.851452,32.081296,19.716629,15.418977,0,315.724865,51.357515,11.456677,0.675031,60,"mbank_training" +"project532","default",5,3851,21,1139,6,3,2,FALSE,0.424999999999955,0.09220405,90.138613,21.852509,18.915639,13.605615,0,229.299615,35.483317,9.40638,1.447665,60,"mbank_training" +"project2346","default",1,3847,23,316,18,2,4,FALSE,0.674999999999955,0.462164759,56.807774,28.543874,26.025564,15.40238,0,425.377913,107.322736,10.523358,0.588648,60,"mbank_training" +"project2346","default",2,3848,23,318,8,2,20,FALSE,0.261999999999944,0.104624604,29.655097,8.09496,14.298628,6.572934,0,136.799606,31.645155,4.640075,0.204004,60,"mbank_training" 
+"project2346","default",3,3849,23,320,12,2,4,FALSE,0.388999999999896,0.104599907,41.047939,15.545544,21.653463,10.344817,0,231.190393,57.622497,6.943413,0.455207,60,"mbank_training" +"project2346","default",4,3850,23,317,17,2,13,FALSE,0.672000000000025,0.2921479,59.06303,23.600541,23.237525,14.67714,0,413.671557,104.475003,10.015255,0.229012,60,"mbank_training" +"project2346","default",5,3851,23,314,78,1,4,FALSE,3.25399999999991,1.237738072,270.848388,116.721824,120.649286,64.301568,0,2091.267241,537.485683,45.54476,1.042463,60,"mbank_training" +"project2451","default",1,3847,24,735,12,2,4,FALSE,0.380000000000109,0.195803825,58.437761,11.924619,16.795548,9.706935,0,201.186919,70.898262,6.095423,0.310965,60,"mbank_training" +"project2451","default",2,3848,24,732,67,1,1,FALSE,2.86299999999983,0.739887091,309.339849,73.561174,106.35613,47.104094,0,1682.872919,606.382824,34.560083,0,60,"mbank_training" +"project2451","default",3,3849,24,731,7,2,6,FALSE,0.213999999999942,0.08147899,36.285649,7.782122,10.81444,4.176962,0,107.266687,36.934762,3.742436,0.176081,60,"mbank_training" +"project2451","default",4,3850,24,731,74,1,3,FALSE,3.10799999999995,1.03302151,354.13203,76.234593,112.742415,50.471643,0,1835.061126,633.554558,37.608168,1.54608,60,"mbank_training" +"project2451","default",5,3851,24,730,56,1,2,FALSE,2.38400000000001,0.248315494,251.682816,76.48362,89.400481,41.476819,0,1392.188884,499.804545,30.093098,0,60,"mbank_training" +"project4501","default",1,3847,24,118,11,4,63,FALSE,0.388000000000147,0.044144614,18.206382,9.968826,10.619233,6.948279,0,123.442226,19.781661,4.620336,0.453224,60,"mbank_training" +"project4501","default",2,3848,24,118,8,6,93,FALSE,0.31899999999996,0.01872961,14.026555,6.975631,10.285114,5.881491,0,61.758493,12.974875,3.354795,0.776,60,"mbank_training" +"project4501","default",3,3849,24,118,11,8,63,FALSE,0.342000000000098,0.022986874,15.960925,11.566373,10.484639,6.290182,0,90.623341,16.135268,4.844278,1.13743,60,"mbank_training" 
+"project4501","default",4,3850,24,118,11,6,63,FALSE,0.385999999999967,0.047210546,16.637923,12.774526,11.017282,5.998301,0,118.138331,18.401079,4.671276,0.787301,60,"mbank_training" +"project4501","default",5,3851,24,118,9,4,93,FALSE,0.366999999999962,0.018027006,15.285145,12.046139,6.928463,5.445069,0,99.51878,17.209297,3.704213,0.634866,60,"mbank_training" +"project944","default",1,3847,25,128,7,7,60,FALSE,0.294999999999845,0.018088743,13.30662,8.38728,6.301262,4.208281,0,48.776002,9.005236,2.673461,0.830533,60,"mbank_training" +"project944","default",2,3848,25,128,6,6,60,FALSE,0.224999999999909,0.019102592,12.06331,4.438717,4.448804,3.289853,0,42.937782,7.673997,2.300673,0.821297,60,"mbank_training" +"project944","default",3,3849,25,128,6,6,60,FALSE,0.300999999999931,0.020199055,10.4648,6.232242,5.257136,3.33605,0,41.69849,8.874975,2.280816,0.85007,60,"mbank_training" +"project944","default",4,3850,25,128,8,8,60,FALSE,0.346000000000004,0.017898024,15.590799,8.054094,6.008188,4.86531,0,57.379981,12.570734,3.056505,0.871962,60,"mbank_training" +"project944","default",5,3851,25,128,9,9,60,FALSE,0.271999999999935,0.018105484,16.873965,11.336702,6.987282,5.283617,0,63.617765,10.896273,3.429023,1.41757,60,"mbank_training" +"project971_(1)","default",1,3847,26,157,7,5,100,FALSE,0.188000000000102,0.0321425,18.231222,8.017143,12.649642,6.286026,0,106.37865,16.725717,4.923929,0.854858,60,"mbank_training" +"project971_(1)","default",2,3848,26,157,7,3,100,FALSE,0.202999999999975,0.032296189,14.838464,7.174025,7.827495,6.616547,0,116.535101,17.333633,4.903698,0.776372,60,"mbank_training" +"project971_(1)","default",3,3849,26,157,14,9,100,FALSE,0.396999999999935,0.071620829,31.668768,19.663817,28.913962,13.593501,0,225.426793,31.191852,9.765443,1.439651,60,"mbank_training" +"project971_(1)","default",4,3850,26,157,9,5,100,FALSE,0.256000000000085,0.033776476,19.610728,10.628288,11.828968,8.692054,0,160.535044,25.838523,6.279481,0.742718,60,"mbank_training" 
+"project971_(1)","default",5,3851,26,157,8,5,100,FALSE,0.212999999999965,0.029701174,17.588101,8.384524,15.25026,7.746113,0,114.389611,20.183938,5.732391,1.160304,60,"mbank_training" +"project2762","default",1,3847,29,259,9,5,100,FALSE,0.456000000000131,0.061945926,85.191185,27.493606,26.010027,15.582262,0,213.117243,51.055173,10.07161,2.744155,60,"mbank_training" +"project2762","default",2,3848,29,259,9,5,100,FALSE,0.478000000000065,0.102143639,93.133407,20.179709,18.970153,19.004044,0,226.906961,50.594025,10.297777,2.769936,60,"mbank_training" +"project2762","default",3,3849,29,259,14,7,100,FALSE,0.793999999999869,0.238375902,161.098384,38.927341,30.230702,29.649027,0,389.337059,73.892207,16.263747,3.270378,60,"mbank_training" +"project2762","default",4,3850,29,259,7,2,100,FALSE,0.428000000000111,0.058983519,80.459811,17.423549,22.272229,15.008232,0,203.642406,42.868811,8.013146,0.555866,60,"mbank_training" +"project2762","default",5,3851,29,259,9,2,100,FALSE,0.535000000000082,0.151931487,94.256053,24.5204,23.889673,17.526596,0,281.778727,63.415303,10.419676,1.102776,60,"mbank_training" +"project826","default",1,3847,33,431,10,5,100,FALSE,0.562999999999874,0.056729766,94.04372,82.409339,32.56211,15.907697,0,235.778602,47.145061,10.064618,1.889146,60,"mbank_training" +"project826","default",2,3848,33,431,14,8,100,FALSE,0.773000000000138,0.06226196,158.0213,96.919785,38.896136,28.937695,0,349.497322,58.555463,13.803087,2.599543,60,"mbank_training" +"project826","default",3,3849,33,431,9,5,100,FALSE,0.48700000000008,0.107898042,94.132256,64.66845,31.710938,13.12036,0,216.448743,33.174753,9.034518,1.991478,60,"mbank_training" +"project826","default",4,3850,33,431,13,7,100,FALSE,0.675999999999931,0.06100803,148.574968,78.863774,43.65249,19.242854,0,286.236231,52.802974,12.941453,3.579557,60,"mbank_training" 
+"project826","default",5,3851,33,431,10,7,100,FALSE,0.51299999999992,0.098033702,103.843431,69.345303,26.127607,15.397387,0,220.027508,44.046429,9.939384,2.271867,60,"mbank_training" +"project561","default",1,3847,34,1169,5,4,2,FALSE,0.641999999999825,0.17492852,90.052497,24.187263,27.040114,28.734684,0,360.911661,86.894653,15.497404,1.369959,60,"mbank_training" +"project561","default",2,3848,34,1169,10,8,2,FALSE,1.2030000000002,0.177173586,138.669793,43.757383,76.882535,58.167253,0,685.538141,159.881894,30.825721,1.339312,60,"mbank_training" +"project561","default",3,3849,34,1169,6,4,2,FALSE,0.869999999999891,0.18790584,95.455449,27.965055,61.085397,31.719011,0,515.495499,109.62762,18.470351,2.695755,60,"mbank_training" +"project561","default",4,3850,34,1169,7,6,2,FALSE,0.826999999999998,0.192727156,109.19865,44.217522,38.954784,38.522289,0,453.311567,111.987842,21.865571,1.357856,60,"mbank_training" +"project561","default",5,3851,34,1169,13,10,2,FALSE,1.52800000000002,0.170371331,207.568654,73.128851,84.761045,71.265761,0,857.731735,184.962638,40.678768,0,60,"mbank_training" +"project571","default",1,3847,42,634,8,3,12,FALSE,0.671000000000049,0.157445137,102.114585,49.798648,31.506792,20.846217,0,331.649651,60.904718,12.152725,1.196162,60,"mbank_training" +"project571","default",2,3848,42,635,7,6,28,FALSE,0.619000000000142,0.086190797,97.06063,44.233453,27.674077,19.839528,0,223.541076,46.893869,10.235498,1.847647,60,"mbank_training" +"project571","default",3,3849,42,634,14,5,12,FALSE,1.22199999999998,0.196988458,179.846047,68.135679,62.309499,38.144598,0,669.29153,131.646387,20.805877,2.624811,60,"mbank_training" +"project571","default",4,3850,42,634,6,3,12,FALSE,0.484000000000151,0.102017813,93.340107,39.119923,24.874958,16.486808,0,207.705063,47.070943,9.135167,1.997872,60,"mbank_training" 
+"project571","default",5,3851,42,634,7,4,12,FALSE,0.546000000000049,0.091017033,101.386263,33.300991,21.79463,20.3546,0,258.555851,51.713282,10.502081,1.974336,60,"mbank_training" +"project4146_(3)","default",1,3847,59,260,88,2,100,FALSE,29.146,1.121571562,2298.172541,1541.883571,1388.448093,619.279036,0,16782.220998,6022.513703,373.627406,1.907832,60,"mbank_training" +"project4146_(3)","default",2,3848,59,262,16,2,100,FALSE,4.29300000000012,2.907800167,388.581579,296.941575,232.639383,121.909569,0,2373.682509,675.650357,68.198961,3.556906,60,"mbank_training" +"project4146_(3)","default",3,3849,59,263,11,2,100,FALSE,2.81899999999996,1.70537764,283.697704,229.458175,134.279144,72.100403,0,1519.530647,460.846893,47.369725,1.771184,60,"mbank_training" +"project4146_(3)","default",4,3850,59,261,43,1,100,FALSE,13.5260000000001,2.079365236,1173.373873,791.026839,639.12085,305.072936,0,7612.742871,2745.110794,185.134828,9.093941,60,"mbank_training" +"project4146_(3)","default",5,3851,59,261,31,2,100,FALSE,9.84099999999989,0.609325093,815.511969,534.029647,547.343539,218.158431,0,5510.730906,1946.230582,130.943191,1.776644,60,"mbank_training" +"project3688","default",1,3847,60,854,17,2,100,FALSE,3.92099999999982,3.919504781,385.501468,328.321857,206.991666,85.831002,0,1864.174767,936.303222,48.589011,0.777434,60,"mbank_training" +"project3688","default",2,3848,60,852,56,2,100,FALSE,14.6719999999998,14.670244516,1209.847499,1190.050694,689.647573,251.904609,0,7175.943495,3823.25799,153.520014,7.791968,60,"mbank_training" +"project3688","default",3,3849,60,850,86,2,100,FALSE,21.9349999999999,6.939731752,1927.898247,1467.977175,1126.604546,419.843857,0,10905.788095,5657.655153,238.271027,1.38597,60,"mbank_training" +"project3688","default",4,3850,60,845,100,0,100,FALSE,27.221,23.330009764,2294.310236,1989.385736,1275.489694,446.473982,0,13802.071859,6750.57814,283.637525,15.922755,60,"mbank_training" 
+"project3688","default",5,3851,60,851,100,2,100,FALSE,26.5239999999999,26.46343803,2229.919796,2008.345071,1232.32596,472.192149,0,13016.601459,6917.000715,281.497787,0,60,"mbank_training" +"project4049","default",1,3847,60,5237,50,1,24,TRUE,55.635,28.849031526,12812.355924,4022.935411,1534.967378,770.142303,0,26041.833005,8340.113622,482.876614,0,60,"mbank_training" +"project4049","default",2,3848,60,5241,14,2,100,FALSE,13.231,6.809624175,3518.238252,1155.261025,441.629723,213.68929,0,5531.891441,1538.969853,138.060614,2.902625,60,"mbank_training" +"project4049","default",3,3849,60,5237,51,0,69,TRUE,60.0050000000001,24.444532187,12896.301176,4401.928909,1452.051462,798.980615,0,25780.83288,8172.16566,480.915105,19.044584,60,"mbank_training" +"project4049","default",4,3850,60,5238,50,1,42,TRUE,57.2310000000002,0.927573556,12594.197084,4495.215782,1233.283096,798.486808,0,26471.88747,7921.979066,483.190351,0,60,"mbank_training" +"project4049","default",5,3851,60,5238,48,2,100,FALSE,51.5299999999997,35.234272169,12285.229797,3817.922871,1290.084708,807.233125,0,24705.508017,7500.001315,463.980469,5.626637,60,"mbank_training" +"project423","default",1,3847,60,495,8,4,100,FALSE,2.64699999999993,0.400347063,441.869488,149.306496,171.026613,105.804704,0,1238.875356,303.385543,56.319473,7.638977,60,"mbank_training" +"project423","default",2,3848,60,495,10,4,100,FALSE,3.02799999999979,0.356720675,484.076736,189.899313,179.44867,111.645103,0,1532.396363,369.926824,71.14467,13.962615,60,"mbank_training" +"project423","default",3,3849,60,495,9,4,100,FALSE,2.79700000000003,0.373323881,487.248487,141.088071,171.733752,90.220876,0,1283.917822,344.518464,64.250429,14.971223,60,"mbank_training" +"project423","default",4,3850,60,495,9,3,100,FALSE,3.04700000000003,0.878920563,461.192317,144.824947,139.270067,95.323255,0,1543.655094,421.459133,64.434844,9.116608,60,"mbank_training" 
+"project423","default",5,3851,60,495,7,3,100,FALSE,2.23999999999978,0.566558355,320.509298,86.205063,162.281775,64.508463,0,1050.605463,277.976441,49.275418,7.849402,60,"mbank_training" +"project4286","default",1,3847,63,283,29,1,100,FALSE,11.241,3.589487481,989.545223,727.212385,542.731052,270.822659,0,6261.587648,2167.214292,158.508772,11.424284,60,"mbank_training" +"project4286","default",2,3848,63,286,17,1,100,FALSE,6.8130000000001,2.866909116,617.645772,485.539661,292.257845,172.845279,0,3823.49682,1229.65814,98.085378,13.692145,60,"mbank_training" +"project4286","default",3,3849,63,282,100,0,100,FALSE,42.7640000000001,6.010016617,3526.899076,2417.887392,1721.874089,909.951811,0,24570.01672,8672.056574,543.419217,9.878051,60,"mbank_training" +"project4286","default",4,3850,63,282,36,2,100,FALSE,14.7190000000001,6.805195528,1212.868133,790.710567,583.564096,308.686255,0,8340.113943,2979.273262,196.729748,5.521758,60,"mbank_training" +"project4286","default",5,3851,63,281,100,0,100,FALSE,43.7359999999999,33.033736523,3495.468488,2532.104488,2037.564548,902.984646,0,25233.885802,8661.81827,553.655857,9.87596,60,"mbank_training" +"project4359","default",1,3847,71,183,44,14,100,FALSE,32.0450000000001,2.284122122,6608.926961,1508.127227,1179.962765,1060.524953,0,18053.52329,2758.388991,624.412424,148.430653,60,"mbank_training" +"project4359","default",2,3848,71,183,30,14,100,FALSE,17.991,0.669618659,4264.572667,1193.658269,796.706378,688.173414,0,9061.52797,1384.455348,438.30725,72.17638,60,"mbank_training" +"project4359","default",3,3849,71,183,48,14,100,FALSE,34.951,3.286649126,7525.892458,2044.580377,1323.267198,1124.803309,0,19080.732472,2870.561837,717.551942,135.330823,60,"mbank_training" +"project4359","default",4,3850,71,183,39,14,100,FALSE,24.9720000000002,4.017413013,5956.408073,1609.563323,1125.355011,908.142088,0,12957.920369,1690.329368,530.747424,99.477889,60,"mbank_training" 
+"project4359","default",5,3851,71,184,5,4,100,FALSE,2.68499999999995,0.670274178,618.15515,193.851391,105.988098,112.845919,0,1234.846153,232.937716,64.465129,6.273569,60,"mbank_training" +"project4397","default",1,3847,75,1645,68,1,100,TRUE,56.2779999999998,12.170124323,7991.80297,8621.763032,1577.508321,731.455933,0,25971.726563,8575.839645,531.199288,0,60,"mbank_training" +"project4397","default",2,3848,75,1647,44,2,72,FALSE,37.058,37.056632143,4958.379407,5528.590199,1114.292194,481.745149,0,15757.734049,5206.777362,340.585754,4.460428,60,"mbank_training" +"project4397","default",3,3849,75,1649,22,2,100,FALSE,16.4540000000002,16.451864336,2598.100076,2728.514627,483.544724,241.513852,0,7331.81783,2463.322895,165.851744,2.134348,60,"mbank_training" +"project4397","default",4,3850,75,1647,69,1,100,TRUE,54.3110000000001,34.441545572,7951.753605,9080.744802,1793.795614,869.29426,0,25555.851696,8210.116374,524.851391,11.542171,60,"mbank_training" +"project4397","default",5,3851,75,1646,68,1,29,TRUE,54.7840000000001,7.434980652,7951.811871,8898.509066,1817.529788,778.936639,0,25542.609159,8458.239025,548.439589,0,60,"mbank_training" +"project2084_(1)","default",1,3847,86,28962,1,1,1,TRUE,54.4740000000002,36.027487568,14979.095008,14999.501691,2975.967362,696.11225,0,15425.894194,4695.655548,410.523317,0,60,"mbank_training" +"project2084_(1)","default",2,3848,86,28206,1,1,2,TRUE,56.5880000000002,43.132102468,13181.638643,16595.40308,5997.758415,594.90442,0,11696.330144,7423.078548,256.345525,0,60,"mbank_training" +"project2084_(1)","default",3,3849,86,28303,1,1,1,TRUE,54.8099999999999,33.587438404,13343.930947,12374.806132,4238.386208,917.861064,0,19324.586019,3811.831915,333.559326,0,60,"mbank_training" +"project2084_(1)","default",4,3850,86,29022,1,1,2,TRUE,54.9989999999998,34.283512361,11256.392787,12465.419556,6071.672261,804.994572,0,19199.613915,3854.822219,513.142007,0,60,"mbank_training" 
+"project2084_(1)","default",5,3851,86,29028,1,1,1,TRUE,54.3580000000002,32.795953363,13459.063726,7689.930694,7774.009206,647.933228,0,20775.360569,3475.924906,262.921936,0,60,"mbank_training" +"project2771","default",1,3847,94,1061,30,1,7,TRUE,54.5080000000003,54.50748361,4055.693235,2007.917633,2703.941653,867.315002,0,29991.013706,13797.074177,578.259028,5.170318,60,"mbank_training" +"project2771","default",2,3848,94,1051,30,1,2,TRUE,54.0459999999998,4.789783894,4271.827506,1819.564166,2938.795187,815.396822,0,30148.284471,13355.075623,621.537151,27.556894,60,"mbank_training" +"project2771","default",3,3849,94,1054,30,1,7,TRUE,56.1459999999997,56.145150433,4154.5174,2329.563524,3529.689896,867.569369,0,29018.037154,13612.762975,617.564404,0,60,"mbank_training" +"project2771","default",4,3850,94,1046,30,0,6,TRUE,54.2339999999999,23.20915686,4222.126455,2096.845122,2913.642794,1085.219592,0,29918.720473,13139.276886,579.73558,51.066233,60,"mbank_training" +"project2771","default",5,3851,94,1059,29,1,1,TRUE,54.0189999999998,50.911111792,3962.747493,2194.650357,2428.450395,882.999859,0,29559.448131,14387.393034,580.596246,0,60,"mbank_training" +"project2184","default",1,3847,114,565,13,1,100,FALSE,18.6109999999999,11.402577243,2131.156542,4725.997082,530.837764,293.965445,0,8043.019541,2221.036326,273.326637,44.186869,60,"mbank_training" +"project2184","default",2,3848,114,564,8,2,100,FALSE,11.154,5.345453234,1274.511283,3095.947372,411.255146,218.503153,0,4509.715032,1206.564447,175.035752,7.404348,60,"mbank_training" +"project2184","default",3,3849,114,565,18,2,100,FALSE,26.3409999999999,16.141419283,2771.799272,6012.279972,808.007389,471.532338,0,12121.375686,3661.194424,370.384895,20.829929,60,"mbank_training" +"project2184","default",4,3850,114,563,33,0,100,TRUE,54.2220000000002,16.851939064,4867.676228,11886.042139,1552.892566,850.359378,0,26578.906698,7519.570371,675.624246,70.321069,60,"mbank_training" 
+"project2184","default",5,3851,114,564,18,2,100,FALSE,28.5320000000002,1.17403699,2732.189783,6746.666943,896.459513,454.621275,0,13185.943663,3950.044113,370.900718,16.416084,60,"mbank_training" +"project3938","default",1,3847,119,3417,9,2,100,FALSE,51.2469999999998,21.493213813,13097.014635,17240.880122,2577.814748,685.203831,0,12611.715252,4025.969422,457.683269,30.401117,60,"mbank_training" +"project3938","default",2,3848,119,3408,8,1,100,TRUE,54.9359999999997,22.446101436,14280.167284,17485.494785,2503.527778,693.394689,0,13934.462712,4716.448387,413.765958,0,60,"mbank_training" +"project3938","default",3,3849,119,3413,9,2,100,FALSE,53.6620000000003,29.438755385,13646.266789,16273.772787,1957.851645,607.939928,0,14071.340355,5282.540219,453.269932,204.081741,60,"mbank_training" +"project3938","default",4,3850,119,3409,9,1,100,TRUE,54.4050000000002,32.279967283,14393.190366,16570.649742,3077.911077,580.782538,0,14041.999764,4926.950212,444.543922,0,60,"mbank_training" +"project3938","default",5,3851,119,3410,9,0,100,TRUE,55.8039999999996,54.032769174,15041.883919,18403.466552,1289.59538,761.434192,0,13465.906008,4532.486313,469.045411,66.780611,60,"mbank_training" +"syab07201","default",1,3847,125,14933,6,1,3,TRUE,54.5649999999996,13.463024632,8283.224612,5422.107356,2155.406468,975.963841,0,23496.45208,12990.29354,720.755871,0,60,"mbank_training" +"syab07201","default",2,3848,125,15033,6,1,1,TRUE,54.1619999999998,6.879759994,8631.661473,4556.603853,1981.947232,1083.600626,0,24835.027048,12258.982609,667.987072,0,60,"mbank_training" +"syab07201","default",3,3849,125,14953,6,1,1,TRUE,54.3450000000003,23.402911157,8807.088314,4685.148042,2305.728683,1070.541614,0,25444.06322,11129.258295,704.90096,0,60,"mbank_training" +"syab07201","default",4,3850,125,15017,6,1,2,TRUE,54.4230000000002,9.162811337,8130.311725,5679.011914,2400.763562,1039.644438,0,25123.454571,10902.237259,753.185253,0,60,"mbank_training" 
+"syab07201","default",5,3851,125,14926,6,1,2,TRUE,54.5190000000002,31.986010941,7983.21979,6412.495985,1702.238379,1285.062244,0,25779.764873,10200.97239,729.078123,0,60,"mbank_training" +"project4133","default",1,3847,131,2373,12,1,100,TRUE,55.0460000000003,50.763268092,9777.261701,16367.982627,2105.14162,590.469161,0,17587.748365,7183.47726,410.978192,0,60,"mbank_training" +"project4133","default",2,3848,131,2378,12,1,100,TRUE,56.9690000000001,56.966936068,10115.779173,17903.407013,1533.860302,741.545257,0,15572.04862,7743.139666,419.157453,0,60,"mbank_training" +"project4133","default",3,3849,131,2377,12,1,100,TRUE,54.7849999999999,54.782788502,10360.897605,15017.756517,2720.598684,644.129521,0,17636.426464,7206.39643,420.041532,0,60,"mbank_training" +"project4133","default",4,3850,131,2371,13,1,100,TRUE,56.2750000000001,56.271602815,10099.90732,17124.399453,1715.449085,535.285948,0,17256.680013,6864.383411,406.018243,0,60,"mbank_training" +"project4133","default",5,3851,131,2376,12,1,100,TRUE,55.3900000000003,55.386705398,10123.53186,16744.885155,2063.074556,809.123932,0,17128.859433,6715.284436,414.796312,0,60,"mbank_training" +"project804","default",1,3847,173,1375,3,1,47,TRUE,60.0190000000002,21.23623549,11228.226484,12303.850331,2430.559834,830.477995,0,17909.169516,8717.022799,708.078116,0,60,"mbank_training" +"project804","default",2,3848,173,1368,3,1,100,TRUE,60.1500000000005,60.080988944,10396.680621,17198.177502,1861.84538,753.302697,0,16601.247605,6676.313676,571.262477,0,60,"mbank_training" +"project804","default",3,3849,173,1373,2,1,86,TRUE,60.0889999999999,60.066671998,9195.650252,12590.645613,3033.072112,755.479002,0,20511.079276,7269.364064,711.281004,0,60,"mbank_training" +"project804","default",4,3850,173,1387,3,1,100,TRUE,60.0789999999997,22.113473518,12998.737773,12270.630953,4290.703251,689.04819,0,15290.848349,7909.329051,669.759235,0,60,"mbank_training" 
+"project804","default",5,3851,173,1370,3,1,7,TRUE,60.0300000000007,60.022688033,13480.410099,11992.159228,3052.961199,727.439951,0,16306.47525,8248.843774,634.589284,0,60,"mbank_training" +"project4284","default",1,3847,4062,1268,0,1,1,TRUE,75.3469999999998,75.339956447,54305.100757,21032.114597,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",2,3848,4062,1407,0,1,1,TRUE,72.5429999999997,72.536462982,54305.789977,18227.270056,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",3,3849,4062,1193,0,1,1,TRUE,70.0289999999995,70.021119683,54540.924025,15477.382811,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",4,3850,4062,1107,0,1,1,TRUE,70.415,70.408575744,54275.210143,16129.911666,0,0,0,0,0,0,0,60,"mbank_training" +"project4284","default",5,3851,4062,1355,0,1,1,TRUE,74.0500000000002,74.042071236,54236.199289,19803.278572,0,0,0,0,0,0,0,60,"mbank_training" +"project532","default",1,3847,21,1139,7,5,4,FALSE,0.597999999999956,0.174283273,148.503215,14.327887,24.690415,17.636236,0,303.184831,62.553178,12.901541,2.343453,120,"mbank_training" +"project532","default",2,3848,21,1139,6,4,3,FALSE,0.597999999999956,0.123916778,121.97986,33.485211,22.381566,18.933991,0,320.406746,55.284499,12.856154,1.757961,120,"mbank_training" +"project532","default",3,3849,21,1139,6,5,4,FALSE,0.579999999999927,0.139597412,132.649526,22.214793,26.952043,15.646982,0,300.59517,53.942751,12.362817,1.812183,120,"mbank_training" +"project532","default",4,3850,21,1139,7,3,3,FALSE,0.750999999999294,0.197873706,141.871555,42.623983,26.681093,20.747777,0,423.874721,68.928898,15.491427,0.949228,120,"mbank_training" +"project532","default",5,3851,21,1139,6,3,2,FALSE,0.592999999999847,0.129039236,125.361359,30.279261,26.760433,19.136244,0,319.94664,49.821137,13.127436,2.038109,120,"mbank_training" 
+"project2346","default",1,3847,23,316,18,2,4,FALSE,0.829999999999927,0.608430951,71.029184,35.473094,32.825955,19.374163,0,519.261742,131.533754,13.02882,0.765322,120,"mbank_training" +"project2346","default",2,3848,23,318,8,2,20,FALSE,0.304000000000087,0.117708173,33.520126,9.281411,16.761981,7.827445,0,158.257356,36.152855,5.366508,0.311627,120,"mbank_training" +"project2346","default",3,3849,23,320,12,2,4,FALSE,0.46599999999944,0.124930457,48.132288,18.466171,27.800715,12.278035,0,276.894882,67.959639,8.181343,0.703515,120,"mbank_training" +"project2346","default",4,3850,23,317,17,2,13,FALSE,0.795000000000073,0.344254344,68.922419,28.551914,29.73234,17.731987,0,483.753854,123.08423,12.051296,0.341163,120,"mbank_training" +"project2346","default",5,3851,23,314,78,1,4,FALSE,3.67699999999968,1.435090677,304.810463,132.69613,141.883327,73.51865,0,2356.842515,605.479236,51.597378,1.447145,120,"mbank_training" +"project2451","default",1,3847,24,735,12,2,4,FALSE,0.427999999999884,0.221843663,66.205828,13.396343,18.763832,10.832563,0,227.105905,79.846971,6.955762,0.330172,120,"mbank_training" +"project2451","default",2,3848,24,732,67,1,1,FALSE,3.22999999999956,0.823948679,346.979458,82.764533,119.244198,52.943228,0,1902.570635,683.326202,38.894812,0,120,"mbank_training" +"project2451","default",3,3849,24,731,7,2,6,FALSE,0.244999999999891,0.092966703,41.354672,8.912597,12.506295,4.7361,0,123.690091,42.28259,4.282265,0.193916,120,"mbank_training" +"project2451","default",4,3850,24,731,74,1,3,FALSE,3.45600000000013,1.132576511,383.106166,87.869532,126.007845,56.212429,0,2044.100132,703.889984,42.021767,1.663113,120,"mbank_training" +"project2451","default",5,3851,24,730,56,1,2,FALSE,2.67200000000048,0.277758048,280.567088,85.41411,100.316807,46.314025,0,1563.836515,557.562805,33.588465,0,120,"mbank_training" 
+"project4501","default",1,3847,24,118,11,4,63,FALSE,0.449999999999818,0.049543826,20.627493,11.198614,11.815055,7.776755,0,138.45961,22.195465,5.140892,14.184838,120,"mbank_training" +"project4501","default",2,3848,24,118,8,6,93,FALSE,0.360999999999876,0.02131701,15.809006,7.847642,11.572414,6.5989,0,69.68908,14.638501,3.76466,0.836356,120,"mbank_training" +"project4501","default",3,3849,24,118,11,8,63,FALSE,0.387000000000626,0.025869516,17.988182,13.159864,11.846536,7.174633,0,103.145235,18.32269,5.53868,1.248622,120,"mbank_training" +"project4501","default",4,3850,24,118,11,6,63,FALSE,0.428999999999178,0.05354915,18.767008,14.497546,12.386349,6.770313,0,134.033431,20.862917,5.271509,0.836095,120,"mbank_training" +"project4501","default",5,3851,24,118,9,4,93,FALSE,0.377000000000407,0.018355794,15.849192,12.337627,7.080306,5.553369,0,101.60287,17.539517,3.805357,0.62106,120,"mbank_training" +"project944","default",1,3847,25,128,7,7,60,FALSE,0.349999999999454,0.018372965,13.749598,8.665922,6.577338,4.388015,0,50.700024,9.316056,2.852453,0.884296,120,"mbank_training" +"project944","default",2,3848,25,128,6,6,60,FALSE,0.305999999999585,0.026536602,16.762202,6.242929,6.315995,4.568325,0,59.400532,10.678432,3.202963,1.232052,120,"mbank_training" +"project944","default",3,3849,25,128,6,6,60,FALSE,0.353000000000065,0.023519519,12.339231,7.414637,6.222741,4.003219,0,49.831448,10.604764,2.731975,1.047022,120,"mbank_training" +"project944","default",4,3850,25,128,8,8,60,FALSE,0.373000000000502,0.018053484,16.38878,8.637668,6.467942,5.269536,0,61.244825,13.507091,3.300045,0.928819,120,"mbank_training" +"project944","default",5,3851,25,128,9,9,60,FALSE,0.289999999999964,0.019552678,18.095653,12.151427,7.457316,5.676282,0,68.236955,11.678938,3.661214,1.465672,120,"mbank_training" +"project971_(1)","default",1,3847,26,157,7,5,100,FALSE,0.199999999999818,0.034288362,19.540965,8.624554,13.504326,6.719836,0,112.82168,17.875589,5.259305,0.877813,120,"mbank_training" 
+"project971_(1)","default",2,3848,26,157,7,3,100,FALSE,0.216000000000349,0.03481252,16.035452,7.674916,8.319572,7.100072,0,124.029589,18.743483,5.235702,0.826979,120,"mbank_training" +"project971_(1)","default",3,3849,26,157,14,9,100,FALSE,0.42200000000048,0.07753632,34.000391,20.956551,31.22286,14.428183,0,240.754111,33.668735,10.389131,1.483253,120,"mbank_training" +"project971_(1)","default",4,3850,26,157,9,5,100,FALSE,0.268000000000029,0.034529356,20.521152,11.128321,12.392041,9.051989,0,167.722094,26.988611,6.584964,0.792242,120,"mbank_training" +"project971_(1)","default",5,3851,26,157,8,5,100,FALSE,0.269000000000233,0.033144458,22.034311,10.560942,19.018461,9.771033,0,142.472617,25.789647,7.242129,1.522097,120,"mbank_training" +"project2762","default",1,3847,29,259,9,5,100,FALSE,0.552000000000589,0.076698963,102.552619,31.991234,32.017848,18.777667,0,255.924423,60.232298,12.068471,3.722681,120,"mbank_training" +"project2762","default",2,3848,29,259,9,5,100,FALSE,0.559000000000196,0.124296864,110.021915,23.549355,22.25614,22.257053,0,265.42186,58.979949,11.944967,3.162605,120,"mbank_training" +"project2762","default",3,3849,29,259,14,7,100,FALSE,0.896999999999935,0.271071243,182.034881,44.101888,34.36301,33.517622,0,439.83955,83.552751,18.319235,3.683558,120,"mbank_training" +"project2762","default",4,3850,29,259,7,2,100,FALSE,0.552000000000589,0.072439469,103.33374,22.591772,28.818399,19.640743,0,260.223182,55.45098,10.463058,0.79617,120,"mbank_training" +"project2762","default",5,3851,29,259,9,2,100,FALSE,0.640000000000327,0.196973069,111.81434,30.079626,29.628034,21.70403,0,335.326449,74.873774,12.244683,1.187246,120,"mbank_training" +"project826","default",1,3847,33,431,10,5,100,FALSE,0.731000000000677,0.076276196,121.741229,108.597777,43.75939,21.104991,0,309.097116,60.566575,12.983497,2.531488,120,"mbank_training" 
+"project826","default",2,3848,33,431,14,8,100,FALSE,0.932999999999993,0.072963527,190.042457,114.346752,47.944522,34.721809,0,421.781289,70.243541,16.782441,3.219924,120,"mbank_training" +"project826","default",3,3849,33,431,9,5,100,FALSE,0.604999999999563,0.14725746,118.219918,80.220355,40.605209,16.564079,0,268.776122,41.222762,11.147686,2.454232,120,"mbank_training" +"project826","default",4,3850,33,431,13,7,100,FALSE,0.806999999999789,0.069525721,173.788642,93.184784,51.796127,23.401488,0,341.117535,62.517173,15.345382,4.454339,120,"mbank_training" +"project826","default",5,3851,33,431,10,7,100,FALSE,0.626999999999498,0.132048574,127.710431,85.490192,32.522076,19.169466,0,269.264299,52.301932,12.045337,2.735471,120,"mbank_training" +"project561","default",1,3847,34,1169,5,4,2,FALSE,0.766999999999825,0.200921647,104.203848,28.18669,31.8137,34.553072,0,434.189226,104.452797,18.555571,1.495537,120,"mbank_training" +"project561","default",2,3848,34,1169,10,8,2,FALSE,1.57900000000063,0.235814397,184.091378,57.688625,102.624785,76.210602,0,897.399635,209.365029,40.577297,1.872447,120,"mbank_training" +"project561","default",3,3849,34,1169,6,4,2,FALSE,1.03800000000047,0.215172027,112.760827,32.396947,72.031643,37.331976,0,617.902089,130.587572,22.017981,3.256114,120,"mbank_training" +"project561","default",4,3850,34,1169,7,6,2,FALSE,1.09799999999996,0.255119559,141.429342,58.494966,51.773246,51.428364,0,604.051173,149.422167,28.95167,1.973197,120,"mbank_training" +"project561","default",5,3851,34,1169,13,10,2,FALSE,1.8779999999997,0.227611827,254.913113,90.500066,104.908815,86.761706,0,1057.262797,223.664192,49.85961,0,120,"mbank_training" +"project571","default",1,3847,42,634,8,3,12,FALSE,0.894999999999527,0.199396634,135.515565,66.212952,42.147855,27.872039,0,445.262381,81.952351,16.597809,1.538257,120,"mbank_training" 
+"project571","default",2,3848,42,635,7,6,28,FALSE,0.715000000000146,0.099789584,112.405676,51.24409,31.78723,22.885687,0,258.887033,54.158859,11.817098,2.125303,120,"mbank_training" +"project571","default",3,3849,42,634,14,5,12,FALSE,1.57200000000012,0.225831785,228.209175,86.461079,80.530839,48.792823,0,858.462695,170.886235,26.904718,3.584772,120,"mbank_training" +"project571","default",4,3850,42,634,6,3,12,FALSE,0.603000000000065,0.137255932,117.591152,49.656388,31.269116,20.712042,0,260.278734,57.636379,11.03258,2.350998,120,"mbank_training" +"project571","default",5,3851,42,634,7,4,12,FALSE,0.618999999999687,0.103445409,115.114959,37.63192,24.581629,23.032833,0,293.901454,58.725278,11.897841,2.274955,120,"mbank_training" +"project4146_(3)","default",1,3847,59,260,88,2,100,FALSE,34.116,1.413533364,2671.985035,1807.940198,1630.073094,728.712218,0,19646.876166,7051.046702,440.119674,2.154328,120,"mbank_training" +"project4146_(3)","default",2,3848,59,262,16,2,100,FALSE,4.8080000000009,3.259231522,437.061873,332.329626,260.812709,136.426056,0,2658.735654,754.976978,76.161495,3.929552,120,"mbank_training" +"project4146_(3)","default",3,3849,59,263,11,2,100,FALSE,3.04700000000048,1.879511765,306.395457,247.90622,145.666599,77.723276,0,1638.1709,496.971015,51.236215,1.810841,120,"mbank_training" +"project4146_(3)","default",4,3850,59,261,43,1,100,FALSE,15.3180000000002,2.374788278,1320.469164,894.874744,714.758367,346.257565,0,8599.051059,3133.391372,210.801877,10.461404,120,"mbank_training" +"project4146_(3)","default",5,3851,59,261,31,2,100,FALSE,12.2269999999999,0.741778749,1014.974118,655.035662,685.970374,273.578485,0,6815.550667,2422.397126,162.815185,2.274815,120,"mbank_training" +"project3688","default",1,3847,60,854,17,2,100,FALSE,5.02500000000055,5.021771877,510.37382,427.643124,259.063646,112.165339,0,2383.360876,1192.373307,62.203885,1.168781,120,"mbank_training" 
+"project3688","default",2,3848,60,852,56,2,100,FALSE,17.5849999999991,17.581586221,1508.551866,1422.340587,842.489003,302.981617,0,8563.456329,4566.251877,183.361074,8.76499,120,"mbank_training" +"project3688","default",3,3849,60,850,86,2,100,FALSE,23.5720000000001,7.411819695,2115.873042,1577.791567,1209.467271,451.31191,0,11671.087764,6083.894379,257.34136,1.490678,120,"mbank_training" +"project3688","default",4,3850,60,845,100,0,100,FALSE,28.1569999999992,24.162367597,2261.042949,2157.45028,1376.283966,483.92103,0,14079.414317,7325.004318,307.627242,8.029794,120,"mbank_training" +"project3688","default",5,3851,60,851,100,2,100,FALSE,31.8050000000003,31.802672613,2618.332145,2409.378973,1494.269097,570.378077,0,15695.913627,8333.697,340.037484,0,120,"mbank_training" +"project4049","default",1,3847,60,5237,58,2,69,FALSE,80.0329999999994,32.052243607,16809.550981,5256.228341,2110.483353,1030.435606,0,35258.662297,11422.279982,653.04131,4.22619,120,"mbank_training" +"project4049","default",2,3848,60,5241,14,2,100,FALSE,16.0689999999995,7.870476592,4173.924444,1414.553628,544.295207,261.063861,0,6683.463902,1911.022872,169.10444,3.227579,120,"mbank_training" +"project4049","default",3,3849,60,5237,86,0,69,TRUE,120.012,29.14747189,24915.557783,8555.080917,2826.307074,1585.849986,0,52725.984647,16420.480107,960.753632,19.357873,120,"mbank_training" +"project4049","default",4,3850,60,5237,82,1,67,TRUE,113.106000000001,87.538582491,24380.17746,8460.16691,2744.356645,1587.127748,0,53779.48615,16087.417783,947.49759,0,120,"mbank_training" +"project4049","default",5,3851,60,5238,48,2,100,FALSE,61.625,42.237513028,14343.858343,4557.954165,1591.975844,975.424709,0,29679.922169,9072.33144,551.917905,6.537494,120,"mbank_training" +"project423","default",1,3847,60,495,8,4,100,FALSE,3.40499999999975,0.526153467,566.898634,190.75694,215.289551,136.966522,0,1626.799883,386.053694,70.778868,9.020881,120,"mbank_training" 
+"project423","default",2,3848,60,495,10,4,100,FALSE,3.90000000000055,0.411584624,588.627592,245.185435,238.855369,144.338626,0,1998.711399,474.040828,90.219949,16.501401,120,"mbank_training" +"project423","default",3,3849,60,495,9,4,100,FALSE,3.39900000000034,0.46492677,604.06468,177.265199,213.227558,110.902656,0,1578.009632,412.734359,77.942664,17.989694,120,"mbank_training" +"project423","default",4,3850,60,495,9,3,100,FALSE,3.30899999999929,0.925018402,492.497982,155.53686,151.105736,102.833498,0,1678.795371,457.312226,70.475237,10.101998,120,"mbank_training" +"project423","default",5,3851,60,495,7,3,100,FALSE,2.92799999999988,0.693696482,413.570945,112.361907,210.636516,84.170433,0,1356.68924,368.077685,66.544864,10.714993,120,"mbank_training" +"project4286","default",1,3847,63,283,29,1,100,FALSE,11.7159999999994,4.100318091,1046.296071,768.137171,567.789875,282.419437,0,6540.196924,2232.430133,165.583054,11.269518,120,"mbank_training" +"project4286","default",2,3848,63,286,17,1,100,FALSE,6.14800000000014,2.731416157,577.453715,449.078541,267.206664,154.448919,0,3413.660776,1114.497786,88.825288,12.203679,120,"mbank_training" +"project4286","default",3,3849,63,282,100,0,100,FALSE,44.9789999999994,5.96331056,3683.094082,2517.868007,1813.204677,950.494329,0,25889.045977,9131.184162,567.907615,10.793273,120,"mbank_training" +"project4286","default",4,3850,63,282,36,2,100,FALSE,15.5619999999999,7.152473633,1275.286219,836.323187,605.933541,328.329446,0,8875.831635,3180.293214,206.313307,4.975995,120,"mbank_training" +"project4286","default",5,3851,63,281,100,0,100,FALSE,46.5769999999993,35.041412631,3699.824056,2689.559526,2089.605703,946.042036,0,26985.775764,9107.103506,618.261001,11.396668,120,"mbank_training" +"project4359","default",1,3847,71,183,44,14,100,FALSE,41.9520000000002,3.203920721,8767.766222,2006.351704,1513.433996,1382.957629,0,23498.199682,3646.497845,799.066168,169.565444,120,"mbank_training" 
+"project4359","default",2,3848,71,183,30,14,100,FALSE,21.7539999999999,0.915047912,5225.794822,1438.006894,960.455597,825.817399,0,10896.586175,1663.655235,533.446058,88.870922,120,"mbank_training" +"project4359","default",3,3849,71,183,48,14,100,FALSE,40.0810000000001,4.272051181,8738.055745,2383.24364,1527.197327,1300.041735,0,21769.124441,3284.361575,822.207967,151.216495,120,"mbank_training" +"project4359","default",4,3850,71,183,39,14,100,FALSE,30.3309999999992,4.925332616,7050.165321,1939.795687,1343.31804,1102.898121,0,15901.077905,2095.50652,655.182761,120.679789,120,"mbank_training" +"project4359","default",5,3851,71,184,5,4,100,FALSE,3.22699999999986,0.863578676,733.995855,246.411799,130.749929,134.986509,0,1477.365187,265.119975,95.768599,7.753395,120,"mbank_training" +"project4397","default",1,3847,75,1645,100,1,80,FALSE,92.9229999999998,14.423068514,13204.176944,13809.390556,2675.089632,1261.189782,0,43065.316764,14214.117253,874.400732,0,120,"mbank_training" +"project4397","default",2,3848,75,1647,44,2,72,FALSE,40.518,40.516149152,5462.808026,6069.197401,1228.633206,540.398184,0,17364.29475,5740.716472,370.847129,5.18697,120,"mbank_training" +"project4397","default",3,3849,75,1649,22,2,100,FALSE,18.2539999999999,18.252467134,2903.384257,3034.807995,547.422862,272.134946,0,8130.583929,2706.994697,187.14634,2.482476,120,"mbank_training" +"project4397","default",4,3850,75,1646,100,0,100,FALSE,89.067,62.749311293,12699.873477,14195.273694,2819.594899,1371.308273,0,42354.846242,13338.153642,863.920671,23.747802,120,"mbank_training" +"project4397","default",5,3851,75,1646,100,1,43,FALSE,88.2970000000005,8.63749485,12756.82406,14581.059802,2551.350852,1244.568156,0,41346.437513,13615.086309,876.091028,0,120,"mbank_training" +"project2084_(1)","default",1,3847,86,28962,3,1,1,TRUE,108.528,39.357658916,23627.712028,24029.377952,5464.093052,1094.520444,0,35104.689634,17833.210932,1040.792512,0,120,"mbank_training" 
+"project2084_(1)","default",2,3848,86,28206,2,1,2,TRUE,110.605,43.882252328,21477.758523,22676.801355,8385.164939,1728.20278,0,33082.151982,20161.210155,644.290514,0,120,"mbank_training" +"project2084_(1)","default",3,3849,86,28303,3,1,1,TRUE,108.306,42.637740632,30964.641428,23462.698032,5351.663578,1468.335058,0,32030.349028,13873.619593,887.377121,0,120,"mbank_training" +"project2084_(1)","default",4,3850,86,28724,3,0,1,TRUE,108.268,79.500306167,21209.336022,16545.544527,6915.74316,1698.567909,0,40609.952976,18209.011495,999.977684,1818.540659,120,"mbank_training" +"project2084_(1)","default",5,3851,86,29024,4,1,1,TRUE,108.461,78.301270838,25734.337567,17627.057703,7778.349417,1409.668754,0,38256.493533,16373.841466,1007.174092,0,120,"mbank_training" +"project2771","default",1,3847,94,1042,65,1,16,TRUE,109.469999999999,90.147688908,7955.523496,3364.064349,5118.545353,1685.906249,0,59102.753807,29649.557642,1129.086726,4.686277,120,"mbank_training" +"project2771","default",2,3848,94,1049,65,1,10,TRUE,109.496,109.495686787,8276.423813,3896.479947,5070.025056,1554.401001,0,60209.181891,27813.96109,1150.255409,25.301502,120,"mbank_training" +"project2771","default",3,3849,94,1055,65,1,10,TRUE,108.414,108.413357524,7932.479499,4160.669826,6047.767008,1677.958011,0,59065.792055,27961.877695,1152.576054,0,120,"mbank_training" +"project2771","default",4,3850,94,1046,66,0,6,TRUE,108.206,20.66770794,8170.151603,3690.397114,5343.117597,1847.35924,0,60061.381446,27735.459754,1123.077726,38.827043,120,"mbank_training" +"project2771","default",5,3851,94,1059,65,1,1,TRUE,108.043000000001,44.486758246,8124.186883,3750.441696,4893.377873,1690.985489,0,59411.915119,29030.299076,1117.712092,0,120,"mbank_training" +"project2184","default",1,3847,114,565,13,1,100,FALSE,16.5100000000002,10.270284347,1901.604448,4187.802544,481.297502,267.875047,0,7162.991061,1958.33887,236.763264,35.048523,120,"mbank_training" 
+"project2184","default",2,3848,114,564,8,2,100,FALSE,9.17799999999988,4.400413194,1068.136565,2525.129151,342.888574,182.385329,0,3728.285364,981.185821,141.535775,6.532746,120,"mbank_training" +"project2184","default",3,3849,114,565,18,2,100,FALSE,23.433,14.00867502,2492.973307,5244.857721,716.101013,419.309974,0,10828.214684,3285.003272,326.780788,18.981366,120,"mbank_training" +"project2184","default",4,3850,114,563,73,0,100,TRUE,108.129000000001,15.334493816,9759.896311,23116.913797,2974.664553,1610.739116,0,53197.148769,15938.358971,1339.551383,64.034474,120,"mbank_training" +"project2184","default",5,3851,114,564,18,2,100,FALSE,24.5619999999999,1.094052073,2382.013724,5658.68694,772.047049,388.245695,0,11377.309218,3381.536146,324.950291,12.937261,120,"mbank_training" +"project3938","default",1,3847,119,3417,9,2,100,FALSE,44.5619999999999,19.075341196,11413.369643,15029.814616,2202.384628,586.820894,0,10863.89477,3556.837676,388.669586,26.65143,120,"mbank_training" +"project3938","default",2,3848,119,3408,18,1,100,TRUE,108.633,20.195418378,26861.257588,33098.946527,4015.335393,1157.836335,0,29753.222391,12283.646294,828.075363,0,120,"mbank_training" +"project3938","default",3,3849,119,3413,9,2,100,FALSE,46.6349999999993,25.918489915,12043.938351,14090.206009,1790.431927,539.632353,0,12164.741083,4564.794154,398.036473,187.663721,120,"mbank_training" +"project3938","default",4,3850,119,3408,18,0,100,TRUE,108.391,83.647681499,24710.571406,34061.871319,4085.051645,1121.98776,0,30625.896483,12543.803153,802.23311,62.598762,120,"mbank_training" +"project3938","default",5,3851,119,3405,18,1,100,TRUE,108.735000000001,108.725656538,27374.72291,33415.734207,2665.163714,1469.101328,0,31037.667179,11162.01504,797.550904,62.214628,120,"mbank_training" +"syab07201","default",1,3847,125,14933,12,1,3,TRUE,108.728,12.592821236,13570.480664,8180.130151,3577.144371,1630.401141,0,51213.95676,28665.633721,1232.812355,0,120,"mbank_training" 
+"syab07201","default",2,3848,125,14931,12,1,1,TRUE,108.170999999999,89.087282798,13846.124014,8503.522116,3033.045252,1743.023891,0,56010.682657,23629.967986,1254.266463,0,120,"mbank_training" +"syab07201","default",3,3849,125,14932,12,1,4,TRUE,109.077,82.250482396,14298.112856,7593.533046,3447.583924,1667.188132,0,53409.384596,26372.219385,1269.450238,0,120,"mbank_training" +"syab07201","default",4,3850,125,14948,13,1,3,TRUE,108.708,58.209723111,14448.190969,8478.906994,3301.745774,1760.348587,0,52382.540065,26330.066606,1350.91075,0,120,"mbank_training" +"syab07201","default",5,3851,125,14926,9,1,2,TRUE,108.34,38.305426931,13591.55147,10687.585468,2838.502225,2031.87457,0,52607.030956,24885.943975,1424.417762,0,120,"mbank_training" +"project4133","default",1,3847,131,2371,28,1,100,TRUE,109.487999999999,109.485362629,18339.628663,29653.665094,3505.219405,1074.854075,0,37083.883069,17530.824584,823.678609,0,120,"mbank_training" +"project4133","default",2,3848,131,2379,28,1,100,TRUE,109.438,12.828640628,18298.553113,32054.720869,3214.405102,1234.216956,0,35374.579163,17020.561662,822.438309,0,120,"mbank_training" +"project4133","default",3,3849,131,2378,28,1,100,TRUE,109.021000000001,16.15741946,19005.412995,29148.022626,4032.380029,1146.585333,0,37006.380772,16787.657354,879.708429,0,120,"mbank_training" +"project4133","default",4,3850,131,2372,25,1,100,TRUE,109.190000000001,109.18586107,18321.807637,32304.089003,3145.022262,1071.913018,0,36400.932524,15981.659703,761.298839,0,120,"mbank_training" +"project4133","default",5,3851,131,2376,27,1,100,TRUE,108.601000000001,108.597659452,19342.257079,29311.053978,3138.549354,1310.424968,0,37076.326113,17006.773101,825.497051,0,120,"mbank_training" +"project804","default",1,3847,173,1361,5,1,100,TRUE,119.700000000001,119.687429526,18892.753258,22370.05734,6848.462855,1344.700911,0,38968.418347,18451.002179,1120.532486,0,120,"mbank_training" 
+"project804","default",2,3848,173,1361,6,1,77,TRUE,120.030999999999,120.019861675,19661.854592,29304.260402,6104.284431,1238.988987,0,35207.71569,15866.336486,1093.321497,0,120,"mbank_training" +"project804","default",3,3849,173,1374,6,1,37,TRUE,120.075000000001,120.066968555,20123.993779,21883.447515,4358.387938,1306.7305,0,35922.962296,22978.080261,1861.523875,0,120,"mbank_training" +"project804","default",4,3850,173,1363,5,1,100,TRUE,115.026,115.013226373,19002.039202,30904.950987,10687.222416,1399.585322,0,33172.285515,12033.102396,860.626358,0,120,"mbank_training" +"project804","default",5,3851,173,1363,7,1,100,TRUE,115.235999999999,115.223057538,22282.004866,20105.230448,6611.483638,1417.110018,0,39375.434188,17047.971526,1247.914574,0,120,"mbank_training" +"project4284","default",1,3847,4062,1072,0,1,100,TRUE,349.493,349.46794991,103451.107481,13308.954479,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",2,3848,4062,1322,0,1,100,TRUE,462.067999999999,461.989119379,101258.802999,13033.694143,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",3,3849,4062,1193,0,1,1,TRUE,120.931999999999,120.925018555,108451.106187,12471.801843,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",4,3850,4062,1040,0,1,100,TRUE,333.196,333.186980399,95453.13475,13018.854749,0,0,0,0,0,0,0,120,"mbank_training" +"project4284","default",5,3851,4062,1220,0,1,100,TRUE,279.598,279.591312384,98085.089315,17653.911914,0,0,0,0,0,0,0,120,"mbank_training" diff --git a/dev/benchmarks/t252_setup_and_run.sh b/dev/benchmarks/t252_setup_and_run.sh new file mode 100644 index 000000000..e5fa0866b --- /dev/null +++ b/dev/benchmarks/t252_setup_and_run.sh @@ -0,0 +1,97 @@ +#!/bin/bash +#SBATCH --job-name=t252-mbank +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=8G +#SBATCH --time=10:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t252_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t252_%j.err + +# T-252: MorphoBank training-set baseline benchmark +# Phase 1: 
Install dependencies +# Phase 2: Install TreeSearch +# Phase 3: Run 25 matrices x 3 budgets x 5 seeds = 375 runs + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t252_results + +mkdir -p "$LIB" "$OUTDIR" /nobackup/$USER/TreeSearch/logs + +echo "=== T-252 MorphoBank Training-Set Benchmark ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Phase 1: Install R dependencies +echo "=== Phase 1: Installing R dependencies ===" +export R_LIBS_USER="$LIB" +Rscript -e " + .libPaths(c('$LIB', .libPaths())) + needed <- c('Rcpp', 'ape', 'TreeTools', 'TreeDist', 'Rdpack', + 'cli', 'fastmatch', 'abind', 'colorspace') + missing <- needed[!vapply(needed, requireNamespace, logical(1), quietly = TRUE)] + if (length(missing) > 0) { + cat('Installing:', paste(missing, collapse = ', '), '\n') + install.packages(missing, lib = '$LIB', + repos = 'https://cloud.r-project.org', Ncpus = 1) + } else { + cat('All dependencies already installed\n') + } + # Verify + ok <- vapply(needed, requireNamespace, logical(1), quietly = TRUE) + if (!all(ok)) { + stop('Still missing: ', paste(needed[!ok], collapse = ', ')) + } + cat('All', length(needed), 'dependencies OK\n') +" 2>&1 +rc=$? +if [ $rc -ne 0 ]; then + echo "FATAL: dependency installation failed" + exit 1 +fi + +# Phase 2: Install TreeSearch +echo "" +echo "=== Phase 2: Installing TreeSearch ===" +cd "$REPO" || exit 1 +git pull --ff-only origin cpp-search 2>/dev/null || true +echo "Git HEAD: $(git log --oneline -1)" + +rm -f src/*.o src/*.so +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? 
+echo "Install exit code: $rc" +rm -f TreeSearch_*.tar.gz + +if [ $rc -ne 0 ]; then + echo "FATAL: TreeSearch install failed" + exit 1 +fi + +# Verify neotrans +NEOTRANS=/nobackup/$USER/neotrans/inst/matrices +if [ ! -d "$NEOTRANS" ] || [ "$(ls $NEOTRANS | wc -l)" -eq 0 ]; then + echo "FATAL: neotrans matrices not found or empty at $NEOTRANS" + exit 1 +fi +echo "Neotrans matrices: $(ls $NEOTRANS | wc -l) files" + +# Phase 3: Run benchmark +echo "" +echo "=== Phase 3: Running benchmark ===" +cd "$REPO" +Rscript dev/benchmarks/bench_t252_mbank_training.R "$OUTDIR" 2>&1 + +echo "" +echo "=== Completed: $(date) ===" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t252_*.csv 2>/dev/null diff --git a/dev/benchmarks/t252_v2.sh b/dev/benchmarks/t252_v2.sh new file mode 100644 index 000000000..b9f3ca489 --- /dev/null +++ b/dev/benchmarks/t252_v2.sh @@ -0,0 +1,89 @@ +#!/bin/bash +#SBATCH --job-name=t252-mbank +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=8G +#SBATCH --time=8:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t252_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t252_%j.err + +# T-252: MorphoBank training-set baseline benchmark (v2 — fixed lib paths) +# 25 matrices x 3 budgets (30/60/120s) x 5 seeds = 375 runs (~5 hours) +# +# Uses ts-bench/lib-baseline for all deps (TreeDist, TreeTools, etc.), +# installs only the fresh TreeSearch build into TreeSearch/lib-t252. 
+ +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +FRESH_LIB=/nobackup/$USER/TreeSearch/lib-t252 +DEP_LIB=/nobackup/$USER/ts-bench/lib-baseline +OUTDIR=/nobackup/$USER/TreeSearch/t252_results + +mkdir -p "$FRESH_LIB" "$OUTDIR" /nobackup/$USER/TreeSearch/logs + +echo "=== T-252 MorphoBank Training-Set Benchmark v2 ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "Fresh lib: $FRESH_LIB" +echo "Dep lib: $DEP_LIB" +echo "" + +# Phase 1: Build and install TreeSearch (deps resolved from DEP_LIB) +echo "=== Building TreeSearch from cpp-search ===" +cd "$REPO" || exit 1 +git pull --ff-only origin cpp-search 2>/dev/null || true +echo "Git HEAD: $(git log --oneline -1)" + +rm -f src/*.o src/*.so +TMPBUILD=$(mktemp -d) +(cd "$TMPBUILD" && R CMD build --no-build-vignettes --no-manual --no-resave-data "$REPO") + +# Install using both libs so R can find TreeSearch's Imports during install +export R_LIBS="$FRESH_LIB:$DEP_LIB" +R CMD INSTALL --library="$FRESH_LIB" "$TMPBUILD"/TreeSearch_*.tar.gz +rc=$? +rm -rf "$TMPBUILD" +echo "Install exit code: $rc" + +if [ $rc -ne 0 ]; then + echo "FATAL: TreeSearch install failed" + exit 1 +fi + +# Verify the install loaded correctly +Rscript -e " + .libPaths(c('$FRESH_LIB', '$DEP_LIB', .libPaths())) + library(TreeSearch) + cat('TreeSearch version:', as.character(packageVersion('TreeSearch')), '\n') +" +rc=$? +if [ $rc -ne 0 ]; then + echo "FATAL: TreeSearch failed to load" + exit 1 +fi + +# Phase 2: Verify neotrans corpus +NEOTRANS=/nobackup/$USER/neotrans/inst/matrices +if [ ! 
-d "$NEOTRANS" ] || [ "$(ls $NEOTRANS | wc -l)" -eq 0 ]; then + echo "FATAL: neotrans matrices not found at $NEOTRANS" + exit 1 +fi +echo "Neotrans matrices: $(ls $NEOTRANS | wc -l) files" + +# Phase 3: Run benchmark +echo "" +echo "=== Running benchmark ===" +cd "$REPO" +export R_LIBS="$FRESH_LIB:$DEP_LIB" +Rscript dev/benchmarks/bench_t252_mbank_training.R "$OUTDIR" 2>&1 + +echo "" +echo "=== Completed: $(date) ===" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t252_*.csv 2>/dev/null diff --git a/dev/benchmarks/t253_conv_gap_mbank.csv b/dev/benchmarks/t253_conv_gap_mbank.csv new file mode 100644 index 000000000..6392d9c22 --- /dev/null +++ b/dev/benchmarks/t253_conv_gap_mbank.csv @@ -0,0 +1,26 @@ +dataset,s_30,s_120,conv_gap,ntax,nchar,n_patterns,pct_missing,pct_inapp +project2084_(1),28962,28724,238,86,3660,3601,20.9,24.9 +project2184,564,564,0,114,205,168,1.7,2.5 +project2346,317,317,0,23,144,141,18,28.5 +project2451,731,731,0,24,380,367,54.5,0 +project2762,259,259,0,29,187,177,34.8,17.1 +project2771,1055,1049,6,94,124,123,1,30 +project3688,850,851,-1,60,245,245,57.3,0 +project3938,3416,3408,8,119,677,677,52.6,4.3 +project4049,5239,5237,2,60,721,719,22.2,0 +project4133,2377,2376,1,131,349,349,31.3,6 +project4146_(3),261,261,0,59,130,130,18.1,45.6 +project423,495,495,0,60,253,219,12.2,15.4 +project4284,1268,1193,75,4062,27,27,82.9,2.6 +project4286,283,282,1,63,135,135,18.7,46.7 +project4359,183,183,0,71,245,146,83.8,3.1 +project4397,1648,1646,2,75,223,222,32.3,4.6 +project4501,118,118,0,24,42,41,4.1,13.1 +project532,1139,1139,0,21,674,427,15.9,2 +project561,1169,1169,0,34,356,329,5,9.6 +project571,634,634,0,42,125,125,16.8,4.2 +project804,1373,1363,10,173,589,569,32.8,30.9 +project826,431,431,0,33,218,213,61.7,0.1 +project944,128,128,0,25,72,72,17.2,0.9 +project971_(1),157,157,0,26,101,73,53.3,0.5 +syab07201,14953,14932,21,125,2954,2813,28.3,0 diff --git a/dev/benchmarks/t253_gap_characterization.md b/dev/benchmarks/t253_gap_characterization.md new file mode 
100644 index 000000000..c1ed84fbb --- /dev/null +++ b/dev/benchmarks/t253_gap_characterization.md @@ -0,0 +1,110 @@ +# T-253: Gap Characterization by Dataset Features + +**Date:** 2026-03-27 +**Agent:** F +**Data sources:** +- `t265_results/t265_phase1_20260326_1617.csv` — 8 named datasets, fitch_mode EW, 120s (TNT vs TreeSearch, apples-to-apples) +- `t252_mbank_*` CSVs — 25 MorphoBank training matrices, TreeSearch 30/60/120s (convergence proxy) + +--- + +## Summary + +**ntax is the primary predictor of search difficulty** in both analyses (Spearman ρ ≈ 0.63). +At ≤60 taxa with modest character counts, TreeSearch converges fully at 30s. +Difficulty increases steadily above ~75 taxa and becomes acute above ~120 taxa. + +Character count (nchar) matters only at extremes (e.g. 3660 chars, 2954 chars); +pct_missing and pct_inapp show moderate individual correlations (ρ = 0.49–0.55 in T-265) +but inconsistent signal in the MorphoBank sample — small samples mean these +correlations are unreliable beyond the ntax signal. + +--- + +## TNT comparison gaps (T-265, fitch_mode, 120s, 8 datasets) + +These are the only reliable apples-to-apples gaps (Fitch TreeSearch vs TNT Fitch). + +| Dataset | ntax | nchar | pct_missing | pct_inapp | median_gap | +|---------|-----:|------:|:-----------:|:---------:|:----------:| +| Zanol2014 | 74 | 213 | 11.7% | 16.6% | **3** | +| Zhu2013 | 75 | 253 | 42.6% | 12.4% | **3** | +| Conrad2008 | 64 | 363 | 23.4% | 5.1% | 2 | +| Giles2015 | 78 | 236 | 41.5% | 11.8% | 2 | +| OMeara2014 | 63 | 317 | 43.4% | 5.4% | 2 | +| Liljeblad2008 | 68 | 308 | 5.2% | 5.6% | 0 | +| Wetterer2000 | 63 | 150 | 21.2% | 7.7% | 0 | +| Wilson2003 | 61 | 165 | 7.7% | 8.6% | 0 | + +Spearman correlations with `median_gap`: + +| Feature | ρ | +|---------|:-:| +| ntax | 0.63 | +| pct_missing | 0.55 | +| pct_inapp | 0.49 | +| nchar | 0.28 | +| n_patterns | 0.28 | + +**Note:** n=8 is too small for reliable multivariate analysis. 
The pct_missing/pct_inapp +signals may be confounded with ntax (larger datasets often have more missing data). + +--- + +## Convergence gaps (T-252, MorphoBank 25 matrices, 30s → 120s improvement) + +Most matrices converge fully at 30s (gap=0). Non-zero gap datasets: + +| Dataset | ntax | nchar | pct_miss | pct_inapp | conv_gap | +|---------|-----:|------:|:--------:|:---------:|:--------:| +| project2084_(1) | 86 | **3660** | 20.9% | 24.9% | 238 | +| project4284 | **4062** | 27 | 82.9% | 2.6% | 75 | +| syab07201 | 125 | 2954 | 28.3% | 0% | 21 | +| project804 | 173 | 589 | 32.8% | 30.9% | 10 | +| project3938 | 119 | 677 | 52.6% | 4.3% | 8 | +| project2771 | 94 | 124 | 1.0% | 30.0% | 6 | +| (others) | ≤131 | ≤721 | | | ≤2 | + +Spearman correlations with `conv_gap` (n=23, excluding 2 extreme outliers): + +| Feature | ρ | +|---------|:-:| +| ntax | **0.64** | +| n_patterns | 0.34 | +| pct_inapp | 0.36 | +| nchar | 0.30 | +| pct_missing | −0.04 | + +--- + +## Key findings + +1. **ntax is the dominant difficulty predictor** (consistent ρ ≈ 0.63 across two + independent datasets/metrics). The hard wall is around 75–130 taxa under the + current strategy presets. + +2. **nchar matters only at extremes.** project2084_(1) (86t, 3660c) has the largest + absolute convergence gap despite modest ntax — the 3660-character search space + is simply too large per-replicate. syab07201 (125t, 2954c) similarly. + +3. **Missing data and inapplicable characters** show moderate correlations in T-265 + but not in T-252. This likely reflects a confound with ntax (larger datasets often + have more missing data in MorphoBank matrices), not an independent effect. + +4. **Most datasets are already covered** (≤60 taxa, ≤700 chars): 19 of 25 MorphoBank + training matrices, including every dataset with ≤60 taxa, are within 2 steps of their + 120s score at 30s. TreeSearch's CRAN benchmark suite (14 datasets, ≤88 taxa) is well-covered.
+ +--- + +## Strategic implications for T-253 + +| Priority | Action | Targets | +|----------|--------|---------| +| High | **T-245: TBR batching** — reduce per-candidate evaluation cost | ≥75 taxa (nchar moderate) | +| High | **NNI escalation** (already in presets via `nniFirst=TRUE`) | ≥75 taxa | +| Medium | **Character batching / lazy scoring** for high-nchar datasets | ≥1000 chars | +| Low | Missing/inapplicable tuning | Not independently predictive | + +The clearest opportunity is the ≥75-taxon regime. T-245 (TBR candidate batching, +estimated ~13% gain) is the highest-value next step for search quality at scale. diff --git a/dev/benchmarks/t253_gap_features_t265.csv b/dev/benchmarks/t253_gap_features_t265.csv new file mode 100644 index 000000000..232220fd4 --- /dev/null +++ b/dev/benchmarks/t253_gap_features_t265.csv @@ -0,0 +1,9 @@ +dataset,median_gap,n_taxa,n_chars,n_patterns,n_levels,pct_missing,pct_inapp +Conrad2008,2,64,363,360,7,0.23390151515151514,0.050576790633608815 +Giles2015,2,78,236,236,3,0.41536288570186874,0.11777488048674489 +Liljeblad2008,0,68,308,299,7,0.05213903743315508,0.05576776165011459 +OMeara2014,2,63,317,315,5,0.4338290521255821,0.05437884933153072 +Wetterer2000,0,63,150,145,6,0.21206349206349207,0.07661375661375662 +Wilson2003,0,61,165,161,5,0.07660208643815201,0.08614008941877795 +Zanol2014,3,74,213,210,8,0.11737089201877934,0.16565156706001777 +Zhu2013,3,75,253,253,3,0.42582345191040843,0.12442687747035573 diff --git a/dev/benchmarks/t265_hamilton.sh b/dev/benchmarks/t265_hamilton.sh new file mode 100644 index 000000000..84b393004 --- /dev/null +++ b/dev/benchmarks/t265_hamilton.sh @@ -0,0 +1,56 @@ +#!/bin/bash +#SBATCH --job-name=t265-regression +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=8G +#SBATCH --time=6:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t265_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t265_%j.err + +# T-265: Per-replicate quality regression diagnosis +# 3 configs x 9 datasets x 5 seeds x 120s 
= ~135 runs x ~120s = ~4.5 hours + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t265_results + +mkdir -p "$LIB" +mkdir -p "$OUTDIR" + +echo "=== T-265 Hamilton job ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Build and install from latest cpp-search +cd "$REPO" || exit 1 +echo "Git HEAD: $(git log --oneline -1)" + +rm -f src/*.o src/*.so +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? +echo "Install exit code: $rc" +rm -f TreeSearch_*.tar.gz + +if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 +fi + +# Run benchmark +cd "$OUTDIR" +# Rscript -e EXPR treats a following script path as a plain argument and never runs it, so expose $LIB via R_LIBS and invoke the script directly +R_LIBS="$LIB:${R_LIBS}" Rscript "$REPO/dev/benchmarks/bench_t265_regression.R" 120 "$OUTDIR" + +echo "" +echo "Completed: $(date)" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t265_*.csv 2>/dev/null diff --git a/dev/benchmarks/t269_hamilton.sh b/dev/benchmarks/t269_hamilton.sh new file mode 100644 index 000000000..ea779df78 --- /dev/null +++ b/dev/benchmarks/t269_hamilton.sh @@ -0,0 +1,82 @@ +#!/bin/bash +#SBATCH --job-name=t269-interleave +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=4G +#SBATCH --time=4:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t269_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t269_%j.err + +# T-269: Fine-grained sectorial interleaving benchmark +# +# 5 configs × 4 datasets × 5 seeds × {30s, 60s} = 200 runs × ~45s avg ≈ 2.5h +# +# Usage: +# sbatch t269_hamilton.sh # 30s budget +# sbatch t269_hamilton.sh 60 # 60s budget + +TIMEOUT=${1:-30} + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t269_results + +mkdir -p
"$LIB" +mkdir -p "$OUTDIR" +mkdir -p /nobackup/$USER/TreeSearch/logs + +echo "=== T-269 Hamilton job ===" +echo "Timeout: ${TIMEOUT}s" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Install CRAN dependencies into local lib (if missing) +echo "Checking/installing CRAN dependencies..." +Rscript --no-save -e " + lib <- '$LIB' + .libPaths(c(lib, .libPaths())) + pkgs <- c('ape', 'cli', 'inapplicable', 'phangorn', 'Rdpack', 'TreeDist', 'TreeTools') + need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] + if (length(need) > 0) { + message('Installing: ', paste(need, collapse = ', ')) + install.packages(need, lib = lib, repos = 'https://cloud.r-project.org', quiet = TRUE) + } else { + message('All dependencies present.') + } +" + +# Build and install from latest cpp-search +cd "$REPO" || exit 1 +git fetch origin cpp-search +git pull --ff-only origin cpp-search || git reset --hard origin/cpp-search +echo "Git HEAD: $(git log --oneline -1)" + +rm -f src/*.o src/*.so +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? 
+echo "Install exit code: $rc" +rm -f TreeSearch_*.tar.gz + +if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 +fi + +# Run benchmark +cd "$OUTDIR" +export R_LIBS_USER="$LIB" +Rscript "$REPO/dev/benchmarks/bench_t269_interleaving.R" "$TIMEOUT" "$OUTDIR" + +echo "" +echo "Completed: $(date)" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t269_*.csv 2>/dev/null diff --git a/dev/benchmarks/t289_hamilton.sh b/dev/benchmarks/t289_hamilton.sh new file mode 100644 index 000000000..454869306 --- /dev/null +++ b/dev/benchmarks/t289_hamilton.sh @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH --job-name=t289-prune-ri +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=4G +#SBATCH --time=8:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t289_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t289_%j.err + +# T-289: Prune-reinsert perturbation benchmark +# +# Stage 1: 13 configs × 5 datasets × 5 seeds × 30s ≈ 325 runs × ~30s ≈ 2.7h +# Stage 2: ~10 configs × 5 datasets × 5 seeds × {30s,60s} ≈ 500 runs × ~45s ≈ 6.3h +# +# Usage: +# sbatch t289_hamilton.sh # runs stage 1 (30s) +# sbatch t289_hamilton.sh 2 30 # stage 2, 30s budget +# sbatch t289_hamilton.sh 2 60 # stage 2, 60s budget + +STAGE=${1:-1} +TIMEOUT=${2:-30} + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t289_results +export R_LIBS="$LIB:${R_LIBS}" + +mkdir -p "$LIB" +mkdir -p "$OUTDIR" +mkdir -p /nobackup/$USER/TreeSearch/logs + +echo "=== T-289 Hamilton job ===" +echo "Stage: $STAGE, Timeout: ${TIMEOUT}s" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Install CRAN dependencies into local lib (if missing) +echo "Checking/installing CRAN dependencies..." 
+Rscript --no-save -e " + lib <- '$LIB' + .libPaths(c(lib, .libPaths())) + pkgs <- c('abind', 'ape', 'cli', 'colorspace', 'fastmatch', 'Rdpack', 'TreeDist', 'TreeTools') + need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] + if (length(need) > 0) { + message('Installing: ', paste(need, collapse = ', ')) + install.packages(need, lib = lib, repos = 'https://cloud.r-project.org', + dependencies = NA, quiet = TRUE) # NA = Imports+Depends only + } else { + message('All dependencies present.') + } +" + +# Build and install from latest cpp-search +cd "$REPO" || exit 1 +git fetch origin cpp-search +git pull --ff-only origin cpp-search || git reset --hard origin/cpp-search +echo "Git HEAD: $(git log --oneline -1)" + +rm -f src/*.o src/*.so +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? +echo "Install exit code: $rc" +rm -f TreeSearch_*.tar.gz + +if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 +fi + +# Run benchmark +cd "$OUTDIR" +Rscript "$REPO/dev/benchmarks/bench_prune_reinsert.R" "$STAGE" "$TIMEOUT" "$OUTDIR" + +echo "" +echo "Completed: $(date)" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t289_*.csv 2>/dev/null diff --git a/dev/benchmarks/t289b_brazeau_hamilton.sh b/dev/benchmarks/t289b_brazeau_hamilton.sh new file mode 100644 index 000000000..065a25a26 --- /dev/null +++ b/dev/benchmarks/t289b_brazeau_hamilton.sh @@ -0,0 +1,103 @@ +#!/bin/bash +#SBATCH --job-name=t289b-brazeau +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=4G +#SBATCH --time=8:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t289b_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t289b_%j.err + +# T-289b: Prune-reinsert benchmark — Brazeau (default) scoring +# +# Parallel companion to t289_hamilton.sh (Fitch/EW mode). +# Uses TreeSearch's default Brazeau et al. (2019) inapplicable scoring. 
+# Shares the same build artifact as the Fitch job — no rebuild needed +# if t289_hamilton.sh has already installed TreeSearch in $LIB. +# +# Stage 1: 13 configs x 5 datasets x 5 seeds x 30s ≈ 325 runs ≈ 2.7h +# Stage 2: ~10 configs x 5 datasets x 5 seeds x {30s,60s} ≈ 500 runs ≈ 6.3h +# +# Usage: +# sbatch t289b_brazeau_hamilton.sh # stage 1, 30s +# sbatch t289b_brazeau_hamilton.sh 2 30 # stage 2, 30s +# sbatch t289b_brazeau_hamilton.sh 2 60 # stage 2, 60s + +STAGE=${1:-1} +TIMEOUT=${2:-30} + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t289b_results +export R_LIBS="$LIB:${R_LIBS}" + +mkdir -p "$LIB" +mkdir -p "$OUTDIR" +mkdir -p /nobackup/$USER/TreeSearch/logs + +echo "=== T-289b Hamilton job (Brazeau scoring) ===" +echo "Stage: $STAGE, Timeout: ${TIMEOUT}s" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Install CRAN dependencies into local lib (if missing) +echo "Checking/installing CRAN dependencies..." +Rscript --no-save -e " + lib <- '$LIB' + .libPaths(c(lib, .libPaths())) + pkgs <- c('abind', 'ape', 'cli', 'colorspace', 'fastmatch', 'Rdpack', 'TreeDist', 'TreeTools') + need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] + if (length(need) > 0) { + message('Installing: ', paste(need, collapse = ', ')) + install.packages(need, lib = lib, repos = 'https://cloud.r-project.org', + dependencies = NA, quiet = TRUE) + } else { + message('All dependencies present.') + } +" + +# Build and install from latest cpp-search +# (Rebuild only when TreeSearch is missing from $LIB or its installed version differs from +# DESCRIPTION's Version field. NOTE: new commits that do not bump Version do NOT trigger a rebuild.)
+cd "$REPO" || exit 1 +git fetch origin cpp-search +git pull --ff-only origin cpp-search || git reset --hard origin/cpp-search +echo "Git HEAD: $(git log --oneline -1)" + +INSTALLED_VER=$(Rscript --no-save -e \ + ".libPaths(c('$LIB', .libPaths())); cat(as.character(packageVersion('TreeSearch')))" \ + 2>/dev/null || echo "none") +REPO_VER=$(grep '^Version:' DESCRIPTION | awk '{print $2}') +echo "Installed: $INSTALLED_VER Repo: $REPO_VER" + +if [ "$INSTALLED_VER" != "$REPO_VER" ]; then + echo "Rebuilding TreeSearch..." + rm -f src/*.o src/*.so + R CMD build --no-build-vignettes --no-manual --no-resave-data . + R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz + rc=$? + echo "Install exit code: $rc" + rm -f TreeSearch_*.tar.gz + if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 + fi +else + echo "TreeSearch already up to date; skipping rebuild." +fi + +# Run benchmark +cd "$OUTDIR" +Rscript "$REPO/dev/benchmarks/bench_prune_reinsert_brazeau.R" "$STAGE" "$TIMEOUT" "$OUTDIR" + +echo "" +echo "Completed: $(date)" +echo "Results in: $OUTDIR" +ls -la "$OUTDIR"/t289b_*.csv 2>/dev/null diff --git a/dev/benchmarks/t289c_stage2_hamilton.sh b/dev/benchmarks/t289c_stage2_hamilton.sh new file mode 100644 index 000000000..400af6e3b --- /dev/null +++ b/dev/benchmarks/t289c_stage2_hamilton.sh @@ -0,0 +1,96 @@ +#!/bin/bash +#SBATCH --job-name=t289c-pr-s2 +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=6G +#SBATCH --time=3:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t289c_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t289c_%j.err + +# T-289c: Prune-reinsert Stage 2 — mbank_X30754 only, 60s budget +# +# Stage 1 (5 datasets × 13 configs × 5 seeds × 30s) verdict: +# - ≤88t: PR net-negative (replicate cost >> score gain). Not tested here. +# - 180t: Real signal. pr_c5_d10 most consistent (5/5 seeds, mean −6.6 steps). +# +# Stage 2 grid: 9 configs × 1 dataset × 10 seeds = 90 runs × ~65s ≈ 98 min. +# SBATCH --time=3:00:00 provides comfortable margin. 
+# +# Usage: +# sbatch t289c_stage2_hamilton.sh [timeout_s] +# Default timeout: 60s + +TIMEOUT=${1:-60} + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t289c_results +export R_LIBS="$LIB:${R_LIBS}" + +mkdir -p "$LIB" "$OUTDIR" /nobackup/$USER/TreeSearch/logs + +echo "=== T-289c Hamilton job (PR Stage 2) ===" +echo "Timeout: ${TIMEOUT}s" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Install CRAN dependencies if missing +echo "Checking CRAN dependencies..." +Rscript --no-save -e " + lib <- '$LIB' + .libPaths(c(lib, .libPaths())) + pkgs <- c('abind', 'ape', 'cli', 'colorspace', 'fastmatch', 'Rdpack', 'TreeDist', 'TreeTools') + need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] + if (length(need) > 0) { + message('Installing: ', paste(need, collapse = ', ')) + install.packages(need, lib = lib, repos = 'https://cloud.r-project.org', + dependencies = NA, quiet = TRUE) + } else { + message('All dependencies present.') + } +" + +# Build and install TreeSearch from cpp-search +cd "$REPO" || exit 1 +git fetch origin cpp-search +git pull --ff-only origin cpp-search || git reset --hard origin/cpp-search +echo "Git HEAD: $(git log --oneline -1)" + +INSTALLED_VER=$(Rscript --no-save -e \ + ".libPaths(c('$LIB', .libPaths())); cat(as.character(packageVersion('TreeSearch')))" \ + 2>/dev/null || echo "none") +REPO_VER=$(grep '^Version:' DESCRIPTION | awk '{print $2}') +echo "Installed: $INSTALLED_VER Repo: $REPO_VER" + +if [ "$INSTALLED_VER" != "$REPO_VER" ]; then + echo "Rebuilding TreeSearch..." + rm -f src/*.o src/*.so + R CMD build --no-build-vignettes --no-manual --no-resave-data . + R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz + rc=$? 
+ rm -f TreeSearch_*.tar.gz + echo "Install exit code: $rc" + if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 + fi +else + echo "TreeSearch already up to date; skipping rebuild." +fi + +# Run benchmark +cd "$OUTDIR" +Rscript "$REPO/dev/benchmarks/bench_pr_stage2_mbank.R" "$TIMEOUT" "$OUTDIR" + +echo "" +echo "Completed: $(date)" +echo "Results in: $OUTDIR" +ls -lh "$OUTDIR"/t289c_*.csv 2>/dev/null diff --git a/dev/benchmarks/t289d_stage3_hamilton.sh b/dev/benchmarks/t289d_stage3_hamilton.sh new file mode 100644 index 000000000..e605f62a7 --- /dev/null +++ b/dev/benchmarks/t289d_stage3_hamilton.sh @@ -0,0 +1,81 @@ +#!/bin/bash +#SBATCH --job-name=t289d-pr-s3 +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=6G +#SBATCH --time=3:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t289d_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t289d_%j.err + +# T-289d: Prune-reinsert Stage 3 — new drop criteria (MISSING, COMBINED) +# +# Requires TreeSearch >= commit 1ce5e12e (feat: MISSING+COMBINED criteria). +# +# Grid: 8 configs × 1 dataset × 10 seeds × 60s ≈ 87 min. +# SBATCH --time=3:00:00 provides comfortable margin. 
+# +# Usage: +# sbatch t289d_stage3_hamilton.sh [timeout_s] +# Default: 60s + +TIMEOUT=${1:-60} + +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t289d_results +export R_LIBS="$LIB:${R_LIBS}" + +mkdir -p "$LIB" "$OUTDIR" /nobackup/$USER/TreeSearch/logs + +echo "=== T-289d Hamilton job (PR Stage 3 — new criteria) ===" +echo "Timeout: ${TIMEOUT}s" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Install CRAN dependencies if missing +Rscript --no-save -e " + lib <- '$LIB' + .libPaths(c(lib, .libPaths())) + pkgs <- c('abind', 'ape', 'cli', 'colorspace', 'fastmatch', 'Rdpack', 'TreeDist', 'TreeTools') + need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] + if (length(need) > 0) { + message('Installing: ', paste(need, collapse = ', ')) + install.packages(need, lib = lib, repos = 'https://cloud.r-project.org', + dependencies = NA, quiet = TRUE) + } else { message('All dependencies present.') } +" + +# Always rebuild — Stage 3 requires the new MISSING/COMBINED criteria +# (commit 1ce5e12e on cpp-search). +cd "$REPO" || exit 1 +git fetch origin cpp-search +git pull --ff-only origin cpp-search || git reset --hard origin/cpp-search +echo "Git HEAD: $(git log --oneline -1)" + +echo "Rebuilding TreeSearch (new criteria require recompile)..." +rm -f src/*.o src/*.so src/*.dll +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? 
+rm -f TreeSearch_*.tar.gz +echo "Install exit code: $rc" +if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 +fi + +# Run benchmark +cd "$OUTDIR" +Rscript "$REPO/dev/benchmarks/bench_pr_stage3_mbank.R" "$TIMEOUT" "$OUTDIR" + +echo "" +echo "Completed: $(date)" +ls -lh "$OUTDIR"/t289d_*.csv 2>/dev/null diff --git a/dev/benchmarks/t289e_stage4_hamilton.sh b/dev/benchmarks/t289e_stage4_hamilton.sh new file mode 100644 index 000000000..2d7a89a83 --- /dev/null +++ b/dev/benchmarks/t289e_stage4_hamilton.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH --job-name=t289e-pr-s4 +#SBATCH -p shared +#SBATCH -n 1 +#SBATCH --mem=8G +#SBATCH --time=8:00:00 +#SBATCH --output=/nobackup/%u/TreeSearch/logs/t289e_%j.out +#SBATCH --error=/nobackup/%u/TreeSearch/logs/t289e_%j.err + +# T-289e: Prune-reinsert Stage 4 — multi-dataset validation +# +# Validates that PR (c=5, d=5%, MISSING) benefit generalises across 5 large-tree +# matrices (131-206 tips) and persists at 120s budget. +# +# Grid: 5 datasets × 2 configs × 2 budgets × 10 seeds = 200 runs +# Expected wall time: ~5h; 8h limit provides comfortable margin. 
+ +module load r/4.5.1 +module load gcc/14.2 + +export OMP_NUM_THREADS=1 +export OPENBLAS_NUM_THREADS=1 + +REPO=/nobackup/$USER/TreeSearch-a +LIB=/nobackup/$USER/TreeSearch/lib +OUTDIR=/nobackup/$USER/TreeSearch/t289e_results +export R_LIBS="$LIB:${R_LIBS}" + +mkdir -p "$LIB" "$OUTDIR" /nobackup/$USER/TreeSearch/logs + +echo "=== T-289e Hamilton job (PR Stage 4 — multi-dataset validation) ===" +echo "Job ID: $SLURM_JOB_ID" +echo "Node: $(hostname)" +echo "Started: $(date)" +echo "" + +# Install CRAN dependencies if missing +Rscript --no-save -e " + lib <- '$LIB' + .libPaths(c(lib, .libPaths())) + pkgs <- c('abind', 'ape', 'cli', 'colorspace', 'fastmatch', 'Rdpack', 'TreeDist', 'TreeTools') + need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)] + if (length(need) > 0) { + message('Installing: ', paste(need, collapse = ', ')) + install.packages(need, lib = lib, repos = 'https://cloud.r-project.org', + dependencies = NA, quiet = TRUE) + } else { message('All dependencies present.') } +" + +# Rebuild — Stage 4 runs the large preset with PR (commit in cpp-search) +cd "$REPO" || exit 1 +git fetch origin cpp-search +git pull --ff-only origin cpp-search || git reset --hard origin/cpp-search +echo "Git HEAD: $(git log --oneline -1)" + +echo "Rebuilding TreeSearch..." +rm -f src/*.o src/*.so src/*.dll +R CMD build --no-build-vignettes --no-manual --no-resave-data . +R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz +rc=$? 
+rm -f TreeSearch_*.tar.gz +echo "Install exit code: $rc" +if [ $rc -ne 0 ]; then + echo "FATAL: install failed" + exit 1 +fi + +# Run benchmark +cd "$OUTDIR" +Rscript "$REPO/dev/benchmarks/bench_pr_stage4_validation.R" "$OUTDIR" + +echo "" +echo "Completed: $(date)" +ls -lh "$OUTDIR"/t289e_*.csv 2>/dev/null diff --git a/dev/benchmarks/t289f_pr_nni_polish.csv b/dev/benchmarks/t289f_pr_nni_polish.csv new file mode 100644 index 000000000..141a7e828 --- /dev/null +++ b/dev/benchmarks/t289f_pr_nni_polish.csv @@ -0,0 +1,593 @@ +'dataset','n_tips','n_patterns','config','seed','timeout_s','score','n_trees','replicates','hits','wall_s','pr_cycles','pr_nni' +'mbank_X30754',180,425,'baseline',9,60,1197,100,4,1,56.715,0,0 +'mbank_X30754',180,425,'baseline',1,60,1177,100,2,1,59.092,0,0 +'mbank_X30754',180,425,'baseline',10,60,1176,100,3,1,55.062,0,0 +'mbank_X30754',180,425,'baseline',2,60,1190,100,2,1,54.989,0,0 +'mbank_X30754',180,425,'pr_nni',1,60,1180,100,2,1,55.082,5,1 +'mbank_X30754',180,425,'baseline',3,60,1222,100,2,1,54.731,0,0 +'mbank_X30754',180,425,'pr_nni',2,60,1179,100,2,1,57.167,5,1 +'mbank_X30754',180,425,'baseline',4,60,1177,100,2,1,57.861,0,0 +'mbank_X30754',180,425,'pr_nni',3,60,1179,100,2,1,55.015,5,1 +'mbank_X30754',180,425,'baseline',5,60,1194,100,3,1,55.508,0,0 +'mbank_X30754',180,425,'pr_nni',4,60,1199,100,2,1,55.457,5,1 +'mbank_X30754',180,425,'baseline',6,60,1197,100,3,1,55.868,0,0 +'mbank_X30754',180,425,'pr_nni',5,60,1179,100,2,1,56.435,5,1 +'mbank_X30754',180,425,'baseline',7,60,1185,100,2,1,56.900,0,0 +'mbank_X30754',180,425,'pr_nni',6,60,1217,100,2,1,59.724,5,1 +'mbank_X30754',180,425,'baseline',8,60,1189,100,2,1,54.543,0,0 +'mbank_X30754',180,425,'pr_nni',7,60,1176,100,2,1,55.750,5,1 +'mbank_X30754',180,425,'baseline',9,60,1204,100,2,1,55.588,0,0 +'mbank_X30754',180,425,'pr_nni',8,60,1180,100,2,1,55.757,5,1 +'mbank_X30754',180,425,'baseline',10,60,1192,100,2,1,55.061,0,0 +'mbank_X30754',180,425,'pr_nni',9,60,1204,100,2,2,55.467,5,1 
+'mbank_X30754',180,425,'pr_nni',1,60,1185,100,2,1,55.101,5,1 +'mbank_X30754',180,425,'pr_nni',10,60,1188,100,2,1,54.681,5,1 +'mbank_X30754',180,425,'pr_nni',2,60,1176,100,2,1,55.528,5,1 +'mbank_X30754',180,425,'pr_tbr',1,60,1193,100,2,1,56.211,5,0 +'mbank_X30754',180,425,'pr_nni',3,60,1183,100,2,1,55.575,5,1 +'mbank_X30754',180,425,'pr_tbr',2,60,1198,100,2,1,55.143,5,0 +'mbank_X30754',180,425,'pr_nni',4,60,1191,100,2,1,56.851,5,1 +'mbank_X30754',180,425,'pr_tbr',3,60,1196,100,2,1,54.840,5,0 +'mbank_X30754',180,425,'pr_nni',5,60,1192,100,2,1,58.001,5,1 +'mbank_X30754',180,425,'pr_tbr',4,60,1209,100,2,1,54.656,5,0 +'mbank_X30754',180,425,'pr_nni',6,60,1191,100,2,1,56.559,5,1 +'mbank_X30754',180,425,'pr_tbr',5,60,1186,100,2,1,54.891,5,0 +'mbank_X30754',180,425,'pr_nni',7,60,1186,100,2,1,57.042,5,1 +'mbank_X30754',180,425,'pr_tbr',6,60,1180,100,2,1,55.273,5,0 +'mbank_X30754',180,425,'pr_nni',8,60,1189,100,2,1,55.129,5,1 +'mbank_X30754',180,425,'pr_tbr',7,60,1203,100,2,1,55.910,5,0 +'mbank_X30754',180,425,'pr_nni',9,60,1220,100,2,1,58.170,5,1 +'mbank_X30754',180,425,'pr_tbr',8,60,1195,100,2,1,55.233,5,0 +'mbank_X30754',180,425,'pr_nni',10,60,1202,100,2,1,56.174,5,1 +'mbank_X30754',180,425,'pr_tbr',9,60,1183,100,2,1,55.821,5,0 +'mbank_X30754',180,425,'pr_tbr',1,60,1185,100,2,1,56.862,5,0 +'mbank_X30754',180,425,'pr_tbr',10,60,1196,100,2,2,55.204,5,0 +'mbank_X30754',180,425,'pr_tbr',2,60,1192,100,2,1,54.971,5,0 +'mbank_X30754',180,425,'pr_tbr',3,60,1176,100,2,1,57.297,5,0 +'mbank_X30754',180,425,'baseline',1,120,1185,100,7,1,111.575,0,0 +'mbank_X30754',180,425,'pr_tbr',4,60,1184,100,2,1,54.973,5,0 +'mbank_X30754',180,425,'pr_tbr',5,60,1209,100,2,1,58.387,5,0 +'mbank_X30754',180,425,'baseline',2,120,1184,100,7,1,109.196,0,0 +'mbank_X30754',180,425,'pr_tbr',6,60,1185,100,2,1,55.455,5,0 +'mbank_X30754',180,425,'pr_tbr',7,60,1194,100,2,1,55.388,5,0 +'mbank_X30754',180,425,'baseline',3,120,1178,100,8,1,108.885,0,0 +'mbank_X30754',180,425,'pr_tbr',8,60,1179,100,2,1,55.061,5,0 
+'mbank_X30754',180,425,'pr_tbr',9,60,1193,100,2,1,54.979,5,0 +'mbank_X30754',180,425,'baseline',4,120,1193,100,8,1,109.054,0,0 +'mbank_X30754',180,425,'pr_tbr',10,60,1192,100,2,1,57.003,5,0 +'mbank_X30754',180,425,'baseline',5,120,1190,100,8,1,108.889,0,0 +'mbank_X30754',180,425,'baseline',1,120,1178,100,7,1,109.109,0,0 +'mbank_X30754',180,425,'baseline',6,120,1182,100,8,1,109.471,0,0 +'mbank_X30754',180,425,'baseline',2,120,1184,100,6,1,109.635,0,0 +'mbank_X30754',180,425,'baseline',7,120,1183,100,8,1,118.883,0,0 +'mbank_X30754',180,425,'baseline',3,120,1167,100,6,1,108.920,0,0 +'mbank_X30754',180,425,'baseline',8,120,1172,100,8,1,110.093,0,0 +'mbank_X30754',180,425,'baseline',4,120,1179,100,7,1,110.132,0,0 +'mbank_X30754',180,425,'baseline',9,120,1180,100,7,1,111.729,0,0 +'mbank_X30754',180,425,'baseline',5,120,1186,100,7,1,109.305,0,0 +'mbank_X30754',180,425,'baseline',10,120,1187,100,8,1,108.743,0,0 +'mbank_X30754',180,425,'baseline',6,120,1183,100,6,1,108.823,0,0 +'mbank_X30754',180,425,'pr_nni',1,120,1179,100,6,1,109.762,5,1 +'mbank_X30754',180,425,'baseline',7,120,1165,100,6,1,109.404,0,0 +'mbank_X30754',180,425,'pr_nni',2,120,1186,100,6,1,108.679,5,1 +'mbank_X30754',180,425,'baseline',8,120,1183,100,7,1,112.141,0,0 +'mbank_X30754',180,425,'pr_nni',3,120,1171,100,6,1,111.128,5,1 +'mbank_X30754',180,425,'baseline',9,120,1182,100,6,1,110.592,0,0 +'mbank_X30754',180,425,'pr_nni',4,120,1171,100,6,1,109.038,5,1 +'mbank_X30754',180,425,'baseline',10,120,1198,100,6,1,110.612,0,0 +'mbank_X30754',180,425,'pr_nni',5,120,1168,100,6,1,109.992,5,1 +'mbank_X30754',180,425,'pr_nni',1,120,1191,100,6,1,109.929,5,1 +'mbank_X30754',180,425,'pr_nni',6,120,1172,100,6,1,109.218,5,1 +'mbank_X30754',180,425,'pr_nni',2,120,1166,100,6,1,109.651,5,1 +'mbank_X30754',180,425,'pr_nni',7,120,1176,100,6,1,109.912,5,1 +'mbank_X30754',180,425,'pr_nni',3,120,1170,100,6,1,109.166,5,1 +'mbank_X30754',180,425,'pr_nni',8,120,1177,100,6,1,109.341,5,1 
+'mbank_X30754',180,425,'pr_nni',4,120,1199,100,6,1,108.951,5,1 +'mbank_X30754',180,425,'pr_nni',9,120,1179,100,6,1,109.004,5,1 +'mbank_X30754',180,425,'pr_nni',5,120,1184,100,6,1,108.633,5,1 +'mbank_X30754',180,425,'pr_nni',10,120,1165,100,6,1,109.002,5,1 +'mbank_X30754',180,425,'pr_nni',6,120,1172,100,6,1,108.740,5,1 +'mbank_X30754',180,425,'pr_tbr',1,120,1173,100,6,1,108.584,5,0 +'mbank_X30754',180,425,'pr_nni',7,120,1181,100,6,0,109.348,5,1 +'mbank_X30754',180,425,'pr_tbr',2,120,1184,100,6,1,109.429,5,0 +'mbank_X30754',180,425,'pr_nni',8,120,1169,100,6,1,109.950,5,1 +'mbank_X30754',180,425,'pr_tbr',3,120,1172,100,6,1,111.199,5,0 +'mbank_X30754',180,425,'pr_nni',9,120,1176,100,6,1,109.544,5,1 +'mbank_X30754',180,425,'pr_tbr',4,120,1187,100,6,1,109.747,5,0 +'mbank_X30754',180,425,'pr_nni',10,120,1184,100,6,1,110.134,5,1 +'mbank_X30754',180,425,'pr_tbr',5,120,1175,100,6,1,109.611,5,0 +'mbank_X30754',180,425,'pr_tbr',1,120,1182,100,5,1,109.597,5,0 +'mbank_X30754',180,425,'pr_tbr',6,120,1191,100,6,1,108.677,5,0 +'mbank_X30754',180,425,'pr_tbr',2,120,1183,100,6,1,109.602,5,0 +'mbank_X30754',180,425,'pr_tbr',7,120,1169,100,6,1,109.658,5,0 +'mbank_X30754',180,425,'pr_tbr',3,120,1179,100,6,1,110.182,5,0 +'mbank_X30754',180,425,'pr_tbr',8,120,1178,100,6,1,110.202,5,0 +'mbank_X30754',180,425,'pr_tbr',4,120,1180,100,6,1,109.706,5,0 +'mbank_X30754',180,425,'pr_tbr',9,120,1188,100,5,1,108.700,5,0 +'mbank_X30754',180,425,'pr_tbr',5,120,1182,100,5,1,109.927,5,0 +'mbank_X30754',180,425,'pr_tbr',10,120,1178,100,6,1,114.074,5,0 +'mbank_X30754',180,425,'pr_tbr',6,120,1173,100,5,1,108.743,5,0 +'project4133',131,349,'baseline',1,60,2380,100,19,1,56.519,0,0 +'project4133',131,349,'baseline',2,60,2382,100,20,1,56.966,0,0 +'mbank_X30754',180,425,'pr_tbr',7,120,1172,100,6,1,110.728,5,0 +'project4133',131,349,'baseline',3,60,2381,100,19,1,54.533,0,0 +'project4133',131,349,'baseline',4,60,2384,100,20,1,55.538,0,0 +'mbank_X30754',180,425,'pr_tbr',8,120,1178,100,6,1,109.630,5,0 
+'project4133',131,349,'baseline',5,60,2382,100,20,1,55.160,0,0 +'project4133',131,349,'baseline',6,60,2387,100,20,1,54.761,0,0 +'mbank_X30754',180,425,'pr_tbr',9,120,1175,100,6,1,109.415,5,0 +'project4133',131,349,'baseline',7,60,2379,100,20,1,54.772,0,0 +'project4133',131,349,'baseline',8,60,2382,100,18,1,55.407,0,0 +'mbank_X30754',180,425,'pr_tbr',10,120,1186,100,6,1,108.966,5,0 +'project4133',131,349,'baseline',9,60,2382,100,20,1,54.695,0,0 +'project4133',131,349,'baseline',1,60,2376,100,19,1,55.222,0,0 +'project4133',131,349,'baseline',10,60,2375,100,19,1,55.297,0,0 +'project4133',131,349,'baseline',2,60,2376,100,18,0,55.204,0,0 +'project4133',131,349,'pr_nni',1,60,2380,100,16,0,54.902,5,1 +'project4133',131,349,'baseline',3,60,2377,100,20,1,55.130,0,0 +'project4133',131,349,'pr_nni',2,60,2377,100,16,1,54.472,5,1 +'project4133',131,349,'baseline',4,60,2383,100,20,1,55.378,0,0 +'project4133',131,349,'pr_nni',3,60,2379,100,17,1,55.110,5,1 +'project4133',131,349,'baseline',5,60,2381,100,19,0,54.706,0,0 +'project4133',131,349,'pr_nni',4,60,2371,100,16,1,54.680,5,1 +'project4133',131,349,'baseline',6,60,2373,100,19,1,55.544,0,0 +'project4133',131,349,'pr_nni',5,60,2381,100,16,1,55.020,5,1 +'project4133',131,349,'baseline',7,60,2379,100,18,1,55.174,0,0 +'project4133',131,349,'pr_nni',6,60,2378,100,16,1,55.187,5,1 +'project4133',131,349,'baseline',8,60,2386,100,19,1,55.515,0,0 +'project4133',131,349,'pr_nni',7,60,2381,100,16,1,54.865,5,1 +'project4133',131,349,'baseline',9,60,2386,100,19,2,54.507,0,0 +'project4133',131,349,'pr_nni',8,60,2376,100,16,1,54.993,5,1 +'project4133',131,349,'baseline',10,60,2379,100,19,1,55.416,0,0 +'project4133',131,349,'pr_nni',9,60,2383,100,16,1,56.042,5,1 +'project4133',131,349,'pr_nni',1,60,2377,100,16,1,54.876,5,1 +'project4133',131,349,'pr_nni',10,60,2386,100,16,2,55.174,5,1 +'project4133',131,349,'pr_nni',2,60,2380,100,16,1,54.711,5,1 +'project4133',131,349,'pr_tbr',1,60,2382,100,14,1,55.313,5,0 
+'project4133',131,349,'pr_nni',3,60,2386,100,16,1,54.472,5,1 +'project4133',131,349,'pr_tbr',2,60,2380,100,14,1,55.553,5,0 +'project4133',131,349,'pr_nni',4,60,2381,100,16,1,55.697,5,1 +'project4133',131,349,'pr_tbr',3,60,2377,100,14,1,55.433,5,0 +'project4133',131,349,'pr_nni',5,60,2378,100,16,1,55.456,5,1 +'project4133',131,349,'pr_tbr',4,60,2382,100,14,0,55.032,5,0 +'project4133',131,349,'pr_nni',6,60,2382,100,16,1,55.048,5,1 +'project4133',131,349,'pr_tbr',5,60,2374,100,14,1,56.293,5,0 +'project4133',131,349,'pr_nni',7,60,2380,100,16,1,54.921,5,1 +'project4133',131,349,'pr_tbr',6,60,2372,100,14,1,54.974,5,0 +'project4133',131,349,'pr_nni',8,60,2379,100,16,1,56.451,5,1 +'project4133',131,349,'pr_tbr',7,60,2374,100,14,0,55.044,5,0 +'project4133',131,349,'pr_nni',9,60,2378,100,16,1,57.662,5,1 +'project4133',131,349,'pr_tbr',8,60,2376,100,14,1,55.855,5,0 +'project4133',131,349,'pr_nni',10,60,2381,100,16,1,55.670,5,1 +'project4133',131,349,'pr_tbr',9,60,2376,100,14,1,54.633,5,0 +'project4133',131,349,'pr_tbr',1,60,2377,100,15,1,55.159,5,0 +'project4133',131,349,'pr_tbr',10,60,2381,100,14,1,55.251,5,0 +'project4133',131,349,'pr_tbr',2,60,2370,100,15,0,56.899,5,0 +'project4133',131,349,'pr_tbr',3,60,2373,100,14,1,57.368,5,0 +'project4133',131,349,'baseline',1,120,2375,100,37,1,108.586,0,0 +'project4133',131,349,'pr_tbr',4,60,2378,100,14,1,55.260,5,0 +'project4133',131,349,'pr_tbr',5,60,2374,100,14,1,54.918,5,0 +'project4133',131,349,'baseline',2,120,2377,100,39,1,109.877,0,0 +'project4133',131,349,'pr_tbr',6,60,2370,100,14,0,55.524,5,0 +'project4133',131,349,'pr_tbr',7,60,2377,100,14,1,55.338,5,0 +'project4133',131,349,'baseline',3,120,2378,100,38,1,108.823,0,0 +'project4133',131,349,'pr_tbr',8,60,2385,100,14,1,54.774,5,0 +'project4133',131,349,'pr_tbr',9,60,2375,100,14,1,54.685,5,0 +'project4133',131,349,'baseline',4,120,2370,100,38,1,109.582,0,0 +'project4133',131,349,'pr_tbr',10,60,2379,100,14,1,55.873,5,0 
+'project4133',131,349,'baseline',5,120,2377,100,39,1,108.986,0,0 +'project4133',131,349,'baseline',1,120,2382,100,40,0,108.747,0,0 +'project4133',131,349,'baseline',6,120,2377,100,38,1,109.615,0,0 +'project4133',131,349,'baseline',2,120,2378,100,40,1,109.510,0,0 +'project4133',131,349,'baseline',7,120,2379,100,39,1,109.977,0,0 +'project4133',131,349,'baseline',3,120,2378,100,39,1,108.720,0,0 +'project4133',131,349,'baseline',8,120,2373,100,38,1,110.431,0,0 +'project4133',131,349,'baseline',4,120,2372,100,40,0,108.750,0,0 +'project4133',131,349,'baseline',9,120,2380,100,40,1,109.730,0,0 +'project4133',131,349,'baseline',5,120,2381,100,39,0,108.775,0,0 +'project4133',131,349,'baseline',10,120,2374,100,40,1,109.588,0,0 +'project4133',131,349,'baseline',6,120,2378,100,40,1,109.225,0,0 +'project4133',131,349,'pr_nni',1,120,2373,100,34,0,111.930,5,1 +'project4133',131,349,'baseline',7,120,2382,100,39,1,108.839,0,0 +'project4133',131,349,'pr_nni',2,120,2376,100,34,1,108.882,5,1 +'project4133',131,349,'baseline',8,120,2366,100,40,1,109.575,0,0 +'project4133',131,349,'pr_nni',3,120,2378,100,34,1,109.824,5,1 +'project4133',131,349,'baseline',9,120,2376,100,40,0,108.894,0,0 +'project4133',131,349,'pr_nni',4,120,2377,100,34,1,109.120,5,1 +'project4133',131,349,'baseline',10,120,2370,100,40,1,109.005,0,0 +'project4133',131,349,'pr_nni',5,120,2378,100,33,1,109.037,5,1 +'project4133',131,349,'pr_nni',1,120,2372,100,34,1,108.830,5,1 +'project4133',131,349,'pr_nni',6,120,2379,100,32,1,108.544,5,1 +'project4133',131,349,'pr_nni',2,120,2375,100,34,1,109.696,5,1 +'project4133',131,349,'pr_nni',7,120,2375,100,33,1,110.982,5,1 +'project4133',131,349,'pr_nni',3,120,2373,100,33,1,111.071,5,1 +'project4133',131,349,'pr_nni',8,120,2375,100,34,1,109.485,5,1 +'project4133',131,349,'pr_nni',4,120,2371,100,33,1,108.501,5,1 +'project4133',131,349,'pr_nni',9,120,2379,100,34,1,108.758,5,1 +'project4133',131,349,'pr_nni',5,120,2380,100,34,0,108.912,5,1 
+'project4133',131,349,'pr_nni',10,120,2377,100,33,1,108.391,5,1 +'project4133',131,349,'pr_nni',6,120,2380,100,34,1,109.191,5,1 +'project4133',131,349,'pr_tbr',1,120,2376,100,30,1,109.663,5,0 +'project4133',131,349,'pr_nni',7,120,2379,100,34,1,109.609,5,1 +'project4133',131,349,'pr_tbr',2,120,2371,100,30,1,108.952,5,0 +'project4133',131,349,'pr_nni',8,120,2378,100,34,1,111.150,5,1 +'project4133',131,349,'pr_tbr',3,120,2378,100,29,1,108.725,5,0 +'project4133',131,349,'pr_nni',9,120,2380,100,34,2,108.717,5,1 +'project4133',131,349,'pr_tbr',4,120,2381,100,30,1,109.789,5,0 +'project4133',131,349,'pr_nni',10,120,2379,100,34,1,109.038,5,1 +'project4133',131,349,'pr_tbr',5,120,2375,100,30,1,109.177,5,0 +'project4133',131,349,'pr_tbr',1,120,2373,100,29,1,109.232,5,0 +'project4133',131,349,'pr_tbr',6,120,2371,100,28,1,114.340,5,0 +'project4133',131,349,'pr_tbr',2,120,2375,100,29,1,109.025,5,0 +'project4133',131,349,'pr_tbr',7,120,2379,100,29,1,108.769,5,0 +'project4133',131,349,'pr_tbr',3,120,2376,100,30,1,108.938,5,0 +'project4133',131,349,'pr_tbr',8,120,2375,100,29,1,111.501,5,0 +'project4133',131,349,'pr_tbr',4,120,2377,100,30,1,109.013,5,0 +'project4133',131,349,'pr_tbr',9,120,2375,100,30,1,109.654,5,0 +'project4133',131,349,'pr_tbr',5,120,2376,100,30,1,110.094,5,0 +'project4133',131,349,'pr_tbr',10,120,2378,100,30,1,109.259,5,0 +'project4133',131,349,'pr_tbr',6,120,2372,100,31,1,109.013,5,0 +'project3701',146,324,'baseline',1,60,3936,1,7,1,59.506,0,0 +'project3701',146,324,'baseline',2,60,4236,2,8,1,58.009,0,0 +'project4133',131,349,'pr_tbr',7,120,2374,100,30,1,108.595,5,0 +'project3701',146,324,'baseline',3,60,4274,10,8,1,60.040,0,0 +'project4133',131,349,'pr_tbr',8,120,2378,100,30,1,108.926,5,0 +'project3701',146,324,'baseline',4,60,4023,75,8,1,60.018,0,0 +'project3701',146,324,'baseline',5,60,4185,4,7,1,57.522,0,0 +'project4133',131,349,'pr_tbr',9,120,2376,100,29,1,108.921,5,0 +'project3701',146,324,'baseline',6,60,4164,100,8,1,60.026,0,0 
+'project3701',146,324,'baseline',7,60,4172,25,8,1,60.046,0,0 +'project4133',131,349,'pr_tbr',10,120,2376,100,30,1,109.691,5,0 +'project3701',146,324,'baseline',8,60,4028,100,8,1,58.648,0,0 +'project3701',146,324,'baseline',1,60,4183,33,8,1,60.043,0,0 +'project3701',146,324,'baseline',9,60,4135,6,7,1,57.190,0,0 +'project3701',146,324,'baseline',2,60,4111,38,8,1,60.025,0,0 +'project3701',146,324,'baseline',10,60,4291,1,8,1,54.254,0,0 +'project3701',146,324,'baseline',3,60,4102,15,8,1,60.010,0,0 +'project3701',146,324,'pr_nni',1,60,4043,2,6,1,57.124,5,1 +'project3701',146,324,'baseline',4,60,4148,4,8,1,57.610,0,0 +'project3701',146,324,'pr_nni',2,60,4005,2,6,1,56.982,5,1 +'project3701',146,324,'baseline',5,60,4105,1,8,1,54.562,0,0 +'project3701',146,324,'pr_nni',3,60,4044,1,6,1,54.141,5,1 +'project3701',146,324,'baseline',6,60,4224,4,8,1,58.400,0,0 +'project3701',146,324,'pr_nni',4,60,3987,2,6,1,59.605,5,1 +'project3701',146,324,'baseline',7,60,4169,2,8,1,54.240,0,0 +'project3701',146,324,'pr_nni',5,60,4067,2,5,1,55.037,5,1 +'project3701',146,324,'baseline',8,60,4086,1,8,1,54.655,0,0 +'project3701',146,324,'pr_nni',6,60,4034,100,6,1,56.858,5,1 +'project3701',146,324,'baseline',9,60,4255,2,8,1,56.195,0,0 +'project3701',146,324,'pr_nni',7,60,3933,1,6,1,54.738,5,1 +'project3701',146,324,'baseline',10,60,4170,2,7,1,54.824,0,0 +'project3701',146,324,'pr_nni',8,60,3953,2,6,1,56.303,5,1 +'project3701',146,324,'pr_nni',1,60,4021,100,6,1,59.274,5,1 +'project3701',146,324,'pr_nni',9,60,4034,100,6,1,55.486,5,1 +'project3701',146,324,'pr_nni',2,60,3991,1,6,1,58.778,5,1 +'project3701',146,324,'pr_nni',10,60,4000,47,6,1,60.034,5,1 +'project3701',146,324,'pr_nni',3,60,3957,49,6,1,60.017,5,1 +'project3701',146,324,'pr_tbr',1,60,4304,1,5,1,55.593,5,0 +'project3701',146,324,'pr_nni',4,60,3953,73,6,1,60.020,5,1 +'project3701',146,324,'pr_tbr',2,60,4079,2,6,1,58.223,5,0 +'project3701',146,324,'pr_tbr',3,60,4161,2,6,1,54.635,5,0 
+'project3701',146,324,'pr_nni',5,60,3947,100,6,1,60.019,5,1 +'project3701',146,324,'pr_nni',6,60,3883,2,6,1,55.778,5,1 +'project3701',146,324,'pr_tbr',4,60,4182,2,6,1,60.040,5,0 +'project3701',146,324,'pr_nni',7,60,4039,2,6,1,57.403,5,1 +'project3701',146,324,'pr_tbr',5,60,4106,21,6,1,60.021,5,0 +'project3701',146,324,'pr_tbr',6,60,4198,4,6,1,54.833,5,0 +'project3701',146,324,'pr_nni',8,60,3985,42,6,1,60.050,5,1 +'project3701',146,324,'pr_tbr',7,60,4218,3,5,1,55.935,5,0 +'project3701',146,324,'pr_nni',9,60,3934,2,6,1,55.977,5,1 +'project3701',146,324,'pr_tbr',8,60,4159,1,6,1,55.356,5,0 +'project3701',146,324,'pr_nni',10,60,3932,83,6,1,60.020,5,1 +'project3701',146,324,'pr_tbr',9,60,4116,2,6,1,60.033,5,0 +'project3701',146,324,'pr_tbr',1,60,4152,49,6,1,60.035,5,0 +'project3701',146,324,'pr_tbr',10,60,4138,2,6,1,56.370,5,0 +'project3701',146,324,'pr_tbr',2,60,4085,1,6,1,59.708,5,0 +'project3701',146,324,'pr_tbr',3,60,4264,1,6,1,54.272,5,0 +'project3701',146,324,'baseline',1,120,4064,80,16,1,120.051,0,0 +'project3701',146,324,'pr_tbr',4,60,4123,98,6,1,60.032,5,0 +'project3701',146,324,'pr_tbr',5,60,4183,4,6,1,56.191,5,0 +'project3701',146,324,'pr_tbr',6,60,4176,1,6,1,54.199,5,0 +'project3701',146,324,'baseline',2,120,3998,48,16,1,120.048,0,0 +'project3701',146,324,'pr_tbr',7,60,4282,4,6,1,56.526,5,0 +'project3701',146,324,'pr_tbr',8,60,4199,100,6,1,57.092,5,0 +'project3701',146,324,'baseline',3,120,4121,100,16,1,115.286,0,0 +'project3701',146,324,'pr_tbr',9,60,4014,100,6,1,60.035,5,0 +'project3701',146,324,'baseline',4,120,4107,1,16,1,108.661,0,0 +'project3701',146,324,'pr_tbr',10,60,4116,52,6,1,60.045,5,0 +'project3701',146,324,'baseline',5,120,4024,5,16,1,117.648,0,0 +'project3701',146,324,'baseline',1,120,4056,100,15,1,115.870,0,0 +'project3701',146,324,'baseline',2,120,4054,1,17,1,109.320,0,0 +'project3701',146,324,'baseline',6,120,4133,100,16,1,114.469,0,0 +'project3701',146,324,'baseline',7,120,4119,1,17,1,108.415,0,0 
+'project3701',146,324,'baseline',3,120,3997,1,16,1,114.274,0,0 +'project3701',146,324,'baseline',8,120,4080,1,17,1,110.831,0,0 +'project3701',146,324,'baseline',4,120,3976,100,17,1,111.987,0,0 +'project3701',146,324,'baseline',9,120,4158,2,17,1,113.085,0,0 +'project3701',146,324,'baseline',5,120,4137,1,16,1,112.144,0,0 +'project3701',146,324,'baseline',10,120,4080,2,15,1,110.105,0,0 +'project3701',146,324,'baseline',6,120,4137,100,16,1,110.400,0,0 +'project3701',146,324,'pr_nni',1,120,3939,1,12,1,109.166,5,1 +'project3701',146,324,'baseline',7,120,4132,13,17,1,120.013,0,0 +'project3701',146,324,'pr_nni',2,120,4014,100,12,1,113.200,5,1 +'project3701',146,324,'baseline',8,120,4053,100,16,1,116.639,0,0 +'project3701',146,324,'pr_nni',3,120,4074,1,12,1,108.249,5,1 +'project3701',146,324,'baseline',9,120,4062,100,14,1,114.849,0,0 +'project3701',146,324,'pr_nni',4,120,4000,88,12,1,120.035,5,1 +'project3701',146,324,'baseline',10,120,4123,100,15,1,114.626,0,0 +'project3701',146,324,'pr_nni',5,120,3905,100,12,1,114.257,5,1 +'project3701',146,324,'pr_nni',1,120,3955,1,11,1,108.643,5,1 +'project3701',146,324,'pr_nni',6,120,3981,6,12,1,118.738,5,1 +'project3701',146,324,'pr_nni',2,120,3903,2,11,1,110.838,5,1 +'project3701',146,324,'pr_nni',3,120,3884,4,11,1,112.425,5,1 +'project3701',146,324,'pr_nni',7,120,4019,100,12,1,119.768,5,1 +'project3701',146,324,'pr_nni',8,120,3965,100,12,1,112.421,5,1 +'project3701',146,324,'pr_nni',4,120,3987,2,12,1,115.265,5,1 +'project3701',146,324,'pr_nni',5,120,3997,6,12,2,117.877,5,1 +'project3701',146,324,'pr_nni',9,120,3888,94,12,1,120.075,5,1 +'project3701',146,324,'pr_nni',10,120,4050,1,12,2,110.971,5,1 +'project3701',146,324,'pr_nni',6,120,3862,2,11,1,114.865,5,1 +'project3701',146,324,'pr_nni',7,120,3924,100,12,1,112.974,5,1 +'project3701',146,324,'pr_tbr',1,120,4066,100,12,1,116.072,5,0 +'project3701',146,324,'pr_tbr',2,120,4270,1,12,1,108.180,5,0 +'project3701',146,324,'pr_nni',8,120,3949,100,11,1,119.000,5,1 
+'project3701',146,324,'pr_tbr',3,120,4142,6,12,1,113.382,5,0 +'project3701',146,324,'pr_nni',9,120,3948,8,12,1,115.132,5,1 +'project3701',146,324,'pr_tbr',4,120,4204,1,11,1,110.056,5,0 +'project3701',146,324,'pr_nni',10,120,3906,100,12,1,113.346,5,1 +'project3701',146,324,'pr_tbr',5,120,4123,1,12,1,116.092,5,0 +'project3701',146,324,'pr_tbr',1,120,4149,2,13,1,108.639,5,0 +'project3701',146,324,'pr_tbr',6,120,4037,100,12,1,114.350,5,0 +'project3701',146,324,'pr_tbr',2,120,4124,1,12,1,108.376,5,0 +'project3701',146,324,'pr_tbr',7,120,4155,86,13,1,120.051,5,0 +'project3701',146,324,'pr_tbr',3,120,4048,42,12,1,120.045,5,0 +'project3701',146,324,'pr_tbr',4,120,4187,1,13,1,108.138,5,0 +'project3701',146,324,'pr_tbr',8,120,4019,6,10,1,119.104,5,0 +'project3701',146,324,'pr_tbr',5,120,4129,4,13,1,111.829,5,0 +'project3701',146,324,'pr_tbr',9,120,4008,100,11,1,118.236,5,0 +'project3701',146,324,'pr_tbr',6,120,4141,100,12,1,115.253,5,0 +'project3701',146,324,'pr_tbr',10,120,4082,3,12,1,113.797,5,0 +'project804',173,589,'baseline',1,60,1379,89,4,1,60.144,0,0 +'project3701',146,324,'pr_tbr',7,120,4222,1,12,1,108.299,5,0 +'project804',173,589,'baseline',2,60,1370,100,3,1,56.737,0,0 +'project804',173,589,'baseline',3,60,1365,100,2,1,60.138,0,0 +'project3701',146,324,'pr_tbr',8,120,4059,100,11,1,119.711,5,0 +'project804',173,589,'baseline',4,60,1380,100,3,1,60.042,0,0 +'project804',173,589,'baseline',5,60,1370,100,3,1,60.130,0,0 +'project3701',146,324,'pr_tbr',9,120,4146,1,10,1,108.835,5,0 +'project804',173,589,'baseline',6,60,1380,9,4,1,60.036,0,0 +'project804',173,589,'baseline',7,60,1390,100,3,1,60.055,0,0 +'project3701',146,324,'pr_tbr',10,120,4069,2,11,1,114.383,5,0 +'project804',173,589,'baseline',8,60,1385,100,3,1,60.104,0,0 +'project804',173,589,'baseline',1,60,1375,100,2,1,59.142,0,0 +'project804',173,589,'baseline',9,60,1372,100,2,1,57.940,0,0 +'project804',173,589,'baseline',2,60,1381,100,2,1,60.131,0,0 +'project804',173,589,'baseline',10,60,1360,64,4,1,60.045,0,0 
+'project804',173,589,'baseline',3,60,1378,3,2,1,60.073,0,0 +'project804',173,589,'pr_nni',1,60,1360,100,2,1,59.051,5,1 +'project804',173,589,'baseline',4,60,1377,75,2,1,60.057,0,0 +'project804',173,589,'pr_nni',2,60,1371,100,2,1,60.111,5,1 +'project804',173,589,'baseline',5,60,1381,100,2,1,60.086,0,0 +'project804',173,589,'pr_nni',3,60,1366,19,2,1,60.114,5,1 +'project804',173,589,'baseline',6,60,1389,39,2,1,60.017,0,0 +'project804',173,589,'pr_nni',4,60,1373,100,2,1,58.283,5,1 +'project804',173,589,'baseline',7,60,1373,7,2,1,60.087,0,0 +'project804',173,589,'pr_nni',5,60,1363,96,2,1,60.112,5,1 +'project804',173,589,'baseline',8,60,1377,100,2,1,59.280,0,0 +'project804',173,589,'pr_nni',6,60,1362,100,2,1,57.957,5,1 +'project804',173,589,'baseline',9,60,1366,40,2,1,60.101,0,0 +'project804',173,589,'pr_nni',7,60,1376,42,2,1,60.055,5,1 +'project804',173,589,'baseline',10,60,1378,14,2,1,60.055,0,0 +'project804',173,589,'pr_nni',8,60,1365,61,2,1,60.109,5,1 +'project804',173,589,'pr_nni',1,60,1385,100,2,1,60.090,5,1 +'project804',173,589,'pr_nni',9,60,1367,15,2,1,60.069,5,1 +'project804',173,589,'pr_nni',2,60,1365,100,2,2,60.066,5,1 +'project804',173,589,'pr_nni',10,60,1382,100,2,1,59.691,5,1 +'project804',173,589,'pr_nni',3,60,1376,82,2,1,60.047,5,1 +'project804',173,589,'pr_tbr',1,60,1381,29,2,1,60.055,5,0 +'project804',173,589,'pr_nni',4,60,1369,100,2,1,60.091,5,1 +'project804',173,589,'pr_tbr',2,60,1385,100,2,1,58.501,5,0 +'project804',173,589,'pr_nni',5,60,1379,100,2,1,60.057,5,1 +'project804',173,589,'pr_tbr',3,60,1372,100,2,1,59.974,5,0 +'project804',173,589,'pr_nni',6,60,1361,5,2,1,56.079,5,1 +'project804',173,589,'pr_tbr',4,60,1369,97,2,1,60.136,5,0 +'project804',173,589,'pr_nni',7,60,1360,100,2,1,59.682,5,1 +'project804',173,589,'pr_tbr',5,60,1367,98,2,1,60.028,5,0 +'project804',173,589,'pr_nni',8,60,1374,100,2,1,60.045,5,1 +'project804',173,589,'pr_tbr',6,60,1377,59,2,1,60.033,5,0 +'project804',173,589,'pr_nni',9,60,1368,32,2,2,60.047,5,1 
+'project804',173,589,'pr_tbr',7,60,1372,77,2,1,60.131,5,0 +'project804',173,589,'pr_nni',10,60,1373,13,2,1,60.034,5,1 +'project804',173,589,'pr_tbr',8,60,1375,100,2,1,60.103,5,0 +'project804',173,589,'pr_tbr',1,60,1382,100,2,1,58.947,5,0 +'project804',173,589,'pr_tbr',9,60,1387,35,2,1,60.074,5,0 +'project804',173,589,'pr_tbr',2,60,1366,25,2,1,60.061,5,0 +'project804',173,589,'pr_tbr',10,60,1393,100,2,1,57.554,5,0 +'project804',173,589,'pr_tbr',3,60,1390,91,2,1,60.112,5,0 +'project804',173,589,'pr_tbr',4,60,1372,100,2,1,60.169,5,0 +'project804',173,589,'baseline',1,120,1367,100,7,1,120.095,0,0 +'project804',173,589,'pr_tbr',5,60,1379,16,2,1,60.096,5,0 +'project804',173,589,'pr_tbr',6,60,1374,11,2,1,60.024,5,0 +'project804',173,589,'baseline',2,120,1369,100,7,2,118.125,0,0 +'project804',173,589,'pr_tbr',7,60,1379,100,2,1,59.602,5,0 +'project804',173,589,'pr_tbr',8,60,1369,88,2,1,60.085,5,0 +'project804',173,589,'baseline',3,120,1367,100,6,1,109.514,0,0 +'project804',173,589,'pr_tbr',9,60,1375,59,2,1,60.067,5,0 +'project804',173,589,'pr_tbr',10,60,1377,100,2,1,60.054,5,0 +'project804',173,589,'baseline',4,120,1367,100,6,1,120.167,0,0 +'project804',173,589,'baseline',1,120,1361,58,6,1,120.099,0,0 +'project804',173,589,'baseline',5,120,1378,100,8,1,120.193,0,0 +'project804',173,589,'baseline',2,120,1361,20,6,1,120.092,0,0 +'project804',173,589,'baseline',6,120,1371,100,6,1,117.192,0,0 +'project804',173,589,'baseline',3,120,1377,100,6,1,120.043,0,0 +'project804',173,589,'baseline',7,120,1369,100,7,1,114.480,0,0 +'project804',173,589,'baseline',4,120,1361,100,6,1,117.798,0,0 +'project804',173,589,'baseline',8,120,1370,100,6,0,111.198,0,0 +'project804',173,589,'baseline',5,120,1365,100,6,1,117.842,0,0 +'project804',173,589,'baseline',9,120,1370,45,7,1,120.029,0,0 +'project804',173,589,'baseline',6,120,1373,100,6,1,110.251,0,0 +'project804',173,589,'baseline',10,120,1365,100,8,1,118.772,0,0 +'project804',173,589,'baseline',7,120,1366,66,6,1,120.061,0,0 
+'project804',173,589,'pr_nni',1,120,1364,100,6,1,115.659,5,1 +'project804',173,589,'baseline',8,120,1366,100,6,1,117.724,0,0 +'project804',173,589,'pr_nni',2,120,1367,100,6,1,116.814,5,1 +'project804',173,589,'baseline',9,120,1366,100,6,1,120.078,0,0 +'project804',173,589,'pr_nni',3,120,1367,100,6,1,118.761,5,1 +'project804',173,589,'baseline',10,120,1371,100,6,1,115.588,0,0 +'project804',173,589,'pr_nni',4,120,1355,100,6,1,114.508,5,1 +'project804',173,589,'pr_nni',1,120,1367,100,4,1,120.148,5,1 +'project804',173,589,'pr_nni',5,120,1361,100,6,1,114.829,5,1 +'project804',173,589,'pr_nni',2,120,1376,100,5,1,120.122,5,1 +'project804',173,589,'pr_nni',6,120,1358,100,6,1,114.471,5,1 +'project804',173,589,'pr_nni',7,120,1361,100,6,1,114.447,5,1 +'project804',173,589,'pr_nni',3,120,1373,100,5,1,118.995,5,1 +'project804',173,589,'pr_nni',4,120,1368,100,6,1,111.709,5,1 +'project804',173,589,'pr_nni',8,120,1362,94,6,1,120.092,5,1 +'project804',173,589,'pr_nni',5,120,1362,35,4,1,120.094,5,1 +'project804',173,589,'pr_nni',9,120,1366,82,6,1,120.106,5,1 +'project804',173,589,'pr_nni',6,120,1366,96,5,1,120.133,5,1 +'project804',173,589,'pr_nni',10,120,1365,68,6,1,120.078,5,1 +'project804',173,589,'pr_nni',7,120,1365,100,5,1,117.875,5,1 +'project804',173,589,'pr_tbr',1,120,1374,100,5,1,117.393,5,0 +'project804',173,589,'pr_nni',8,120,1365,100,6,1,120.133,5,1 +'project804',173,589,'pr_tbr',2,120,1364,68,5,1,120.095,5,0 +'project804',173,589,'pr_nni',9,120,1357,100,6,1,114.293,5,1 +'project804',173,589,'pr_tbr',3,120,1368,100,6,1,118.277,5,0 +'project804',173,589,'pr_nni',10,120,1375,100,5,1,120.075,5,1 +'project804',173,589,'pr_tbr',4,120,1374,100,6,0,116.211,5,0 +'project804',173,589,'pr_tbr',1,120,1367,100,4,1,114.706,5,0 +'project804',173,589,'pr_tbr',5,120,1376,4,5,1,120.015,5,0 +'project804',173,589,'pr_tbr',2,120,1366,100,4,2,120.081,5,0 +'project804',173,589,'pr_tbr',6,120,1379,100,6,1,117.838,5,0 +'project804',173,589,'pr_tbr',3,120,1371,100,4,1,112.028,5,0 
+'project804',173,589,'pr_tbr',7,120,1373,100,6,1,116.842,5,0 +'project804',173,589,'pr_tbr',4,120,1379,100,4,1,115.752,5,0 +'project804',173,589,'pr_tbr',8,120,1373,100,5,1,116.783,5,0 +'project804',173,589,'pr_tbr',5,120,1367,100,4,1,115.504,5,0 +'project804',173,589,'pr_tbr',9,120,1366,100,6,1,117.697,5,0 +'project804',173,589,'pr_tbr',6,120,1374,56,4,1,120.104,5,0 +'project804',173,589,'pr_tbr',10,120,1374,89,5,1,120.036,5,0 +'syab07205',206,748,'baseline',1,60,10399,29,2,1,60.057,0,0 +'project804',173,589,'pr_tbr',7,120,1363,32,4,1,120.022,5,0 +'syab07205',206,748,'baseline',2,60,10407,60,2,1,60.066,0,0 +'syab07205',206,748,'baseline',3,60,10413,23,2,1,60.070,0,0 +'project804',173,589,'pr_tbr',8,120,1366,100,4,1,119.935,5,0 +'syab07205',206,748,'baseline',4,60,10413,25,2,1,60.083,0,0 +'syab07205',206,748,'baseline',5,60,10475,76,2,1,60.054,0,0 +'project804',173,589,'pr_tbr',9,120,1362,100,4,1,118.924,5,0 +'syab07205',206,748,'baseline',6,60,10414,32,2,1,60.042,0,0 +'syab07205',206,748,'baseline',7,60,10456,6,2,1,58.700,0,0 +'project804',173,589,'pr_tbr',10,120,1376,100,4,1,114.016,5,0 +'syab07205',206,748,'baseline',8,60,10539,10,2,1,60.046,0,0 +'syab07205',206,748,'baseline',1,60,10382,9,2,1,57.329,0,0 +'syab07205',206,748,'baseline',9,60,10596,63,2,1,60.030,0,0 +'syab07205',206,748,'baseline',2,60,10516,100,2,1,60.053,0,0 +'syab07205',206,748,'baseline',10,60,10510,64,2,1,60.075,0,0 +'syab07205',206,748,'baseline',3,60,10428,71,2,1,60.024,0,0 +'syab07205',206,748,'pr_nni',1,60,10445,1,2,1,60.052,5,1 +'syab07205',206,748,'baseline',4,60,10464,57,2,1,60.020,0,0 +'syab07205',206,748,'pr_nni',2,60,10461,16,2,1,60.039,5,1 +'syab07205',206,748,'baseline',5,60,10422,34,2,1,60.018,0,0 +'syab07205',206,748,'pr_nni',3,60,10505,3,1,1,58.640,5,1 +'syab07205',206,748,'baseline',6,60,10433,64,2,1,60.022,0,0 +'syab07205',206,748,'pr_nni',4,60,10440,27,2,1,60.029,5,1 +'syab07205',206,748,'baseline',7,60,10389,8,2,1,59.710,0,0 
+'syab07205',206,748,'pr_nni',5,60,10430,7,2,1,60.020,5,1 +'syab07205',206,748,'baseline',8,60,10572,100,2,1,60.073,0,0 +'syab07205',206,748,'pr_nni',6,60,10465,65,2,1,60.030,5,1 +'syab07205',206,748,'baseline',9,60,10480,4,2,1,56.434,0,0 +'syab07205',206,748,'pr_nni',7,60,10435,100,2,1,56.943,5,1 +'syab07205',206,748,'baseline',10,60,10496,86,2,1,60.100,0,0 +'syab07205',206,748,'pr_nni',8,60,10451,56,2,1,60.026,5,1 +'syab07205',206,748,'pr_nni',1,60,10463,100,2,1,60.042,5,1 +'syab07205',206,748,'pr_nni',9,60,10529,99,2,1,60.085,5,1 +'syab07205',206,748,'pr_nni',2,60,10465,18,2,1,60.018,5,1 +'syab07205',206,748,'pr_nni',10,60,10562,16,2,1,60.029,5,1 +'syab07205',206,748,'pr_nni',3,60,10444,100,1,1,60.116,5,1 +'syab07205',206,748,'pr_tbr',1,60,-1,1,0,0,54.057,5,0 +'syab07205',206,748,'pr_nni',4,60,10424,63,1,1,60.083,5,1 +'syab07205',206,748,'pr_tbr',2,60,-1,1,0,0,54.123,5,0 +'syab07205',206,748,'pr_nni',5,60,10384,42,2,1,60.083,5,1 +'syab07205',206,748,'pr_tbr',3,60,-1,1,0,0,54.103,5,0 +'syab07205',206,748,'pr_nni',6,60,10482,7,1,1,60.014,5,1 +'syab07205',206,748,'pr_tbr',4,60,-1,1,0,0,54.024,5,0 +'syab07205',206,748,'pr_tbr',5,60,-1,1,0,0,54.025,5,0 +'syab07205',206,748,'pr_nni',7,60,10421,31,2,1,60.036,5,1 +'syab07205',206,748,'pr_tbr',6,60,-1,1,0,0,54.031,5,0 +'syab07205',206,748,'pr_nni',8,60,10522,31,2,1,60.087,5,1 +'syab07205',206,748,'pr_tbr',7,60,-1,1,0,0,54.027,5,0 +'syab07205',206,748,'pr_nni',9,60,10473,36,1,1,60.022,5,1 +'syab07205',206,748,'pr_tbr',8,60,-1,1,0,0,54.026,5,0 +'syab07205',206,748,'pr_nni',10,60,10527,41,2,1,60.071,5,1 +'syab07205',206,748,'pr_tbr',9,60,-1,1,0,0,54.074,5,0 +'syab07205',206,748,'pr_tbr',1,60,-1,1,0,0,54.195,5,0 +'syab07205',206,748,'pr_tbr',10,60,-1,1,0,0,54.050,5,0 +'syab07205',206,748,'pr_tbr',2,60,-1,1,0,0,54.020,5,0 +'syab07205',206,748,'pr_tbr',3,60,-1,1,0,0,54.167,5,0 +'syab07205',206,748,'baseline',1,120,10442,100,4,1,112.614,0,0 +'syab07205',206,748,'pr_tbr',4,60,-1,1,0,0,54.049,5,0 
+'syab07205',206,748,'pr_tbr',5,60,-1,1,0,0,54.209,5,0 +'syab07205',206,748,'baseline',2,120,10373,87,4,1,120.081,0,0 +'syab07205',206,748,'pr_tbr',6,60,-1,1,0,0,54.166,5,0 +'syab07205',206,748,'pr_tbr',7,60,-1,1,0,0,54.200,5,0 +'syab07205',206,748,'pr_tbr',8,60,-1,1,0,0,54.200,5,0 +'syab07205',206,748,'baseline',3,120,10490,100,4,1,119.513,0,0 +'syab07205',206,748,'pr_tbr',9,60,-1,1,0,0,54.161,5,0 +'syab07205',206,748,'pr_tbr',10,60,-1,1,0,0,54.161,5,0 +'syab07205',206,748,'baseline',4,120,10371,34,4,1,120.069,0,0 +'syab07205',206,748,'baseline',1,120,10425,100,4,1,112.412,0,0 +'syab07205',206,748,'baseline',5,120,10390,20,4,1,120.031,0,0 +'syab07205',206,748,'baseline',2,120,10432,100,4,1,111.530,0,0 +'syab07205',206,748,'baseline',6,120,10313,12,4,1,120.059,0,0 +'syab07205',206,748,'baseline',3,120,10363,8,4,1,119.719,0,0 +'syab07205',206,748,'baseline',7,120,10425,100,4,1,115.191,0,0 +'syab07205',206,748,'baseline',4,120,10499,16,4,1,116.035,0,0 +'syab07205',206,748,'baseline',8,120,10484,100,4,1,120.082,0,0 +'syab07205',206,748,'baseline',5,120,10389,54,4,1,120.042,0,0 +'syab07205',206,748,'baseline',9,120,10427,5,4,1,117.743,0,0 +'syab07205',206,748,'baseline',6,120,10436,56,4,1,120.074,0,0 +'syab07205',206,748,'baseline',10,120,10453,100,4,1,120.102,0,0 +'syab07205',206,748,'baseline',7,120,10435,100,4,1,115.716,0,0 +'syab07205',206,748,'pr_nni',1,120,10467,7,4,1,119.064,5,1 +'syab07205',206,748,'baseline',8,120,10373,14,4,1,115.910,0,0 +'syab07205',206,748,'pr_nni',2,120,10451,55,4,1,120.024,5,1 +'syab07205',206,748,'baseline',9,120,10426,100,4,1,120.017,0,0 +'syab07205',206,748,'pr_nni',3,120,10493,100,4,1,119.570,5,1 +'syab07205',206,748,'baseline',10,120,10403,82,4,1,120.059,0,0 +'syab07205',206,748,'pr_nni',4,120,10429,100,4,1,116.792,5,1 +'syab07205',206,748,'pr_nni',1,120,10407,100,4,1,115.317,5,1 +'syab07205',206,748,'pr_nni',5,120,10429,3,4,1,120.014,5,1 +'syab07205',206,748,'pr_nni',2,120,10374,100,4,1,117.950,5,1 
+'syab07205',206,748,'pr_nni',6,120,10457,100,4,1,116.620,5,1 +'syab07205',206,748,'pr_nni',3,120,10477,52,3,1,120.035,5,1 +'syab07205',206,748,'pr_nni',7,120,10397,100,4,1,113.963,5,1 +'syab07205',206,748,'pr_nni',4,120,10448,8,3,1,120.060,5,1 +'syab07205',206,748,'pr_nni',8,120,10359,3,4,1,116.396,5,1 +'syab07205',206,748,'pr_nni',5,120,10459,8,3,1,116.031,5,1 +'syab07205',206,748,'pr_nni',9,120,10475,37,4,1,120.054,5,1 +'syab07205',206,748,'pr_nni',6,120,10421,100,4,1,120.119,5,1 +'syab07205',206,748,'pr_nni',10,120,10382,2,4,1,120.080,5,1 +'syab07205',206,748,'pr_nni',7,120,10435,25,4,1,120.061,5,1 +'syab07205',206,748,'pr_tbr',1,120,10452,4,2,1,115.760,5,0 +'syab07205',206,748,'pr_nni',8,120,10404,100,4,1,116.387,5,1 +'syab07205',206,748,'pr_tbr',2,120,10361,24,2,1,120.019,5,0 +'syab07205',206,748,'pr_nni',9,120,10382,30,4,1,120.083,5,1 +'syab07205',206,748,'pr_tbr',3,120,10431,19,2,1,120.062,5,0 +'syab07205',206,748,'pr_nni',10,120,10401,48,4,1,120.018,5,1 +'syab07205',206,748,'pr_tbr',4,120,10480,100,2,1,113.969,5,0 +'syab07205',206,748,'pr_tbr',1,120,10316,12,2,1,120.013,5,0 +'syab07205',206,748,'pr_tbr',5,120,10434,100,2,1,113.233,5,0 +'syab07205',206,748,'pr_tbr',2,120,10431,74,2,1,120.095,5,0 +'syab07205',206,748,'pr_tbr',6,120,10372,13,2,1,120.058,5,0 +'syab07205',206,748,'pr_tbr',3,120,10402,100,2,1,119.026,5,0 +'syab07205',206,748,'pr_tbr',7,120,10440,4,2,1,113.381,5,0 +'syab07205',206,748,'pr_tbr',4,120,10359,4,2,1,112.226,5,0 +'syab07205',206,748,'pr_tbr',8,120,10398,7,2,1,115.180,5,0 +'syab07205',206,748,'pr_tbr',5,120,10373,12,2,1,120.021,5,0 +'syab07205',206,748,'pr_tbr',9,120,10443,56,2,1,120.092,5,0 +'syab07205',206,748,'pr_tbr',6,120,10484,8,2,1,118.688,5,0 +'syab07205',206,748,'pr_tbr',10,120,10438,9,2,1,112.202,5,0 +'syab07205',206,748,'pr_tbr',7,120,10420,100,2,1,115.420,5,0 +'syab07205',206,748,'pr_tbr',8,120,10386,16,2,1,120.074,5,0 +'syab07205',206,748,'pr_tbr',9,120,10440,16,2,1,120.073,5,0 
+'syab07205',206,748,'pr_tbr',10,120,10440,15,2,1,120.013,5,0 diff --git a/dev/benchmarks/t289f_stage5_hamilton.sh b/dev/benchmarks/t289f_stage5_hamilton.sh new file mode 100644 index 000000000..548cc808d --- /dev/null +++ b/dev/benchmarks/t289f_stage5_hamilton.sh @@ -0,0 +1,89 @@
+#!/bin/bash
+#SBATCH --job-name=t289f-pr-nni
+#SBATCH -p shared
+#SBATCH -n 1
+#SBATCH --mem=8G
+#SBATCH --time=8:00:00
+#SBATCH --output=/nobackup/%u/TreeSearch/logs/t289f_%j.out
+#SBATCH --error=/nobackup/%u/TreeSearch/logs/t289f_%j.err
+
+# T-289f: Prune-reinsert Stage 5 — NNI full-tree polish cost reduction
+#
+# Compares: baseline (no PR) vs pr_nni (NNI polish) vs pr_tbr (TBR polish)
+# on the same 5 large-tree datasets as Stage 4 (131-206 tips).
+#
+# Builds from origin/cpp-search (pruneReinsertNni merged via PR #238).
+#
+# Grid: 5 datasets x 3 configs x 2 budgets x 10 seeds = 300 runs
+# Expected wall time: ~4-6h; 8h limit provides comfortable margin.
+#
+# Exits non-zero if the checkout reset, build, install or benchmark fails.
+
+module load r/4.5.1
+module load gcc/14.2
+
+export OMP_NUM_THREADS=1
+export OPENBLAS_NUM_THREADS=1
+
+REPO=/nobackup/$USER/TreeSearch-a
+LIB=/nobackup/$USER/TreeSearch/lib
+OUTDIR=/nobackup/$USER/TreeSearch/t289f_results
+# Prepend the job library; omit the ':' separator when R_LIBS is unset or empty
+export R_LIBS="$LIB${R_LIBS:+:$R_LIBS}"
+
+mkdir -p "$LIB" "$OUTDIR" "/nobackup/$USER/TreeSearch/logs"
+
+echo "=== T-289f Hamilton job (PR Stage 5 — NNI Polish) ==="
+echo "Job ID: $SLURM_JOB_ID"
+echo "Node: $(hostname)"
+echo "Started: $(date)"
+echo ""
+
+# Install CRAN dependencies if missing
+Rscript --no-save -e "
+  lib <- '$LIB'
+  .libPaths(c(lib, .libPaths()))
+  pkgs <- c('abind', 'ape', 'cli', 'colorspace', 'fastmatch', 'Rdpack', 'TreeDist', 'TreeTools')
+  need <- pkgs[!vapply(pkgs, requireNamespace, logical(1), quietly = TRUE)]
+  if (length(need) > 0) {
+    message('Installing: ', paste(need, collapse = ', '))
+    install.packages(need, lib = lib, repos = 'https://cloud.r-project.org',
+                     dependencies = NA, quiet = TRUE)
+  } else { message('All dependencies present.') }
+"
+
+# Rebuild from cpp-search (pruneReinsertNni merged via PR #238).
+# A failed fetch only warns (the last-fetched origin/cpp-search is used);
+# a failed reset aborts, since the checkout would be in an unknown state.
+cd "$REPO" || exit 1
+git fetch origin cpp-search ||
+  echo "WARNING: git fetch failed; using last-fetched origin/cpp-search" >&2
+git reset --hard origin/cpp-search || exit 1
+echo "Git HEAD: $(git log --oneline -1)"
+
+echo "Rebuilding TreeSearch..."
+rm -f src/*.o src/*.so src/*.dll
+# Drop tarballs left by earlier runs so the glob below matches only this build
+rm -f TreeSearch_*.tar.gz
+# rc is the build's status if the build fails, otherwise the install's status
+R CMD build --no-build-vignettes --no-manual --no-resave-data . &&
+  R CMD INSTALL --library="$LIB" TreeSearch_*.tar.gz
+rc=$?
+rm -f TreeSearch_*.tar.gz
+echo "Build/install exit code: $rc"
+if [ "$rc" -ne 0 ]; then
+  echo "FATAL: build or install failed"
+  exit 1
+fi
+
+# Run benchmark; keep its status so it can be returned at the end
+cd "$OUTDIR" || exit 1
+Rscript "$REPO/dev/benchmarks/bench_pr_stage5_nni.R" "$OUTDIR"
+bench_rc=$?
+
+echo ""
+echo "Completed: $(date)"
+ls -lh "$OUTDIR"/t289f_*.csv 2>/dev/null
+
+# Return the benchmark's status to SLURM instead of that of `ls`
+exit "$bench_rc"
diff --git a/dev/benchmarks/tnt_bench_new_ts.csv b/dev/benchmarks/tnt_bench_new_ts.csv new file mode 100644 index 000000000..fd7260e9c --- /dev/null +++ b/dev/benchmarks/tnt_bench_new_ts.csv @@ -0,0 +1,43 @@ +"dataset","n_taxa","n_chars","seed","timeout_s","ts_score","ts_trees","ts_wall_s","ts_reps","ts_hits" +"Longrich2010",20,93,1,10,131,100,0.11,11,3 +"Longrich2010",20,93,2,10,131,90,0.41,8,2 +"Longrich2010",20,93,3,10,131,100,0.09,8,2 +"Vinther2008",23,57,1,10,78,66,0.34,7,3 +"Vinther2008",23,57,2,10,78,66,0.33,19,4 +"Vinther2008",23,57,3,10,78,66,0.36,13,4 +"Sansom2010",23,109,1,10,188,1,0.25,20,1 +"Sansom2010",23,109,2,10,188,1,0.23,11,2 +"Sansom2010",23,109,3,10,189,31,0.22,11,3 +"DeAssis2011",33,50,1,10,64,100,0.14,5,5 +"DeAssis2011",33,50,2,10,64,100,0.13,5,5 +"DeAssis2011",33,50,3,10,64,100,0.12,5,5 +"Aria2015",35,50,1,10,142,100,0.41,10,4 +"Aria2015",35,50,2,10,142,100,0.65,14,5 +"Aria2015",35,50,3,10,142,100,0.31,8,5 +"Wortley2006",37,105,1,10,488,3,2.72,20,1 +"Wortley2006",37,105,2,10,487,2,2.49,20,1 +"Wortley2006",37,105,3,10,486,1,2.59,20,1 +"Griswold1999",43,137,1,10,394,13,0.81,6,2 +"Griswold1999",43,137,2,10,394,20,1.11,8,5 +"Griswold1999",43,137,3,10,394,8,1.46,10,2 +"Schulze2007",52,58,1,10,155,100,1.17,11,4 +"Schulze2007",52,58,2,10,155,100,0.7,9,5 +"Schulze2007",52,58,3,10,155,100,0.75,8,5
+"Eklund2004",54,131,1,10,440,100,6.39,20,1 +"Eklund2004",54,131,2,10,441,100,5.69,20,2 +"Eklund2004",54,131,3,10,441,100,3.08,10,1 +"Agnarsson2004",62,242,1,10,765,1,1.29,5,5 +"Agnarsson2004",62,242,2,10,765,1,1.24,5,5 +"Agnarsson2004",62,242,3,10,765,1,1.28,5,5 +"Zanol2014",74,213,1,10,1271,1,10,4,1 +"Zanol2014",74,213,2,10,1272,1,10,4,1 +"Zanol2014",74,213,3,10,1266,1,10,4,1 +"Zhu2013",75,253,1,10,636,1,10,4,1 +"Zhu2013",75,253,2,10,635,1,10,5,1 +"Zhu2013",75,253,3,10,631,1,10.01,5,1 +"Giles2015",78,236,1,10,676,1,10,6,1 +"Giles2015",78,236,2,10,675,1,10.02,6,1 +"Giles2015",78,236,3,10,674,1,10,5,1 +"Dikow2009",88,220,1,10,1606,3,10.01,4,3 +"Dikow2009",88,220,2,10,1606,1,10.02,3,1 +"Dikow2009",88,220,3,10,1606,1,10,4,1 diff --git a/dev/benchmarks/tnt_disassembly_analysis.md b/dev/benchmarks/tnt_disassembly_analysis.md new file mode 100644 index 000000000..96d520da9 --- /dev/null +++ b/dev/benchmarks/tnt_disassembly_analysis.md @@ -0,0 +1,166 @@ +# TNT vs TreeSearch: Fitch Kernel Disassembly Comparison (T-250) + +Date: 2026-03-26 + +## Scope limitation + +**This analysis covers the native Windows TNT binary only.** The TNT +download page explicitly labels the Windows build as "[32 bits]". The +Mac, Linux, and Cygwin builds are compiled as 64-bit (Goloboff & Morales +2023). The 64-bit builds likely use wider registers and may include SIMD +or hardware `popcnt` — the "~4× throughput advantage" conclusion below +does **not** generalize to 64-bit TNT. Hamilton HPC benchmarks (T-249) +will run against the 64-bit Linux TNT and may show a very different +implementation-level gap. + +## TNT Binary Profile (Windows, 32-bit) + +- **File:** `C:/Programs/Phylogeny/tnt/TNT-bin/tnt.exe` (3.1 MB) +- **Format:** PE32 (32-bit i386), stripped (no symbols) +- **SIMD:** None. Zero xmm/ymm register references, zero popcnt instructions. 
+- **Code section:** `AUTO` — 2.4 MB, ~721K disassembly lines + +## TreeSearch DLL Profile + +- **File:** `.agent-E/TreeSearch/libs/x64/TreeSearch.dll` (1.8 MB) +- **Format:** PE32+ (64-bit x86-64), stripped +- **SIMD:** SSE2 (128-bit). 1281 integer SIMD ops (pand/por/pxor/pcmpeq), + 16472 xmm register references (includes scalar double FP). Zero ymm (no AVX2). +- **Popcount:** Software Hamming weight (0x5555.../0x3333.../0x0f0f... shift-mask + pattern). No hardware `popcnt` instruction. + +## Comparison Table + +| Feature | TNT | TreeSearch | +|---------|-----|------------| +| Architecture | 32-bit i386 | 64-bit x86-64 | +| Word size | 32-bit | 64-bit | +| SIMD for Fitch | None | SSE2 (128-bit `pand`/`por`) | +| Popcount | 64KB lookup table (two 16-bit halves) | Software Hamming weight (shift+mask) | +| Hardware `popcnt` | No | No | +| AVX2 | No | No | +| Bits/inner-loop iteration | 32 | 128 (2 × uint64 via `movdqu`/`pand`/`por`) | + +## TNT Fitch Kernel (0x420c04) + +The main scoring loop at 0x420c04–0x420c77 is a single-pass design: + +``` +loop: + dec counter; cmp -1; je exit // iterate over character words + mov 0x4(%esi),%eax // load left child state (32 bits) + mov 0x4(%ebx),%ecx // load right child state + add $0x4,%ebx; add $0x4,%esi // advance pointers (stride 4 = 32 bits) + not %eax; and %ecx,%eax // ~left & right = "extra states in right" + je skip // if zero, no extra states + mov %ecx,%edx; xor %eax,%edx // right XOR extra = intersection + push %eax; call 0x5c9f30 // popcount(extra) via 64KB LUT + mov %edx,(%ebx) // store intersection + sub %eax,(%edx) // adjust score counter +skip: + [symmetric check: ~result & left] + jmp loop +``` + +The popcount function at 0x5c9f30 splits a 32-bit value into two 16-bit halves +and uses a 64KB lookup table at 0x718dbd: + +``` +mov 0x8(%ebp),%edx // arg +mov %edx,%eax +and $0xffff,%eax // low 16 bits +shr $0x10,%edx // high 16 bits +mov 0x718dbd(%eax),%al // table[low] +add 0x718dbd(%edx),%al // + table[high] 
+movsbl %al,%eax +ret +``` + +**Key characteristics:** +- Processes one 32-bit word per iteration +- NOT+AND pattern (computes "extra states" directly) rather than AND then check-zero +- Includes a symmetric second check (right-to-left and left-to-right in the same loop body) +- Function call for popcount (not inlined) +- Branch per character word (`je skip`) + +## TreeSearch Fitch Kernel + +### Indirect scoring (TBR inner loop — 72% of wall time at 180 tips) + +`any_hit_reduce3()` in `ts_simd.h` is the critical inner function: + +```cpp +v128 acc = zero128(); +for (; s + 2 <= n_states; s += 2) { + v128 vc = loadu128(&clip[s]); // 128-bit load (2 × uint64) + v128 va = loadu128(&a[s]); + v128 vb = loadu128(&b[s]); + acc = or128(acc, and128(vc, or128(va, vb))); // clip & (a | b) +} +``` + +Compiled to: +``` +movdqu (%r8,%rax),%xmm0 // load 128 bits from clip +movdqu (%r9,%rax),%xmm2 // load 128 bits from a|b (pre-computed or inline) +add $0x10,%rax // stride 16 = 128 bits +pand %xmm2,%xmm0 // 128-bit AND +por %xmm0,%xmm1 // OR accumulate +cmp %rax,%rdx +jne loop +``` + +**Key characteristics:** +- Processes 128 bits per iteration (2 × uint64) +- SSE2 `pand`/`por` for bit operations +- Branchless within the character loop (no per-word branching) +- `popcount64()` on the result mask (software Hamming weight) + +### Downpass (`fitch_downpass_node`) + +Two-pass design: +1. **Pass 1:** `any_hit_reduce()` — tight SSE2 `pand`+`por` loop to determine + which characters have intersection (single 64-bit mask) +2. **Pass 2:** Broadcast mask + SSE2 select — no per-character branching + +## Implications + +### TNT's speed advantage is NOT implementation-level (Windows 32-bit) + +On Windows, TreeSearch has a **~4× raw Fitch throughput advantage** (128-bit +SSE2 vs 32-bit scalar). Yet TNT converges 3–5× faster on the same datasets. +This means — at minimum on Windows: + +1. **TNT's advantage is purely strategic** — fewer candidates evaluated, + more effective heuristics, or both. 
+2. **T-246 (AVX2)** would double TreeSearch's throughput from 128→256 bits + (and could add hardware `popcnt`). This is still worthwhile for absolute + speed, but it won't close the strategic gap with TNT. +3. **T-251 (trajectory analysis) is the higher-priority investigation** — + understanding *how many* candidates TNT evaluates per score improvement + will reveal whether the gap is in candidate pruning, search ordering, + or phase composition. + +### Minor optimization opportunities + +- **Hardware `popcnt`:** Neither program uses it. Adding `-mpopcnt` to + TreeSearch's compile flags (or runtime dispatch) would replace the + ~10-instruction software Hamming weight with a single `popcntq`. This + affects step counting after each `any_hit_reduce`, not the inner loop + itself, but could save ~5–10% of scoring time. +- **TNT's popcount is worse:** The 64KB LUT + function call overhead is + significantly more expensive than TreeSearch's inlined shift-mask. + This further confirms TNT's advantage is strategic. + +### What to investigate next + +The round 2 data shows TNT completing 50+ trees in 7–27s while TreeSearch +takes 45–110s for similar scores. If TreeSearch's per-candidate scoring is +faster, TNT must be evaluating far fewer candidates to achieve the same +result — either through better candidate pruning (e.g., more aggressive +clip skipping, smarter regraft ordering) or through phases that escape +local optima more efficiently (more effective ratchet/drift parameters). + +T-249 (rerun comparison) and T-251 (trajectory analysis) should focus on +comparing **total candidates evaluated** and **score improvement per candidate** +rather than wall-clock timing. 
diff --git a/dev/benchmarks/tnt_trajectory_analysis.md b/dev/benchmarks/tnt_trajectory_analysis.md new file mode 100644 index 000000000..8ce0526a0 --- /dev/null +++ b/dev/benchmarks/tnt_trajectory_analysis.md @@ -0,0 +1,226 @@ +# T-251: TNT vs TreeSearch Trajectory Analysis + +Date: 2026-03-26 + +## Executive Summary + +TreeSearch's score gap with TNT (3–21 steps on gap datasets) arises from two +compounding factors: + +1. **Per-evaluation overhead**: TNT evaluates 1.5–3.6× more rearrangements + per second than TreeSearch, despite TreeSearch having wider SIMD (SSE2 + 128-bit vs TNT's 32-bit scalar on Windows). The overhead is in data + structure manipulation, not the Fitch kernel. + +2. **Phase allocation**: TreeSearch spends 16–23% of wall time on drift, + which has extremely poor return (405–1498 ms per step gained). TNT's + `xmult` is dominated by sectorial search, which is far more cost-effective. + +## Methodology + +Three datasets with the largest persistent score gaps (from T-249) were +compared at 30-second budgets, 3 seeds each, EW scoring, inapplicable +tokens treated as missing: + +| Dataset | Tips | Chars | Gap (TS − TNT) | +|---------|:----:|:-----:|:---:| +| Geisler2001 | 68 | 186 | 5–9 | +| Zhu2013 | 75 | 253 | 4–6 | +| Wortley2006 | 37 | 105 | 3–4 | + +TNT: console-mode Windows 32-bit (v1.6, 2026-02-20), `xmult=hits 10 +replic 100`. TreeSearch: cpp-search HEAD, `ts_driven_search()` with +default strategy parameters, `verbosity=2`. + +**Caveat:** TNT on Windows is 32-bit; Hamilton benchmarks will use the +64-bit Linux build which may have different throughput characteristics. +The per-evaluation throughput ratios below may not hold on Linux. + +## Per-Evaluation Throughput + +TNT's total rearrangements are reported directly. TreeSearch's +per-evaluation rate was measured via `ts_tbr_search()` on a single Wagner +→ TBR convergence. 
+ +| Dataset | TNT M evals/s | TS M evals/s | TNT/TS ratio | +|---------|:---:|:---:|:---:| +| Geisler2001 (68t) | 16.5 | 10.9 | 1.5× | +| Zhu2013 (75t) | 27.9 | 13.9 | 2.0× | +| Wortley2006 (37t) | 12.2 | 3.4 | 3.6× | + +The gap is larger at smaller tree sizes, where the Fitch kernel is a +smaller fraction of per-evaluation cost and overhead dominates. + +T-250 showed TreeSearch's Fitch kernel processes 128 bits per SIMD +iteration vs TNT's 32 bits — a ~4× raw throughput advantage. Yet TNT +evaluates more total rearrangements per second. This means TreeSearch's +**per-evaluation overhead** (undo stack management, data structure +traversal, incremental scoring setup) exceeds TNT's by 6–14×, completely +negating the SIMD advantage. + +## Total Rearrangements (30s budget) + +| Dataset | TNT total evals | TS est. total evals | TNT/TS ratio | +|---------|:---:|:---:|:---:| +| Geisler2001 | 499M | ~210M (est.) | ~2.4× | +| Zhu2013 | 796M | ~280M (est.) | ~2.8× | +| Wortley2006 | 104M | ~54M (est.) | ~1.9× | + +TS estimates based on TBR throughput × phase time allocation. TNT +examines roughly twice as many candidates in the same wall time. + +## Phase Cost Efficiency + +TreeSearch phase efficiency = ms of wall time per step of score +improvement. Lower is better. Averaged over 3 seeds per dataset. 
+ +### Geisler2001 (68 taxa) + +| Phase | Time (ms) | Steps gained | ms/step | % of time | +|-------|:---------:|:---:|:---:|:---:| +| TBR | 1773 | 2397 | 0.8 | 3% | +| CSS | 1408 | 154 | 9.1 | 3% | +| RSS | 863 | 49 | 18 | 2% | +| XSS | 1502 | 97 | 20 | 3% | +| Ratchet | 34616 | 1070 | 34 | 63% | +| **Drift** | **11843** | **8** | **1498** | **22%** | + +### Zhu2013 (75 taxa) + +| Phase | Time (ms) | Steps gained | ms/step | % of time | +|-------|:---------:|:---:|:---:|:---:| +| TBR | 1574 | 3321 | 0.5 | 3% | +| XSS | 2028 | 367 | 5.7 | 4% | +| CSS | 1372 | 107 | 14 | 3% | +| RSS | 833 | 46 | 18 | 2% | +| Ratchet | 33710 | 765 | 44 | 62% | +| **Drift** | **12695** | **10** | **1270** | **23%** | + +### Wortley2006 (37 taxa) + +| Phase | Time (ms) | Steps gained | ms/step | % of time | +|-------|:---------:|:---:|:---:|:---:| +| TBR | 1100 | 2655 | 0.4 | 2% | +| XSS | 1652 | 376 | 4.5 | 3% | +| CSS | 1332 | 226 | 6.1 | 3% | +| RSS | 883 | 83 | 11 | 2% | +| Ratchet | 35945 | 2058 | 18 | 72% | +| **Drift** | **7989** | **22** | **405** | **16%** | + +**Pattern:** By the ms/step columns above, drift is roughly 22–44× less +efficient than the next-worst phase (ratchet) in each of the three datasets. + +## TNT's Search Structure + +TNT's `xmult` trajectory reveals a fundamentally different phase +composition from TreeSearch's pipeline: + +**Geisler2001 (30s, seed 1):** TNT reports 30 sub-replicate results +across 7 replicates. Algorithm breakdown: +- SECT (sectorial search): ~20 entries +- TBR: ~8 entries +- FUSE: ~2 entries + +TNT hits score 1293 within replicate 0 (3 seconds, 56M rearrangements) +via TBR following sectorial search. Subsequent replicates hover around +1293–1303, with sectorial search and fusing maintaining the best score. + +TreeSearch hits 1298 as its best single-replicate score (replicate 10, +after ~14s of cumulative search time). No replicate reaches 1293. + +**Key structural differences:** + +1. 
**TNT does extensive sectorial search within each replicate.** Each TNT + replicate includes multiple rounds of sectorial search + TBR before + moving to the next Wagner start. TreeSearch does one pass of + XSS+RSS+CSS per outer cycle. + +2. **TNT's replicates are longer and more productive.** TNT completes ~7 + replicates in 30s on Geisler2001 (~4.3s each), with each replicate + including intensive sectorial + TBR + fuse. TreeSearch completes 14–19 + replicates (~1.5–2s each), but each is shallower. + +3. **TNT fuses frequently within the search.** The FUSE entries in TNT's + trajectory show tree fusing as an integrated part of the search cycle, + not a separate post-search step. + +## Per-Replicate Score Quality + +Median per-replicate score (the typical quality of a single search from +a random Wagner start): + +| Dataset | TNT median rep | TS median rep | TNT advantage | +|---------|:---:|:---:|:---:| +| Geisler2001 | ~1297 | 1313 | 16 steps | +| Zhu2013 | ~626 | 636 | 10 steps | +| Wortley2006 | ~487 | 488 | 1 step | + +TNT achieves better per-replicate scores, which means its intra-replicate +search (sectorial + TBR) is more thorough. + +## TreeSearch Per-Replicate Trajectory + +**Geisler2001 (seed 1):** 15 replicates +- Rep 1: 1349 (Wagner 1678 → TBR → Ratchet → Drift) +- Rep 2: 1308 (improvement) +- Rep 5: 1304 +- Rep 10: 1298 (best found) +- Rep 15: 1327 (no improvement in last 5 reps) + +Score improves from 1349 → 1298 over the first 10 of 15 replicates (51 steps). TNT +improves from ~1298 → 1293 within a single replicate. + +## Recommendations + +### High priority: Eliminate or drastically reduce drift + +Drift consumes 16–23% of search time but contributes <1% of score +improvement. At 405–1498 ms per step gained, it is roughly 22–44× less +efficient than the next-worst phase (ratchet, 18–44 ms per step). + +**Proposed change:** Set `driftCycles = 0` in the default preset. +Reallocate the saved time to additional ratchet cycles or sectorial +search rounds. 
The `thorough` preset (with many more base cycles) could +retain 1–2 drift cycles as a diversity mechanism. + +Expected impact: ~20% wall-time savings with negligible score loss. +Equivalent to adding ~4 more replicates per 30s budget. + +### Medium priority: Increase sectorial search intensity + +TNT's dominance of sectorial search (SECT appears in ~67% of trajectory +entries) suggests TreeSearch's single-pass XSS+RSS+CSS is insufficient. +Currently sectorial search takes only 6–10% of wall time but has +respectable efficiency (5–20 ms/step). + +**Proposed change:** Increase sectorial search rounds. Options: +- Double `xssRounds` and `rssRounds` within each outer cycle +- Add a second sectorial search pass after ratchet (currently + sectorial → ratchet → drift → TBR; change to + sectorial → ratchet → sectorial → TBR) +- Increase `sectorMaxSize` to capture more of the tree in each sector + +### Medium priority: Reduce per-evaluation overhead + +The 1.5–3.6× per-evaluation throughput gap means every search phase is +penalized. Likely targets: +- Undo stack management in TBR (PreallocUndo grow/shrink) +- Incremental scoring setup cost (even when not finding improvements) +- Collapsed-flag recomputation (O(n) per move, even when 0% collapsed) + +This is a deeper engineering effort (T-245/T-246 overlap) but has the +broadest impact since it accelerates every phase. + +### Low priority: Ratchet tuning + +Ratchet is the most time-consuming phase (62–72%) and mid-tier in +efficiency. The current 12 cycles at 25% perturbation may be too many; +diminishing returns likely set in after 6–8 cycles. The adaptive level +mechanism already scales this down when hit rates are high, but the +base count could be reduced for the default preset. 
+ +## Data Files + +- `bench_trajectory.R` — comparison script +- `trajectory_results.rds` — raw results (3 datasets × 3 seeds) +- `tnt_trajectory_analysis.md` — this document diff --git a/dev/benchmarks/trajectory_results.rds b/dev/benchmarks/trajectory_results.rds new file mode 100644 index 000000000..b6c6b2815 Binary files /dev/null and b/dev/benchmarks/trajectory_results.rds differ diff --git a/dev/benchmarks/vtune_tbr_analysis.md b/dev/benchmarks/vtune_tbr_analysis.md new file mode 100644 index 000000000..f6d2c68b5 --- /dev/null +++ b/dev/benchmarks/vtune_tbr_analysis.md @@ -0,0 +1,149 @@ +# T-260: VTune TBR Per-Evaluation Overhead Analysis + +**Date:** 2026-03-26 +**Agent:** E +**CPU:** Intel Core i7-10700 @ 2.90 GHz (Comet Lake, 10th gen) +**Sampling:** User-mode software sampling (VTune 2025.10) +**Dataset:** Dikow2009 (88 tips, EW parsimony) +**Workload:** 50 random starts × (Wagner → NNI → 20 TBR passes) = 1000 TBR passes +**Total CPU time:** 30.96s (of which TreeSearch.dll = 23.71s = 76.6%) + +## Module breakdown + +| Module | CPU Time | % | +|--------|:--------:|:-:| +| TreeSearch.dll | 23.71s | 76.6% | +| ucrtbase.dll | 6.00s | 19.4% | +| R.dll | 1.10s | 3.6% | +| Other | 0.15s | 0.5% | + +## Top hotspots (TreeSearch.dll + attributed ucrtbase) + +### By logical category + +| Category | Time | % of total | Key functions | +|----------|:----:|:----------:|---------------| +| **Full NA-aware scoring** | 9.03s | 29.2% | `fitch_na_score` (includes NNI path: 3.62s) | +| **StateSnapshot save/restore** | 4.53s | 14.6% | `save` 2.15s, `restore` 1.97s, `restore_prealloc_undo` 0.18s (memcpy in ucrtbase) | +| **Incremental scoring** | 2.28s | 7.4% | `fitch_na_indirect_length_cached` 1.02s, `fitch_na_pass3_score` 0.89s, `fitch_na_indirect_length_bounded` 0.37s | +| **Tip state reloading** | 1.62s | 5.2% | `load_tip_states` (called from `reset_states` → `full_rescore`) | +| **SIMD bit ops** | ~2.0s | 6.5% | `any_hit_reduce` 1.60s, `or_reduce` 0.21s, 
`any_hit_reduce3` 0.31s | +| **Buffer zeroing** | ~1.20s | 3.9% | `std::fill` in `reset_states()` — zeroes prelim, final_, down2, subtree_actives, local_cost | +| **TBR orchestration** | ~1.9s | 6.1% | `tbr_search` 1.06s, `precompute_vroot_cache` 0.46s, `fitch_join_states` 0.13s, `collect_main_edges` 0.11s, `validate_topology` 0.07s, `fast_hash` 0.06s | +| **Data setup** | ~0.9s | 2.9% | `count_state_occurrences` 0.64s, `simplify_patterns` 0.12s, `build_dataset` 0.12s | +| **Memory management** | ~0.8s | 2.6% | `malloc_base` 0.77s | +| **popcount** | ~0.43s | 1.4% | `popcount64` (multiple sites) | +| **Hash set destructor** | 0.14s | 0.4% | `unordered_set::~unordered_set` (TBR tabu set) | + +### TBR-only breakdown (excluding NNI scoring) + +Subtracting the NNI path (3.62s fitch_na_score + proportional overhead), the +TBR-specific budget is approximately: + +| TBR phase | Time | % of TBR | +|-----------|:----:|:--------:| +| Full rescore scoring (`fitch_na_score`) | 5.41s | 28% | +| StateSnapshot save/restore | 4.53s | 23% | +| Incremental candidate screening | 2.28s | 12% | +| Buffer zeroing (`std::fill` in `reset_states`) | ~1.20s | 6% | +| Tip reloading (`load_tip_states`) | 1.60s | 8% | +| TBR orchestration | ~1.9s | 10% | +| SIMD / popcount / other | ~2.5s | 13% | +| **Total TBR** | **~19.4s** | **100%** | + +## Key finding: `full_rescore` overhead + +Every TBR candidate that passes incremental screening triggers: + +1. `state_snap.save()` — memcpy ~190 KB (5 arrays × n_node × total_words) +2. `apply_tbr_move()` — modifies topology + states +3. `full_rescore()` = `reset_states()` + `score_tree()` + - `reset_states()`: 5× `std::fill(0)` + `load_tip_states()` + - `score_tree()`: `fitch_na_score()` (full 3-pass) +4. 
If rejected: `state_snap.restore()` — memcpy ~190 KB back + +**The non-scoring overhead of a single candidate evaluation +(save + zero + load_tips + restore) totals 7.35s = 37.8% of TBR time.** + +The snapshot mechanism itself (save+restore = 4.53s) is an optimization +over the alternative (re-running `full_rescore` after rejection). But the +`reset_states()` step — zeroing all arrays before the downpass overwrites +them — is likely unnecessary since the Fitch downpass will recompute all +internal node values from tips up. + +## Top 3 actionable hotspots + +### 1. StateSnapshot save/restore — 14.6% (4.53s) + +**What:** Full-array memcpy of prelim, final_, down2, subtree_actives, +local_cost, and postorder before each candidate evaluation. Restore copies +everything back when the move is rejected. + +**Why it's expensive:** At 88 tips: n_node=175, total_words≈30 → each +state array is ~42 KB. With 5 arrays + cost array + postorder, each +save/restore copies ~190 KB. At 180 tips, this doubles. + +**Potential fixes:** +- **Selective save/restore**: Only save nodes affected by the TBR move + (the clip subtree path + regraft path to root). Requires tracking dirty + nodes in `apply_tbr_move()`. +- **Copy-on-write / versioned arrays**: Use generation counters instead + of bulk copy. +- **Eliminate the need**: If `full_rescore()` is made cheaper (see #2), + the restore path could simply re-run scoring instead of restoring from + snapshot. + +### 2. `reset_states()` (zero + reload tips) — 9.1% (2.82s) + +**What:** `full_rescore()` calls `reset_states()` which zeroes all 5 state +arrays then copies tip data back from the dataset. This runs before every +`score_tree()`. + +**Why it may be unnecessary:** The Fitch downpass computes every internal +node's `prelim` from its children's values (bottom-up), overwriting whatever +was there. The uppass similarly overwrites `final_`. 
The zeroing is only +needed if the scoring algorithm reads uninitialized memory — but if the +postorder traversal visits every internal node, it never does. + +**Potential fix:** Replace `reset_states()` with just `load_tip_states()`. +Verify that the NA-aware passes (down2, subtree_actives) also fully +overwrite internal nodes during their traversals. If they do, save 3.9% +immediately (the std::fill cost) and reduce tip loading to only the +arrays that aren't fully recomputed. + +### 3. `fitch_na_score` as authoritative rescore — 29.2% (9.03s) + +**What:** The full 3-pass NA-aware Fitch algorithm is called for every +candidate that passes incremental screening. This is the authoritative +score used to accept/reject moves. + +**Why it dominates:** It's the core algorithm — this is expected. But +it's called much more often than strictly necessary because incremental +scoring is only a screening heuristic. + +**Potential fixes:** +- **Improve incremental accuracy**: If incremental scoring matched + full-rescore more closely, fewer candidates would need full evaluation. + Currently ~every clip with a viable candidate triggers full_rescore. +- **Deferred full rescore**: Accept based on incremental score, batch + full rescores periodically (risk: score drift). +- **This is also addressed indirectly by fixes #1 and #2**: reducing + the per-evaluation overhead means each full_rescore call is cheaper. + +## Estimated impact of fixes + +| Fix | Savings | Effort | +|-----|:-------:|:------:| +| Eliminate `std::fill` in `reset_states` | ~3.9% (~1.2s) | Low — verify NA invariants, remove 5 fill calls | +| Selective StateSnapshot (save/restore only dirty nodes) | ~10–12% (~3–4s) | Medium — track dirty set in apply_tbr_move | +| Reduce `load_tip_states` scope (only reload modified arrays) | ~2–3% (~0.6–0.9s) | Low — check which tip arrays are read by scoring | +| **Combined** | **~16–19%** | — | + +## Raw VTune data + +Results stored in `vtune-tbr-out/` (gitignored). 
Regenerate with: +```bash +"C:/Program Files (x86)/Intel/oneAPI/vtune/latest/bin64/vtune.exe" \ + -collect hotspots -result-dir vtune-tbr-out \ + -- Rscript dev/vtune-tbr-driver.R +``` diff --git a/dev/vtune-tbr-driver.R b/dev/vtune-tbr-driver.R new file mode 100644 index 000000000..9e8dce95d --- /dev/null +++ b/dev/vtune-tbr-driver.R @@ -0,0 +1,44 @@
+# VTune driver: TBR inner loop profiling at 88 tips (Dikow2009)
+# Target: ~15-30s of pure TBR evaluation time
+#
+# TreeSearch is loaded from the relative library path ".vtune-lib", so run
+# this script from the directory that contains that library.
+library(TreeSearch, lib.loc = ".vtune-lib")
+library(TreeTools)
+
+data(inapplicable.phyData)
+dataset <- inapplicable.phyData$Dikow2009
+
+# Pull the contrast matrix, weights and levels from the dataset's attributes;
+# tip_data is built with one row per element of `dataset`
+at <- attributes(dataset)
+contrast <- at$contrast
+tip_data <- matrix(unlist(dataset, use.names = FALSE),
+                   nrow = length(dataset), byrow = TRUE)
+weight <- at$weight
+levels <- at$levels
+
+# Workload size: the loop bounds and the summary line both use these
+n_starts <- 50L
+n_passes <- 20L
+
+set.seed(5813)
+t0 <- proc.time()
+
+# Many random starts: Wagner → NNI → TBR chain
+# Each start exercises the full TBR evaluation pipeline
+for (start_i in seq_len(n_starts)) {
+  wag <- TreeSearch:::ts_random_wagner_tree(contrast, tip_data, weight, levels)
+  nni <- TreeSearch:::ts_nni_search(wag$edge, contrast, tip_data, weight, levels)
+  edge <- nni$edge
+  for (pass in seq_len(n_passes)) {
+    res <- TreeSearch:::ts_tbr_search(
+      edge, contrast, tip_data, weight, levels,
+      maxHits = 1L, acceptEqual = FALSE
+    )
+    edge <- res$edge
+  }
+}
+
+elapsed <- (proc.time() - t0)["elapsed"]
+cat("Elapsed:", round(elapsed, 1), "s (", n_starts * n_passes, "TBR passes)\n")
diff --git a/inst/CITATION b/inst/CITATION index cd2747039..96d8bd242 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,35 +1,15 @@ +vers <- meta$Version mrs <- person(c("Martin", "R."), "Smith", email = "martin.smith@durham.ac.uk") -bibentry( - mheader = paste0( - "Please acknowledge your use of TreeSearch and MorphyLib in any article ", +citHeader(paste0( + "Please acknowledge your use of TreeSearch in any article ", "in which they are used, and cite Smith (2023) and Brazeau et al. (2019). 
", "For example, 'Phylogenetic search was performed using the R package ", - "TreeSearch v", meta$Version, " (Smith 2023), which uses MorphyLib ", - "(Brazeau et al. 2017) to handle inapplicable data (Brazeau et al. 2019)'. ", + "TreeSearch v", meta$Version, " (Smith 2023), which uses an approximate correction for ", + " inapplicable data (Brazeau et al. 2019)'. ", "The GUI recommends further citations for each method it employs." - ), - bibtype = "Manual", - key = "Brazeau2017", - title = paste0( - "MorphyLib: a library for phylogenetic analysis of categorical trait data ", - "with inapplicability" - ), - author = c( - person(c("Martin", "D."), "Brazeau"), - mrs, - person("Thomas", "Guillerme") - ), - year = 2017, - note = "Version 0.0.1-alpha", - - textVersion = paste0( - "Brazeau, M.D., Smith, M.R. & Guillerme, T. (2017). ", - "MorphyLib: a library for phylogenetic analysis of categorical ", - "trait data with inapplicability. doi: 10.5281/zenodo.815371" - ) -) - + )) + bibentry( bibtype = "Article", key = "Brazeau2019", @@ -86,7 +66,6 @@ bibentry( textVersion = paste0( "Smith (2023). TreeSearch: morphological phylogenetic analysis in R. ", - "R journal 14:305-315. doi:10.32614/RJ-2023-019 -.32614/RJ-2023-019" + "R journal 14:305-315. 
doi:10.32614/RJ-2023-019" ) ) diff --git a/inst/Parsimony/app.R b/inst/Parsimony/app.R deleted file mode 100644 index b6b6de6f2..000000000 --- a/inst/Parsimony/app.R +++ /dev/null @@ -1,3681 +0,0 @@ -# options("TreeSearch.logging" = TRUE) # Log function entry and exit -# options("TreeSearch.write.code" = TRUE) # Show code as it is written to log -logging <- isTRUE(getOption("TreeSearch.logging")) -options(shiny.maxRequestSize = 1024 ^ 3) # Allow max 1 GB files - - -library("methods", exclude = c("show", "removeClass")) -library("cli") -library("TreeSearch") # load now: inapplicable.datasets required within ui -.DateTime <- function() { # Copy, because not exported - format(Sys.time(), "%Y-%m-%d %T") -} - -suppressPackageStartupMessages({ - library("shiny", exclude = c("runExample")) - library("shinyjs", exclude = c("runExample")) -}) - - -if (logging) { - logMsgFile <- file("log.lg", open = "w+") - LogMsg <- function (...) { - message(.DateTime(), ": ", ...) - writeLines(.DateTime(), con = logMsgFile) - writeLines(paste0(" ", ...), con = logMsgFile) - } - Put <- function (..., file) { - dput(..., file = file) - writeLines(gsub("", "NULL", readLines(file)), - file) - } - PutTree <- function (...) { - Put(..., file = "tree.lg") - } - PutData <- function (...) { - Put(..., file = "dataset.lg") - } -} else { - PutData <- PutTree <- LogMsg <- function (...) {} -} - -WriteLoggedCode <- if (isTRUE(getOption("TreeSearch.write.code"))) { - if (requireNamespace("crayon", quietly = TRUE)) { - function(txt) { - for (line in txt) cat(if (substr(trimws(line), 0, 1) == "#") { - crayon::green(" ", line, "\n") - } else { - crayon::yellow(" ", line, "\n") - }) - } - } else { - function(txt) message(" ", txt) - } -} else { - function(txt) {} -} - -Notification <- function (...) { - if (!isTRUE(getOption("shiny.testmode"))) { - showNotification(...) - } -} - -Icon <- function(...) 
icon(..., class = "fas") - -aJiffy <- 42 # ms, default debounce period for input sliders etc -typingJiffy <- 2.5 * aJiffy # slightly slower if might be typing -aFewTrees <- 48L # Too many and rogues / tree space are slowed -NO_OUTGROUP <- "! TREESEARCH_no outgroup specified ." - -palettes <- list("#7a6c36", - c("#7a6c36", "#864885"), - c("#7a6c36", "#864885", "#427743"), - c("#7a6c36", "#864885", "#427743", "#4c5c86"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca", "#364020"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca", "#364020", "#c241a7"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca", "#364020", "#c241a7", "#391d42"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#85c6f9", "#fbd1a0", "#7696be", "#89996c", "#ddcdff", "#719d89", "#f5cde6", "#b6e0da", "#e8d4cd", "#b5ddfa"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#bbcb8f", "#bf82ab", "#85ddc4", "#eea0ba", "#c1d8ff", 
"#c3818b", "#c5c6ff", "#999388", "#e8cbff", "#ffb5b6", "#d2dad7"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#cc8f6f", "#499fae", "#d9dca6", "#7796b8", "#bee1ba", "#b4daff", "#919583", "#e2d3e9", "#47a19b", "#ebd4bc", "#7c9993", "#a9e3e0"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#a8e0fe", "#fad0a8", "#679e8d", "#ffc7b1", "#abe5c0", "#ac8d78", "#c5dddc", "#a48f84", "#cadfb0", "#899694", "#fdcdc1", "#d1dad5", "#dfd8c4"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#dcb983", "#77bff0", "#f0ab92", "#90ddff", "#f1d3a9", "#b5c2fe", "#c1e1b7", "#7596ba", "#bce1c4", "#a88c96", "#5a9daf", "#b18b80", "#d4d6f3", "#949577"), - c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#e03795", "#438f2e", "#5e2195", "#758029", "#4042b9", "#a37926", "#8364df", "#c3671f", "#444491", "#dc4c1f", "#367076", "#e2383c", "#4786b4", "#e13964", "#4c8c73", "#a53396", "#2c4422", "#b553cb", "#50381b", "#4f75d8", "#a12c1b", "#8576b8", "#bd6541", "#3a1959", "#83491f", "#2d2644", "#c45b94", "#451523", "#966883", "#782224", "#b96563", "#762254", "#95765c", "#ad355a") -) - -ErrorPlot <- function (...) 
{ - plot(0, 0, type = "n", axes = FALSE, ann = FALSE) - text(0, 0, paste0(..., collapse = "\n"), - col = "#dd6611", font = 2) -} - -badToGood <- rev(c("#1AB958", "#23B956", "#2BB954", "#31B952", "#37B850", "#3CB84E", "#41B84C", "#45B74A", "#49B749", "#4DB747", "#51B645", "#54B643", "#58B641", "#5BB53F", "#5FB53D", "#62B53C", "#65B43A", "#68B438", "#6BB336", "#6DB335", "#70B333", "#73B231", "#76B230", "#78B12E", "#7BB12C", "#7DB02B", "#80B029", "#82AF28", "#85AF26", "#87AE25", "#8AAE23", "#8CAD22", "#8EAD21", "#91AC1F", "#93AC1E", "#95AB1D", "#97AB1C", "#9AAA1B", "#9CAA1A", "#9EA919", "#A0A918", "#A2A818", "#A4A717", "#A6A716", "#A8A616", "#AAA616", "#ACA515", "#AEA415", "#B0A415", "#B2A315", "#B4A315", "#B6A216", "#B8A116", "#B9A117", "#BBA017", "#BD9F18", "#BF9F18", "#C19E19", "#C29D1A", "#C49D1B", "#C69C1C", "#C79B1D", "#C99A1E", "#CB9A1F", "#CC9920", "#CE9822", "#CF9823", "#D19724", "#D29625", "#D49626", "#D59528", "#D79429", "#D8932A", "#D9932C", "#DB922D", "#DC912E", "#DD9130", "#DF9031", "#E08F33", "#E18F34", "#E28E35", "#E38D37", "#E58C38", "#E68C3A", "#E78B3B", "#E88A3D", "#E98A3E", "#EA8940", "#EB8841", "#EC8843", "#ED8744", "#EE8746", "#EE8647", "#EF8549", "#F0854A", "#F1844C", "#F2844D", "#F2834F", "#F38350", "#F48252", "#F48253", "#F58155", "#F58157", "#F68058", "#F6805A", "#F77F5B", "#F77F5D", "#F87E5E")) - -Reference <- function (authors, year, title, journal = "", - volume = NULL, pages = NULL, doi = NULL, - publisher = NULL, editors = NULL) { - nAuth <- length(authors) - if (nAuth > 1L) { - authors <- paste(paste0(authors[-nAuth], collapse = ", "), "&", authors[nAuth]) - } - nEd <- length(editors) - if (nEd > 1L) { - editors <- paste(paste0(editors[-nEd], collapse = ", "), "&", editors[nEd]) - } else if (nEd < 1) { - editors <- "" - } - paste0("

", authors, " (", year, "). “", title, - "”. ", - if (editors != "") paste0("In: ", editors, " (eds). ") else "", - if (journal != "") paste0("", journal, " ") else "", - if (is.null(volume)) "" else paste0("", volume, ":"), - if (is.null(publisher)) "" else paste0(publisher, ". "), - if (is.null(pages)) "" else paste0(paste0(pages, collapse = "–"), ". "), - if (is.null(doi)) "" else paste0( - "doi:", - doi, ". "), - "

") -} - - -Arthur2007 <- Reference( - c("Arthur, D.", "Vassilvitskii, S"), - title = "k-means++: the advantages of careful seeding", - year = 2007, - journal = "Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms", - pages = c(1027, 1035) -) -Brazeau2019 <- Reference(c("Brazeau, M.D.", "Guillerme, T.", "Smith, M.R."), 2019, - title = "An algorithm for morphological phylogenetic analysis with inapplicable data", - journal = "Systematic Biology", - volume = 64, - pages = c(619, 631), - doi = "10.1093/sysbio/syy083") -Bien2011 <- Reference( - c("Bien, J.", "Tibshirani, R."), - title = "Hierarchical clustering with prototypes via minimax linkage", - year = 2011, - volume = 106, - doi = "10.1198/jasa.2011.tm10183", - pages = c(1075, 1084), - journal = "Journal of the American Statistical Association") -Gower1966 <- Reference(title = "Some distance properties of latent root and vector methods used in multivariate analysis", - authors = "Gower, J.C.", - year = 1966, - volume = 53, - pages = c(325, 338), - doi = "10.2307/2333639", - journal = "Biometrika") -Gower1969 <- Reference( - title = "Minimum spanning trees and single linkage cluster analysis", - authors = c("Gower, J.C.", "Ross, G.J.S."), - year = 1969, volume = 18, pages = c(54, 64), doi = "10.2307/2346439", - journal = "Journal of the Royal Statistical Society Series C (Applied Statistics)") -Hartigan1979 <- Reference( - title = "Algorithm AS 136: a K-means clustering algorithm", - authors = c("Hartigan, J.A.", "Wong, M.A."), - journal = "Journal of the Royal Statistical Society Series C (Applied Statistics)", - year = 1979, volume = 28, pages = c(100, 108), - doi = "10.2307/2346830") -Kaski2003 <- Reference( - title = "Trustworthiness and metrics in visualizing similarity of gene expression", - authors = c("Kaski, S.", "Nikkilä, J.", "Oja, M.", "Venna, J.", - "Törönen, P.", "Castrén, E."), - year = 2003, volume = 4, pages = 48, doi = "10.1186/1471-2105-4-48", - journal = "BMC 
Bioinformatics") -Klopfstein2019 <- Reference( - title = "Illustrating phylogenetic placement of fossils using RoguePlots: An example from ichneumonid parasitoid wasps (Hymenoptera, Ichneumonidae) and an extensive morphological matrix.", - authors = c("Klopfstein, S.", "Spasojevic, T."), year = 2019, - journal = "PLoS ONE", volume = 14, pages = "e0212942", - doi = "10.1371/journal.pone.0212942" -) -Maechler2019 <- Reference( - title = "cluster: cluster analysis basics and extensions", year = 2022, - authors = c("Maechler, M.", "Rousseeuw, P.", "Struyf, A.", "Hubert, M.", "Hornik, K."), - journal = "Comprehensive R Archive Network") -Morphy <- Reference( - c("Brazeau, M.D.", "Smith, M.R.", "Guillerme, T."), 2017, - "MorphyLib: a library for phylogenetic analysis of categorical trait data with inapplicability", - doi = "10.5281/zenodo.815371") -Murtagh1983 <- Reference( - title = "A survey of recent advances in hierarchical clustering algorithms", - authors = "Murtagh, F.", year = 1983, volume = 26, pages = c(354, 359), - doi = "10.1093/comjnl/26.4.354", journal = "The Computer Journal") -Nixon1999 <- Reference( - "Nixon, K.C.", 1999, - journal = "Cladistics", volume = 15, pages = c(407, 414), - title = "The Parsimony Ratchet, a new method for rapid parsimony analysis", - doi = "10.1111/j.1096-0031.1999.tb00277.x") -Pol2009 <- Reference( - title = "Unstable taxa in cladistic analysis: identification and the assessment of relevant characters", - authors = c("Pol, D.", "Escapa, I.H."), - journal = "Cladistics", 2009, 25, pages = c(515, 527), - doi = "10.1111/j.1096-0031.2009.00258.x") -RCoreTeam <- Reference( - authors = "R Core Team", year = 2020, - title = "R: A language and environment for statistical computing", - publisher = "R Foundation for Statistical Computing, Vienna, Austria") -Rousseeuw1987 <- Reference( - title = "Silhouettes: a graphical aid to the interpretation and validation of cluster analysis", - author = "Rousseeuw, P.J.", year = 1987, - journal = 
"Journal of Computational and Applied Mathematics", - volume = 20, pages = c(53, 65), doi = "10.1016/0377-0427(87)90125-7" -) -SmithDist <- Reference( - "Smith, M.R.", "2020a", "TreeDist: distances between phylogenetic trees", - doi = "10.5281/zenodo.3528123", "Comprehensive R Archive Network") -SmithQuartet <- Reference( - "Smith, M.R.", 2019, - "Quartet: comparison of phylogenetic trees using quartet and split measures", - "Comprehensive R Archive Network", doi = "10.5281/zenodo.2536318") -SmithSearch <- Reference( - "Smith, M.R.", 2023, "TreeSearch: morphological phylogenetic analysis in R", - "R Journal", volume = 14, pages = c(305, 315), - doi = "10.32614/RJ-2023-019") -Smith2020 <- Reference( - "Smith, M.R.", "2020b", - "Information theoretic Generalized Robinson-Foulds metrics for comparing phylogenetic trees", - "Bioinformatics", volume = 36, pages = c("5007", "5013"), - doi = "10.1093/bioinformatics/btaa614") -SmithSpace <- Reference( - "Smith, M.R.", "2022a", "Robust analysis of phylogenetic tree space", - "Systematic Biology", 71, pages = c("1255", "1270"), - doi = "10.1093/sysbio/syab100") -SmithRogue <- Reference( - "Smith, M.R.", "2022b", - "Using information theory to detect rogue taxa and improve consensus trees", - "Systematic Biology", 71, pages = c("1088", "1094"), - doi = "10.1093/sysbio/syab099") -Stockham2002 <- Reference( - authors = c("Stockham, C.", "Wang, L.-S.", "Warnow, T."), 2002, - "Statistically based postprocessing of phylogenetic analysis by clustering", - "Bioinformatics", 18, c("S285", "S293"), - doi = "10.1093/bioinformatics/18.suppl_1.S285") - -Venna2001 <- Reference( - title = "Neighborhood preservation in nonlinear projection methods: an experimental study", - authors = c("Venna, J.", "Kaski, S."), year = 2001, pages = c(485, 491), - journal = "Lecture Notes in Computer Science: Artificial Neural Networks—ICANN 2001", - editors = c("Dorffner, G.", "Bischof, H.", "Hornik, K."), - publisher = "Springer, Berlin", - doi = 
"10.1007/3-540-44668-0_68") - - - - - - - - - - - - - - -ui <- fluidPage( - theme = "app.css", - title = "TreeSearch", - - if (isTRUE(getOption("shiny.testmode"))) { - tags$head( - tags$style(HTML("#shiny-notification-panel {visibility: hidden;}") - ) - ) - }, - useShinyjs(), - column(3, - fluidRow( - tags$h1("TreeSearch", style = "margin-top: 0.4em;"), - selectInput( - "dataSource", - "Dataset", - c("< Load from file below >" = "file", - "Agnarsson 2004" = "Agnarsson2004", - "Sun et al. 2018" = "Sun2018", - "Wills et al. 2012" = "Wills2012", - if (logging) setNames(names(inapplicable.datasets), - names(inapplicable.datasets)) - ) - ), - fileInput("dataFile", - tags$span( - tags$i(class="fas fa-solid fa-table"), - tags$span("Load data from file") - ), - placeholder = "No data file selected"), - hidden(tags$span(id = "readxl.options", - selectInput("readxl.sheet", "Excel sheet to read:", "Sheet 1", "Sheet 1"), - tags$span("First character row & column:"), - numericInput("readxlSkip", - label = NULL, - min = 2L, value = 2L, step = 1L), - numericInput("readxlSkipCols", - label = NULL, - min = 2L, value = 2L, step = 1L), - htmlOutput("readxl.chars", style = "clear: both;"), - htmlOutput("readxl.taxa", style = "clear: both; margin-bottom: 1em;") - )), - tags$label("Search", class = "control-label", - style = "display: block; margin-top: -15px;"), - actionButton("searchConfig", "Configure", icon = Icon("gears")), - hidden(actionButton("go", "Search", icon = Icon("magnifying-glass"))), - downloadButton("saveZip", "Save log", icon = Icon("download")), - fileInput("treeFile", - label = tags$span( - tags$i(class="fas fa-solid fa-tree"), - tags$span("Load trees") - ), - placeholder = "No tree file selected"), - textOutput("results"), - hidden(tags$div(id = "manipulateTreeset", - numericInput("nTree", - label = HTML("Sample n trees from range:"), - min = 1L, value = 1L, step = 1L), - sliderInput("treeRange", label = "", min = 1L, max = 1L, - step = 1L, value = c(1, 1)), - 
tags$label("Save chosen trees:", class = "control-label"), - tags$div(style = "display: inline-block", - downloadButton("saveNwk", "Newick", icon = Icon("download")), - downloadButton("saveNex", "Nexus", icon = Icon("download")) - ) - )), - hidden( - tags$div(id = "displayConfig", - radioButtons("plotFormat", "Display:", - list("Characters on trees" = "ind", - "Consensus tree" = "cons", - "Cluster consensus trees" = "clus", - "Tree space" = "space"), - # "ind"), - "cons"), - hidden(sliderInput("whichTree", "Tree to plot", value = 0L, - min = 0L, max = 1L, step = 1L)), - hidden(tags$div(id = "treePlotConfig", - selectizeInput("outgroup", "Root on:", multiple = TRUE, - choices = list()), - selectizeInput( - "concordance", - "Split support:", - choices = list( - "None" = "none", - "% trees containing" = "p", - "Quartet concordance" = "qc", - "Clustering concordance" = "clc", - "Phylogenetic concordance" = "phc", - "Mutual Clustering conc." = "mcc", - "Shared Phylog. conc." = "spc" - )) - )), - hidden(tags$div(id = "mapConfig", - checkboxGroupInput("mapLines", "Connect:", - choices = list( - "Cluster convex hulls" = "hull", - "Minimum spanning tree" = "mst", - "Trees in sequence" = "seq" - ), selected = c("hull", "mst")) - )) - ) - ), - ), - ), - column(9, - fluidRow(id = "plotConfig", - tags$div(id = "plotSizer", - tags$span("Plot size:", id = "plotSizeSpan"), - sliderInput(inputId = "plotSize", - label = NULL, width = "200px", - min = 100, max = 2000, - post = "px", value = 600), - ), - tags$div(id = "saveAs", - tags$span("Save\ua0plot: "), - downloadButton("savePlotZip", "R script", icon = Icon("download")), - downloadButton("savePdf", "PDF", icon = Icon("download")), - downloadButton("savePng", "PNG", icon = Icon("download")) - ), - tags$div(id = "savePlottedTrees", - downloadButton("savePlotNwk", "Newick", icon = Icon("download")), - downloadButton("savePlotNex", "Nexus", icon = Icon("download")) - ) - ), - fluidRow( - plotOutput(outputId = "treePlot", height = 
"600px"), - hidden(plotOutput("clustCons", height = "200px")), - hidden(tags$div(id = "charChooser", - tags$div( - numericInput("plottedChar", "Character to map:", value = 1L, - min = 0L, max = 1L, step = 1L, width = 200), - selectizeInput("searchChar", "Search characters:", multiple = FALSE, - choices = list()), - checkboxGroupInput("mapDisplay", "", list( - "Align tips" = "tipsRight", - "Infer tips" = "updateTips" - )), - style = "float: right; width: 200px; margin-left: 2em;"), - htmlOutput("charMapLegend"), - htmlOutput("charNotes"), - )), - hidden(tags$div(id = "consConfig", - tags$div(style = "float: right; width: 200px; margin-left: 2em;", - sliderInput("consP", "Majority:", value = 1, - min = 0.5, max = 1, width = 200), - numericInput("keepNTips", "Tips to show:", value = 0L, - min = 3L, max = 2L, step = 1L, width = 200), - selectizeInput("neverDrop", "Never drop:", multiple = TRUE, - choices = c()) - ), - tags$div(id = "consLegend", - tags$span(id = "instabLegend", - tagList( - tags$span(class = "legendLeft", "Stable"), - tags$span(class = "infernoScale legendBar", "\ua0"), - tags$span(class = "legendRight", "Unstable"), - )), - htmlOutput("branchLegend", inline = TRUE)), - tags$div(id = "droppedTips", - selectInput("excludedTip", "Show excluded tip", choices = list())), - tags$div(id = "droppedList", style = "float: left;"), - )), - hidden(tags$div(id = "clusLegend", - htmlOutput("instabLegend2", inline = TRUE) - )), - hidden(tags$div(id = "clusConfig", - style = "float: right; width: 200px; margin-left: 2em;", - sliderInput("clThresh", "Cluster threshold:", value = 0.5, - min = 0, max = 1, width = 200), - selectInput("distMeth", "Distance method:", selected = "cid", - choices = list("Clustering Information" = "cid", - "Phylogenetic information" = "pid", - "Matching split info" = "msid", - "Robinson-Foulds (fast, iffy)" = "rf", - "Quartet (slower)" = "qd"), - width = 200) - )), - hidden(tags$div( - id = "spaceConfig", - tags$div(id = "spaceLegend", - 
style = "float: left;", - plotOutput(outputId = "pcQuality", - height = "72px", width = "240px"), - htmlOutput("stressLegend", inline = TRUE) - ), - tags$div( - style = "float: right; width: 200px; margin-left: 2em;", - sliderInput("spaceDim", "Dimensions:", value = 5, - min = 1, max = 12, step = 1, width = 200), - selectInput("spaceCol", "Colour trees by:", - list("Cluster membership" = "clust", - "Parsimony score" = "score", - "When first found" = "firstHit")), - selectInput("spacePch", "Plotting symbols:", - selected = "relat", - list("Cluster membership" = "clust", - "Relationships" = "relat", - "Tree name" = "name")), - selectizeInput("relators", "Show relationship between:", - choices = list(), multiple = TRUE), - ), - )), - htmlOutput("references", style = "clear: both;"), - ), - ) -) - - - - - - - - - - - - - - - - - - -Enquote <- function(x, ...) { - if (mode(x) == "character") { - paste0("\"", x, "\"") - } else { - signif(x, ...) - } -} - -EnC <- function(...) { - if (length(...) == 1) { - Enquote(...) - } else { - paste0("c(", paste(sapply(..., Enquote), collapse = ", "), ")") - } -} - -server <- function(input, output, session) { - - r <- reactiveValues( - dataFileVisible = TRUE, - ignoreTreeRange = TRUE, - ignoreNTree = TRUE, - nTree = 0L, - oldOutgroup = NO_OUTGROUP, - sortTrees = FALSE, # May be arranged nicely in input files - treeRange = c(1L, 1L), - updatingTrees = FALSE # TODO DELETE? 
- ) - - serverEnv <- environment() - logIndent <- 0 - loggingOn <- TRUE - - cmdLogFile <- tempfile("TreeSearch-", fileext = ".R") - Write <- function (txt, file) { - if (serverEnv$loggingOn) { - txt <- paste0(strrep(" ", logIndent), txt) - con <- file(file, open = "a") - on.exit(close(con)) - if (logging) { - WriteLoggedCode(txt) - } - writeLines(txt, con) - } - } - - WriteP <- function (txt, file = NULL) { - if (serverEnv$loggingOn) { - txt <- paste0(strrep(" ", logIndent), txt) - if (logging) { - WriteLoggedCode(txt) - } - r$plotLog <- c(r$plotLog, as.character(txt)) - } - } - - LogExpr <- function(exps, evaluate = TRUE, WriteFn = Write) { - for (exp in exps) { - WriteFn(as.character(exp), cmdLogFile) - if (evaluate) { - eval(exp) - } - } - } - - LogExprP <- function(...) { - LogExpr(..., WriteFn = WriteP) - } - - LogIndent <- function(n) { - serverEnv$logIndent <- serverEnv$logIndent + n - if (serverEnv$logIndent < 0) { - warning("Negative indent") - } - } - - systemInfo <- c( - paste( - "System:", Sys.info()["sysname"], Sys.info()["release"], - Sys.info()["version"], "-", - .Platform$OS.type, R.version$platform - ), - paste( - "-", R.version$version.string - ), - paste("- TreeSearch", packageVersion("TreeSearch")), - paste("- TreeTools", packageVersion("TreeTools")), - paste("- TreeDist", packageVersion("TreeDist")), - paste("- ape", packageVersion("ape")) - ) - - logCaveats <- c( - "Before running, check that the script and any data files are in the", - "R working directory, which can be read with getwd() and set with setwd().", - "", - "Please validate the code before reproducing in a manuscript, reporting", - "any errors at https://github.com/ms609/treesearch/issues or by e-mail to", - "the package maintainer." 
- ) - - BeginLog <- function() { - LogComment(c( - paste("# # TreeSearch session log:", .DateTime(), "# # #"), - "", - systemInfo, - "", - "This log was generated procedurally to facilitate the reproduction of", - "results obtained during an interactive Shiny session.", - "It is provided without guarantee of completeness or accuracy.", - "In particular, code will not be logged when previously computed values", - "are retrieved from cache.", - "", - logCaveats, - "", - "# # # # #" - )) - - LogComment("Load required libraries", 2) - LogCode(c( - "library(\"TreeTools\", quietly = TRUE)", - "library(\"TreeDist\")", - "library(\"TreeSearch\")" - )) - - LogComment("View recommended citations", 1) - LogCode(c( - "citation(\"TreeTools\")", - "citation(\"TreeDist\")", - "citation(\"TreeSearch\")", - "citation(\"Rogue\")" - )) - } - - BeginLogP <- function() { - r$plotLog <- NULL - LogCommentP(c( - paste("# # TreeSearch plot log:", .DateTime(), "# # #"), - "", - systemInfo, - "", - "This log was generated procedurally to facilitate the reproduction of", - "figures obtained during an interactive Shiny session.", - "It is provided without guarantee of completeness or accuracy.", - "In particular, code will not be logged when previously computed values", - "are retrieved from cache.", - "", - logCaveats, - "", - "# # # # #" - )) - LogCommentP("Load required libraries", 2) - LogCodeP(c( - "library(\"TreeTools\", quietly = TRUE)", - "library(\"TreeDist\")", - "library(\"TreeSearch\")" - )) - - LogCommentP("View recommended citations", 1) - LogCodeP(c( - "citation(\"TreeTools\")", - "citation(\"TreeDist\")", - "citation(\"Quartet\")", - "citation(\"TreeSearch\")", - "citation(\"Rogue\")" - )) - - LogCommentP("Check working directory", 1) - LogCodeP("getwd() # Should match location of data / tree files", - "setwd(\".\") # Replace . 
with desired/directory to change") - - if (HaveData()) { - LogCommentP("Load data from file") - LogCodeP(c( - paste0("dataFile <- ", Enquote(DataFileName(r$dataFiles))), - paste0("dataset <- ", r$readDataFile) - )) - } - - if (AnyTrees()) { - LogCommentP("Load trees from file") - LogCodeP(c( - paste0("treeFile <- ", Enquote(TreeFileName(r$treeFiles))), - "trees <- read.nexus(treeFile)", - if (!identical(r$trees, r$allTrees)) { - paste0( - "trees <- trees[unique(as.integer(seq.int(", - r$treeRange[1], ", ", r$treeRange[2], - ", length.out = ", r$nTree, ")))]" - ) - } - )) - } - } - - PauseLog <- function() { - serverEnv$loggingOn <- FALSE - } - - ResumeLog <- function() { - serverEnv$loggingOn <- TRUE - } - - LogCode <- function(..., WriteFn = Write) { - for (line in list(...)) { - if (!is.null(line)) { - WriteFn(as.character(line), cmdLogFile) - } - } - } - - LogCodeP <- function(...) { - LogCode(..., WriteFn = WriteP) - } - - LogComment <- function(exps, returns = 1, WriteFn = Write) { - if (returns > 0) { - WriteFn(rep("", returns), cmdLogFile) - } - for (exp in exps) { - WriteFn(paste("#", exp), cmdLogFile) - } - } - - LogCommentP <- function (exps, returns = 1) { - LogComment(exps, returns, WriteFn = WriteP) - } - - r$dataFiles <- 0 - r$excelFiles <- 0 - r$treeFiles <- 0 - TwoWide <- function(n) { - formatC(n, width = 2, flag = "0") - } - DataFileName <- function(n) if (length(n)) { - paste0("dataFile-", TwoWide(n), ".txt") - } - ExcelFileName <- function(n) if (length(n)) { - paste0("excelFile-", TwoWide(n), ".xlsx") - } - TreeFileName <- function(n) if (length(n)) { - paste0("treeFile-", TwoWide(n), ".txt") - } - LastFile <- function(type) { - switch(pmatch(type, c("data", "excel", "tree")), - DataFileName(r$dataFiles), - ExcelFileName(r$excelFiles), - TreeFileName(r$treeFiles) - ) - } - CacheInput <- function(type, fileName) { - key <- paste0(type, "Files") - r[[key]] <- r[[key]] + 1 - file.copy(fileName, paste0(tempdir(), "/", LastFile(type)), - overwrite = 
TRUE) - } - StashTrees <- function(trees) { - key <- paste0("treeFiles") - r[[key]] <- r[[key]] + 1 - write.nexus(trees, file = paste0(tempdir(), "/", LastFile("tree"))) - } - - if (!requireNamespace("TreeDist", quietly = TRUE)) { - install.packages("TreeDist") - } - - library("TreeTools", quietly = TRUE) - library("TreeDist") - library("TreeSearch") - - BeginLog() - - library("future") - library("promises") - plan(multisession) - - startOpt <- options("cli.progress_show_after" = 0.1) - - - LogMsg("Started server") - - - ############################################################################## - # Load data - ############################################################################## - - tipLabels <- reactive({r$trees[[1]][["tip.label"]]}) - - nChars <- reactive({ - if (HaveData()) { - as.integer(length(attr(r$dataset, "index"))) - } else { - 0L - } - }) - - TaxonOrder <- reactive({ - if (HaveData()) { - names(r$dataset) - } else { - tipLabels() - } - }) - - DatasetMatchesTrees <- reactive({ - length(intersect(names(r$dataset), tipLabels())) == length(r$dataset) - }) - - UpdateData <- reactive({ - source <- input$dataSource - if (source == "file") { - if (!r$dataFileVisible) { - showElement("dataFile") - r$dataFileVisible <- TRUE - runjs("console.log($('#dataFile-label'))") - runjs(paste0( - "$('#dataFile-label').parent()", - ".css({'outline': 'dashed #428bca 20px', ", - "'width': '100%'})", - ".animate({'outline-width': '0px'}, 'slow');")) - return() - } - - fileInput <- input$dataFile - r$dataset <- NULL - r$chars <- NULL - if (is.null(fileInput)) { - # How can this be? 
- Notification(type = "error", "No data file selected") - return("No data file selected.") - } - dataFile <- fileInput$datapath - if (is.null(dataFile)) { - Notification(type = "error", "No data file found.") - return ("No data file specified.") - } - - LogMsg("UpdateData(): from file") - r$sortTrees <- FALSE # Trees loaded from dataset may be in sequence - r$readDataFile <- NULL - - if (length(grep("\\.xlsx?$", dataFile))) { - if (!requireNamespace("readxl", quietly = TRUE)) { - install.packages("readxl") - } - showElement("readxl.options", anim = TRUE) - - r$dataset <- tryCatch({ - sheets <- readxl::excel_sheets(dataFile) - updateSelectInput(session, - inputId = "readxl.sheet", - choices = setNames(sheets, sheets), - selected = if (input$readxl.sheet %in% sheets) { - input$readxl.sheet - } else { - sheets[1] - }) - - tibble <- readxl::read_excel( - path = dataFile, - sheet = match(input$readxl.sheet, sheets, nomatch = 1L), - skip = max(0L, input$readxlSkip - 2L), - .name_repair = "minimal", - col_types = "text" - ) - - firstCol <- input$readxlSkipCols - 1L - chars <- colnames(tibble)[-seq_len(firstCol)] - taxNames <- gsub(" ", "_", trimws(unlist(tibble[, firstCol]))) - output$readxl.taxa <- renderUI(HTML(paste( - "Taxon names:", - paste(taxNames[1:3], collapse = ", "), - "...\n"))) - output$readxl.chars <- renderUI(HTML(paste( - "Character names:", - # not r$chars, which may be modified before output updated - paste(chars[1:3], collapse = ", "), - "..." 
- ))) - r$chars <- chars - - dat <- as.matrix(tibble[, -seq_len(firstCol)]) - rownames(dat) <- taxNames - dat <- MatrixToPhyDat(dat) - if (attr(dat, "nr") == 0) { - stop("No characters loaded; throw error") - } - - # Lines that could cause an error must come before log - - LogComment("Load data from spreadsheet", 2) - if (r$excelFiles == 0 || - tools::md5sum(dataFile) != - tools::md5sum(paste0(tempdir(), "/", LastFile("excel")))) { - CacheInput("excel", dataFile) - } - LogCode(c( - paste0("dataFile <- \"", LastFile("excel"), "\""), - "excelSheet <- readxl::read_excel(", - " path = dataFile,", - paste0(" sheet = ", match(input$readxl.sheet, sheets, 1L), ","), - paste0(" skip = ", max(0L, input$readxlSkip - 2L), ","), - " .name_repair = \"minimal\",", - " col_types = \"text\"", - ")", - paste0("dat <- as.matrix(excelSheet[, -seq_len(", firstCol, ")])"), - paste0("rownames(dat) <- unlist(excelSheet[, ", firstCol, "])"), - "dataset <- MatrixToPhyDat(dat)" - )) - - # Return: - dat - }, error = function(e) { - NULL - }) - } else { - hideElement("readxl.options") - } - - if (is.null(r$dataset)) suppressWarnings({ - r$dataset <- tryCatch({ - r$readDataFile <- "ReadTntAsPhyDat(dataFile)" - - # Return: - ReadTntAsPhyDat(dataFile) - }, error = function(e) tryCatch({ - r$chars <- tryCatch( - ReadCharacters(dataFile), - error = function(e) { - Notification(type = "error", "Error reading characters from file") - # Return: - NULL - }) - - r$charNotes <- tryCatch( - ReadNotes(dataFile), - error = function(e) { - Notification(type = "error", "Error reading character notes") - # Return: - NULL - }) - - r$readDataFile <- "ReadAsPhyDat(dataFile)" - - # Return: - ReadAsPhyDat(dataFile) - }, error = function(e) { - r$readDataFile <- NULL - # Return: - NULL - })) - - if (!is.null(r$dataset)) { - LogComment("Load data from file", 2) - CacheInput("data", dataFile) - LogCode(c( - paste0("dataFile <- \"", LastFile("data"), "\""), - paste0("dataset <- ", r$readDataFile) - )) - } - }) - } else 
{ - LogMsg("UpdateData(): from package") - - r$sortTrees <- TRUE # Nicer plots - - r$dataFileVisible <- FALSE - hideElement("dataFile") - - dataFile <- system.file(paste0("datasets/", source, ".nex"), - package = "TreeSearch") - CacheInput("data", dataFile) - r$chars <- ReadCharacters(dataFile) - r$charNotes <- ReadNotes(dataFile) - r$readDataFile <- "ReadAsPhyDat(dataFile)" - r$dataset <- ReadAsPhyDat(dataFile) - LogComment("Load dataset file from TreeSearch package") - LogCode(c( - paste0("dataFile <- system.file(\"datasets/", source, - ".nex\", package = \"TreeSearch\")"), - "dataset <- ReadAsPhyDat(dataFile)" - )) - } - if (is.null(r$dataset)) { - Notification(type = "error", "Could not read data from file") - - updateNumericInput(session, "plottedChar", min = 0L, - max = 0L, value = 0L) - updateSelectizeInput(session, "searchChar", choices = NULL) - return ("Could not read data from file") - } else { - Notification(type = "message", - paste("Loaded", nChars(), "characters and", - length(r$dataset), "taxa")) - - updateNumericInput(session, "plottedChar", min = 0L, - max = nChars(), value = 1L) - updateSelectizeInput(session, "searchChar", - choices = paste0(seq_len(nChars()), ": ", - colnames(r$chars)), - selected = "", - server = TRUE) - } - - tryCatch({ - dataFileTrees <- read.nexus(dataFile) - LogComment("Read trees from dataset file") - LogCode("newTrees <- read.nexus(dataFile)") - UpdateAllTrees(dataFileTrees) - CacheInput("tree", dataFile) - r$readTreeFile <- "read.nexus(treeFile)" - }, error = function (e) NULL) - if (!AnyTrees() || !DatasetMatchesTrees()) { - updateActionButton(session, "go", "New search") - } else { - show("displayConfig") - } - - DisplayTreeScores() - }) - - AnyTrees <- reactive({!is.null(r$trees) && length(r$trees) > 0}) - HaveData <- reactive({!is.null(r$dataset) && length(r$dataset) > 0 && inherits(r$dataset, "phyDat")}) - FetchNTree <- debounce(reactive({ - if (!is.null(r$oldNTree)) { - if (!identical(input$nTree, r$oldNTree)) { - 
r$oldNTree <- NULL - } - } else { - if (UpdateNTree(input$nTree)) { - UpdateActiveTrees() - } - } - }), typingJiffy) - - # Return TRUE if n has changed, FALSE if not - # Don't update active trees here: Leave this to the calling function - UpdateNTree <- function(n) { - if (n > length(r$allTrees)) { # nTree "max" can be beaten by typing - r$oldNTree <- n - n <- length(r$allTrees) - } - if (r$nTree == n) { - # Return: - FALSE - } else { - LogMsg("UpdateNTree(", r$nTree, " -> ", n, ")") - r$nTree <- n - # range <- r$treeRange[2] - r$treeRange[1] - # if (n > range + 1L) { - # nTrees <- length(r$allTrees) - # upper <- min(nTrees, r$treeRange[1] + n - 1L) - # lower <- min(r$treeRange[1], upper + 1L - n) - # r$treeRange <- c(lower, upper) - # updateSliderInput(session, "treeRange", value = r$treeRange) - # } - if (input$nTree != n) { - updateNumericInput(session, "nTree", value = n) - } - # Return: - TRUE - } - } - - FetchTreeRange <- debounce(reactive({ - if (!is.null(r$oldTreeRange)) { - if (!identical(input$treeRange, r$oldTreeRange)) { - r$oldTreeRange <- NULL - } - } else { - if (UpdateTreeRange(input$treeRange)) { - UpdateActiveTrees() - } - } - }), aJiffy) - - # Return TRUE if changed, FALSE if not - # Don't update active trees here: Leave this to the calling function - UpdateTreeRange <- function(range) { - if (identical(range, r$treeRange)) { - # Return: - FALSE - } else { - LogMsg("UpdateTreeRange([", paste(r$treeRange, collapse = ", "), - "] -> [", paste(range, collapse = ", "), "])") - r$treeRange <- range - span <- r$treeRange[2] - r$treeRange[1] - if (r$nTree > span + 1L) { - UpdateNTree(span + 1L) - } - - # Return: - TRUE - } - } - - - UpdateActiveTrees <- reactive({ - if (r$updatingTrees) { - LogMsg(" Skipping UpdateActiveTrees()") - return() - } - r$updatingTrees <- TRUE - on.exit(r$updatingTrees <- FALSE) - LogMsg("UpdateActiveTrees()") - - nTrees <- length(r$allTrees) - if (r$nTree == nTrees && - r$treeRange[1] == 1L && r$treeRange[2] == nTrees) { - 
thinnedTrees <- r$allTrees - if (!is.null(r$allTrees) && !identical(trees, thinnedTrees)) { - LogCode("trees <- allTrees") - } - } else { - thinnedTrees <- r$allTrees[ - unique(as.integer(seq.int( - r$treeRange[1], r$treeRange[2], length.out = r$nTree)))] - - if (!is.null(r$allTrees) && !identical(trees, thinnedTrees)) { - LogCode(paste0( - "trees <- allTrees[unique(as.integer(seq.int(", - r$treeRange[1], ", ", r$treeRange[2], ", length.out = ", r$nTree, ")))]" - )) - } - } - - r$trees <- thinnedTrees - r$treeHash <- rlang::hash(r$trees) - - DisplayTreeScores() - - if (AnyTrees()) { - for (elem in c("keepNTips", "neverDrop")) { - showElement(elem, anim = TRUE) - } - } else { - for (elem in c("keepNTips", "neverDrop")) { - hideElement(elem) - } - } - - updateSliderInput(session, "whichTree", min = 0L, - max = length(r[["trees"]]), value = 0L) - UpdateKeepNTipsRange() # Updates Rogues() - UpdateDroppedTaxaDisplay() - if (maxProjDim() > 0) { - updateSliderInput(inputId = "spaceDim", max = max(1L, maxProjDim()), - value = min(maxProjDim(), input$spaceDim)) - } - updateSelectizeInput(inputId = "neverDrop", choices = tipLabels(), - selected = input$neverDrop) - UpdateOutgroupInput() - updateSelectizeInput(inputId = "relators", choices = tipLabels(), - selected = input$relators) - }) - - UpdateAllTrees <- function (newTrees) { - LogMsg("UpdateAllTrees()") - on.exit({ - LogMsg("/UpdateAllTrees()") - }, add = TRUE) - - newTrees <- c(newTrees) - if (length(newTrees) > 1L) { - newTrees <- RenumberTips(newTrees, newTrees[[1]]$tip.label) - } - if (identical(newTrees, r$newTrees)) { - LogMsg(" ") - return() - } - - oldNTrees <- length(r$allTrees) - - if (!identical(r$allTrees, newTrees)) { - LogCode("allTrees <- newTrees") - r$allTrees <- newTrees - } - nTrees <- length(newTrees) - - if (nTrees != oldNTrees) { - if (!identical(input$treeRange, c(1L, nTrees))) { - r$oldTreeRange <- input$treeRange - } - UpdateTreeRange(c(1L, nTrees)) - # update*Input messages are collected and 
sent after all the observers - # (including outputs) have finished running. - updateSliderInput(session, "treeRange", - min = 1L, max = nTrees, - value = r$treeRange) - - r$oldNTree <- input$nTree - UpdateNTree(min(max(input$nTree, aFewTrees), nTrees)) - updateNumericInput(session, "nTree", max = nTrees, - value = r$nTree) - } - - UpdateActiveTrees() - if (AnyTrees()) { - showElement("manipulateTreeset") - } else { - hideElement("manipulateTreeset") - } - } - - ############################################################################## - # Event listeners - ############################################################################## - - observeEvent(input$dataSource, UpdateData(), ignoreInit = TRUE) - observeEvent(input$dataFile, UpdateData(), ignoreInit = TRUE) - observeEvent(input$readxl.sheet, UpdateData(), ignoreInit = TRUE) - observeEvent(input$readxlSkip, UpdateData(), ignoreInit = TRUE) - observeEvent(input$readxlSkipCols, UpdateData(), ignoreInit = TRUE) - - observeEvent(r$dataset, { - r$dataHash <- rlang::hash(r$dataset) - }) - observeEvent(input$plotSize, { - px <- paste0("'", input$plotSize, "px'") - runjs(paste0("$('#treePlot').css({height: ", px, ", width: ", px, "});")) - }) - - observeEvent(input$searchConfig, { - #updateSelectInput(session, "character.weight", - # selected = input$character.weight) - updateSelectInput(session, "implied.weights", - selected = input$implied.weights) - updateSliderInput(session, "concavity", value = input$concavity) - updateNumericInput(session, "epsilon", value = input$epsilon) - updateSliderInput(session, "ratchIter", value = input$ratchIter) - updateSliderInput(session, "tbrIter", value = input$tbrIter) - updateSliderInput(session, "maxHits", value = input$maxHits) - updateSliderInput(session, "startIter", value = input$startIter) - updateSliderInput(session, "finalIter", value = input$finalIter) - showModal(modalDialog( - easyClose = TRUE, - fluidPage(column(6, - tagList( - #selectInput("character.weight", 
"Character weighting", - # list("Equal" = "equal"), "equal"), - selectInput("implied.weights", "Step weighting", - list("Implied" = "on", "Profile" = "prof", - "Equal" = "off"), "on"), - sliderInput("concavity", "Step weight concavity constant", min = 0L, - max = 3L, pre = "10^", value = 1L), - numericInput("epsilon", "Keep if suboptimal by \u2264", min = 0, - value = 0), - sliderInput("ratchIter", "Ratchet iterations", min = 0L, - max = 50L, value = 6L, step = 1L), - sliderInput("timeout", "Maximum run duration", min = 1, - max = 600, value = 30, post = "min", step = 1), - )), column(6, - tagList( - sliderInput("maxHits", "Maximum hits", min = 0L, max = 5L, - value = 2L, pre = "10^"), - sliderInput("tbrIter", "TBR depth", min = 1L, max = 20L, - value = 1L, step = 1L), - sliderInput("startIter", "First iteration extra depth", min = 1L, - max = 10L, value = 3L, pre = "\ud7"), - sliderInput("finalIter", "Final iteration extra depth", min = 1L, - max = 10L, value = 1L, pre = "\ud7"), - selectizeInput("searchWithout", "Exclude taxa", DatasetTips(), - r$searchWithout, multiple = TRUE) - )) - ), - title = "Tree search settings", - footer = tagList(modalButton("Close", icon = Icon("rectangle-xmark")), - actionButton("modalGo", icon = Icon("magnifying-glass"), - if(length(r$trees)) { - "Continue search" - } else { - "Start search" - })) - )) - show("go") - }) - - observeEvent(input$treeFile, { - tmpFile <- input$treeFile$datapath - newTrees <- tryCatch({ - r$readTreeFile <- "read.tree(treeFile)" - LogMsg("Trying read.tree()") - read.tree(tmpFile) - }, - error = function (x) tryCatch({ - r$readTreeFile <- "read.nexus(treeFile)" - LogMsg("Trying read.nexus()") - read.nexus(tmpFile) - }, - error = function (err) tryCatch( - { - if (grepl("NA/NaN argument", err)) { - LogMsg("Terminating tree block") - # Unterminated tree block, perhaps because a search is ongoing - withEnd <- tempfile() - on.exit(unlink(withEnd)) - writeLines(c(readLines(tmpFile), "\nEND;"), withEnd) - 
read.nexus(withEnd) - } else { - stop("Next handler, please") - } - }, - error = function (x) tryCatch({ - r$readTreeFile <- "ReadTntTree(treeFile)" - ReadTntTree(tmpFile) - }, warning = function (x) tryCatch({ - Notification(as.character(x), type = "warning") - tryLabels <- TipLabels(r$dataset) - if (length(tryLabels) > 2) { - Notification("Inferring tip labels from dataset", - type = "warning") - r$readTreeFile <- - "ReadTntTree(treeFile, tipLabels = TipLabels(dataset))" - ReadTntTree(tmpFile, tipLabels = tryLabels) - } else { - NULL - } - }, error = NULL - ) - ) - ) - ) - ) - if (is.null(newTrees)) { - Notification("Trees not in a recognized format", type = "error") - } else { - LogComment("Load tree from file", 2) - CacheInput("tree", tmpFile) - LogCode(paste0("treeFile <- \"", LastFile("tree"), "\"")) - LogCode(paste0("newTrees <- ", r$readTreeFile)) - - UpdateAllTrees(newTrees) # updates r$trees - - removeModal() - Notification(paste("Loaded", length(r$trees), "trees"), type = "message") - updateActionButton(session, "modalGo", "Continue search") - updateActionButton(session, "go", "Continue") - show("displayConfig") - } - - }) - - observeEvent(input$implied.weights, { - switch(input$implied.weights, - "on" = show("concavity"), - hide("concavity") - ) - DisplayTreeScores() - }) - - weighting <- reactive( - if (length(input$implied.weights) > 0) { - input$implied.weights - } else { - "on" - } - ) - wtType <- reactive(switch(weighting(), - "on" = paste0("k = ", signif(concavity(), 3)), - "off" = "EW", - "prof" = "PP")) - - scores <- bindCache(reactive({ - if (!HaveData() || !AnyTrees()) { - return(NULL) - } - PutTree(r$trees) - PutData(r$dataset) - LogMsg("scores(): Recalculating scores with k = ", concavity()) - withProgress(tryCatch( - signif(TreeLength( - RootTree(r$trees, 1), - r$dataset, - concavity = concavity() - )), - error = function (x) { - if (HaveData() && AnyTrees()) { - cli::cli_alert(x[[2]]) - cli::cli_alert_danger(x[[1]]) - Notification(type = 
"error", - "Could not score all trees with dataset") - } - NULL - }), - value = 0.85, message = "Scoring trees") - }), r$treeHash, r$dataHash, concavity()) - - DisplayTreeScores <- function () { - LogMsg("DisplayTreeScores()") - treeScores <- scores() - score <- if (is.null(treeScores)) { - "; could not be scored from dataset" - } else if (length(unique(treeScores)) == 1) { - paste0(", each with score ", treeScores[1], " (", wtType(), ")") - } else { - paste0(" with scores ", min(treeScores), " to ", max(treeScores), - " (", wtType(), ")") - } - - msg <- paste0( - length(r$allTrees), " trees in memory: ", - length(r$trees), " sampled", - score - ) - output$results <- renderText(msg) - msg - } - - observeEvent(input$concavity, { - DisplayTreeScores() - }, ignoreInit = TRUE) - - TipsInTree <- reactive({ - if (AnyTrees()) { - length(r$trees[[1]]$tip.label) - } else { - 0L - } - }) - - UpdateKeepNTipsRange <- reactive({ - if (AnyTrees() && "consConfig" %in% r$visibleConfigs) { - nTip <- TipsInTree() - LogMsg("UpdateKeepNTipsRange(", input$keepNTips, " -> ", nTip, ")") - r$keepNTips <- nNonRogues() - if (r$keepNTips != input$keepNTips) { - r$oldkeepNTips <- input$keepNTips - } - updateNumericInput(inputId = "keepNTips", - label = paste0("Tips to show (/", nTip, "):"), - min = max(3L, length(input$neverDrop)), - max = nTip, - value = nNonRogues()) - } - }) - - UpdateExcludedTipsInput <- reactive({ - if (AnyTrees() && "consConfig" %in% r$visibleConfigs) { - LogMsg("UpdateExcludedTipsInput()") - dropList <- dropSeq()[seq_along(DroppedTips())] - updateSelectInput(inputId = "excludedTip", - choices = dropList, - selected = if(input$excludedTip %in% DroppedTips()) - input$excludedTip else dropSeq()[1]) - html("droppedList", - paste0("", - "
    ", - paste0("
  • ", - dropList, "
  • ", collapse = "\r\n"), - "
")) - } - }) - - UpdateDroppedTaxaDisplay <- reactive({ - LogMsg("UpdateDroppedTaxaDisplay()") - if ("consConfig" %in% r$visibleConfigs) { - if (length(DroppedTips())) { - UpdateExcludedTipsInput() - if ("droppedTips" %in% r$visibleConfigs) { - show("droppedTips") - } - if ("droppedList" %in% r$visibleConfigs) { - show("droppedList") - } - } else { - hide("droppedTips") - hide("droppedList") - } - } - }) - - observeEvent(r$visibleConfigs, { - UpdateDroppedTaxaDisplay() - }) - - UpdateOutgroupInput <- reactive({ - if (AnyTrees() && "treePlotConfig" %in% r$visibleConfigs) { - LogMsg("UpdateOutgroupInput()") - r$outgroup <- intersect(r$outgroup, KeptTips()) - if (length(r$outgroup) == 0) { - r$outgroup <- if (HaveData()) { - intersect(names(r$dataset), KeptTips())[1] - } else { - KeptTips()[1] - } - } - - if (!identical(sort(r$outgroup), sort(input$outgroup))) { - r$oldOutgroup <- if (is.null(input$outgroup)) { - NO_OUTGROUP - } else { - input$outgroup - } - } - - updateSelectizeInput( - inputId = "outgroup", - selected = r$outgroup, - choices = KeptTips() - ) - } - }) - - observeEvent(input$implied.weights, { - switch(input$implied.weights, - "on" = show("concavity"), - hide("concavity") - ) - }) - - ShowConfigs <- function (visible = character(0)) { - allConfigs <- c("whichTree", "charChooser", - "consConfig", "clusConfig", - "clusLegend", "branchLegend", - "spaceConfig", "treePlotConfig", - "mapConfig", "savePlottedTrees", - "droppedTips", "droppedList") - r$visibleConfigs <- visible - lapply(visible, show) - lapply(setdiff(allConfigs, visible), hide) - } - - observeEvent(input$plotFormat, { - ShowConfigs(switch(input$plotFormat, - "ind" = c("whichTree", "charChooser", - "treePlotConfig"), - "cons" = c("consConfig", "droppedTips", - "savePlottedTrees", - "treePlotConfig", "branchLegend"), - "clus" = c("clusConfig", "clusLegend", - "savePlottedTrees", - "consConfig", "droppedList", - "treePlotConfig"), - "space" = c("clusConfig", "clusLegend", - "spaceConfig", 
"mapConfig"), - "")) - }) - - - output$branchLegend <- renderUI({ - if (!AnyTrees()) { - return() - } - LogMsg("renderUI(branchLegend)") - on.exit(LogMsg("/renderUI(branchLegend)")) - kept <- KeptTips() - dropped <- DroppedTips() - - if (length(dropped) && - length(input$excludedTip) && - nchar(input$excludedTip) && - input$excludedTip %in% tipLabels()) { - consTrees <- lapply(r$trees, DropTip, setdiff(dropped, input$excludedTip)) - plotted <- TreeTools::RoguePlot( - trees = consTrees, - tip = input$excludedTip, - p = consP(), - plot = FALSE - ) - tagList( - tags$span(class = "legendLeft", "1 tree"), - tags$span(id = "blackToGreen", class = "legendBar", "\ua0"), - tags$span(class = "legendRight", - paste(max(c(plotted$onEdge, plotted$atNode)), "trees")), - ) - } - }) - - concavity <- reactive({ - kExp <- if (length(input$concavity)) input$concavity else 1 - switch(weighting(), - "on" = 10 ^ kExp, - "off" = Inf, - "prof" = "Profile") - }) - - tolerance <- reactive({ - if (input$epsilon == 0) { - sqrt(.Machine$double.eps) - } else { - input$epsilon - } - }) - - StartSearch <- function () { - if (!HaveData()) { - Notification("No data loaded", type = "error") - } else { - startTree <- if (!AnyTrees()) { - LogComment("Select starting tree") - LogCode(paste0("startTree <- AdditionTree(dataset, concavity = ", - Enquote(concavity()), ")")) - AdditionTree(r$dataset[SearchTips()], concavity = concavity()) - } else { - LogComment("Select starting tree") - treeLabels <- TipLabels(r$trees[[1]]) - if (all(SearchTips() %in% treeLabels)) { - if (length(setdiff(treeLabels, SearchTips())) > 0) { - if (length(r$searchWithout)) { - LogCode(paste0( - "searchTips <- setdiff(names(dataset), ", EnC(r$searchWithout), - ")"), - "startTree <- KeepTip(trees[[1]], searchTips)") - } else { - LogCode("startTree <- KeepTip(trees[[1]], names(dataset))") - } - KeepTip(r$trees[[1]], SearchTips()) - } else { - firstOptimal <- which.min(scores()) - LogCode(paste0("startTree <- trees[[", firstOptimal, 
"]]", - " # First tree with optimal score")) - r$trees[[firstOptimal]] - } - } else { - # Fuzzy-match labels - matching <- TreeDist::LAPJV(adist(treeLabels, SearchTips()))$matching - scaffold <- KeepTip(r$trees[[1]], !is.na(matching)) - scaffold[["tip.label"]] <- SearchTips()[matching[!is.na(matching)]] - AdditionTree(r$dataset, concavity = concavity(), - constraint = scaffold) - } - } - LogMsg("StartSearch()") - PutData(r$dataset[SearchTips()]) - PutTree(startTree) - LogComment("Search for optimal trees", 1) - LogCode(c( - "newTrees <- MaximizeParsimony(", - if (length(r$searchWithout)) { - paste0( - " dataset[setdiff(names(dataset), ", EnC(r$searchWithout), ")]" - ) - } else { - " dataset," - }, - " tree = startTree,", - paste0(" concavity = ", Enquote(concavity()), ","), - paste0(" ratchIter = ", input$ratchIter, ","), - paste0(" tbrIter = ", input$tbrIter, ","), - paste0(" maxHits = ", ceiling(10 ^ input$maxHits), ","), - paste0(" maxTime = ", input$timeout, ","), - paste0(" startIter = ", input$startIter, ","), - paste0(" finalIter = ", input$finalIter, ","), - if (input$epsilon > 0) paste0(" tolerance = ", tolerance(), ","), - " verbosity = 4", - ")")) - newTrees <- withProgress( - MaximizeParsimony(r$dataset[SearchTips()], - tree = startTree, - concavity = concavity(), - ratchIter = input$ratchIter, - tbrIter = input$tbrIter, - maxHits = ceiling(10 ^ input$maxHits), - maxTime = input$timeout, - startIter = input$startIter, - finalIter = input$finalIter, - tolerance = tolerance(), - verbosity = 4L), - value = 0.85, message = "Finding MPT", - detail = paste0(ceiling(10^input$maxHits), " hits; ", wtType()) - ) - r$sortTrees <- TRUE # No meaning in order; display nicely - LogComment("Overwrite any previous trees with results") - LogCode(c( - "if (inherits(newTrees, \"phylo\")) {", - " trees <- list(newTrees)", - " attr(trees, \"firstHit\") <- attr(newTrees, \"firstHit\")", - " attr(trees[[1]], \"firstHit\") <- NULL", - "}" - )) - UpdateAllTrees(newTrees) - if 
(inherits(newTrees, "phylo")) { - attr(r$trees, "firstHit") <- attr(newTrees, "firstHit") - attr(r$trees[[1]], "firstHit") <- NULL - } - - updateSliderInput(session, "whichTree", min = 0L, - max = length(r[["trees"]]), value = 0L) - - updateActionButton(session, "go", "Continue") - updateActionButton(session, "modalGo", "Continue search") - show("displayConfig") - } - } - - observeEvent(input$searchWithout, { - r$searchWithout <- input$searchWithout - }, ignoreInit = TRUE) - - observeEvent(input$go, StartSearch(), ignoreInit = TRUE) - observeEvent(input$modalGo, { - removeModal() - StartSearch() - }, ignoreInit = TRUE) - - UserRoot <- function(tree) { - outgroupTips <- intersect(r$outgroup, tree$tip.label) - if (length(outgroupTips)) { - # DELETE? tr <- deparse(substitute(tree)) - RootTree(tree, outgroupTips) - } else { - tree - } - } - - LogUserRoot <- function(tree = "cons", dropped = character(0)) { - outgroupTips <- setdiff(r$outgroup, dropped) - if (length(outgroupTips)) { - LogCommentP("Root tree") - LogCodeP(paste0(tree, " <- RootTree(", tree, ", ", EnC(outgroupTips), ")")) - } - } - - PlottedChar <- debounce(reactive({ - typed <- max(0L, as.integer(input$plottedChar), na.rm = TRUE) - if (nChars() > 0 && typed > nChars()) { - Notification(type = "warning", - paste("Dataset contains", nChars(), "characters.") - ) - updateNumericInput(session, "plottedChar", value = nChars()) - } - min(typed, nChars()) - }), aJiffy) - - observeEvent(PlottedChar(), { - if (PlottedChar() > 0) { - showElement("mapDisplay") - } else { - hideElement("mapDisplay") - } - }, ignoreInit = TRUE) - - observeEvent(input$searchChar, { - searchResult <- as.numeric(strsplit(input$searchChar, ": ")[[1]][1]) - if (!is.na(searchResult)) { - updateNumericInput(session, "plottedChar", value = searchResult) - } - }) - - whichTree <- debounce(reactive(input$whichTree), aJiffy) - - PlottedTree <- reactive({ - if (length(r$trees) > 0L) { - plottedTree <- if (whichTree() > 0) { - 
r$trees[[whichTree()]] - } else { - Consensus(r$trees, p = 1) - } - plottedTree <- UserRoot(plottedTree) - plottedTree <- SortEdges(plottedTree) - if (!("tipsRight" %in% input$mapDisplay)) { - plottedTree$edge.length <- rep_len(2, dim(plottedTree[["edge"]])[[1]]) - } - plottedTree - } - }) - LogPlottedTree <- function() { - if (whichTree() > 0) { - LogCodeP(paste0("plottedTree <- trees[[", whichTree(), "]]")) - } else { - LogCodeP("plottedTree <- Consensus(trees, p = 1)") - } - LogUserRoot("plottedTree") - if (!("tipsRight" %in% input$mapDisplay)) { - LogCommentP("Set uniform edge length", 0) - LogCodeP( - "plottedTree$edge.length <- rep.int(2, nrow(plottedTree$edge))" - ) - } - LogSortEdges("plottedTree") - } - - Instab <- reactive({ - TipInstability(r$trees) - }) - - dropSeq <- reactive({ - LogMsg("dropSeq()") - Rogues()$taxon[-1] - }) - - stableCol <- reactive({ - Rogue::ColByStability(r$trees) - }) - - Rogues <- bindCache(reactive({ - if (AnyTrees() && inherits(r$trees, "multiPhylo")) { - LogComment("Check for rogue taxa", 2) - LogComment(paste0( - "Use RogueTaxa() in place of QuickRogue() for a more complete ", - "analysis")) - LogCode(c( - "rogues <- Rogue::QuickRogue(", - " trees,", - if (length(input$neverDrop)) paste0( - " neverDrop = ", EnC(input$neverDrop), "," - ), - " fullSeq = TRUE,", - paste0(" p = ", Enquote(consP())), - ")", - "print(rogues) # Detailed results of rogue analysis", - "print(rogues$taxon[-1]) # Sequence of taxa to drop" - )) - withProgress( - message = "Identifying rogues", value = 0.99, - rogues <- Rogue::QuickRogue(r$trees, neverDrop = input$neverDrop, - fullSeq = TRUE, p = consP()) - ) - # TODO delete once Rogue 2.1.2 released -- return QuickRogue above. 
- rogues[!rogues$taxon %in% input$neverDrop, ] - } else { - data.frame(num = 0, taxNum = NA_integer_, taxon = NA_character_, - rawImprovement = NA_real_, IC = 0) - } - }), r$treeHash, input$neverDrop, consP()) - - unitEdge <- reactive({ - TRUE - }) - - SortEdges <- function (tr, force = FALSE) { - if (force || r$sortTrees) { - # Return: - SortTree(tr, order = TaxonOrder()) - } else { - # Return: - tr - } - } - LogSortEdges <- function(tr) ( - if (r$sortTrees) { - LogCommentP("Rotate nodes, to display clades in order of size", 0) - LogCodeP(paste0( - tr, " <- SortTree(", tr, ", order = ", - if (HaveData()) { - "names(dataset)" - } else { - "trees[[1]]$tip.label" - }, - ")" - )) - } - ) - - nNonRogues <- reactive({ - LogMsg("nNonRogues()") - on.exit(LogMsg("nNonRogues: ", nrow(Rogues()) - which.max(Rogues()$IC))) - nrow(Rogues()) - which.max(Rogues()$IC) - }) - - TipCols <- reactive(stableCol()) # TODO allow user to choose how to colour - - TipColLegend <- function() { - PlotTools::SpectrumLegend( - "bottomleft", horiz = TRUE, inset = 0.01, bty = "n", xpd = NA, - palette = hcl.colors(131, "inferno")[1:101], - legend = c("Stable", "Unstable"), - title = "Leaf stability", - title.font = 2 - ) - } - - consP <- debounce(reactive(signif(input$consP)), 50) - observeEvent(consP(), { - if (AnyTrees()) { - LogMsg("Observed consP()") - UpdateKeepNTipsRange() - UpdateDroppedTaxaDisplay() - r$concordance <- list() - } - }, ignoreInit = TRUE) - - concordance <- bindCache(reactive({ - LogMsg("concordance()") - # Return: - switch(input$concordance, - "p" = SplitFrequency(r$plottedTree, r$trees) / length(r$trees), - "qc" = QuartetConcordance(r$plottedTree, r$dataset), - "mcc" = MutualClusteringConcordance(r$plottedTree, r$dataset), - "spc" = SharedPhylogeneticConcordance(r$plottedTree, r$dataset), - "clc" = ClusteringConcordance(r$plottedTree, r$dataset), - "phc" = PhylogeneticConcordance(r$plottedTree, r$dataset), - NULL - ) - }), r$plottedTree, r$treeHash, r$dataHash, 
input$concordance) - - LabelConcordance <- \() { - LogMsg("LabelConcordance()") - if (input$concordance != "none" && - !is.null(r$plottedTree)) { - LabelSplits(r$plottedTree, signif(concordance(), 3), - col = SupportColor(concordance()), - frame = "none", pos = 3L) - } - } - - LogConcordance <- function(plottedTree = "plottedTree") { - if (input$concordance != "none") { - LogCommentP("Calculate split concordance", 1) - concCode <- switch( - input$concordance, - "p" = paste0("SplitFrequency(", plottedTree, - ", trees) / length(trees)"), - "qc" = paste0("QuartetConcordance(", plottedTree, ", dataset)"), - "clc" = paste0("ClusteringConcordance(", plottedTree, ", dataset)"), - "phc" = paste0("PhylogeneticConcordance(", plottedTree, ", dataset)"), - "mcc" = paste0("MutualClusteringConcordance(", plottedTree, - ", dataset)"), - "spc" = paste0("SharedPhylogeneticConcordance(", plottedTree, - ", dataset)"), - NULL - ) - LogCodeP(paste0("concordance <- ", concCode)) - LogCommentP("Annotate splits by concordance", 1) - LogCodeP("LabelSplits(", - paste0(" tree = ", plottedTree, ","), - " labels = signif(concordance, 3),", - " col = SupportColor(concordance),", - " frame = \"none\",", - " pos = 3", - ")") - } - } - - observeEvent(input$keepNTips, { - if (!is.null(r$oldkeepNTips)) { - if (!identical(input$keepNTips, r$oldkeepNTips)) { - r$oldkeepNTips <- NULL - } - } else { - LogMsg("Observed input$keepNTips -> ", EnC(input$keepNTips)) - r$keepNTips <- max(length(input$neverDrop), 3L, - min(input$keepNTips, TipsInTree())) - UpdateOutgroupInput() - UpdateDroppedTaxaDisplay() - } - }, ignoreInit = TRUE) - - observeEvent(input$neverDrop, { - LogMsg("Observed input$neverDrop -> ", EnC(input$neverDrop)) - UpdateKeepNTipsRange() - UpdateOutgroupInput() - UpdateDroppedTaxaDisplay() - }, ignoreInit = TRUE) - - observeEvent(input$outgroup, { - if (!is.null(r$oldOutgroup)) { - if (!identical(input$outgroup, r$oldOutgroup)) { - r$oldOutgroup <- NULL - } - } else { - LogMsg("Observed 
input$outgroup -> ", EnC(input$outgroup)) - r$outgroup <- input$outgroup - } - }, ignoreInit = TRUE) - - DatasetTips <- reactive(names(r$dataset)) - SearchTips <- reactive(setdiff(DatasetTips(), r$searchWithout)) - - KeptTips <- reactive({ - LogMsg("KeptTips()") - n <- r$keepNTips - maxN <- length(tipLabels()) - if (is.na(n) || is.null(n)) { - n <- maxN - } - if (n < 3L) { - n <- 3L - } - nNeverDrop <- length(input$neverDrop) - if (n < nNeverDrop) { - n <- nNeverDrop - } - nFromDropSeq <- n - nNeverDrop - - # Return: - if (nFromDropSeq > length(dropSeq())) { - c(input$neverDrop, dropSeq()) - } else { - c(input$neverDrop, rev(dropSeq())[seq_len(nFromDropSeq)]) - } - }) - - DroppedTips <- reactive({ - LogMsg("DroppedTips()") - if (length(KeptTips()) > 1) { - setdiff(tipLabels(), KeptTips()) - } else { - character(0) - } - }) - - ConsensusPlot <- function() { - LogMsg("ConsensusPlot()") - on.exit(LogMsg("/ConsensusPlot()")) - - par(mar = rep(0, 4), cex = 0.9) - kept <- KeptTips() - dropped <- DroppedTips() - - if (length(dropped) && - length(input$excludedTip) && - nchar(input$excludedTip) && - input$excludedTip %in% tipLabels()) { - - if (length(setdiff(dropped, input$excludedTip))) { - consTrees <- lapply(r$trees, DropTip, - setdiff(dropped, input$excludedTip)) - } else { - consTrees <- r$trees - } - - plotted <- TreeTools::RoguePlot( - consTrees, - input$excludedTip, - p = consP(), - edgeLength = 1, - outgroupTips = r$outgroup, - tip.color = TipCols()[intersect(consTrees[[1]]$tip.label, kept)] - ) - r$plottedTree <- plotted$cons - - LabelConcordance() - } else { - without <- intersect(dropped, tipLabels()) # `dropped` might be outdated - if (length(without)) { - } else { - } - cons <- ConsensusWithout(r$trees, without, p = consP()) - cons <- UserRoot(cons) - - if (unitEdge()) { - cons$edge.length <- rep.int(1, dim(cons$edge)[1]) - } - cons <- SortEdges(cons) - - r$plottedTree <- cons - plot(r$plottedTree, tip.color = TipCols()[intersect(cons$tip.label, kept)]) - 
LabelConcordance() - } - } - - LogConsensusPlot <- function() { - BeginLogP() - LogPar() - dropped <- DroppedTips() - - if (length(dropped) && - length(input$excludedTip) && - nchar(input$excludedTip) && - input$excludedTip %in% tipLabels()) { - - LogCommentP("Prepare reduced consensus tree", 1) - if (length(setdiff(dropped, input$excludedTip))) { - LogCodeP(paste0("exclude <- ", - EnC(setdiff(dropped, input$excludedTip)))) - LogCodeP("consTrees <- lapply(trees, DropTip, exclude)") - LogCodeP("labels <- setdiff(consTrees[[1]]$tip.label, exclude)") - } else { - LogCodeP("consTrees <- trees", - "labels <- consTrees[[1]]$tip.label") - } - - LogCommentP(paste0( - "Colour tip labels according to their original 'instability' ", - "(Smith 2022)") - ) - LogCodeP( - "tipCols <- Rogue::ColByStability(trees)", - paste0( - "tipCols <- tipCols[setdiff(labels, ", - Enquote(input$excludedTip), ")]" - ) - ) - LogCommentP(paste0( - "Plot the reduced consensus tree, showing position of ", - gsub("_", " ", input$excludedTip, fixed = TRUE)) - ) - LogCodeP("plotted <- RoguePlot(", - " trees = consTrees,", - paste0(" tip = ", Enquote(input$excludedTip), ","), - paste0(" p = ", consP(), ","), - " edgeLength = 1,", - if(length(r$outgroup)) { - paste0(" outgroupTips = ", EnC(r$outgroup), ",") - }, - " tip.color = tipCols", - ")") - - LogCommentP("Store tree to plot concordance") - LogCodeP("plottedTree <- plotted$cons") - - LogConcordance() - } else { - without <- intersect(dropped, tipLabels()) # `dropped` might be outdated - LogCommentP("Calculate consensus tree") - if (length(without)) { - LogCodeP( - "cons <- ConsensusWithout(", - " trees,", - paste0(" ", EnC(without), ","), - paste0(" p = ", consP()), - ")") - } else { - LogCodeP(paste0( - "cons <- Consensus(trees, p = ", consP(), ")" - )) - } - LogUserRoot(dropped = without) - if (unitEdge()) { - LogCodeP("cons$edge.length <- rep.int(1L, nrow(cons$edge))") - } - LogSortEdges("cons") - LogCommentP("Plot consensus tree") - LogCodeP( - 
"tipCols <- Rogue::ColByStability(trees)[cons$tip.label]", - "plot(cons, tip.color = tipCols)") - LogConcordance("cons") - } - } - - PolEscVal <- reactive({ - LengthAdded(r$trees, - r$dataset[tipLabels(), PlottedChar()], - concavity()) - }) - - CharacterwisePlot <- function() { - par(mar = rep(0, 4), cex = 0.9) - n <- PlottedChar() - if (whichTree() > 0) { - LogMsg("Plotting PlottedTree(", whichTree(), ", ", n, ")") - } - r$plottedTree <- PlottedTree() - if (length(n) && n > 0L) { - pc <- tryCatch({ - extraLen <- PolEscVal() - roguishness <- if (max(extraLen) == 0) { - "black" - } else { - hcl.colors(256, "inferno")[ - (192 * extraLen[r$plottedTree$tip.label] / max(extraLen)) + 1 - ] - } - PlotCharacter( - if (whichTree() > 0) r$plottedTree else lapply(r$trees, UserRoot), - r$dataset, - n, - edge.width = 2.5, - updateTips = "updateTips" %in% input$mapDisplay, - tip.color = roguishness, - Display = function(tr) { - tr <- UserRoot(tr) - if (unitEdge()) { - tr$edge.length <- rep.int(1, dim(tr$edge)[[1]]) - } - SortEdges(tr) - } - ) - if (max(extraLen) > 0) { - PlotTools::SpectrumLegend( - "bottomleft", bty = "n", - palette = hcl.colors(256, "inferno")[1:193], - title = "Mean tree score\nimpact", - title.font = 2, - y.intersp = 1.42, - legend = c(signif(4:1 * max(extraLen) / 4, 3), "No impact") - ) - } - }, - error = function (cond) { - cli::cli_alert_danger(cond) - Notification(type = "error", - "Could not match dataset to taxa in trees") - ErrorPlot("Load dataset with\n", "character codings\n", - "for taxa on tree") - return() - } - ) - - LabelConcordance() - } else { - plot(r$plottedTree, tip.color = TipCols()[r$plottedTree$tip.label]) - TipColLegend() - } - } - - LogPar <- function() { - LogCommentP("Set up plotting area") - LogCodeP(c( - "par(", - " mar = c(0, 0, 0, 0), # Zero margins", - " cex = 0.9 # Smaller font size", - ")" - )) - } - - LogCharacterwisePlot <- function() { - BeginLogP() - LogPar() - n <- PlottedChar() - if (whichTree() > 0) { - 
LogComment(paste("Select tree", whichTree(), "from tree set")) - } - LogPlottedTree() - if (length(n) && n > 0L) { - if (whichTree() > 0) { - LogCommentP(paste("Map character", n, "onto tree", whichTree())) - } else { - LogCommentP(paste("Map character", n, "onto consensus tree")) - } - LogCodeP( - "PlotCharacter(", - if (whichTree() > 0) " tree = plottedTree," else - paste0(" tree = RootTree(trees, ", EnC(r$outgroup), "),"), - " dataset = dataset,", - paste0(" char = ", n, ","), - paste0(" updateTips = ", "updateTips" %in% input$mapDisplay, ","), - " Display = function(tr) {", - paste0(" tr <- RootTree(tr, ", EnC(r$outgroup), ")"), - " tr$edge.length <- rep.int(2, nrow(tr$edge))", - " SortTree(tr)", - " },", - " edge.width = 2.5", - ")" - ) - LogConcordance() - } else { - LogCommentP("Plot single tree") - LogCodeP( - "tipCols <- Rogue::ColByStability(trees)[plottedTree$tip.label]", - "plot(plottedTree, tip.color = tipCols)" - ) - } - } - - MainPlot <- function() { - if (AnyTrees()) { - LogMsg("MainPlot()") - switch( - input$plotFormat, - "cons" = { - ConsensusPlot() - }, - "clus" = { - PlotClusterCons() - }, - "ind" = { - CharacterwisePlot() - }, - "space" = { - TreespacePlot() - } - ) # end switch - } - } - ReactiveMainPlot <- reactive({MainPlot()}) - - output$treePlot <- renderCachedPlot( - ReactiveMainPlot(), - cacheKeyExpr = { # Must be identical to RCode below - switch( - input$plotFormat, - - "clus" = list(r$treeHash, input$plotFormat, - r$keepNTips, input$excludedTip, - consP(), - input$neverDrop, r$outgroup, - input$distMeth, - input$concordance, - silThreshold(), - input$consP, input$concordance), - "cons" = list(r$treeHash, input$plotFormat, - r$keepNTips, input$excludedTip, - consP(), - input$neverDrop, r$outgroup, - input$concordance), - "ind" = list(PlottedChar(), - whichTree(), - input$concordance, - r$outgroup, - concavity(), - input$mapDisplay, - r$dataHash, r$treeHash), - "space" = list(r$treeHash, input$plotFormat, - min(dims(), nProjDim()), - 
TreeCols(), - treePch(), - input$distMeth, - input$spaceCol, - input$mapLines, - concavity(), - input$spacePch, - if (input$spacePch == "relat") input$relators, - silThreshold(), - input$display) - ) - }, - sizePolicy = function(x) rep(input$plotSize, 2) - ) - - RCode <- bindCache(reactive({ - switch( - input$plotFormat, - "cons" = { - LogConsensusPlot() - }, - "clus" = { - LogPlotClusterCons() - }, - "ind" = { - LogCharacterwisePlot() - }, - "space" = { - LogTreespacePlot() - } - ) - - # Return: - r$plotLog - }), # Must be identical to output$treePlot above - switch( - input$plotFormat, - - "clus" = list(r$treeHash, input$plotFormat, - r$keepNTips, input$excludedTip, - consP(), - input$neverDrop, r$outgroup, - input$distMeth, - input$concordance, - silThreshold(), - input$consP, input$concordance), - "cons" = list(r$treeHash, input$plotFormat, - r$keepNTips, input$excludedTip, - consP(), - input$neverDrop, r$outgroup, - input$concordance), - "ind" = list(PlottedChar(), - whichTree(), - input$concordance, - r$outgroup, - input$mapDisplay, - r$dataHash, r$treeHash), - "space" = list(r$treeHash, input$plotFormat, - min(dims(), nProjDim()), - TreeCols(), - treePch(), - input$distMeth, - input$spaceCol, - input$mapLines, - concavity(), - input$spacePch, - if (input$spacePch == "relat") input$relators, - silThreshold(), - input$display) - ) - ) - - UCFirst <- function (str) { - paste0(toupper(substr(str, 1, 1)), - substr(str, 2, nchar(str))) - } - - nonAmbigContrast <- reactive({ - cont <- attr(r$dataset, "contrast") - applic <- cont[, setdiff(colnames(cont), "-")] - cont[rowSums(applic) == dim(applic)[[2]], ] <- 0 - - # Return: - cont - }) - - plottedTokens <- reactive({ - n <- PlottedChar() - # `phyDat[,]` returns a new phyDat object with a different "contrast" - # Hence we manually extract the compressed character tokens: - phyColumn <- vapply(r$dataset, `[[`, integer(1), - attr(r$dataset, "index")[[n]], USE.NAMES = FALSE) - tokens <- 
colSums(nonAmbigContrast()[phyColumn, ]) > 0L - names(tokens[tokens]) - }) - - output$charMapLegend <- bindCache( - renderUI({ - n <- PlottedChar() - if (length(n) && n > 0L && !is.null(r$chars)) { - - pal <- c("#00bfc6", "#ffd46f", "#ffbcc5", "#c8a500", - "#ffcaf5", "#d5fb8d", "#e082b4", "#25ffd3", - "#a6aaff", "#e6f3cc", "#67c4ff", "#9ba75c", - "#60b17f") - - states <- attr(r$chars, "state.labels")[[n]] - tokens <- plottedTokens() - appTokens <- setdiff(tokens, "-") - datApp <- setdiff(attr(r$dataset, "levels"), "-") - .State <- function (glyph, text = "Error?", col = "red") { - if (is.numeric(glyph)) { - if (glyph > length(appTokens)) { - return(NULL) - } - level <- match(appTokens[[glyph]], datApp) - text <- states[[level]] - col <- pal[[level]] - glyph <- appTokens[[glyph]] - } - - tags$li(style = "margin-bottom: 2px;", - tags$span(glyph, - style = paste("display: inline-block;", - "border: 1px solid;", - "width: 1em;", - "text-align: center;", - "line-height: 1em;", - "margin-right: 0.5em;", - "background-color:", col, ";") - ), - tags$span(UCFirst(text))) - } - - tagList( - tags$h3(colnames(r$chars)[n]), - tags$ul(style = "list-style: none;", - .State(1), .State(2), .State(3), .State(4), .State(5), - .State(6), .State(7), .State(8), .State(9), - .State(10), .State(11), .State(12), .State(13), - if ("-" %in% tokens) - .State("-", "Inapplicable", "lightgrey"), - .State("?", "Ambiguous", "grey") - ) - ) - } - }), - PlottedChar(), - r$chars, - r$dataset - ) - - - output$charNotes <- bindCache( - renderUI({ - n <- PlottedChar() - if (length(n) && n > 0L - && is.list(r$charNotes) && is.list(r$charNotes[[1]]) - && length(r$charNotes) >= n) { - - charNotes <- r$charNotes[[n]] - description <- charNotes[[1]] - notes <- charNotes[[2]] - states <- attr(r$chars, "state.labels")[[n]] - tokens <- plottedTokens() - - tagList( - if (length(description) > 0) { - tags$div(id = "char-description", - lapply(strsplit(description, "\n")[[1]], tags$p)) - }, - if (!is.null(notes)) 
tags$ul(class = "state-notes", { - PrintNote <- function(note) { - taxa <- names(note)[note] - tags$li(class = "state-note", - tags$span(class = "state-note-label", - paste(gsub("_", " ", fixed = TRUE, - taxa), collapse = ", ")), - tags$span(class = "state-note-detail", - notes[taxa[1]])) - } - - DuplicateOf <- function(x) { - duplicates <- duplicated(x) - masters <- x[!duplicates] - vapply(masters, function(d) x == d, logical(length(x))) - } - if (length(notes) == 1) { - onlyOne <- TRUE - names(onlyOne) <- names(notes) - PrintNote(onlyOne) - } else { - notes <- notes[order(names(notes))] - duplicates <- DuplicateOf(toupper(notes)) - apply(duplicates, 2, PrintNote) - } - }), - if (!states[[1]] %in% c("", "''") - && any(tokens == "-")) { - tags$p(tags$em("Brazeau et al. (2019) advise that neomorphic (0/1) characters should not contain inapplicable tokens (-).")) - } - ) - } - }), - PlottedChar(), - r$dataset, - r$chars, - r$charNotes - ) - - LogScore <- function (x) { - (-(log10(1 - pmin(1, x) + 1e-2))) / 2 - } - - QualityPlot <- function (quality) { - par(mar = c(2, 0, 0, 0)) - nStop <- length(badToGood) + 1L - - # LogMsg("QualityPlot()") - plot(NULL, xlim = c(0, 1), ylim = c(-1.5, 2.5), - ann = FALSE, axes = FALSE) - x <- seq.int(from = 0, to = 1, length.out = nStop) - segments(x[-nStop], numeric(nStop), x[-1], lwd = 5, col = badToGood) - - trust <- quality[["Trustworthiness"]] - cont <- quality[["Continuity"]] - txc <- quality[["sqrtTxC"]] - - if (trust > 1) { - LogMsg("Preternaturally high Trustworthiness: ", trust) - } - if (cont > 1) { - LogMsg("Preternaturally high Continuity: ", cont) - } - LogMsg(trust * nStop) - segments(LogScore(txc), -1, y1 = 1, lty = 3) - text(LogScore(trust), 1, "T", col = badToGood[LogScore(trust) * nStop]) - text(LogScore(cont), -1, "C", col = badToGood[LogScore(cont) * nStop]) - - tickPos <- c(0, 0.5, 0.7, 0.8, 0.9, 0.95, 1.0) - ticks <- LogScore(tickPos) - - axis(1, at = ticks, labels = NA, line = 0) - axis(1, tick = FALSE, at = 
ticks, labels = tickPos, line = 0) - axis(1, line = -1, tick = FALSE, - at = ticks[-1] - ((ticks[-1] - ticks[-length(ticks)]) / 2), - labels = c("", "dire", "", "ok", "gd", "excellent")) - axis(3, at = 0.5, tick = FALSE, line = -2, - paste0(dims(), "D mapping quality (trustw. / contin.):")) - } - - output$pcQuality <- renderCachedPlot({ - if (length(r$trees) < 3) { - return() - } - dstnc <- distances() - mppng <- mapping() - mppng <- mapping()[, seq_len(min(dim(mppng)[2], dims()))] - neighbs <- min(10L, length(r$trees) / 2) - future_promise( - TreeDist::MappingQuality(dstnc, dist(mppng), neighbs), - seed = NULL) %...>% QualityPlot - }, cacheKeyExpr = { - list(r$treeHash, input$distMeth, dims()) - }, - sizePolicy = function (dims) dims - ) - - - output$howManyDims <- renderPlot({ - par(mar = c(2.5, 2.5, 0, 0), xpd = NA, mgp = c(1.5, 0.5, 0)) - txc <- projQual()["TxC", ] - nStop <- length(badToGood) - - plot(txc, type = "n", ylim = c(min(txc, 0.5), 1), - frame.plot = FALSE, axes = FALSE, - xlab = "Dimension", ylab = "Trustw. 
\uD7 Contin.") - par(xpd = FALSE) - axis(1, 1:14) - axis(2) - tickPos <- c(0, 0.5, 0.7, 0.8, 0.9, 0.95, 1.0) - mids <- c(0.6, 0.75, 0.85, 0.925) - text(rep.int(15, 4), mids, pos = 2, cex = 0.8, - col = badToGood[nStop * LogScore(mids)], - c("Essentially random", "Dangerous", "Usable", "Good")) - text(1, 0.975, pos = 4, "Excellent", cex = 0.8, - col = badToGood[LogScore(0.975) * nStop]) - for (i in tickPos[-1]) { - abline(h = i, lty = 3, col = badToGood[LogScore(i) * nStop]) - } - points(txc, type = "b") - txcNow <- txc[dims()] - - points(dims(), txcNow, pch = 16, col = badToGood[LogScore(txcNow) * nStop], - cex = 1.6) - }) - - observeEvent(input$clThresh, { - classes <- c("meaningless", "weak", "good", "strong") - liveClass <- classes[as.integer(cut(input$clThresh, c(0, 0.25, 0.5, 0.7, 1), - include.lowest = TRUE, right = FALSE))] - addClass("clThresh-label", liveClass) - removeClass("clThresh-label", setdiff(classes, liveClass)) - }) - silThreshold <- debounce(reactive({ - input$clThresh - }), 50) - - ############################################################################## - # Clusterings - ############################################################################## - clusterings <- bindCache(reactive({ - ## CAUTION: Update LogClusterings() to reflect any changes made - ## to this function - LogMsg("clusterings()") - maxCluster <- min(15L, length(r$trees) - 1L) - if (maxCluster > 1L) { - possibleClusters <- 2:maxCluster - - hSil <- pamSil <- -99 - dists <- distances() - - nMethodsChecked <- 3L - cli::cli_progress_bar("Computing clusterings", "K-means", - total = nMethodsChecked) - - nK <- length(possibleClusters) - - kClusters <- lapply(possibleClusters, - function (k) TreeDist::KMeansPP(dists, k)) - kSils <- vapply(kClusters, function (kCluster) { - mean(cluster::silhouette(kCluster$cluster, dists)[, 3]) - }, double(1)) - bestK <- which.max(kSils) - kSil <- kSils[bestK] - kCluster <- kClusters[[bestK]]$cluster - - cli::cli_progress_update(1, status = 
"PAM") - pamClusters <- lapply(possibleClusters, function (k) { - cluster::pam(dists, k = k) - }) - pamSils <- vapply(pamClusters, function (pamCluster) { - mean(cluster::silhouette(pamCluster)[, 3]) - }, double(1)) - bestPam <- which.max(pamSils) - pamSil <- pamSils[bestPam] - pamCluster <- pamClusters[[bestPam]]$cluster - - cli::cli_progress_update(1, status = "Hierarchical") - hTree <- protoclust::protoclust(dists) - hClusters <- lapply(possibleClusters, function (k) cutree(hTree, k = k)) - hSils <- vapply(hClusters, function (hCluster) { - mean(cluster::silhouette(hCluster, dists)[, 3]) - }, double(1)) - bestH <- which.max(hSils) - hSil <- hSils[bestH] - hCluster <- hClusters[[bestH]] - cli::cli_progress_update(1, status = "Done") - - bestCluster <- c("none", "pam", "hmm", "kmn")[ - which.max(c(silThreshold(), pamSil, hSil, kSil))] - } else { - bestCluster <- "none" - } - - LogMsg("Best clustering: ", bestCluster, - "; sil: ", signif(switch(bestCluster, pam = pamSil, hmm = hSil, kmn = kSil, 0))) - # Return: - list(method = switch(bestCluster, pam = "part. 
around medoids", - hmm = "minimax linkage", - kmn = "k-means", - none = "no significant clustering"), - n = 1 + switch(bestCluster, pam = bestPam, hmm = bestH, kmn = bestK, 0), - sil = switch(bestCluster, pam = pamSil, hmm = hSil, kmn = kSil, 0), - cluster = switch(bestCluster, pam = pamCluster, hmm = hCluster, kmn = kCluster, 1) - ) - - }), r$treeHash, silThreshold(), input$distMeth) - - LogClusterings <- function() { - maxCluster <- min(15L, length(r$trees) - 1L) - if (maxCluster > 1L) { - possibleClusters <- paste(2, maxCluster, sep = ":") - - hSil <- pamSil <- -99 - LogDistances() - dists <- distances() - - LogCommentP("Compute clusters of trees", 2) - nK <- length(possibleClusters) - LogCommentP("Try K-means++ clustering (Arthur & Vassilvitskii 2007):") - LogCodeP( - paste0( - "kClusters <- lapply(", possibleClusters, ", ", - "function (k) KMeansPP(dists, k)", ")" - ), - "kSils <- vapply(kClusters, function (kCluster) {", - " mean(cluster::silhouette(kCluster$cluster, dists)[, 3])", - "}, double(1))", - "bestK <- which.max(kSils)", - "kSil <- kSils[bestK] # Best silhouette coefficient", - "kCluster <- kClusters[[bestK]]$cluster # Best solution" - ) - - LogCommentP("Try partitioning around medoids (Maechler et al. 
2019):") - LogCodeP( - paste0( - "pamClusters <- lapply(", possibleClusters, ", ", - "function (k) cluster::pam(dists, k = k)", ")" - ), - "pamSils <- vapply(pamClusters, function (pamCluster) {", - " mean(cluster::silhouette(pamCluster)[, 3])", - "}, double(1))", - "bestPam <- which.max(pamSils)", - "pamSil <- pamSils[bestPam] # Best silhouette coefficient", - "pamCluster <- pamClusters[[bestPam]]$cluster # Best solution" - ) - - - LogCommentP( - paste("Try hierarchical clustering with minimax linkage", - "(Bien & Tibshirani 2011):") - ) - LogCodeP( - "hTree <- protoclust::protoclust(dists)", - paste0( - "hClusters <- lapply(", possibleClusters, ", ", - "function (k) cutree(hTree, k = k)", ")" - ), - "hSils <- vapply(hClusters, function (hCluster) {", - " mean(cluster::silhouette(hCluster, dists)[, 3])", - "}, double(1))", - "bestH <- which.max(hSils)", - "hSil <- hSils[bestH] # Best silhouette coefficient", - "hCluster <- hClusters[[bestH]] # Best solution" - ) - - LogCommentP("Set threshold for recognizing meaningful clustering") - LogCommentP("no support < 0.25 < weak < 0.5 < good < 0.7 < strong", 0) - LogCodeP(paste0("threshold <- ", silThreshold())) - - LogCommentP("Compare silhouette coefficients of each method") - LogCodeP( - "bestMethodId <- which.max(c(threshold, pamSil, hSil, kSil))", - "bestCluster <- c(\"none\", \"pam\", \"hmm\", \"kmn\")[bestMethodId]" - ) - if (clusterings()$n == 1) { - LogCommentP("No significant clustering was found.") - LogCodeP("clustering <- 1 # Assign all trees to single cluster") - } else { - LogCommentP(paste0("Best clustering was ", clusterings()$method, ":")) - LogCommentP(paste0("Silhouette coefficient = ", - signif(clusterings()$sil)), 0) - LogCommentP(paste0("Store the cluster to which each tree is ", - "optimally assigned:")) - LogCodeP(paste0( - "clustering <- switch(bestCluster, pam = pamCluster, hmm = hCluster,", - " kmn = kCluster, 1)"), - paste0("nClusters <- length(unique(clustering))"), - paste0( - "clusterCol <- 
", - EnC(palettes[[min(length(palettes), clusterings()$n)]]), - " # Arbitrarily" - ) - ) - } - } else { - LogCommentP("Not enough trees for clustering analysis") - LogCodeP("bestCluster <- \"none\"") - LogCodeP("nClusters <- 1") - } - } - - PlotClusterCons <- function() { - LogMsg("PlotClusterCons()") - on.exit(LogMsg("/PlotClusterCons()")) - - cl <- clusterings() - - kept <- KeptTips() - dropped <- if (length(kept) > 1) { - setdiff(TipLabels(r$trees[[1]]), kept) - } else { - character(0) - } - par(mar = c(0.2, 0, 0.2, 0), xpd = NA) - if (cl$sil > silThreshold()) { - nRow <- ceiling(cl$n / 3) - r$plottedTree <- vector("list", cl$n) - par(mfrow = c(nRow, ceiling(cl$n / nRow))) - - for (i in seq_len(cl$n)) { - col <- palettes[[min(length(palettes), cl$n)]][i] - PutTree(r$trees) - PutData(cl$cluster) - - cons <- ConsensusWithout(r$trees[cl$cluster == i], dropped, p = consP()) - cons <- UserRoot(cons) - if (unitEdge()) { - cons$edge.length <- rep.int(1, dim(cons$edge)[1]) - } - cons <- SortEdges(cons) - r$plottedTree[[i]] <- cons - plot(cons, edge.width = 2, font = 3, cex = 0.83, - edge.color = col, tip.color = TipCols()[cons$tip.label]) - legend("topright", paste0("Cluster ", i), pch = 15, col = col, - pt.cex = 1.5, bty = "n") - LabelConcordance() - } - } else { - PutTree(r$trees) - cons <- ConsensusWithout(r$trees, dropped, p = consP()) - cons <- UserRoot(cons) - if (unitEdge()) { - cons$edge.length <- rep.int(1, dim(cons$edge)[1]) - } - cons <- SortEdges(cons) - r$plottedTree <- cons - plot(cons, edge.width = 2, font = 3, cex = 0.83, - edge.color = palettes[[1]], tip.color = TipCols()[cons$tip.label]) - LabelConcordance() - legend("topright", "No clustering", pch = 16, col = palettes[[1]], - bty = "n") - } - } - - LogPlotClusterCons <- function() { - LogMsg("PlotClusterCons()") - on.exit(LogMsg("/PlotClusterCons()")) - - BeginLogP() - - cl <- clusterings() - LogClusterings() - - kept <- KeptTips() - dropped <- if (length(kept) > 1) { - 
setdiff(TipLabels(r$trees[[1]]), kept) - } else { - character(0) - } - if (cl$sil > silThreshold()) { - nRow <- ceiling(cl$n / 3) - LogCommentP("Plot consensus of each tree cluster", 2) - LogCodeP(paste0( - "par(mfrow = c(", nRow, ", ", - ceiling(cl$n / nRow), "))", - " # Plotting area layout" - )) - LogCodeP( - paste0( - "tipCols <- Rogue::ColByStability(trees)", - " # Colour tips by stability" - ) - ) - LogCommentP("Plot each consensus tree in turn:", 1) - LogCodeP(paste0("for (i in seq_len(", cl$n, ")) {")) - LogIndent(+2) - LogCodeP( - "clusterTrees <- trees[clustering == i]", - "cons <- ConsensusWithout(", - " trees = clusterTrees,", - paste0(" tip = ", EnC(dropped), ","), - paste0(" p = ", consP()), - ")" - ) - LogUserRoot(dropped = dropped) - if (unitEdge()) { - LogExprP("cons$edge.length <- rep.int(1, nrow(cons$edge))") - } - LogSortEdges("cons") - LogCodeP("plot(", - " cons,", - " edge.width = 2, # Widen lines", - " font = 3, # Italicize labels", - " cex = 0.83, # Shrink tip font size", - " edge.color = clusterCol[i], # Colour tree", - " tip.color = tipCols[cons$tip.label]", - ")") - LogCodeP("legend(", - " \"bottomright\",", - " paste(\"Cluster\", i),", - " pch = 15, # Filled circle icon", - " pt.cex = 1.5, # Increase icon size", - " col = clusterCol[i],", - " bty = \"n\" # Don't plot legend in box", - ")") - LogConcordance("cons") - LogIndent(-2) - LogCodeP("}") - } else { - LogCommentP("No clustering structure: Plot consensus tree") - LogCodeP( - if (length(dropped)) { - c("cons <- ConsensusWithout(", - " trees = trees,", - paste0(" tip = ", EnC(dropped), ","), - paste0(" p = ", consP()), - ")" - ) - } else { - paste0("cons <- Consensus(trees, p = ", consP(), ")") - } - ) - LogUserRoot("cons", dropped = dropped) - if (unitEdge()) { - LogCommentP("Set unit edge length", 0) - LogCodeP("cons$edge.length <- rep.int(1, nrow(cons$edge))") - } - LogSortEdges("cons") - LogCodeP("plottedTree <- cons # Store for future reference") - - LogCodeP("tipCols <- 
Rogue::ColByStability(trees)[cons$tip.label]") - LogCommentP("Plot consensus tree") - LogCodeP( - "plot(", - " cons,", - " edge.width = 2, # Widen lines", - " font = 3, # Italicize labels", - " cex = 0.83, # Shrink tip font size", - " tip.color = tipCols", - ")" - ) - LogConcordance() - } - } - - ############################################################################## - # Plot settings: point style - ############################################################################## - - spaceCex <- reactive(1.7) - spaceLwd <- reactive(2) - - FirstHit <- reactive({ - r$trees <- WhenFirstHit(r$trees) - - # Return: - attr(r$trees, "firstHit") - }) - - LogFirstHit <- function() { - LogCodeP("whenHit <- gsub(\"(seed|start|ratch\\\\d+|final)_\\\\d+\", \"\\\\1\", - names(trees), perl = TRUE)") - LogCodeP("attr(trees, \"firstHit\") <- table(whenHit)[unique(whenHit)]") - } - - FirstHitCols <- reactive({ - if (is.null(FirstHit())) { - palettes[[1]] - } else { - hcl.colors(length(FirstHit()), "viridis") - } - }) - - LogFirstHitCols <- reactive({ - if (is.null(FirstHit())) { - paste0(palettes[[1]], " # Arbitrarily") - } else { - "hcl.colors(length(firstHit), \"viridis\")" - } - }) - - TreeCols <- reactive({ - switch( - input$spaceCol, - "clust" = { - cl <- clusterings() - if (cl$sil > silThreshold()) { - palettes[[min(length(palettes), cl$n)]][cl$cluster] - } else { - palettes[[1]] - } - }, "score" = { - if (is.null(scores()) || length(unique(scores())) == 1L) { - palettes[[1]] - } else { - norm <- scores() - min(scores()) - norm <- (length(badToGood) - 1L) * norm / max(norm) - rev(badToGood)[1 + norm] - } - }, "firstHit" = { - if (is.null(FirstHit())) { - Notification("Data not available; were trees loaded from file?", - type = "warning") - palettes[[1]] - } else { - rep(FirstHitCols(), FirstHit()) - } - }, - "black" - ) - }) - - LogTreeCols <- reactive({ - beige <- paste0("treeCols <- ", Enquote(palettes[[1]]), " # Arbitrarily") - switch( - input$spaceCol, - "clust" = { - 
cl <- clusterings() - if (cl$sil > silThreshold()) { - paste0("treeCols <- ", - EnC(palettes[[min(length(palettes), cl$n)]]), - "[clustering]") - } else { - beige - } - }, "score" = { - if (is.null(scores()) || length(unique(scores())) == 1L) { - beige - } else { - c(paste0("scores <- TreeLength(trees, dataset, concavity = ", - Enquote(concavity()), ")"), - "normalized <- scores - min(scores)", - "normalized <- 107 * normalized / max(normalized)", - "goodToBad <- hcl.colors(108, \"Temps\")", - "treeCols <- goodToBad[1 + normalized]" - ) - } - }, "firstHit" = { - if (is.null(FirstHit())) { - beige - } else { - c("trees <- WhenFirstHit(trees)", - "firstHit <- attr(trees, \"firstHit\")", - paste0("treeCols <- rep(", LogFirstHitCols(), ", firstHit))") - ) - } - }, - "treeCols <- black" - ) - }) - - treeNameClustering <- reactive({ - ClusterStrings(names(r$trees)) - }) - - treePch <- reactive({ - switch( - input$spacePch, - "clust" = { - cl <- clusterings() - if (cl$sil > silThreshold()) { - cl$cluster - 1 - } else { - 16 # Filled circle - } - }, "relat" = { - quartet <- input$relators - if (length(quartet) == 4) { - QuartetResolution(r$trees, input$relators) - } else { - Notification("Select four taxa to show relationships") - 0 - } - }, "name" = { - if (is.null(names(r$trees))) { - Notification("Trees lack names", type = "warning") - 16 - } else { - indices <- treeNameClustering() - # Match pch from BGS2019 Fig. 9 for pre-loaded datasets. - # Embarrassingly, in BGS19 I plotted ambigAbsent instead of ambiguous. - # Sadly, Systematic Biology will not allow a correction. 
- c(1, 3, 4, 2, seq_len(max(indices))[-(1:4)])[indices] - } - }, 0) - }) - - LogTreePch <- function() { - switch( - input$spacePch, - "clust" = { - cl <- clusterings() - if (cl$sil > silThreshold()) { - "cl$cluster - 1" - } else { - "16 # No clustering structure: Use filled circle" - } - }, "relat" = { - quartet <- input$relators - if (length(quartet) == 4) { - paste0("QuartetResolution(trees, ", EnC(input$relators), ")") - } else { - "0 # Square" - } - }, "name" = { - if (is.null(names(r$trees))) { - "16 # Filled circle" - } else { - "ClusterStrings(names(trees))" - } - }, "0 # Square") - } - - maxProjDim <- reactive({ - min(12, max(0L, length(r$trees) - 1L)) - }) - - nProjDim <- reactive({ - dim(mapping())[2] - }) - - dims <- debounce(reactive({ - if (mode3D()) 3L else { - min(input$spaceDim, maxProjDim()) - } - }), 400) - - Quartet <- function (...) { - if (!requireNamespace("Quartet", quietly = TRUE)) { - Notification("Installing required package \"Quartet\"", - type = "warning", duration = 20) - install.packages("Quartet") - } - as.dist(Quartet::QuartetDivergence( - Quartet::ManyToManyQuartetAgreement(...), similarity = FALSE)) - } - - distances <- bindCache(reactive({ - ## CAUTION: LogDistances() must be updated to reflect any changes to - ## this code - LogMsg("distances(): ", input$distMeth) - if (length(r$trees) > 1L) { - Dist <- switch(input$distMeth, - "cid" = TreeDist::ClusteringInfoDistance, - "pid" = TreeDist::PhylogeneticInfoDistance, - "msid" = TreeDist::MatchingSplitInfoDistance, - "rf" = TreeDist::RobinsonFoulds, - "qd" = Quartet) - withProgress( - message = "Initializing distances...", value = 0.99, - Dist(r$trees) - ) - } else { - matrix(0, 0, 0) - } - - }), input$distMeth, r$treeHash) - - LogDistances <- function() { - LogCommentP("Compute tree distances") - LogCodeP(switch( - input$distMeth, - "cid" = "dists <- TreeDist::ClusteringInfoDistance(trees)", - "pid" = "dists <- TreeDist::PhylogeneticInfoDistance(trees)", - "msid" = "dists <- 
TreeDist::MatchingSplitInfoDistance(trees)", - "rf" = "dists <- TreeDist::RobinsonFoulds(trees)", - "qd" = c("dists <- as.dist(Quartet::QuartetDivergence(", - " Quartet::ManyToManyQuartetAgreement(trees),", - " similarity = FALSE)", ")") - )) - } - - mapping <- bindCache(reactive({ - LogMsg("mapping()") - if (maxProjDim() > 1L) { - withProgress( - message = "Mapping trees", - value = 0.99, - tryCatch(cmdscale(distances(), k = maxProjDim()), - warning = function (e) { - nDim <- as.integer(substr(e$message, 6, 7)) - updateSliderInput(inputId = "spaceDim", - value = min(nDim, input$spaceDim), - max = nDim) - message("Max dimensions available for mapping: ", nDim, ".") - cmdscale(distances(), k = nDim) - }) - ) - } else { - matrix(0, 0, 0) - } - }), r$treeHash, input$distMeth, maxProjDim()) - - LogMapping <- function() { - k <- dim(mapping())[2] - if (!is.null(k) && k > 0) { - LogCommentP(paste0( - "Generate first ", k, " dimensions of tree space using PCoA" - )) - LogCodeP(paste0("map <- cmdscale(dists, k = ", k, ")")) - } - } - - mstEnds <- bindCache(reactive({ - dist <- as.matrix(distances()) - withProgress(message = "Calculating MST", { - edges <- MSTEdges(dist) - }) - edges - }), input$distMeth, r$treeHash) - - ############################################################################## - # Plot tree space - ############################################################################## - # CAUTION: Remember to update accompanying logging function below. 
- TreespacePlot <- function() { - if (length(r$trees) < 3) { - return(ErrorPlot("Need at least\nthree trees to\nmap tree space")) - } - - cl <- clusterings() - map <- mapping() - - nDim <- min(dims(), nProjDim()) - if (nDim < 2) { - if (dim(map)[2] == 1L) { - map <- cbind(map, 0) - } else { - map[, 2] <- 0 - } - nDim <- 2L - nPanels <- 1L - } else { - plotSeq <- matrix(0, nDim, nDim) - nPanels <- nDim * (nDim - 1L) / 2L - plotSeq[upper.tri(plotSeq)] <- seq_len(nPanels) - if (nDim > 2) { - plotSeq[nDim - 1, 2] <- max(plotSeq) + 1L - } - layout(t(plotSeq[-nDim, -1])) - } - - par(mar = rep(0.2, 4)) - withProgress(message = "Drawing plot", { - for (i in 2:nDim) for (j in seq_len(i - 1)) { - incProgress(1 / nPanels) - # Set up blank plot - plot(map[, j], map[, i], ann = FALSE, axes = FALSE, - frame.plot = nDim > 2L, - type = "n", asp = 1, xlim = range(map), ylim = range(map)) - - # Connect sequential trees - if ("seq" %in% input$mapLines) { - lines(map[, j], map[, i], col = "#ffcc33", lty = 2) - } - - # Plot MST - if ("mst" %in% input$mapLines) { - segments(map[mstEnds()[, 1], j], map[mstEnds()[, 1], i], - map[mstEnds()[, 2], j], map[mstEnds()[, 2], i], - col = "#bbbbbb", lty = 1) - } - - - # Add points - points(map[, j], map[, i], pch = treePch(), - col = paste0(TreeCols(), as.hexmode(200)), - cex = spaceCex(), - lwd = spaceLwd() - )#input$pt.cex) - - if (cl$sil > silThreshold() && "hull" %in% input$mapLines) { - # Mark clusters - for (clI in seq_len(cl$n)) { - inCluster <- cl$cluster == clI - clusterX <- map[inCluster, j] - clusterY <- map[inCluster, i] - hull <- chull(clusterX, clusterY) - polygon(clusterX[hull], clusterY[hull], lty = 1, lwd = 2, - border = palettes[[min(length(palettes), cl$n)]][clI]) - } - } - if ("labelTrees" %in% input$display) { - text(map[, j], map[, i], names(r$trees)) - } - } - if (nDim > 2) { - plot.new() - } - if (input$spacePch == "relat") { - if (length(input$relators) == 4L) { - legend( - "topright", - bty = "n", - pch = 1:3, - xpd = NA, 
- pt.cex = spaceCex(), - pt.lwd = spaceLwd(), - gsub("_", " ", fixed = TRUE, - paste(input$relators[2:4], "&", input$relators[[1]])) - ) - } - } else if (input$spacePch == "name") { - clstr <- treeNameClustering() - clusters <- unique(clstr) - if (length(clusters) > 1L) { - legend(bty = "n", "topright", xpd = NA, - pch = c(1, 3, 4, 2, seq_len(max(clstr))[-(1:4)])[clusters], - paste0("~ ", attr(clstr, "med"), " (", table(clstr), ")")) - } - } - if (input$spaceCol == "firstHit" && length(FirstHit())) { - legend(bty = "n", "topleft", pch = 16, col = FirstHitCols(), - pt.cex = spaceCex(), - names(FirstHit()), title = "Iteration first hit") - } else if (input$spaceCol == "score") { - legendRes <- length(badToGood) - leg <- rep(NA, legendRes) - leg[c(legendRes, 1)] <- signif(range(scores())) - legend("bottomright", bty = "n", border = NA, - legend = leg, fill = rev(badToGood), - y.intersp = 0.04, cex = 1.1) - } - }) - } - - LogTreespacePlot <- function() { - BeginLogP() - - LogClusterings() - LogMapping() - - map <- mapping() - nDim <- min(dims(), nProjDim()) - if (nDim < 2) { - LogCommentP("Prepare 1D map", 0) - if (dim(map)[2] == 1L) { - LogCodeP("map <- cbind(map, 0)") - } else { - LogCodeP("map[, 2] <- 0") - } - nDim <- 2L - nPanels <- 1L - } else { - LogCommentP("Prepare plot layout") - - LogCodeP(c( - paste0("nDim <- ", nDim, " # Number of dimensions to plot"), - "nPanels <- nDim * (nDim - 1L) / 2L # Lower-left triangle", - "plotSeq <- matrix(0, nDim, nDim)", - "plotSeq[upper.tri(plotSeq)] <- seq_len(nPanels)", - if (nDim > 2) { - "plotSeq[nDim - 1, 2] <- max(plotSeq) + 1L" - }, - "layout(t(plotSeq[-nDim, -1]))" - )) - } - - LogCommentP("Set plot margins", 0) - LogCodeP("par(mar = rep(0.2, 4))") - - LogCommentP("Set up tree plotting symbols") - LogCodeP(paste0("treePch <- ", LogTreePch()), - LogTreeCols(), - "treeCols <- paste0(treeCols, as.hexmode(200)) # Semitransparent" - ) - - LogCodeP("for (i in 2:nDim) for (j in seq_len(i - 1)) {") - LogIndent(+2) - 
LogCommentP("Set up blank plot") - LogCodeP("plot(", - " x = map[, j],", - " y = map[, i],", - " ann = FALSE, # No annotations", - " axes = FALSE, # No axes", - paste0(" frame.plot = ", - if(nDim > 2L) { - "TRUE, # Border around plot" - } else { - "FALSE, # No border around plot" - }), - " type = \"n\", # Don't plot any points yet", - " asp = 1, # Fix aspect ratio to avoid distortion", - " xlim = range(map), # Constant X range for all dimensions", - " ylim = range(map) # Constant Y range for all dimensions", - ")") - - if ("seq" %in% input$mapLines) { - LogCommentP("Connect trees in sequence") - LogCodeP("lines(", - " x = map[, j],", - " y = map[, i],", - " col = \"#ffcc33\", # Orange", - " lty = 2 # dashed", - ")") - } - - if ("mst" %in% input$mapLines) { - LogCommentP("Plot minimum spanning tree (Gower 1969)") - LogCodeP( - "mst <- MSTEdges(as.matrix(dists))", - "segments(", - " x0 = map[mst[, 1], j],", - " y0 = map[mst[, 1], i],", - " x1 = map[mst[, 2], j],", - " y1 = map[mst[, 2], i],", - " col = \"#bbbbbb\", # Light grey", - " lty = 1 # Solid lines", - ")" - ) - } - - LogCommentP("Add points") - LogCodeP( - "points(", - " x = map[, j],", - " y = map[, i],", - " pch = treePch,", - " col = treeCols,", - paste0(" cex = ", spaceCex(), ", # Point size"), - paste0(" lwd = ", spaceLwd(), " # Line width"), - ")" - ) - - cl <- clusterings() - if (cl$sil > silThreshold() && "hull" %in% input$mapLines) { - LogCommentP("Mark clusters") - LogCodeP("for (clI in seq_len(nClusters)) {") - LogIndent(+2) - LogCodeP( - "inCluster <- clustering == clI", - "clusterX <- map[inCluster, j]", - "clusterY <- map[inCluster, i]", - "hull <- chull(clusterX, clusterY)", - "polygon(", - " x = clusterX[hull],", - " y = clusterY[hull],", - " lty = 1, # Solid line style", - " lwd = 2, # Wider line width", - " border = clusterCol[clI]", - ")") - LogIndent(-2) - LogCodeP("}") - } - if ("labelTrees" %in% input$display) { - #TODO input$display doesn't exist. If useful, implement below too. 
- LogCodeP("text(map[, j], map[, i], trees)") - } - - LogIndent(-2) - LogCodeP("}") - - if (nDim > 2) { - LogCodeP("plot.new() # Use new panel to plot legends") - } - - if (input$spacePch == "relat") { - if (length(input$relators) == 4L) { - LogCommentP("Add legend for plotting symbols") - LogCodeP( - "legend(", - " \"topright\",", - " bty = \"n\", # No legend border box", - " pch = 1:3, # Legend symbols", - " xpd = NA, # Display overflowing text", - paste0(" pt.cex = ", spaceCex(), ", # Point size"), - paste0(" pt.lwd = ", spaceLwd(), ", # Line width"), - paste0(" ", - EnC(gsub("_", " ", fixed = TRUE, - paste(input$relators[2:4], "&", input$relators[[1]]))) - ), ")" - ) - } - } else if (input$spacePch == "name") { - clstr <- treeNameClustering() - clusters <- unique(clstr) - if (length(clusters) > 1L) { - LogCommentP("Add legend for plotting symbols") - LogCodeP( - "nameClusters <- ClusterStrings(names(trees))", - "uniqueClusters <- unique(nameClusters)", - "legend(", - " \"topright\",", - " bty = \"n\", # No legend border box", - " xpd = NA, # Display overflowing text", - paste0( - " pch = ", - EnC(c(1, 3, 4, 2, seq_len(max(clstr))[-(1:4)])[clusters]), - ", # Legend symbols" - ), paste0(" ", - EnC(paste0("~ ", attr(clstr, "med"), - " (", table(clstr), ")")) - ), - ")") - } - } - if (input$spaceCol == "firstHit" && length(FirstHit())) { - LogCommentP("Record when trees first hit") - LogFirstHit() - - LogCommentP("Add legend for symbol colours") - LogCodeP( - "legend(", - " \"topleft\",", - " bty = \"n\", # No legend border box", - " pch = 16, # Circle symbol", - " xpd = NA, # Display overflowing text", - paste0(" col = ", LogFirstHitCols(), ","), - paste0(" pt.cex = ", spaceCex(), ", # Point size"), - paste0(" ", EnC(names(FirstHit())), ","), - " title = \"Iteration first hit\"", - ")" - ) - } else if (input$spaceCol == "score") { - LogCommentP("Add legend for symbol colours") - LogCodeP( - "goodToBad <- hcl.colors(108, \"Temps\")", - "leg <- rep_len(NA, 108)", - 
paste0("leg[c(1, 108)] <- ", EnC(rev(signif(range(scores()))))), - "legend(", - " \"bottomright\",", - " legend = leg,", - " bty = \"n\", # No legend border box", - " border = NA, # No border around plot icons", - " xpd = NA, # Display overflowing text", - " fill = goodToBad,", - " y.intersp = 0.04, # Compress squares to make gradient scale", - " cex = 1.1 # Increase font and icon size slightly", - ")" - ) - } - } - - mode3D <- reactive("show3d" %in% input$display) - - saveDetails <- reactive({ - switch(input$plotFormat, - "cons" = { - list( - fileName = "ConsensusTrees", - title = "Consensus tree - TreeSearch", - asp = 2L - ) - }, - "clus" = { - list( - fileName = "ClusterCons", - title = "Cluster Consensus trees - TreeSearch", - asp = 1.6 - ) - }, - "ind" = { - list( - fileName = "OptimalTree", - title = "Optimal tree - TreeSearch", - asp = 2L - ) - }, - "space" = { - list( - fileName = "TreeSpace", - title = "Tree space - TreeSearch", - asp = 1L - ) - }) - }) - - output$saveZip <- downloadHandler( - filename = function() paste0("TreeSearch-session.zip"), - content = function(file) { - if (isTRUE(getOption("shiny.testmode"))) { - file.copy(cmdLogFile, file) - } else { - zipDir <- tempfile("zip-") - dir.create(zipDir) - on.exit(unlink(zipDir)) - rFile <- paste0(zipDir, "/TreeSearch-session.R") - file.copy(cmdLogFile, rFile, overwrite = TRUE) - zip(file, c( - rFile, - if (r$dataFiles) - paste0(tempdir(), "/", DataFileName(seq_len(r$dataFiles))), - if (r$excelFiles) - paste0(tempdir(), "/", ExcelFileName(seq_len(r$excelFiles))), - if (r$treeFiles) - paste0(tempdir(), "/", TreeFileName(seq_len(r$treeFiles))) - ), flags = "-9Xj") - } - }) - - output$savePlotZip <- downloadHandler( - filename = function() paste0(saveDetails()$fileName, ".zip"), - content = function(file) { - StashTrees(r$allTrees) - - if (isTRUE(getOption("shiny.testmode"))) { - rCode <- RCode() - rCode <- sub("TreeSearch plot log: 2[\\d\\-]{9} [012][\\d:]{7}", - "TreeSearch plot log: ", - rCode, perl 
= TRUE) - rCode[4] <- "# System: " - rCode[5:9] <- sub("^(# \\- \\w+ ).*$", "\\1", - rCode[5:9], perl = TRUE) - rCode <- sub("dataFile <- .*$", - paste0("dataFile <- system.file(\"datasets/", - input$dataSource, - ".nex\", package = \"TreeSearch\") # FALSE CODE for TEST MODE"), - rCode, - perl = TRUE) - rCode <- sub("treeFile <- .*$", - "treeFile <- dataFile # Test mode", - rCode, - perl = TRUE) - writeLines(rCode, con = file) - } else { - tempDir <- tempfile("plot-zip-") - dir.create(tempDir) - on.exit(unlink(tempDir)) - rFile <- paste0(tempDir, "/", saveDetails()$fileName, ".R") - writeLines(RCode(), con = rFile) - - # Create ZIP - zip(file, c( - rFile, - paste0(tempdir(), "/", LastFile("data")), - paste0(tempdir(), "/", LastFile("excel")), - paste0(tempdir(), "/", LastFile("tree")) - ), flags = "-r9Xj") - } - }) - - output$savePng <- downloadHandler( - filename = function() paste0(saveDetails()$fileName, ".png"), - content = function (file) { - png(file, width = input$plotSize, height = input$plotSize) - MainPlot() - dev.off() - }) - - output$savePdf <- downloadHandler( - filename = function() paste0(saveDetails()$fileName, ".pdf"), - content = function (file) { - width <- 8 - pdf( - file, - title = saveDetails()$title, - width = width, - height = saveDetails()$asp * width - ) - MainPlot() - dev.off() - }) - - output$savePlotNwk <- downloadHandler( - filename = "TreeSearch-consensus.nwk", - content = function(file) { - write.tree(r$plottedTree, file = file) - } - ) - - output$savePlotNex <- downloadHandler( - filename = "TreeSearch-consensus.nex", - content = function(file) { - write.nexus(r$plottedTree, file = file) - } - ) - - output$saveNwk <- downloadHandler( - filename = "TreeSearch.nwk", - content = function(file) { - write.tree(r$trees, file = file, tree.names = TRUE) - } - ) - - output$saveNex <- downloadHandler( - filename = "TreeSearch.nex", - content = function(file) { - write.nexus(r$trees, file = file) - } - ) - - 
############################################################################## - # References - ############################################################################## - - output$references <- renderUI({ - tagList( - tags$h2("References for methods used"), - tags$h3("Tree search"), - HTML(Brazeau2019, Morphy, Nixon1999, SmithSearch), - tags$h3("Tree space mapping"), - HTML(paste0(Gower1966, Gower1969, Kaski2003, RCoreTeam, - SmithDist, Smith2020, SmithSpace, - Venna2001)), - tags$h3("Clustering"), - HTML(paste("Cluster consensus trees:", Stockham2002)), - HTML(paste0( - "k-means++:", Arthur2007, Hartigan1979, - "Partitioning around medoids:", Maechler2019, - "Hierarchical, minimax linkage:", Bien2011, Murtagh1983, - "Clustering evaluation:", Rousseeuw1987 - )), - tags$h3("Rogue taxa"), - HTML(paste("Detection:", SmithRogue)), - HTML(paste("Plotting:", Klopfstein2019)), - HTML(paste("Character analysis:", Pol2009)), - ) - }) - - onStop(function() { - options(startOpt) - if (file.exists(cmdLogFile)) { - unlink(cmdLogFile) - } - unlink(DataFileName("*")) - unlink(TreeFileName("*")) - if (logging) { - LogMsg("Session has ended") - on.exit(close(logMsgFile)) - } - }) -} - - -shinyApp(ui = ui, server = server) diff --git a/inst/Parsimony/global.R b/inst/Parsimony/global.R new file mode 100644 index 000000000..0061c7b7b --- /dev/null +++ b/inst/Parsimony/global.R @@ -0,0 +1,433 @@ +# options("TreeSearch.logging" = TRUE) # Log function entry and exit +# options("TreeSearch.write.code" = TRUE) # Show code as it is written to log +logging <- isTRUE(getOption("TreeSearch.logging")) +options(shiny.maxRequestSize = 1024 ^ 3) # Allow max 1 GB files + +# Development: prepend .agent-shiny library so library("TreeSearch") finds +# the pre-built v2.0.0 install, preventing pkgload from intercepting and +# attempting a debug recompile (which fails when src/*.o files are stale). 
+local({ + shiny_lib <- normalizePath( + file.path(dirname(dirname(getwd())), ".agent-shiny"), + mustWork = FALSE + ) + if (dir.exists(shiny_lib)) { + .libPaths(c(shiny_lib, .libPaths())) + } +}) + +library("methods", exclude = c("show", "removeClass")) +library("cli") +library("TreeSearch") # load now: inapplicable.datasets required within ui +.DateTime <- function() { # Copy, because not exported + format(Sys.time(), "%Y-%m-%d %T") +} + +local({ + needed <- c("cluster", "future", "PlotTools", "promises", + "protoclust", "Rogue", "shinyjs") + miss <- needed[!vapply(needed, requireNamespace, logical(1L), quietly = TRUE)] + if (length(miss)) { + message("Installing packages required by EasyTrees(): ", + paste(miss, collapse = ", ")) + utils::install.packages(miss) + } +}) + +suppressPackageStartupMessages({ + library("shiny", exclude = c("runExample")) + library("shinyjs", exclude = c("runExample")) +}) +library("TreeTools", quietly = TRUE) +library("TreeDist", quietly = TRUE) +library("future") +library("promises") + + +if (logging) { + logMsgFile <- file("log.lg", open = "w+") + LogMsg <- function (...) { + message(.DateTime(), ": ", ...) + writeLines(.DateTime(), con = logMsgFile) + writeLines(paste0(" ", ...), con = logMsgFile) + } + Put <- function (..., file) { + dput(..., file = file) + writeLines(gsub("", "NULL", readLines(file)), + file) + } + PutTree <- function (...) { + Put(..., file = "tree.lg") + } + PutData <- function (...) { + Put(..., file = "dataset.lg") + } +} else { + PutData <- PutTree <- LogMsg <- function (...) {} +} + +WriteLoggedCode <- if (isTRUE(getOption("TreeSearch.write.code"))) { + if (requireNamespace("crayon", quietly = TRUE)) { + function(txt) { + for (line in txt) cat(if (substr(trimws(line), 0, 1) == "#") { + crayon::green(" ", line, "\n") + } else { + crayon::yellow(" ", line, "\n") + }) + } + } else { + function(txt) message(" ", txt) + } +} else { + function(txt) {} +} + +Notification <- function (...) 
{ + if (!isTRUE(getOption("shiny.testmode"))) { + showNotification(...) + } +} + +Icon <- function(...) icon(..., class = "fas") + +aJiffy <- 42 # ms, default debounce period for input sliders etc +typingJiffy <- 2.5 * aJiffy # slightly slower if might be typing +aFewTrees <- 48L # Too many and rogues / tree space are slowed +NO_OUTGROUP <- "! TREESEARCH_no outgroup specified ." + +palettes <- list("#7a6c36", + c("#7a6c36", "#864885"), + c("#7a6c36", "#864885", "#427743"), + c("#7a6c36", "#864885", "#427743", "#4c5c86"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca", "#364020"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca", "#364020", "#c241a7"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#824eca", "#b3622a", "#452580", "#417f81", "#ca4172", "#6171ca", "#364020", "#c241a7", "#391d42"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#85c6f9", "#fbd1a0", "#7696be", "#89996c", "#ddcdff", "#719d89", "#f5cde6", "#b6e0da", "#e8d4cd", "#b5ddfa"), + c("#7a6c36", "#864885", 
"#427743", "#4c5c86", "#cb4745", "#73383b", "#bbcb8f", "#bf82ab", "#85ddc4", "#eea0ba", "#c1d8ff", "#c3818b", "#c5c6ff", "#999388", "#e8cbff", "#ffb5b6", "#d2dad7"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#cc8f6f", "#499fae", "#d9dca6", "#7796b8", "#bee1ba", "#b4daff", "#919583", "#e2d3e9", "#47a19b", "#ebd4bc", "#7c9993", "#a9e3e0"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#a8e0fe", "#fad0a8", "#679e8d", "#ffc7b1", "#abe5c0", "#ac8d78", "#c5dddc", "#a48f84", "#cadfb0", "#899694", "#fdcdc1", "#d1dad5", "#dfd8c4"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#dcb983", "#77bff0", "#f0ab92", "#90ddff", "#f1d3a9", "#b5c2fe", "#c1e1b7", "#7596ba", "#bce1c4", "#a88c96", "#5a9daf", "#b18b80", "#d4d6f3", "#949577"), + c("#7a6c36", "#864885", "#427743", "#4c5c86", "#cb4745", "#73383b", "#e03795", "#438f2e", "#5e2195", "#758029", "#4042b9", "#a37926", "#8364df", "#c3671f", "#444491", "#dc4c1f", "#367076", "#e2383c", "#4786b4", "#e13964", "#4c8c73", "#a53396", "#2c4422", "#b553cb", "#50381b", "#4f75d8", "#a12c1b", "#8576b8", "#bd6541", "#3a1959", "#83491f", "#2d2644", "#c45b94", "#451523", "#966883", "#782224", "#b96563", "#762254", "#95765c", "#ad355a") +) + +ErrorPlot <- function (...) 
{ + plot(0, 0, type = "n", axes = FALSE, ann = FALSE) + text(0, 0, paste0(..., collapse = "\n"), + col = "#dd6611", font = 2) +} + +badToGood <- rev(c("#1AB958", "#23B956", "#2BB954", "#31B952", "#37B850", "#3CB84E", "#41B84C", "#45B74A", "#49B749", "#4DB747", "#51B645", "#54B643", "#58B641", "#5BB53F", "#5FB53D", "#62B53C", "#65B43A", "#68B438", "#6BB336", "#6DB335", "#70B333", "#73B231", "#76B230", "#78B12E", "#7BB12C", "#7DB02B", "#80B029", "#82AF28", "#85AF26", "#87AE25", "#8AAE23", "#8CAD22", "#8EAD21", "#91AC1F", "#93AC1E", "#95AB1D", "#97AB1C", "#9AAA1B", "#9CAA1A", "#9EA919", "#A0A918", "#A2A818", "#A4A717", "#A6A716", "#A8A616", "#AAA616", "#ACA515", "#AEA415", "#B0A415", "#B2A315", "#B4A315", "#B6A216", "#B8A116", "#B9A117", "#BBA017", "#BD9F18", "#BF9F18", "#C19E19", "#C29D1A", "#C49D1B", "#C69C1C", "#C79B1D", "#C99A1E", "#CB9A1F", "#CC9920", "#CE9822", "#CF9823", "#D19724", "#D29625", "#D49626", "#D59528", "#D79429", "#D8932A", "#D9932C", "#DB922D", "#DC912E", "#DD9130", "#DF9031", "#E08F33", "#E18F34", "#E28E35", "#E38D37", "#E58C38", "#E68C3A", "#E78B3B", "#E88A3D", "#E98A3E", "#EA8940", "#EB8841", "#EC8843", "#ED8744", "#EE8746", "#EE8647", "#EF8549", "#F0854A", "#F1844C", "#F2844D", "#F2834F", "#F38350", "#F48252", "#F48253", "#F58155", "#F58157", "#F68058", "#F6805A", "#F77F5B", "#F77F5D", "#F87E5E")) + +Reference <- function (authors, year, title, journal = "", + volume = NULL, pages = NULL, doi = NULL, + publisher = NULL, editors = NULL) { + nAuth <- length(authors) + if (nAuth > 1L) { + authors <- paste(paste0(authors[-nAuth], collapse = ", "), "&", authors[nAuth]) + } + nEd <- length(editors) + if (nEd > 1L) { + editors <- paste(paste0(editors[-nEd], collapse = ", "), "&", editors[nEd]) + } else if (nEd < 1) { + editors <- "" + } + paste0("

", authors, " (", year, "). “", title, + "”. ", + if (editors != "") paste0("In: ", editors, " (eds). ") else "", + if (journal != "") paste0("", journal, " ") else "", + if (is.null(volume)) "" else paste0("", volume, ":"), + if (is.null(publisher)) "" else paste0(publisher, ". "), + if (is.null(pages)) "" else paste0(paste0(pages, collapse = "–"), ". "), + if (is.null(doi)) "" else paste0( + "doi:", + doi, ". "), + "

") +} + + +Arthur2007 <- Reference( + c("Arthur, D.", "Vassilvitskii, S"), + title = "k-means++: the advantages of careful seeding", + year = 2007, + journal = "Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms", + pages = c(1027, 1035) +) +Brazeau2019 <- Reference(c("Brazeau, M.D.", "Guillerme, T.", "Smith, M.R."), 2019, + title = "An algorithm for morphological phylogenetic analysis with inapplicable data", + journal = "Systematic Biology", + volume = 64, + pages = c(619, 631), + doi = "10.1093/sysbio/syy083") +Goloboff1993 <- Reference("Goloboff, P.A.", 1993, + "Estimating character weights during tree search", + "Cladistics", volume = 9, + pages = c(83, 91), + doi = "10.1111/j.1096-0031.1993.tb00209.x") +Goloboff1999 <- Reference("Goloboff, P.A.", 1999, + "Analyzing large data sets in reasonable times: solutions for composite optima", + "Cladistics", volume = 15, + pages = c(415, 428), + doi = "10.1006/clad.1999.0122") +Goloboff2014 <- Reference("Goloboff, P.A.", 2014, + "Extended implied weighting", + "Cladistics", volume = 30, + pages = c(260, 272), + doi = "10.1111/cla.12047") +Bien2011 <- Reference( + c("Bien, J.", "Tibshirani, R."), + title = "Hierarchical clustering with prototypes via minimax linkage", + year = 2011, + volume = 106, + doi = "10.1198/jasa.2011.tm10183", + pages = c(1075, 1084), + journal = "Journal of the American Statistical Association") +Gower1966 <- Reference(title = "Some distance properties of latent root and vector methods used in multivariate analysis", + authors = "Gower, J.C.", + year = 1966, + volume = 53, + pages = c(325, 338), + doi = "10.2307/2333639", + journal = "Biometrika") +Gower1969 <- Reference( + title = "Minimum spanning trees and single linkage cluster analysis", + authors = c("Gower, J.C.", "Ross, G.J.S."), + year = 1969, volume = 18, pages = c(54, 64), doi = "10.2307/2346439", + journal = "Journal of the Royal Statistical Society Series C (Applied Statistics)") +Hartigan1979 <- 
Reference( + title = "Algorithm AS 136: a K-means clustering algorithm", + authors = c("Hartigan, J.A.", "Wong, M.A."), + journal = "Journal of the Royal Statistical Society Series C (Applied Statistics)", + year = 1979, volume = 28, pages = c(100, 108), + doi = "10.2307/2346830") +Kaski2003 <- Reference( + title = "Trustworthiness and metrics in visualizing similarity of gene expression", + authors = c("Kaski, S.", "Nikkilä, J.", "Oja, M.", "Venna, J.", + "Törönen, P.", "Castrén, E."), + year = 2003, volume = 4, pages = 48, doi = "10.1186/1471-2105-4-48", + journal = "BMC Bioinformatics") +Klopfstein2019 <- Reference( + title = "Illustrating phylogenetic placement of fossils using RoguePlots: An example from ichneumonid parasitoid wasps (Hymenoptera, Ichneumonidae) and an extensive morphological matrix.", + authors = c("Klopfstein, S.", "Spasojevic, T."), year = 2019, + journal = "PLoS ONE", volume = 14, pages = "e0212942", + doi = "10.1371/journal.pone.0212942" +) +Maechler2019 <- Reference( + title = "cluster: cluster analysis basics and extensions", year = 2022, + authors = c("Maechler, M.", "Rousseeuw, P.", "Struyf, A.", "Hubert, M.", "Hornik, K."), + journal = "Comprehensive R Archive Network") +Morphy <- Reference( + c("Brazeau, M.D.", "Smith, M.R.", "Guillerme, T."), 2017, + "MorphyLib: a library for phylogenetic analysis of categorical trait data with inapplicability", + doi = "10.5281/zenodo.815371") +Murtagh1983 <- Reference( + title = "A survey of recent advances in hierarchical clustering algorithms", + authors = "Murtagh, F.", year = 1983, volume = 26, pages = c(354, 359), + doi = "10.1093/comjnl/26.4.354", journal = "The Computer Journal") +Nixon1999 <- Reference( + "Nixon, K.C.", 1999, + journal = "Cladistics", volume = 15, pages = c(407, 414), + title = "The Parsimony Ratchet, a new method for rapid parsimony analysis", + doi = "10.1111/j.1096-0031.1999.tb00277.x") +Pol2009 <- Reference( + title = "Unstable taxa in cladistic analysis: 
identification and the assessment of relevant characters", + authors = c("Pol, D.", "Escapa, I.H."), + journal = "Cladistics", 2009, 25, pages = c(515, 527), + doi = "10.1111/j.1096-0031.2009.00258.x") +RCoreTeam <- Reference( + authors = "R Core Team", year = 2020, + title = "R: A language and environment for statistical computing", + publisher = "R Foundation for Statistical Computing, Vienna, Austria") +Rousseeuw1987 <- Reference( + title = "Silhouettes: a graphical aid to the interpretation and validation of cluster analysis", + author = "Rousseeuw, P.J.", year = 1987, + journal = "Journal of Computational and Applied Mathematics", + volume = 20, pages = c(53, 65), doi = "10.1016/0377-0427(87)90125-7" +) +SmithDist <- Reference( + "Smith, M.R.", "2020a", "TreeDist: distances between phylogenetic trees", + doi = "10.5281/zenodo.3528123", "Comprehensive R Archive Network") +SmithQuartet <- Reference( + "Smith, M.R.", 2019, + "Quartet: comparison of phylogenetic trees using quartet and split measures", + "Comprehensive R Archive Network", doi = "10.5281/zenodo.2536318") +SmithSearch <- Reference( + "Smith, M.R.", 2023, "TreeSearch: morphological phylogenetic analysis in R", + "R Journal", volume = 14, pages = c(305, 315), + doi = "10.32614/RJ-2023-019") +Smith2020 <- Reference( + "Smith, M.R.", "2020b", + "Information theoretic Generalized Robinson-Foulds metrics for comparing phylogenetic trees", + "Bioinformatics", volume = 36, pages = c("5007", "5013"), + doi = "10.1093/bioinformatics/btaa614") +SmithSpace <- Reference( + "Smith, M.R.", "2022a", "Robust analysis of phylogenetic tree space", + "Systematic Biology", 71, pages = c("1255", "1270"), + doi = "10.1093/sysbio/syab100") +SmithRogue <- Reference( + "Smith, M.R.", "2022b", + "Using information theory to detect rogue taxa and improve consensus trees", + "Systematic Biology", 71, pages = c("1088", "1094"), + doi = "10.1093/sysbio/syab099") +Stockham2002 <- Reference( + authors = c("Stockham, C.", "Wang, 
L.-S.", "Warnow, T."), 2002, + "Statistically based postprocessing of phylogenetic analysis by clustering", + "Bioinformatics", 18, c("S285", "S293"), + doi = "10.1093/bioinformatics/18.suppl_1.S285") + +Venna2001 <- Reference( + title = "Neighborhood preservation in nonlinear projection methods: an experimental study", + authors = c("Venna, J.", "Kaski, S."), year = 2001, pages = c(485, 491), + journal = "Lecture Notes in Computer Science: Artificial Neural Networks—ICANN 2001", + editors = c("Dorffner, G.", "Bischof, H.", "Hornik, K."), + publisher = "Springer, Berlin", + doi = "10.1007/3-540-44668-0_68") + + + + + + + + + + + + + + + +Enquote <- function(x, ...) { + if (mode(x) == "character") { + paste0("\"", x, "\"") + } else { + signif(x, ...) + } +} + +#' Confidence text for post-search results display. +#' +#' Given K hits to best score in R total runs, returns a plain-text +#' summary: "K of R runs hit best score. Probability that a better tree +#' exists: ~X%". +#' +#' @param K integer. Cumulative hits to best score. +#' @param R integer. Cumulative runs completed. +#' @return character(1) or NULL if no search data. +FormatMissProb <- function(prob) { + pct <- prob * 100 + if (pct >= 1) paste0("~", round(pct), "%") + else if (pct >= 0.1) "<1%" + else if (pct >= 0.01) "<0.1%" + else "<0.01%" +} + +SearchConfidenceText <- function(K, R, nSearches = 1L, + nTopologies = NULL, + lastImprovedRep = NULL, + stopReason = NULL, + replicateScores = NULL) { + if (is.null(K) || is.null(R) || R <= 0L || K <= 0L) return(NULL) + K <- min(K, R) + + # Tightened binomial bound: (1 - K/R)^R is tighter than exp(-K) when K < R. + # Falls back to exp(-K) when K == R, since (1 - 1)^R = 0 is overconfident. 
+ prob_miss <- if (K < R) (1 - K / R) ^ R else exp(-K) + + runs_label <- if (!is.null(nSearches) && nSearches > 1L) { + paste0("total runs across ", nSearches, " searches") + } else { + "runs" + } + + # Only warn when a single topology limits the independence assumption + topo_note <- if (!is.null(nTopologies) && nTopologies == 1L) { + " [single topology \u2014 limited independence]" + } else { + "" + } + + # Trajectory info + trajectory_note <- if (!is.null(lastImprovedRep) && R > 1L) { + paste0(" Last improvement: replicate ", lastImprovedRep, ".") + } else { + "" + } + + # Landscape ruggedness flag + rugged_note <- if (K / R < 0.3 && R >= 5L) { + paste0(" Hit rate low (", round(100 * K / R), + "%) \u2014 more replicates may help.") + } else { + "" + } + + # Nudge for small K == R + small_sample_note <- if (K == R && R <= 5L) { + paste0(" \u2014 increase \u2018Stop when N runs hit best\u2019 for a ", + "tighter estimate") + } else { + "" + } + + stop_note <- if (identical(stopReason, "consensus")) { + " Search stopped: consensus tree unchanged across recent replicates." + } else if (identical(stopReason, "timeout")) { + " Search stopped: time limit reached." + } else { + "" + } + + # Chao1-style landscape coverage (appended when enough replicates available) + coverage_note <- if (!is.null(replicateScores) && + length(replicateScores) >= 5L) { + sp <- tryCatch(ScoreSpectrum(replicateScores), error = function(e) NULL) + if (!is.null(sp) && !is.na(sp$coverage)) { + pct <- round(100 * sp$coverage) + paste0(" Landscape coverage: ~", pct, "%", + if (sp$unseen_fraction > 0.05) + paste0(" (~", round(100 * sp$unseen_fraction), + "% of score levels unseen)") + else + "") + } else { + "" + } + } else { + "" + } + + paste0(K, " of ", R, " ", runs_label, " hit best score. ", + "Probability that a better score exists: ", + FormatMissProb(prob_miss), + topo_note, trajectory_note, rugged_note, small_sample_note, + stop_note, coverage_note) +} + +EnC <- function(...) 
{ + if (length(...) == 1) { + Enquote(...) + } else { + paste0("c(", paste(sapply(..., Enquote), collapse = ", "), ")") + } +} + +# Shiny modules — sourced here so ui.R can call xxx_ui() at definition time +source("server/mod_references.R") +source("server/mod_downloads.R") +dl_ui <- downloads_ui("dl") +source("server/mod_search.R") +se_ui <- search_ui("search") +source("server/mod_data.R") +source("server/mod_clustering.R") +source("server/mod_treespace.R") +source("server/mod_consensus.R") +data_ui_elems <- data_ui("data") +co_ui <- consensus_ui("consensus") diff --git a/inst/Parsimony/server.R b/inst/Parsimony/server.R new file mode 100644 index 000000000..e7d512b15 --- /dev/null +++ b/inst/Parsimony/server.R @@ -0,0 +1,200 @@ +server <- function(input, output, session) { + + source("server/app_state.R", local = TRUE) + r <- AppState() + exportTestValues(searchCount = { r$searchCount }) + + # Async search setup (was in search.R) + plan(multisession) + startOpt <- options("cli.progress_show_after" = 0.1) + + source("server/logging.R", local = TRUE) + LogMsg("Started server") + + # Forward-reference bridge for callbacks defined after the data module + cb_ref <- new.env(parent = emptyenv()) + + # Data module (replaces data.R + trees.R + data event bindings from events.R) + dt <- data_server("data", + r = r, + parent_session = session, + callbacks = list( + DisplayTreeScores = function() cb_ref$DisplayTreeScores(), + UpdateKeepNTipsRange = function() cb_ref$UpdateKeepNTipsRange(), + UpdateDroppedTaxaDisplay = function() cb_ref$UpdateDroppedTaxaDisplay(), + UpdateOutgroupInput = function() cb_ref$UpdateOutgroupInput() + ), + log_fns = list( + LogMsg = LogMsg, + LogComment = LogComment, + LogCode = LogCode, + CacheInput = CacheInput, + LastFile = LastFile + ) + ) + # Expose data module reactives for other modules + AnyTrees <- dt$AnyTrees + HaveData <- dt$HaveData + tipLabels <- dt$tipLabels + nChars <- dt$nChars + TaxonOrder <- dt$TaxonOrder + DatasetMatchesTrees <- 
dt$DatasetMatchesTrees + UpdateAllTrees <- dt$UpdateAllTrees + UpdateActiveTrees <- dt$UpdateActiveTrees + + # Search module + se <- search_server("search", + r = r, + AnyTrees = AnyTrees, + HaveData = HaveData, + UpdateAllTrees = UpdateAllTrees, + log_fns = list( + LogMsg = LogMsg, + LogCode = LogCode, + LogComment = LogComment + ) + ) + scores <- se$scores + concavity <- se$concavity + DisplayTreeScores <- se$DisplayTreeScores + + # Show/hide config panels based on active plot format + ShowConfigs <- function(visible = character(0)) { + allConfigs <- c("whichTree", "charChooser", + "consConfig", "clusConfig", + "clusLegend", "branchLegend", + "spaceConfig", "treePlotConfig", + "mapConfig", "savePlottedTrees", + "droppedTips", "droppedList") + r$visibleConfigs <- visible + lapply(visible, show) + lapply(setdiff(allConfigs, visible), hide) + } + + observeEvent(input$plotFormat, { + ShowConfigs(switch(input$plotFormat, + "ind" = c("whichTree", "charChooser", "treePlotConfig"), + "cons" = c("consConfig", "droppedTips", "savePlottedTrees", + "treePlotConfig", "branchLegend"), + "clus" = c("clusConfig", "clusLegend", "savePlottedTrees", + "consConfig", "droppedList", "treePlotConfig"), + "space" = c("clusConfig", "clusLegend", "spaceConfig", "mapConfig"), + "")) + }) + + # Clustering module + cl <- clustering_server("clustering", + r = r, + distMeth = reactive(input$distMeth), + log_fns = list( + LogMsg = LogMsg, + LogCommentP = LogCommentP, + LogCodeP = LogCodeP, + LogIndent = LogIndent, + BeginLogP = BeginLogP, + LogExprP = LogExprP + ) + ) + distances <- cl$distances + LogDistances <- cl$LogDistances + silThreshold <- cl$silThreshold + clusterings <- cl$clusterings + LogClusterings <- cl$LogClusterings + + # Treespace module + ts <- treespace_server("treespace", + r = r, + clusterings = clusterings, + silThreshold = silThreshold, + scores = scores, + concavity = concavity, + distMeth = reactive(input$distMeth), + plotFormat = reactive(input$plotFormat), + distances 
= distances, + mapLines = reactive(input$mapLines), + LogDistances = LogDistances, + log_fns = list( + BeginLogP = BeginLogP, + LogCommentP = LogCommentP, + LogCodeP = LogCodeP, + LogIndent = LogIndent, + LogClusterings = LogClusterings + ) + ) + saveDetails <- ts$saveDetails + + # Consensus module (replaces consensus.R + clustering.R + events.R bindings) + co <- consensus_server("consensus", + r = r, + AnyTrees = AnyTrees, + HaveData = HaveData, + tipLabels = tipLabels, + nChars = nChars, + TaxonOrder = TaxonOrder, + concavity = concavity, + clusterings = clusterings, + silThreshold = silThreshold, + LogClusterings = LogClusterings, + TreespacePlot = ts$TreespacePlot, + LogTreespacePlot = ts$LogTreespacePlot, + dims = ts$dims, + nProjDim = ts$nProjDim, + TreeCols = ts$TreeCols, + treePch = ts$treePch, + ts_spaceCol = ts$spaceCol, + ts_mapLines = ts$mapLines, + ts_spacePch = ts$spacePch, + ts_relators = ts$relators, + plotFormat = reactive(input$plotFormat), + plotSize = reactive(input$plotSize), + distMeth = reactive(input$distMeth), + log_fns = list( + LogMsg = LogMsg, + LogComment = LogComment, + LogCode = LogCode, + LogCommentP = LogCommentP, + LogCodeP = LogCodeP, + LogIndent = LogIndent, + BeginLogP = BeginLogP, + LogExprP = LogExprP + ) + ) + + # Wire forward-reference callbacks (consensus module now defined) + cb_ref$DisplayTreeScores <- DisplayTreeScores + cb_ref$UpdateKeepNTipsRange <- co$UpdateKeepNTipsRange + cb_ref$UpdateDroppedTaxaDisplay <- co$UpdateDroppedTaxaDisplay + cb_ref$UpdateOutgroupInput <- co$UpdateOutgroupInput + + # Downloads module + downloads_server( + "dl", + state = r, + dataSource = dt$dataSource, + plotSize = reactive(input$plotSize), + cmdLogFile = cmdLogFile, + stashTrees = StashTrees, + dataFileName = DataFileName, + excelFileName = ExcelFileName, + treeFileName = TreeFileName, + lastFile = LastFile, + mainPlot = co$MainPlot, + rCode = co$RCode, + saveDetails = saveDetails + ) + references_server("refs", weighting = se$weighting) 
+ + onStop(function() { + options(startOpt) + if (file.exists(cmdLogFile)) { + unlink(cmdLogFile) + } + # Clean cached input files from tempdir (data, tree, and excel) + unlink(list.files(tempdir(), pattern = "^(data|tree|excel)File-", + full.names = TRUE)) + if (logging) { + LogMsg("Session has ended") + on.exit(close(logMsgFile)) + } + }) +} diff --git a/inst/Parsimony/server/app_state.R b/inst/Parsimony/server/app_state.R new file mode 100644 index 000000000..612a6dba8 --- /dev/null +++ b/inst/Parsimony/server/app_state.R @@ -0,0 +1,78 @@ +# AppState: Centralized reactive state for the TreeSearch Shiny app. +# +# All reactive values used by server modules are defined here with explicit +# initial values and domain grouping. This replaces the ad-hoc +# reactiveValues() call in server.R. +# +# Usage in server.R: +# r <- AppState() +# +# Modules access fields via r$fieldName. See field documentation below. + +AppState <- function() { + reactiveValues( + + # -- Data domain -- + # Primary dataset and metadata loaded from file + dataset = NULL, # phyDat object (or NULL before load) + chars = NULL, # character matrix from ReadCharacters() + charNotes = NULL, # character notes from ReadNotes() + dataHash = NULL, # rlang::hash() of dataset (change trigger) + dataFileVisible = TRUE, # whether file-upload UI is shown + readDataFile = NULL, # string: R expression used to read data file + + # -- File tracking (logging) -- + # Counters for unique file uploads per session (used by logging) + dataFiles = 0, # count of data file uploads + excelFiles = 0, # count of Excel file uploads + treeFiles = 0, # count of tree file uploads + + # -- Tree domain -- + # Trees loaded from files or produced by search + allTrees = NULL, # multiPhylo: full tree set (unsorted/unfiltered) + trees = NULL, # multiPhylo: active subset (after range/thin) + treeHash = NULL, # rlang::hash() of trees (change trigger) + newTrees = NULL, # multiPhylo: trees from most recent search + sortTrees = FALSE, # 
logical: sort trees by score before display + readTreeFile = NULL, # string: R expression used to read tree file + + # -- Tree subsetting state -- + nTree = 0L, # integer: current max trees to display + treeRange = c(1L, 1L), # integer[2]: active range of tree indices + updatingTrees = FALSE, # reentrancy guard for UpdateActiveTrees() + + # -- "Old" values for change detection -- + # These track previous input values so observers can detect real changes + # vs reactive re-fires. Will be removed when modules handle own state. + oldNTree = NULL, # previous value of input$nTree + oldTreeRange = NULL, # previous value of input$treeRange + oldOutgroup = NO_OUTGROUP, # previous value of input$outgroup + oldkeepNTips = NULL, # previous value of input$keepNTips + + # -- Search domain -- + searchCount = 0L, # integer: how many searches have been run + searchDataHash = NULL, # hash of dataset at search time + searchNotification = NULL, # Shiny notification ID (for dismissal) + searchInProgress = FALSE, # TRUE while searchTask is running + searchWithout = NULL, # character: taxa excluded from search + bestSearchScore = NULL, # numeric: best score seen across all searches (for accumulation) + searchTotalHits = 0L, # integer: cumulative hits_to_best across runs at current best score + searchTotalReps = 0L, # integer: cumulative runs completed at current best score + searchReplicateScores = numeric(0), # numeric: per-replicate scores for ScoreSpectrum coverage estimation + searchConsensusStable = FALSE, # logical: TRUE if latest search stopped due to consensus stability + searchTimedOut = FALSE, # logical: TRUE if latest search stopped due to timeout + + # -- Consensus / plotting domain -- + outgroup = NULL, # character: selected outgroup taxa + keepNTips = NULL, # integer: tips retained in consensus + plottedTree = NULL, # phylo or list: tree(s) currently plotted + concordance = list(), # list: cached concordance results + plotLog = NULL, # character vector: R code log for plot 
+ + # -- Cluster domain -- + # (r$cluster is not a state field; clustering.R uses local variables) + + # -- UI state -- + visibleConfigs = NULL # character: which config panels are visible + ) +} diff --git a/inst/Parsimony/server/logging.R b/inst/Parsimony/server/logging.R new file mode 100644 index 000000000..f81398e8b --- /dev/null +++ b/inst/Parsimony/server/logging.R @@ -0,0 +1,233 @@ + serverEnv <- environment() + logIndent <- 0 + loggingOn <- TRUE + + cmdLogFile <- tempfile("TreeSearch-", fileext = ".R") + Write <- function (txt, file) { + if (serverEnv$loggingOn) { + txt <- paste0(strrep(" ", logIndent), txt) + con <- file(file, open = "a") + on.exit(close(con)) + if (logging) { + WriteLoggedCode(txt) + } + writeLines(txt, con) + } + } + + WriteP <- function (txt, file = NULL) { + if (serverEnv$loggingOn) { + txt <- paste0(strrep(" ", logIndent), txt) + if (logging) { + WriteLoggedCode(txt) + } + r$plotLog <- c(r$plotLog, as.character(txt)) + } + } + + LogExpr <- function(exps, evaluate = TRUE, WriteFn = Write) { + for (exp in exps) { + WriteFn(as.character(exp), cmdLogFile) + if (evaluate) { + eval(exp) + } + } + } + + LogExprP <- function(...) 
{
+    LogExpr(..., WriteFn = WriteP)
+  }
+
+  # Adjust the indentation applied to subsequently logged lines by `n` levels
+  # (a negative `n` outdents).
+  # An unbalanced outdent raises a warning and the level is reset to zero:
+  # Write() and WriteP() hand the level to strrep(), which rejects a negative
+  # `times`, so storing a value below zero would make every later log call fail.
+  LogIndent <- function(n) {
+    newIndent <- serverEnv$logIndent + n
+    if (newIndent < 0) {
+      warning("Negative indent")
+      newIndent <- 0
+    }
+    serverEnv$logIndent <- newIndent
+  }
+
+  # Host system, R and package versions; included in the header of each log
+  systemInfo <- c(
+    paste(
+      "System:", Sys.info()["sysname"], Sys.info()["release"],
+      Sys.info()["version"], "-",
+      .Platform$OS.type, R.version$platform
+    ),
+    paste(
+      "-", R.version$version.string
+    ),
+    paste("- TreeSearch", packageVersion("TreeSearch")),
+    paste("- TreeTools", packageVersion("TreeTools")),
+    paste("- TreeDist", packageVersion("TreeDist")),
+    paste("- ape", packageVersion("ape"))
+  )
+
+  # Advice to the reader of a log; included in the header of each log
+  logCaveats <- c(
+    "Before running, check that the script and any data files are in the",
+    "R working directory, which can be read with getwd() and set with setwd().",
+    "",
+    "Please validate the code before reproducing in a manuscript, reporting",
+    "any errors at https://github.com/ms609/treesearch/issues or by e-mail to",
+    "the package maintainer."
+  )
+
+  # Write the header of the command log: banner, system details and caveats,
+  # followed by the library() and citation() calls
+  BeginLog <- function() {
+    LogComment(c(
+      paste("# # TreeSearch session log:", .DateTime(), "# # #"),
+      "",
+      systemInfo,
+      "",
+      "This log was generated procedurally to facilitate the reproduction of",
+      "results obtained during an interactive Shiny session.",
+      "It is provided without guarantee of completeness or accuracy.",
+      "In particular, code will not be logged when previously computed values",
+      "are retrieved from cache.",
+      "",
+      logCaveats,
+      "",
+      "# # # # #"
+    ))
+
+    LogComment("Load required libraries", 2)
+    LogCode(c(
+      "library(\"TreeTools\", quietly = TRUE)",
+      "library(\"TreeDist\")",
+      "library(\"TreeSearch\")"
+    ))
+
+    LogComment("View recommended citations", 1)
+    LogCode(c(
+      "citation(\"TreeTools\")",
+      "citation(\"TreeDist\")",
+      "citation(\"TreeSearch\")",
+      "citation(\"Rogue\")"
+    ))
+  }
+
+  BeginLogP <- function() {
+    r$plotLog <- NULL
+    LogCommentP(c(
+      paste("# # TreeSearch plot log:", .DateTime(), "# # #"),
+      "",
+      systemInfo,
+      "",
+      "This log was generated procedurally to
facilitate the reproduction of", + "figures obtained during an interactive Shiny session.", + "It is provided without guarantee of completeness or accuracy.", + "In particular, code will not be logged when previously computed values", + "are retrieved from cache.", + "", + logCaveats, + "", + "# # # # #" + )) + LogCommentP("Load required libraries", 2) + LogCodeP(c( + "library(\"TreeTools\", quietly = TRUE)", + "library(\"TreeDist\")", + "library(\"TreeSearch\")" + )) + + LogCommentP("View recommended citations", 1) + LogCodeP(c( + "citation(\"TreeTools\")", + "citation(\"TreeDist\")", + "citation(\"Quartet\")", + "citation(\"TreeSearch\")", + "citation(\"Rogue\")" + )) + + LogCommentP("Check working directory", 1) + LogCodeP("getwd() # Should match location of data / tree files", + "setwd(\".\") # Replace . with desired/directory to change") + + if (HaveData()) { + LogCommentP("Load data from file") + LogCodeP(c( + paste0("dataFile <- ", Enquote(DataFileName(r$dataFiles))), + paste0("dataset <- ", r$readDataFile) + )) + } + + if (AnyTrees()) { + LogCommentP("Load trees from file") + LogCodeP(c( + paste0("treeFile <- ", Enquote(TreeFileName(r$treeFiles))), + "trees <- read.nexus(treeFile)", + if (!identical(r$trees, r$allTrees)) { + paste0( + "trees <- trees[unique(as.integer(seq.int(", + r$treeRange[1], ", ", r$treeRange[2], + ", length.out = ", r$nTree, ")))]" + ) + } + )) + } + } + + PauseLog <- function() { + serverEnv$loggingOn <- FALSE + } + + ResumeLog <- function() { + serverEnv$loggingOn <- TRUE + } + + LogCode <- function(..., WriteFn = Write) { + for (line in list(...)) { + if (!is.null(line)) { + WriteFn(as.character(line), cmdLogFile) + } + } + } + + LogCodeP <- function(...) 
{ + LogCode(..., WriteFn = WriteP) + } + + LogComment <- function(exps, returns = 1, WriteFn = Write) { + if (returns > 0) { + WriteFn(rep("", returns), cmdLogFile) + } + for (exp in exps) { + WriteFn(paste("#", exp), cmdLogFile) + } + } + + LogCommentP <- function (exps, returns = 1) { + LogComment(exps, returns, WriteFn = WriteP) + } + + r$dataFiles <- 0 + r$excelFiles <- 0 + r$treeFiles <- 0 + TwoWide <- function(n) { + formatC(n, width = 2, flag = "0") + } + DataFileName <- function(n) if (length(n)) { + paste0("dataFile-", TwoWide(n), ".txt") + } + ExcelFileName <- function(n) if (length(n)) { + paste0("excelFile-", TwoWide(n), ".xlsx") + } + TreeFileName <- function(n) if (length(n)) { + paste0("treeFile-", TwoWide(n), ".txt") + } + LastFile <- function(type) { + switch(pmatch(type, c("data", "excel", "tree")), + DataFileName(r$dataFiles), + ExcelFileName(r$excelFiles), + TreeFileName(r$treeFiles) + ) + } + CacheInput <- function(type, fileName) { + key <- paste0(type, "Files") + r[[key]] <- r[[key]] + 1 + file.copy(fileName, paste0(tempdir(), "/", LastFile(type)), + overwrite = TRUE) + } + StashTrees <- function(trees) { + key <- paste0("treeFiles") + r[[key]] <- r[[key]] + 1 + write.nexus(trees, file = paste0(tempdir(), "/", LastFile("tree"))) + } + + BeginLog() diff --git a/inst/Parsimony/server/mod_clustering.R b/inst/Parsimony/server/mod_clustering.R new file mode 100644 index 000000000..9d8c405b4 --- /dev/null +++ b/inst/Parsimony/server/mod_clustering.R @@ -0,0 +1,294 @@ +# Module: Clustering analysis +# +# Owns inputs: clThresh. Owns distances computation (shared with treespace). +# Reads: r$trees, r$treeHash. +# Receives top-level distMeth as reactive arg. +# +# Returns a list of reactives: +# distances, LogDistances, silThreshold, clusterings, LogClusterings + +clustering_ui <- function(id) { + ns <- NS(id) + sliderInput(ns("clThresh"), "Cluster threshold:", value = 0.5, + min = 0, max = 1, width = 200) +} + +#' @param id Module namespace id. 
+#' @param r AppState reactiveValues. +#' @param distMeth Reactive wrapping top-level \code{input$distMeth}. +#' @param log_fns Named list of logging functions from logging.R: +#' LogMsg, LogCommentP, LogCodeP, LogIndent, BeginLogP, LogExprP. +clustering_server <- function(id, r, distMeth, log_fns) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + + # Unpack logging functions + LogMsg <- log_fns$LogMsg + LogCommentP <- log_fns$LogCommentP + LogCodeP <- log_fns$LogCodeP + LogIndent <- log_fns$LogIndent + BeginLogP <- log_fns$BeginLogP + LogExprP <- log_fns$LogExprP + + ############################################################################ + # Silhouette threshold (debounced clThresh input) + ############################################################################ + + silThreshold <- debounce(reactive({ + input$clThresh + }), 50) + + ############################################################################ + # Tree distances (moved from treespace module) + ############################################################################ + + Quartet <- function(...) 
{ + if (!requireNamespace("Quartet", quietly = TRUE)) { + Notification("Installing required package \"Quartet\"", + type = "warning", duration = 20) + install.packages("Quartet") + } + as.dist(Quartet::QuartetDivergence( + Quartet::ManyToManyQuartetAgreement(...), similarity = FALSE)) + } + + distances <- bindCache(reactive({ + LogMsg("distances(): ", distMeth()) + if (length(r$trees) > 1L) { + Dist <- switch(distMeth(), + "cid" = TreeDist::ClusteringInfoDistance, + "pid" = TreeDist::PhylogeneticInfoDistance, + "msid" = TreeDist::MatchingSplitInfoDistance, + "rf" = TreeDist::RobinsonFoulds, + "qd" = Quartet) + withProgress( + message = "Initializing distances...", value = 0.99, + Dist(r$trees) + ) + } else { + matrix(0, 0, 0) + } + }), distMeth(), r$treeHash) + + LogDistances <- function() { + LogCommentP("Compute tree distances") + LogCodeP(switch( + distMeth(), + "cid" = "dists <- TreeDist::ClusteringInfoDistance(trees)", + "pid" = "dists <- TreeDist::PhylogeneticInfoDistance(trees)", + "msid" = "dists <- TreeDist::MatchingSplitInfoDistance(trees)", + "rf" = "dists <- TreeDist::RobinsonFoulds(trees)", + "qd" = c("dists <- as.dist(Quartet::QuartetDivergence(", + " Quartet::ManyToManyQuartetAgreement(trees),", + " similarity = FALSE)", ")") + )) + } + + ############################################################################ + # Clusterings + ############################################################################ + + clusterings <- bindCache(reactive({ + ## CAUTION: Update LogClusterings() to reflect any changes made + ## to this function + LogMsg("clusterings()") + maxCluster <- min(15L, length(r$trees) - 1L) + if (maxCluster > 1L) { + possibleClusters <- 2:maxCluster + + hSil <- pamSil <- -99 + dists <- distances() + + nMethodsChecked <- 3L + cli::cli_progress_bar("Computing clusterings", "K-means", + total = nMethodsChecked) + + nK <- length(possibleClusters) + + kClusters <- lapply(possibleClusters, + function(k) TreeDist::KMeansPP(dists, k)) + kSils 
<- vapply(kClusters, function(kCluster) { + mean(cluster::silhouette(kCluster$cluster, dists)[, 3]) + }, double(1)) + bestK <- which.max(kSils) + kSil <- kSils[bestK] + kCluster <- kClusters[[bestK]]$cluster + + cli::cli_progress_update(1, status = "PAM") + pamClusters <- lapply(possibleClusters, function(k) { + cluster::pam(dists, k = k) + }) + pamSils <- vapply(pamClusters, function(pamCluster) { + mean(cluster::silhouette(pamCluster)[, 3]) + }, double(1)) + bestPam <- which.max(pamSils) + pamSil <- pamSils[bestPam] + pamCluster <- pamClusters[[bestPam]]$cluster + + cli::cli_progress_update(1, status = "Hierarchical") + hTree <- protoclust::protoclust(dists) + hClusters <- lapply(possibleClusters, function(k) cutree(hTree, k = k)) + hSils <- vapply(hClusters, function(hCluster) { + mean(cluster::silhouette(hCluster, dists)[, 3]) + }, double(1)) + bestH <- which.max(hSils) + hSil <- hSils[bestH] + hCluster <- hClusters[[bestH]] + cli::cli_progress_update(1, status = "Done") + + bestCluster <- c("none", "pam", "hmm", "kmn")[ + which.max(c(silThreshold(), pamSil, hSil, kSil))] + } else { + bestCluster <- "none" + } + + LogMsg("Best clustering: ", bestCluster, + "; sil: ", signif(switch(bestCluster, + pam = pamSil, hmm = hSil, + kmn = kSil, 0))) + # Return: + list(method = switch(bestCluster, + pam = "part. 
around medoids", + hmm = "minimax linkage", + kmn = "k-means", + none = "no significant clustering"), + n = 1 + switch(bestCluster, pam = bestPam, hmm = bestH, + kmn = bestK, 0), + sil = switch(bestCluster, pam = pamSil, hmm = hSil, + kmn = kSil, 0), + cluster = switch(bestCluster, pam = pamCluster, hmm = hCluster, + kmn = kCluster, 1) + ) + + }), r$treeHash, silThreshold(), distMeth()) + + ############################################################################ + # LogClusterings + ############################################################################ + + LogClusterings <- function() { + maxCluster <- min(15L, length(r$trees) - 1L) + if (maxCluster > 1L) { + possibleClusters <- paste(2, maxCluster, sep = ":") + + hSil <- pamSil <- -99 + LogDistances() + dists <- distances() + + LogCommentP("Compute clusters of trees", 2) + nK <- length(possibleClusters) + LogCommentP("Try K-means++ clustering (Arthur & Vassilvitskii 2007):") + LogCodeP( + paste0( + "kClusters <- lapply(", possibleClusters, ", ", + "function (k) KMeansPP(dists, k)", ")" + ), + "kSils <- vapply(kClusters, function (kCluster) {", + " mean(cluster::silhouette(kCluster$cluster, dists)[, 3])", + "}, double(1))", + "bestK <- which.max(kSils)", + "kSil <- kSils[bestK] # Best silhouette coefficient", + "kCluster <- kClusters[[bestK]]$cluster # Best solution" + ) + + LogCommentP("Try partitioning around medoids (Maechler et al. 
2019):") + LogCodeP( + paste0( + "pamClusters <- lapply(", possibleClusters, ", ", + "function (k) cluster::pam(dists, k = k)", ")" + ), + "pamSils <- vapply(pamClusters, function (pamCluster) {", + " mean(cluster::silhouette(pamCluster)[, 3])", + "}, double(1))", + "bestPam <- which.max(pamSils)", + "pamSil <- pamSils[bestPam] # Best silhouette coefficient", + "pamCluster <- pamClusters[[bestPam]]$cluster # Best solution" + ) + + + LogCommentP( + paste("Try hierarchical clustering with minimax linkage", + "(Bien & Tibshirani 2011):") + ) + LogCodeP( + "hTree <- protoclust::protoclust(dists)", + paste0( + "hClusters <- lapply(", possibleClusters, ", ", + "function (k) cutree(hTree, k = k)", ")" + ), + "hSils <- vapply(hClusters, function (hCluster) {", + " mean(cluster::silhouette(hCluster, dists)[, 3])", + "}, double(1))", + "bestH <- which.max(hSils)", + "hSil <- hSils[bestH] # Best silhouette coefficient", + "hCluster <- hClusters[[bestH]] # Best solution" + ) + + LogCommentP("Set threshold for recognizing meaningful clustering") + LogCommentP( + "no support < 0.25 < weak < 0.5 < good < 0.7 < strong", 0) + LogCodeP(paste0("threshold <- ", silThreshold())) + + LogCommentP("Compare silhouette coefficients of each method") + LogCodeP( + "bestMethodId <- which.max(c(threshold, pamSil, hSil, kSil))", + "bestCluster <- c(\"none\", \"pam\", \"hmm\", \"kmn\")[bestMethodId]" + ) + if (clusterings()$n == 1) { + LogCommentP("No significant clustering was found.") + LogCodeP("clustering <- 1 # Assign all trees to single cluster") + } else { + LogCommentP(paste0("Best clustering was ", + clusterings()$method, ":")) + LogCommentP(paste0("Silhouette coefficient = ", + signif(clusterings()$sil)), 0) + LogCommentP(paste0("Store the cluster to which each tree is ", + "optimally assigned:")) + LogCodeP(paste0( + "clustering <- switch(bestCluster, pam = pamCluster,", + " hmm = hCluster, kmn = kCluster, 1)"), + paste0("nClusters <- length(unique(clustering))"), + paste0( + 
"clusterCol <- ", + EnC(palettes[[min(length(palettes), clusterings()$n)]]), + " # Arbitrarily" + ) + ) + } + } else { + LogCommentP("Not enough trees for clustering analysis") + LogCodeP("bestCluster <- \"none\"") + LogCodeP("nClusters <- 1") + } + } + + ############################################################################ + # clThresh label CSS class (color-codes threshold strength) + ############################################################################ + + observeEvent(input$clThresh, { + classes <- c("meaningless", "weak", "good", "strong") + liveClass <- classes[as.integer(cut( + input$clThresh, c(0, 0.25, 0.5, 0.7, 1), + include.lowest = TRUE, right = FALSE + ))] + labelId <- ns("clThresh-label") + runjs(paste0( + "$('#", labelId, "').removeClass('", paste(classes, collapse = " "), + "').addClass('", liveClass, "');" + )) + }) + + ############################################################################ + # Return reactives for server.R and other modules + ############################################################################ + + list( + distances = distances, + LogDistances = LogDistances, + silThreshold = silThreshold, + clusterings = clusterings, + LogClusterings = LogClusterings + ) + }) +} diff --git a/inst/Parsimony/server/mod_consensus.R b/inst/Parsimony/server/mod_consensus.R new file mode 100644 index 000000000..a31179cb6 --- /dev/null +++ b/inst/Parsimony/server/mod_consensus.R @@ -0,0 +1,1439 @@ +# Module: Consensus & Main Plot +# +# Absorbs consensus.R + residual clustering.R + consensus-related bindings +# from events.R. Owns the main plot dispatch, consensus tree plotting, +# character mapping, stability / rogue analysis, concordance, cluster +# consensus plotting, plot code logging, and associated UI updates. +# +# Owns inputs: consP, keepNTips, neverDrop, outgroup, concordance, +# plottedChar, searchChar, mapDisplay, whichTree, excludedTip. +# +# Owns outputs: treePlot, charMapLegend, charNotes, branchLegend. 
+# +# Reactive args: +# r AppState reactiveValues +# AnyTrees reactive logical (data module) +# HaveData reactive logical (data module) +# tipLabels reactive character (data module) +# nChars reactive integer (data module) +# TaxonOrder reactive character (data module) +# concavity reactive (search module) +# clusterings reactive list (clustering module) +# silThreshold reactive numeric (clustering module) +# LogClusterings function (clustering module) +# TreespacePlot function (treespace module) +# LogTreespacePlot function (treespace module) +# dims reactive integer (treespace module) +# nProjDim reactive integer (treespace module) +# TreeCols reactive character (treespace module) +# treePch reactive (treespace module) +# ts_spaceCol reactive character (treespace module) +# ts_mapLines reactive character (treespace module) +# ts_spacePch reactive character (treespace module) +# ts_relators reactive character (treespace module) +# plotFormat reactive character (top-level input) +# plotSize reactive integer (top-level input) +# distMeth reactive character (top-level input) +# log_fns named list of logging functions +# +# Returns: +# MainPlot, RCode, UpdateKeepNTipsRange, +# UpdateDroppedTaxaDisplay, UpdateOutgroupInput + +# --------------------------------------------------------------------------- +# UI — returns named list for scattered placement in ui.R +# --------------------------------------------------------------------------- +consensus_ui <- function(id) { + ns <- NS(id) + list( + tree_plot = plotOutput(ns("treePlot"), height = "600px"), + + which_tree = tagList( + sliderInput(ns("whichTree"), "Tree to plot", value = 0L, + min = 0L, max = 1L, step = 1L), + htmlOutput(ns("clusterLabel"), inline = TRUE) + ), + + tree_plot_config = tagList( + selectizeInput(ns("outgroup"), "Root on:", multiple = TRUE, + choices = list()), + selectizeInput( + ns("concordance"), "Split support:", + choices = list( + "None" = "none", + "% trees containing" = "p", + "Quartet 
concordance" = "qc", + "Clustering concordance" = "clc", + "Phylogenetic concordance" = "phc", + "Mutual Clustering conc." = "mcc", + "Shared Phylog. conc." = "spc" + )) + ), + + char_chooser = tagList( + tags$div( + numericInput(ns("plottedChar"), "Character to map:", value = 1L, + min = 0L, max = 1L, step = 1L, width = 200), + selectizeInput(ns("searchChar"), "Search characters:", + multiple = FALSE, choices = list()), + checkboxGroupInput(ns("mapDisplay"), "", list( + "Align tips" = "tipsRight", + "Infer tips" = "updateTips" + )), + style = "float: right; width: 200px; margin-left: 2em;" + ), + htmlOutput(ns("charMapLegend")), + htmlOutput(ns("charNotes")) + ), + + cons_config = tagList( + tags$div(style = "float: right; width: 200px; margin-left: 2em;", + sliderInput(ns("consP"), "Majority:", value = 1, + min = 0.5, max = 1, width = 200), + numericInput(ns("keepNTips"), "Tips to show:", value = 0L, + min = 3L, max = 2L, step = 1L, width = 200), + selectizeInput(ns("neverDrop"), "Never drop:", multiple = TRUE, + choices = c()) + ), + tags$div(id = "consLegend", + tags$span(id = "instabLegend", + tagList( + tags$span(class = "legendLeft", "Stable"), + tags$span(class = "infernoScale legendBar", "\ua0"), + tags$span(class = "legendRight", "Unstable") + ) + ), + # Wrapper keeps top-level id for ShowConfigs show/hide + tags$span(id = "branchLegend", + htmlOutput(ns("branchLegend"), inline = TRUE) + ) + ), + tags$div(id = "droppedTips", + selectInput(ns("excludedTip"), "Show excluded tip", choices = list()) + ), + tags$div(id = "droppedList", style = "float: left;") + ) + ) +} + +# --------------------------------------------------------------------------- +# Server +# --------------------------------------------------------------------------- +consensus_server <- function(id, r, + AnyTrees, HaveData, tipLabels, nChars, TaxonOrder, + concavity, + clusterings, silThreshold, LogClusterings, + TreespacePlot, LogTreespacePlot, + dims, nProjDim, TreeCols, treePch, + 
ts_spaceCol, ts_mapLines, ts_spacePch, ts_relators,
+                             plotFormat, plotSize, distMeth,
+                             log_fns) {
+  moduleServer(id, function(input, output, session) {
+    ns <- session$ns
+
+    # Unpack logging
+    LogMsg <- log_fns$LogMsg
+    LogComment <- log_fns$LogComment
+    LogCode <- log_fns$LogCode
+    LogCommentP <- log_fns$LogCommentP
+    LogCodeP <- log_fns$LogCodeP
+    LogIndent <- log_fns$LogIndent
+    BeginLogP <- log_fns$BeginLogP
+    LogExprP <- log_fns$LogExprP
+
+    ############################################################################
+    # Cross-module shinyjs helpers (target top-level DOM ids)
+    ############################################################################
+
+    # Show the top-level element `id`, clearing the 'shinyjs-hide' class
+    parentShow <- function(id) {
+      runjs(paste0("$('#", id, "').removeClass('shinyjs-hide').show()"))
+    }
+    # Hide the top-level element `id`
+    parentHide <- function(id) {
+      runjs(paste0("$('#", id, "').hide()"))
+    }
+    # Replace the inner HTML of the top-level element `id` with `html`.
+    # `html` is spliced into a single-quoted JavaScript string literal, so
+    # backslashes are doubled first (a trailing backslash would otherwise
+    # swallow the closing quote), then single quotes are escaped, and line
+    # breaks are written as escape sequences, because a raw line break is not
+    # permitted inside a quoted JavaScript string.
+    parentHtml <- function(id, html) {
+      escaped <- gsub("\\", "\\\\", html, fixed = TRUE)
+      escaped <- gsub("'", "\\'", escaped, fixed = TRUE)
+      escaped <- gsub("\r", "\\r", escaped, fixed = TRUE)
+      escaped <- gsub("\n", "\\n", escaped, fixed = TRUE)
+      runjs(paste0("$('#", id, "').html('", escaped, "')"))
+    }
+
+    ############################################################################
+    # Core helpers
+    ############################################################################
+
+    # Root `tree` on those selected outgroup taxa that it contains;
+    # return it unchanged if it contains none
+    UserRoot <- function(tree) {
+      outgroupTips <- intersect(r$outgroup, tree$tip.label)
+      if (length(outgroupTips)) {
+        RootTree(tree, outgroupTips)
+      } else {
+        tree
+      }
+    }
+
+    # Log the rooting of the object named `tree`, ignoring `dropped` taxa
+    LogUserRoot <- function(tree = "cons", dropped = character(0)) {
+      outgroupTips <- setdiff(r$outgroup, dropped)
+      if (length(outgroupTips)) {
+        LogCommentP("Root tree")
+        LogCodeP(paste0(tree, " <- RootTree(", tree, ", ",
+                        EnC(outgroupTips), ")"))
+      }
+    }
+
+    unitEdge <- reactive(TRUE)
+
+    # Apply SortTree() when r$sortTrees is set, or when `force`d
+    SortEdges <- function(tr, force = FALSE) {
+      if (force || r$sortTrees) {
+        SortTree(tr, order = TaxonOrder())
+      } else {
+        tr
+      }
+    }
+    LogSortEdges <- function(tr) (
+      if (r$sortTrees) {
+        LogCommentP("Rotate nodes, to display clades in order of size", 0)
+        LogCodeP(paste0(
+          tr, " <- SortTree(", tr, ", order = ",
+          if (HaveData()) {
+            "names(dataset)"
+          } else {
+            "trees[[1]]$tip.label"
+          },
+          ")"
+        ))
+      }
+    )
+
+    LogPar <- function() {
+      LogCommentP("Set up plotting area")
+      LogCodeP(c(
+        "par(",
+        " mar = c(0, 0, 0, 0), # Zero margins",
+        " cex = 0.9 # Smaller font size",
+        ")"
+      ))
+    }
+
+    # Capitalize the first character of `str`
+    UCFirst <- function(str) {
+      paste0(toupper(substr(str, 1, 1)),
+             substr(str, 2, nchar(str)))
+    }
+
+    # Number of tips in the first active tree; 0 if there are no trees
+    TipsInTree <- reactive({
+      if (AnyTrees()) {
+        length(r$trees[[1]]$tip.label)
+      } else {
+        0L
+      }
+    })
+
+    ############################################################################
+    # Debounced input reactives
+    ############################################################################
+
+    # Index of the character to map, capped at the number of characters
+    PlottedChar <- debounce(reactive({
+      typed <- max(0L, as.integer(input$plottedChar), na.rm = TRUE)
+      if (nChars() > 0 && typed > nChars()) {
+        Notification(type = "warning",
+                     paste("Dataset contains", nChars(), "characters."))
+        updateNumericInput(session, "plottedChar", value = nChars())
+      }
+      min(typed, nChars())
+    }), aJiffy)
+
+    whichTree <- debounce(reactive(input$whichTree), aJiffy)
+
+    # Coloured label naming the cluster of the selected tree; empty when the
+    # consensus is shown (whichTree < 1) or fewer than two clusters exist
+    output$clusterLabel <- renderUI({
+      wt <- whichTree()
+      if (is.null(wt) || wt < 1L) return(NULL)
+      cl <- clusterings()
+      if (cl$n < 2L) return(NULL)
+      clId <- cl$cluster[wt]
+      col <- palettes[[min(length(palettes), cl$n)]][clId]
+      tags$span(
+        paste0("Cluster ", clId),
+        style = paste0("color:", col, ";font-weight:bold;margin-left:4px;")
+      )
+    })
+
+    consP <- debounce(reactive(signif(input$consP)), 50)
+
+    ############################################################################
+    # Stability / rogue analysis
+    ############################################################################
+
+    Instab <- reactive({
+      TipInstability(r$trees)
+    })
+
+    stableCol <- reactive({
+      Rogue::ColByStability(r$trees)
+    })
+
+    # Rogue taxon analysis, cached by tree set, `neverDrop` and consensus p.
+    # The equivalent code is written to the command log whenever the analysis
+    # is actually run.
+    Rogues <- bindCache(reactive({
+      if (AnyTrees() && inherits(r$trees, "multiPhylo")) {
+        LogComment("Check for rogue taxa", 2)
+        LogComment(paste0(
+          "Use RogueTaxa() in place of QuickRogue() for a more complete ",
+          "analysis"))
+        LogCode(c(
+          "rogues <- Rogue::QuickRogue(",
+          " trees,",
+          if (length(input$neverDrop)) paste0(
+            " neverDrop = ", EnC(input$neverDrop), ","
+          ),
+          " fullSeq = TRUE,",
+          paste0(" p = ", Enquote(consP())),
+          ")",
+          "print(rogues) # Detailed results of rogue analysis",
+          "print(rogues$taxon[-1]) # Sequence of taxa to drop"
+        ))
+        withProgress(
+          message = "Identifying rogues", value = 0.99,
+          rogues <- Rogue::QuickRogue(r$trees, neverDrop = input$neverDrop,
+                                      fullSeq = TRUE, p = consP())
+        )
+        rogues[!rogues$taxon %in% input$neverDrop, ]
+      } else {
+        data.frame(num = 0, taxNum = NA_integer_, taxon = NA_character_,
+                   rawImprovement = NA_real_, IC = 0)
+      }
+    }), r$treeHash, input$neverDrop, consP())
+
+    # Taxa in the order listed by Rogues(), omitting its first row
+    dropSeq <- reactive({
+      LogMsg("dropSeq()")
+      Rogues()$taxon[-1]
+    })
+
+    nNonRogues <- reactive({
+      LogMsg("nNonRogues()")
+      on.exit(LogMsg("nNonRogues: ", nrow(Rogues()) - which.max(Rogues()$IC)))
+      nrow(Rogues()) - which.max(Rogues()$IC)
+    })
+
+    TipCols <- reactive(stableCol())
+
+    TipColLegend <- function() {
+      PlotTools::SpectrumLegend(
+        "bottomleft", horiz = TRUE, inset = 0.01, bty = "n", xpd = NA,
+        palette = hcl.colors(131, "inferno")[1:101],
+        legend = c("Stable", "Unstable"),
+        title = "Leaf stability",
+        title.font = 2
+      )
+    }
+
+    ############################################################################
+    # Tip subsetting
+    ############################################################################
+
+    # Tips retained in the consensus: every `neverDrop` taxon, topped up from
+    # the end of dropSeq() until r$keepNTips tips are kept.
+    # The target is never below 3, nor below the number of `neverDrop` taxa;
+    # when r$keepNTips is unset (NULL or NA) it defaults to the tip count.
+    KeptTips <- reactive({
+      LogMsg("KeptTips()")
+      n <- r$keepNTips
+      maxN <- length(tipLabels())
+      # Test for NULL first, so that is.na() is never evaluated on NULL
+      if (is.null(n) || is.na(n)) {
+        n <- maxN
+      }
+      if (n < 3L) {
+        n <- 3L
+      }
+      nNeverDrop <- length(input$neverDrop)
+      if (n < nNeverDrop) {
+        n <- nNeverDrop
+      }
+      nFromDropSeq <- n - nNeverDrop
+      if (nFromDropSeq > length(dropSeq())) {
+        c(input$neverDrop, dropSeq())
+      } else {
+        c(input$neverDrop, rev(dropSeq())[seq_len(nFromDropSeq)])
+      }
+    })
+
+    DroppedTips <- reactive({
+      LogMsg("DroppedTips()")
+      if (length(KeptTips()) > 1) {
+
setdiff(tipLabels(), KeptTips()) + } else { + character(0) + } + }) + + ############################################################################ + # Concordance + ############################################################################ + + concordance <- bindCache(reactive({ + LogMsg("concordance()") + switch(input$concordance, + "p" = SplitFrequency(r$plottedTree, r$trees) / length(r$trees), + "qc" = QuartetConcordance(r$plottedTree, r$dataset), + "mcc" = MutualClusteringConcordance(r$plottedTree, r$dataset), + "spc" = SharedPhylogeneticConcordance(r$plottedTree, r$dataset), + "clc" = ClusteringConcordance(r$plottedTree, r$dataset), + "phc" = PhylogeneticConcordance(r$plottedTree, r$dataset), + NULL + ) + }), r$plottedTree, r$treeHash, r$dataHash, input$concordance) + + LabelConcordance <- \() { + LogMsg("LabelConcordance()") + if (input$concordance != "none" && + inherits(r$plottedTree, "phylo")) { + LabelSplits(r$plottedTree, signif(concordance(), 3), + col = SupportColor(concordance()), + frame = "none", pos = 3L) + } + } + + LogConcordance <- function(plottedTree = "plottedTree") { + if (input$concordance != "none") { + LogCommentP("Calculate split concordance", 1) + concCode <- switch( + input$concordance, + "p" = paste0("SplitFrequency(", plottedTree, + ", trees) / length(trees)"), + "qc" = paste0("QuartetConcordance(", plottedTree, ", dataset)"), + "clc" = paste0("ClusteringConcordance(", plottedTree, ", dataset)"), + "phc" = paste0("PhylogeneticConcordance(", plottedTree, ", dataset)"), + "mcc" = paste0("MutualClusteringConcordance(", plottedTree, + ", dataset)"), + "spc" = paste0("SharedPhylogeneticConcordance(", plottedTree, + ", dataset)"), + NULL + ) + LogCodeP(paste0("concordance <- ", concCode)) + LogCommentP("Annotate splits by concordance", 1) + LogCodeP("LabelSplits(", + paste0(" tree = ", plottedTree, ","), + " labels = signif(concordance, 3),", + " col = SupportColor(concordance),", + " frame = \"none\",", + " pos = 3", + ")") + } + } + 
+    ############################################################################
+    # Tree plotting
+    ############################################################################
+
+    # Tree shown in the character-wise view: the selected tree, or the strict
+    # consensus (p = 1) when whichTree() is 0; rooted and sorted.  Edge lengths
+    # are set to a uniform 2 unless "tipsRight" is selected in input$mapDisplay.
+    # Evaluates to NULL when no trees are loaded.
+    PlottedTree <- reactive({
+      if (length(r$trees) > 0L) {
+        plottedTree <- if (whichTree() > 0) {
+          r$trees[[whichTree()]]
+        } else {
+          Consensus(r$trees, p = 1)
+        }
+        plottedTree <- UserRoot(plottedTree)
+        plottedTree <- SortEdges(plottedTree)
+        if (!("tipsRight" %in% input$mapDisplay)) {
+          plottedTree$edge.length <-
+            rep_len(2, dim(plottedTree[["edge"]])[[1]])
+        }
+        plottedTree
+      }
+    })
+
+    # Log the R code corresponding to PlottedTree()
+    LogPlottedTree <- function() {
+      if (whichTree() > 0) {
+        LogCodeP(paste0("plottedTree <- trees[[", whichTree(), "]]"))
+      } else {
+        LogCodeP("plottedTree <- Consensus(trees, p = 1)")
+      }
+      LogUserRoot("plottedTree")
+      if (!("tipsRight" %in% input$mapDisplay)) {
+        LogCommentP("Set uniform edge length", 0)
+        LogCodeP(
+          "plottedTree$edge.length <- rep.int(2, nrow(plottedTree$edge))"
+        )
+      }
+      LogSortEdges("plottedTree")
+    }
+
+    ############################################################################
+    # Consensus plot
+    ############################################################################
+
+    # Draw the consensus tree.  When a dropped tip is selected in
+    # input$excludedTip, its positions are displayed with RoguePlot();
+    # otherwise a consensus without the dropped tips is plotted.
+    # Stores the plotted tree in r$plottedTree and labels its splits.
+    ConsensusPlot <- function() {
+      LogMsg("ConsensusPlot()")
+      on.exit(LogMsg("/ConsensusPlot()"))
+
+      par(mar = rep(0, 4), cex = 0.9)
+      kept <- KeptTips()
+      dropped <- DroppedTips()
+
+      if (length(dropped) &&
+          length(input$excludedTip) &&
+          nchar(input$excludedTip) &&
+          input$excludedTip %in% tipLabels()) {
+
+        # Dropped tips other than the one whose position is being shown
+        alsoDropped <- setdiff(dropped, input$excludedTip)
+        if (length(alsoDropped)) {
+          consTrees <- lapply(r$trees, DropTip, alsoDropped)
+        } else {
+          consTrees <- r$trees
+        }
+
+        plotted <- TreeTools::RoguePlot(
+          consTrees,
+          input$excludedTip,
+          p = consP(),
+          edgeLength = 1,
+          outgroupTips = r$outgroup,
+          tip.color = TipCols()[intersect(consTrees[[1]]$tip.label, kept)]
+        )
+        r$plottedTree <- plotted$cons
+
+        LabelConcordance()
+      } else {
+        without <- intersect(dropped, tipLabels())
+        cons <- ConsensusWithout(r$trees, without, p = consP())
+        cons <- UserRoot(cons)
+
+        if (unitEdge()) {
+          cons$edge.length <- rep.int(1, dim(cons$edge)[1])
+        }
+        cons <- SortEdges(cons)
+
+        r$plottedTree <- cons
+        plot(r$plottedTree,
+             tip.color = TipCols()[intersect(cons$tip.label, kept)])
+        LabelConcordance()
+      }
+    }
+
+    # Log the R code corresponding to ConsensusPlot()
+    LogConsensusPlot <- function() {
+      BeginLogP()
+      LogPar()
+      dropped <- DroppedTips()
+
+      if (length(dropped) &&
+          length(input$excludedTip) &&
+          nchar(input$excludedTip) &&
+          input$excludedTip %in% tipLabels()) {
+
+        LogCommentP("Prepare reduced consensus tree", 1)
+        if (length(setdiff(dropped, input$excludedTip))) {
+          LogCodeP(paste0("exclude <- ",
+                          EnC(setdiff(dropped, input$excludedTip))))
+          LogCodeP("consTrees <- lapply(trees, DropTip, exclude)")
+          LogCodeP("labels <- setdiff(consTrees[[1]]$tip.label, exclude)")
+        } else {
+          LogCodeP("consTrees <- trees",
+                   "labels <- consTrees[[1]]$tip.label")
+        }
+
+        LogCommentP(paste0(
+          "Colour tip labels according to their original 'instability' ",
+          "(Smith 2022)")
+        )
+        LogCodeP(
+          "tipCols <- Rogue::ColByStability(trees)",
+          paste0(
+            "tipCols <- tipCols[setdiff(labels, ",
+            Enquote(input$excludedTip), ")]"
+          )
+        )
+        LogCommentP(paste0(
+          "Plot the reduced consensus tree, showing position of ",
+          gsub("_", " ", input$excludedTip, fixed = TRUE))
+        )
+        LogCodeP("plotted <- RoguePlot(",
+                 " trees = consTrees,",
+                 paste0(" tip = ", Enquote(input$excludedTip), ","),
+                 paste0(" p = ", consP(), ","),
+                 " edgeLength = 1,",
+                 if(length(r$outgroup)) {
+                   paste0(" outgroupTips = ", EnC(r$outgroup), ",")
+                 },
+                 " tip.color = tipCols",
+                 ")")
+
+        LogCommentP("Store tree to plot concordance")
+        LogCodeP("plottedTree <- plotted$cons")
+
+        LogConcordance()
+      } else {
+        without <- intersect(dropped, tipLabels())
+        LogCommentP("Calculate consensus tree")
+        if (length(without)) {
+          LogCodeP(
+            "cons <- ConsensusWithout(",
+            " trees,",
+            paste0(" ", EnC(without), ","),
+            paste0(" p = ", consP()),
+            ")")
+        } else {
+          LogCodeP(paste0(
+            "cons <- Consensus(trees, p = ", consP(), ")"
+          ))
+        }
+        LogUserRoot(dropped = without)
+        if (unitEdge()) {
+          LogCodeP("cons$edge.length <- rep.int(1L, nrow(cons$edge))")
+        }
+        LogSortEdges("cons")
+        LogCommentP("Plot consensus tree")
+        LogCodeP(
+          "tipCols <- Rogue::ColByStability(trees)[cons$tip.label]",
+          "plot(cons, tip.color = tipCols)")
+        LogConcordance("cons")
+      }
+    }
+
+    ############################################################################
+    # Character-wise plot
+    ############################################################################
+
+    # Result of LengthAdded() for the plotted character, used to colour tips
+    # in CharacterwisePlot(); NULL when tree and dataset taxa differ.
+    PolEscVal <- reactive({
+      tl <- tipLabels()
+      dl <- names(r$dataset)
+      # Skip if taxa don't match exactly: tipLabels() may include taxa absent
+      # from the dataset (e.g. trees loaded from a superset dataset), causing
+      # a matrix-dimension mismatch inside LengthAdded / TreeLength.
+      if (!setequal(tl, dl)) return(NULL)
+      LengthAdded(r$trees,
+                  r$dataset[tl, PlottedChar()],
+                  concavity())
+    })
+
+    # Map the selected character onto the plotted tree(s), colouring tips by
+    # PolEscVal(); with no character selected, plot the tree alone.
+    # On failure an error plot and notification are shown instead.
+    CharacterwisePlot <- function() {
+      par(mar = rep(0, 4), cex = 0.9)
+      n <- PlottedChar()
+      if (whichTree() > 0) {
+        LogMsg("Plotting PlottedTree(", whichTree(), ", ", n, ")")
+      }
+      r$plottedTree <- PlottedTree()
+      if (length(n) && n > 0L) {
+        plotted <- tryCatch({
+          extraLen <- PolEscVal()
+          # PolEscVal() is NULL when taxa don't match; max(NULL) would warn
+          # and return -Inf, so treat "no values" as "no impact"
+          maxExtra <- if (length(extraLen)) max(extraLen) else 0
+          roguishness <- if (maxExtra == 0) {
+            "black"
+          } else {
+            hcl.colors(256, "inferno")[
+              (192 * extraLen[r$plottedTree$tip.label] / maxExtra) + 1
+            ]
+          }
+          PlotCharacter(
+            if (whichTree() > 0) {
+              MakeTreeBinary(r$plottedTree)
+            } else {
+              lapply(r$trees, function(t) MakeTreeBinary(UserRoot(t)))
+            },
+            r$dataset,
+            n,
+            edge.width = 2.5,
+            updateTips = "updateTips" %in% input$mapDisplay,
+            tip.color = roguishness,
+            Display = function(tr) {
+              tr <- UserRoot(tr)
+              if ("tipsRight" %in% input$mapDisplay) {
+                # Cladogram: tips aligned to the right
+                tr$edge.length <- NULL
+              } else {
+                tr$edge.length <- rep.int(1, dim(tr$edge)[[1]])
+              }
+              SortEdges(tr)
+            }
+          )
+          if (maxExtra > 0) {
+            PlotTools::SpectrumLegend(
+              "bottomleft", bty = "n",
+              palette = hcl.colors(256, "inferno")[1:193],
+              title = "Mean tree score\nimpact",
+              title.font = 2,
+              y.intersp = 1.42,
+              legend = c(signif(4:1 * maxExtra / 4, 3), "No impact")
+            )
+          }
+          TRUE
+        },
+        error = function(cond) {
+          # Pass the message text (not the condition object), and as a
+          # substituted value so that braces within it are not interpolated
+          cli::cli_alert_danger("{conditionMessage(cond)}")
+          Notification(type = "error",
+                       "Could not match dataset to taxa in trees")
+          ErrorPlot("Load dataset with\n", "character codings\n",
+                    "for taxa on tree")
+          FALSE
+        }
+        )
+
+        # Only annotate splits when a tree was actually drawn: a return()
+        # inside the handler would leave the handler, not this function
+        if (isTRUE(plotted)) {
+          LabelConcordance()
+        }
+      } else {
+        plot(r$plottedTree, tip.color = TipCols()[r$plottedTree$tip.label])
+        TipColLegend()
+      }
+    }
+
+    # Log the R code corresponding to CharacterwisePlot().
+    # NOTE(review): the logged Display function always sets edge lengths to 2
+    # and omits tip.color, whereas CharacterwisePlot() uses 1 (or NULL with
+    # "tipsRight") and colours tips -- confirm whether this is intended.
+    LogCharacterwisePlot <- function() {
+      BeginLogP()
+      LogPar()
+      n <- PlottedChar()
+      if (whichTree() > 0) {
+        # NOTE(review): other calls in this function use LogCommentP();
+        # confirm that LogComment() is the intended target here
+        LogComment(paste("Select tree", whichTree(), "from tree set"))
+      }
+      LogPlottedTree()
+      if (length(n) && n > 0L) {
+        if (whichTree() > 0) {
+          LogCommentP(paste("Map character", n, "onto tree", whichTree()))
+        } else {
+          LogCommentP(paste("Map character", n, "onto consensus tree"))
+        }
+        LogCodeP(
+          "PlotCharacter(",
+          if (whichTree() > 0) " tree = MakeTreeBinary(plottedTree)," else
+            paste0(" tree = lapply(RootTree(trees, ", EnC(r$outgroup),
+                   "), MakeTreeBinary),"),
+          " dataset = dataset,",
+          paste0(" char = ", n, ","),
+          paste0(" updateTips = ", "updateTips" %in% input$mapDisplay, ","),
+          " Display = function(tr) {",
+          paste0(" tr <- RootTree(tr, ", EnC(r$outgroup), ")"),
+          " tr$edge.length <- rep.int(2, nrow(tr$edge))",
+          " SortTree(tr)",
+          " },",
+          " edge.width = 2.5",
+          ")"
+        )
+        LogConcordance()
+      } else {
+        LogCommentP("Plot single tree")
+        LogCodeP(
+          "tipCols <- Rogue::ColByStability(trees)[plottedTree$tip.label]",
+          "plot(plottedTree, tip.color = tipCols)"
+        )
+      }
+    }
+
+    ############################################################################
+    # Cluster consensus plot (absorbed from clustering.R)
+    ############################################################################
+
+    # Per-edge colors for cluster consensus: unique splits get
the full
+    # cluster color; splits shared by other clusters fade towards grey.
+    #
+    # tree:        consensus tree of the cluster being plotted
+    # cluster_col: base colour of this cluster
+    # all_splits:  list of the splits of every cluster's consensus tree
+    # cluster_idx: index of this cluster within all_splits
+    # Returns one colour per edge of `tree`.
+    ClusterEdgeCols <- function(tree, cluster_col, all_splits, cluster_idx) {
+      n_tip <- Ntip(tree)
+      n_edge <- nrow(tree$edge)
+      edge_col <- rep(cluster_col, n_edge)
+
+      my_splits <- all_splits[[cluster_idx]]
+      n_clusters <- length(all_splits)
+      if (length(my_splits) == 0 || n_clusters < 2) return(edge_col)
+
+      other_idx <- setdiff(seq_len(n_clusters), cluster_idx)
+      split_nodes <- as.integer(names(my_splits))
+
+      # Count, for each split, the other clusters in which it also occurs
+      shared <- integer(length(my_splits))
+      for (j in other_idx) {
+        if (length(all_splits[[j]]) > 0) {
+          shared <- shared + as.integer(my_splits %in% all_splits[[j]])
+        }
+      }
+      uniqueness <- 1 - shared / length(other_idx)
+
+      grey_rgb <- col2rgb("grey70")[, 1]
+      col_rgb <- col2rgb(cluster_col)[, 1]
+      edge_child <- tree$edge[, 2]
+      for (e in seq_len(n_edge)) {
+        child <- edge_child[e]
+        # Only edges leading to internal nodes can correspond to a split
+        if (child > n_tip) {
+          sidx <- match(child, split_nodes)
+          if (!is.na(sidx)) {
+            u <- uniqueness[sidx]
+            # Interpolate between grey (u = 0) and the cluster colour (u = 1)
+            bl <- grey_rgb + (col_rgb - grey_rgb) * u
+            edge_col[e] <- rgb(bl[1], bl[2], bl[3], maxColorValue = 255)
+          }
+        }
+      }
+      edge_col
+    }
+
+    # Plot one consensus tree per cluster when the silhouette coefficient
+    # exceeds silThreshold(); otherwise plot a single consensus of all trees.
+    PlotClusterCons <- function() {
+      LogMsg("PlotClusterCons()")
+      on.exit(LogMsg("/PlotClusterCons()"))
+
+      cl <- clusterings()
+
+      kept <- KeptTips()
+      dropped <- if (length(kept) > 1) {
+        setdiff(TipLabels(r$trees[[1]]), kept)
+      } else {
+        character(0)
+      }
+      par(mar = c(0.2, 0, 0.2, 0), xpd = NA)
+      if (cl$sil > silThreshold()) {
+        nRow <- ceiling(cl$n / 3)
+        r$plottedTree <- vector("list", cl$n)
+        par(mfrow = c(nRow, ceiling(cl$n / nRow)))
+
+        # Phase 1: compute all cluster consensus trees
+        all_cons <- vector("list", cl$n)
+        for (i in seq_len(cl$n)) {
+          cons <- ConsensusWithout(r$trees[cl$cluster == i], dropped,
+                                   p = consP())
+          cons <- UserRoot(cons)
+          if (unitEdge()) {
+            cons$edge.length <- rep.int(1, dim(cons$edge)[1])
+          }
+          all_cons[[i]] <- SortEdges(cons)
+        }
+        all_splits <- lapply(all_cons, as.Splits)
+
+        # Phase 2: plot with uniqueness-based edge coloring
+        for (i in seq_len(cl$n)) {
+          col <- palettes[[min(length(palettes), cl$n)]][i]
+          PutTree(r$trees)
+          PutData(cl$cluster)
+
+          cons <- all_cons[[i]]
+          r$plottedTree[[i]] <- cons
+          edge_col <- ClusterEdgeCols(cons, col, all_splits, i)
+          plot(cons, edge.width = 2, font = 3, cex = 0.83,
+               edge.color = edge_col, tip.color = TipCols()[cons$tip.label])
+          legend("topright", paste0("Cluster ", i), pch = 15, col = col,
+                 pt.cex = 1.5, bty = "n")
+          LabelConcordance()
+        }
+      } else {
+        PutTree(r$trees)
+        cons <- ConsensusWithout(r$trees, dropped, p = consP())
+        cons <- UserRoot(cons)
+        if (unitEdge()) {
+          cons$edge.length <- rep.int(1, dim(cons$edge)[1])
+        }
+        cons <- SortEdges(cons)
+        r$plottedTree <- cons
+        plot(cons, edge.width = 2, font = 3, cex = 0.83,
+             edge.color = palettes[[1]],
+             tip.color = TipCols()[cons$tip.label])
+        LabelConcordance()
+        legend("topright", "No clustering", pch = 16, col = palettes[[1]],
+               bty = "n")
+      }
+    }
+
+    # Log the R code corresponding to PlotClusterCons().
+    # NOTE(review): the logged code refers to `clustering` and `clusterCol`,
+    # presumably defined by the code that LogClusterings() writes -- confirm.
+    LogPlotClusterCons <- function() {
+      # Report this function's own name, so that the debug log distinguishes
+      # logging from plotting
+      LogMsg("LogPlotClusterCons()")
+      on.exit(LogMsg("/LogPlotClusterCons()"))
+
+      BeginLogP()
+
+      cl <- clusterings()
+      LogClusterings()
+
+      kept <- KeptTips()
+      dropped <- if (length(kept) > 1) {
+        setdiff(TipLabels(r$trees[[1]]), kept)
+      } else {
+        character(0)
+      }
+      if (cl$sil > silThreshold()) {
+        nRow <- ceiling(cl$n / 3)
+        LogCommentP("Plot consensus of each tree cluster", 2)
+        LogCodeP(paste0(
+          "par(mfrow = c(", nRow, ", ",
+          ceiling(cl$n / nRow), "))",
+          " # Plotting area layout"
+        ))
+        LogCodeP(
+          paste0(
+            "tipCols <- Rogue::ColByStability(trees)",
+            " # Colour tips by stability"
+          )
+        )
+        LogCommentP("Compute all cluster consensus trees:", 1)
+        LogCodeP(
+          paste0("allCons <- lapply(seq_len(", cl$n, "), function(i) {"),
+          " clusterTrees <- trees[clustering == i]",
+          " cons <- ConsensusWithout(",
+          " trees = clusterTrees,",
+          paste0(" tip = ", EnC(dropped), ","),
+          paste0(" p = ", consP()),
+          " )"
+        )
+        LogUserRoot(dropped = dropped)
+        if (unitEdge()) {
+          # NOTE(review): LogExprP() is used only here; sibling branches use
+          # LogCodeP() -- confirm this is deliberate
+          LogExprP(" cons$edge.length <- rep.int(1, nrow(cons$edge))")
+        }
+        LogCodeP(" TreeTools::SortTree(cons)", "})")
+        LogCommentP(paste0(
+          "Compare splits across clusters to highlight unique edges"
+        ))
+        LogCodeP("allSplits <- lapply(allCons, TreeTools::as.Splits)")
+        LogCommentP("Plot each consensus tree in turn:", 1)
+        LogCodeP(paste0("for (i in seq_len(", cl$n, ")) {"))
+        LogIndent(+2)
+        LogCodeP(
+          "cons <- allCons[[i]]",
+          "nTip <- ape::Ntip(cons)",
+          "mySplits <- allSplits[[i]]",
+          paste0("otherIdx <- setdiff(seq_len(", cl$n, "), i)"),
+          "shared <- integer(length(mySplits))",
+          "for (j in otherIdx) {",
+          " if (length(allSplits[[j]]) > 0)",
+          " shared <- shared + (mySplits %in% allSplits[[j]])",
+          "}",
+          "uniqueness <- 1 - shared / length(otherIdx)",
+          "greyRgb <- col2rgb(\"grey70\")[, 1]",
+          "colRgb <- col2rgb(clusterCol[i])[, 1]",
+          "edgeCol <- rep(clusterCol[i], nrow(cons$edge))",
+          "splitNodes <- as.integer(names(mySplits))",
+          "for (e in seq_len(nrow(cons$edge))) {",
+          " child <- cons$edge[e, 2]",
+          " if (child > nTip) {",
+          " si <- match(child, splitNodes)",
+          " if (!is.na(si)) {",
+          " bl <- greyRgb + (colRgb - greyRgb) * uniqueness[si]",
+          " edgeCol[e] <- rgb(bl[1], bl[2], bl[3], maxColorValue = 255)",
+          " }",
+          " }",
+          "}"
+        )
+        LogCodeP("plot(",
+                 " cons,",
+                 " edge.width = 2,",
+                 " font = 3,",
+                 " cex = 0.83,",
+                 " edge.color = edgeCol,",
+                 " tip.color = tipCols[cons$tip.label]",
+                 ")")
+        # Legend position matches the one drawn by PlotClusterCons()
+        LogCodeP("legend(",
+                 " \"topright\",",
+                 " paste(\"Cluster\", i),",
+                 " pch = 15,",
+                 " pt.cex = 1.5,",
+                 " col = clusterCol[i],",
+                 " bty = \"n\"",
+                 ")")
+        LogConcordance("cons")
+        LogIndent(-2)
+        LogCodeP("}")
+      } else {
+        LogCommentP("No clustering structure: Plot consensus tree")
+        LogCodeP(
+          if (length(dropped)) {
+            c("cons <- ConsensusWithout(",
+              " trees = trees,",
+              paste0(" tip = ", EnC(dropped), ","),
+              paste0(" p = ", consP()),
+              ")"
+            )
+          } else {
+            paste0("cons <- Consensus(trees, p = ", consP(), ")")
+          }
+        )
+        LogUserRoot("cons", dropped = dropped)
+        if (unitEdge()) {
+          LogCommentP("Set unit edge length", 0)
+          LogCodeP("cons$edge.length <- rep.int(1, nrow(cons$edge))")
+        }
+        LogSortEdges("cons")
+        LogCodeP("plottedTree <- cons # Store for future reference")
+
+        LogCodeP("tipCols <- Rogue::ColByStability(trees)[cons$tip.label]")
+        LogCommentP("Plot consensus tree")
+        LogCodeP(
+          "plot(",
+          " cons,",
+          " edge.width = 2, # Widen lines",
+          " font = 3, # Italicize labels",
+          " cex = 0.83, # Shrink tip font size",
+          " tip.color = tipCols",
+          ")"
+        )
+        LogConcordance()
+      }
+    }
+
+    ############################################################################
+    # Main plot dispatch
+    ############################################################################
+
+    # Draw the plot selected by plotFormat(), provided trees are loaded
+    MainPlot <- function() {
+      if (AnyTrees()) {
+        LogMsg("MainPlot()")
+        switch(
+          plotFormat(),
+          "cons" = ConsensusPlot(),
+          "clus" = PlotClusterCons(),
+          "ind" = CharacterwisePlot(),
+          "space" = TreespacePlot()
+        )
+      }
+    }
+    ReactiveMainPlot <- reactive({ MainPlot() })
+
+    # The cache key must list everything that can alter the drawn plot.
+    # "cons" and "clus" plots read unitEdge() when setting edge lengths, and
+    # label splits with concordance values computed from r$dataset, so both
+    # keys include unitEdge() and r$dataHash; without them a stale cached
+    # image would be served after either changes.
+    output$treePlot <- renderCachedPlot(
+      ReactiveMainPlot(),
+      cacheKeyExpr = {
+        switch(
+          plotFormat(),
+
+          "clus" = list(r$treeHash, plotFormat(),
+                        r$keepNTips, input$excludedTip,
+                        consP(),
+                        input$neverDrop, r$outgroup,
+                        distMeth(),
+                        input$concordance,
+                        silThreshold(),
+                        unitEdge(), r$dataHash),
+          "cons" = list(r$treeHash, plotFormat(),
+                        r$keepNTips, input$excludedTip,
+                        consP(),
+                        input$neverDrop, r$outgroup,
+                        input$concordance,
+                        unitEdge(), r$dataHash),
+          "ind" = list(PlottedChar(),
+                       whichTree(),
+                       input$concordance,
+                       r$outgroup,
+                       concavity(),
+                       input$mapDisplay,
+                       r$dataHash, r$treeHash),
+          "space" = list(r$treeHash, plotFormat(),
+                         min(dims(), nProjDim()),
+                         TreeCols(),
+                         treePch(),
+                         distMeth(),
+                         ts_spaceCol(),
+                         ts_mapLines(),
+                         concavity(),
+                         ts_spacePch(),
+                         if (ts_spacePch() == "relat") ts_relators(),
+                         silThreshold())
+        )
+      },
+      sizePolicy = function(x) rep(plotSize(), 2)
+    )
+
+    ############################################################################
+    # R code logging for plots (for downloads)
############################################################################ + + RCode <- bindCache(reactive({ + switch( + plotFormat(), + "cons" = LogConsensusPlot(), + "clus" = LogPlotClusterCons(), + "ind" = LogCharacterwisePlot(), + "space" = LogTreespacePlot() + ) + r$plotLog + }), + switch( + plotFormat(), + + "clus" = list(r$treeHash, plotFormat(), + r$keepNTips, input$excludedTip, + consP(), + input$neverDrop, r$outgroup, + distMeth(), + input$concordance, + silThreshold()), + "cons" = list(r$treeHash, plotFormat(), + r$keepNTips, input$excludedTip, + consP(), + input$neverDrop, r$outgroup, + input$concordance), + "ind" = list(PlottedChar(), + whichTree(), + input$concordance, + r$outgroup, + concavity(), + input$mapDisplay, + r$dataHash, r$treeHash), + "space" = list(r$treeHash, plotFormat(), + min(dims(), nProjDim()), + TreeCols(), + treePch(), + distMeth(), + ts_spaceCol(), + ts_mapLines(), + concavity(), + ts_spacePch(), + if (ts_spacePch() == "relat") ts_relators(), + silThreshold()) + ) + ) + + ############################################################################ + # Character map legend + notes (htmlOutput) + ############################################################################ + + nonAmbigContrast <- reactive({ + cont <- attr(r$dataset, "contrast") + applic <- cont[, setdiff(colnames(cont), "-")] + cont[rowSums(applic) == dim(applic)[[2]], ] <- 0 + cont + }) + + plottedTokens <- reactive({ + n <- PlottedChar() + phyColumn <- vapply(r$dataset, `[[`, integer(1), + attr(r$dataset, "index")[[n]], USE.NAMES = FALSE) + tokens <- colSums(nonAmbigContrast()[phyColumn, ]) > 0L + names(tokens[tokens]) + }) + + output$charMapLegend <- bindCache( + renderUI({ + n <- PlottedChar() + if (length(n) && n > 0L && !is.null(r$chars)) { + pal <- c("#00bfc6", "#ffd46f", "#ffbcc5", "#c8a500", + "#ffcaf5", "#d5fb8d", "#e082b4", "#25ffd3", + "#a6aaff", "#e6f3cc", "#67c4ff", "#9ba75c", + "#60b17f") + + states <- attr(r$chars, "state.labels")[[n]] + tokens <- 
plottedTokens() + appTokens <- setdiff(tokens, "-") + datApp <- setdiff(attr(r$dataset, "levels"), "-") + .State <- function(glyph, text = "Error?", col = "red") { + if (is.numeric(glyph)) { + if (glyph > length(appTokens)) { + return(NULL) + } + level <- match(appTokens[[glyph]], datApp) + text <- states[[level]] + col <- pal[[level]] + glyph <- appTokens[[glyph]] + } + + tags$li(style = "margin-bottom: 2px;", + tags$span(glyph, + style = paste("display: inline-block;", + "border: 1px solid;", + "width: 1em;", + "text-align: center;", + "line-height: 1em;", + "margin-right: 0.5em;", + "background-color:", col, ";") + ), + tags$span(UCFirst(text))) + } + + tagList( + tags$h3(colnames(r$chars)[n]), + tags$ul(style = "list-style: none;", + .State(1), .State(2), .State(3), .State(4), .State(5), + .State(6), .State(7), .State(8), .State(9), + .State(10), .State(11), .State(12), .State(13), + if ("-" %in% tokens) + .State("-", "Inapplicable", "lightgrey"), + .State("?", "Ambiguous", "grey") + ) + ) + } + }), + PlottedChar(), + r$chars, + r$dataset + ) + + output$charNotes <- bindCache( + renderUI({ + n <- PlottedChar() + if (length(n) && n > 0L + && is.list(r$charNotes) && is.list(r$charNotes[[1]]) + && length(r$charNotes) >= n) { + + charNotes <- r$charNotes[[n]] + description <- charNotes[[1]] + notes <- charNotes[[2]] + states <- attr(r$chars, "state.labels")[[n]] + tokens <- plottedTokens() + + tagList( + if (length(description) > 0) { + tags$div(id = "char-description", + lapply(strsplit(description, "\n")[[1]], tags$p)) + }, + if (!is.null(notes)) tags$ul(class = "state-notes", { + PrintNote <- function(note) { + taxa <- names(note)[note] + tags$li(class = "state-note", + tags$span(class = "state-note-label", + paste(gsub("_", " ", fixed = TRUE, + taxa), collapse = ", ")), + tags$span(class = "state-note-detail", + notes[taxa[1]])) + } + + DuplicateOf <- function(x) { + duplicates <- duplicated(x) + masters <- x[!duplicates] + vapply(masters, function(d) x == d, 
logical(length(x))) + } + if (length(notes) == 1) { + onlyOne <- TRUE + names(onlyOne) <- names(notes) + PrintNote(onlyOne) + } else { + notes <- notes[order(names(notes))] + duplicates <- DuplicateOf(toupper(notes)) + apply(duplicates, 2, PrintNote) + } + }), + if (!states[[1]] %in% c("", "''") + && any(tokens == "-")) { + tags$p(tags$em(paste0( + "Brazeau et al. (2019) advise that neomorphic (0/1) ", + "characters should not contain inapplicable tokens (-)." + ))) + } + ) + } + }), + PlottedChar(), + r$dataset, + r$chars, + r$charNotes + ) + + ############################################################################ + # Branch legend (from events.R) + ############################################################################ + + output$branchLegend <- renderUI({ + if (!AnyTrees()) { + return() + } + LogMsg("renderUI(branchLegend)") + on.exit(LogMsg("/renderUI(branchLegend)")) + kept <- KeptTips() + dropped <- DroppedTips() + + if (length(dropped) && + length(input$excludedTip) && + nchar(input$excludedTip) && + input$excludedTip %in% tipLabels()) { + consTrees <- lapply(r$trees, DropTip, + setdiff(dropped, input$excludedTip)) + plotted <- TreeTools::RoguePlot( + trees = consTrees, + tip = input$excludedTip, + p = consP(), + plot = FALSE + ) + tagList( + tags$span(class = "legendLeft", "1 tree"), + tags$span(id = "blackToGreen", class = "legendBar", "\ua0"), + tags$span(class = "legendRight", + paste(max(c(plotted$onEdge, plotted$atNode)), "trees")), + ) + } + }) + + ############################################################################ + # Update functions (from events.R) — used by data module via callbacks + ############################################################################ + + UpdateKeepNTipsRange <- reactive({ + if (AnyTrees() && "consConfig" %in% r$visibleConfigs) { + nTip <- TipsInTree() + # isolate() prevents re-triggering when user manually edits keepNTips + currentInput <- isolate(input$keepNTips) + LogMsg("UpdateKeepNTipsRange(", 
currentInput, " -> ", nTip, ")") + r$keepNTips <- nNonRogues() + if (r$keepNTips != currentInput) { + r$oldkeepNTips <- currentInput + } + updateNumericInput(session, inputId = "keepNTips", + label = paste0("Tips to show (/", nTip, "):"), + min = max(3L, length(input$neverDrop)), + max = nTip, + value = nNonRogues()) + } + }) + + UpdateExcludedTipsInput <- reactive({ + if (AnyTrees() && "consConfig" %in% r$visibleConfigs) { + LogMsg("UpdateExcludedTipsInput()") + dropList <- dropSeq()[seq_along(DroppedTips())] + updateSelectInput(session, inputId = "excludedTip", + choices = dropList, + selected = if (input$excludedTip %in% DroppedTips()) + input$excludedTip else dropSeq()[1]) + # droppedList is a top-level div — use runjs + droppedHtml <- paste0( + "", + "
    ", + paste0("
  • ", + dropList, "
  • ", collapse = "\r\n"), + "
")
+        parentHtml("droppedList", droppedHtml)
+      }
+    })
+
+    # Show the dropped-tip controls when any tips are dropped (and the
+    # controls are listed in r$visibleConfigs); hide them otherwise.
+    UpdateDroppedTaxaDisplay <- reactive({
+      LogMsg("UpdateDroppedTaxaDisplay()")
+      if ("consConfig" %in% r$visibleConfigs) {
+        if (length(DroppedTips())) {
+          UpdateExcludedTipsInput()
+          if ("droppedTips" %in% r$visibleConfigs) {
+            parentShow("droppedTips")
+          }
+          if ("droppedList" %in% r$visibleConfigs) {
+            parentShow("droppedList")
+          }
+        } else {
+          parentHide("droppedTips")
+          parentHide("droppedList")
+        }
+      }
+    })
+
+    # Restrict the outgroup to tips that are still displayed, choose a
+    # default when none remain, and push the result to the "outgroup" input.
+    UpdateOutgroupInput <- reactive({
+      if (AnyTrees() && "treePlotConfig" %in% r$visibleConfigs) {
+        LogMsg("UpdateOutgroupInput()")
+        r$outgroup <- intersect(r$outgroup, KeptTips())
+        if (length(r$outgroup) == 0) {
+          # Prefer the first dataset taxon among the kept tips.  If the
+          # dataset shares no taxa with the kept tips, fall back to the first
+          # kept tip, rather than taking `[1]` of an empty vector (NA).
+          candidates <- if (HaveData()) {
+            intersect(names(r$dataset), KeptTips())
+          } else {
+            character(0)
+          }
+          if (length(candidates) == 0) {
+            candidates <- KeptTips()
+          }
+          r$outgroup <- candidates[1]
+        }
+
+        # Record the value the input currently holds, so that the
+        # input$outgroup observer can recognize the programmatic update
+        if (!identical(sort(r$outgroup), sort(input$outgroup))) {
+          r$oldOutgroup <- if (is.null(input$outgroup)) {
+            NO_OUTGROUP
+          } else {
+            input$outgroup
+          }
+        }
+
+        updateSelectizeInput(
+          session,
+          inputId = "outgroup",
+          selected = r$outgroup,
+          choices = KeptTips()
+        )
+      }
+    })
+
+    # Force reactive UI-update functions to run whenever their dependencies
+    # change. Without these observers, the reactives are never consumed on
+    # initial load, leaving inputs with their placeholder values.
+    observe(UpdateKeepNTipsRange())
+    observe(UpdateOutgroupInput())
+
+    ############################################################################
+    # Input observers
+    ############################################################################
+
+    # Show the map display options only while a character is selected
+    observeEvent(PlottedChar(), {
+      # isTRUE(): an NA or zero-length value must hide the options rather
+      # than raise "missing value where TRUE/FALSE needed"
+      if (isTRUE(PlottedChar() > 0)) {
+        showElement("mapDisplay")
+      } else {
+        hideElement("mapDisplay")
+      }
+    }, ignoreInit = TRUE)
+
+    # Search choices are labelled "<number>: <name>"; select that number
+    observeEvent(input$searchChar, {
+      searchResult <- as.numeric(strsplit(input$searchChar, ": ")[[1]][1])
+      if (!is.na(searchResult)) {
+        updateNumericInput(session, "plottedChar", value = searchResult)
+      }
+    })
+
+    observeEvent(consP(), {
+      if (AnyTrees()) {
+        LogMsg("Observed consP()")
+        UpdateKeepNTipsRange()
+        UpdateDroppedTaxaDisplay()
+        r$concordance <- list()
+      }
+    }, ignoreInit = TRUE)
+
+    # r$oldkeepNTips is non-NULL while a programmatic update of the input is
+    # pending; such events do not alter r$keepNTips.
+    observeEvent(input$keepNTips, {
+      if (!is.null(r$oldkeepNTips)) {
+        if (!identical(input$keepNTips, r$oldkeepNTips)) {
+          r$oldkeepNTips <- NULL
+        }
+      } else if (length(input$keepNTips) == 1L && !is.na(input$keepNTips)) {
+        # An emptied numeric field reports NA, which would otherwise
+        # propagate through min() / max() into r$keepNTips
+        LogMsg("Observed input$keepNTips -> ", EnC(input$keepNTips))
+        r$keepNTips <- max(length(input$neverDrop), 3L,
+                           min(input$keepNTips, TipsInTree()))
+        UpdateOutgroupInput()
+        UpdateDroppedTaxaDisplay()
+      }
+    }, ignoreInit = TRUE)
+
+    observeEvent(input$neverDrop, {
+      LogMsg("Observed input$neverDrop -> ", EnC(input$neverDrop))
+      UpdateKeepNTipsRange()
+      UpdateOutgroupInput()
+      UpdateDroppedTaxaDisplay()
+    }, ignoreInit = TRUE)
+
+    # r$oldOutgroup is non-NULL while a programmatic update of the input is
+    # pending; such events do not alter r$outgroup.
+    observeEvent(input$outgroup, {
+      if (!is.null(r$oldOutgroup)) {
+        if (!identical(input$outgroup, r$oldOutgroup)) {
+          r$oldOutgroup <- NULL
+        }
+      } else {
+        LogMsg("Observed input$outgroup -> ", EnC(input$outgroup))
+        r$outgroup <- input$outgroup
+      }
+    }, ignoreInit = TRUE)
+
+    observeEvent(r$visibleConfigs, {
+      UpdateDroppedTaxaDisplay()
+    })
+
+    ############################################################################
+    # Cross-module reactivity: observe state changes -> update module inputs
+    # Replaces parent_session updateXxxInput calls from mod_data.R
############################################################################ + + # When dataset changes: update plottedChar range + searchChar choices + observeEvent(r$dataHash, { + if (HaveData()) { + n <- nChars() + updateNumericInput(session, "plottedChar", + min = 0L, max = n, value = 1L) + updateSelectizeInput(session, "searchChar", + choices = paste0(seq_len(n), ": ", + colnames(r$chars)), + selected = "", + server = TRUE) + } else { + updateNumericInput(session, "plottedChar", + min = 0L, max = 0L, value = 0L) + updateSelectizeInput(session, "searchChar", choices = NULL) + } + }, ignoreInit = TRUE) + + # When trees change: update whichTree slider range + neverDrop choices + observeEvent(r$treeHash, { + if (AnyTrees()) { + nTr <- length(r$trees) + updateSliderInput(session, "whichTree", + min = 0L, max = nTr, value = 0L) + updateSelectizeInput(session, "neverDrop", + choices = tipLabels(), + selected = input$neverDrop) + showElement("keepNTips") + showElement("neverDrop") + } else { + hideElement("keepNTips") + hideElement("neverDrop") + } + }, ignoreInit = TRUE) + + # Resize plot via CSS when plotSize changes + observe({ + px <- paste0("'", plotSize(), "px'") + runjs(paste0("$('#", ns("treePlot"), "').css({height: ", + px, ", width: ", px, "});")) + }) + + ############################################################################ + # Return values for other modules / server.R + ############################################################################ + + list( + MainPlot = MainPlot, + RCode = RCode, + UpdateKeepNTipsRange = UpdateKeepNTipsRange, + UpdateDroppedTaxaDisplay = UpdateDroppedTaxaDisplay, + UpdateOutgroupInput = UpdateOutgroupInput + ) + }) +} diff --git a/inst/Parsimony/server/mod_data.R b/inst/Parsimony/server/mod_data.R new file mode 100644 index 000000000..13dc10390 --- /dev/null +++ b/inst/Parsimony/server/mod_data.R @@ -0,0 +1,649 @@ +# Module: Data loading and tree management +# +# Absorbs data.R + trees.R + data/tree event bindings 
from events.R.
+# Owns inputs: dataSource, dataFile, readxl_sheet, readxlSkip, readxlSkipCols,
+# treeFile, nTree, treeRange (plus the readxl_chars / readxl_taxa outputs).
+# Writes most data/tree state fields in AppState.
+#
+# Returns a list of reactives/functions consumed by other modules/source'd files.
+
+data_ui <- function(id) {  # returns a named list of UI elements (not a tagList) for placement by the caller
+  ns <- NS(id)  # namespacing function: every input/output id below goes through it
+  list(
+    data_source = selectInput(
+      ns("dataSource"), "Dataset",
+      c("< Load from file below >" = "file",  # value "file" selects the upload path in data_server
+        "Agnarsson 2004" = "Agnarsson2004",
+        "Sun et al. 2018" = "Sun2018",
+        "Wills et al. 2012" = "Wills2012",
+        if (logging) setNames(names(inapplicable.datasets),  # `logging` is a free variable defined outside this function
+                              names(inapplicable.datasets)))  # when the `if` yields NULL, c() simply drops it
+    ),
+    data_file = fileInput(
+      ns("dataFile"),
+      tags$span(
+        tags$i(class = "fas fa-solid fa-table"),
+        tags$span("Load data from file")
+      ),
+      placeholder = "No data file selected"
+    ),
+    readxl_options = hidden(tags$span(  # wrapped in hidden(); data_server calls showElement("readxl_options") for .xls/.xlsx files
+      id = ns("readxl_options"),
+      selectInput(ns("readxl_sheet"), "Excel sheet to read:",
+                  "Sheet 1", "Sheet 1"),  # placeholder choice; data_server replaces choices with the workbook's sheets
+      tags$span("First character row & column:"),
+      numericInput(ns("readxlSkip"), label = NULL,  # data_server passes skip = max(0, readxlSkip - 2) to read_excel
+                   min = 2L, value = 2L, step = 1L),
+      numericInput(ns("readxlSkipCols"), label = NULL,  # data_server treats column (readxlSkipCols - 1) as the taxon names
+                   min = 2L, value = 2L, step = 1L),
+      htmlOutput(ns("readxl_chars"), style = "clear: both;"),
+      htmlOutput(ns("readxl_taxa"), style = "clear: both; margin-bottom: 1em;")
+    )),
+    tree_file = fileInput(
+      ns("treeFile"),
+      label = tags$span(
+        tags$i(class = "fas fa-solid fa-tree"),
+        tags$span("Load trees")
+      ),
+      placeholder = "No tree file selected"
+    ),
+    nTree_input = numericInput(ns("nTree"),
+                               label = HTML("Sample n trees from range:"),
+                               min = 1L, value = 1L, step = 1L),
+    treeRange_input = sliderInput(ns("treeRange"), label = "",  # 1..1 placeholder range; data_server widens it via updateSliderInput
+                                  min = 1L, max = 1L, step = 1L, value = c(1, 1))
+  )
+}
+
+#' @param id Module namespace id.
+#' @param r AppState reactiveValues.
+#' @param parent_session The top-level Shiny session (for cross-module
+#'   \code{updateXxxInput} calls targeting non-namespaced inputs).
+#' @param callbacks Named list of callback functions from events.R / consensus.R
+#'   that the module triggers on tree updates:
+#'   \code{DisplayTreeScores}, \code{UpdateKeepNTipsRange},
+#'   \code{UpdateDroppedTaxaDisplay}, \code{UpdateOutgroupInput},
+#'   \code{KeptTips} (the last is never unpacked or called in this function).
+#' @param log_fns Named list of logging functions from logging.R:
+#'   \code{LogMsg}, \code{LogComment}, \code{LogCode}, \code{CacheInput},
+#'   \code{LastFile}.
+data_server <- function(id, r, parent_session, callbacks, log_fns) {
+  moduleServer(id, function(input, output, session) {
+
+    # Unpack logging
+    LogMsg <- log_fns$LogMsg
+    LogComment <- log_fns$LogComment
+    LogCode <- log_fns$LogCode
+    CacheInput <- log_fns$CacheInput
+    LastFile <- log_fns$LastFile
+
+    # Unpack callbacks (from events.R / consensus.R — use isolate-safe pattern)
+    DisplayTreeScores <- callbacks$DisplayTreeScores  # the only callback actually invoked below
+    UpdateKeepNTipsRange <- callbacks$UpdateKeepNTipsRange  # unpacked but not called in this module
+    UpdateDroppedTaxaDisplay <- callbacks$UpdateDroppedTaxaDisplay  # unpacked but not called in this module
+    UpdateOutgroupInput <- callbacks$UpdateOutgroupInput  # unpacked but not called in this module
+
+    # Cross-module shinyjs helpers (target top-level DOM ids, not namespaced)
+    parentShow <- function(id) {
+      runjs(paste0("$('#", id, "').removeClass('shinyjs-hide').show()"))
+    }
+    parentHide <- function(id) {
+      runjs(paste0("$('#", id, "').hide()"))
+    }
+
+    ############################################################################
+    # Helper reactives (from data.R)
+    ############################################################################
+
+    AnyTrees <- reactive({  # TRUE when r$trees is non-NULL and non-empty
+      !is.null(r$trees) && length(r$trees) > 0
+    })
+
+    HaveData <- reactive({  # TRUE when r$dataset is a non-empty phyDat object
+      !is.null(r$dataset) && length(r$dataset) > 0 &&
+        inherits(r$dataset, "phyDat")
+    })
+
+    tipLabels <- reactive({  # tip labels of the first tree in r$trees; character(0) when there are none
+      if (!length(r$trees)) return(character(0L))
+      r$trees[[1]][["tip.label"]]
+    })
+
+    nChars <- reactive({  # length of the dataset's "index" attribute; 0L without data
+      if (HaveData()) {
+        as.integer(length(attr(r$dataset, "index")))
+      } else {
+        0L
+      }
+    })
+
+    TaxonOrder <- reactive({  # dataset taxon names, falling back to tree tip labels
+      if (HaveData()) {
+        names(r$dataset)
+      } else {
+        tipLabels()
+      }
+    })
+
+    DatasetMatchesTrees <- reactive({  # TRUE when as many names are shared with the tips as the dataset has entries
+      length(intersect(names(r$dataset), tipLabels())) == length(r$dataset)
+    })
+
+    ############################################################################
+    # Tree management (from trees.R)
+    ############################################################################
+
+    UpdateNTree <- function(n) {  # sets r$nTree (capped at length(r$allTrees)); returns TRUE only if it changed
+      if (is.null(n) || length(n) == 0) return(FALSE)
+      if (n > length(r$allTrees)) {
+        r$oldNTree <- n  # keep the uncapped request; FetchNTree uses r$oldNTree as a marker
+        n <- length(r$allTrees)
+      }
+      if (r$nTree == n) {
+        FALSE
+      } else {
+        LogMsg("UpdateNTree(", r$nTree, " -> ", n, ")")
+        r$nTree <- n
+        if (input$nTree != n) {  # push the value back to the widget only when it differs
+          updateNumericInput(session, "nTree", value = n)
+        }
+        TRUE
+      }
+    }
+
+    UpdateTreeRange <- function(range) {  # sets r$treeRange; returns TRUE only if it changed
+      if (is.null(range) || length(range) == 0) return(FALSE)
+      if (identical(range, r$treeRange)) {
+        FALSE
+      } else {
+        LogMsg("UpdateTreeRange([", paste(r$treeRange, collapse = ", "),
+               "] -> [", paste(range, collapse = ", "), "])")
+        r$treeRange <- range
+        span <- r$treeRange[2] - r$treeRange[1]
+        if (r$nTree > span + 1L) {  # cannot sample more trees than the range contains
+          UpdateNTree(span + 1L)
+        }
+        TRUE
+      }
+    }
+
+    UpdateActiveTrees <- reactive({  # rebuilds r$trees / r$treeHash from r$allTrees, r$treeRange and r$nTree
+      if (r$updatingTrees) {  # re-entrancy guard: flag is set just below and cleared on exit
+        LogMsg(" Skipping UpdateActiveTrees()")
+        return()
+      }
+      r$updatingTrees <- TRUE
+      on.exit(r$updatingTrees <- FALSE)
+      LogMsg("UpdateActiveTrees()")
+
+      nTrees <- length(r$allTrees)
+      if (r$nTree == nTrees &&
+          r$treeRange[1] == 1L && r$treeRange[2] == nTrees) {  # full range, all trees: no thinning needed
+        thinnedTrees <- r$allTrees
+        if (!is.null(r$allTrees) && !identical(r$trees, thinnedTrees)) {
+          LogCode("trees <- allTrees")
+        }
+      } else {
+        rangedTrees <- r$allTrees[r$treeRange[1]:r$treeRange[2]]
+        thinnedTrees <- WideSample(rangedTrees, r$nTree)
+
+        if (!is.null(r$allTrees) && !identical(r$trees, thinnedTrees)) {  # log only when the selection really changed
+          LogCode(paste0(
+            "trees <- WideSample(allTrees[",
+            r$treeRange[1], ":", r$treeRange[2],
+            "], ", r$nTree, ")"))
+        }
+      }
+
+      r$trees <- thinnedTrees
+      r$treeHash <- rlang::hash(r$trees)
+
+      DisplayTreeScores()
+
+      # Consensus module observes r$treeHash for whichTree, keepNTips,
+      # neverDrop, outgroup, droppedTips updates (T-063).
+
+      updateSelectizeInput(session = parent_session,  # top-level session: the target id is written out in full
+                           inputId = "treespace-relators",
+                           choices = tipLabels(),
+                           selected = parent_session$input[["treespace-relators"]])  # keep the current selection
+    })
+
+    UpdateAllTrees <- function(newTrees) {  # replaces r$allTrees, resets range/nTree widgets, then refreshes r$trees
+      LogMsg("UpdateAllTrees()")
+      on.exit(LogMsg("/UpdateAllTrees()"), add = TRUE)
+
+      newTrees <- c(newTrees)
+      if (length(newTrees) > 1L) {
+        newTrees <- RenumberTips(newTrees, newTrees[[1]]$tip.label)  # align every tree to the first tree's tip order
+      }
+      if (identical(newTrees, r$newTrees)) {  # same trees as the previous call: nothing to do
+        LogMsg(" ")
+        return()
+      }
+      r$newTrees <- newTrees
+
+      oldNTrees <- length(r$allTrees)
+
+      if (!identical(r$allTrees, newTrees)) {
+        LogCode("allTrees <- newTrees")
+        r$allTrees <- newTrees
+      }
+      nTrees <- length(newTrees)
+
+      if (nTrees != oldNTrees) {
+        if (nTrees > 0L) {
+          if (!identical(input$treeRange, c(1L, nTrees))) {
+            r$oldTreeRange <- input$treeRange  # marker: while set, FetchTreeRange skips UpdateTreeRange
+          }
+          UpdateTreeRange(c(1L, nTrees))
+          updateSliderInput(session, "treeRange",
+                            min = 1L, max = nTrees,
+                            value = r$treeRange)
+
+          r$oldNTree <- input$nTree  # marker: while set, FetchNTree skips UpdateNTree
+          UpdateNTree(min(max(input$nTree, aFewTrees), nTrees))  # at least aFewTrees (defined elsewhere), at most nTrees
+          updateNumericInput(session, "nTree", max = nTrees,
+                             value = r$nTree)
+        }
+        # When nTrees == 0, skip slider updates — the tree manipulation panel
+        # is hidden by the parentHide("manipulateTreeset") call below, so no
+        # visible element needs updating and we avoid min > max warnings.
+      }
+
+      UpdateActiveTrees()
+      if (AnyTrees()) {
+        parentShow("manipulateTreeset")
+      } else {
+        parentHide("manipulateTreeset")
+      }
+    }
+
+    # Debounced nTree / treeRange watchers
+    FetchNTree <- debounce(reactive({
+      if (!is.null(r$oldNTree)) {  # marker set: do not act, just clear it once the input differs from it
+        if (!identical(input$nTree, r$oldNTree)) {
+          r$oldNTree <- NULL
+        }
+      } else {
+        if (UpdateNTree(input$nTree)) {  # only resample when the count really changed
+          UpdateActiveTrees()
+        }
+      }
+    }), typingJiffy)  # debounce delay; typingJiffy is defined outside this function
+
+    FetchTreeRange <- debounce(reactive({
+      if (!is.null(r$oldTreeRange)) {  # marker set: do not act, just clear it once the input differs from it
+        if (!identical(input$treeRange, r$oldTreeRange)) {
+          r$oldTreeRange <- NULL
+        }
+      } else {
+        if (UpdateTreeRange(input$treeRange)) {  # only resample when the range really changed
+          UpdateActiveTrees()
+        }
+      }
+    }), aJiffy)  # debounce delay; aJiffy is defined outside this function
+
+    # Force evaluation of the debounced reactives
+    observe(FetchNTree())
+    observe(FetchTreeRange())
+
+    ############################################################################
+    # Data loading (from data.R + events.R bindings)
+    ############################################################################
+
+    UpdateData <- reactive({  # loads r$dataset from an upload or a bundled file, then any trees in that file
+      source <- input$dataSource  # local name; shadows base::source inside this reactive only
+      if (source == "file") {
+        if (!r$dataFileVisible) {  # first switch to "file": reveal and highlight the upload widget, then stop
+          showElement("dataFile")
+          r$dataFileVisible <- TRUE
+          dfId <- session$ns("dataFile")
+          runjs(paste0("console.log($('#", dfId, "-label'));"))
+          runjs(paste0(
+            "$('#", dfId, "-label').parent()",
+            ".css({'outline': 'dashed #428bca 20px', ",
+            "'width': '100%'})",
+            ".animate({'outline-width': '0px'}, 'slow');"))
+          return()
+        }
+
+        fileInput <- input$dataFile  # local name; shadows shiny's fileInput() inside this reactive only
+        r$dataset <- NULL
+        r$chars <- NULL
+        if (is.null(fileInput)) {
+          Notification(type = "error", "No data file selected")
+          return("No data file selected.")
+        }
+        dataFile <- fileInput$datapath
+        if (is.null(dataFile)) {
+          Notification(type = "error", "No data file found.")
+          return("No data file specified.")
+        }
+
+        LogMsg("UpdateData(): from file")
+        r$sortTrees <- FALSE
+        r$readDataFile <- NULL
+        r$bestSearchScore <- NULL
+
+        if (length(grep("\\.xlsx?$", dataFile))) {  # .xls / .xlsx upload: read via readxl
+          if (!requireNamespace("readxl", quietly = TRUE)) {
+            install.packages("readxl")  # NOTE(review): installs a package at runtime from the server — confirm intended
+          }
+          showElement("readxl_options")
+
+          r$dataset <- tryCatch({
+            sheets <- readxl::excel_sheets(dataFile)
+            updateSelectInput(session,
+                              inputId = "readxl_sheet",
+                              choices = setNames(sheets, sheets),
+                              selected = if (input$readxl_sheet %in% sheets) {
+                                input$readxl_sheet
+                              } else {
+                                sheets[1]
+                              })
+
+            tibble <- readxl::read_excel(
+              path = dataFile,
+              sheet = match(input$readxl_sheet, sheets, nomatch = 1L),  # unknown sheet name falls back to sheet 1
+              skip = max(0L, input$readxlSkip - 2L),
+              .name_repair = "minimal",
+              col_types = "text"
+            )
+
+            firstCol <- input$readxlSkipCols - 1L  # column holding taxon names; character columns follow it
+            chars <- colnames(tibble)[-seq_len(firstCol)]
+            taxNames <- gsub(" ", "_", trimws(unlist(tibble[, firstCol])))
+            output$readxl_taxa <- renderUI(HTML(paste(
+              "Taxon names:",
+              paste(head(taxNames, 3), collapse = ", "), "...\n")))
+            output$readxl_chars <- renderUI(HTML(paste(
+              "Character names:",
+              paste(head(chars, 3), collapse = ", "), "...")))
+            r$chars <- chars
+
+            dat <- as.matrix(tibble[, -seq_len(firstCol)])
+            rownames(dat) <- taxNames
+            dat <- MatrixToPhyDat(dat)
+            if (attr(dat, "nr") == 0) {
+              stop("No characters loaded; throw error")  # caught by the handler below, which yields NULL
+            }
+
+            LogComment("Load data from spreadsheet", 2)
+            if (r$excelFiles == 0 ||
+                tools::md5sum(dataFile) !=
+                tools::md5sum(paste0(tempdir(), "/", LastFile("excel")))) {  # cache only if content differs from the last cached file
+              CacheInput("excel", dataFile)
+            }
+            LogCode(c(
+              paste0("dataFile <- \"", LastFile("excel"), "\""),
+              "excelSheet <- readxl::read_excel(",
+              " path = dataFile,",
+              paste0(" sheet = ",
+                     match(input$readxl_sheet, sheets, 1L), ","),
+              paste0(" skip = ", max(0L, input$readxlSkip - 2L), ","),
+              " .name_repair = \"minimal\",",
+              " col_types = \"text\"",
+              ")",
+              paste0("dat <- as.matrix(excelSheet[, -seq_len(",
+                     firstCol, ")])"),
+              paste0("rownames(dat) <- unlist(excelSheet[, ",
+                     firstCol, "])"),
+              "dataset <- MatrixToPhyDat(dat)"
+            ))
+
+            dat
+          }, error = function(e) NULL)  # any failure leaves r$dataset NULL, so the readers below are tried next
+        } else {
+          hideElement("readxl_options")
+        }
+
+        if (is.null(r$dataset)) suppressWarnings({  # fallback chain: ReadTntAsPhyDat, then ReadAsPhyDat, else NULL
+          r$dataset <- tryCatch({
+            r$readDataFile <- "ReadTntAsPhyDat(dataFile)"
+            ReadTntAsPhyDat(dataFile)
+          }, error = function(e) tryCatch({
+            r$chars <- tryCatch(
+              ReadCharacters(dataFile),
+              error = function(e) {
+                Notification(type = "error",
+                             "Error reading characters from file")
+                NULL
+              })
+
+            r$charNotes <- tryCatch(
+              ReadNotes(dataFile),
+              error = function(e) {
+                Notification(type = "error",
+                             "Error reading character notes")
+                NULL
+              })
+
+            r$readDataFile <- "ReadAsPhyDat(dataFile)"
+            ReadAsPhyDat(dataFile)
+          }, error = function(e) {
+            r$readDataFile <- NULL
+            NULL
+          }))
+
+          if (!is.null(r$dataset)) {
+            LogComment("Load data from file", 2)
+            CacheInput("data", dataFile)
+            LogCode(c(
+              paste0("dataFile <- \"", LastFile("data"), "\""),
+              paste0("dataset <- ", r$readDataFile)  # whichever reader succeeded above
+            ))
+          }
+        })
+      } else {
+        LogMsg("UpdateData(): from package")
+
+        r$sortTrees <- TRUE
+        r$bestSearchScore <- NULL
+
+        r$dataFileVisible <- FALSE
+        hideElement("dataFile")
+
+        dataFile <- system.file(paste0("datasets/", source, ".nex"),  # bundled dataset named after the select value
+                                package = "TreeSearch")
+        CacheInput("data", dataFile)
+        r$chars <- ReadCharacters(dataFile)
+        r$charNotes <- ReadNotes(dataFile)
+        r$readDataFile <- "ReadAsPhyDat(dataFile)"
+        r$dataset <- ReadAsPhyDat(dataFile)
+        LogComment("Load dataset file from TreeSearch package")
+        LogCode(c(
+          paste0("dataFile <- system.file(\"datasets/", source,
+                 ".nex\", package = \"TreeSearch\")"),
+          "dataset <- ReadAsPhyDat(dataFile)"
+        ))
+      }
+
+      if (is.null(r$dataset)) {
+        Notification(type = "error", "Could not read data from file")
+        # Consensus module observes nChars() for plottedChar/searchChar (T-063)
+        return("Could not read data from file")
+      } else {
+        Notification(type = "message",
+                     paste("Loaded", nChars(), "characters and",
+                           length(r$dataset), "taxa"))
+        # Consensus module observes nChars() for plottedChar/searchChar (T-063)
+      }
+
+      tryCatch({
+        # suppressWarnings: ape::read.nexus emits a spurious recycling warning
+        # when a NEXUS file has unequal counts of [ and ] comment brackets on
+        # a line (upstream ape bug; does not affect parsing correctness).
+        dataFileTrees <- suppressWarnings(read.nexus(dataFile))
+        LogComment("Read trees from dataset file")
+        LogCode("newTrees <- read.nexus(dataFile)")
+        UpdateAllTrees(dataFileTrees)
+        CacheInput("tree", dataFile)
+        r$readTreeFile <- "read.nexus(treeFile)"  # NOTE(review): code logged above names dataFile, this template names treeFile — confirm
+      }, error = function(e) {
+        # Data file has no trees — clear stale trees only if they don't
+        # match the new dataset (prevents blank plot from incompatible tips).
+        # Keep trees if they match (e.g., re-selecting same dataset after search).
+        if (AnyTrees() && !DatasetMatchesTrees()) {
+          UpdateAllTrees(list())
+        }
+      })
+      if (AnyTrees() && DatasetMatchesTrees()) {
+        parentShow("displayConfig")
+      }
+      # Button labels reactively managed by mod_search.R
+
+      DisplayTreeScores()
+    })
+
+    ############################################################################
+    # Tree file loading (from events.R)
+    ############################################################################
+
+    observeEvent(input$treeFile, {  # reader chain: read.tree, read.nexus, read.nexus with END; appended, ReadTntTree
+      tmpFile <- input$treeFile$datapath
+      newTrees <- tryCatch({
+        r$readTreeFile <- "read.tree(treeFile)"
+        LogMsg("Trying read.tree()")
+        read.tree(tmpFile)
+      },
+      error = function(x) tryCatch({
+        r$readTreeFile <- "read.nexus(treeFile)"
+        LogMsg("Trying read.nexus()")
+        suppressWarnings(read.nexus(tmpFile))
+      },
+      error = function(err) tryCatch({
+        if (grepl("NA/NaN argument", err)) {  # err is a condition object; grepl() coerces it to character
+          LogMsg("Terminating tree block")
+          withEnd <- tempfile()
+          on.exit(unlink(withEnd))
+          # suppressWarnings: readLines emits "incomplete final line" warning
+          # for files without trailing newline; benign, does not affect parsing.
+          writeLines(c(suppressWarnings(readLines(tmpFile)), "\nEND;"), withEnd)
+          read.nexus(withEnd)
+        } else {
+          stop("Next handler, please")  # deliberately raise so the ReadTntTree handler below runs
+        }
+      },
+      error = function(x) tryCatch(
+        # withCallingHandlers muffles the benign readLines "incomplete final
+        # line" warning from ReadTntTree before it reaches the outer warning
+        # handler (which is for genuine TNT tip-label warnings only).
+        withCallingHandlers(
+          {
+            r$readTreeFile <- "ReadTntTree(treeFile)"
+            ReadTntTree(tmpFile)
+          },
+          warning = function(w) {
+            if (grepl("incomplete final line", conditionMessage(w),
+                      ignore.case = TRUE)) {
+              invokeRestart("muffleWarning")
+            }
+          }
+        ),
+        warning = function(x) tryCatch({  # retry ReadTntTree with tip labels taken from the dataset
+          Notification(as.character(x), type = "warning")
+          tryLabels <- TipLabels(r$dataset)
+          if (length(tryLabels) > 2) {
+            Notification("Inferring tip labels from dataset",
+                         type = "warning")
+            r$readTreeFile <-
+              "ReadTntTree(treeFile, tipLabels = TipLabels(dataset))"
+            ReadTntTree(tmpFile, tipLabels = tryLabels)
+          } else {
+            NULL
+          }
+        }, error = function(e) NULL),
+        error = function(e) NULL))))  # every reader failed: newTrees is NULL
+
+      if (is.null(newTrees)) {
+        # No trees found: check whether the file is a data file uploaded to
+        # the wrong input. Mirror the data-loader fallback chain
+        # (ReadTntAsPhyDat → ReadAsPhyDat).
+        autoData <- tryCatch(
+          suppressWarnings(ReadTntAsPhyDat(tmpFile)),
+          error = function(e) tryCatch(
+            suppressWarnings(ReadAsPhyDat(tmpFile)),
+            error = function(e) NULL
+          )
+        )
+        if (!is.null(autoData)) {
+          # Treat as a data file: load it as the active dataset.
+          # observeEvent(r$dataset) handles tree-clearing + hash update.
+          r$dataset <- autoData
+          r$chars <- tryCatch(suppressWarnings(ReadCharacters(tmpFile)),
+                              error = function(e) NULL)
+          r$charNotes <- tryCatch(suppressWarnings(ReadNotes(tmpFile)),
+                                  error = function(e) NULL)
+          r$readDataFile <- "ReadAsPhyDat(dataFile)"  # NOTE(review): recorded even when ReadTntAsPhyDat was the reader that succeeded
+          r$sortTrees <- FALSE
+          r$bestSearchScore <- NULL
+          Notification(
+            paste0("No trees found \u2014 loaded ",
+                   length(autoData), " taxa and ",
+                   length(attr(autoData, "index")), " characters as dataset"),
+            type = "message"
+          )
+        } else {
+          Notification("Trees not in a recognized format", type = "error")
+        }
+      } else {
+        LogComment("Load tree from file", 2)
+        CacheInput("tree", tmpFile)
+        LogCode(paste0("treeFile <- \"", LastFile("tree"), "\""))
+        LogCode(paste0("newTrees <- ", r$readTreeFile))  # whichever reader succeeded above
+
+        UpdateAllTrees(newTrees)
+
+        removeModal()
+        Notification(paste("Loaded", length(r$trees), "trees"),
+                     type = "message")
+        # Button labels reactively managed by mod_search.R
+        parentShow("displayConfig")
+      }
+    })
+
+    ############################################################################
+    # Data event bindings (from events.R)
+    ############################################################################
+
+    observeEvent(input$dataSource, UpdateData(), ignoreInit = TRUE)
+    observeEvent(input$dataFile, UpdateData(), ignoreInit = TRUE)
+    observeEvent(input$readxl_sheet, UpdateData(), ignoreInit = TRUE)
+    observeEvent(input$readxlSkip, UpdateData(), ignoreInit = TRUE)
+    observeEvent(input$readxlSkipCols, UpdateData(), ignoreInit = TRUE)
+
+    observeEvent(r$dataset, {
+      r$dataHash <- rlang::hash(r$dataset)
+      # Clear stale trees only when they are incompatible with the new dataset.
+      # UpdateData() may call UpdateAllTrees() *before* this observer fires, so
+      # trees from the same .nex file are already in r$allTrees and are
+      # compatible. Unconditionally clearing them blanks the plot and resets
+      # the tree count to 0 for all 31 bundled example datasets (T-151).
+      if (!HaveData() || !DatasetMatchesTrees()) {
+        r$allTrees <- NULL
+        r$trees <- NULL
+        r$treeHash <- NULL
+        r$newTrees <- NULL  # also forget the last UpdateAllTrees() input so identical trees can be reloaded
+        parentHide("manipulateTreeset")
+      }
+      # Search stat reset + timeout default handled by mod_search.R
+    })
+
+    ############################################################################
+    # Return reactives/functions for other modules
+    ############################################################################
+
+    list(
+      AnyTrees = AnyTrees,
+      HaveData = HaveData,
+      tipLabels = tipLabels,
+      nChars = nChars,
+      TaxonOrder = TaxonOrder,
+      DatasetMatchesTrees = DatasetMatchesTrees,
+      UpdateAllTrees = UpdateAllTrees,
+      UpdateActiveTrees = UpdateActiveTrees,
+      dataSource = reactive(input$dataSource)
+    )
+  })
+}
diff --git a/inst/Parsimony/server/mod_downloads.R b/inst/Parsimony/server/mod_downloads.R
new file mode 100644
index 000000000..6c2cb0b56
--- /dev/null
+++ b/inst/Parsimony/server/mod_downloads.R
@@ -0,0 +1,166 @@
+# Module: Downloads
+#
+# Owns all 8 downloadHandler outputs:
+#   saveZip, savePlotZip, savePng, savePdf,
+#   savePlotNwk, savePlotNex, saveNwk, saveNex
+#
+# Reactive args (passed from server.R top-level input):
+#   dataSource    reactive(input$dataSource)
+#   plotSize      reactive(input$plotSize)
+#
+# Callback args (functions/values from sourced server files):
+#   cmdLogFile    character — path to session R-script log
+#   stashTrees    function(trees) — writes trees to temp file (logging.R)
+#   dataFileName  function(n) — (logging.R)
+#   excelFileName function(n) — (logging.R)
+#   treeFileName  function(n) — (logging.R)
+#   lastFile      function(type) — (logging.R)
+#   mainPlot      function() — renders current plot (consensus.R)
+#   rCode         reactive — plot R-script lines (consensus.R)
+#   saveDetails   reactive — list(fileName, title, asp) (treespace.R)
+
+# ---------------------------------------------------------------------------
+# UI helpers — returns a named list so scattered buttons can be placed
+# individually in ui.R without duplicating ns() logic.
+# ---------------------------------------------------------------------------
+downloads_ui <- function(id) {  # one namespaced downloadButton per output defined in downloads_server
+  ns <- NS(id)
+  list(
+    save_zip = downloadButton(ns("saveZip"), "Save log", icon = Icon("download")),
+    save_nwk = downloadButton(ns("saveNwk"), "Newick", icon = Icon("download")),
+    save_nex = downloadButton(ns("saveNex"), "Nexus", icon = Icon("download")),
+    save_plot_zip = downloadButton(ns("savePlotZip"), "R script", icon = Icon("download")),
+    save_pdf = downloadButton(ns("savePdf"), "PDF", icon = Icon("download")),
+    save_png = downloadButton(ns("savePng"), "PNG", icon = Icon("download")),
+    save_plot_nwk = downloadButton(ns("savePlotNwk"), "Newick", icon = Icon("download")),
+    save_plot_nex = downloadButton(ns("savePlotNex"), "Nexus", icon = Icon("download"))
+  )
+}
+
+# ---------------------------------------------------------------------------
+# Server
+# ---------------------------------------------------------------------------
+downloads_server <- function(id, state, dataSource, plotSize,  # state is read via $: dataFiles, excelFiles, treeFiles, allTrees, plottedTree, trees
+                             cmdLogFile, stashTrees,
+                             dataFileName, excelFileName, treeFileName, lastFile,
+                             mainPlot, rCode, saveDetails) {
+  moduleServer(id, function(input, output, session) {
+
+    output$saveZip <- downloadHandler(
+      filename = function() "TreeSearch-session.zip",
+      content = function(file) {
+        if (isTRUE(getOption("shiny.testmode"))) {
+          file.copy(cmdLogFile, file)  # test mode: deliver the raw log script instead of an archive
+        } else {
+          zipDir <- tempfile("zip-")
+          dir.create(zipDir)
+          on.exit(unlink(zipDir, recursive = TRUE), add = TRUE)  # unlink() skips directories unless recursive = TRUE
+          rFile <- paste0(zipDir, "/TreeSearch-session.R")
+          file.copy(cmdLogFile, rFile, overwrite = TRUE)
+          zip::zip(file, c(
+            rFile,
+            if (state$dataFiles)  # each group is bundled only when its counter is non-zero
+              paste0(tempdir(), "/", dataFileName(seq_len(state$dataFiles))),
+            if (state$excelFiles)
+              paste0(tempdir(), "/", excelFileName(seq_len(state$excelFiles))),
+            if (state$treeFiles)
+              paste0(tempdir(), "/", treeFileName(seq_len(state$treeFiles)))
+          ), compression_level = 9, mode = "cherry-pick")
+        }
+      }
+    )
+
+    output$savePlotZip <- downloadHandler(
+      filename = function() paste0(saveDetails()$fileName, ".zip"),
+      content = function(file) {
+        stashTrees(state$allTrees)
+
+        if (isTRUE(getOption("shiny.testmode"))) {  # test mode: write a scrubbed script, not an archive
+          rCode_val <- rCode()
+          rCode_val <- sub("TreeSearch plot log: 2[\\d\\-]{9} [012][\\d:]{7}",  # blank out the timestamp
+                           "TreeSearch plot log: ",
+                           rCode_val, perl = TRUE)
+          rCode_val[4] <- "# System: "
+          rCode_val[5:9] <- sub("^(# \\- \\w+ ).*$", "\\1",  # keep only the leading "# - <word> " of these lines
+                                rCode_val[5:9], perl = TRUE)
+          rCode_val <- sub("dataFile <- .*$",
+                           paste0("dataFile <- system.file(\"datasets/",
+                                  dataSource(),
+                                  ".nex\", package = \"TreeSearch\") # FALSE CODE for TEST MODE"),
+                           rCode_val,
+                           perl = TRUE)
+          rCode_val <- sub("treeFile <- .*$",
+                           "treeFile <- dataFile # Test mode",
+                           rCode_val,
+                           perl = TRUE)
+          writeLines(rCode_val, con = file)
+        } else {
+          tempDir <- tempfile("plot-zip-")
+          dir.create(tempDir)
+          on.exit(unlink(tempDir, recursive = TRUE), add = TRUE)  # unlink() skips directories unless recursive = TRUE
+          rFile <- paste0(tempDir, "/", saveDetails()$fileName, ".R")
+          writeLines(rCode(), con = rFile)
+
+          zip::zip(file, c(  # NOTE(review): unlike saveZip, these paths are not guarded by the state$*Files counters — confirm they always exist
+            rFile,
+            paste0(tempdir(), "/", lastFile("data")),
+            paste0(tempdir(), "/", lastFile("excel")),
+            paste0(tempdir(), "/", lastFile("tree"))
+          ), compression_level = 9, mode = "cherry-pick")
+        }
+      }
+    )
+
+    output$savePng <- downloadHandler(
+      filename = function() paste0(saveDetails()$fileName, ".png"),
+      content = function(file) {
+        png(file, width = plotSize(), height = plotSize())
+        on.exit(dev.off(), add = TRUE)  # close the device even if mainPlot() raises an error
+        mainPlot()
+      }
+    )
+
+    output$savePdf <- downloadHandler(
+      filename = function() paste0(saveDetails()$fileName, ".pdf"),
+      content = function(file) {
+        width <- 8
+        pdf(
+          file,
+          title = saveDetails()$title,
+          width = width,
+          height = saveDetails()$asp * width
+        )
+        on.exit(dev.off(), add = TRUE)  # close the device even if mainPlot() raises an error
+        mainPlot()
+      }
+    )
+
+    output$savePlotNwk <- downloadHandler(
+      filename = "TreeSearch-consensus.nwk",
+      content = function(file) {
+        write.tree(state$plottedTree, file = file)
+      }
+    )
+
+    output$savePlotNex <- downloadHandler(
+      filename = "TreeSearch-consensus.nex",
+      content = function(file) {
+        write.nexus(state$plottedTree, file = file)
+      }
+    )
+
+    output$saveNwk <- downloadHandler(
+      filename = "TreeSearch.nwk",
+      content = function(file) {
+        write.tree(state$trees, file = file, tree.names = TRUE)
+      }
+    )
+
+    output$saveNex <- downloadHandler(
+      filename = "TreeSearch.nex",
+      content = function(file) {
+        write.nexus(state$trees, file = file)
+      }
+    )
+  })
+}
diff --git a/inst/Parsimony/server/mod_references.R b/inst/Parsimony/server/mod_references.R
new file mode 100644
index 000000000..bc1ef8c6e
--- /dev/null
+++ b/inst/Parsimony/server/mod_references.R
@@ -0,0 +1,95 @@
+# Module: References panel
+#
+# Renders the references section. Adapts "Tree search" references based on
+# the active weighting mode ("off" = EW, "on" = IW, "xpiwe" = XPIWE,
+# "prof" = profile parsimony).
+
+references_ui <- function(id) {
+  ns <- NS(id)
+  htmlOutput(ns("references"), style = "clear: both;")
+}
+
+#' @param id Module namespace id.
+#' @param weighting Reactive returning the current weighting mode string.
+#' @param cites Named list of citation HTML strings. Defaults to looking up
+#'   each variable in the calling environment (i.e. global.R when run as app).
+references_server <- function(id, weighting = NULL, cites = NULL) {
+  # If no cites list supplied, collect from the caller's environment so the
+  # app's global.R assignments are found automatically.
+  if (is.null(cites)) {
+    e <- parent.frame()
+    get_cite <- function(nm) get(nm, envir = e, inherits = TRUE)  # errors if a citation variable is not defined
+    cites <- list(
+      Brazeau2019 = get_cite("Brazeau2019"),
+      Goloboff1993 = get_cite("Goloboff1993"),
+      Goloboff1999 = get_cite("Goloboff1999"),
+      Goloboff2014 = get_cite("Goloboff2014"),
+      Morphy = get_cite("Morphy"),
+      Nixon1999 = get_cite("Nixon1999"),
+      SmithSearch = get_cite("SmithSearch"),
+      Gower1966 = get_cite("Gower1966"),
+      Gower1969 = get_cite("Gower1969"),
+      Kaski2003 = get_cite("Kaski2003"),
+      RCoreTeam = get_cite("RCoreTeam"),
+      SmithDist = get_cite("SmithDist"),
+      Smith2020 = get_cite("Smith2020"),
+      SmithSpace = get_cite("SmithSpace"),
+      Venna2001 = get_cite("Venna2001"),
+      Stockham2002 = get_cite("Stockham2002"),
+      Arthur2007 = get_cite("Arthur2007"),
+      Hartigan1979 = get_cite("Hartigan1979"),
+      Maechler2019 = get_cite("Maechler2019"),
+      Bien2011 = get_cite("Bien2011"),
+      Murtagh1983 = get_cite("Murtagh1983"),
+      Rousseeuw1987 = get_cite("Rousseeuw1987"),
+      SmithRogue = get_cite("SmithRogue"),
+      Klopfstein2019 = get_cite("Klopfstein2019"),
+      Pol2009 = get_cite("Pol2009")
+    )
+  }
+
+  moduleServer(id, function(input, output, session) {
+    output$references <- renderUI({
+      wt <- if (is.reactive(weighting)) weighting() else "off"  # a non-reactive `weighting` (e.g. NULL) is treated as "off"
+
+      # Standing tree-search references (always shown)
+      searchRefs <- list(
+        cites$SmithSearch,
+        cites$Goloboff1999,
+        cites$Nixon1999,
+        cites$Brazeau2019,
+        cites$Morphy
+      )
+      # IW / XPIWE: add Goloboff 1993
+      if (wt %in% c("on", "xpiwe")) {
+        searchRefs <- c(searchRefs, list(cites$Goloboff1993))
+      }
+      # XPIWE only: add Goloboff 2014
+      if (identical(wt, "xpiwe")) {
+        searchRefs <- c(searchRefs, list(cites$Goloboff2014))
+      }
+
+      tagList(
+        tags$h2("References for methods used"),
+        tags$h3("Tree search"),
+        HTML(paste0(searchRefs, collapse = "")),
+        tags$h3("Tree space mapping"),
+        HTML(paste0(cites$Gower1966, cites$Gower1969, cites$Kaski2003,
+                    cites$RCoreTeam, cites$SmithDist, cites$Smith2020,
+                    cites$SmithSpace, cites$Venna2001)),
+        tags$h3("Clustering"),
+        HTML(paste("Cluster consensus trees:", cites$Stockham2002)),
+        HTML(paste0(
+          "k-means++:", cites$Arthur2007, cites$Hartigan1979,
+          "Partitioning around medoids:", cites$Maechler2019,
+          "Hierarchical, minimax linkage:", cites$Bien2011, cites$Murtagh1983,
+          "Clustering evaluation:", cites$Rousseeuw1987
+        )),
+        tags$h3("Rogue taxa"),
+        HTML(paste("Detection:", cites$SmithRogue)),
+        HTML(paste("Plotting:", cites$Klopfstein2019)),
+        HTML(paste("Character analysis:", cites$Pol2009))  # last argument: no trailing comma, so no empty argument is passed
+      )
+    })
+  })
+}
diff --git a/inst/Parsimony/server/mod_search.R b/inst/Parsimony/server/mod_search.R
new file mode 100644
index 000000000..a2eb5c29b
--- /dev/null
+++ b/inst/Parsimony/server/mod_search.R
@@ -0,0 +1,1184 @@
+# Module: Search
+#
+# Owns: searchTask (ExtendedTask), StartSearch(), result observer, search
+#   config modal, scoring, and weighting logic.
+#
+# Owns inputs: go, modalGo, searchConfig, strategy, maxReplicates,
+#   targetHits, timeout, epsilon, searchWithout, implied.weights, concavity,
+#   nThreads, inapplicable, hsjAlpha.
+#
+# Reactive args:
+#   r              AppState reactiveValues
+#   AnyTrees       reactive (from trees.R)
+#   HaveData       reactive (from trees.R)
+#   UpdateAllTrees function (from trees.R)
+#   log_fns        named list: LogMsg, LogCode, LogComment
+#
+# Returns a list of reactives/functions consumed by other server files:
+#   scores, concavity, DisplayTreeScores
+
+# ---------------------------------------------------------------------------
+# UI — returns a named list so scattered elements can be placed individually
+# in ui.R (same pattern as downloads_ui).
+# --------------------------------------------------------------------------- +search_ui <- function(id) { + ns <- NS(id) + list( + label = tags$label("Search", class = "control-label", + style = "display: block; margin-top: -15px;"), + config = actionButton(ns("searchConfig"), "Configure", + icon = Icon("gears")), + go = hidden(actionButton(ns("go"), "Search", + icon = Icon("magnifying-glass"))), + cancel = hidden(actionButton(ns("cancel"), "Stop", + icon = Icon("circle-stop"), + class = "btn-danger btn-sm", + style = "margin-left: 4px;")), + results = htmlOutput(ns("results")) + ) +} + +# --------------------------------------------------------------------------- +# Server +# --------------------------------------------------------------------------- +search_server <- function(id, r, AnyTrees, HaveData, UpdateAllTrees, log_fns) { + moduleServer(id, function(input, output, session) { + ns <- session$ns + + # Unpack logging functions + LogMsg <- log_fns$LogMsg + LogCode <- log_fns$LogCode + LogComment <- log_fns$LogComment + + ########################################################################## + # Local helpers + ########################################################################## + + DatasetTips <- reactive(names(r$dataset)) + SearchTips <- reactive(setdiff(DatasetTips(), r$searchWithout)) + + # Adaptive note under the targetHits slider (shown inside config modal) + output$targetHitsNote <- renderUI({ + N <- input$targetHits + if (is.null(N) || N < 1L) return(NULL) + # Worst-case miss probability: lim_{R->inf} (1 - N/R)^R = exp(-N) + helpText(title = paste0( + "Theoretical worst-case: exp(-", N, + "). After searching, the results panel uses actual hit counts." 
+ ), + paste0("Probability of missing best score: ", + FormatMissProb(exp(-N)))) + }) + + ########################################################################## + # Weighting / concavity + ########################################################################## + + weighting <- reactive( + if (length(input$implied.weights) > 0) { + input$implied.weights + } else { + "xpiwe" + } + ) + + wtType <- reactive(switch(weighting(), + "xpiwe" = paste0("k = ", signif(concavity(), 3)), + "on" = paste0("k = ", signif(concavity(), 3)), + "off" = "EW", + "prof" = "PP")) + + concavity <- reactive({ + kExp <- if (length(input$concavity)) input$concavity else 1 + switch(weighting(), + "xpiwe" = 10 ^ kExp, + "on" = 10 ^ kExp, + "off" = Inf, + "prof" = "profile") + }) + + # Whether to apply extended implied weighting (missing-entries correction) + extendedIw <- reactive(identical(weighting(), "xpiwe")) + + tolerance <- reactive({ + if (input$epsilon == 0) { + sqrt(.Machine$double.eps) + } else { + input$epsilon + } + }) + + # Show/hide concavity slider when weighting mode changes + observeEvent(input$implied.weights, { + switch(input$implied.weights, + "xpiwe" = , "on" = show("concavity"), + hide("concavity") + ) + # Weighting mode changed: old run counts no longer apply; keep trees + r$searchTotalHits <- 0L + r$searchTotalReps <- 0L + r$searchReplicateScores <- numeric(0) + r$bestSearchScore <- NULL + r$searchLastImprovedRep <- NULL + r$searchConsensusStable <- FALSE + r$searchTimedOut <- FALSE + DisplayTreeScores() + }) + + observeEvent(input$concavity, { + # Concavity constant changed: old run counts no longer apply; keep trees + r$searchTotalHits <- 0L + r$searchTotalReps <- 0L + r$searchReplicateScores <- numeric(0) + r$bestSearchScore <- NULL + r$searchLastImprovedRep <- NULL + r$searchConsensusStable <- FALSE + r$searchTimedOut <- FALSE + DisplayTreeScores() + }, ignoreInit = TRUE) + + # Show/hide hsjAlpha input when inapplicable method changes + 
+    observeEvent(input$inapplicable, {
+      # The alpha control is only relevant while the "hsj" method is selected
+      if (identical(input$inapplicable, "hsj")) show("hsjAlpha") else hide("hsjAlpha")
+    }, ignoreInit = TRUE)
+
+    # Help text in the config modal reporting the outcome of hierarchy
+    # auto-detection. Renders nothing for the default "bgs" method.
+    output$hierarchyInfo <- renderUI({
+      method <- input$inapplicable
+      if (is.null(method) || identical(method, "bgs")) return(NULL)
+
+      charNames <- r$chars
+      if (is.null(charNames) || length(charNames) == 0L) {
+        return(helpText(
+          "No character names available for hierarchy auto-detection."
+        ))
+      }
+
+      # Warnings are muffled; an error is treated as "no hierarchy detected"
+      detected <- tryCatch(
+        withCallingHandlers(
+          hierarchy_from_names(charNames),
+          warning = function(w) invokeRestart("muffleWarning")
+        ),
+        error = function(e) NULL
+      )
+
+      if (!is.null(detected)) {
+        nBlocks <- length(detected)
+        nCovered <- length(hierarchy_chars(detected))
+        return(helpText(paste0(
+          "Detected ", nBlocks, " hierarchy block(s) covering ",
+          nCovered, " character(s)."
+        )))
+      }
+
+      helpText(HTML(paste0(
+        "No hierarchy detected. Character names must follow the convention ",
+        "sup_tag (primary) and ",
+        "sub_tag[_suffix] (secondary); see ",
+        "?hierarchy_from_names."
+      )))
+    })
+
+    ##########################################################################
+    # Async profile data preparation (with progress + cancel)
+    ##########################################################################
+
+    # State shared by the profile-preparation task and its observers
+    profileDataset <- reactiveVal(NULL)
+    profileDataHash <- reactiveVal(NULL)
+    profileNotification <- reactiveVal(NULL)
+    profileProgressFile <- reactiveVal(NULL)
+    profileCancelFile <- reactiveVal(NULL)
+
+    # Inlines PrepareDataProfile() logic so the slow StepInformation loop
+    # can report per-pattern progress and check a cancel file.
+    # Mirrors R/data_manipulation.R::PrepareDataProfile(); keep in sync.
+ profilePrepTask <- ExtendedTask$new( + function(dataset, progressPath, cancelPath) { + future::future({ + if ("info.amounts" %in% names(attributes(dataset))) { + return(dataset) + } + + at <- attributes(dataset) + cont <- attr(dataset, "contrast") + nTip <- length(dataset) + index <- at[["index"]] + allLevels <- as.character(at[["allLevels"]]) + + contSums <- rowSums(cont) + qmLevel <- which(contSums == ncol(cont)) + + if (length(qmLevel) == 0) { + attr(dataset, "contrast") <- rbind(attr(dataset, "contrast"), 1) + attr(dataset, "allLevels") <- c(attr(dataset, "allLevels"), "{?}") + qmLevel <- length(allLevels) + 1L + } + + ambigs <- which(contSums > 1L & contSums < ncol(cont)) + inappLevel <- which(colnames(cont) == "-") + if (length(inappLevel) != 0L) { + inappLevel <- which(apply(unname(cont), 1, identical, + as.double(colnames(cont) == "-"))) + dataset[] <- lapply(dataset, function(i) { + i[i %in% inappLevel] <- qmLevel + i + }) + } + + if (length(ambigs) != 0L) { + dataset[] <- lapply(dataset, function(i) { + i[i %in% ambigs] <- qmLevel + i + }) + } + + nPattern <- max(index) + mataset <- matrix( + unlist(dataset, recursive = FALSE, use.names = FALSE), nPattern + ) + mataset <- t(mataset) + + maxInformative <- 0L + + for (j in seq_len(ncol(mataset))) { + col <- mataset[, j] + nonAmbig <- col[col != qmLevel[1]] + if (length(nonAmbig) == 0L) next + + tab <- table(nonAmbig) + informative <- tab > 1L + nInf <- sum(informative) + + singletonTokens <- as.integer(names(tab[!informative])) + if (length(singletonTokens) > 0L) { + mataset[mataset[, j] %in% singletonTokens, j] <- qmLevel[1] + } + + maxInformative <- max(maxInformative, nInf) + } + + if (maxInformative < 2L) { + attr(dataset, "info.amounts") <- double(0) + return(dataset[0]) + } + + AMBIG_TOKEN <- maxInformative + 1L + + for (j in seq_len(ncol(mataset))) { + col <- mataset[, j] + nonAmbig <- sort(unique(col[col != qmLevel[1]])) + newCol <- rep(AMBIG_TOKEN, length(col)) + for (i in seq_along(nonAmbig)) { 
+ newCol[col == nonAmbig[i]] <- i + } + mataset[, j] <- newCol + } + + dupCols <- duplicated(t(mataset)) + kept <- which(!dupCols) + copies <- lapply(kept, function(i) { + i + which(apply( + mataset[, -seq_len(i), drop = FALSE], 2, identical, mataset[, i] + )) + }) + firstOccurrence <- seq_len(dim(mataset)[2]) + for (i in seq_along(copies)) { + firstOccurrence[copies[[i]]] <- kept[i] + } + + cipher <- seq_len(max(kept)) + cipher[kept] <- order(kept) + index <- cipher[firstOccurrence][index] + + mataset <- mataset[, !dupCols, drop = FALSE] + dataset[] <- lapply( + seq_len(length(dataset)), function(i) mataset[i, ] + ) + + # --- Slow part: StepInformation per unique pattern --- + nPatterns <- ncol(mataset) + info <- vector("list", nPatterns) + + for (i in seq_len(nPatterns)) { + if (file.exists(cancelPath)) return(NULL) + + info[[i]] <- TreeSearch::StepInformation( + mataset[, i], ambiguousTokens = AMBIG_TOKEN + ) + + writeLines(paste(i, nPatterns), progressPath) + } + + if (file.exists(cancelPath)) return(NULL) + + maxSteps <- max(vapply( + info, function(x) max(as.integer(names(x))), integer(1) + )) + info <- vapply(info, function(x) { + ret <- setNames(double(maxSteps), seq_len(maxSteps)) + x <- x[setdiff(names(x), "0")] + if (length(x)) { + ret[names(x)] <- max(x) - x + } + ret + }, double(maxSteps)) + if (is.null(dim(info))) { + dim(info) <- c(1L, length(info)) + } + attr(dataset, "index") <- index + weight <- as.integer(table(index)) + attr(dataset, "weight") <- weight + attr(dataset, "nr") <- length(weight) + attr(dataset, "info.amounts") <- info + attr(dataset, "informative") <- colSums(info) > 0 + + k <- maxInformative + lvls <- as.character(seq_len(k)) + contMatrix <- rbind(diag(k), rep(1L, k)) + dimnames(contMatrix) <- list(NULL, lvls) + + attr(dataset, "levels") <- lvls + attr(dataset, "allLevels") <- c(lvls, "?") + attr(dataset, "contrast") <- contMatrix + attr(dataset, "nc") <- as.integer(k) + + if (!any(attr(dataset, "bootstrap") == "info.amounts")) { 
+ attr(dataset, "bootstrap") <- c( + attr(dataset, "bootstrap"), "info.amounts" + ) + } + + dataset + }, seed = TRUE) + } + ) + + # Helper: start async profile data preparation. Called from + # StartSearch() when the user requests profile scoring and data + # hasn't been prepared yet. NOT triggered eagerly on mode change — + # deferred until the user actually starts a search. + startProfilePrep <- function(dataset) { + # Cancel any in-flight prep first. + cf <- profileCancelFile() + if (!is.null(cf) && !file.exists(cf)) { + file.create(cf) + } + status <- tryCatch(profilePrepTask$status(), error = function(e) "initial") + if (status == "running") return(FALSE) + + profileDataset(NULL) + LogMsg("Starting async profile data preparation") + + progPath <- tempfile("ts_profile_prog_", fileext = ".txt") + cancPath <- tempfile("ts_profile_cancel_", fileext = ".signal") + profileProgressFile(progPath) + profileCancelFile(cancPath) + + nid <- showNotification("Preparing profile scores\u2026", + duration = NULL, type = "message") + profileNotification(nid) + profilePrepTask$invoke(dataset, progPath, cancPath) + TRUE + } + + # Poll progress file and update notification while profile prep runs + observe({ + progFile <- profileProgressFile() + nid <- profileNotification() + if (is.null(progFile) || is.null(nid)) return() + invalidateLater(500) + progress <- tryCatch( + readLines(progFile, warn = FALSE), error = function(e) NULL + ) + if (is.null(progress) || length(progress) == 0L || !nzchar(progress[1])) { + return() + } + parts <- strsplit(trimws(progress[1]), "\\s+")[[1]] + if (length(parts) != 2L) return() + current <- suppressWarnings(as.integer(parts[1])) + total <- suppressWarnings(as.integer(parts[2])) + if (is.na(current) || is.na(total) || total < 1L) return() + pct <- round(100 * current / total) + showNotification( + id = nid, + paste0("Preparing profile scores\u2026 ", current, "/", total, + " patterns (", pct, "%)"), + duration = NULL, type = "message" + ) + }) + 
+ # Process profile preparation result + observe({ + result <- tryCatch( + profilePrepTask$result(), + error = function(e) { + if (inherits(e, "shiny.silent.error")) stop(e) + LogMsg("Profile data preparation failed: ", conditionMessage(e)) + NULL + } + ) + isolate({ + nid <- profileNotification() + if (!is.null(nid)) { + removeNotification(nid) + profileNotification(NULL) + } + # Clean up temp files + pf <- profileProgressFile() + if (!is.null(pf)) { + suppressWarnings(file.remove(pf)) + profileProgressFile(NULL) + } + cf <- profileCancelFile() + if (!is.null(cf)) { + suppressWarnings(file.remove(cf)) + profileCancelFile(NULL) + } + if (!is.null(result)) { + profileDataset(result) + profileDataHash(r$dataHash) + # Auto-start the search that was deferred for profile preparation. + # StartSearch() will see that profileDataHash matches and proceed + # directly to the search without re-preparing. + StartSearch() + } else { + DisplayTreeScores() + } + }) + }) + + # Cancel profile prep if user switches away from profile mode + observe({ + if (!identical(concavity(), "profile")) { + nid <- profileNotification() + if (!is.null(nid)) { + removeNotification(nid) + profileNotification(NULL) + } + cf <- profileCancelFile() + if (!is.null(cf) && !file.exists(cf)) { + file.create(cf) + } + } + }) + + ########################################################################## + # Scores + ########################################################################## + + scores <- reactive({ + if (!HaveData() || !AnyTrees()) { + return(NULL) + } + conc <- concavity() + ds <- if (identical(conc, "profile")) { + pd <- profileDataset() + if (is.null(pd)) return(NULL) + pd + } else { + r$dataset + } + PutTree(r$trees) + PutData(ds) + useXpiwe <- extendedIw() + LogMsg("scores(): Recalculating scores with k = ", conc, + if (useXpiwe) " (extended)") + tryCatch( + signif(TreeLength( + RootTree(r$trees, 1), + ds, + concavity = conc, + extended_iw = useXpiwe + )), + error = function (x) { + if 
(HaveData() && AnyTrees()) { + cli::cli_alert(x[[2]]) + cli::cli_alert_danger(x[[1]]) + Notification(type = "error", + "Could not score all trees with dataset") + } + NULL + }) + }) + + ########################################################################## + # DisplayTreeScores + ########################################################################## + + DisplayTreeScores <- function () { + # Don't overwrite "Searching..." indicator while a search is running. + # Guard on both fields: searchNotification can be NULL if the + # notification was dismissed externally, but searchInProgress is the + # authoritative flag. + if (!is.null(r$searchNotification) || isTRUE(r$searchInProgress)) return(invisible()) + LogMsg("DisplayTreeScores()") + treeScores <- scores() + score <- if (is.null(treeScores) && identical(concavity(), "profile") && + is.null(profileDataset()) && HaveData() && AnyTrees()) { + "; profile scores available after search" + } else if (is.null(treeScores)) { + "; could not be scored from dataset" + } else if (length(unique(treeScores)) == 1) { + paste0(", each with score ", treeScores[1], " (", wtType(), ")") + } else { + paste0(" with scores ", min(treeScores), " to ", max(treeScores), + " (", wtType(), ")") + } + + msg <- paste0( + length(r$allTrees), " trees in memory: ", + length(r$trees), " sampled", + score + ) + stopReason <- if (isTRUE(r$searchConsensusStable)) { + "consensus" + } else if (isTRUE(r$searchTimedOut)) { + "timeout" + } + confText <- SearchConfidenceText(r$searchTotalHits, r$searchTotalReps, + r$searchCount, + nTopologies = length(r$allTrees), + lastImprovedRep = r$searchLastImprovedRep, + stopReason = stopReason, + replicateScores = r$searchReplicateScores) + html <- if (!is.null(confText)) { + nS <- r$searchCount + tooltip <- paste0( + "Estimated as (1 - K/R)^R where K = ", + r$searchTotalHits, " and R = ", r$searchTotalReps, + " (runs hitting best score", + if (!is.null(nS) && nS > 1L) + paste0(" across ", nS, " searches") + 
else + "", + "). Falls back to exp(-K) when K = R. ", + "Assumes independent runs. ", + "'Maximum independent runs' limits each individual search; ", + "this tally accumulates across all continued searches." + ) + paste0(msg, "
", confText, "") + } else { + msg + } + output$results <- renderUI(HTML(html)) + invisible(msg) + } + + ########################################################################## + # ExtendedTask for async search + ########################################################################## + + # Cancel file path — created before each search, deleted on completion. + # The C++ engine checks for this file's existence every ~200ms and stops + # gracefully if it appears. + cancelFile <- reactiveVal(NULL) + # Progress file path — C++ callback writes per-replicate status here; + # polled by an invalidateLater observer to update the notification. + progressFile <- reactiveVal(NULL) + + searchTask <- ExtendedTask$new( + function(dataset, tree, concavity, extendedIw, strategy, + maxReplicates, targetHits, maxSeconds, poolSuboptimal, + nThreads, cancelPath, progressPath, + hierarchy, inapplicable, hsjAlpha) { + future::future({ + on.exit({ + Sys.unsetenv("TREESEARCH_CANCEL_FILE") + Sys.unsetenv("TREESEARCH_PROGRESS_FILE") + }) + if (nzchar(cancelPath)) { + Sys.setenv(TREESEARCH_CANCEL_FILE = cancelPath) + } + if (nzchar(progressPath)) { + Sys.setenv(TREESEARCH_PROGRESS_FILE = progressPath) + } + args <- list( + dataset, + tree = tree, + concavity = concavity, + extended_iw = extendedIw, + strategy = strategy, + maxReplicates = maxReplicates, + targetHits = targetHits, + maxSeconds = maxSeconds, + nThreads = nThreads, + verbosity = 0L + ) + # Only pass control when non-default, so strategy presets apply + if (poolSuboptimal > 0) { + args$control <- TreeSearch::SearchControl( + poolSuboptimal = poolSuboptimal + ) + } + # Inapplicable handling (non-Brazeau requires hierarchy) + if (!is.null(hierarchy) && !identical(inapplicable, "bgs")) { + args$hierarchy <- hierarchy + args$inapplicable <- inapplicable + if (identical(inapplicable, "hsj")) { + args$hsj_alpha <- hsjAlpha + } + } + do.call(TreeSearch::MaximizeParsimony, args) + }, seed = TRUE) + } + ) + + 
########################################################################## + # StartSearch + ########################################################################## + + StartSearch <- function () { + if (!HaveData()) { + Notification("No data loaded", type = "error") + return(invisible()) + } + + # Profile mode: defer search until profile data is prepared + if (identical(concavity(), "profile") && + !identical(r$dataHash, profileDataHash())) { + startProfilePrep(r$dataset) + return(invisible()) + } + + # Read search parameters early (before any slow prep) + searchStrategy <- if (length(input$strategy)) input$strategy else "auto" + searchMaxRep <- if (length(input$maxReplicates)) { + as.integer(input$maxReplicates) + } else { + 96L + } + searchTargetHits <- if (length(input$targetHits)) { + as.integer(input$targetHits) + } else { + 10L + } + searchMaxSeconds <- if (length(input$timeout)) { + as.double(input$timeout) * 60 + } else { + 0 + } + searchPoolSub <- if (length(input$epsilon) && input$epsilon > 0) { + tolerance() + } else { + 0 + } + searchNThreads <- if (length(input$nThreads)) as.integer(input$nThreads) else 1L + + # Inapplicable handling + searchInapplicable <- if (length(input$inapplicable)) input$inapplicable else "bgs" + searchHsjAlpha <- if (length(input$hsjAlpha)) as.double(input$hsjAlpha) else 1.0 + searchHierarchy <- if (!identical(searchInapplicable, "bgs") && + !is.null(r$chars) && length(r$chars) > 0L) { + tryCatch( + withCallingHandlers( + hierarchy_from_names(r$chars), + warning = function(w) invokeRestart("muffleWarning") + ), + error = function(e) NULL + ) + } else { + NULL + } + + # Non-Brazeau methods require a detected hierarchy; abort early + if (!identical(searchInapplicable, "bgs") && is.null(searchHierarchy)) { + methodLabel <- switch(searchInapplicable, + hsj = "Hopkins & St. 
John (HSJ)", + xform = "X-transformation (Goloboff)", + searchInapplicable) + Notification( + paste0( + "The \u201c", methodLabel, "\u201d method requires a character ", + "hierarchy. Ensure character names follow the sup_ / ", + "sub_ convention (see ?hierarchy_from_names)." + ), + type = "error", duration = 10 + ) + return(invisible()) + } + + # Show search-in-progress indicator BEFORE tree selection (which may + # call AdditionTree synchronously). The guard in DisplayTreeScores() + # checks r$searchNotification to avoid overwriting this indicator. + disable("go") + disable("modalGo") + disable("searchConfig") + shinyjs::show("cancel") + # Create unique temp file paths for cancel + progress signaling + cancelPath <- tempfile("ts_cancel_", fileext = ".signal") + cancelFile(cancelPath) + progressPath <- tempfile("ts_progress_", fileext = ".txt") + progressFile(progressPath) + searchLabel <- paste0( + "Searching (", searchMaxRep, " runs, ", wtType(), + if (searchNThreads > 1L) paste0(", ", searchNThreads, " threads") else "", + ")\u2026" + ) + r$searchNotification <- showNotification( + searchLabel, duration = NULL, type = "message", closeButton = FALSE + ) + r$searchDataHash <- r$dataHash + r$searchInProgress <- TRUE + output$results <- renderUI(HTML(searchLabel)) + + startTree <- tryCatch({ + if (!AnyTrees()) { + LogComment("Select starting tree") + LogCode(paste0("startTree <- AdditionTree(dataset, concavity = ", + Enquote(concavity()), ")")) + AdditionTree(r$dataset[SearchTips()], concavity = concavity()) + } else { + LogComment("Select starting tree") + treeLabels <- TipLabels(r$trees[[1]]) + if (all(SearchTips() %in% treeLabels)) { + if (length(setdiff(treeLabels, SearchTips())) > 0) { + if (length(r$searchWithout)) { + LogCode(paste0( + "searchTips <- setdiff(names(dataset), ", EnC(r$searchWithout), + ")"), + "startTree <- KeepTip(trees[[1]], searchTips)") + } else { + LogCode("startTree <- KeepTip(trees[[1]], names(dataset))") + } + KeepTip(r$trees[[1]], 
SearchTips()) + } else { + sc <- scores() + firstOptimal <- if (length(sc)) which.min(sc) else 1L + LogCode(paste0("startTree <- trees[[", firstOptimal, "]]", + " # First tree with optimal score")) + r$trees[[firstOptimal]] + } + } else { + # Fuzzy-match labels + matching <- TreeDist::LAPJV(adist(treeLabels, SearchTips()))$matching + scaffold <- KeepTip(r$trees[[1]], !is.na(matching)) + scaffold[["tip.label"]] <- SearchTips()[matching[!is.na(matching)]] + AdditionTree(r$dataset, concavity = concavity(), + constraint = scaffold) + } + } + }, error = function(e) { + LogMsg("Starting tree error: ", conditionMessage(e), "; using fresh tree") + LogCode(paste0("startTree <- AdditionTree(dataset, concavity = ", + Enquote(concavity()), ")")) + AdditionTree(r$dataset[SearchTips()], concavity = concavity()) + }) + LogMsg("StartSearch()") + PutData(r$dataset[SearchTips()]) + PutTree(startTree) + # Snapshot reactive values for the async task + searchDataset <- r$dataset[SearchTips()] + searchConcavity <- concavity() + searchExtendedIw <- extendedIw() + LogComment("Search for optimal trees", 1) + LogCode(c( + "newTrees <- MaximizeParsimony(", + if (length(r$searchWithout)) { + paste0( + " dataset[setdiff(names(dataset), ", EnC(r$searchWithout), ")]," + ) + } else { + " dataset," + }, + " tree = startTree,", + paste0(" concavity = ", Enquote(concavity()), ","), + if (!searchExtendedIw && is.finite(searchConcavity)) + " extended_iw = FALSE,", + paste0(" strategy = \"", searchStrategy, "\","), + paste0(" maxReplicates = ", searchMaxRep, ","), + paste0(" targetHits = ", searchTargetHits, ","), + if (searchMaxSeconds > 0) + paste0(" maxSeconds = ", searchMaxSeconds, ","), + if (searchPoolSub > 0) + paste0(" control = SearchControl(poolSuboptimal = ", searchPoolSub, "),"), + if (searchNThreads > 1L) + paste0(" nThreads = ", searchNThreads, "L,"), + if (!identical(searchInapplicable, "bgs") && !is.null(searchHierarchy)) + paste0(" inapplicable = \"", searchInapplicable, "\","), + if 
(identical(searchInapplicable, "hsj") && !is.null(searchHierarchy) && + searchHsjAlpha != 1.0) + paste0(" hsj_alpha = ", searchHsjAlpha, ","), + " verbosity = 0", + ")")) + + searchTask$invoke( + searchDataset, startTree, searchConcavity, searchExtendedIw, + searchStrategy, searchMaxRep, searchTargetHits, + searchMaxSeconds, searchPoolSub, searchNThreads, + cancelPath, progressPath, + searchHierarchy, searchInapplicable, searchHsjAlpha + ) + } + + ########################################################################## + # Input observers + ########################################################################## + + observeEvent(input$searchWithout, { + r$searchWithout <- input$searchWithout + }, ignoreInit = TRUE) + + observeEvent(input$go, StartSearch()) + observeEvent(input$modalGo, { + removeModal() + StartSearch() + }) + + # Cancel button: create the signal file so the C++ engine stops + observeEvent(input$cancel, { + cf <- cancelFile() + if (!is.null(cf)) { + file.create(cf) + shinyjs::hide("cancel") + # Remove search notification immediately so it doesn't linger + if (!is.null(r$searchNotification)) { + removeNotification(r$searchNotification) + r$searchNotification <- NULL + } + output$results <- renderUI(HTML( + "Stopping \u2014 waiting for current search phase to finish\u2026" + )) + } + }) + + # Poll progress file during search to update notification + observe({ + pf <- progressFile() + nid <- r$searchNotification + if (is.null(pf) || is.null(nid) || !isTRUE(r$searchInProgress)) return() + invalidateLater(500) + if (!file.exists(pf)) return() # C++ hasn't written first status yet + progress <- tryCatch( + readLines(pf, warn = FALSE), + error = function(e) NULL + ) + if (is.null(progress) || length(progress) == 0L || + !nzchar(progress[[1L]])) return() + parts <- strsplit(progress[[1L]], " ", fixed = TRUE)[[1L]] + if (length(parts) < 5L) return() + rep_cur <- parts[1L] + rep_max <- parts[2L] + best <- parts[3L] + hits <- parts[4L] + target <- 
parts[5L] + msg <- paste0( + "Searching\u2026 Rep ", rep_cur, "/", rep_max, + " | Best: ", best, + " | Hits: ", hits, "/", target + ) + # Update both the results area and the toast (belt-and-suspenders: if + # DisplayTreeScores() was called and overwrote output$results, the next + # poll restores the progress message within 500 ms). + output$results <- renderUI(HTML(msg)) + showNotification(msg, id = nid, duration = NULL, + type = "message", closeButton = FALSE) + }) + + ########################################################################## + # Search config modal + ########################################################################## + + observeEvent(input$searchConfig, { + nCores <- max(1L, parallel::detectCores(logical = FALSE), na.rm = TRUE) + updateSelectInput(session, "implied.weights", + selected = input$implied.weights) + updateSliderInput(session, "concavity", value = input$concavity) + updateNumericInput(session, "epsilon", value = input$epsilon) + updateSelectInput(session, "strategy", selected = input$strategy) + updateSliderInput(session, "maxReplicates", value = input$maxReplicates) + updateSliderInput(session, "targetHits", value = input$targetHits) + updateSliderInput(session, "timeout", value = input$timeout) + if (nCores > 1L) { + updateSliderInput(session, "nThreads", value = input$nThreads) + } + # Sync inapplicable selector and show/hide hsjAlpha accordingly + inapplicable_cur <- if (length(input$inapplicable)) input$inapplicable else "bgs" + updateSelectInput(session, "inapplicable", selected = inapplicable_cur) + updateNumericInput(session, "hsjAlpha", + value = if (length(input$hsjAlpha)) input$hsjAlpha else 1.0) + if (identical(inapplicable_cur, "hsj")) show("hsjAlpha") else hide("hsjAlpha") + # Initialise all modal inputs from current values so that opening the + # modal does not fire observeEvent(input$concavity) or + # observeEvent(input$implied.weights), which reset the run counters. 
+ cur_weights <- if (length(input$implied.weights)) input$implied.weights else "xpiwe" + cur_concavity <- if (length(input$concavity)) input$concavity else 1L + cur_strategy <- if (length(input$strategy)) input$strategy else "auto" + cur_maxRep <- if (length(input$maxReplicates)) input$maxReplicates else 96L + cur_hits <- if (length(input$targetHits)) input$targetHits else 10L + cur_timeout <- if (length(input$timeout)) input$timeout else 5 + cur_epsilon <- if (length(input$epsilon)) input$epsilon else 0 + cur_threads <- if (length(input$nThreads)) input$nThreads else max(1L, floor(nCores / 2L)) + # Concavity slider should start hidden unless weighting mode uses it + concavityInput <- sliderInput(ns("concavity"), "Concavity constant", + min = 0L, max = 3L, pre = "10^", + value = cur_concavity) + if (!cur_weights %in% c("xpiwe", "on")) { + concavityInput <- hidden(concavityInput) + } + showModal(modalDialog( + easyClose = TRUE, + fluidPage(column(6, + selectInput(ns("implied.weights"), "Step weighting", + list("Implied (extended)" = "xpiwe", + "Implied" = "on", "Profile" = "prof", + "Equal" = "off"), cur_weights), + concavityInput, + selectInput(ns("inapplicable"), "Inapplicable characters", + list("Brazeau et al. (default)" = "bgs", + "Hopkins & St. 
John (HSJ)" = "hsj", + "X-transformation (Goloboff)" = "xform"), + inapplicable_cur), + hidden(numericInput(ns("hsjAlpha"), "HSJ \u03b1 parameter", + value = if (length(input$hsjAlpha)) input$hsjAlpha else 1.0, + min = 0, step = 0.1)), + uiOutput(ns("hierarchyInfo")), + if (nCores > 1L) { + sliderInput(ns("nThreads"), "Parallel search threads", + min = 1L, max = nCores, + value = cur_threads, + step = 1L) + }, + selectizeInput(ns("searchWithout"), "Exclude taxa", DatasetTips(), + r$searchWithout, multiple = TRUE), + numericInput(ns("epsilon"), "Keep if suboptimal by \u2264", min = 0, + value = cur_epsilon) + ), column(6, + selectInput(ns("strategy"), "Search strategy", + list("Auto" = "auto", "Sprint" = "sprint", + "Default" = "default", "Thorough" = "thorough"), + cur_strategy), + sliderInput(ns("targetHits"), + "Stop when N runs have hit best score", + min = 1L, max = 50L, value = cur_hits, step = 1L), + uiOutput(ns("targetHitsNote")), + sliderInput(ns("timeout"), "Maximum run duration", min = 1, + max = 60, value = cur_timeout, post = "min", step = 1), + sliderInput(ns("maxReplicates"), "Maximum independent runs", + min = 48L, max = 960L, value = cur_maxRep, step = 48L), + helpText("Limits each individual search. Clicking \u2018Continue\u2019", + "starts a fresh search; the results panel shows the", + "cumulative total across all continued searches.") + )), + title = "Tree search settings", + footer = tagList(modalButton("Close", icon = Icon("rectangle-xmark")), + actionButton(ns("modalGo"), icon = Icon("magnifying-glass"), + if(length(r$trees)) { + "Continue search" + } else { + "Start search" + })) + )) + show("go") + }) + + ########################################################################## + # Async search result observer + ########################################################################## + + # Only searchTask$result() should be a reactive dependency; + # isolate everything else to prevent reactive cascade re-runs. 
+ observe({ + # Use a single `error` handler rather than separate `shiny.silent.error` + # + `error` handlers. With two handlers, `req(FALSE)` thrown inside the + # `shiny.silent.error` handler is caught by the sibling `error` handler + # (R's tryCatch does not fully unwind before sibling handlers), causing + # the isolate block below to run prematurely (notification removed, + # cancel hidden) while the search task is still running. + newTrees <- tryCatch( + searchTask$result(), + error = function(e) { + if (inherits(e, "shiny.silent.error")) { + # ExtendedTask signals shiny.silent.error when status is "initial" + # or "running". Re-throw so Shiny's observer wrapper terminates + # this cycle cleanly; the observer will re-fire on task completion. + stop(e) + } + msg <- conditionMessage(e) + if (nzchar(msg)) { + Notification(paste("Search error:", msg), type = "error") + } + NULL + } + ) + isolate({ + # Clean up search-in-progress UI state. Gate on searchInProgress + # (not searchNotification) because the cancel observer may have + # already dismissed the notification. + if (isTRUE(r$searchInProgress)) { + if (!is.null(r$searchNotification)) { + removeNotification(r$searchNotification) + r$searchNotification <- NULL + } + enable("go") + enable("modalGo") + enable("searchConfig") + shinyjs::hide("cancel") + cf <- cancelFile() + if (!is.null(cf)) { + suppressWarnings(file.remove(cf)) + cancelFile(NULL) + } + pf <- progressFile() + if (!is.null(pf)) { + suppressWarnings(file.remove(pf)) + progressFile(NULL) + } + r$searchInProgress <- FALSE + } + + if (is.null(newTrees)) { + DisplayTreeScores() + return() + } + if (!identical(r$dataHash, r$searchDataHash)) { + Notification("Dataset changed during search; results discarded.", + type = "warning") + DisplayTreeScores() + return() + } + + r$sortTrees <- TRUE + + # Accumulate trees across searches: if the new result matches the + # current best score, merge with existing trees (dedup by topology). 
+ newScore <- attr(newTrees, "score") + newHitsRaw <- attr(newTrees, "hits_to_best") + newRepsRaw <- attr(newTrees, "replicates") + newHits <- if (is.null(newHitsRaw)) 0L else as.integer(newHitsRaw) + newReps <- if (is.null(newRepsRaw)) 0L else as.integer(newRepsRaw) + newLastImp <- attr(newTrees, "last_improved_rep") + r$searchConsensusStable <- isTRUE(attr(newTrees, "consensus_stable")) + r$searchTimedOut <- isTRUE(attr(newTrees, "timed_out")) + prevCount <- length(r$allTrees) + treesToStore <- if ( + !is.null(newScore) && !is.null(r$bestSearchScore) && + isTRUE(abs(newScore - r$bestSearchScore) < sqrt(.Machine$double.eps)) && + prevCount > 0L + ) { + LogComment("Same optimal score: accumulating trees across search runs") + r$searchTotalHits <- r$searchTotalHits + newHits + r$searchTotalReps <- r$searchTotalReps + newReps + newRepScores <- attr(newTrees, "replicate_scores") + if (!is.null(newRepScores)) { + r$searchReplicateScores <- c(r$searchReplicateScores, newRepScores) + } + # Keep existing last_improved_rep (new search didn't improve score) + combined <- c(r$allTrees, newTrees) + # Deduplicate by canonical Newick (ladderized topology string) + nwk <- vapply(combined, function(t) { + write.tree(ape::ladderize(t)) + }, character(1L)) + combined <- combined[!duplicated(nwk)] + # Filter out trees exceeding current poolSuboptimal threshold + tol <- tolerance() + if (tol < Inf && length(combined) > 1L) { + conc <- concavity() + ds <- if (identical(conc, "profile")) profileDataset() else r$dataset + if (!is.null(ds)) { + sc <- tryCatch( + TreeLength(RootTree(combined, 1), ds, + concavity = conc, extended_iw = extendedIw()), + error = function(e) NULL + ) + if (!is.null(sc)) { + combined <- combined[sc <= min(sc) + tol + + sqrt(.Machine$double.eps)] + } + } + } + combined + } else { + LogComment("New or improved score: replacing trees") + r$bestSearchScore <- newScore + r$searchTotalHits <- newHits + r$searchTotalReps <- newReps + r$searchLastImprovedRep <- if 
(!is.null(newLastImp) && newLastImp > 0L) { + as.integer(newLastImp) + } else { + NULL + } + newRepScores2 <- attr(newTrees, "replicate_scores") + r$searchReplicateScores <- if (!is.null(newRepScores2)) { + newRepScores2 + } else { + numeric(0) + } + newTrees + } + + UpdateAllTrees(treesToStore) + # Always refresh the display — UpdateAllTrees may short-circuit + # when trees are unchanged, but hit/rep counts have been updated. + DisplayTreeScores() + updateActionButton(session, "go", "Continue") + updateActionButton(session, "modalGo", "Continue search") + shinyjs::show(selector = "#displayConfig") + newCount <- length(r$allTrees) + Notification( + if (newCount > prevCount) + paste0("Search complete \u2014 ", newCount, " trees in pool (+", + newCount - prevCount, " new)") + else + "Search complete", + type = "message", duration = 5 + ) + r$searchCount <- r$searchCount + 1L + }) + }) + + ########################################################################## + # Dataset change: reset search stats + update timeout default + ########################################################################## + + observeEvent(r$dataset, { + r$searchTotalHits <- 0L + r$searchTotalReps <- 0L + r$searchReplicateScores <- numeric(0) + r$bestSearchScore <- NULL + r$searchLastImprovedRep <- NULL + r$searchConsensusStable <- FALSE + r$searchTimedOut <- FALSE + r$searchCount <- 0L + nTip <- length(r$dataset) + nChar <- sum(attr(r$dataset, "weight", exact = TRUE)) + defaultTimeout <- max(1L, min(15L, ceiling(nTip * nChar / 20000L))) + updateSliderInput(session, "timeout", value = defaultTimeout) + }) + + ########################################################################## + # Button label management — react to tree/data state changes + ########################################################################## + + observe({ + hasTrees <- !is.null(r$allTrees) && length(r$allTrees) > 0 + hasData <- !is.null(r$dataset) && length(r$dataset) > 0 + if (!hasData) return() + if 
(hasTrees) { + treeTips <- r$allTrees[[1]]$tip.label + dataTips <- names(r$dataset) + if (length(intersect(dataTips, treeTips)) == length(r$dataset)) { + updateActionButton(session, "go", "Continue") + } else { + updateActionButton(session, "go", "New search") + } + } else { + updateActionButton(session, "go", "Search") + } + }) + + ########################################################################## + # Return values for other server files + ########################################################################## + + list( + scores = scores, + concavity = concavity, + extendedIw = extendedIw, + weighting = weighting, + DisplayTreeScores = DisplayTreeScores + ) + }) +} diff --git a/inst/Parsimony/server/mod_treespace.R b/inst/Parsimony/server/mod_treespace.R new file mode 100644 index 000000000..6507e1f65 --- /dev/null +++ b/inst/Parsimony/server/mod_treespace.R @@ -0,0 +1,776 @@ +# Module: Tree space visualization +# +# Absorbs treespace.R + plotsettings.R. Owns inputs: spaceDim, spaceCol, +# spacePch, relators, mapLines. Reads: r$trees, r$treeHash, clusterings(), +# silThreshold(), scores(), concavity(). Receives top-level distMeth, +# plotFormat, distances, and LogDistances as reactive/function args. 
+# +# Returns a list of reactives consumed by other source'd server files: +# mapping, dims, nProjDim, TreeCols, treePch, +# saveDetails, TreespacePlot, LogTreespacePlot, mstEnds + +treespace_ui <- function(id) { + ns <- NS(id) + tags$div( + id = "spaceConfig", + tags$div(id = "spaceLegend", + style = "float: left;", + plotOutput(outputId = ns("pcQuality"), + height = "72px", width = "240px"), + htmlOutput(ns("stressLegend"), inline = TRUE) + ), + tags$div( + style = "float: right; width: 200px; margin-left: 2em;", + sliderInput(ns("spaceDim"), "Dimensions:", value = 5, + min = 1, max = 12, step = 1, width = 200), + selectInput(ns("spaceCol"), "Colour trees by:", + list("Cluster membership" = "clust", + "Parsimony score" = "score", + "When first found" = "firstHit")), + selectInput(ns("spacePch"), "Plotting symbols:", + selected = "relat", + list("Cluster membership" = "clust", + "Relationships" = "relat", + "Tree index" = "index", + "Tree name" = "name")), + selectizeInput(ns("relators"), "Show relationship between:", + choices = list(), multiple = TRUE), + ), + ) +} + +#' @param id Module namespace id. +#' @param r AppState reactiveValues. +#' @param clusterings Reactive returning clustering result list. +#' @param silThreshold Reactive returning silhouette threshold. +#' @param scores Reactive returning tree scores. +#' @param concavity Reactive returning concavity value. +#' @param distMeth Reactive wrapping top-level \code{input$distMeth}. +#' @param plotFormat Reactive wrapping top-level \code{input$plotFormat}. +#' @param distances Reactive returning tree distance matrix (from clustering +#' module). +#' @param mapLines Reactive wrapping top-level \code{input$mapLines}. +#' @param LogDistances Function that logs distance computation code. +#' @param log_fns Named list of logging functions from logging.R: +#' BeginLogP, LogCommentP, LogCodeP, LogIndent, LogClusterings. 
+treespace_server <- function(id, r, clusterings, silThreshold, scores, + concavity, distMeth, plotFormat, + distances, mapLines, LogDistances, log_fns) { + moduleServer(id, function(input, output, session) { + + # Unpack logging functions + BeginLogP <- log_fns$BeginLogP + LogCommentP <- log_fns$LogCommentP + LogCodeP <- log_fns$LogCodeP + LogIndent <- log_fns$LogIndent + LogClusterings <- log_fns$LogClusterings + + ############################################################################ + # Plot settings (from plotsettings.R) + ############################################################################ + + spaceCex <- reactive(1.7) + spaceLwd <- reactive(2) + + FirstHit <- reactive({ + r$trees <- WhenFirstHit(r$trees) + attr(r$trees, "firstHit") + }) + + LogFirstHit <- function() { + LogCodeP("whenHit <- gsub(\"(seed|start|ratch\\\\d+|final)_\\\\d+\", \"\\\\1\", + names(trees), perl = TRUE)") + LogCodeP("attr(trees, \"firstHit\") <- table(whenHit)[unique(whenHit)]") + } + + FirstHitCols <- reactive({ + if (is.null(FirstHit())) { + palettes[[1]] + } else { + hcl.colors(length(FirstHit()), "viridis") + } + }) + + LogFirstHitCols <- reactive({ + if (is.null(FirstHit())) { + paste0(palettes[[1]], " # Arbitrarily") + } else { + "hcl.colors(length(firstHit), \"viridis\")" + } + }) + + TreeCols <- reactive({ + switch( + input$spaceCol, + "clust" = { + cl <- clusterings() + if (cl$sil > silThreshold()) { + palettes[[min(length(palettes), cl$n)]][cl$cluster] + } else { + palettes[[1]] + } + }, "score" = { + if (is.null(scores()) || length(unique(scores())) == 1L) { + palettes[[1]] + } else { + norm <- scores() - min(scores()) + norm <- (length(badToGood) - 1L) * norm / max(norm) + rev(badToGood)[1 + norm] + } + }, "firstHit" = { + if (is.null(FirstHit())) { + Notification("Data not available; were trees loaded from file?", + type = "warning") + palettes[[1]] + } else { + rep(FirstHitCols(), FirstHit()) + } + }, + "black" + ) + }) + + LogTreeCols <- reactive({ + beige 
<- paste0("treeCols <- ", Enquote(palettes[[1]]), " # Arbitrarily") + switch( + input$spaceCol, + "clust" = { + cl <- clusterings() + if (cl$sil > silThreshold()) { + paste0("treeCols <- ", + EnC(palettes[[min(length(palettes), cl$n)]]), + "[clustering]") + } else { + beige + } + }, "score" = { + if (is.null(scores()) || length(unique(scores())) == 1L) { + beige + } else { + c(paste0("scores <- TreeLength(trees, dataset, concavity = ", + Enquote(concavity()), ")"), + "normalized <- scores - min(scores)", + "normalized <- 107 * normalized / max(normalized)", + "goodToBad <- hcl.colors(108, \"Temps\")", + "treeCols <- goodToBad[1 + normalized]" + ) + } + }, "firstHit" = { + if (is.null(FirstHit())) { + beige + } else { + c("trees <- WhenFirstHit(trees)", + "firstHit <- attr(trees, \"firstHit\")", + paste0("treeCols <- rep(", LogFirstHitCols(), ", firstHit)") + ) + } + }, + "treeCols <- \"black\"" + ) + }) + + treeNameClustering <- reactive({ + ClusterStrings(names(r$trees)) + }) + + treePch <- reactive({ + switch( + input$spacePch, + "clust" = { + cl <- clusterings() + if (cl$sil > silThreshold()) { + cl$cluster - 1 + } else { + 16 + } + }, "relat" = { + quartet <- input$relators + if (length(quartet) == 4) { + QuartetResolution(r$trees, input$relators) + } else { + Notification("Select four taxa to show relationships") + 0 + } + }, "index" = { + 16 # text() used instead of points() for this mode + }, "name" = { + if (is.null(names(r$trees))) { + Notification("Trees lack names", type = "warning") + 16 + } else { + indices <- treeNameClustering() + c(1, 3, 4, 2, seq_len(max(indices))[-(1:4)])[indices] + } + }, 0) + }) + + LogTreePch <- function() { + switch( + input$spacePch, + "clust" = { + cl <- clusterings() + if (cl$sil > silThreshold()) { + "cl$cluster - 1" + } else { + "16 # No clustering structure: Use filled circle" + } + }, "relat" = { + quartet <- input$relators + if (length(quartet) == 4) { + paste0("QuartetResolution(trees, ", EnC(input$relators), ")") + }
else { + "0 # Square" + } + }, "index" = { + "seq_along(trees) # text() labels" + }, "name" = { + if (is.null(names(r$trees))) { + "16 # Filled circle" + } else { + "ClusterStrings(names(trees))" + } + }, "0 # Square") + } + + maxProjDim <- reactive({ + min(12, max(0L, length(r$trees) - 1L)) + }) + + # Keep spaceDim slider max in sync with available projection dimensions + observe({ + mpd <- maxProjDim() + if (mpd > 0) { + updateSliderInput(inputId = "spaceDim", + max = max(1L, mpd), + value = min(mpd, input$spaceDim)) + } + }) + + nProjDim <- reactive({ + dim(mapping())[2] + }) + + dims <- debounce(reactive({ + min(input$spaceDim, maxProjDim()) + }), 400) + + # distances and LogDistances are now received as args from clustering module + + mapping <- bindCache(reactive({ + LogMsg("mapping()") + if (maxProjDim() > 1L) { + withProgress( + message = "Mapping trees", + value = 0.99, + tryCatch(cmdscale(distances(), k = maxProjDim()), + warning = function(e) { + nDim <- as.integer(substr(e$message, 6, 7)) + updateSliderInput(inputId = "spaceDim", + value = min(nDim, input$spaceDim), + max = nDim) + message("Max dimensions available for mapping: ", nDim, ".") + cmdscale(distances(), k = nDim) + }) + ) + } else { + matrix(0, 0, 0) + } + }), r$treeHash, distMeth(), maxProjDim()) + + LogMapping <- function() { + k <- dim(mapping())[2] + if (!is.null(k) && k > 0) { + LogCommentP(paste0( + "Generate first ", k, " dimensions of tree space using PCoA" + )) + LogCodeP(paste0("map <- cmdscale(dists, k = ", k, ")")) + } + } + + mstEnds <- bindCache(reactive({ + dist <- as.matrix(distances()) + withProgress(message = "Calculating MST", { + edges <- MSTEdges(dist) + }) + edges + }), distMeth(), r$treeHash) + + ############################################################################ + # Tree space plot (from treespace.R) + ############################################################################ + + TreespacePlot <- function() { + if (length(r$trees) < 3) { + 
return(ErrorPlot("Need at least\nthree trees to\nmap tree space")) + } + + cl <- clusterings() + map <- mapping() + + nDim <- min(dims(), nProjDim()) + if (nDim < 2) { + if (dim(map)[2] == 1L) { + map <- cbind(map, 0) + } else { + map[, 2] <- 0 + } + nDim <- 2L + nPanels <- 1L + } else { + plotSeq <- matrix(0, nDim, nDim) + nPanels <- nDim * (nDim - 1L) / 2L + plotSeq[upper.tri(plotSeq)] <- seq_len(nPanels) + if (nDim > 2) { + plotSeq[nDim - 1, 2] <- max(plotSeq) + 1L + } + layout(t(plotSeq[-nDim, -1])) + } + + par(mar = rep(0.2, 4)) + withProgress(message = "Drawing plot", { + for (i in 2:nDim) for (j in seq_len(i - 1)) { + incProgress(1 / nPanels) + plot(map[, j], map[, i], ann = FALSE, axes = FALSE, + frame.plot = nDim > 2L, + type = "n", asp = 1, xlim = range(map), ylim = range(map)) + + if ("seq" %in% mapLines()) { + n_map <- nrow(map) + if (n_map > 1L) { + x0 <- map[-n_map, j]; y0 <- map[-n_map, i] + x1 <- map[-1L, j]; y1 <- map[-1L, i] + # Dashed lines between consecutive trees + segments(x0, y0, x1, y1, col = "#ffcc33", lty = 2) + # Small arrows at segment midpoints to show direction + mx <- (x0 + x1) / 2; my <- (y0 + y1) / 2 + dx <- x1 - x0; dy <- y1 - y0 + seg_len <- sqrt(dx * dx + dy * dy) + keep <- seg_len > 0 + if (any(keep)) { + # Nudge = tiny fraction of each segment length + nudge <- seg_len * 0.05 + ux <- dx / seg_len; uy <- dy / seg_len + arrows(mx[keep] - nudge[keep] * ux[keep], + my[keep] - nudge[keep] * uy[keep], + mx[keep] + nudge[keep] * ux[keep], + my[keep] + nudge[keep] * uy[keep], + col = "#ffcc33", length = 0.06, angle = 25, + lwd = 1.2) + } + } + } + + if ("mst" %in% mapLines()) { + segments(map[mstEnds()[, 1], j], map[mstEnds()[, 1], i], + map[mstEnds()[, 2], j], map[mstEnds()[, 2], i], + col = "#bbbbbb", lty = 1) + } + + if (input$spacePch == "index") { + text(map[, j], map[, i], + labels = seq_len(nrow(map)), + col = paste0(TreeCols(), as.hexmode(200)), + cex = spaceCex() * 0.7) + } else { + points(map[, j], map[, i], pch = treePch(), 
+ col = paste0(TreeCols(), as.hexmode(200)), + cex = spaceCex(), + lwd = spaceLwd()) + } + + if (cl$sil > silThreshold() && "hull" %in% mapLines()) { + for (clI in seq_len(cl$n)) { + inCluster <- cl$cluster == clI + clusterX <- map[inCluster, j] + clusterY <- map[inCluster, i] + hull <- chull(clusterX, clusterY) + polygon(clusterX[hull], clusterY[hull], lty = 1, lwd = 2, + border = palettes[[min(length(palettes), cl$n)]][clI]) + } + } + } + if (nDim > 2) { + plot.new() + } + if (input$spacePch == "relat") { + if (length(input$relators) == 4L) { + legend( + "topright", + bty = "n", + pch = 1:3, + xpd = NA, + pt.cex = spaceCex(), + pt.lwd = spaceLwd(), + gsub("_", " ", fixed = TRUE, + paste(input$relators[2:4], "&", input$relators[[1]])) + ) + } + } else if (input$spacePch == "name") { + clstr <- treeNameClustering() + clusters <- unique(clstr) + if (length(clusters) > 1L) { + legend(bty = "n", "topright", xpd = NA, + pch = c(1, 3, 4, 2, + seq_len(max(clstr))[-(1:4)])[clusters], + paste0("~ ", attr(clstr, "med"), " (", table(clstr), ")")) + } + } + if (input$spaceCol == "firstHit" && length(FirstHit())) { + legend(bty = "n", "topleft", pch = 16, col = FirstHitCols(), + pt.cex = spaceCex(), + names(FirstHit()), title = "Iteration first hit") + } else if (input$spaceCol == "score") { + legendRes <- length(badToGood) + leg <- rep(NA, legendRes) + leg[c(legendRes, 1)] <- signif(range(scores())) + legend("bottomright", bty = "n", border = NA, + legend = leg, fill = rev(badToGood), + y.intersp = 0.04, cex = 1.1) + } + }) + } + + LogTreespacePlot <- function() { + BeginLogP() + + LogClusterings() + LogMapping() + + map <- mapping() + nDim <- min(dims(), nProjDim()) + if (nDim < 2) { + LogCommentP("Prepare 1D map", 0) + if (dim(map)[2] == 1L) { + LogCodeP("map <- cbind(map, 0)") + } else { + LogCodeP("map[, 2] <- 0") + } + nDim <- 2L + nPanels <- 1L + } else { + LogCommentP("Prepare plot layout") + + LogCodeP(c( + paste0("nDim <- ", nDim, " # Number of dimensions to plot"), 
+ "nPanels <- nDim * (nDim - 1L) / 2L # Lower-left triangle", + "plotSeq <- matrix(0, nDim, nDim)", + "plotSeq[upper.tri(plotSeq)] <- seq_len(nPanels)", + if (nDim > 2) { + "plotSeq[nDim - 1, 2] <- max(plotSeq) + 1L" + }, + "layout(t(plotSeq[-nDim, -1]))" + )) + } + + LogCommentP("Set plot margins", 0) + LogCodeP("par(mar = rep(0.2, 4))") + + LogCommentP("Set up tree plotting symbols") + LogCodeP(paste0("treePch <- ", LogTreePch()), + LogTreeCols(), + "treeCols <- paste0(treeCols, as.hexmode(200)) # Semitransparent" + ) + + LogCodeP("for (i in 2:nDim) for (j in seq_len(i - 1)) {") + LogIndent(+2) + LogCommentP("Set up blank plot") + LogCodeP("plot(", + " x = map[, j],", + " y = map[, i],", + " ann = FALSE, # No annotations", + " axes = FALSE, # No axes", + paste0(" frame.plot = ", + if (nDim > 2L) { + "TRUE, # Border around plot" + } else { + "FALSE, # No border around plot" + }), + " type = \"n\", # Don't plot any points yet", + " asp = 1, # Fix aspect ratio to avoid distortion", + " xlim = range(map), # Constant X range for all dimensions", + " ylim = range(map) # Constant Y range for all dimensions", + ")") + + if ("seq" %in% mapLines()) { + LogCommentP("Connect trees in sequence with midpoint arrows") + LogCodeP("nMap <- nrow(map)", + "if (nMap > 1) {", + " x0 <- map[-nMap, j]; y0 <- map[-nMap, i]", + " x1 <- map[-1, j]; y1 <- map[-1, i]", + " segments(x0, y0, x1, y1, col = \"#ffcc33\", lty = 2)", + " mx <- (x0 + x1) / 2; my <- (y0 + y1) / 2", + " dx <- x1 - x0; dy <- y1 - y0", + " seg_len <- sqrt(dx^2 + dy^2)", + " keep <- seg_len > 0", + " nudge <- seg_len * 0.05", + " ux <- dx / seg_len; uy <- dy / seg_len", + " arrows(mx[keep] - nudge[keep] * ux[keep],", + " my[keep] - nudge[keep] * uy[keep],", + " mx[keep] + nudge[keep] * ux[keep],", + " my[keep] + nudge[keep] * uy[keep],", + " col = \"#ffcc33\", length = 0.06, angle = 25,", + " lwd = 1.2)", + "}") + } + + if ("mst" %in% mapLines()) { + LogCommentP("Plot minimum spanning tree (Gower 1969)") + LogCodeP( + 
"mst <- MSTEdges(as.matrix(dists))", + "segments(", + " x0 = map[mst[, 1], j],", + " y0 = map[mst[, 1], i],", + " x1 = map[mst[, 2], j],", + " y1 = map[mst[, 2], i],", + " col = \"#bbbbbb\", # Light grey", + " lty = 1 # Solid lines", + ")" + ) + } + + if (input$spacePch == "index") { + LogCommentP("Label trees by index") + LogCodeP( + "text(", + " x = map[, j],", + " y = map[, i],", + " labels = seq_len(nrow(map)),", + " col = treeCols,", + paste0(" cex = ", round(spaceCex() * 0.7, 2), " # Text size"), + ")" + ) + } else { + LogCommentP("Add points") + LogCodeP( + "points(", + " x = map[, j],", + " y = map[, i],", + " pch = treePch,", + " col = treeCols,", + paste0(" cex = ", spaceCex(), ", # Point size"), + paste0(" lwd = ", spaceLwd(), " # Line width"), + ")" + ) + } + + cl <- clusterings() + if (cl$sil > silThreshold() && "hull" %in% mapLines()) { + LogCommentP("Mark clusters") + LogCodeP("for (clI in seq_len(nClusters)) {") + LogIndent(+2) + LogCodeP( + "inCluster <- clustering == clI", + "clusterX <- map[inCluster, j]", + "clusterY <- map[inCluster, i]", + "hull <- chull(clusterX, clusterY)", + "polygon(", + " x = clusterX[hull],", + " y = clusterY[hull],", + " lty = 1, # Solid line style", + " lwd = 2, # Wider line width", + " border = clusterCol[clI]", + ")") + LogIndent(-2) + LogCodeP("}") + } + + LogIndent(-2) + LogCodeP("}") + + if (nDim > 2) { + LogCodeP("plot.new() # Use new panel to plot legends") + } + + if (input$spacePch == "relat") { + if (length(input$relators) == 4L) { + LogCommentP("Add legend for plotting symbols") + LogCodeP( + "legend(", + " \"topright\",", + " bty = \"n\", # No legend border box", + " pch = 1:3, # Legend symbols", + " xpd = NA, # Display overflowing text", + paste0(" pt.cex = ", spaceCex(), ", # Point size"), + paste0(" pt.lwd = ", spaceLwd(), ", # Line width"), + paste0(" ", + EnC(gsub("_", " ", fixed = TRUE, + paste(input$relators[2:4], "&", + input$relators[[1]]))) + ), ")" + ) + } + } else if (input$spacePch == "name") { 
+ clstr <- treeNameClustering() + clusters <- unique(clstr) + if (length(clusters) > 1L) { + LogCommentP("Add legend for plotting symbols") + LogCodeP( + "nameClusters <- ClusterStrings(names(trees))", + "uniqueClusters <- unique(nameClusters)", + "legend(", + " \"topright\",", + " bty = \"n\", # No legend border box", + " xpd = NA, # Display overflowing text", + paste0( + " pch = ", + EnC(c(1, 3, 4, 2, + seq_len(max(clstr))[-(1:4)])[clusters]), + ", # Legend symbols" + ), paste0(" ", + EnC(paste0("~ ", attr(clstr, "med"), + " (", table(clstr), ")")) + ), + ")") + } + } + if (input$spaceCol == "firstHit" && length(FirstHit())) { + LogCommentP("Record when trees first hit") + LogFirstHit() + + LogCommentP("Add legend for symbol colours") + LogCodeP( + "legend(", + " \"topleft\",", + " bty = \"n\", # No legend border box", + " pch = 16, # Circle symbol", + " xpd = NA, # Display overflowing text", + paste0(" col = ", LogFirstHitCols(), ","), + paste0(" pt.cex = ", spaceCex(), ", # Point size"), + paste0(" ", EnC(names(FirstHit())), ","), + " title = \"Iteration first hit\"", + ")" + ) + } else if (input$spaceCol == "score") { + LogCommentP("Add legend for symbol colours") + LogCodeP( + "goodToBad <- hcl.colors(108, \"Temps\")", + "leg <- rep_len(NA, 108)", + paste0("leg[c(1, 108)] <- ", + EnC(rev(signif(range(scores()))))), + "legend(", + " \"bottomright\",", + " legend = leg,", + " bty = \"n\", # No legend border box", + " border = NA, # No border around plot icons", + " xpd = NA, # Display overflowing text", + " fill = goodToBad,", + " y.intersp = 0.04, # Compress squares to make gradient scale", + " cex = 1.1 # Increase font and icon size slightly", + ")" + ) + } + } + + ############################################################################ + # saveDetails (shared with downloads) + ############################################################################ + + saveDetails <- reactive({ + switch(plotFormat(), + "cons" = list( + fileName = "ConsensusTrees", + 
title = "Consensus tree - TreeSearch", + asp = 2L + ), + "clus" = list( + fileName = "ClusterCons", + title = "Cluster Consensus trees - TreeSearch", + asp = 1.6 + ), + "ind" = list( + fileName = "OptimalTree", + title = "Optimal tree - TreeSearch", + asp = 2L + ), + "space" = list( + fileName = "TreeSpace", + title = "Tree space - TreeSearch", + asp = 1L + )) + }) + + ############################################################################ + # Mapping quality (moved from consensus.R) + ############################################################################ + + LogScore <- function(x) { + (-(log10(1 - pmin(1, x) + 1e-2))) / 2 + } + + QualityPlot <- function(quality) { + par(mar = c(2, 0, 0, 0)) + nStop <- length(badToGood) + 1L + + plot(NULL, xlim = c(0, 1), ylim = c(-1.5, 2.5), + ann = FALSE, axes = FALSE) + x <- seq.int(from = 0, to = 1, length.out = nStop) + segments(x[-nStop], numeric(nStop), x[-1], lwd = 5, col = badToGood) + + trust <- quality[["Trustworthiness"]] + cont <- quality[["Continuity"]] + txc <- quality[["sqrtTxC"]] + + if (trust > 1) { + LogMsg("Preternaturally high Trustworthiness: ", trust) + } + if (cont > 1) { + LogMsg("Preternaturally high Continuity: ", cont) + } + LogMsg(trust * nStop) + segments(LogScore(txc), -1, y1 = 1, lty = 3) + text(LogScore(trust), 1, "T", + col = badToGood[LogScore(trust) * nStop]) + text(LogScore(cont), -1, "C", + col = badToGood[LogScore(cont) * nStop]) + + tickPos <- c(0, 0.5, 0.7, 0.8, 0.9, 0.95, 1.0) + ticks <- LogScore(tickPos) + + axis(1, at = ticks, labels = NA, line = 0) + axis(1, tick = FALSE, at = ticks, labels = tickPos, line = 0) + axis(1, line = -1, tick = FALSE, + at = ticks[-1] - ((ticks[-1] - ticks[-length(ticks)]) / 2), + labels = c("", "dire", "", "ok", "gd", "excellent")) + axis(3, at = 0.5, tick = FALSE, line = -2, + paste0(dims(), "D mapping quality (trustw. 
/ contin.):")) + } + + output$pcQuality <- renderCachedPlot({ + if (length(r$trees) < 3) { + return() + } + dstnc <- distances() + mppng <- mapping() + mppng <- mapping()[, seq_len(min(dim(mppng)[2], dims()))] + neighbs <- min(10L, length(r$trees) / 2) + future_promise( + TreeDist::MappingQuality(dstnc, dist(mppng), neighbs), + seed = TRUE) %...>% QualityPlot + }, cacheKeyExpr = { + list(r$treeHash, distMeth(), dims()) + }, + sizePolicy = function(dims) dims + ) + + ############################################################################ + # Return reactives for other modules + ############################################################################ + + list( + mapping = mapping, + dims = dims, + nProjDim = nProjDim, + TreeCols = TreeCols, + treePch = treePch, + mstEnds = mstEnds, + saveDetails = saveDetails, + TreespacePlot = TreespacePlot, + LogTreespacePlot = LogTreespacePlot, + # Expose input values for cache keys in consensus.R + spaceCol = reactive(input$spaceCol), + spacePch = reactive(input$spacePch), + mapLines = mapLines, + relators = reactive(input$relators) + ) + }) +} diff --git a/inst/Parsimony/tests/shinytest/SearchLog-expected/001.download b/inst/Parsimony/tests/shinytest/SearchLog-expected/001.download index fcb5919ca..2f44c2bbc 100644 --- a/inst/Parsimony/tests/shinytest/SearchLog-expected/001.download +++ b/inst/Parsimony/tests/shinytest/SearchLog-expected/001.download @@ -64,22 +64,14 @@ newTrees <- MaximizeParsimony( dataset, tree = startTree, concavity = Inf, - ratchIter = 4, - tbrIter = 2, - maxHits = 20, - maxTime = 30, - startIter = 1.6, - finalIter = 1.4, - tolerance = 1, - verbosity = 4 + strategy = "sprint", + maxReplicates = 5, + targetHits = 3, + poolSuboptimal = 1, + verbosity = 0 ) # Overwrite any previous trees with results -if (inherits(newTrees, "phylo")) { - trees <- list(newTrees) - attr(trees, "firstHit") <- attr(newTrees, "firstHit") - attr(trees[[1]], "firstHit") <- NULL -} allTrees <- newTrees trees <- 
allTrees[unique(as.integer(seq.int(1, 49, length.out = 48)))] diff --git a/inst/Parsimony/tests/shinytest/SearchLog-expected/003.download b/inst/Parsimony/tests/shinytest/SearchLog-expected/003.download index 592412f3c..c812c0a75 100644 --- a/inst/Parsimony/tests/shinytest/SearchLog-expected/003.download +++ b/inst/Parsimony/tests/shinytest/SearchLog-expected/003.download @@ -64,22 +64,14 @@ newTrees <- MaximizeParsimony( dataset, tree = startTree, concavity = Inf, - ratchIter = 4, - tbrIter = 2, - maxHits = 20, - maxTime = 30, - startIter = 1.6, - finalIter = 1.4, - tolerance = 1, - verbosity = 4 + strategy = "sprint", + maxReplicates = 5, + targetHits = 3, + poolSuboptimal = 1, + verbosity = 0 ) # Overwrite any previous trees with results -if (inherits(newTrees, "phylo")) { - trees <- list(newTrees) - attr(trees, "firstHit") <- attr(newTrees, "firstHit") - attr(trees[[1]], "firstHit") <- NULL -} allTrees <- newTrees trees <- allTrees[unique(as.integer(seq.int(1, 49, length.out = 48)))] @@ -103,21 +95,13 @@ newTrees <- MaximizeParsimony( dataset, tree = startTree, concavity = 12.5893, - ratchIter = 2, - tbrIter = 1, - maxHits = 10, - maxTime = 30, - startIter = 1.2, - finalIter = 1, - verbosity = 4 + strategy = "default", + maxReplicates = 3, + targetHits = 2, + verbosity = 0 ) # Overwrite any previous trees with results -if (inherits(newTrees, "phylo")) { - trees <- list(newTrees) - attr(trees, "firstHit") <- attr(newTrees, "firstHit") - attr(trees[[1]], "firstHit") <- NULL -} allTrees <- newTrees trees <- allTrees diff --git a/inst/Parsimony/tests/shinytest/SearchLog.R b/inst/Parsimony/tests/shinytest/SearchLog.R index 09bede34f..adc14a198 100644 --- a/inst/Parsimony/tests/shinytest/SearchLog.R +++ b/inst/Parsimony/tests/shinytest/SearchLog.R @@ -2,28 +2,40 @@ app <- ShinyDriver$new("../../", seed = 0, loadTimeout = 2e+05, shinyOptions = list(test.mode = TRUE)) app$snapshotInit("SearchLog") +# Helper: poll exported searchCount until it exceeds `prev`. 
+# Needed because MaximizeParsimony runs asynchronously via ExtendedTask; +# setInputs(modalGo = "click") returns immediately after invoke(). +waitForSearch <- function(app, prev = 0L, timeout_s = 120, poll_s = 2) { + deadline <- Sys.time() + timeout_s + while (Sys.time() < deadline) { + vals <- app$getAllValues() + count <- vals$export$searchCount + if (!is.null(count) && count > prev) return(invisible(count)) + Sys.sleep(poll_s) + } + stop("Timed out waiting for search to complete") +} + app$setInputs(dataSource = "Wills2012", timeout_ = 4000) app$setInputs(searchConfig = "click") app$setInputs(concavity = 1.1) # Set whilst visible; remembered later? app$setInputs(epsilon = 1) # Set whilst visible; remembered later? app$setInputs(`implied.weights` = "off") -app$setInputs(finalIter = 1.4) -app$setInputs(maxHits = 1.3) -app$setInputs(startIter = 1.6) -app$setInputs(ratchIter = 4) -app$setInputs(tbrIter = 2) -app$setInputs(modalGo = "click", timeout_ = 1e05) +app$setInputs(strategy = "sprint") +app$setInputs(maxReplicates = 5) +app$setInputs(targetHits = 3) +app$setInputs(modalGo = "click") +searchesDone <- waitForSearch(app, prev = 0L) app$setInputs(searchConfig = "click") app$snapshotDownload("saveZip") app$snapshotDownload("saveNwk") app$setInputs(`implied.weights` = "on") -app$setInputs(ratchIter = 2) -app$setInputs(maxHits = 1) -app$setInputs(tbrIter = 1) -app$setInputs(startIter = 1.2) +app$setInputs(strategy = "default") +app$setInputs(maxReplicates = 3) +app$setInputs(targetHits = 2) app$setInputs(epsilon = 0) # No tolerance line here -app$setInputs(finalIter = 1) -app$setInputs(modalGo = "click", timeout_ = 2e05) +app$setInputs(modalGo = "click") +searchesDone <- waitForSearch(app, prev = searchesDone) app$snapshotDownload("saveZip") app$snapshotDownload("saveNex") diff --git a/inst/Parsimony/tests/testthat.R b/inst/Parsimony/tests/testthat.R new file mode 100644 index 000000000..7ad3bd767 --- /dev/null +++ b/inst/Parsimony/tests/testthat.R @@ -0,0 +1,2 @@ 
+library(shinytest2) +test_app("../") diff --git a/inst/Parsimony/tests/testthat/.gitignore b/inst/Parsimony/tests/testthat/.gitignore new file mode 100644 index 000000000..66e54ae32 --- /dev/null +++ b/inst/Parsimony/tests/testthat/.gitignore @@ -0,0 +1 @@ +testthat-problems.rds diff --git a/inst/Parsimony/tests/testthat/_problems/test-app-smoke-15.R b/inst/Parsimony/tests/testthat/_problems/test-app-smoke-15.R new file mode 100644 index 000000000..9f35a2ce5 --- /dev/null +++ b/inst/Parsimony/tests/testthat/_problems/test-app-smoke-15.R @@ -0,0 +1,13 @@ +# Extracted from test-app-smoke.R:15 + +# test ------------------------------------------------------------------------- +app <- AppDriver$new( + app_dir = "../../", + seed = 0, + load_timeout = 200000, + shiny_args = list(test.mode = TRUE), + name = "Smoke" + ) +on.exit(app$stop(), add = TRUE) +app$wait_for_idle(timeout = 10000) +vals <- app$get_values() diff --git a/inst/Parsimony/tests/testthat/_problems/test-app-smoke-8.R b/inst/Parsimony/tests/testthat/_problems/test-app-smoke-8.R new file mode 100644 index 000000000..988bde879 --- /dev/null +++ b/inst/Parsimony/tests/testthat/_problems/test-app-smoke-8.R @@ -0,0 +1,10 @@ +# Extracted from test-app-smoke.R:8 + +# test ------------------------------------------------------------------------- +app <- AppDriver$new( + app_dir = "../../", + seed = 0, + load_timeout = 200000, + shiny_args = list(test.mode = TRUE), + name = "Smoke" + ) diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-001.json b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-001.json new file mode 100644 index 000000000..a6a705d02 --- /dev/null +++ b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-001.json @@ -0,0 +1,92 @@ +{ + "input": { + "clustering-clThresh": 0.5, + "consensus-concordance": "none", + "consensus-consP": 1, + "consensus-excludedTip": "Paterimitra", + "consensus-keepNTips": 46, + "consensus-mapDisplay": null, + 
"consensus-neverDrop": null, + "consensus-outgroup": "Namacalathus", + "consensus-plottedChar": 1, + "consensus-searchChar": "", + "consensus-whichTree": 0, + "data-dataFile": null, + "data-dataSource": "Sun2018", + "data-nTree": 48, + "data-readxlSkip": 2, + "data-readxlSkipCols": 2, + "data-readxl_sheet": "Sheet 1", + "data-treeFile": null, + "data-treeRange": [ + 1, + 125 + ], + "distMeth": "cid", + "mapLines": [ + "hull", + "mst" + ], + "plotFormat": "clus", + "plotSize": 600, + "search-cancel": 0, + "search-go": 0, + "search-searchConfig": 0, + "treespace-relators": null, + "treespace-spaceCol": "clust", + "treespace-spaceDim": 5, + "treespace-spacePch": "relat" + }, + "output": { + "consensus-branchLegend": null, + "consensus-treePlot": { + "src": "[image data hash: 6c6c892bcb629883ae4f28d30b34c97b]", + "alt": "Plot object", + "coordmap": { + "panels": [ + { + "domain": { + "left": -0.3642173208566126, + "right": 9.469650342271928, + "bottom": -0.8, + "top": 47.8 + }, + "range": { + "left": 0, + "right": 600, + "bottom": 596.12, + "top": 1.880000000000016 + }, + "log": { + "x": null, + "y": null + }, + "mapping": { + + } + } + ], + "dims": { + "width": 600, + "height": 600 + } + }, + "class": "shiny-scalable" + }, + "refs-references": { + "html": "

References for methods used<\/h2>\n

Tree search<\/h3>\n

Brazeau, M.D., Guillerme, T. & Smith, M.R. (2019). “An algorithm for morphological phylogenetic analysis with inapplicable data”. Systematic Biology<\/i> 64<\/b>:619–631. doi:10.1093/sysbio/syy083<\/a>. <\/p>

Brazeau, M.D., Smith, M.R. & Guillerme, T. (2017). “MorphyLib: a library for phylogenetic analysis of categorical trait data with inapplicability”. doi:10.5281/zenodo.815371<\/a>. <\/p>

Nixon, K.C. (1999). “The Parsimony Ratchet, a new method for rapid parsimony analysis”. Cladistics<\/i> 15<\/b>:407–414. doi:10.1111/j.1096-0031.1999.tb00277.x<\/a>. <\/p>

Smith, M.R. (2023). “TreeSearch: morphological phylogenetic analysis in R”. R Journal<\/i> 14<\/b>:305–315. doi:10.32614/RJ-2023-019<\/a>. <\/p>\n

Tree space mapping<\/h3>\n

Gower, J.C. (1966). “Some distance properties of latent root and vector methods used in multivariate analysis”. Biometrika<\/i> 53<\/b>:325–338. doi:10.2307/2333639<\/a>. <\/p>

Gower, J.C. & Ross, G.J.S. (1969). “Minimum spanning trees and single linkage cluster analysis”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 18<\/b>:54–64. doi:10.2307/2346439<\/a>. <\/p>

Kaski, S., Nikkilä, J., Oja, M., Venna, J., Törönen, P. & Castrén, E. (2003). “Trustworthiness and metrics in visualizing similarity of gene expression”. BMC Bioinformatics<\/i> 4<\/b>:48. doi:10.1186/1471-2105-4-48<\/a>. <\/p>

R Core Team (2020). “R: A language and environment for statistical computing”. R Foundation for Statistical Computing, Vienna, Austria. <\/p>

Smith, M.R. (2020a). “TreeDist: distances between phylogenetic trees”. Comprehensive R Archive Network<\/i> doi:10.5281/zenodo.3528123<\/a>. <\/p>

Smith, M.R. (2020b). “Information theoretic Generalized Robinson-Foulds metrics for comparing phylogenetic trees”. Bioinformatics<\/i> 36<\/b>:5007–5013. doi:10.1093/bioinformatics/btaa614<\/a>. <\/p>

Smith, M.R. (2022a). “Robust analysis of phylogenetic tree space”. Systematic Biology<\/i> 71<\/b>:1255–1270. doi:10.1093/sysbio/syab100<\/a>. <\/p>

Venna, J. & Kaski, S. (2001). “Neighborhood preservation in nonlinear projection methods: an experimental study”. In: Dorffner, G., Bischof, H. & Hornik, K. (eds). Lecture Notes in Computer Science: Artificial Neural Networks—ICANN 2001<\/i> Springer, Berlin. 485–491. doi:10.1007/3-540-44668-0_68<\/a>. <\/p>\n

Clustering<\/h3>\nCluster consensus trees:

Stockham, C., Wang, L.-S. & Warnow, T. (2002). “Statistically based postprocessing of phylogenetic analysis by clustering”. Bioinformatics<\/i> 18<\/b>:S285–S293. doi:10.1093/bioinformatics/18.suppl_1.S285<\/a>. <\/p>\nk-means++:

Arthur, D. & Vassilvitskii, S (2007). “k-means++: the advantages of careful seeding”. Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms<\/i> 1027–1035. <\/p>

Hartigan, J.A. & Wong, M.A. (1979). “Algorithm AS 136: a K<\/i>-means clustering algorithm”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 28<\/b>:100–108. doi:10.2307/2346830<\/a>. <\/p>Partitioning around medoids:

Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M. & Hornik, K. (2022). “cluster: cluster analysis basics and extensions”. Comprehensive R Archive Network<\/i> <\/p>Hierarchical, minimax linkage:

Bien, J. & Tibshirani, R. (2011). “Hierarchical clustering with prototypes via minimax linkage”. Journal of the American Statistical Association<\/i> 106<\/b>:1075–1084. doi:10.1198/jasa.2011.tm10183<\/a>. <\/p>

Murtagh, F. (1983). “A survey of recent advances in hierarchical clustering algorithms”. The Computer Journal<\/i> 26<\/b>:354–359. doi:10.1093/comjnl/26.4.354<\/a>. <\/p>Clustering evaluation:

Rousseeuw, P.J. (1987). “Silhouettes: a graphical aid to the interpretation and validation of cluster analysis”. Journal of Computational and Applied Mathematics<\/i> 20<\/b>:53–65. doi:10.1016/0377-0427(87)90125-7<\/a>. <\/p>\n

Rogue taxa<\/h3>\nDetection:

Smith, M.R. (2022b). “Using information theory to detect rogue taxa and improve consensus trees”. Systematic Biology<\/i> 71<\/b>:1088–1094. doi:10.1093/sysbio/syab099<\/a>. <\/p>\nPlotting:

Klopfstein, S. & Spasojevic, T. (2019). “Illustrating phylogenetic placement of fossils using RoguePlots: An example from ichneumonid parasitoid wasps (Hymenoptera, Ichneumonidae) and an extensive morphological matrix.”. PLoS ONE<\/i> 14<\/b>:e0212942. doi:10.1371/journal.pone.0212942<\/a>. <\/p>\nCharacter analysis:

Pol, D. & Escapa, I.H. (2009). “Unstable taxa in cladistic analysis: identification and the assessment of relevant characters”. Cladistics<\/i> 25<\/b>:515–527. doi:10.1111/j.1096-0031.2009.00258.x<\/a>. <\/p>", + "deps": [ + + ] + }, + "search-results": { + "html": "125 trees in memory: 48 sampled with scores 25.9589 to 29.8759 (k = 10)", + "deps": [ + + ] + } + }, + "export": { + "searchCount": 0 + } +} diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-001_.png b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-001_.png new file mode 100644 index 000000000..18fe6a122 Binary files /dev/null and b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-001_.png differ diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-002.json b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-002.json new file mode 100644 index 000000000..0a13462d6 --- /dev/null +++ b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-002.json @@ -0,0 +1,92 @@ +{ + "input": { + "clustering-clThresh": 0.5, + "consensus-concordance": "none", + "consensus-consP": 1, + "consensus-excludedTip": "Paterimitra", + "consensus-keepNTips": 49, + "consensus-mapDisplay": null, + "consensus-neverDrop": null, + "consensus-outgroup": "Namacalathus", + "consensus-plottedChar": 1, + "consensus-searchChar": "", + "consensus-whichTree": 0, + "data-dataFile": null, + "data-dataSource": "Sun2018", + "data-nTree": 125, + "data-readxlSkip": 2, + "data-readxlSkipCols": 2, + "data-readxl_sheet": "Sheet 1", + "data-treeFile": null, + "data-treeRange": [ + 1, + 125 + ], + "distMeth": "cid", + "mapLines": [ + "hull", + "mst" + ], + "plotFormat": "clus", + "plotSize": 600, + "search-cancel": 0, + "search-go": 0, + "search-searchConfig": 0, + "treespace-relators": null, + "treespace-spaceCol": "clust", + "treespace-spaceDim": 5, + "treespace-spacePch": "relat" + }, + "output": { + "consensus-branchLegend": null, + 
"consensus-treePlot": { + "src": "[image data hash: 19603e60c526b0042031a56f142bdbeb]", + "alt": "Plot object", + "coordmap": { + "panels": [ + { + "domain": { + "left": -0.526105134269941, + "right": 13.67873349101847, + "bottom": -0.9199999999999999, + "top": 50.92 + }, + "range": { + "left": 400, + "right": 600, + "bottom": 597.0992, + "top": 0.9008000000000358 + }, + "log": { + "x": null, + "y": null + }, + "mapping": { + + } + } + ], + "dims": { + "width": 600, + "height": 600 + } + }, + "class": "shiny-scalable" + }, + "refs-references": { + "html": "

References for methods used<\/h2>\n

Tree search<\/h3>\n

Brazeau, M.D., Guillerme, T. & Smith, M.R. (2019). “An algorithm for morphological phylogenetic analysis with inapplicable data”. Systematic Biology<\/i> 64<\/b>:619–631. doi:10.1093/sysbio/syy083<\/a>. <\/p>

Brazeau, M.D., Smith, M.R. & Guillerme, T. (2017). “MorphyLib: a library for phylogenetic analysis of categorical trait data with inapplicability”. doi:10.5281/zenodo.815371<\/a>. <\/p>

Nixon, K.C. (1999). “The Parsimony Ratchet, a new method for rapid parsimony analysis”. Cladistics<\/i> 15<\/b>:407–414. doi:10.1111/j.1096-0031.1999.tb00277.x<\/a>. <\/p>

Smith, M.R. (2023). “TreeSearch: morphological phylogenetic analysis in R”. R Journal<\/i> 14<\/b>:305–315. doi:10.32614/RJ-2023-019<\/a>. <\/p>\n

Tree space mapping<\/h3>\n

Gower, J.C. (1966). “Some distance properties of latent root and vector methods used in multivariate analysis”. Biometrika<\/i> 53<\/b>:325–338. doi:10.2307/2333639<\/a>. <\/p>

Gower, J.C. & Ross, G.J.S. (1969). “Minimum spanning trees and single linkage cluster analysis”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 18<\/b>:54–64. doi:10.2307/2346439<\/a>. <\/p>

Kaski, S., Nikkilä, J., Oja, M., Venna, J., Törönen, P. & Castrén, E. (2003). “Trustworthiness and metrics in visualizing similarity of gene expression”. BMC Bioinformatics<\/i> 4<\/b>:48. doi:10.1186/1471-2105-4-48<\/a>. <\/p>

R Core Team (2020). “R: A language and environment for statistical computing”. R Foundation for Statistical Computing, Vienna, Austria. <\/p>

Smith, M.R. (2020a). “TreeDist: distances between phylogenetic trees”. Comprehensive R Archive Network<\/i> doi:10.5281/zenodo.3528123<\/a>. <\/p>

Smith, M.R. (2020b). “Information theoretic Generalized Robinson-Foulds metrics for comparing phylogenetic trees”. Bioinformatics<\/i> 36<\/b>:5007–5013. doi:10.1093/bioinformatics/btaa614<\/a>. <\/p>

Smith, M.R. (2022a). “Robust analysis of phylogenetic tree space”. Systematic Biology<\/i> 71<\/b>:1255–1270. doi:10.1093/sysbio/syab100<\/a>. <\/p>

Venna, J. & Kaski, S. (2001). “Neighborhood preservation in nonlinear projection methods: an experimental study”. In: Dorffner, G., Bischof, H. & Hornik, K. (eds). Lecture Notes in Computer Science: Artificial Neural Networks—ICANN 2001<\/i> Springer, Berlin. 485–491. doi:10.1007/3-540-44668-0_68<\/a>. <\/p>\n

Clustering<\/h3>\nCluster consensus trees:

Stockham, C., Wang, L.-S. & Warnow, T. (2002). “Statistically based postprocessing of phylogenetic analysis by clustering”. Bioinformatics<\/i> 18<\/b>:S285–S293. doi:10.1093/bioinformatics/18.suppl_1.S285<\/a>. <\/p>\nk-means++:

Arthur, D. & Vassilvitskii, S (2007). “k-means++: the advantages of careful seeding”. Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms<\/i> 1027–1035. <\/p>

Hartigan, J.A. & Wong, M.A. (1979). “Algorithm AS 136: a K<\/i>-means clustering algorithm”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 28<\/b>:100–108. doi:10.2307/2346830<\/a>. <\/p>Partitioning around medoids:

Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M. & Hornik, K. (2022). “cluster: cluster analysis basics and extensions”. Comprehensive R Archive Network<\/i> <\/p>Hierarchical, minimax linkage:

Bien, J. & Tibshirani, R. (2011). “Hierarchical clustering with prototypes via minimax linkage”. Journal of the American Statistical Association<\/i> 106<\/b>:1075–1084. doi:10.1198/jasa.2011.tm10183<\/a>. <\/p>

Murtagh, F. (1983). “A survey of recent advances in hierarchical clustering algorithms”. The Computer Journal<\/i> 26<\/b>:354–359. doi:10.1093/comjnl/26.4.354<\/a>. <\/p>Clustering evaluation:

Rousseeuw, P.J. (1987). “Silhouettes: a graphical aid to the interpretation and validation of cluster analysis”. Journal of Computational and Applied Mathematics<\/i> 20<\/b>:53–65. doi:10.1016/0377-0427(87)90125-7<\/a>. <\/p>\n

Rogue taxa<\/h3>\nDetection:

Smith, M.R. (2022b). “Using information theory to detect rogue taxa and improve consensus trees”. Systematic Biology<\/i> 71<\/b>:1088–1094. doi:10.1093/sysbio/syab099<\/a>. <\/p>\nPlotting:

Klopfstein, S. & Spasojevic, T. (2019). “Illustrating phylogenetic placement of fossils using RoguePlots: An example from ichneumonid parasitoid wasps (Hymenoptera, Ichneumonidae) and an extensive morphological matrix.”. PLoS ONE<\/i> 14<\/b>:e0212942. doi:10.1371/journal.pone.0212942<\/a>. <\/p>\nCharacter analysis:

Pol, D. & Escapa, I.H. (2009). “Unstable taxa in cladistic analysis: identification and the assessment of relevant characters”. Cladistics<\/i> 25<\/b>:515–527. doi:10.1111/j.1096-0031.2009.00258.x<\/a>. <\/p>", + "deps": [ + + ] + }, + "search-results": { + "html": "125 trees in memory: 125 sampled with scores 25.9279 to 29.8759 (k = 10)", + "deps": [ + + ] + } + }, + "export": { + "searchCount": 0 + } +} diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-002_.png b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-002_.png new file mode 100644 index 000000000..20b045cf6 Binary files /dev/null and b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-002_.png differ diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-003.json b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-003.json new file mode 100644 index 000000000..6b15f20c3 --- /dev/null +++ b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-003.json @@ -0,0 +1,92 @@ +{ + "input": { + "clustering-clThresh": 1, + "consensus-concordance": "none", + "consensus-consP": 1, + "consensus-excludedTip": "Paterimitra", + "consensus-keepNTips": 49, + "consensus-mapDisplay": null, + "consensus-neverDrop": null, + "consensus-outgroup": "Namacalathus", + "consensus-plottedChar": 1, + "consensus-searchChar": "", + "consensus-whichTree": 0, + "data-dataFile": null, + "data-dataSource": "Sun2018", + "data-nTree": 125, + "data-readxlSkip": 2, + "data-readxlSkipCols": 2, + "data-readxl_sheet": "Sheet 1", + "data-treeFile": null, + "data-treeRange": [ + 1, + 125 + ], + "distMeth": "cid", + "mapLines": [ + "hull", + "mst" + ], + "plotFormat": "clus", + "plotSize": 600, + "search-cancel": 0, + "search-go": 0, + "search-searchConfig": 0, + "treespace-relators": null, + "treespace-spaceCol": "clust", + "treespace-spaceDim": 5, + "treespace-spacePch": "relat" + }, + "output": { + "consensus-branchLegend": null, + 
"consensus-treePlot": { + "src": "[image data hash: 52ded580f2796169fcba68e660f5c64b]", + "alt": "Plot object", + "coordmap": { + "panels": [ + { + "domain": { + "left": -0.290659071367388, + "right": 7.557135855552088, + "bottom": -0.9199999999999999, + "top": 50.92 + }, + "range": { + "left": 0, + "right": 600, + "bottom": 596.12, + "top": 1.880000000000082 + }, + "log": { + "x": null, + "y": null + }, + "mapping": { + + } + } + ], + "dims": { + "width": 600, + "height": 600 + } + }, + "class": "shiny-scalable" + }, + "refs-references": { + "html": "

References for methods used<\/h2>\n

Tree search<\/h3>\n

Brazeau, M.D., Guillerme, T. & Smith, M.R. (2019). “An algorithm for morphological phylogenetic analysis with inapplicable data”. Systematic Biology<\/i> 64<\/b>:619–631. doi:10.1093/sysbio/syy083<\/a>. <\/p>

Brazeau, M.D., Smith, M.R. & Guillerme, T. (2017). “MorphyLib: a library for phylogenetic analysis of categorical trait data with inapplicability”. doi:10.5281/zenodo.815371<\/a>. <\/p>

Nixon, K.C. (1999). “The Parsimony Ratchet, a new method for rapid parsimony analysis”. Cladistics<\/i> 15<\/b>:407–414. doi:10.1111/j.1096-0031.1999.tb00277.x<\/a>. <\/p>

Smith, M.R. (2023). “TreeSearch: morphological phylogenetic analysis in R”. R Journal<\/i> 14<\/b>:305–315. doi:10.32614/RJ-2023-019<\/a>. <\/p>\n

Tree space mapping<\/h3>\n

Gower, J.C. (1966). “Some distance properties of latent root and vector methods used in multivariate analysis”. Biometrika<\/i> 53<\/b>:325–338. doi:10.2307/2333639<\/a>. <\/p>

Gower, J.C. & Ross, G.J.S. (1969). “Minimum spanning trees and single linkage cluster analysis”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 18<\/b>:54–64. doi:10.2307/2346439<\/a>. <\/p>

Kaski, S., Nikkilä, J., Oja, M., Venna, J., Törönen, P. & Castrén, E. (2003). “Trustworthiness and metrics in visualizing similarity of gene expression”. BMC Bioinformatics<\/i> 4<\/b>:48. doi:10.1186/1471-2105-4-48<\/a>. <\/p>

R Core Team (2020). “R: A language and environment for statistical computing”. R Foundation for Statistical Computing, Vienna, Austria. <\/p>

Smith, M.R. (2020a). “TreeDist: distances between phylogenetic trees”. Comprehensive R Archive Network<\/i> doi:10.5281/zenodo.3528123<\/a>. <\/p>

Smith, M.R. (2020b). “Information theoretic Generalized Robinson-Foulds metrics for comparing phylogenetic trees”. Bioinformatics<\/i> 36<\/b>:5007–5013. doi:10.1093/bioinformatics/btaa614<\/a>. <\/p>

Smith, M.R. (2022a). “Robust analysis of phylogenetic tree space”. Systematic Biology<\/i> 71<\/b>:1255–1270. doi:10.1093/sysbio/syab100<\/a>. <\/p>

Venna, J. & Kaski, S. (2001). “Neighborhood preservation in nonlinear projection methods: an experimental study”. In: Dorffner, G., Bischof, H. & Hornik, K. (eds). Lecture Notes in Computer Science: Artificial Neural Networks—ICANN 2001<\/i> Springer, Berlin. 485–491. doi:10.1007/3-540-44668-0_68<\/a>. <\/p>\n

Clustering<\/h3>\nCluster consensus trees:

Stockham, C., Wang, L.-S. & Warnow, T. (2002). “Statistically based postprocessing of phylogenetic analysis by clustering”. Bioinformatics<\/i> 18<\/b>:S285–S293. doi:10.1093/bioinformatics/18.suppl_1.S285<\/a>. <\/p>\nk-means++:

Arthur, D. & Vassilvitskii, S (2007). “k-means++: the advantages of careful seeding”. Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms<\/i> 1027–1035. <\/p>

Hartigan, J.A. & Wong, M.A. (1979). “Algorithm AS 136: a K<\/i>-means clustering algorithm”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 28<\/b>:100–108. doi:10.2307/2346830<\/a>. <\/p>Partitioning around medoids:

Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M. & Hornik, K. (2022). “cluster: cluster analysis basics and extensions”. Comprehensive R Archive Network<\/i> <\/p>Hierarchical, minimax linkage:

Bien, J. & Tibshirani, R. (2011). “Hierarchical clustering with prototypes via minimax linkage”. Journal of the American Statistical Association<\/i> 106<\/b>:1075–1084. doi:10.1198/jasa.2011.tm10183<\/a>. <\/p>

Murtagh, F. (1983). “A survey of recent advances in hierarchical clustering algorithms”. The Computer Journal<\/i> 26<\/b>:354–359. doi:10.1093/comjnl/26.4.354<\/a>. <\/p>Clustering evaluation:

Rousseeuw, P.J. (1987). “Silhouettes: a graphical aid to the interpretation and validation of cluster analysis”. Journal of Computational and Applied Mathematics<\/i> 20<\/b>:53–65. doi:10.1016/0377-0427(87)90125-7<\/a>. <\/p>\n

Rogue taxa<\/h3>\nDetection:

Smith, M.R. (2022b). “Using information theory to detect rogue taxa and improve consensus trees”. Systematic Biology<\/i> 71<\/b>:1088–1094. doi:10.1093/sysbio/syab099<\/a>. <\/p>\nPlotting:

Klopfstein, S. & Spasojevic, T. (2019). “Illustrating phylogenetic placement of fossils using RoguePlots: An example from ichneumonid parasitoid wasps (Hymenoptera, Ichneumonidae) and an extensive morphological matrix.”. PLoS ONE<\/i> 14<\/b>:e0212942. doi:10.1371/journal.pone.0212942<\/a>. <\/p>\nCharacter analysis:

Pol, D. & Escapa, I.H. (2009). “Unstable taxa in cladistic analysis: identification and the assessment of relevant characters”. Cladistics<\/i> 25<\/b>:515–527. doi:10.1111/j.1096-0031.2009.00258.x<\/a>. <\/p>", + "deps": [ + + ] + }, + "search-results": { + "html": "125 trees in memory: 125 sampled with scores 25.9279 to 29.8759 (k = 10)", + "deps": [ + + ] + } + }, + "export": { + "searchCount": 0 + } +} diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-003_.png b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-003_.png new file mode 100644 index 000000000..046181986 Binary files /dev/null and b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-003_.png differ diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-004.json b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-004.json new file mode 100644 index 000000000..6b15f20c3 --- /dev/null +++ b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-004.json @@ -0,0 +1,92 @@ +{ + "input": { + "clustering-clThresh": 1, + "consensus-concordance": "none", + "consensus-consP": 1, + "consensus-excludedTip": "Paterimitra", + "consensus-keepNTips": 49, + "consensus-mapDisplay": null, + "consensus-neverDrop": null, + "consensus-outgroup": "Namacalathus", + "consensus-plottedChar": 1, + "consensus-searchChar": "", + "consensus-whichTree": 0, + "data-dataFile": null, + "data-dataSource": "Sun2018", + "data-nTree": 125, + "data-readxlSkip": 2, + "data-readxlSkipCols": 2, + "data-readxl_sheet": "Sheet 1", + "data-treeFile": null, + "data-treeRange": [ + 1, + 125 + ], + "distMeth": "cid", + "mapLines": [ + "hull", + "mst" + ], + "plotFormat": "clus", + "plotSize": 600, + "search-cancel": 0, + "search-go": 0, + "search-searchConfig": 0, + "treespace-relators": null, + "treespace-spaceCol": "clust", + "treespace-spaceDim": 5, + "treespace-spacePch": "relat" + }, + "output": { + "consensus-branchLegend": null, + 
"consensus-treePlot": { + "src": "[image data hash: 52ded580f2796169fcba68e660f5c64b]", + "alt": "Plot object", + "coordmap": { + "panels": [ + { + "domain": { + "left": -0.290659071367388, + "right": 7.557135855552088, + "bottom": -0.9199999999999999, + "top": 50.92 + }, + "range": { + "left": 0, + "right": 600, + "bottom": 596.12, + "top": 1.880000000000082 + }, + "log": { + "x": null, + "y": null + }, + "mapping": { + + } + } + ], + "dims": { + "width": 600, + "height": 600 + } + }, + "class": "shiny-scalable" + }, + "refs-references": { + "html": "

References for methods used<\/h2>\n

Tree search<\/h3>\n

Brazeau, M.D., Guillerme, T. & Smith, M.R. (2019). “An algorithm for morphological phylogenetic analysis with inapplicable data”. Systematic Biology<\/i> 64<\/b>:619–631. doi:10.1093/sysbio/syy083<\/a>. <\/p>

Brazeau, M.D., Smith, M.R. & Guillerme, T. (2017). “MorphyLib: a library for phylogenetic analysis of categorical trait data with inapplicability”. doi:10.5281/zenodo.815371<\/a>. <\/p>

Nixon, K.C. (1999). “The Parsimony Ratchet, a new method for rapid parsimony analysis”. Cladistics<\/i> 15<\/b>:407–414. doi:10.1111/j.1096-0031.1999.tb00277.x<\/a>. <\/p>

Smith, M.R. (2023). “TreeSearch: morphological phylogenetic analysis in R”. R Journal<\/i> 14<\/b>:305–315. doi:10.32614/RJ-2023-019<\/a>. <\/p>\n

Tree space mapping<\/h3>\n

Gower, J.C. (1966). “Some distance properties of latent root and vector methods used in multivariate analysis”. Biometrika<\/i> 53<\/b>:325–338. doi:10.2307/2333639<\/a>. <\/p>

Gower, J.C. & Ross, G.J.S. (1969). “Minimum spanning trees and single linkage cluster analysis”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 18<\/b>:54–64. doi:10.2307/2346439<\/a>. <\/p>

Kaski, S., Nikkilä, J., Oja, M., Venna, J., Törönen, P. & Castrén, E. (2003). “Trustworthiness and metrics in visualizing similarity of gene expression”. BMC Bioinformatics<\/i> 4<\/b>:48. doi:10.1186/1471-2105-4-48<\/a>. <\/p>

R Core Team (2020). “R: A language and environment for statistical computing”. R Foundation for Statistical Computing, Vienna, Austria. <\/p>

Smith, M.R. (2020a). “TreeDist: distances between phylogenetic trees”. Comprehensive R Archive Network<\/i> doi:10.5281/zenodo.3528123<\/a>. <\/p>

Smith, M.R. (2020b). “Information theoretic Generalized Robinson-Foulds metrics for comparing phylogenetic trees”. Bioinformatics<\/i> 36<\/b>:5007–5013. doi:10.1093/bioinformatics/btaa614<\/a>. <\/p>

Smith, M.R. (2022a). “Robust analysis of phylogenetic tree space”. Systematic Biology<\/i> 71<\/b>:1255–1270. doi:10.1093/sysbio/syab100<\/a>. <\/p>

Venna, J. & Kaski, S. (2001). “Neighborhood preservation in nonlinear projection methods: an experimental study”. In: Dorffner, G., Bischof, H. & Hornik, K. (eds). Lecture Notes in Computer Science: Artificial Neural Networks—ICANN 2001<\/i> Springer, Berlin. 485–491. doi:10.1007/3-540-44668-0_68<\/a>. <\/p>\n

Clustering<\/h3>\nCluster consensus trees:

Stockham, C., Wang, L.-S. & Warnow, T. (2002). “Statistically based postprocessing of phylogenetic analysis by clustering”. Bioinformatics<\/i> 18<\/b>:S285–S293. doi:10.1093/bioinformatics/18.suppl_1.S285<\/a>. <\/p>\nk-means++:

Arthur, D. & Vassilvitskii, S (2007). “k-means++: the advantages of careful seeding”. Proceedings of the Eighteenth Annual ACM-SIAM Symposium on Discrete Algorithms<\/i> 1027–1035. <\/p>

Hartigan, J.A. & Wong, M.A. (1979). “Algorithm AS 136: a K<\/i>-means clustering algorithm”. Journal of the Royal Statistical Society Series C (Applied Statistics)<\/i> 28<\/b>:100–108. doi:10.2307/2346830<\/a>. <\/p>Partitioning around medoids:

Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M. & Hornik, K. (2022). “cluster: cluster analysis basics and extensions”. Comprehensive R Archive Network<\/i> <\/p>Hierarchical, minimax linkage:

Bien, J. & Tibshirani, R. (2011). “Hierarchical clustering with prototypes via minimax linkage”. Journal of the American Statistical Association<\/i> 106<\/b>:1075–1084. doi:10.1198/jasa.2011.tm10183<\/a>. <\/p>

Murtagh, F. (1983). “A survey of recent advances in hierarchical clustering algorithms”. The Computer Journal<\/i> 26<\/b>:354–359. doi:10.1093/comjnl/26.4.354<\/a>. <\/p>Clustering evaluation:

Rousseeuw, P.J. (1987). “Silhouettes: a graphical aid to the interpretation and validation of cluster analysis”. Journal of Computational and Applied Mathematics<\/i> 20<\/b>:53–65. doi:10.1016/0377-0427(87)90125-7<\/a>. <\/p>\n

Rogue taxa<\/h3>\nDetection:

Smith, M.R. (2022b). “Using information theory to detect rogue taxa and improve consensus trees”. Systematic Biology<\/i> 71<\/b>:1088–1094. doi:10.1093/sysbio/syab099<\/a>. <\/p>\nPlotting:

Klopfstein, S. & Spasojevic, T. (2019). “Illustrating phylogenetic placement of fossils using RoguePlots: An example from ichneumonid parasitoid wasps (Hymenoptera, Ichneumonidae) and an extensive morphological matrix.”. PLoS ONE<\/i> 14<\/b>:e0212942. doi:10.1371/journal.pone.0212942<\/a>. <\/p>\nCharacter analysis:

Pol, D. & Escapa, I.H. (2009). “Unstable taxa in cladistic analysis: identification and the assessment of relevant characters”. Cladistics<\/i> 25<\/b>:515–527. doi:10.1111/j.1096-0031.2009.00258.x<\/a>. <\/p>", + "deps": [ + + ] + }, + "search-results": { + "html": "125 trees in memory: 125 sampled with scores 25.9279 to 29.8759 (k = 10)", + "deps": [ + + ] + } + }, + "export": { + "searchCount": 0 + } +} diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-004_.png b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-004_.png new file mode 100644 index 000000000..6adc9c620 Binary files /dev/null and b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-004_.png differ diff --git a/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-005.json b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-005.json new file mode 100644 index 000000000..2b6da39af --- /dev/null +++ b/inst/Parsimony/tests/testthat/_snaps/Distribution/Distribution-005.json @@ -0,0 +1,104 @@ +{ + "input": { + "clustering-clThresh": 1, + "consensus-concordance": "none", + "consensus-consP": 1, + "consensus-excludedTip": "Paterimitra", + "consensus-keepNTips": 49, + "consensus-mapDisplay": null, + "consensus-neverDrop": null, + "consensus-outgroup": "Namacalathus", + "consensus-plottedChar": 1, + "consensus-searchChar": "", + "consensus-whichTree": 0, + "data-dataFile": null, + "data-dataSource": "Sun2018", + "data-nTree": 125, + "data-readxlSkip": 2, + "data-readxlSkipCols": 2, + "data-readxl_sheet": "Sheet 1", + "data-treeFile": null, + "data-treeRange": [ + 1, + 125 + ], + "distMeth": "cid", + "mapLines": [ + "hull", + "mst" + ], + "plotFormat": "ind", + "plotSize": 600, + "search-cancel": 0, + "search-go": 0, + "search-searchConfig": 0, + "treespace-relators": null, + "treespace-spaceCol": "clust", + "treespace-spaceDim": 5, + "treespace-spacePch": "relat" + }, + "output": { + "consensus-branchLegend": null, + 
"consensus-charMapLegend": { + "html": "

Brephic shell: Embryonic shell<\/h3>\n