From 8a0cd7923d6df461d05ba95d0da9742c65e9ef12 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 13:01:31 +0000 Subject: [PATCH 1/6] Implement channel-based notebook reporting process Agent-Logs-Url: https://github.com/KarchinLab/TCRtoolkit/sessions/3ad3c5f4-c1cd-4a3c-a8a8-638a69efe87c Co-authored-by: dimalvovs <1246862+dimalvovs@users.noreply.github.com> --- .gitignore | 9 + conf/modules.config | 8 + modules/local/compare/compare_plot.nf | 37 - modules/local/compare/gliph2.nf | 33 - modules/local/report/render_notebook.nf | 32 + modules/local/sample/sample_plot.nf | 38 - nextflow.config | 14 +- notebooks/template_details_part1.qmd | 104 + notebooks/template_details_part2.qmd | 108 + notebooks/template_gliph.qmd | 516 +++ notebooks/template_overlap.qmd | 1034 ++++++ notebooks/template_pheno_bulk.qmd | 508 +++ notebooks/template_pheno_sc.qmd | 652 ++++ notebooks/template_qc.qmd | 1938 ++++++++++++ notebooks/template_sample.qmd | 2772 +++++++++++++++++ notebooks/template_sharing.qmd | 863 +++++ subworkflows/local/compare.nf | 8 + subworkflows/local/patient.nf | 13 + subworkflows/local/report.nf | 31 + subworkflows/local/sample.nf | 15 +- .../local/report/render_notebook.nf.test | 72 + workflows/tcrtoolkit.nf | 21 + 22 files changed, 8705 insertions(+), 121 deletions(-) delete mode 100644 modules/local/compare/compare_plot.nf create mode 100644 modules/local/report/render_notebook.nf delete mode 100644 modules/local/sample/sample_plot.nf create mode 100644 notebooks/template_details_part1.qmd create mode 100644 notebooks/template_details_part2.qmd create mode 100644 notebooks/template_gliph.qmd create mode 100644 notebooks/template_overlap.qmd create mode 100644 notebooks/template_pheno_bulk.qmd create mode 100644 notebooks/template_pheno_sc.qmd create mode 100644 notebooks/template_qc.qmd create mode 100644 notebooks/template_sample.qmd create mode 100644 notebooks/template_sharing.qmd create 
mode 100644 subworkflows/local/report.nf create mode 100644 tests/modules/local/report/render_notebook.nf.test diff --git a/.gitignore b/.gitignore index 9c9b98a..bd0a2f6 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,15 @@ notebooks/* !notebooks/sample_stats_template.qmd !notebooks/compare_stats_template.qmd !notebooks/gliph2_report_template.qmd +!notebooks/template_qc.qmd +!notebooks/template_sample.qmd +!notebooks/template_sharing.qmd +!notebooks/template_overlap.qmd +!notebooks/template_pheno_bulk.qmd +!notebooks/template_pheno_sc.qmd +!notebooks/template_gliph.qmd +!notebooks/template_details_part1.qmd +!notebooks/template_details_part2.qmd ## Bash tmp diff --git a/conf/modules.config b/conf/modules.config index ac213a6..748e140 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -47,4 +47,12 @@ process { label = params.matrix_sparsity == 'sparse' ? 'process_medium' : ['process_high', 'process_high_memory'] } + withName: RENDER_NOTEBOOK { + publishDir = [ + path: { "${params.outdir}/reports" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } + } \ No newline at end of file diff --git a/modules/local/compare/compare_plot.nf b/modules/local/compare/compare_plot.nf deleted file mode 100644 index 1eb5792..0000000 --- a/modules/local/compare/compare_plot.nf +++ /dev/null @@ -1,37 +0,0 @@ -process COMPARE_PLOT { - label 'process_single' - - input: - path sample_utf8 - path jaccard_mat - path sorensen_mat - path morisita_mat - path compare_stats_template - val project_name - path all_sample_files - - output: - path 'compare_stats.html' - - script: - """ - ## copy quarto notebook to output directory - cp $compare_stats_template compare_stats.qmd - - ## render qmd report to html - # export QUARTO_DENO_EXTRA_OPTIONS=--v8-flags=--max-old-space-size=8192 - quarto render compare_stats.qmd \ - -P project_name:$project_name \ - -P workflow_cmd:'$workflow.commandLine' \ - -P jaccard_mat:$jaccard_mat \ - -P sorensen_mat:$sorensen_mat \ - -P morisita_mat:$morisita_mat \ - -P sample_utf8:$sample_utf8 \ - --to html - """ - - stub: - """ - touch compare_stats.qmd - """ -} diff --git a/modules/local/compare/gliph2.nf b/modules/local/compare/gliph2.nf index d1a4916..7ecbf45 100644 --- a/modules/local/compare/gliph2.nf +++ b/modules/local/compare/gliph2.nf @@ -50,36 +50,3 @@ process GLIPH2_TURBOGLIPH { cat \$input_file > ${patient}/local_similarities.txt """ } - -process GLIPH2_PLOT { - label 'process_low' - - input: - path gliph2_report_template - path(motifs) - path(clone_network) - path(cluster_member_details) - path(convergence_groups) - path(global_similarities) - path(local_similarities) - path(parameter) - - output: - path 'gliph2_report.html' - - script: - """ - ## copy quarto notebook to output directory - cp $gliph2_report_template gliph2_report.qmd - - ## render qmd report to html - quarto render gliph2_report.qmd \ - -P project_name:$params.project_name \ - -P workflow_cmd:'$workflow.commandLine' \ - -P results_dir:'./' \ - - # -P clusters:$cluster_member_details \ - # -P 
cluster_stats:$convergence_groups \ - --to html - """ -} diff --git a/modules/local/report/render_notebook.nf b/modules/local/report/render_notebook.nf new file mode 100644 index 0000000..567a004 --- /dev/null +++ b/modules/local/report/render_notebook.nf @@ -0,0 +1,32 @@ +// Generic process to render a Quarto notebook to HTML +process RENDER_NOTEBOOK { + tag "${report_name}" + label 'process_single' + + input: + tuple val(report_name), path(notebook), val(data_dir) + val project_name + val workflow_cmd + + output: + path "${report_name}.html" + + script: + """ + ## copy quarto notebook to working directory + cp $notebook ${report_name}.qmd + + ## render qmd report to html + quarto render ${report_name}.qmd \\ + -P project_name:${project_name} \\ + -P workflow_cmd:'${workflow_cmd}' \\ + -P project_dir:${data_dir} \\ + -P sample_table:${params.samplesheet} \\ + --to html + """ + + stub: + """ + touch ${report_name}.html + """ +} diff --git a/modules/local/sample/sample_plot.nf b/modules/local/sample/sample_plot.nf deleted file mode 100644 index e19dd16..0000000 --- a/modules/local/sample/sample_plot.nf +++ /dev/null @@ -1,38 +0,0 @@ -// process to plot sample level statistics -process SAMPLE_PLOT { - tag "${sample_stats_csv}" - label 'process_single' - - input: - path sample_table - path sample_stats_template - path sample_stats_csv - path v_family_csv - - output: - path 'sample_stats.html' - - script: - """ - ## copy quarto notebook to output directory - cp $sample_stats_template sample_stats.qmd - - ## render qmd report to html - quarto render sample_stats.qmd \ - -P project_name:$params.project_name \ - -P workflow_cmd:'$workflow.commandLine' \ - -P sample_table:$sample_table \ - -P sample_stats_csv:$sample_stats_csv \ - -P v_family_csv:$v_family_csv \ - -P samplechart_x_col:${params.samplechart_x_col} \ - -P samplechart_color_col:${params.samplechart_color_col} \ - -P vgene_subject_col:${params.vgene_subject_col} \ - -P vgene_x_cols:${params.vgene_x_cols} \ - 
--to html - """ - - stub: - """ - echo "1" - """ -} diff --git a/nextflow.config b/nextflow.config index 32f2048..7ec9563 100644 --- a/nextflow.config +++ b/nextflow.config @@ -31,8 +31,16 @@ params { // Sample + compare parameters workflow_level = "sample,compare" project_name = "tcrtoolkit_"+ new Date().format("yyyy-MM-dd_HH-mm-ss") - sample_stats_template = "${projectDir}/notebooks/sample_stats_template.qmd" - compare_stats_template = "${projectDir}/notebooks/compare_stats_template.qmd" + sample_stats_template = "${projectDir}/notebooks/template_sample.qmd" + compare_stats_template = "${projectDir}/notebooks/template_sharing.qmd" + + // Additional report templates + qc_template = "${projectDir}/notebooks/template_qc.qmd" + overlap_template = "${projectDir}/notebooks/template_overlap.qmd" + pheno_bulk_template = "${projectDir}/notebooks/template_pheno_bulk.qmd" + pheno_sc_template = "${projectDir}/notebooks/template_pheno_sc.qmd" + details_part1_template = "${projectDir}/notebooks/template_details_part1.qmd" + details_part2_template = "${projectDir}/notebooks/template_details_part2.qmd" // Sample stats metadata parameters samplechart_x_col = 'timepoint' @@ -50,7 +58,7 @@ params { // GLIPH2 parameters use_gliph2 = false - gliph2_report_template = "${projectDir}/notebooks/gliph2_report_template.qmd" + gliph2_report_template = "${projectDir}/notebooks/template_gliph.qmd" ref_files = "${projectDir}/assets/gliph2_files" local_min_pvalue = "0.001" diff --git a/notebooks/template_details_part1.qmd b/notebooks/template_details_part1.qmd new file mode 100644 index 0000000..283df9c --- /dev/null +++ b/notebooks/template_details_part1.qmd @@ -0,0 +1,104 @@ +--- +title: "Details" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: false + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +execute: + cache: false + warnings: false +jupyter: python3 +--- + +Thank you for using TCRtoolkit! 
This report is generated from the data you provided. + +:::{.callout-note collapse="true"} +## Document Information +**Current Version:** 1.0-beta +**Last Updated:** March 2026 +**Maintainer:** BTC Data Science Team +**Notes:** +::: + +::: {.callout-note collapse="true"} +## Notebook Analysis Scope +This notebook a more detailed analysis of the samples being analyzed. +::: + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name='' +project_dir='' +sample_table='' + +timepoint_col = 'timepoint' +timepoint_order_col = 'timepoint_order' +alias_col = 'alias' +subject_col = 'subject_id' + +``` + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}/{project_name}" + +``` + +# Before You Begin + +This pipeline can be used to analyze both **single-cell and bulk TCR data**. Please see the note below to understand some of the **implications** depending on the data type you have: + +::: {.callout-note title="Single-cell vs Bulk Data analysis" collapse="true"} +**Definition of “counts”** +- **Single-cell**: + `counts` represent the number of distinct cells carrying a specific clonotype. For example, a count of 12 indicates that 12 individual cells were encapsulated and sequenced. +- **Bulk**: + `counts` represent the abundance of sequencing reads (or UMIs) supporting a clonotype. The biological interpretation depends heavily on the starting material: + + - **RNA (cDNA):** Counts are a composite metric of Cellular Abundance $\times$ Transcriptional Expression. Since activation status affects TCR mRNA levels, a high count could indicate a large clone or a highly active small clone. 
Normalization strategies can mitigate, but not eliminate, this expression bias. + - **DNA (gDNA):** Counts are a direct proxy for Cell Number (e.g., Adaptive ImmunoSEQ). Because T-cell genomic templates are constant (one productive rearrangement per cell), DNA sequencing avoids expression bias and allows for accurate estimation of clone size. + +**TCR chains** +- **Single-cell**: + It's common to have paired α/β chains per cell. However we only focus on the Beta chain here. +- **Bulk**: + In bulk repertoire sequencing, you usually amplify TCRα and TCRβ chains separately. The resulting data contains lists of α clonotypes and lists of β clonotypes, but no information about which α and β belong to the same T cell. We focus only on the Beta chain. + +**Diversity & clonality metrics** +- **Single-cell**: + Sensitive to sampling (10^3 – 10^5 cells typical). + Rare clonotypes may be missed, but you can study functional heterogeneity within clones. +- **Bulk**: + Captures broad repertoire diversity (10^5 – 10^6 clonotypes). + More accurate for richness, evenness, overlap across samples. + +**Downstream biological analyses** +- **Single-cell**: + It is possile to link TCRs to phenotypic states (exhaustion, activation, tissue localization), which allows the study of clonotype heterogeneity. +- **Bulk**: + It focuses on population-level measures +::: + + +{{< include ./template_sample.qmd >}} + diff --git a/notebooks/template_details_part2.qmd b/notebooks/template_details_part2.qmd new file mode 100644 index 0000000..fbe5362 --- /dev/null +++ b/notebooks/template_details_part2.qmd @@ -0,0 +1,108 @@ +--- +title: "Details" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: false + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +execute: + cache: false + warnings: false +jupyter: python3 +--- + +Thank you for using TCRtoolkit! This report is generated from the data you provided. 
+ +:::{.callout-note collapse="true"} +## Document Information +**Current Version:** 1.0-beta +**Last Updated:** March 2026 +**Maintainer:** BTC Data Science Team +**Notes:** +::: + +::: {.callout-note collapse="true"} +## Notebook Analysis Scope +This notebook a more detailed analysis of the samples being analyzed. +::: + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name='' +project_dir='' +sample_table='' + +timepoint_col = 'timepoint' +timepoint_order_col = 'timepoint_order' +alias_col = 'alias' +subject_col = 'subject_id' + +``` + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}/{project_name}" + +``` + +# Before You Begin + +This pipeline can be used to analyze both **single-cell and bulk TCR data**. Please see the note below to understand some of the **implications** depending on the data type you have: + +::: {.callout-note title="Single-cell vs Bulk Data analysis" collapse="true"} +**Definition of “counts”** +- **Single-cell**: + `counts` represent the number of distinct cells carrying a specific clonotype. For example, a count of 12 indicates that 12 individual cells were encapsulated and sequenced. +- **Bulk**: + `counts` represent the abundance of sequencing reads (or UMIs) supporting a clonotype. The biological interpretation depends heavily on the starting material: + + - **RNA (cDNA):** Counts are a composite metric of Cellular Abundance $\times$ Transcriptional Expression. Since activation status affects TCR mRNA levels, a high count could indicate a large clone or a highly active small clone. Normalization strategies can mitigate, but not eliminate, this expression bias. 
+ - **DNA (gDNA):** Counts are a direct proxy for Cell Number (e.g., Adaptive ImmunoSEQ). Because T-cell genomic templates are constant (one productive rearrangement per cell), DNA sequencing avoids expression bias and allows for accurate estimation of clone size. + +**TCR chains** +- **Single-cell**: + It's common to have paired α/β chains per cell. However we only focus on the Beta chain here. +- **Bulk**: + In bulk repertoire sequencing, you usually amplify TCRα and TCRβ chains separately. The resulting data contains lists of α clonotypes and lists of β clonotypes, but no information about which α and β belong to the same T cell. We focus only on the Beta chain. + +**Diversity & clonality metrics** +- **Single-cell**: + Sensitive to sampling (10^3 – 10^5 cells typical). + Rare clonotypes may be missed, but you can study functional heterogeneity within clones. +- **Bulk**: + Captures broad repertoire diversity (10^5 – 10^6 clonotypes). + More accurate for richness, evenness, overlap across samples. + +**Downstream biological analyses** +- **Single-cell**: + It is possile to link TCRs to phenotypic states (exhaustion, activation, tissue localization), which allows the study of clonotype heterogeneity. 
+- **Bulk**: + It focuses on population-level measures +::: + +{{< include ./template_overlap.qmd >}} + +{{< include ./template_sharing.qmd >}} + +{{< include ./template_pheno_sc_details.qmd >}} + +{{< include ./template_gliph.qmd >}} diff --git a/notebooks/template_gliph.qmd b/notebooks/template_gliph.qmd new file mode 100644 index 0000000..98560bf --- /dev/null +++ b/notebooks/template_gliph.qmd @@ -0,0 +1,516 @@ +--- +title: "TCRtoolkit GLIPH2 Report" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +jupyter: python3 +--- + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name = '' +project_dir = '' +sample_table = '' + +``` + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}" + +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_csv = f"{project_dir}/annotate/concatenated_cdr3.tsv" # f"{project_dir}/compare/concatenated_cdr3.txt" + +cluster_details_csv = f"{project_dir}/gliph2/cluster_member_details.txt" +clone_network_csv = f"{project_dir}/gliph2/clone_network.txt" +all_motifs_csv = f"{project_dir}/gliph2/all_motifs.txt" +cluster_member_details_csv = f"{project_dir}/gliph2/cluster_member_details.txt" +global_similarities_csv = f"{project_dir}/gliph2/global_similarities.txt" +local_similarities_csv = f"{project_dir}/gliph2/local_similarities.txt" +convergence_groups_csv = f"{project_dir}/gliph2/convergence_groups.txt" + +# Define dirs +tcrpheno_dir = f"{project_dir}/tcrpheno/" +pseudobulk_dir = f"{project_dir}/pseudobulk/" + +``` + +```{python} +# Load 
Packages +from IPython.display import Image +import os +import datetime +import sys +import pandas as pd +import math +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.colors import LinearSegmentedColormap +import plotly.express as px +import plotly.graph_objects as go +import numpy as np +import matplotlib.ticker as ticker +from scipy.stats import gaussian_kde +import h5py +import glob +from scipy.sparse import csr_matrix +import scipy.cluster.hierarchy as sch +# import networkx as nx +import itertools +import igraph as ig +import logomaker +import io +import base64 +import json +from IPython.display import HTML, display +import warnings +import matplotlib.pyplot as plt +from upsetplot import from_contents, plot as upset_plot + + +# Print pipeline parameters + +print('Project Name: ' + project_name) +print('Workflow command: ' + workflow_cmd) +print('Pipeline Directory: ' + project_dir) +print('Date and time: ' + str(datetime.datetime.now())) + +# - Loading data + +# Load metadata +## Reading sample metadata +meta = pd.read_csv(sample_table, sep=',') + +## Reading concatenated cdr3 file +concat_df = pd.read_csv(concat_csv, sep='\t') + +# Reading compare outputs +cluster_details = pd.read_csv(cluster_details_csv, sep='\t') +clone_network = pd.read_csv(clone_network_csv, sep='\t', header=None) +all_motifs = pd.read_csv(all_motifs_csv, sep='\t') +cluster_member_details = pd.read_csv(cluster_member_details_csv, sep='\t') +global_similarities = pd.read_csv(global_similarities_csv, sep='\t') +local_similarities = pd.read_csv(local_similarities_csv, sep='\t') +convergence_groups = pd.read_csv(convergence_groups_csv, sep='\t') + +``` + +## GLIPH2: TCRB CDR3 motif clusters {#sec-GLIPH2} + +**GLIPH2** (Grouping of Lymphocyte Interactions by Paratope Hotspots, version 2) is a computational tool used in TCR sequence analysis to **identify clusters of TCRs that likely recognize the same or similar antigens**. 
+ +By grouping TCRs with shared sequence motifs, particularly in the CDR3 regions (the main antigen-binding region), GLIPH2 helps infer functional relationships and **predict which TCRs might target the same or related epitopes**. + +For the current stage of analysis, we are focusing solely on the **CDR3 region of the TCR beta chain (TCRB)**. a network was created to facilitate result visualization. + +```{python} +#| label: fig-gliph-network +#| fig-cap: "**Specificity groups across patients** Each circle (node) represents a unique TCR clonotype across all data. Nodes are connected if they share a common sequence motif, placing them into the same specificity group." + +import plotly.graph_objects as go +import plotly.express as px +import pandas as pd +import igraph as ig + +# --- 1. Prepare Data & Hover Info --- +# Merge metadata +# Ensure 'cluster_details' has 'CDR3b' column matching 'source'/'target' +merged_meta = pd.merge(cluster_details, meta[['sample', subject_col, alias_col]], on='sample', how='left').dropna(subset=[subject_col]) + +# A. Categorize Nodes (Public vs Private) +# Count unique subjects per CDR3 +cdr3_subject_counts = merged_meta.groupby('CDR3b')[subject_col].nunique() +public_cdr3s = set(cdr3_subject_counts[cdr3_subject_counts > 1].index) + +# B. Create Detailed Hover Text +# Group by CDR3b -> Subject -> List of Aliases +def create_hover_string(group): + patient_summaries = [] + # Sort subjects for consistent order + for subj, subgroup in group.groupby(subject_col): + aliases = ", ".join(sorted(subgroup[alias_col].unique().astype(str))) + patient_summaries.append(f"{subj} ({aliases})") + return "
".join(patient_summaries) + +# Apply to all CDR3s +hover_map = merged_meta.groupby('CDR3b').apply(create_hover_string, include_groups=False).to_dict() + +# C. Create Category Map +# Map private clones to their subject_id +private_df = merged_meta[~merged_meta['CDR3b'].isin(public_cdr3s)] +# Take the first subject found (since there is only one) +category_map = dict(zip(private_df['CDR3b'], private_df[subject_col])) + +def get_node_category(cdr3): + if cdr3 in public_cdr3s: + return "Shared" + return category_map.get(cdr3, "Unknown") + +# --- 2. Build Graph --- +clone_network.columns = ["source", "target", "type", "group"] +clone_network = clone_network[clone_network['type'] != 'singleton'] + +edges = clone_network[["source", "target"]].values.tolist() +g = ig.Graph.TupleList(edges, directed=False) + +# Assign attributes +g.vs['category'] = [get_node_category(v['name']) for v in g.vs] +# Pre-fetch hover text for efficiency +g.vs['hover_detail'] = [hover_map.get(v['name'], "No Data") for v in g.vs] + +# Layout +layout = g.layout("fr") +coords = list(map(tuple, layout)) + +# --- 3. 
Plotting --- +fig = go.Figure() + +# -- Edge Trace -- +edge_x, edge_y = [], [] +for edge in g.es: + src, tgt = edge.tuple + x0, y0 = coords[src] + x1, y1 = coords[tgt] + edge_x.extend([x0, x1, None]) + edge_y.extend([y0, y1, None]) + +fig.add_trace(go.Scatter( + x=edge_x, y=edge_y, + line=dict(width=0.5, color='#cccccc'), + hoverinfo='none', + mode='lines', + showlegend=False +)) + +# -- Node Traces -- +categories = sorted(list(set(g.vs['category']))) + +# Define Colors +base_colors = px.colors.qualitative.Bold +color_map = {cat: base_colors[i % len(base_colors)] for i, cat in enumerate(categories)} +color_map["Shared"] = "black" # Shared = Black + +for cat in categories: + indices = [i for i, v in enumerate(g.vs) if v['category'] == cat] + if not indices: continue + + node_x = [coords[i][0] for i in indices] + node_y = [coords[i][1] for i in indices] + + # Retrieve hover text + hover_texts = [ + f"CDR3: {g.vs[i]['name']}
Found in:
{g.vs[i]['hover_detail']}" + for i in indices + ] + + # --- SIZE ADJUSTMENT HERE --- + # Shared = 18 (Big), Private = 12 (Medium) + node_size = 18 if cat == "Shared" else 12 + + # Add border to shared nodes for clarity + line_width = 2 if cat == "Shared" else 1 + line_color = 'white' + + fig.add_trace(go.Scatter( + x=node_x, y=node_y, + mode='markers', + name=str(cat), + text=hover_texts, + hoverinfo='text', + marker=dict( + color=color_map[cat], + size=node_size, + line=dict(width=line_width, color=line_color) + ) + )) + +# --- 4. Final Layout --- +fig.update_layout( + title='TCRB Network', + plot_bgcolor='white', + width=1000, height=800, # Increased width slightly + xaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + yaxis=dict(showgrid=False, zeroline=False, showticklabels=False), + margin=dict(t=40, b=20, l=10, r=10), + legend_title_text='Category' +) + +fig.show() + + +``` + +**How to Extract Biological Knowledge from the Figure?** + +The GLIPH2 network visualizes structural convergence within the T-cell repertoire. Each node represents a unique TCR clonotype, and edges connect clonotypes that share a conserved sequence motif. By evaluating the topology and composition of these clusters, we can infer the nature of the immune response. + +1. Cluster Size and Density (Assessing Convergence) +The size of a cluster directly correlates with the degree of immune convergence against a specific target. + + - Large, Dense Clusters: Indicate a highly convergent, immunodominant response. Multiple distinct T-cell lineages have been recruited and expanded to target the exact same antigen. + + - Small, Fragmented Clusters (Dyads/Triads): A landscape of isolated pairs suggests a highly diverse repertoire. The immune responses here are broadly distributed across many different targets rather than focused heavily on a single dominant antigen. + +2. Cluster Composition (Public vs. 
Private Immunity) +By mapping metadata (such as cohort or patient ID) to the node colors, we can classify the structural responses as private or shared: + + - Monochromatic Clusters (Private Convergence): Clusters composed entirely of nodes from a single origin (e.g., only purple DFCI1 nodes) represent an intra-patient convergent response. The individual generated multiple distinct TCR variants against a private target, such as a patient-specific tumor neoantigen. + + - Mixed Clusters (Public Convergence): Clusters containing nodes from multiple different origins (e.g., connected purple and green nodes) represent public convergent immunity. Different individuals independently generated structurally similar TCRs, strongly implying a shared target, such as a common viral epitope or a shared driver mutation. + +3. Analyzing "Shared" Nodes +Nodes specifically designated as "Shared" (colored black) represent exact amino acid sequences found in multiple individuals. Observing the edges connected to these exact public clones reveals the broader structural "neighborhood" of private TCRs that likely recognize the same public antigen. + + +```{python} +#| label: fig-gliph-motifs +#| fig-cap: "**TCRB CDR3 motif lengths.** Histogram showing the distribution of amino acid motif lengths identified by GLIPH2 across all TCRs in the dataset." 
+ +# Calculate motif lengths +all_motifs["motif_length"] = all_motifs["motif"].str.len() + +# Count how many motifs of each length +length_counts = all_motifs["motif_length"].value_counts().reset_index() +length_counts.columns = ["motif_length", "count"] +length_counts = length_counts.sort_values("motif_length") + +# Plot bar chart +plt.figure(figsize=(8, 5)) +plt.bar(length_counts["motif_length"], length_counts["count"], + color="darkgreen", edgecolor="black") + +plt.title("Motif Length Distribution") +plt.xlabel("Motif Length") +plt.ylabel("Number of Motifs") +plt.xticks(length_counts["motif_length"]) +plt.grid(axis='y', linestyle='--', alpha=0.5) +plt.tight_layout() +plt.show() + +``` + +Knowing the length of the amino acid motifs is useful because it provides insight into both the biological plausibility of the findings and the likely functional relationship between the clustered T-cells. + +**Reflecting the Biophysics of Binding** +From a biological standpoint, **the interaction between a TCR and its target peptide-MHC is often driven by a small number of critical "contact points" or "hotspots" within the CDR3 loop.** These hotspots typically consist of a short stretch of 2-5 amino acids. When GLIPH2 identifies a large number of **motifs with a length of 3 or 4**, it serves as a crucial sanity check. It suggests the algorithm **is not just finding random statistical noise, but is successfully identifying patterns that are biologically plausible** and reflect the known mechanics of T-cell recognition. + +**Short Motifs (2-3 amino acids):** These are like a broad. They are **more likely to occur by chance** and may define more "promiscuous" groups of TCRs that share a common structural feature but might still bind to a range of similar peptides. + +**Long Motifs (4-5 amino acids):** These are like a specific. They are statistically much rarer and impose a much stronger constraint on the TCR's structure. 
Therefore, a group of TCRs defined by a **longer motif is very likely to be highly specific for the exact same antigen.** + +```{python} +#| output: asis +#| label: fig-motif-interactive-plotly +#| fig-cap: "**Top 10 TCR specificity motifs ranked by member count.** Bar chart displaying the most prevalent amino acid motifs identified by the GLIPH2 algorithm, ranked by the number of unique T-cell receptors (TCRs) that share each motif." + +import plotly.graph_objects as go + +def create_motif_tabs(all_motifs): + """ + Creates Quarto tabs containing a Plotly bar plot of top TCR motifs + for each motif length. + """ + # --- 1. Prepare the data --- + all_motifs['motif_length'] = all_motifs['motif'].str.len() + all_motifs.dropna(subset=['motif', 'motif_length'], inplace=True) + all_motifs['motif_length'] = all_motifs['motif_length'].astype(int) + + unique_lengths = sorted(all_motifs['motif_length'].unique()) + + if not unique_lengths: + print("No valid motif lengths found to plot.") + return + + # --- 2. Start Quarto Tabset --- + print("::: {.panel-tabset}\n") + + # --- 3. 
Process each motif length and create a separate plot --- + for length in unique_lengths: + # Print the tab header + print(f"## Length {length}\n") + + # Filter data and grab top 10 + df_by_length = all_motifs[all_motifs['motif_length'] == length] + top10 = df_by_length.sort_values("num_in_sample", ascending=False).head(10) + + # Build a standalone figure just for this length + fig = go.Figure( + data=[ + go.Bar( + x=top10["motif"], + y=top10["num_in_sample"], + marker_color='tomato', + marker_line_color='black', + marker_line_width=1.5 + ) + ] + ) + + # Keep the layout clean and tight for the tab environment + fig.update_layout( + title_text=f"Top 10 Motifs of Length {length} by TCR Members", + xaxis_title="Motif", + yaxis_title="Number of TCRs", + font=dict(size=12), + plot_bgcolor='white', + margin=dict(t=40, b=20, l=10, r=10) + ) + + fig.show() + + # Add necessary spacing so Quarto doesn't blend the HTML and Markdown + print("\n\n") + + # --- 4. Close Quarto Tabset --- + print(":::\n") + +# --- Run --- +create_motif_tabs(all_motifs) + +``` + +This bar chart ranks the top conserved amino acid motifs found by GLIPH2 based on the number of unique T-cell receptors (TCRs) that belong to each specificity group. Each bar represents a specific motif (e.g., 'GG'), and its height indicates the size of that "TCR neighborhood"—the total number of different TCRs that share this core binding pattern. + +Identifying the motifs shared by the largest number of unique TCRs helps **pinpoint the most immunodominant recognition strategies used by the T-cells** in your dataset; you are effectively highlighting the **most common solutions the immune system has evolved to target key antigens**, thus providing a data-driven way to prioritize the most significant TCR communities for further study. 
However, it is crucial to interpret these findings with care, as **the most frequent motifs are often short and may not always define the most functionally specific T-cell groups compared to smaller groups with longer, more complex motifs.** + +## GLIPH2 global Motifs +GLIPH2 identifies TCRB CDR3 clusters based on either local motifs (short, position-restricted patterns within 3 amino acids) or **global similarity (requiring identical CDR3 length and differences at the same positions).** Visualizing global motifs as sequence logos reveals the conserved amino acid preferences at each position, and can provide insights into the sequence features that potentially drive shared antigen recognition. + +```{python} +#| echo: false +#| label: fig-motif-logo +#| fig-cap: "**Logo visualization of the top 10 global motifs based on TCR members.**" + +# 1. Sort and select top 10 motifs +top10 = global_similarities.sort_values("cluster_size", ascending=False).head(10).copy() + +# 2. Generate plots and store them as in-memory images +plot_data = [] + +# --- WARNING SUPPRESSION START --- +# We catch warnings here because logomaker uses deprecated pandas setting methods +with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + + for i, row in top10.iterrows(): + motif_name = f"Motif {row['cluster_tag']} (Size: {row['cluster_size']})" + seqs = [s.strip() for s in str(row["CDR3b"]).split(" ") if s.strip()] + + if len(seqs) >= 2: + try: + # Create a counts matrix + counts_mat = logomaker.alignment_to_matrix(seqs, to_type="probability") + + # Create plot but don't show it + fig, ax = plt.subplots(figsize=(10, 3)) + logo = logomaker.Logo(counts_mat, ax=ax) + ax.set_title(motif_name) + fig.tight_layout() + + # Save plot to an in-memory buffer + buf = io.BytesIO() + fig.savefig(buf, format='png', bbox_inches='tight') + buf.seek(0) + + # Encode image to Base64 and decode to a string + img_str = base64.b64encode(buf.read()).decode('utf-8') + + # Add data to our 
list + plot_data.append({ + "title": motif_name, + "image_src": f"data:image/png;base64,{img_str}" + }) + + # Close the figure to free up memory + plt.close(fig) + + except Exception as e: + print(f"Could not process motif {row['cluster_tag']}: {e}") +# --- WARNING SUPPRESSION END --- + +# 3. Create the HTML and JavaScript component for the interactive viewer +# Convert the Python list of plot data into a JSON string for JavaScript +plots_json = json.dumps(plot_data) + +html_template = f""" +
+

+ +
+ + + +
+
+ + +""" + +# 4. Display the HTML component in the notebook +if plot_data: + display(HTML(html_template)) +else: + print("No valid motifs with 2 or more sequences were found to plot.") + +``` diff --git a/notebooks/template_overlap.qmd b/notebooks/template_overlap.qmd new file mode 100644 index 0000000..137a514 --- /dev/null +++ b/notebooks/template_overlap.qmd @@ -0,0 +1,1034 @@ +--- +title: "TCRtoolkit Overlap Report" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +jupyter: python3 +--- + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name = '' +project_dir = '' +sample_table = '' + +``` + + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}" + +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_csv = f"{project_dir}/annotate/concatenated_cdr3.tsv" # f"{project_dir}/compare/concatenated_cdr3.txt" +morisita_mat= f"{project_dir}/compare/morisita_mat.csv" + +``` + + +```{python} +#| tags: [setup] +#| warning: false + +# Load Packages +from IPython.display import Image, display, Markdown, HTML +from matplotlib.colors import LinearSegmentedColormap +from matplotlib_venn import venn2, venn3 +from bokeh.models import HoverTool +from bokeh.palettes import Viridis256 +from bokeh.transform import linear_cmap +from holoviews import opts +from io import StringIO +from scipy.sparse import csr_matrix +from scipy.stats import gaussian_kde, fisher_exact +from statsmodels.stats.multitest import multipletests +from scipy.cluster.hierarchy import linkage, leaves_list +from 
scipy.spatial.distance import pdist + +import base64 +import datetime +import glob +import h5py +import holoviews as hv +import igraph as ig +import io +import itertools +import logomaker +import math +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +import networkx as nx +import numpy as np +import os +import pandas as pd +import pathlib +import plotly.express as px +import plotly.graph_objects as go +import scipy.cluster.hierarchy as sch +import seaborn as sns +import shutil +import sys +import upsetplot +import warnings +from pathlib import Path +import re + + +# Print Pipeline Information +# print('Pipeline information and parameters:' + '\n') +# print('Project Name: ' + project_name) +# print('Workflow command: ' + workflow_cmd) +# print('Pipeline Directory: ' + project_dir) +# print('Date and time: ' + str(datetime.datetime.now())) + +# morisita similarity data +morisita_df = pd.read_csv(morisita_mat, sep=',', header=0, index_col=0) +morisita_df.index = morisita_df.index.str.replace('_airr', '', regex=False) # Eliminate suffic from index +morisita_df.index = morisita_df.index.str.replace('_pseudobulk', '', regex=False) +morisita_df.columns = morisita_df.columns.str.replace('_airr', '', regex=False) # Eliminate suffic from colnames +morisita_df.columns = morisita_df.columns.str.replace('_pseudobulk', '', regex=False) + +## Reading sample metadata +meta = pd.read_csv(sample_table, sep=',') + +## Reading concatenated cdr3 file +concat_df = pd.read_csv(concat_csv, sep='\t') +concat_df = concat_df.merge(meta[['sample', 'origin', subject_col, alias_col, timepoint_col, timepoint_order_col]], on='sample', how='left') +meta = meta.set_index('sample', drop=False) + + +``` + +## Clonal Dynamics + +```{python} +#| output: asis +#| echo: false + +# Define Thresholds +fc_thold_up = 2 +signif_thold = 0.05 + +def create_styled_table(df, title="", max_height='400px', formatter=None): + """ + Applies custom styling to a DataFrame to match the desired theme, 
+ hides the index, adds a title, and places it in a scrollable container. + """ + if df.empty: + print(f"{title}: No data to display.") + return + + # Define the CSS styles for the table + styles = [ + # Style for the table header (th) + dict(selector="th", props=[ + ("background-color", "paleturquoise"), + ("text-align", "left"), + ("font-weight", "bold"), + ("padding", "8px") # Add some padding + ]), + # Style for the data cells (td) + dict(selector="td", props=[ + ("background-color", "lavender"), + ("text-align", "left"), + ("padding", "8px"), + ("min-width", "120px") # Give columns enough space for names + ]), + # Style for the table caption (title) + dict(selector="caption", props=[ + ("caption-side", "top"), + ("font-size", "1.1em"), + ("font-weight", "bold"), + ("text-align", "left"), + ("margin-bottom", "10px") + ]) + ] + + # Apply styles and settings to the DataFrame + styler = df.style.set_table_styles(styles).set_caption(title).hide(axis='index') + + # Apply number formatting if provided + if formatter: + styler = styler.format(formatter, na_rep="NA") + + # Convert the styled table to an HTML string + html = styler.to_html() + + # Wrap the HTML table in a scrollable div container + scrollable_html = f'
{html}
' + + # Return an HTML object for display in the notebook + return HTML(scrollable_html) + +``` + +```{python} + +import pandas as pd +import numpy as np +import itertools +from scipy.stats import fisher_exact +from statsmodels.stats.multitest import multipletests + +# --- Setup & Pre-computation --- + +clonotypes_df = concat_df.copy() +clonotypes_df = clonotypes_df.dropna(subset=[timepoint_col, 'origin']) +clone_cols = ['CDR3b', 'TRBV', 'TRBJ'] +merge_cols = clone_cols + [subject_col, 'origin'] + +clonotypes_df['timepoint_rank'] = clonotypes_df[timepoint_order_col] +clonotypes_df = clonotypes_df.sort_values('timepoint_rank') + +# Grouping by subject, origin, and timepoint for accurate total counts +sample_total_counts = clonotypes_df.groupby([subject_col, 'origin', timepoint_col])['counts'].sum().reset_index() +sample_total_counts.rename(columns={'counts': 'total_counts'}, inplace=True) +clonotypes_df = pd.merge(clonotypes_df, sample_total_counts, on=[subject_col, 'origin', timepoint_col]) +clonotypes_df['frequency'] = clonotypes_df['counts'] / clonotypes_df['total_counts'] + +# Helper to run fisher exact test on rows +def run_fisher(row, alt_hyp): + table = [[row['count_post'], row['total_post'] - row['count_post']], + [row['count_pre'], row['total_pre'] - row['count_pre']]] + return fisher_exact(table, alternative=alt_hyp)[1] + + +# ========================================== +# ----------- EXPANDING CLONES ----------- +# ========================================== + +fc_thold_up = 2.0 +signif_thold = 0.05 +top_clones_to_show = 15 + +all_significant_results = [] +comparison_details = [] +expansion_counts_list = [] + +for (subject, origin), subject_df in clonotypes_df.groupby([subject_col, 'origin']): + timepoints = subject_df.sort_values('timepoint_rank')[timepoint_col].unique().tolist() + if len(timepoints) < 2: + continue + + freq_pivot = subject_df.pivot_table(index=clone_cols, columns=timepoint_col, values='frequency', fill_value=0) + counts_pivot = 
subject_df.pivot_table(index=clone_cols, columns=timepoint_col, values='counts', fill_value=0) + subject_total_counts = subject_df.groupby(timepoint_col)['counts'].sum() + + for t_pre, t_post in itertools.combinations(timepoints, 2): + # 1. Vectorized DataFrame creation instead of iterrows + comp_df = pd.DataFrame({ + 'count_pre': counts_pivot[t_pre], + 'count_post': counts_pivot[t_post], + 'freq_pre': freq_pivot[t_pre], + 'freq_post': freq_pivot[t_post] + }).reset_index() + + # 2. Fast pre-filtering: Must have post counts > 0 to be an expansion + comp_df = comp_df[comp_df['count_post'] > 0].copy() + if comp_df.empty: + continue + + # 3. Vectorized Total Counts & Fold Change + comp_df['total_pre'] = subject_total_counts[t_pre] + comp_df['total_post'] = subject_total_counts[t_post] + + # Safe divide for fold change (np.where is extremely fast) + comp_df['fold_change'] = np.where(comp_df['freq_pre'] == 0, np.inf, comp_df['freq_post'] / comp_df['freq_pre']) + + # 4. Filter strictly by fold_change threshold BEFORE running expensive Fisher tests + # If it doesn't meet the FC threshold, it won't make your final cut anyway. + comp_df = comp_df[comp_df['fold_change'] > fc_thold_up].copy() + + if comp_df.empty: + continue + + # 5. 
Apply Fisher's Exact test only to the remaining rows + comp_df['p_value'] = comp_df.apply(run_fisher, axis=1, alt_hyp='greater') + + # Add metadata + comp_df[subject_col] = subject + comp_df['origin'] = origin + comp_df['t_pre'] = t_pre + comp_df['t_post'] = t_post + + # Run FDR correction + _, q_values, _, _ = multipletests(comp_df['p_value'], alpha=signif_thold, method='fdr_bh') + comp_df['q_value'] = q_values + + comparison_details.append(comp_df) + + # Get Significant Hits + significant_hits = comp_df[comp_df['q_value'] < signif_thold] + + if not significant_hits.empty: + counts_per_pair = pd.DataFrame({ + 't_pre': [t_pre], 't_post': [t_post], + 'expanded_count': [len(significant_hits)], + subject_col: [subject], 'origin': [origin] + }) + expansion_counts_list.append(counts_per_pair) + + significant_clones = significant_hits[clone_cols].drop_duplicates() + final_clones_df = freq_pivot.loc[pd.MultiIndex.from_frame(significant_clones)].reset_index() + final_clones_df[subject_col] = subject + final_clones_df['origin'] = origin + + cols = [subject_col, 'origin'] + clone_cols + timepoints + final_clones_df = final_clones_df[[c for c in cols if c in final_clones_df.columns]] + all_significant_results.append(final_clones_df) + +# --- Assemble Expanding Tables --- +expansion_counts_table = pd.DataFrame() +if expansion_counts_list: + expansion_counts_table = pd.concat(expansion_counts_list, ignore_index=True) + # Reorder columns slightly just to match previous layout + if not expansion_counts_table.empty: + expansion_counts_table = expansion_counts_table[[subject_col, 'origin', 't_pre', 't_post', 'expanded_count']] + +detailed_comparisons_table = pd.DataFrame() +if comparison_details: + detailed_comparisons_table = pd.concat(comparison_details, ignore_index=True) + +expanded_clones_summary_table = pd.DataFrame() +if all_significant_results: + expanded_clones_summary_table = pd.concat(all_significant_results, ignore_index=True).fillna(0) + 
expanded_clones_summary_table.rename(columns={t: f'freq_{t}' for t in expanded_clones_summary_table.columns if t not in merge_cols}, inplace=True) + +# --- Clean & Get Top Expanding Clones --- +table1_top = pd.DataFrame() +table2_top = pd.DataFrame() + +if not detailed_comparisons_table.empty: + for col in ['p_value', 'q_value', 'fold_change']: + detailed_comparisons_table[col] = detailed_comparisons_table[col].replace([np.inf, -np.inf], pd.NA) + + # Note: FC filter was already applied earlier, so this is just checking Q-value + significant_details = detailed_comparisons_table[ + (pd.to_numeric(detailed_comparisons_table['q_value'], errors='coerce') < signif_thold) + ].copy() + + if not significant_details.empty: + min_q_value_rows = significant_details.loc[significant_details.groupby(merge_cols)['q_value'].idxmin()] + top_clones = min_q_value_rows.sort_values('q_value', ascending=True).head(top_clones_to_show) + top_clone_identifiers = top_clones[merge_cols] + + if not expanded_clones_summary_table.empty: + table1_top = pd.merge(top_clone_identifiers, expanded_clones_summary_table, on=merge_cols, how='left') + + table2_top = pd.merge(top_clone_identifiers, detailed_comparisons_table, on=merge_cols, how='inner') + table2_top = table2_top[table2_top['q_value'] < signif_thold] + + +# ========================================== +# ----------- CONTRACTING CLONES ----------- +# ========================================== + +fc_thold_down = 0.5 +signif_thold = 0.05 + +all_significant_results_cont = [] +comparison_details_cont = [] +contraction_counts_list = [] + +for (subject, origin), subject_df in clonotypes_df.groupby([subject_col, 'origin']): + timepoints = subject_df.sort_values('timepoint_rank')[timepoint_col].unique().tolist() + if len(timepoints) < 2: + continue + + freq_pivot = subject_df.pivot_table(index=clone_cols, columns=timepoint_col, values='frequency', fill_value=0) + counts_pivot = subject_df.pivot_table(index=clone_cols, columns=timepoint_col, 
values='counts', fill_value=0) + subject_total_counts = subject_df.groupby(timepoint_col)['counts'].sum() + + for t_pre, t_post in itertools.combinations(timepoints, 2): + # 1. Vectorize + comp_df = pd.DataFrame({ + 'count_pre': counts_pivot[t_pre], + 'count_post': counts_pivot[t_post], + 'freq_pre': freq_pivot[t_pre], + 'freq_post': freq_pivot[t_post] + }).reset_index() + + # 2. Fast pre-filtering: Must have pre counts > 0 to contract + comp_df = comp_df[comp_df['count_pre'] > 0].copy() + if comp_df.empty: + continue + + # 3. Vectorized Total Counts & Fold Change + comp_df['total_pre'] = subject_total_counts[t_pre] + comp_df['total_post'] = subject_total_counts[t_post] + comp_df['fold_change'] = comp_df['freq_post'] / comp_df['freq_pre'] + + # 4. Filter by FC threshold BEFORE Fisher + comp_df = comp_df[comp_df['fold_change'] < fc_thold_down].copy() + + if comp_df.empty: + continue + + # 5. Apply Fisher + comp_df['p_value'] = comp_df.apply(run_fisher, axis=1, alt_hyp='less') + + # Add metadata + comp_df[subject_col] = subject + comp_df['origin'] = origin + comp_df['t_pre'] = t_pre + comp_df['t_post'] = t_post + + # Run FDR + _, q_values, _, _ = multipletests(comp_df['p_value'], alpha=signif_thold, method='fdr_bh') + comp_df['q_value'] = q_values + + comparison_details_cont.append(comp_df) + + # Get Significant Hits + significant_hits = comp_df[comp_df['q_value'] < signif_thold] + + if not significant_hits.empty: + counts_per_pair = pd.DataFrame({ + 't_pre': [t_pre], 't_post': [t_post], + 'contracted_count': [len(significant_hits)], + subject_col: [subject], 'origin': [origin] + }) + contraction_counts_list.append(counts_per_pair) + + significant_clones = significant_hits[clone_cols].drop_duplicates() + final_clones_df = freq_pivot.loc[pd.MultiIndex.from_frame(significant_clones)].reset_index() + final_clones_df[subject_col] = subject + final_clones_df['origin'] = origin + + cols = [subject_col, 'origin'] + clone_cols + timepoints + final_clones_df = 
final_clones_df[[c for c in cols if c in final_clones_df.columns]] + all_significant_results_cont.append(final_clones_df) + +# --- Assemble Contracting Tables --- +contraction_counts_table = pd.DataFrame() +if contraction_counts_list: + contraction_counts_table = pd.concat(contraction_counts_list, ignore_index=True) + if not contraction_counts_table.empty: + contraction_counts_table = contraction_counts_table[[subject_col, 'origin', 't_pre', 't_post', 'contracted_count']] + +detailed_comparisons_table_cont = pd.DataFrame() +if comparison_details_cont: + detailed_comparisons_table_cont = pd.concat(comparison_details_cont, ignore_index=True) + +contracting_clones_summary_table = pd.DataFrame() +if all_significant_results_cont: + contracting_clones_summary_table = pd.concat(all_significant_results_cont, ignore_index=True).fillna(0) + contracting_clones_summary_table.rename(columns={t: f'freq_{t}' for t in contracting_clones_summary_table.columns if t not in merge_cols}, inplace=True) + +# --- Clean & Get Top Contracting Clones --- +table1_contracting_top = pd.DataFrame() +table2_contracting_top = pd.DataFrame() + +if not detailed_comparisons_table_cont.empty: + for col in ['p_value', 'q_value']: + detailed_comparisons_table_cont[col] = detailed_comparisons_table_cont[col].replace([np.inf, -np.inf], pd.NA) + + significant_details = detailed_comparisons_table_cont[ + (pd.to_numeric(detailed_comparisons_table_cont['q_value'], errors='coerce') < signif_thold) + ].copy() + + if not significant_details.empty: + min_q_value_rows = significant_details.loc[significant_details.groupby(merge_cols)['q_value'].idxmin()] + top_clones = min_q_value_rows.sort_values('q_value', ascending=True).head(top_clones_to_show) + top_clone_identifiers = top_clones[merge_cols] + + if not contracting_clones_summary_table.empty: + table1_contracting_top = pd.merge(top_clone_identifiers, contracting_clones_summary_table, on=merge_cols, how='left') + + table2_contracting_top = 
pd.merge(top_clone_identifiers, detailed_comparisons_table_cont, on=merge_cols, how='left') + table2_contracting_top = table2_contracting_top[table2_contracting_top['q_value'] < signif_thold] + +``` + + +### Longitudinal Summary + +The clonal trajectory heatmap tracks the longitudinal frequency of dynamic T-cell clones. It displays only TCR clonotypes identified as statistically significant in at least one pairwise comparison, filtering out stable background populations to highlight treatment-responsive clones. + +The heatmap displays only those TCR clonotypes that were identified as **"significant" in at least one pairwise comparison**. This means each row represents a clone that showed a statistically significant change between any two timepoints in the patient's timeline. This filtering focuses the visualization on the most biologically relevant changes, removing the background noise of stable or low-frequency clones. + + +```{python} +#| output: asis +#| echo: false +#| fig-width: 10 +#| fig-height: 8 + +import plotly.express as px +import plotly.graph_objects as go +import scipy.cluster.hierarchy as sch +from scipy.stats import zscore +from IPython.display import display, Markdown +import pandas as pd + +# --- 1. 
Prepare Master Data --- +# Combine Expanded + Contracted +dfs_to_combine = [] +if 'expanded_clones_summary_table' in locals() and not expanded_clones_summary_table.empty: + dfs_to_combine.append(expanded_clones_summary_table) +if 'contracting_clones_summary_table' in locals() and not contracting_clones_summary_table.empty: + dfs_to_combine.append(contracting_clones_summary_table) + +if not dfs_to_combine: + print("No data available for heatmap.") +else: + # Concatenate and handle duplicates + master_df = pd.concat(dfs_to_combine, ignore_index=True) + master_df = master_df.drop_duplicates(subset=clone_cols + [subject_col, 'origin']).fillna(0) + + # Timepoint Ordering Map + tp_rank_map = clonotypes_df[[subject_col, timepoint_col, 'timepoint_rank']].drop_duplicates() + + # Get Subjects + subjects = sorted(master_df[subject_col].unique()) + + display(Markdown("::: {.panel-tabset}")) + + for subject in subjects: + display(Markdown(f"## {subject}")) + + sub_df = master_df[master_df[subject_col] == subject].copy() + + # Identify unique origins (fallback to 'Unknown' if column is missing) + origins = sorted(sub_df['origin'].unique()) if 'origin' in sub_df.columns else ['Unknown'] + + master_fig = go.Figure() + valid_origins = [] + origin_layouts = [] + buttons = [] + traces_added = 0 + + # Store coloraxis from Plotly Express to maintain consistent styling + px_coloraxis = None + + for origin in origins: + # Filter data for this specific origin + if 'origin' in sub_df.columns: + orig_df = sub_df[sub_df['origin'] == origin].copy() + else: + orig_df = sub_df.copy() + + # Identify frequency columns + freq_cols = [c for c in orig_df.columns if c.startswith('freq_')] + if not freq_cols: continue + + # Rename columns to clean timepoints + rename_map = {c: c.replace('freq_', '') for c in freq_cols} + orig_df = orig_df.rename(columns=rename_map) + + # --- KEY REQUIREMENT: ONLY SAMPLES CORRESPONDING TO ORIGIN --- + # Keep only timepoints that have non-zero data for this specific 
origin + valid_tps = [tp for tp in rename_map.values() if orig_df[tp].sum() > 0] + if not valid_tps: continue + + # Sort Timepoints biologically based on the origin's valid timepoints + subj_ranks = tp_rank_map[ + (tp_rank_map[subject_col] == subject) & + (tp_rank_map[timepoint_col].isin(valid_tps)) + ].sort_values('timepoint_rank') + sorted_tps = subj_ranks[timepoint_col].tolist() + + # Fallback for timepoints not caught in map + sorted_tps += [t for t in valid_tps if t not in sorted_tps] + + # Set Index to CDR3b + matrix_df = orig_df.set_index('CDR3b')[sorted_tps] + + # Drop completely empty rows + matrix_df = matrix_df.loc[~(matrix_df == 0).all(axis=1)] + + if matrix_df.shape[0] < 1: continue + + # --- Calculate Z-Scores --- + if matrix_df.shape[1] > 1: + # fillna(0) handles cases where standard deviation is 0 (identical frequencies across time) + z_score_df = matrix_df.apply(zscore, axis=1, result_type='expand').fillna(0) + else: + z_score_df = matrix_df.copy() * 0 # If only 1 timepoint, z-score is baseline 0 + + z_score_df.columns = sorted_tps + + # --- Hierarchical Clustering (Manual Row Reordering) --- + if z_score_df.shape[0] >= 2 and z_score_df.shape[1] >= 2: + try: + d = sch.distance.pdist(z_score_df) + L = sch.linkage(d, method='ward') + dendro = sch.dendrogram(L, no_plot=True) + row_order = dendro['leaves'] + + z_score_sorted = z_score_df.iloc[row_order] + raw_values_sorted = matrix_df.iloc[row_order] + except Exception as e: + z_score_sorted = z_score_df + raw_values_sorted = matrix_df + else: + z_score_sorted = z_score_df + raw_values_sorted = matrix_df + + # --- Generate Temporary PX Heatmap to Extract Trace --- + temp_fig = px.imshow( + z_score_sorted, + labels=dict(x="Timepoint", y="CDR3b", color="Z-Score"), + x=sorted_tps, + y=z_score_sorted.index, + aspect="auto", + color_continuous_scale='RdBu_r', + color_continuous_midpoint=0 + ) + + # Save the px coloraxis layout for the master figure + if px_coloraxis is None: + px_coloraxis = 
temp_fig.layout.coloraxis + + # Extract Trace and Customize Hover + trace = temp_fig.data[0] + trace.customdata = raw_values_sorted.values + trace.hovertemplate = ( + "CDR3: %{y}
" + + "Time: %{x}
" + + "Z-Score: %{z:.2f}
" + + "Raw Freq: %{customdata:.2e}" + ) + + # Only set the first origin as visible by default + is_first = (len(valid_origins) == 0) + trace.visible = is_first + + master_fig.add_trace(trace) + + # Store the specific categorical layout needed for this origin + origin_layouts.append({ + 'x_categories': sorted_tps, + 'y_categories': z_score_sorted.index.tolist() + }) + valid_origins.append(origin) + traces_added += 1 + + if not valid_origins: + print(f"*(Not enough data to cluster for {subject})*") + continue + + # --- Build Dropdown Buttons --- + for i, origin in enumerate(valid_origins): + visibility = [False] * traces_added + visibility[i] = True + + # Create dynamic title + dynamic_title = f"{subject}: Clonal Dynamics (Z-Score)" + if len(valid_origins) > 1: + dynamic_title += f" - {origin}" + + button = dict( + label=str(origin), + method="update", + args=[ + {"visible": visibility}, # Toggle trace + { + "title.text": dynamic_title, + # Update axes dynamically so ONLY this origin's timepoints/CDR3s show + "xaxis.categoryarray": origin_layouts[i]['x_categories'], + "yaxis.categoryarray": origin_layouts[i]['y_categories'] + } + ] + ) + buttons.append(button) + + # --- Apply Master Layout --- + initial_title = f"{subject}: Clonal Dynamics (Z-Score)" + if len(valid_origins) > 1: + initial_title += f" - {valid_origins[0]}" + + master_fig.update_layout( + title=initial_title, + xaxis_title="", + yaxis_title="", + yaxis={'showticklabels': False}, + xaxis={'categoryorder': 'array', 'categoryarray': origin_layouts[0]['x_categories']}, + coloraxis=px_coloraxis, # Restore the RdBu_r continuous scale properties + height=600, + margin=dict(t=120) # Extra top margin to fit the dropdown menu + ) + + # Inject updatemenus only if there's more than 1 origin + if len(valid_origins) > 1: + master_fig.update_layout( + updatemenus=[dict( + active=0, + buttons=buttons, + x=0.0, xanchor="left", y=1.15, yanchor="top", + pad={"r": 10, "t": 10} + )] + ) + + master_fig.show() + print("\n") 
+
+    display(Markdown(":::"))
+
+```
+**Figure 1: Clonal Dynamics Heatmap.** Heatmap showing the standardized frequency (Z-score) of significantly dynamic TCR clones across timepoints. Only clones with at least one significant expansion or contraction event are included.
+
+This approach highlights distinct response kinetics, distinguishing between transient expansions (post-treatment spikes followed by contraction) and sustained clonal persistence.
+
+The following tables provide the complete history of every TCR clonotype that showed a statistically significant change ($q < 0.05$ and $Fold Change > \text{2.0}$ for expansion and $Fold Change < \text{0.5}$ for contraction) in at least one pairwise comparison during the patient's timeline.
+
+- **Unique Rows:** Each row represents a single, unique clonotype (defined by CDR3$\beta$, TRBV, and TRBJ).
+
+- **Full Timeline:** This table displays the clone's frequency across all sequenced timepoints.
+- **Interpretation:** This view allows you to categorize the clone's overall behavior. For example, you can distinguish between a sustained expansion (frequency rises and stays high) versus a transient expansion (frequency rises and then drops back to zero).
+  - Note: A frequency of 0.0 usually indicates the clone fell below the limit of detection for that sample, rather than true biological absence.
+
+
+```{python}
+
+# Expanding
+display(create_styled_table(
+    table1_top))
+
+
+```
+**Table 1: Top 15 expanded clones across timepoints.** Clones (each row is a different clone) with a significant expansion in at least 1 timepoint when compared to all of the previous ones (FC>2 and q-value<0.05).
+ +```{python} + +# Contracting +display(create_styled_table( + table1_contracting_top)) + + +``` +**Table 2: Top 15 contracted clones across timepoints.** Clones (each row is a different clone) with a significant contraction in at least 1 timepoint when compared to all of the previous ones (FC<0.5 and q-value<0.05). + +Note that "new" clones are captured in these tables as expansions and contractions. + + +### Trajectories of Top Expanded Clones + +To visualize the precise temporal dynamics of the immune response, we isolated the trajectories of the top 15 most significantly upregulated clones (ranked by lowest $q$-value). + +- **Visualization:** The red lines track the $log_{10}$ frequency of these top expanding clones over time. The faint grey lines in the background represent background TCRs of the remaining repertoire, providing a visual baseline for "typical" clonal behavior. + +- **Interpretation**: A steep upward slope indicates rapid proliferation. By **using a logarithmic scale**, we can clearly distinguish clones that expand by orders of magnitude (e.g., from $10^{-4}$ to $10^{-2}$) compared to those with subtler changes. These clones represent the "winners" of the T-cell competition during the treatment window, suggesting strong antigen-driven selection. + +```{python} +#| output: asis + +import plotly.express as px +import plotly.graph_objects as go +import numpy as np +import pandas as pd + +# --- 1. 
Data Processing: Split Expanded vs Contracted --- + +# Prepare stats table +stats_df = detailed_comparisons_table.copy() +stats_df['p_value'] = pd.to_numeric(stats_df['p_value'], errors='coerce') +stats_df['fold_change'] = pd.to_numeric(stats_df['fold_change'], errors='coerce') + +# Helper function to get Top 15 Unique CDR3s per patient & origin +def get_top_ids(df, direction): + id_map = {} + + # Safely group by subject and origin if it exists + group_cols = [subject_col, 'origin'] if 'origin' in df.columns else [subject_col] + + for name, group in df.groupby(group_cols): + # name is either a tuple (subject, origin) or just subject + if direction == 'up': + subset = group[group['fold_change'] > 1] + else: + subset = group[group['fold_change'] < 1] + + # Sort by P-value (lowest first) -> Take top 15 Unique CDR3s + top_clones = subset.sort_values('p_value', ascending=True)['CDR3b'].drop_duplicates().head(15).tolist() + id_map[name] = set(top_clones) + return id_map + +# Create the two maps +expanded_map = get_top_ids(stats_df, 'up') +contracted_map = get_top_ids(stats_df, 'down') + +# --- 2. Prepare Master Frequency Data --- +# Aggregate by CDR3 including 'origin' so we can split traces later +agg_cols = [subject_col, 'origin', 'CDR3b', timepoint_col] if 'origin' in clonotypes_df.columns else [subject_col, 'CDR3b', timepoint_col] +cdr3_df = clonotypes_df.groupby(agg_cols)['frequency'].sum().reset_index() + +# Log Transform (with pseudocount) +min_freq = cdr3_df[cdr3_df['frequency'] > 0]['frequency'].min() +cdr3_df['log_freq'] = np.log10(cdr3_df['frequency'].replace(0, min_freq / 2)) + +# Map Timepoints +if 'timepoint_order' in locals(): + cdr3_df['timepoint_rank'] = cdr3_df[timepoint_col].map(timepoint_order) +else: + ordered_timepoints = sorted(cdr3_df[timepoint_col].unique()) + timepoint_order_fallback = {t: i for i, t in enumerate(ordered_timepoints)} + cdr3_df['timepoint_rank'] = cdr3_df[timepoint_col].map(timepoint_order_fallback) + +# --- 3. 
The Plot Generator Function --- + +def generate_plot_quarto_tabs(id_map, title_prefix, line_color): + """ + Generates independent Plotly figures enclosed in Quarto Markdown tabs. + Includes an origin dropdown inside the tab if multiple origins exist. + """ + + # 1. Initialize the Quarto Tabset + print("::: {.panel-tabset}\n") + + # Define Status Helper + def get_status(row): + subj = row[subject_col] + orig = row['origin'] if 'origin' in row else None + seq = row['CDR3b'] + + # Check map using the correct key format + key = (subj, orig) if orig is not None else subj + if key in id_map and seq in id_map[key]: + return "Highlight" + return "Background" + + # Create temp dataframe + plot_data = cdr3_df.copy() + plot_data['status'] = plot_data.apply(get_status, axis=1) + + subjects = sorted(plot_data[subject_col].unique()) + color_map = {"Highlight": line_color, "Background": "lightgrey"} + + # Loop Patients + for subject in subjects: + + # 2. Print the tab header for this specific patient + print(f"### {subject}\n") + + subset = plot_data[plot_data[subject_col] == subject].copy() + + if subset.empty: + print("No data available for this patient.\n") + continue + + # Get unique origins + origins = sorted(subset['origin'].unique()) if 'origin' in subset.columns else ["Unknown"] + + master_fig = go.Figure() + origin_layouts = [] + buttons = [] + traces_added = 0 + + for origin in origins: + if 'origin' in subset.columns: + orig_df = subset[subset['origin'] == origin].copy() + else: + orig_df = subset.copy() + + if orig_df.empty: continue + + # Sort: Background (0) first, Highlight (1) last (Draw order) + orig_df['sort_order'] = orig_df['status'].map(lambda x: 0 if x == "Background" else 1) + orig_df = orig_df.sort_values(['sort_order', 'timepoint_rank']) + + # Determine X-axis categories dynamically for THIS origin + orig_tps = orig_df[[timepoint_col, 'timepoint_rank']].drop_duplicates().sort_values('timepoint_rank')[timepoint_col].tolist() + + # Generate temporary figure 
to extract traces + temp_fig = px.line( + orig_df, x=timepoint_col, y="log_freq", color="status", + line_group="CDR3b", hover_name="CDR3b", + color_discrete_map=color_map, + category_orders={timepoint_col: orig_tps} + ) + + # Style Traces + temp_fig.update_traces(mode='lines+markers') + temp_fig.for_each_trace(lambda t: t.update( + line=dict(width=1), opacity=0.1, marker=dict(size=2, opacity=0) + ) if t.name == "Background" else t.update( + line=dict(width=3), opacity=1.0, marker=dict(size=6) + )) + + n_traces = len(temp_fig.data) + is_first = (len(origin_layouts) == 0) + + # Inject traces into the master figure + for trace in temp_fig.data: + trace.visible = is_first + master_fig.add_trace(trace) + + # Save layout constraints for the dropdown + origin_layouts.append({ + 'origin': origin, + 'x_categories': orig_tps, + 'n_traces': n_traces + }) + traces_added += n_traces + + if not origin_layouts: + print("*(No plotable data)*\n") + continue + + # --- Build Dropdown Buttons --- + current_trace_idx = 0 + for layout_info in origin_layouts: + visibility = [False] * traces_added + n_tr = layout_info['n_traces'] + + # Set only this origin's traces to True + visibility[current_trace_idx : current_trace_idx + n_tr] = [True] * n_tr + current_trace_idx += n_tr + + orig_name = layout_info['origin'] + dyn_title = f"{title_prefix}: {subject}" + if len(origin_layouts) > 1: + dyn_title += f" ({orig_name})" + + button = dict( + label=str(orig_name), + method="update", + args=[ + {"visible": visibility}, + { + "title.text": dyn_title, + "xaxis.categoryarray": layout_info['x_categories'] # Update X-axis to match the origin + } + ] + ) + buttons.append(button) + + # --- Apply Master Layout --- + first_layout = origin_layouts[0] + initial_title = f"{title_prefix}: {subject}" + if len(origin_layouts) > 1: + initial_title += f" ({first_layout['origin']})" + + master_fig.update_layout( + title=initial_title, + yaxis_title="Log10 Frequency", + xaxis_title="Timepoint", + 
xaxis={'categoryorder': 'array', 'categoryarray': first_layout['x_categories']}, + hovermode="closest", + template="plotly_white", + width=900, height=550 + ) + + # Inject dropdown only if multiple origins exist + if len(origin_layouts) > 1: + master_fig.update_layout( + margin=dict(t=120), + updatemenus=[dict( + active=0, + buttons=buttons, + x=0.0, xanchor="left", y=1.15, yanchor="top", + pad={"r": 10, "t": 10} + )] + ) + + # 3. Render the figure directly into the tab + master_fig.show() + print("\n") + + # 4. Close the Quarto Tabset + print(":::\n") + +``` + +```{python} +#| label: results +#| output: asis + +generate_plot_quarto_tabs(expanded_map, "Top 15 Upregulated", "firebrick") + +``` +**Figure 2: Longitudinal tracking of top expanded TCR clonotypes.** The temporal dynamics of the 15 most significantly upregulated clonotypes, ranked by lowest FDR-adjusted p-value ($q$-value). Red lines trace the $log_{10}$ frequency of individual clones that exhibited significant expansion ($FC > 2, q < 0.05$) across the sampled timepoints. + +### Trajectory of Top Contracted Clones + +We also isolated the top 15 most significantly contracting clonotypes (lowest $q$-value). + +- **Visualization:** The blue lines highlight the trajectories of clones that drastically reduced their frequency or became undetectable (frequency dropping towards the plot floor) at later timepoints. + +- **Interpretation:** While expansion often signals an active response, contraction is equally informative. These trajectories may represent T-cell populations that are being physically displaced by the expanding clones (competitive exclusion), becoming exhausted and dying off, or disappearing following the clearance of their specific antigen. Tracking these losses prevents us from looking only at "positive" selection bias. 
+ +```{python} +#| output: asis + +generate_plot_quarto_tabs(contracted_map, "Top 15 Downregulated", "royalblue") + +``` +**Figure 3: Longitudinal tracking of top contracted TCR clonotypes.** The temporal dynamics of the 15 most significantly downregulated clonotypes, ranked by lowest FDR-adjusted p-value ($q$-value). Blue lines trace the $log_{10}$ frequency of individual clones that exhibited significant contraction ($FC < 0.5, q < 0.05$) across the sampled timepoints. + + +### Clonal Expansion/Contraction Pairwise Comparisons + +For each patient, we performed pairwise statistical comparisons between all timepoints for every TCR clonotype detected in at least one timepoint. Based on these tests, we identify expansion and contraction events between any pair of timepoints. + + +The following table provides the complete history of every TCR comparison with a statistically significant change ($q < 0.05$ and $Fold Change > \text{2.0}$) in at least one pairwise comparison during the patient's timeline. + +- Multiple Entries: + - A single clone may appear in multiple rows if it changed significantly between multiple different timepoints (e.g., expanding from T1 $\to$ T2, but contracting from T2 $\to$ T3). + +- Statistical Metrics: + - **Fold Change (FC):** The magnitude of the change. + - $p$-value: The raw probability from Fisher's Exact Test. + - $q$-value: The FDR-adjusted p-value. This is the metric used to filter for significance. + +- Usage: Consult this table to verify the statistical strength of a specific claim. + + +```{python} +#| label: table-expanding + +# Expanding +display(create_styled_table( + table2_top +)) + +``` + +**Table 3: Top 15 expansion events across timepoints.** Clone comparisons where there was a significant expansion (FC>2 and q-value<0.05). The same clone can be present in multiple rows.
+ +```{python} +#| output: asis + +# Contracting +display(create_styled_table( + table2_contracting_top +)) + +``` + +**Table 4: Top 15 contraction events across timepoints.** Clone comparisons where there was a significant contraction (FC<0.5 and q-value<0.05). The same clone can be present in multiple rows. + + +**Interpret the Results** + +It is critical not to interpret the Fold Change in isolation. A clone's biological importance depends on the interplay between three factors: + +- **Fold Change (Magnitude)**: How large was the change? +- **Statistical Significance (Confidence)**: How likely is it that the change was real? (q-value) +- **Clone Counts (Abundance)**: What is the clone's overall presence in the sample? + +For example, a massive Fold Change resulting from a change of 1 count to 10 might not be statistically significant and could be noise. Conversely, a highly significant q-value for a clone that changes from 2 counts to 8 might be statistically real but biologically unimportant due to its low abundance.
+ +**Therefore, the most robust conclusions come from paying attention to clones that demonstrate both a high abs(Fold Change) and a low q-value, contextualized by their raw clone counts.** diff --git a/notebooks/template_pheno_bulk.qmd b/notebooks/template_pheno_bulk.qmd new file mode 100644 index 0000000..97445c5 --- /dev/null +++ b/notebooks/template_pheno_bulk.qmd @@ -0,0 +1,508 @@ +--- +title: "TCRtoolkit Phenotype (Bulk)" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +jupyter: python3 +--- + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name = '' +project_dir = '' +sample_table = '' + +``` + + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}" + +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_csv = f"{project_dir}/annotate/concatenated_cdr3.tsv" + +v_family_csv= project_dir + '/sample/v_family.csv' +j_family_csv= project_dir + '/sample/j_family.csv' + +# Define dirs +tcrdist_dir = project_dir + '/tcrdist3/' +olga_dir = project_dir + '/olga/' +tcrpheno_dir = f"{project_dir}/tcrpheno/" +pseudobulk_dir = f"{project_dir}/pseudobulk/" +VDJdb_dir = project_dir+ '/vdjdb/' +convergence_dir = project_dir+ '/convergence/' + +``` + +```{python} +#| code-fold: true + +# 1. 
Load Packages +import datetime +import glob +import itertools +import math +import os +import sys +import h5py +import igraph as ig +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +# import networkx as nx +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import scipy.cluster.hierarchy as sch +import seaborn as sns +from IPython.display import Image +from matplotlib.colors import LinearSegmentedColormap +from scipy.sparse import csr_matrix +from scipy.stats import gaussian_kde +import plotly.io as pio +from scipy import stats +from itertools import combinations +from scipy.stats import mannwhitneyu +from plotly.subplots import make_subplots + +import warnings +warnings.filterwarnings( + "ignore", + category=FutureWarning, + module="plotly" +) + +# 2. Print pipeline parameters + +# print('Project Name: ' + project_name) +# print('Workflow command: ' + workflow_cmd) +# print('Date and time: ' + str(datetime.datetime.now())) + +# 3. 
Loading data +## Reading sample metadata +meta = pd.read_csv(sample_table, sep=',') +meta.drop(columns=['file'], inplace=True) +meta_cols = meta.columns.tolist() + +## Reading combined repertoire statistics +df = pd.read_csv(sample_stats_csv, sep=',') +df = pd.merge(df, meta, on='sample', how='left') +df = df[meta_cols + [c for c in df.columns if c not in meta_cols]] + +## Reading V gene family usage +v_family = pd.read_csv(v_family_csv, sep=',') +v_family = pd.merge(v_family, meta, on='sample', how='left') +v_family = v_family[meta_cols + [c for c in v_family.columns if c not in meta_cols]] +v_family = v_family.sort_values(by=[subject_col]) + +## Reading J gene family usage +j_family = pd.read_csv(j_family_csv, sep=',') +j_family = pd.merge(j_family, meta, on='sample', how='left') +j_family = j_family[meta_cols + [c for c in j_family.columns if c not in meta_cols]] +j_family = j_family.sort_values(by=[subject_col]) + +## Reading concatenated cdr3 file +concat_df = pd.read_csv(concat_csv, sep='\t') +concat_df = concat_df.merge(meta[['sample', subject_col, 'alias', 'timepoint', 'timepoint_order']], on='sample', how='left') + + +``` + + + +## From TCR sequence to Phenotype: TCRpheno {#sec-TCRPheno unless="show_sc"} + +::: {.callout-warning title="Bulk data ONLY"} +If you are dealing with **single-cell TCR data, please disregard this section.** +::: + +[TCRpheno](https://www.cell.com/cell-reports/fulltext/S2211-1247(24)01449-9#fig1) is a machine learning model that predicts the likely functional phenotype of a T-cell based solely on its TCR sequence. This is powerful because it allows you to infer the roles of different T-cells (e.g., killer, regulator, memory) from your bulk sequencing data without needing corresponding cell surface marker information.
+ +**How to Interpret the Scores?** 🔍 +**The output from TCRpheno is a set of scores, not a definitive classification.** For each TCR, a higher score in a category indicates a higher probability that the T-cell belongs to that functional group. + +- **TCRbeta.CD8:** A high score here suggests the TCR likely belongs to a cytotoxic CD8+ T-cell. These are the **"killers" of the adaptive immune system**, responsible for destroying virus-infected cells and tumor cells. A repertoire with many high-scoring CD8 TCRs indicates a strong, active anti-pathogen or anti-tumor response. + +- **TCRbeta.reg:** This score corresponds to regulatory T-cells (Tregs). These are the **"peacekeepers" that suppress other immune cells** to prevent autoimmune reactions and maintain tolerance. A high Treg score for a TCR suggests it plays a role in immune suppression. + +- **TCRbeta.mem:** This indicates a likely memory T-cell phenotype. These are the **long-lived "veterans" of the immune system that persist after an infection is cleared**, providing rapid protection upon re-exposure to the same pathogen. + +- **TCRbeta.innate:** A high score in this category suggests the TCR may belong to an innate-like T-cell, such as a MAIT or NKT cell. These cells bridge the gap between the innate and adaptive immune systems, **acting as rapid first responders to certain types of threats.** + +It's important to remember that a single TCR can have moderate scores in multiple categories, reflecting the **potential for cellular plasticity** or shared sequence features between different T-cell types. **The phenotype with the highest score is considered the most likely identity for that TCR.** + +```{python} +#| output: asis +#| echo: false + +import os +import glob +import pandas as pd +import numpy as np +import plotly.graph_objects as go + +def create_phenotype_tabs(meta_df): + """ + 1. Reads TCRpheno files. + 2. Merges with metadata. + 3. Creates Quarto tabs containing stacked bar charts per patient.
+ """ + + # --- 1. Find and Load Data --- + # Ensure tcrpheno_dir is defined in your environment + file_pattern = os.path.join(tcrpheno_dir, '*_tcrpheno.tsv') + pheno_files = glob.glob(file_pattern) + + if not pheno_files: + print(f"Error: No files found matching {file_pattern}") + return + + all_results = [] + phenotype_cols = ["TCRbeta.innate", "TCRbeta.CD8", "TCRbeta.reg", "TCRbeta.mem"] + clean_names = [p.replace('TCRbeta.', '') for p in phenotype_cols] + + for file_path in pheno_files: + try: + filename = os.path.basename(file_path) + sample_id = filename.replace('_tcrpheno.tsv', '') + + df = pd.read_csv(file_path, sep='\t') + + if not all(col in df.columns for col in phenotype_cols): + continue + + df_cleaned = df.dropna(subset=phenotype_cols).copy() + if df_cleaned.empty: continue + + df_cleaned['dominant_phenotype'] = df_cleaned[phenotype_cols].idxmax(axis=1) + + composition = df_cleaned['dominant_phenotype'].value_counts(normalize=True).reset_index() + composition.columns = ['phenotype', 'percentage'] + composition['percentage'] *= 100 + composition['sample'] = sample_id + + # --- Merge Metadata --- + meta_row = meta_df[meta_df['sample'] == sample_id] + if not meta_row.empty: + composition['subject_id'] = meta_row.iloc[0]['subject_id'] + composition['alias'] = meta_row.iloc[0]['alias'] + composition['sort_order'] = meta_row.iloc[0]['timepoint_order'] if 'timepoint_order' in meta_df.columns else 0 + else: + composition['subject_id'] = np.nan + composition['alias'] = sample_id + composition['sort_order'] = 0 + + all_results.append(composition) + + except Exception as e: + print(f"Error processing {sample_id}: {e}") + + if not all_results: + print("No valid data found.") + return + + master_df = pd.concat(all_results, ignore_index=True) + master_df = master_df.dropna(subset=['subject_id']) + + # --- 2. 
Setup Plotting & Quarto Tabs --- + color_map = { + "TCRbeta.innate": "rgb(99, 110, 250)", + "TCRbeta.CD8": "rgb(239, 85, 59)", + "TCRbeta.reg": "rgb(0, 204, 150)", + "TCRbeta.mem": "rgb(171, 99, 250)" + } + + patients = sorted(master_df['subject_id'].unique()) + + # Open Quarto Tabset + print("::: {.panel-tabset}\n") + + # --- 3. Create a Standalone Chart for Each Patient --- + for patient in patients: + print(f"## {patient}\n") + + fig = go.Figure() + + pt_df = master_df[master_df['subject_id'] == patient].copy() + pt_df.sort_values(by=['sort_order', 'alias'], inplace=True) + aliases = pt_df['alias'].unique() + + for i, pheno in enumerate(phenotype_cols): + # Pivot ensures we have a value for every alias + pivot_df = pt_df.pivot(index='alias', columns='phenotype', values='percentage').fillna(0) + pivot_df = pivot_df.reindex(aliases) + + # Safeguard: If a patient has 0% of a phenotype across all timepoints, the column won't exist. + if pheno not in pivot_df.columns: + pivot_df[pheno] = 0 + + fig.add_trace(go.Bar( + x=pivot_df.index, + y=pivot_df[pheno], + name=clean_names[i], + marker_color=color_map.get(pheno) + )) + + fig.update_layout( + barmode='stack', + title_text=f'Phenotype Composition: {patient}', + xaxis_title='Timepoint', + yaxis_title='Percentage (%)', + legend_title='Phenotype', + template="plotly_white", + margin=dict(t=40, b=20, l=10, r=10) + ) + + fig.show() + print("\n\n") + + # Close Quarto Tabset + print(":::\n") + +create_phenotype_tabs(meta) + +``` +**Figure 9: Composition of Unique TCR Clonotypes by Predicted Phenotype** Stacked bar chart showing the proportional diversity of predicted T-cell phenotypes (memory, regulatory, CD8, or innate-like) for each sample, independent of clonal expansion. Each bar indicates the percentage of clones assigned to each category based on their highest TCRpheno score. 
+ +By analyzing the proportions of TCR phenotypes (such as innate, CD8+, regulatory, and memory T-cells) across different timepoints or samples, we can understand the quality and strategy of the immune response. + +Here is why tracking these phenotypic shifts is critical: + +- **Contextualizing Clonal Dynamics:** A highly clonal sample might initially look like a strong, active immune response. However, if phenotypic analysis reveals that the expansion is entirely driven by regulatory T-cells (Tregs), the immune system is likely being suppressed—a crucial distinction when analyzing tumor microenvironments or immune evasion. + +- **Tracking Treatment Response and Evolution:** In longitudinal data, tracking the ratio of cytotoxic cells (CD8+) to regulatory cells over time can serve as a biomarker for disease progression or treatment efficacy. You are looking for shifts in momentum, such as a sudden spike in CD8+ activity following an intervention. + +- **Identifying Memory Formation:** Observing a transition from an active, innate, or CD8-dominated phenotype toward a memory-dominated phenotype in later timepoints helps confirm that the immune system has successfully cataloged the antigen for long-term protection. + +::: {.callout-important title="Important"} +On overinterpreting TCRPheno scores: +"...our TCR scoring functions are consistent across individuals and antigens, but **they are insufficient to accurately classify T cell states.** After all, **the TCR sequence is only one minor influence on the transcriptional state of a given T cell.**" +- Lagattuta, Kaitlyn A. et al. Cell Reports +::: + +This plot answers the question: *"Of all the different types of T-cell soldiers available, what is the breakdown of their specialties?"* + +It reflects the underlying potential of the repertoire. Comparing "Base" vs. "Post" samples can reveal if a treatment induced the emergence of many new and different types of T-cells with a certain phenotype. 
+ + +**The Importance of Clonally-Weighted Phenotyping** +An unweighted analysis treats every unique TCR sequence as equals—a clone represented by a single read carries the exact same visual weight as a massively expanded clone with 10,000 reads. While unweighted data is great for understanding the breadth of the immune repertoire, it can completely mask what the immune system is actively doing at scale. + +By weighting the phenotypes by clonal size (TCR counts), we reveal the true effector capacity or suppressive burden circulating in the patient. Here is why this distinction is critical: + +- **Revealing Hidden Expansions:** For instance, the proportion of unique clones assigned to the regulatory (reg) phenotype can stay relatively stable across all timepoints. However, if the clonally-weighted plot shows the reg compartment exploding later on, this tells us that while the number of unique regulatory clones didn't change much, a select few of those regulatory clones underwent massive physical expansion. + +- **Capturing True Biological Impact:** In a cancer context, a tumor microenvironment isn't suppressed by the diversity of Tregs; it is suppressed by the sheer volume of them. Weighting by clonal size gives you a much more accurate picture of the suppressive pressure the effector T-cells are actually facing. + +- **Distinguishing Broad vs. Focused Responses:** If an innate or CD8+ phenotype spikes in the unweighted plot but remains flat in the weighted plot, it indicates a broad, low-level mobilization of many different T-cells. If it spikes in the weighted plot, it confirms a highly focused, antigen-driven clonal expansion. + +```{python} +#| output: asis +#| echo: false + +import pandas as pd +import plotly.graph_objects as go +import glob +import os + +def create_weighted_phenotype_plot(meta_df): + """ + Merges TCR counts with phenotype predictions to create a stacked bar chart + where phenotype composition is weighted by clonal expansion.
+ - Generates one Quarto tab per patient. + - Uses 'alias' for the X-axis. + - Sorts chronologically using 'timepoint_order'. + """ + # --- Define file paths --- + # Assuming concat_csv and tcrpheno_dir are defined globally + counts_file = concat_csv + + # --- Load the primary TCR counts data --- + try: + counts_df = pd.read_csv(counts_file, sep='\t') + except FileNotFoundError: + print(f"Error: Counts file not found at {counts_file}") + return + + # --- Find all phenotype files --- + file_pattern = os.path.join(tcrpheno_dir, '*_tcrpheno.tsv') + pheno_files = sorted(glob.glob(file_pattern)) + + if not pheno_files: + print(f"Error: No files found matching the pattern: {file_pattern}") + return + + # --- Process each sample --- + all_results = [] + phenotype_cols = ["TCRbeta.innate", "TCRbeta.CD8", "TCRbeta.reg", "TCRbeta.mem"] + + # Cleaner names for legend + legend_names = {k: k.replace('TCRbeta.', '') for k in phenotype_cols} + + color_map = { + "TCRbeta.innate": "rgb(99, 110, 250)", # Periwinkle + "TCRbeta.CD8": "rgb(239, 85, 59)", # Red + "TCRbeta.reg": "rgb(0, 204, 150)", # Teal + "TCRbeta.mem": "rgb(171, 99, 250)" # Purple + } + + for pheno_file_path in pheno_files: + try: + # Extract sample name + sample_name = os.path.basename(pheno_file_path).replace('_tcrpheno.tsv', '') + + # --- Metadata Lookup --- + meta_row = meta_df[meta_df['sample'] == sample_name] + if meta_row.empty: + # Skip samples not in metadata, or define defaults + continue + + subject_id = meta_row.iloc[0]['subject_id'] + alias = meta_row.iloc[0]['alias'] + # Default to 0 if order column is missing + tp_order = meta_row.iloc[0]['timepoint_order'] if 'timepoint_order' in meta_df.columns else 0 + + # --- Load Data --- + pheno_df = pd.read_csv(pheno_file_path, sep='\t') + + if 'sample' not in counts_df.columns: + print(f"Error: 'sample' column missing in {counts_file}") + return + + sample_counts_df = counts_df[counts_df['sample'] == sample_name].copy() + + if sample_counts_df.empty: + continue 
+ + # --- Merge & Weight --- + # Match phenotype (junction_aa) with counts (CDR3b) + merged_df = pd.merge(pheno_df, sample_counts_df[['CDR3b', 'counts']], + left_on='junction_aa', right_on='CDR3b', how='inner') + + merged_df = merged_df.dropna(subset=phenotype_cols + ['counts']) + + if merged_df.empty: + continue + + # Determine dominant phenotype + merged_df['dominant_phenotype'] = merged_df[phenotype_cols].idxmax(axis=1) + + # Sum counts per phenotype (Weighted Composition) + composition = merged_df.groupby('dominant_phenotype')['counts'].sum().reset_index() + + # Calculate percentages + total_sample_counts = composition['counts'].sum() + composition['percentage'] = (composition['counts'] / total_sample_counts) * 100 + + # Add identifiers + composition.rename(columns={'dominant_phenotype': 'phenotype'}, inplace=True) + composition['sample'] = sample_name + composition['subject_id'] = subject_id + composition['alias'] = alias + composition['timepoint_order'] = tp_order + + all_results.append(composition) + + except Exception as e: + print(f"Error processing {sample_name}: {e}") + + if not all_results: + print("No valid data could be processed.") + return + + # --- Combine Results --- + results_df = pd.concat(all_results, ignore_index=True) + + # --- Generate Tabs per Patient --- + unique_patients = sorted(results_df['subject_id'].unique()) + + print("::: {.panel-tabset}\n") + + for pat in unique_patients: + print(f"## {pat}\n") + + # Filter and Sort by Timepoint Order + pat_data = results_df[results_df['subject_id'] == pat].copy() + pat_data.sort_values(by=['timepoint_order', 'alias'], inplace=True) + + # Get unique aliases in correct order for X-axis category array + # This ensures the plot respects the sort order even if data is missing for one bar + sorted_aliases = pat_data['alias'].unique() + + fig = go.Figure() + + for phenotype in phenotype_cols: + # We filter for the phenotype + pheno_data = pat_data[pat_data['phenotype'] == phenotype] + + # We reindex or 
merge to ensure every alias on the X-axis has a value (even if 0) + # Use a pivot to align data to the sorted aliases + # Columns=Phenotype, Index=Alias + pivot_data = pat_data.pivot_table(index='alias', columns='phenotype', values='percentage', fill_value=0) + + # Reindex to ensure we have all timepoints in correct order + pivot_data = pivot_data.reindex(sorted_aliases, fill_value=0) + + # Check if phenotype exists in this patient's data + y_values = pivot_data[phenotype] if phenotype in pivot_data.columns else [0]*len(sorted_aliases) + + fig.add_trace(go.Bar( + x=sorted_aliases, # Use the sorted list + y=y_values, + name=legend_names[phenotype], + marker_color=color_map.get(phenotype, 'grey') + )) + + fig.update_layout( + barmode='stack', + title_text=f'Weighted Phenotype Composition: {pat}', + xaxis_title='Sample (Alias)', + yaxis_title='Weighted Composition (%)', + yaxis=dict(range=[0, 105]), + legend_title='Phenotype', + width=800, + height=500, + template="simple_white", + # Explicitly enforce the sort order on the X-axis + xaxis={'categoryorder': 'array', 'categoryarray': sorted_aliases} + ) + + fig.show() + print("\n") + + print(":::\n") + +# Run the function with your metadata dataframe +create_weighted_phenotype_plot(meta) + +``` + +**Figure 10: Clonally-Weighted TCR Phenotype Composition** Stacked bar chart showing the proportional abundance of predicted T-cell phenotypes for each sample, weighted by clonal size. The size of each segment is determined by summing the read counts of all TCRs assigned to that functional category, thus directly reflecting clonal expansion. + +::: {.callout-important title="Important"} +On overinterpreting TCRPheno scores: +"...our TCR scoring functions are consistent across individuals and antigens, but **they are insufficient to accurately classify T cell states.** After all, **the TCR sequence is only one minor influence on the transcriptional state of a given T cell.**" +- Lagattuta, Kaitlyn A. et al. 
Cell Reports +::: diff --git a/notebooks/template_pheno_sc.qmd b/notebooks/template_pheno_sc.qmd new file mode 100644 index 0000000..446ec34 --- /dev/null +++ b/notebooks/template_pheno_sc.qmd @@ -0,0 +1,652 @@ +--- +title: "TCRtoolkit Phenotype (SC)" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +jupyter: python3 +--- + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name = '' +project_dir = '' +sample_table = '' + +``` + + + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}" + +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_csv = f"{project_dir}/annotate/concatenated_cdr3.tsv" + +v_family_csv= f"{project_dir}/sample/v_family.csv" +j_family_csv= f"{project_dir}/sample/j_family.csv" +morisita_mat_pheno= f"{project_dir}/compare_phenotype/morisita_mat.csv" +concat_csv_pheno=f"{project_dir}/compare_phenotype/concatenated_cdr3.tsv" +sample_table_pheno=f"{project_dir}/pipeline_info/samplesheet_phenotype.csv" + +# Define dirs +tcrdist_dir = f"{project_dir}/tcrdist3/" +olga_dir = f"{project_dir}/olga/" +tcrpheno_dir = f"{project_dir}/tcrpheno/" +pseudobulk_dir = f"{project_dir}/pseudobulk/" +VDJdb_dir = f"{project_dir}vdjdb/" +convergence_dir = f"{project_dir}convergence/" +``` + + + +```{python} +#| code-fold: true + +# 1. 
Load Packages +import datetime +import glob +import itertools +import math +import os +import sys +import h5py +import igraph as ig +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +# import networkx as nx +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import scipy.cluster.hierarchy as sch +import seaborn as sns +from IPython.display import Image +from matplotlib.colors import LinearSegmentedColormap +from scipy.sparse import csr_matrix +from scipy.stats import gaussian_kde +import plotly.io as pio +from scipy import stats +from itertools import combinations +from scipy.stats import mannwhitneyu +from plotly.subplots import make_subplots +from pathlib import Path +import re + +import warnings +warnings.filterwarnings( + "ignore", + category=FutureWarning, + module="plotly" +) + +# 2. Print pipeline parameters + +# print('Project Name: ' + project_name) +# print('Workflow command: ' + workflow_cmd) +# print('Date and time: ' + str(datetime.datetime.now())) + +# 3. 
Loading data +## Reading sample metadata +meta = pd.read_csv(sample_table, sep=',') +meta.drop(columns=['file'], inplace=True) +meta_cols = meta.columns.tolist() + +## Reading combined repertoire statistics +df = pd.read_csv(sample_stats_csv, sep=',') +df = pd.merge(df, meta, on='sample', how='left') +df = df[meta_cols + [c for c in df.columns if c not in meta_cols]] + +## Reading V gene family usage +v_family = pd.read_csv(v_family_csv, sep=',') +v_family = pd.merge(v_family, meta, on='sample', how='left') +v_family = v_family[meta_cols + [c for c in v_family.columns if c not in meta_cols]] +v_family = v_family.sort_values(by=[subject_col]) + +## Reading J gene family usage +j_family = pd.read_csv(j_family_csv, sep=',') +j_family = pd.merge(j_family, meta, on='sample', how='left') +j_family = j_family[meta_cols + [c for c in j_family.columns if c not in meta_cols]] +j_family = j_family.sort_values(by=[subject_col]) + +## Reading concatenated cdr3 file +concat_df = pd.read_csv(concat_csv, sep='\t') +concat_df = concat_df.merge(meta[['sample', subject_col, alias_col, timepoint_col, timepoint_order_col]], on='sample', how='left') + +# Phenotype files + +# Samplesheet +metadata_pheno_df = pd.read_csv(sample_table_pheno, sep=',', header=0, index_col="file") +metadata_pheno_df.index = metadata_pheno_df.index.map(lambda p: Path(p).name) +metadata_pheno_df.index = metadata_pheno_df.index.str.replace('_pseudobulk', '', regex=False) +metadata_pheno_df.index = metadata_pheno_df.index.str.replace('_phenotype.tsv', '', regex=False) +# Morisita +morisita_pheno_df = pd.read_csv(morisita_mat_pheno, sep=',', header=0, index_col=0) +morisita_pheno_df.index = morisita_pheno_df.index.str.replace('_airr', '', regex=False) +morisita_pheno_df.index = morisita_pheno_df.index.str.replace('_pseudobulk', '', regex=False) +morisita_pheno_df.columns = morisita_pheno_df.columns.str.replace('_airr', '', regex=False) +morisita_pheno_df.columns = 
morisita_pheno_df.columns.str.replace('_pseudobulk', '', regex=False) + +# Importing sample metadata (pheno-ps) +clonotypes_pheno_df = pd.read_csv(concat_csv_pheno, sep='\t', header=0, index_col=0).reset_index() +clonotypes_pheno_df['sample_phenotype'] = clonotypes_pheno_df['sample'] +# Add 'sample_phenotype' and 'phenotype' +samples = sorted(meta['sample'].astype(str).unique(), key=len, reverse=True) +pattern = r'^(?P<sample>' + '|'.join(map(re.escape, samples)) + r')(?:_(?P<phenotype>.*))?$' +extracted = clonotypes_pheno_df['sample_phenotype'].astype(str).str.extract(pattern) +clonotypes_pheno_df['sample'] = extracted['sample'] +clonotypes_pheno_df['phenotype'] = extracted['phenotype'] + +``` + + +## Characterizing cellular composition at single-cell resolution {#sec-sc-phenotypes if="show_sc"} + +::: {.callout-warning title="Single-cell data ONLY"} +If you are dealing with **bulk TCR data, please skip this section**. +For this section to be executed correctly, you must have provided cells and their associated phenotypes. +::: + +Single-cell technologies (e.g. 10X) allow TCR sequence capture together with RNA expression characterization, which enables cell phenotype assignment. + +TCRs coming from the same phenotype are pseudobulked and here we analyze them. + + +**The Diversity of the Repertoire (Unweighted Phenotypes)** +In the analysis of paired single-cell RNA (scRNA-seq) and T-cell receptor (TCR-seq) data, evaluating the unweighted phenotypic composition provides an initial assessment of repertoire diversity. In this representation, each unique TCR clonotype is counted uniformly, irrespective of its cellular frequency. This visualization delineates the baseline functional landscape of the microenvironment, illustrating the breadth of available transcriptional states—such as memory, regulatory, or CD8+ cytotoxic phenotypes.
However, while this approach effectively captures the diverse potential of the immune response, it fundamentally equates rare, quiescent clones with those that have undergone substantial clonal expansion. To accurately determine which functional phenotypes are driving the active immune response, it is necessary to integrate metrics of clonal abundance. + +```{python} +#| output: asis +#| echo: false + +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import os +import glob + +def create_phenotype_stacked_bar(meta_df): + + # --- 1. Load Data --- + file_pattern = os.path.join(pseudobulk_dir, "*_pseudobulk_phenotype.tsv") + file_paths = glob.glob(file_pattern) + + all_data = [] + + for file_path in file_paths: + try: + df = pd.read_csv(file_path, sep='\t') + basename = os.path.basename(file_path) + sample_raw = basename.replace('_pseudobulk_phenotype.tsv', '') + df['sample_raw'] = sample_raw + all_data.append(df) + except Exception as e: + continue + + if not all_data: + print("No files found matching the pattern.") + return + + combined_df = pd.concat(all_data, ignore_index=True) + + # --- 2. Clean Sample Names --- + def clean_sample_name(row): + s = str(row['sample_raw']) + p = str(row['phenotype']) + suffix = f"_{p}" + if s.endswith(suffix): + return s[:-len(suffix)] + return s + + combined_df['sample'] = combined_df.apply(clean_sample_name, axis=1) + + # --- 3. Merge with Metadata --- + cols_to_use = ['sample', subject_col, timepoint_col, timepoint_order_col] + + if subject_col not in meta_df.columns: + print(f"Error: {subject_col} not found in metadata.") + return + + plot_df = combined_df.merge(meta_df[cols_to_use], on='sample', how='inner') + + # --- 4. 
ESTABLISH GLOBAL PHENOTYPE COLORS --- + # Extract unique phenotypes in order of appearance to map colors stably + all_unique_phenotypes = plot_df['phenotype'].dropna().unique().tolist() + palette = px.colors.qualitative.Plotly + px.colors.qualitative.Set1 + global_pheno_colors = {pheno: palette[i % len(palette)] for i, pheno in enumerate(all_unique_phenotypes)} + + # --- 5. Generate Tabs --- + print("::: {.panel-tabset}\n") + + unique_patients = sorted(plot_df[subject_col].unique()) + + for pat in unique_patients: + print(f"## {pat}\n") + + pat_df = plot_df[plot_df[subject_col] == pat].copy() + + # Group by Timepoint and Phenotype + counts = pat_df.groupby([timepoint_col, 'phenotype', timepoint_order_col]).size().reset_index(name='count') + + # Calculate Totals per Timepoint + total_per_tp = counts.groupby(timepoint_col)['count'].transform('sum') + counts['percentage'] = (counts['count'] / total_per_tp) * 100 + + # Sort by Timepoint Order + counts = counts.sort_values(timepoint_order_col) + + # Create a separate DF for the "Total" labels + totals_df = counts[[timepoint_col, timepoint_order_col]].drop_duplicates().sort_values(timepoint_order_col) + actual_sums = pat_df.groupby(timepoint_col).size() + totals_df['total_count'] = totals_df[timepoint_col].map(actual_sums) + + # --- Plotting --- + + fig = px.bar( + counts, + x=timepoint_col, + y='percentage', + color='phenotype', + color_discrete_map=global_pheno_colors, # Forces consistency + title=f"Phenotype Distribution (Full Repertoire): {pat}", + labels={'percentage': 'Percentage Clonotypes (%)', timepoint_col: timepoint_col, 'phenotype': 'Phenotype'}, + template="simple_white", + category_orders={ + timepoint_col: totals_df[timepoint_col].tolist(), + 'phenotype': all_unique_phenotypes # Matches legend order perfectly + }, + hover_data={'count': True, timepoint_col: False, 'percentage': ':.1f'} + ) + + fig.update_traces( + hovertemplate="%{x}
Phenotype: %{data.name}
Percentage: %{y:.1f}%
Count: %{customdata[0]}" + ) + + # Add Total Count Annotations + fig.add_trace( + go.Scatter( + x=totals_df[timepoint_col], + y=[102] * len(totals_df), + text=totals_df['total_count'], + mode='text', + texttemplate='N=%{text}', + textposition='bottom center', + showlegend=False, + hoverinfo='none' + ) + ) + + fig.update_layout( + barmode='stack', + xaxis_title="Timepoint", + yaxis_title="Percentage Clonotypes (%)", + yaxis=dict(range=[0, 110]), + width=800, + height=600, + margin=dict(t=60), + showlegend=True + ) + + fig.update_xaxes(categoryorder='array', categoryarray=totals_df[timepoint_col].tolist()) + + fig.show() + print("\n") + + print(":::\n") + +# --- Run --- +create_phenotype_stacked_bar(meta) + +``` + +**Figure 9: Composition of Unique TCR Clonotypes by scRNA-seq Phenotype** Stacked bar chart showing the proportional diversity of T-cell phenotypes (e.g., Memory, Regulatory) for each sample, derived from single-cell gene expression profiles. Each bar indicates the percentage of unique TCR clonotypes assigned to a specific phenotype based on the transcriptional cluster of their corresponding cell(s), independent of clonal expansion size. + +**The Physical Footprint (Cell-Weighted Phenotypes)** +Weighting phenotypic categories by the absolute number of cells expressing each TCR provides a more accurate representation of biological impact. This analytical transition shifts the focus from functional diversity to actual cellular effector capacity. A specific phenotype that appears subdominant in an unweighted analysis may dominate the cell-weighted composition if a subset of its constituent clones has undergone significant antigen-driven proliferation. Conversely, a highly diverse phenotypic compartment may represent a negligible fraction of the total cellular population if those clones failed to actively expand. 
Ultimately, immune-mediated control within a tissue—whether resolving an active infection or infiltrating a tumor microenvironment—is driven not merely by the structural diversity of the T-cell repertoire, but by the aggregate cellular abundance of these specific functional states. + +```{python} +#| output: asis +#| echo: false + +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import os +import glob + +def create_phenotype_stacked_bar(meta_df, count_col='counts'): # <-- Added count_col parameter + + # --- 1. Load Data --- + file_pattern = os.path.join(pseudobulk_dir, "*_pseudobulk_phenotype.tsv") + file_paths = glob.glob(file_pattern) + + all_data = [] + + for file_path in file_paths: + try: + df = pd.read_csv(file_path, sep='\t') + basename = os.path.basename(file_path) + sample_raw = basename.replace('_pseudobulk_phenotype.tsv', '') + df['sample_raw'] = sample_raw + all_data.append(df) + except Exception as e: + continue + + if not all_data: + print("No files found matching the pattern.") + return + + combined_df = pd.concat(all_data, ignore_index=True) + + # Check if the count column exists + if count_col not in combined_df.columns: + print(f"Error: The column '{count_col}' was not found in the data. Please update the 'count_col' parameter.") + return + + # --- 2. Clean Sample Names --- + def clean_sample_name(row): + s = str(row['sample_raw']) + p = str(row['phenotype']) + suffix = f"_{p}" + if s.endswith(suffix): + return s[:-len(suffix)] + return s + + combined_df['sample'] = combined_df.apply(clean_sample_name, axis=1) + + # --- 3. Merge with Metadata --- + cols_to_use = ['sample', subject_col, timepoint_col, timepoint_order_col] + + if subject_col not in meta_df.columns: + print(f"Error: {subject_col} not found in metadata.") + return + + plot_df = combined_df.merge(meta_df[cols_to_use], on='sample', how='inner') + + # --- 4. 
ESTABLISH GLOBAL PHENOTYPE COLORS --- + all_unique_phenotypes = plot_df['phenotype'].dropna().unique().tolist() + palette = px.colors.qualitative.Plotly + px.colors.qualitative.Set1 + global_pheno_colors = {pheno: palette[i % len(palette)] for i, pheno in enumerate(all_unique_phenotypes)} + + # --- 5. Generate Tabs --- + print("::: {.panel-tabset}\n") + + unique_patients = sorted(plot_df[subject_col].unique()) + + for pat in unique_patients: + print(f"## {pat}\n") + + pat_df = plot_df[plot_df[subject_col] == pat].copy() + + # Group by Timepoint and Phenotype, SUMMING the cell counts instead of counting rows + counts = pat_df.groupby([timepoint_col, 'phenotype', timepoint_order_col])[count_col].sum().reset_index(name='count') + + # Calculate Totals per Timepoint + total_per_tp = counts.groupby(timepoint_col)['count'].transform('sum') + counts['percentage'] = (counts['count'] / total_per_tp) * 100 + + # Sort by Timepoint Order + counts = counts.sort_values(timepoint_order_col) + + # Create a separate DF for the "Total" labels + totals_df = counts[[timepoint_col, timepoint_order_col]].drop_duplicates().sort_values(timepoint_order_col) + + # Sum the counts for the annotations (N=...) + actual_sums = pat_df.groupby(timepoint_col)[count_col].sum() + totals_df['total_count'] = totals_df[timepoint_col].map(actual_sums) + + # --- Plotting --- + + fig = px.bar( + counts, + x=timepoint_col, + y='percentage', + color='phenotype', + color_discrete_map=global_pheno_colors, + title=f"Phenotype Distribution (Cellular Abundance): {pat}", # Updated Title + labels={'percentage': 'Percentage Cells (%)', timepoint_col: timepoint_col, 'phenotype': 'Phenotype'}, # Updated Labels + template="simple_white", + category_orders={ + timepoint_col: totals_df[timepoint_col].tolist(), + 'phenotype': all_unique_phenotypes + }, + hover_data={'count': True, timepoint_col: False, 'percentage': ':.1f'} + ) + + fig.update_traces( + hovertemplate="%{x}
Phenotype: %{data.name}
Percentage: %{y:.1f}%
Cell Count: %{customdata[0]}" + ) + + # Add Total Count Annotations + fig.add_trace( + go.Scatter( + x=totals_df[timepoint_col], + y=[102] * len(totals_df), + text=totals_df['total_count'], + mode='text', + texttemplate='N=%{text}', + textposition='bottom center', + showlegend=False, + hoverinfo='none' + ) + ) + + fig.update_layout( + barmode='stack', + xaxis_title="Timepoint", + yaxis_title="Percentage Cells (%)", # Updated Y-axis Title + yaxis=dict(range=[0, 110]), + width=800, + height=600, + margin=dict(t=60), + showlegend=True + ) + + fig.update_xaxes(categoryorder='array', categoryarray=totals_df[timepoint_col].tolist()) + + fig.show() + print("\n") + + print(":::\n") + +# --- Run --- +create_phenotype_stacked_bar(meta, count_col='cell_count') + +``` +**Figure 10: Clonally-Weighted TCR Phenotype Composition.** Stacked bar chart showing the proportional abundance of T-cell phenotypes for each sample, weighted by clonal size. The size of each segment is determined by summing the read counts of all TCRs assigned to that functional category, thus directly reflecting clonal expansion. + +### Phenotypic plasticity of TCR Clones {#sec-overlap-pheno} + +Beyond tracking clones over time, it is critical to understand their functional roles and whether those roles change. Phenotypic plasticity refers to the **ability of a single T-cell clonal lineage** (defined by its unique TCR sequence) **to acquire different phenotypes or functional states**. By mapping TCR clones to their phenotypic clusters (e.g., cytotoxic, regulatory, memory), we can identify clones that are "locked" into a single state versus those that span multiple functional identities. + +This analysis uses an UpSet plot to visualize the intersection of TCR clones across different phenotypic groups within a single sample. + +- **Phenotypic Fidelity (Unique Sets):** Bars corresponding to a single dot (e.g., only CD8_cytotoxic) represent clones that are exclusively found in that phenotype. 
A high number here suggests that most clones in your sample have a distinct, stable functional role. + +- **Phenotypic Plasticity (Shared Sets):** Bars corresponding to connected dots (e.g., CD8_cytotoxic -- Memory) represent clones that are present in multiple phenotypic states simultaneously. This implies that the clonal lineage has diversified. These are biologically significant as they may represent clones in transition (e.g., differentiating from effector to memory) or highly adaptable clones responding to the tumor environment. + +```{python} +#| output: asis +#| echo: false + +import upsetplot +import io +import base64 + +# --- Helper: Convert Matplotlib to Plotly Image --- +def fig_to_plotly(mpl_fig): + """Converts a Matplotlib figure to a Plotly image trace.""" + buf = io.BytesIO() + mpl_fig.savefig(buf, format='png', bbox_inches='tight') + plt.close(mpl_fig) + buf.seek(0) + img_bytes = buf.read() + img_base64 = base64.b64encode(img_bytes).decode('utf-8') + return go.Image(source='data:image/png;base64,' + img_base64) + +# --- Core Plotting Function --- +def create_patient_upset_figure(df, patient_samples, meta_df): + """ + Creates a single Plotly figure containing multiple UpSet plot images (traces). + Includes a dropdown menu to toggle between the provided samples. + """ + fig = go.Figure() + + # Map Sample ID -> Alias (or Timepoint) + if alias_col in meta_df.columns: + alias_map = meta_df.set_index('sample')[alias_col].to_dict() + else: + alias_map = {s: s for s in patient_samples} + + # Generate a trace for each sample + valid_indices = [] + + for i, sample in enumerate(patient_samples): + # 1. 
Prepare Data + sample_df = df[df['sample'] == sample] + phenotypes = sorted(sample_df['phenotype'].unique()) + + # Skip if only 1 phenotype (UpSet needs at least 2 to show intersections) + if len(phenotypes) < 2: + continue + + valid_indices.append(i) # Track which samples actually generated a plot + + clones_by_pheno = { + p: set(sample_df[sample_df['phenotype'] == p]['CDR3b']) + for p in phenotypes + } + upset_data = upsetplot.from_contents(clones_by_pheno) + + # 2. Plot with Matplotlib + mpl_fig = plt.figure(figsize=(10, 6)) + upsetplot.plot( + upset_data, + fig=mpl_fig, + sort_by='cardinality', + show_counts=True + ) + + # Title using ALIAS + alias = alias_map.get(sample, sample) + plt.suptitle(f'Shared Clones: {alias}', fontsize=16) + + # 3. Convert to Plotly Trace + plotly_trace = fig_to_plotly(mpl_fig) + fig.add_trace(plotly_trace) + + # Set visibility: Only first valid trace is True + fig.data[-1].visible = (len(valid_indices) == 1) + + # If no samples had enough phenotypes, return None + if not fig.data: + return None + + # --- Create Dropdown Buttons --- + buttons = [] + # We iterate through the traces we successfully added + for i, trace_idx in enumerate(valid_indices): + sample_id = patient_samples[trace_idx] + alias = alias_map.get(sample_id, sample_id) + + # Visibility mask + visibility = [False] * len(fig.data) + visibility[i] = True + + buttons.append(dict( + label=alias, # Dropdown shows Alias + method="restyle", + args=[{"visible": visibility}] + )) + + # --- Layout --- + fig.update_layout( + updatemenus=[dict( + active=0, + buttons=buttons, + direction="down", + x=0.0, xanchor="left", + y=1.15, yanchor="top", + showactive=True + )], + width=900, + height=700, + margin=dict(t=80) # Space for dropdown + ) + + # Hide axes (since we display images) + fig.update_xaxes(showticklabels=False, showgrid=False, visible=False) + fig.update_yaxes(showticklabels=False, showgrid=False, visible=False) + + return fig + +# --- Main Execution --- + +# 1. 
Merge Metadata (if needed) to get Patient IDs +if subject_col not in clonotypes_pheno_df.columns: + # Ensure we merge strictly on sample + clonotypes_pheno_df = clonotypes_pheno_df.merge( + meta[['sample', subject_col]], + on='sample', + how='left' + ) + +# 2. Get Patients +unique_patients = sorted(clonotypes_pheno_df[subject_col].dropna().unique()) + +# 3. Suppress Warnings +warnings.filterwarnings("ignore", category=FutureWarning) +warnings.filterwarnings("ignore", category=UserWarning) + +# 4. Generate Tabs +print("::: {.panel-tabset}\n") + +for pat in unique_patients: + print(f"## {pat}\n") + + # Get samples for this patient + # Sort them by timepoint_order_col if available in meta + pat_samples = meta[meta[subject_col] == pat].sort_values(timepoint_order_col)['sample'].tolist() + + # Filter to samples present in the clonotype data + existing_samples = [s for s in pat_samples if s in clonotypes_pheno_df['sample'].unique()] + + if existing_samples: + fig = create_patient_upset_figure(clonotypes_pheno_df, existing_samples, meta) + + if fig: + fig.show() + else: + print(f"No samples for {pat} have >1 phenotype to compare.") + else: + print(f"No data found for {pat}.") + + print("\n") + +print(":::\n") + +``` +**Figure 11: clonal sharing between phenotypes.** Upset plot showing the number of TCR clones shared between different phenotypic clusters within a single sample. + + diff --git a/notebooks/template_qc.qmd b/notebooks/template_qc.qmd new file mode 100644 index 0000000..079fc12 --- /dev/null +++ b/notebooks/template_qc.qmd @@ -0,0 +1,1938 @@ +--- +title: "Quality Control" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px + +jupyter: python3 +--- + + + +Thank you for using TCRtoolkit! This report is generated from the data provided. 
+ +:::{.callout-note collapse="true"} +## Document Information +**Current Version:** 1.0-beta +**Last Updated:** March 2026 +**Maintainer:** BTC Data Science Team +**Notes:** +::: + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name='' +project_dir='' +sample_table='' + +timepoint_col = 'timepoint' +timepoint_order_col = 'timepoint_order' +alias_col = 'alias' +subject_col = 'subject_id' + +``` + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}/{project_name}" +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_file = f"{project_dir}/annotate/concatenated_cdr3.tsv" # f"{project_dir}/compare/concatenated_cdr3.txt" + + +``` + +```{python} +#| code-fold: true + +# 1. 
Load Packages +from IPython.display import Image +import os +import datetime +import sys +import pandas as pd +import math +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.colors import LinearSegmentedColormap +import plotly.express as px +import plotly.graph_objects as go +import glob +import itertools +import h5py +import igraph as ig +import matplotlib.ticker as ticker +import networkx as nx +import numpy as np +import scipy.cluster.hierarchy as sch +import ipywidgets as widgets +from IPython.display import display, Markdown +from scipy.sparse import csr_matrix +from scipy.stats import gaussian_kde +from scipy.stats import entropy +from scipy.stats import skew, wasserstein_distance +from scipy.stats import pearsonr +from sklearn.decomposition import PCA +from sklearn.preprocessing import StandardScaler +from scipy import sparse +from sklearn.preprocessing import normalize +from scipy.stats import wilcoxon +from scipy.stats import mannwhitneyu +from itertools import combinations +from scipy.spatial.distance import pdist +from scipy.cluster.hierarchy import linkage, leaves_list +import plotly.figure_factory as ff + +import warnings + +# 2. Print pipeline parameters + +# print('Project Name: ' + project_name) +# print('Workflow command: ' + workflow_cmd) +# print('Date and time: ' + str(datetime.datetime.now())) + +# 3. 
Loading data +## reading sample metadata +meta = pd.read_csv(sample_table, sep=',') +meta_cols = meta.columns.tolist() + +## Reading combined repertoire statistics +df = pd.read_csv(sample_stats_csv, sep=',') +df = pd.merge(df, meta, on='sample', how='left') +df = df[meta_cols + [c for c in df.columns if c not in meta_cols]] + +## reading concatenated cdr3 file +concat_df = pd.read_csv(concat_file, sep='\t') + +``` + +# Technical QC & Biological Relevance 🚩/🦠 {#sec-qc-bio} +## Diversity metrics {#sec-div-metrics} + +Analyzing the architecture of a T-cell repertoire requires moving beyond simply counting the total number of unique sequences (Richness). A healthy immune system is highly diverse and relatively evenly distributed. However, upon antigen encounter, specific T-cells rapidly proliferate, causing the repertoire to become highly skewed. +To quantify this structural shift, we rely on ecological diversity metrics and inequality indices. + +- **Shannon Entropy ($H$)** +Answers the question: "How hard is it to predict the identity of a randomly drawn T-cell?" It quantifies the uncertainty of the clonotype distribution. + - ***High Entropy (High Uncertainty):*** The repertoire is very diverse. There are many different clones with comparable frequencies, so you have no idea which one you will pick next. This signifies a healthy, polyclonal repertoire. + - ***Low Entropy (Low Uncertainty):*** The repertoire is dominated by a few large clones. You can easily guess that the next T-cell will likely belong to the dominant clone. This signifies oligoclonality or clonality. + +It is calculated as: +$H = - \sum_{i=1}^{R} p_i \ln(p_i)$ +Where $R$ is the number of unique clonotypes (Richness), and $p_i$ is the frequency of the $i$-th clonotype. + +- **Inverse Simpson Index ($1/D$)** +Answers the question: "How many TCR clones actually matter in this sample?" 
It ignores the "long tail" of rare, single-read clones (which might just be sequencing errors or debris) and focuses on the dominant, expanded clones that actually drive the immune response.
+While the standard Simpson Index ($D$) measures the probability of collision (picking the same clone twice), the Inverse Simpson converts that probability into an "effective number".
+ - **High Value:** Indicates a high effective number of TCR clones. The sample is diverse and even in clone size.
+ - **Low Value (approaching 1):** Indicates the sample behaves as if it only contains 1 clone. It is highly dominated by a single expansion.
+It is calculated as:
+$\frac{1}{D} = \frac{1}{\sum_{i=1}^{R} p_i^2}$
+Where $p_i$ is the frequency of the $i$-th clonotype.
+
+::: {.callout-tip title="Pro Tip"}
+Compare with sample Richness (# of unique clones)
+
+If Richness = 5,000 but Inverse Simpson = 1.2: Your sample behaves as if it only has 1.2 clones. It is massively dominated by a single expansion.
+
+If Richness = 5,000 and Inverse Simpson = 4,500: Your sample behaves as if it has 4,500 equally sized clones. It is extremely even and polyclonal.
+:::
+
+
+- **Gini Coefficient**
+Answers the question: "Has the immune system picked a winner yet?" Measures inequality. Gini is your best metric for detecting Clonal Expansions independent of how many clones you actually sequenced.
+ - ***Value close to 0 (Perfect Equality):*** Every clone has the exact same frequency. The immune system is "resting" or "naive." It hasn't been triggered by a specific threat, so no single clone has started to divide rapidly.
+ - ***Medium value (0.3 - 0.7):*** Reactive. The immune system is fighting something. A few clones have expanded to fight a virus or tumor, but the background diversity is still there.
+ - ***High value (> 0.8):*** Monoclonal / Oligoclonal. A few clones have taken over completely. If this is a tumor sample, it might indicate a TIL (Tumor Infiltrating Lymphocyte) expansion. 
+It is calculated as: +$G = \frac{\sum_{i=1}^{R} (2i - R - 1) p_i}{R \sum_{i=1}^{R} p_i}$ + +Where the frequencies $p_i$ are sorted in ascending order, and $i$ is the rank. + +::: {.callout-tip title="Pro Tip"} +Why use this instead of Shannon? +Shannon Entropy is hard to compare if your library sizes (depth) are wildly different. Gini is normalized (always 0 to 1). +::: + +- **Hill Numbers ($^qD$)** +Answer the question: "What is the effective number of species in the sample when we ignore (penalize to degree $q$) rare clones?" This is a unified metric that combines richness, entropy, and dominance into a single scale (counts of effective species). + - ***$q=0$ (Richness):*** Counts every unique clone equally, regardless of frequency. Sensitive to sequencing depth and errors. + - ***$q=1$ (Exponential Shannon):*** Weighs clones by their frequency. Represents the number of "common" clones. + - ***$q=2$ (Inverse Simpson):*** Heavily weighs dominant clones. Represents the number of "very dominant" clones. +The general formula for Hill numbers of order $q$ is: +$^qD = \left( \sum_{i=1}^{R} p_i^q \right)^{1/(1-q)}$ +For $q=0$, this simplifies to $R$ (Total unique clonotypes).For $q=1$, the limit is undefined, so we use $\exp(H)$.For $q=2$, this simplifies to $1 / \sum p_i^2$ (Inverse Simpson). + + +```{python} +#| label: diversity-nested +#| output: asis + +# --- 1. 
CALCULATION FUNCTION --- +def calculate_diversity_metrics(df): + results = [] + # Check if 'counts' exists, if not try 'clone_count' or similar + count_col = 'counts' if 'counts' in df.columns else df.columns[0] # Fallback + + grouped = df.groupby('sample') + for sample, data in grouped: + counts = data[count_col].values + if counts.sum() == 0: continue + + p = counts / counts.sum() + shannon = -np.sum(p * np.log(p)) + inv_simpson = 1.0 / np.sum(p**2) + sorted_p = np.sort(p) + n = len(p) + index = np.arange(1, n + 1) + gini = ((2 * index - n - 1) * sorted_p).sum() / (n * sorted_p.sum()) + q0 = len(p) + q1 = np.exp(shannon) + q2 = inv_simpson + results.append({ + 'sample': sample, + 'shannon_entropy': shannon, + 'inverse_simpson': inv_simpson, + 'gini_coefficient': gini, + 'hill_q0': q0, + 'hill_q1': q1, + 'hill_q2': q2 + }) + return pd.DataFrame(results).set_index('sample') + +# --- 2. DATA PREPARATION --- +metrics = calculate_diversity_metrics(concat_df) +metrics = metrics.reset_index() + +# Ensure numeric columns are float +num_cols = metrics.select_dtypes(include=[np.number]).columns +metrics[num_cols] = metrics[num_cols].astype(float) + +# Merge with metadata +plot_df = pd.merge(metrics, meta, on='sample', how='inner') + +metrics_to_plot = [ + 'shannon_entropy', 'inverse_simpson', 'gini_coefficient', + 'hill_q0', 'hill_q1', 'hill_q2' +] + +# --- 3. 
SETUP GROUPS & ORDERING --- + +# Create Timepoint Mapping +if timepoint_col in plot_df.columns and timepoint_order_col in plot_df.columns: + time_order_map = plot_df.set_index(timepoint_col)[timepoint_order_col].to_dict() +else: + time_order_map = {} + +# Define Exclusions +exclude_cols = ['sample', 'file', 'total_counts'] +exclude_from_all_samples = [timepoint_col, timepoint_order_col, 'protocol_day', alias_col] + +# Groups for "All Samples" tab +group_opts_all = [ + col for col in meta.columns + if col not in exclude_cols + and col not in exclude_from_all_samples + and meta[col].nunique() < 35 +] + +# Groups for "By Patient" tab +group_opts_patient = [timepoint_col] if timepoint_col in plot_df.columns else [] + +unique_patients = sorted(plot_df[subject_col].dropna().unique().tolist()) + +# --- 4. PLOTTING FUNCTION --- +def create_diversity_plot(data, x_col, y_metric, custom_order=None): + if data.empty or data[x_col].dropna().empty: + return + + # Determine order + if custom_order: + unique_cats = [x for x in custom_order if x in data[x_col].unique()] + else: + unique_cats = sorted(data[x_col].dropna().unique().tolist()) + + # Base Plot + fig = px.box( + data, + x=x_col, + y=y_metric, + color=x_col, + points="all", + hover_data=['alias'], + category_orders={x_col: unique_cats}, + template="simple_white", + width=600, + height=600, + title=f"{y_metric} by {x_col}" + ) + + # Styling: Bigger dots + fig.update_traces(width=0.5, marker=dict(size=10, opacity=0.7)) + + for trace in fig.data: + if isinstance(trace, go.Box): + trace.pointpos = 0 + trace.jitter = 0.2 + + # Statistics + if len(unique_cats) >= 2: + pairs = list(combinations(unique_cats, 2)) + y_max = data[y_metric].max() + y_min = data[y_metric].min() + y_range = y_max - y_min + if y_range == 0: y_range = 1 + step_size = y_range * 0.13 + stack_counter = 0 + + for t1, t2 in pairs: + g1 = data[data[x_col] == t1][y_metric].dropna() + g2 = data[data[x_col] == t2][y_metric].dropna() + + if len(g1) < 2 or len(g2) 
< 2: continue + + try: + stat, p_value = mannwhitneyu(g1, g2, alternative='two-sided') + except ValueError: continue + + if p_value >= 0.05: continue + + if p_value < 0.001: symbol = '***' + elif p_value < 0.01: symbol = '**' + elif p_value < 0.05: symbol = '*' + + # Bracket coordinates + y_bracket = y_max + (y_range * 0.15) + (stack_counter * step_size) + y_text = y_bracket + (y_range * 0.02) + + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t2, y1=y_bracket, + line=dict(color="black", width=1.5)) + + tick_len = y_range * 0.02 + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t1, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + fig.add_shape(type="line", xref="x", yref="y", + x0=t2, y0=y_bracket, x1=t2, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + + try: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + except: x_center = 0 + + fig.add_annotation( + x=x_center, y=y_text, + text=f"{symbol}
p={p_value:.3f}", + showarrow=False, + font=dict(size=10, color="black") + ) + stack_counter += 1 + + top_margin = 60 + (stack_counter * 40) + else: + top_margin = 60 + + fig.update_layout( + showlegend=False, + yaxis_title=y_metric, + xaxis_title=x_col, + xaxis=dict(tickfont=dict(size=15)), # Increased Font Size + margin=dict(t=top_margin), + plot_bgcolor='rgba(0,0,0,0)' + ) + fig.update_yaxes(showgrid=True, gridcolor='lightgrey') + fig.show() + +# --- 5. EXECUTION LOOP --- +print(":::::: {.panel-tabset}\n") # Level 1 Start + +for metric in metrics_to_plot: + print(f"## {metric}\n") + + # Check patient count for split view + if len(unique_patients) <= 10: + print("::::: {.panel-tabset}\n") # Level 2 Start + + # --- TAB A: ALL SAMPLES --- + print("### All Samples\n") + print(":::: {.panel-tabset}\n") # Level 3 Start + for group in group_opts_all: + print(f"#### {group}\n") + create_diversity_plot(plot_df, group, metric) + print("\n") + print("::::\n") # Level 3 End + + # --- TAB B: BY PATIENT --- + print("### By Patient\n") + print(":::: {.panel-tabset}\n") # Level 3 Start + for pat in unique_patients: + print(f"#### {pat}\n") + pat_df = plot_df[plot_df[subject_col] == pat] + + for group in group_opts_patient: # Usually just timepoint_col + if pat_df[group].nunique() > 0: + # Custom sorting for timepoints + custom_order = None + if group == timepoint_col and time_order_map: + pat_tps = pat_df[group].dropna().unique().tolist() + custom_order = sorted(pat_tps, key=lambda x: time_order_map.get(x, 999)) + + create_diversity_plot(pat_df, group, metric, custom_order) + else: + print(f"No data for {group} in patient {pat}.") + print("\n") + print("::::\n") # Level 3 End + print(":::::\n") # Level 2 End + + else: + # Standard View (>10 Patients) + print("::::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"### {group}\n") + create_diversity_plot(plot_df, group, metric) + print("\n") + print(":::::\n") + +print("::::::\n") # Level 1 End + +``` +**Figure 1. 
Comparative Analysis of TCR Repertoire Diversity Metrics.** Boxplots display the distribution of metrics across samples. The interactive panel allows toggling between diversity indices and grouping variables. This visualization facilitates the assessment of repertoire diversity and evenness across different experimental conditions and biological replicates.
+
+**Use the tabs to toggle between metadata groupings:**
+
+- The distributions of **Batches** L0, L1, etc., should largely overlap. If L0 is consistently higher than L1 across all sample types, you have a Batch Effect. You cannot compare samples across batches without computational correction (we do not provide such correction).
+
+- If one **patient** (e.g. Patient01) has drastically lower diversity than all others across all timepoints and tissues, it is likely a biological anomaly or a collection issue (degraded sample). Treat this patient as a separate cohort or exclude them.
+
+- In longitudinal studies, diversity often drops slightly over time due to treatment. Diversity metrics rarely jump from 0.8 to 0.1 (or vice versa) in a short window **between timepoints**. If so, that sample is suspect. There might be some causes that could explain it: 1) Sample swap of sample timepoint labels for that patient. 2) The patient contracted a viral infection at T2, causing a massive expansion of non-tumor clones.
+
+## Hill diversity Profile
+
+It allows you to see how "TCR diversity" changes depending on how much you care about rare clones vs. dominant clones. You use Hill Profiles to catch artifacts that single metrics hide.
+$q = 0$ (Richness): Ignore frequency, every unique CDR3 counts as 1.
+$q = 1$ (Shannon Entropy): Clones are weighted by their frequency.
+$q = 2$ (Simpson): Rare clones are mathematically ignored. The top expanded clones dictate the score. 
+ +Two patients might share the exact same Shannon Entropy score, yet one has a healthy, broad repertoire while the other is battling a massive leukemia clone, for example. The Hill Diversity Profile reveals this difference by visualizing the trade-off between Richness (total number of clones) and Evenness (how equally those clones are distributed) in a single curve. + +**How to Read the Curve?** +Think of the **X-axis** ($q$) as a **"sensitivity dial" for dominance**. At the far left ($q=0$), the metric cares only about presence; it counts every unique clone equally, regardless of whether it appears once or a million times. This point represents your total Richness. **As you move right toward $q=2$, the mathematical weight shifts drastically toward the most abundant clones**, effectively ignoring the rare ones. **The slope of the line tells you the story**: a flat line indicates a perfectly even population where everyone is equal, while **a steep**, crashing slope **reveals a population dominated by a few massive "bully" clones.** + +**Here are 2 possible scenarios:** + +- **Scenario A:** The "Depth Trap" +Imagine you are **comparing Sample A and Sample B**, and you immediately notice that **Sample A has a much higher Hill $q=0$ value**, which represents richness. You might be **tempted to conclude that Sample A is biologically "more diverse"** and has a healthier T cell repertoire. However, when you look at the **Hill Profile as it moves toward $q=1$ and $q=2$**, you see the **lines rapidly converge or even cross, showing no difference in the dominant clones.** This reveals that the **"diversity" in Sample A was an illusion caused by sequencing depth.** You simply spent more money sequencing Sample A, allowing the machine to pick up more rare, single-copy clones that were missed in Sample B. The structure of the immune system was identical in both; one was just measured with a magnifying glass while the other was measured with the naked eye. 
+ +- **Scenario B:** Real Biology +Consider a case where you are analyzing a **"Responder" patient versus a "Non-Responder" in a cancer trial**. If you only looked at a single metric like **Shannon Entropy ($q=1$), the two patients might look identical, leading you to think the treatment had no effect on the repertoire structure.**. But the Hill Profile tells a different story. At $q=0$, the Non-Responder is much higher, indicating a vast, unfocused collection of rare T cells that aren't doing much. As you move to **$q=2$, the lines cross, and the Responder suddenly shoots up**. This crossover reveals that **while the Non-Responder has more "types" of cells, the Responder has successfully expanded a specific "army" of effector clones** to fight the tumor**. The profile proves that their immune systems are structurally opposite, a critical biological insight that a single summary statistic would have completely hidden. + +```{python} +#| output: asis +#| fig-width: 6 +#| fig-height: 4.5 +#| label: hill + +# --- 1. Setup Data & Grouping --- +hill_cols = {'hill_q0': 0, 'hill_q1': 1, 'hill_q2': 2} + +exclude_cols = ['sample', 'file', 'total_counts', 'filename', 'sample_id', 'alias', + 'shannon_entropy', 'gini_coefficient', 'inverse_simpson', + 'hill_q0', 'hill_q1', 'hill_q2', 'mean_len', 'std_len'] + +group_opts = [ + col for col in plot_df.columns + if col not in exclude_cols and plot_df[col].nunique() < 35 +] + +sns.set_theme(style="whitegrid", context="paper", font_scale=1.1) + +# --- 2. 
Start Quarto Tabs --- +print("::: {.panel-tabset}") + +for col in group_opts: + print(f"## {col}") + + # Data Prep + subset = plot_df[[col, 'alias', 'hill_q0', 'hill_q1', 'hill_q2']].copy() + + melted = subset.melt( + id_vars=['alias', col], + value_vars=['hill_q0', 'hill_q1', 'hill_q2'], + var_name='q_metric', + value_name='Diversity' + ) + melted['q'] = melted['q_metric'].map(hill_cols) + melted[col] = melted[col].astype(str) + + # --- Y-AXIS SYNCHRONIZATION --- + # Calculate global min/max for this tab to lock axes + y_min = melted['Diversity'].min() + y_max = melted['Diversity'].max() + + # Add 5% log-padding so points aren't cut off + log_min = np.log10(y_min) + log_max = np.log10(y_max) + pad = (log_max - log_min) * 0.05 + + # Matplotlib limits (Linear values) + mpl_ylim = (10**(log_min - pad), 10**(log_max + pad)) + + # Plotly limits (Log10 values) + plotly_yrange = [log_min - pad, log_max + pad] + + # --- COLOR CONSISTENCY --- + unique_cats = sorted(melted[col].unique()) + palette_tuples = sns.color_palette("viridis", n_colors=len(unique_cats)) + hex_colors = [f'#{int(r*255):02x}{int(g*255):02x}{int(b*255):02x}' for r,g,b in palette_tuples] + color_map = dict(zip(unique_cats, hex_colors)) + + # =========================== + # PLOT 1: SEABORN (Summary) + # =========================== + plt.figure(figsize=(6, 4.5)) # Standard size + + sns.lineplot( + data=melted, + x='q', y='Diversity', + hue=col, style=col, + markers=True, dashes=False, + linewidth=2.5, + palette=color_map, + err_style='band', errorbar=('ci', 95) + ) + + plt.yscale('log') + plt.ylim(mpl_ylim) + plt.xticks([0, 1, 2], ['q=0\n(Richness)', 'q=1\n(Shannon)', 'q=2\n(Simpson)']) + plt.title(f'Average Profile by {col} (with 95% CI)') + plt.ylabel('Effective Clones (Log Scale)') + plt.xlabel('') + + plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title=col, + fontsize='x-small', title_fontsize='small', frameon=False) + plt.tight_layout() + plt.show() + + # =========================== + # PLOT 
2: PLOTLY (Interactive) + # =========================== + + fig = px.line( + melted, + x='q', + y='Diversity', + color=col, + line_group='alias', # Keep sample here to ensure lines are grouped by the unique identifier + hover_name='alias', + + hover_data={col: True, 'q': False, 'Diversity': ':.2f', 'alias': True}, + log_y=True, + title=f'Individual Sample Profiles (Hover to Identify)', + color_discrete_map=color_map + ) + + fig.update_traces( + mode="lines+markers", + line=dict(width=3), + opacity=0.6, + marker=dict(size=6) + ) + + fig.update_layout( + template="simple_white", + width=600, + height=450, + xaxis=dict( + tickmode='array', + tickvals=[0, 1, 2], + ticktext=['q=0 (Richness)', 'q=1 (Shannon)', 'q=2 (Simpson)'], + title='Sensitivity (q)' + ), + yaxis=dict( + title='Effective Clones (Log Scale)', + range=plotly_yrange + ), + legend=dict(title=col) + ) + + fig.show() + + print("\n") + +print(":::") + +``` +**Figure 2. Hill Diversity Profiles of TCR Repertoires.** Diversity profiles display the effective number of clones (log scale) across sensitivity orders $q=0$ (Richness), $q=1$ (Shannon), and $q=2$ (Simpson). The top panel illustrates the mean repertoire structure for each subject with 95% confidence intervals, highlighting differences in evenness and dominance. The bottom panel visualizes individual sample trajectories, revealing heterogeneity and outlier profiles within each cohort. + +**Using the above metrics to identify technical Failures 🚩** +Before attempting biological interpretation, you must validate that your metrics reflect immune biology and not library preparation artifacts. Use these metrics to flag and exclude compromised samples. + +**1. Amplification Bias** +**Symptom:** A sample with moderate Richness ($q=0$) but extreme Dominance ($q=2$). +**Metric Signature:** + +- Shannon ($H$): Disproportionately low compared to the cohort baseline. +- Gini Coefficient: Approaches 1.0 (>0.9 is suspicious in non-tumor samples). 
+- Hill Profile: The curve starts high at $q=0$ (indicating presence of unique sequences) but crashes vertically as $q \to 2$.
+
+**Diagnosis:** A "Jackpot" event occurred where a random RNA molecule was preferentially amplified during PCR. The "richness" is likely sequencing noise/errors, while the reads are consumed by a single artifact.
+**Action:** Discard sample.
+
+**2. Sequencing Bias**
+**Symptom:** Two samples appear to have different diversities, but the difference disappears when you ignore rare clones.
+**Metric Signature:**
+
+- Hill Profile: Sample A has a much higher $q=0$ (Richness) than Sample B, but the lines converge at $q=1$ and $q=2$.
+
+**Diagnosis:** This is not a biological difference. Sample A was simply sequenced deeper, detecting more single-copy clones (noise). The structural effective diversity is identical.
+**Action:** Do not claim Sample A is "more diverse." Rely on Shannon ($q=1$) or Simpson ($q=2$) for comparison, or downsample (rarefy) the libraries to equal depth.
+
+**3. Library Failure**
+**Symptom:** A sample that looks "empty" compared to the cohort.
+**Metric Signature:**
+
+- Richness ($R$): Drastically lower than the cohort mean.
+- Hill Profile: A flat line hovering near the bottom of the Y-axis.
+**Diagnosis:** Poor RNA extraction, degradation, or a biopsy that captured mostly fat/stroma (low T-cell content).
+**Action:** Exclude from analysis.
+
+**4. Batch Effects**
+**Symptom:** Diversity metrics cluster by processing date rather than biological group.
+**Metric Signature:** Visual Check: Boxplots of Shannon Entropy grouped by Batch_ID. If Batch 0 is consistently higher than 1 across all sample types (tumor, blood, healthy), you have a technical batch effect.
+**Action:** You cannot compare raw metrics across these batches.
+
+
+**Biological Interpretation of the above metrics 🦠**
+Once QC is cleared, these metrics describe the shape of the immune repertoire.
+
+**1. Naive vs. 
Reactive Profiles**
+**Naive / Resting State:**
+
+- Signature: High Richness, High Shannon, Low Gini (< 0.3).
+- Interpretation: The army is standing by. There is no dominant clone because no specific threat has triggered an expansion. The Hill profile will show a "gentle decline."
+**Reactive / Tumor Infiltrating:**
+
+- Signature: Low Shannon, High Inverse Simpson, High Gini (> 0.6).
+- Interpretation: Clonal Expansion. The immune system has "picked a winner." A few specific clones (e.g., tumor-reactive or viral-specific) have expanded to occupy a large fraction of the repertoire (20–50%).
+
+**2. Inverse Simpson**
+The Inverse Simpson Index ($1/D$) converts the probability of coincidence into a concrete "count" of effective clones. It filters out the long tail of rare sequences to quantify the number of clones driving the response.
+**High Effective Number** ($1/D \gg 100$):
+
+- Signature: $1/D$ is close to Richness ($R$; i.e. hill_q0).
+- Interpretation: Polyclonal/Diverse. The sample behaves as if it is composed of hundreds or thousands of equally abundant clones. Resources are distributed broadly.
+
+**Low Effective Number ($1/D < 20$):**
+
+- Signature: $1/D$ is a small fraction of Richness ($R$).
+- Interpretation: Oligoclonal/Focused. Despite potentially containing thousands of unique sequences ($R$), the sample behaves biologically as if it only contains ~10-20 active clones. This confirms a highly focused effector response.
+
+**3. Hill Profile**
+This is the most powerful visualization for distinguishing Responder vs. Non-Responder dynamics in longitudinal data.
+**The Scenario:** A Non-Responder may have higher Richness ($q=0$) due to a chaotic, unfocused repertoire. A Responder might have lower Richness but higher Dominance.
+**The Signature:** The Hill curves will cross. At $q=0$ (Richness): the Non-Responder is higher. At $q=2$ (Dominance): the Responder shoots up. 
+**Interpretation**: The crossover proves the Responder has successfully funneled resources into a specific "army" of effector clones, structurally reorganizing the repertoire to fight the tumor. + +## Combining metrics {#sec-combine-metrics} + +### Evenness Check (Gini vs Shannon) + +Under normal biological conditions, the Gini coefficient and Shannon entropy should exhibit a strong negative correlation. As the immune system focuses on a specific threat, a few clones expand to dominate the repertoire, causing inequality to rise (High Gini) while overall diversity naturally falls (Low Shannon). + +```{python} +#| output: asis +#| fig-width: 6 +#| fig-height: 5 + +# 1. Setup Grouping Options +exclude_cols = ['sample', 'file', 'total_counts', 'filename', 'sample_id', + 'shannon_entropy', 'gini_coefficient', 'inverse_simpson', + 'hill_q0', 'hill_q1', 'hill_q2', 'mean_len', 'std_len'] + +group_opts = [ + col for col in plot_df.columns + if col not in exclude_cols and plot_df[col].nunique() < 35 +] + +# 2. 
Start Quarto Tabset +print("::: {.panel-tabset}") + +for col in group_opts: + print(f"## {col}") + + # Smaller Figure Size + plt.figure(figsize=(6, 5)) + + # "paper" context scales everything down compared to "talk" + sns.set_theme(style="whitegrid", context="paper") + + hue_data = plot_df[col].astype(str) + + sns.scatterplot( + data=plot_df, + x='shannon_entropy', + y='gini_coefficient', + hue=hue_data, + style=hue_data, + s=80, # Smaller dots on the plot (was 150) + alpha=0.8, + palette='viridis', + edgecolor='black', + linewidth=0.5 # Thinner edge lines + ) + + plt.title(f'Evenness by {col}', fontsize=12) + plt.xlabel('Shannon (Diversity)', fontsize=10) + plt.ylabel('Gini (Inequality)', fontsize=10) + + # Compact Legend Control + plt.legend( + bbox_to_anchor=(1.02, 1), + loc='upper left', + title=col, + fontsize='small', # Shrink text + title_fontsize='small', # Shrink title + markerscale=0.7, # Shrink dots inside legend + frameon=False # Optional: Remove box border to save visual space + ) + + plt.tight_layout() + plt.show() + + print("\n") + +print(":::") + +``` + +**Figure 3. Evaluation of Repertoire Structure via Gini-Shannon Correlation.** This scatter plot contrasts the Gini coefficient (inequality) against Shannon entropy (diversity) for individual samples, colored by metadata variables. The visualization assesses the expected inverse relationship between the two metrics, where antigen-driven clonal expansion typically results in higher inequality and reduced diversity. + +Healthy, **naive, or polyclonal samples** will naturally cluster in the **bottom-right** of the plot. These repertoires are **characterized by high diversity** ($H > 8$) and low inequality ($G < 0.2$), reflecting a deep pool of clones with relatively even frequencies. In contrast, **samples undergoing an active immune response**—such as those from tumors, acute infections, or leukemia—will drift toward the **top-left**. 
In this "Expansion Zone," the biological dominance of a few clones drives the Gini coefficient up ($> 0.6$) and suppresses the entropy, a signature that is expected in disease states but signals potential PCR bias ("jackpotting") if observed in negative controls. + +The most critical **QC artifact to watch** for is the "Ghost Library" in the **bottom-left corner**. These samples show **low diversity but paradoxically low inequality**. This almost always indicates severe **undersampling**: if a library only contains 50 reads and every read is unique, the sample will appear "perfectly equal" (Gini $\approx$ 0) simply because it lacks the depth to reveal the true distribution. These samples are statistical phantoms and should be **discarded**. + +Finally, samples falling in the **top-right (High Diversity + High Inequality)** represent a **mathematical contradiction for T-cell data**. It is impossible to have a massive, diverse "tail" of clones while simultaneously having a single clone occupy the majority of reads. Samples in this region often indicate **bioinformatic errors**, such as merging incompatible library files or calculating metrics on raw counts rather than frequencies. + +### Richness vs 1/D + +The Inverse Simpson ($1/D$) is used to visualize the magnitude of the response. It ignores the long tail of rare clones. It is a good practice to compare $1/D$ to Richness ($R$). +- If $R = 5000$ and $1/D = 4500$: The sample is polyclonal (many clones of equal size). +- If $R = 5000$ and $1/D = 2$: The sample is oligoclonal. Despite having 5000 unique sequences, it behaves biologically as if it only has 2 clones. + +```{python} +#| output: asis +#| fig-width: 6 +#| fig-height: 5 + +# 1. 
Setup Grouping Options +exclude_cols = ['sample', 'file', 'total_counts', 'filename', 'sample_id', + 'shannon_entropy', 'gini_coefficient', 'inverse_simpson', + 'hill_q0', 'hill_q1', 'hill_q2', 'mean_len', 'std_len'] + +# Automatically detect categorical columns with reasonable cardinality +group_opts = [ + col for col in plot_df.columns + if col not in exclude_cols and plot_df[col].nunique() < 35 +] + +# ============================================================================== +# PLOT SET 1: RICHNESS vs INVERSE SIMPSON (The "Depth Trap" Check) +# ============================================================================== +print("::: {.panel-tabset}") + +for col in group_opts: + print(f"## {col}") + + plt.figure(figsize=(6, 5)) + sns.set_theme(style="whitegrid", context="paper") + + hue_data = plot_df[col].astype(str) + + # Scatter Plot + sns.scatterplot( + data=plot_df, + x='inverse_simpson', + y='hill_q0', + hue=hue_data, + style=hue_data, + s=80, alpha=0.8, palette='viridis', edgecolor='black', linewidth=0.5 + ) + + # Add Diagonal Line (y=x) for reference + # Points on this line are perfectly even. Points high above it are dominated by rare clones. + max_val = max(plot_df['hill_q0'].max(), plot_df['inverse_simpson'].max()) + plt.plot([0, max_val], [0, max_val], ls="--", c=".3", label="Perfect Evenness (y=x)") + + plt.title(f'Richness vs. Effective Clones by {col}', fontsize=12) + plt.xlabel('Inverse Simpson (Effective # Clones)', fontsize=10) + plt.ylabel('Hill q=0 (Total Richness)', fontsize=10) + + plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title=col, + fontsize='small', title_fontsize='small', markerscale=0.7, frameon=False) + + plt.tight_layout() + plt.show() + print("\n") + +print(":::") +print("\n") + +``` + +**Figure 4. Analysis of Clonal Dominance vs. Effective Clone Count.** This scatter plot compares the total clonal richness against the effective number of clones, colored by metadata variables. 
The diagonal line ($y=x$) represents a theoretical state of perfect evenness, where every clone has equal frequency. Deviations from this line indicate the degree of clonal dominance. + +### Gini Coefficient vs. Effective Clones + +This plot maps the trade-off between unevenness and effective size. Typically, these metrics are inversely correlated: as inequality (Gini) rises, the number of effective clones (Inverse Simpson) drops. Deviations from this curve reveal samples with unusual structural properties + +```{python} +#| output: asis +#| fig-width: 6 +#| fig-height: 5 +#| +print("::: {.panel-tabset}") + +for col in group_opts: + print(f"## {col}") + + plt.figure(figsize=(6, 5)) + sns.set_theme(style="whitegrid", context="paper") + + hue_data = plot_df[col].astype(str) + + sns.scatterplot( + data=plot_df, + x='inverse_simpson', + y='gini_coefficient', + hue=hue_data, + style=hue_data, + s=80, alpha=0.8, palette='rocket', edgecolor='black', linewidth=0.5 + ) + + plt.title(f'Inequality vs. Diversity by {col}', fontsize=12) + plt.xlabel('Inverse Simpson (Effective # Clones)', fontsize=10) + plt.ylabel('Gini Coefficient (Inequality)', fontsize=10) + plt.ylim(0, 1.05) # Gini is bounded [0,1] + + plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', title=col, + fontsize='small', title_fontsize='small', markerscale=0.7, frameon=False) + + plt.tight_layout() + plt.show() + print("\n") + +print(":::") +``` + +**Figure 5. Relationship Between Repertoire Inequality and Effective Diversity**. The visualization demonstrates the inverse correlation typically observed in immune repertoires, where high inequality (dominance by a few clones) corresponds to a low effective population size. Deviations from this structural trend can identify samples with aberrant clonal distributions. + +**Combined metrics: Quality Control 🚩** +Use these 2D plots to distinguish between sequencing artifacts and true biological signal. + +**1. Richness vs. 
Effective Clones**
+This plot exposes the "useless tail" of your library—the sequences that appear only once or twice and do not contribute to the effective immune response.
+
+- The Diagonal Line ($y=x$): A sample on this line has no rare clones; every unique sequence is equally abundant.
+- Vertical Separation (The Warning): If Sample A is far above Sample B on the Y-axis (Richness) but they align perfectly on the X-axis (Effective Clones), Sample A is not more diverse. It was simply sequenced deeper. The extra richness is noise (singletons).
+- Action: Trust the X-axis (Inverse Simpson) for biological comparison. Ignore the Y-axis offset. Samples clustered in the bottom-left corner ($<100$ on both axes) often indicate failed library preparation or low-input samples (e.g., fibrous tissue with no T-cells).
+
+**2. Gini vs. Effective Clones Plot**
+This plot validates the structural integrity of the library.
+
+- **The "Impossible" Quadrant:** You should rarely see High Gini (>0.8) combined with High Effective Clones (>1000). Mathematically, you cannot have extreme inequality while simultaneously maintaining thousands of effectively equal clones. If you see this, check for chimeric reads or alignment errors.
+
+- **The "False Dominance":** If a Baseline/PBMC sample appears in the Top-Left corner (High Gini, Low Effective Count), it is a red flag. Unless the patient has a known blood malignancy, a resting control sample should never look like a tumor. This indicates a PCR Jackpot artifact.
+
+
+**Combined metrics: Biological Interpretation 🦠**
+Once QC is cleared, the position of a sample in these 2D spaces defines its immunological state.
+
+**1. Gini vs. Effective Clones (1/D)**
+**The Naive/Polyclonal State (Bottom-Right):**
+The immune system is in surveillance mode. No specific antigen has triggered a response. Resources are distributed broadly across thousands of clones.
+**The Reactive/Focused State (Top-Left):**
+Clonal Expansion. 
The immune system has identified a threat (tumor/virus) and "picked a winner." The repertoire is dominated by a focused army of effector clones, physically crowding out the diversity. + +**2. Tracking Response** +In a successful immunotherapy response, track the patient's movement across the plot over time: +**The "Focusing" Shift:** A responder should move diagonally Up and Left. They start with a broad repertoire (Bottom-Right) and shift toward dominance (Top-Left) as tumor-reactive clones expand. +**The "Ignorance" Stasis:** Non-responders often remain stuck in the Bottom-Right quadrant (high diversity, low inequality), indicating the treatment failed to trigger T-cell expansion. + +## TCR Overlap {#sec-tcr-overlap} + +We use two distinct metrics to compare repertoires. One measures membership (who is there?), and the other measures structure (who is dominant?). + +- **1. Jaccard Index:** Measures the overlap of unique sequences, ignoring frequency. It treats a singleton and a top clone equally. +**Question:** "What fraction of the unique CDR3s are found in both samples?" +**Range:** $0$ (No shared sequences) $\to$ $1$ (Identical unique sequence list). +**Formula:** +$$J(A,B) = \frac{|A \cap B|}{|A \cup B|}$$ +Where $|A \cap B|$ is the count of unique clonotypes shared by both samples, and $|A \cup B|$ is the total count of unique clonotypes in the union of both samples. + +- **2. Morisita-Horn Index:** Measures the overlap of dominant populations. It accounts for clone abundance, meaning rare clones have mathematically negligible impact on the score. +**Question**: "If I randomly draw two T-cells from these two samples, what is the probability they are the same clone?" +**Range:** $0$ (Completely different dominant clones) $\to$ $1$ (Identical frequencies). 
+**Formula:** +$$C_{MH} = \frac{2 \sum_{i=1}^{S} p_{ai} p_{bi}}{\sum_{i=1}^{S} p_{ai}^2 + \sum_{i=1}^{S} p_{bi}^2}$$ +Where $p_{ai}$ and $p_{bi}$ are the relative frequencies of the $i$-th clonotype in Sample A and Sample B, respectively. Note that the denominator terms ($\sum p^2$) are essentially the Simpson Indices of the individual samples. + +```{python} +#| output: asis +#| fig-width: 7 +#| fig-height: 7 + +def calculate_overlaps(df): + samples = sorted(df['sample'].unique()) + clones = sorted(df['CDR3b'].unique()) + + sample_map = {s: i for i, s in enumerate(samples)} + clone_map = {c: i for i, c in enumerate(clones)} + + row_idx = df['sample'].map(sample_map).values + col_idx = df['CDR3b'].map(clone_map).values + values = df['counts'].values + + n_samples = len(samples) + n_clones = len(clones) + + mat = sparse.coo_matrix((values, (row_idx, col_idx)), shape=(n_samples, n_clones)).tocsr() + + # Jaccard (Binary) + mat_bin = mat.copy() + mat_bin.data[:] = 1 + intersection = mat_bin.dot(mat_bin.T).toarray() + row_sums = mat_bin.sum(axis=1).A1 + union_matrix = row_sums[:, None] + row_sums[None, :] - intersection + + jaccard = np.divide(intersection, union_matrix, + out=np.zeros_like(intersection, dtype=float), + where=union_matrix!=0) + + # Morisita (Frequency) + mat_freq = normalize(mat, norm='l1', axis=1) + simpson = mat_freq.power(2).sum(axis=1).A1 + numerator = 2 * mat_freq.dot(mat_freq.T).toarray() + denominator = simpson[:, None] + simpson[None, :] + + morisita = np.divide(numerator, denominator, + out=np.zeros_like(numerator), + where=denominator!=0) + + return pd.DataFrame(jaccard, index=samples, columns=samples), \ + pd.DataFrame(morisita, index=samples, columns=samples) + +# --- 2. EXECUTE CALCULATION --- +jaccard_df, morisita_df = calculate_overlaps(concat_df) + +# --- 3. 
RENAME TO ALIAS --- +if alias_col in meta.columns: + sample_to_alias = meta.set_index('sample')[alias_col].to_dict() + jaccard_df = jaccard_df.rename(index=sample_to_alias, columns=sample_to_alias) + morisita_df = morisita_df.rename(index=sample_to_alias, columns=sample_to_alias) + +# --- 4. CLUSTERING HELPER --- +def get_clustered_order(matrix): + """ + Returns the index/column names sorted by hierarchical clustering. + """ + # Calculate distance (1 - similarity) + # Using 'cityblock' (Manhattan) often works well for abundance/binary data + dist = pdist(matrix.values, metric='cityblock') + link = linkage(dist, method='average') + leaves = leaves_list(link) + + # Return the labels in the clustered order + return matrix.index[leaves].tolist() + +# --- 5. INTERACTIVE PLOTTING FUNCTION --- +def plot_interactive_heatmap(matrix, title): + # 1. Get Clustered Order + try: + ordered_labels = get_clustered_order(matrix) + # Reorder the matrix + matrix = matrix.loc[ordered_labels, ordered_labels] + except Exception as e: + print(f"Clustering failed, using default order: {e}") + + # 2. Mask Diagonal for Color Scaling (optional, to see contrast better) + # Plotly doesn't natively mask, but we can set diagonal to NaN if we want it empty + # For now, we leave it as 1.0 (or max) but control the colorscale range + + vals = matrix.values + # Exclude diagonal from max calculation to prevent 1.0 from dominating small overlaps + mask = ~np.eye(vals.shape[0], dtype=bool) + max_val = vals[mask].max() if mask.any() else 1.0 + if max_val < 0.05: max_val = 0.1 # Minimum floor + + # 3. Create Heatmap + fig = px.imshow( + matrix, + labels=dict(x="Sample", y="Sample", color="Overlap"), + x=matrix.columns, + y=matrix.index, + color_continuous_scale="Reds", + range_color=[0, max_val], # Cap color scale at max non-diagonal value + title=f"{title}
(Max Overlap: {max_val:.3f})" + ) + + # 4. Refine Layout + fig.update_layout( + width=700, + height=700, + xaxis=dict(tickangle=90, tickfont=dict(size=10)), + yaxis=dict(tickfont=dict(size=10)), + plot_bgcolor='black', # Background color for NaN/Empty + ) + + # 5. Add Grid Lines (simulating linewidths) + # Plotly heatmaps don't have 'linewidth', but we can add gaps using gap + fig.update_traces(xgap=1, ygap=1) + + fig.show() + +# --- 6. OUTPUT TABS --- +print("::: {.panel-tabset}\n") + +print("## Jaccard\n") +plot_interactive_heatmap(jaccard_df, "Jaccard Index") +print("\n") + +print("## Morisita\n") +plot_interactive_heatmap(morisita_df, "Morisita Index") +print("\n") + +print(":::\n") + +``` +**Figure 6. Assessment of Clonal Overlap and Cross-Contamination.** +Heatmaps display the pairwise similarity between TCR repertoires using the Jaccard (overlap) index, with an option to toggle to the Morisita (abundance-weighted) index. Hierarchical clustering groups samples based on their shared clonal content, facilitating the identification of biological replicates or potential cross-contamination events. The color intensity reflects the degree of similarity, where darker red indicates a higher proportion of shared clonotypes between sample pairs. + +::: {.callout-tip title="Note"} +When calculating Jaccard for this specific purpose (contamination), we are calculating it on the nucleotide sequence, not the amino acid sequence, **whenever the TCRseq whas done using DNA as input**. Convergent recombination (different DNA making the same Amino Acid) is common in high-depth samples and will give you a false positive "contamination" signal if you stick to Amino Acids. Contamination should be measured at the physical level (DNA). +::: + +**Technical QC 🚩** +Use these overlap metrics to detect the two most dangerous errors in sequencing: **Sample Swapping and Physical Contamination.** + +- **1. 
Identity Validation: Morisita-Horn Index**
+  - Red Flag: A dark red square (similarity $> 0.90$; Dark Red) between two different Patient IDs.
+  - Diagnosis: Sample Swap / Duplication. Biologically, two humans cannot have identical repertoires and share the exact same dominant clones at identical frequencies ($>90\%$). You likely pipetted the same library into two wells, or the labels were swapped during handling.
+  - Action: If Patient_A and Patient_B have a Morisita of 0.98, the metadata is wrong. Treat them as the same source or discard both.
+
+- **2. Contamination Detection: Jaccard Index**
+  - Red Flag: A block of red squares connecting unrelated samples (Overlap $> 0.05 - 0.1$; Pink/Red).
+  - Diagnosis: Cross-Contamination. Because TCRs are hypervariable, distinct individuals should almost never share rare sequences (Jaccard should be near 0).
+  - Scenario A (Splash): Patient_A and Patient_B share 20% of their unique sequences. Physical liquid likely splashed between wells during PCR setup.
+  - Scenario B (Barcode Hopping): If an entire batch shows faint overlap (~5-10%) with a high-concentration sample, it indicates index hopping during sequencing.
+  - Action: Flag samples with unexpected Jaccard scores $>0.05$.
+
+
+**Biological Interpretation 🦠**
+Once technical artifacts are ruled out, overlap quantifies stability and shared biology.
+
+- **1. Longitudinal Stability: Morisita-Horn**
+Use this to track the persistence of the immune response over time within the same patient.
+  - High Overlap ($> 0.8$): Indicates Repertoire Stability. The dominant clones present at Timepoint 1 are still the dominant clones at Timepoint 2. This is expected in healthy controls or stable disease.
+  - Dropping Overlap: If Morisita drops significantly between Pre- and Post-treatment, it confirms Repertoire Reshaping. The drug has successfully forced the expansion of a new set of clones, replacing the old dominant population.
+
+- **2. 
Public Clones: Jaccard** + - Low but Real Overlap (< 0.01):While distinct humans rarely share repertoires, they may share identical "Public Clones" (often viral-specific, e.g., CMV or EBV). + - Interpretation: A very faint Jaccard signal (0.001 - 0.01) across a cohort can indicate a shared antigen exposure (e.g., a common viral infection in the population) or convergent recombination events. + + +# Technical QC Relevance (🚩) {#sec-qc} +The following section presents metrics designed to assess the processing quality of the samples. These technical QC checks are essential for differentiating between genuine biological signals and technical artifacts, including library failures, undersampling, batch effects, and outliers. However, we acknowledge that some of these metrics can also carry biological meaning, depending on the specific biological context of the analysis. + +## Percent of productive TCRs {#sec-cdr3-prod} + +The percent of productive TCRs is one of the first metrics you should check to validate your data's reliability before proceeding to any biological analysis. + +As a Quality Filter: You should set a QC threshold (e.g., >75% productive reads). Samples that fail to meet this threshold should be flagged for review or potentially excluded from the analysis. Drawing conclusions about T-cell diversity or clonality from a sample with low productivity is unreliable, as the data is likely noisy and not a true representation of the functional T-cell repertoire. + +To Troubleshoot Experiments: If you find that an entire batch of samples has a low productive percentage, it points to a systematic issue in your experimental protocol, most commonly an ineffective gDNA removal step or problems with RNA integrity. 
+ +To Ensure Confidence in Results: By confirming that your samples have a high percentage of productive TCRs, you can be confident that the clonotypes you identify and analyze represent real, functional T-cells that are actively participating in the immune response. + +```{python} +#| output: asis +#| echo: false + +# Create a mapping for timepoint order using the 'meta' dataframe +if 'meta' in locals() and timepoint_col in meta.columns and timepoint_order_col in meta.columns: + time_order_map = meta.set_index(timepoint_col)[timepoint_order_col].to_dict() +else: + # Fallback: if 'meta' isn't loaded, try to use columns in the main df + if timepoint_order_col in df.columns: + time_order_map = df.set_index(timepoint_col)[timepoint_order_col].to_dict() + else: + time_order_map = {} # No sorting map available + +# 1. Define General Exclusions +exclude_cols = ['sample', 'file', 'clonality', 'counts', 'total_counts', + 'num_clones', 'num_TCRs', 'simpson_index', 'simpson_index_corrected', + 'num_prod', 'num_nonprod', 'pct_prod', 'pct_nonprod', + 'productive_cdr3_avg_len', 'num_convergent','ratio_convergent'] + +# 2. Define Groups for "All Samples" Tab (Exclude timepoint/order) +exclude_from_all_samples = [timepoint_col, timepoint_order_col, 'protocol_day', alias_col] + +group_opts_all = [ + col for col in df.columns + if col not in exclude_cols + and col not in exclude_from_all_samples + and df[col].nunique() < 35 +] + +# 3. Define Groups for "By Patient" Tab (Only timepoint) +group_opts_patient = [timepoint_col] if timepoint_col in df.columns else [] + +unique_patients = sorted(df[subject_col].dropna().unique().tolist()) + +# --- 1. DEFINE PLOTTING FUNCTION --- +def create_pct_prod_plot(data, group_col, custom_order=None): + """ + custom_order: Optional list of x-axis values in the desired order. 
+ """ + if data.empty or data[group_col].dropna().empty: + return + + # Determine order: Use custom list if provided, otherwise sort alphabetically + if custom_order: + unique_cats = [x for x in custom_order if x in data[group_col].unique()] + else: + unique_cats = sorted(data[group_col].dropna().unique().tolist()) + + # Base Plot + fig = px.box( + data, + x=group_col, + y='pct_prod', + color=group_col, + points='all', + hover_data=['alias'], + category_orders={group_col: unique_cats}, + template="simple_white", + title=f"Productive TCRs (%) by {group_col}" + ) + + # --- BIGGER DOTS (size=10) --- + fig.update_traces(width=0.5, marker=dict(size=10, opacity=0.7)) + + for trace in fig.data: + if isinstance(trace, go.Box): + trace.pointpos = 0 + trace.jitter = 0.2 + + # Stats + if len(unique_cats) >= 2: + pairs = list(combinations(unique_cats, 2)) + y_max = data['pct_prod'].max() + y_range = y_max - data['pct_prod'].min() + if y_range == 0: y_range = 1 + step_size = y_range * 0.13 + stack_counter = 0 + + for t1, t2 in pairs: + group1 = data[data[group_col] == t1]['pct_prod'].dropna() + group2 = data[data[group_col] == t2]['pct_prod'].dropna() + + if len(group1) < 2 or len(group2) < 2: continue + + try: + stat, p_value = mannwhitneyu(group1, group2, alternative='two-sided') + except ValueError: continue + + if p_value >= 0.05: continue + + if p_value < 0.001: symbol = '***' + elif p_value < 0.01: symbol = '**' + elif p_value < 0.05: symbol = '*' + + y_bracket = y_max + (y_range * 0.15) + (stack_counter * step_size) + y_text = y_bracket + (y_range * 0.02) + + # Draw Bracket + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t2, y1=y_bracket, + line=dict(color="black", width=1.5)) + + tick_len = y_range * 0.02 + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t1, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + fig.add_shape(type="line", xref="x", yref="y", + x0=t2, y0=y_bracket, x1=t2, y1=y_bracket - 
tick_len, + line=dict(color="black", width=1.5)) + + try: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + except: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + + fig.add_annotation( + x=x_center, y=y_text, text=f"{symbol}
p={p_value:.3f}", + showarrow=False, font=dict(size=10, color="black") + ) + stack_counter += 1 + top_margin = 60 + (stack_counter * 40) + else: + top_margin = 60 + + fig.update_layout( + xaxis_title=group_col, + yaxis_title="Productive TCRs (%)", + xaxis=dict(tickfont=dict(size=15)), + margin=dict(t=top_margin), + showlegend=False, + width=600, + height=600, + plot_bgcolor='rgba(0,0,0,0)' + ) + fig.update_yaxes(showgrid=True, gridcolor='lightgrey') + fig.show() + +# --- 2. GENERATE TABS --- + +if len(unique_patients) <= 10: + print("::::: {.panel-tabset}\n") + + # --- TAB A: ALL SAMPLES --- + print("## All Samples\n") + print(":::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"### {group}\n") + create_pct_prod_plot(df, group) + print("\n") + print("::::\n") + + # --- TAB B: BY PATIENT --- + print("## By Patient\n") + print(":::: {.panel-tabset}\n") + + for pat in unique_patients: + print(f"### {pat}\n") + + pat_df = df[df[subject_col] == pat] + + for group in group_opts_patient: + if pat_df[group].nunique() > 0: + + # --- APPLY CUSTOM SORTING IF AVAILABLE --- + custom_order = None + if group == timepoint_col and time_order_map: + pat_tps = pat_df[group].dropna().unique().tolist() + custom_order = sorted(pat_tps, key=lambda x: time_order_map.get(x, 999)) + + create_pct_prod_plot(pat_df, group, custom_order) + else: + print(f"No data for {group} in patient {pat}.") + print("\n") + + print("::::\n") + print(":::::\n") + +else: + # --- STANDARD VIEW (>10 Patients) --- + print("::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"## {group}\n") + create_pct_prod_plot(df, group) + print("\n") + print(":::\n") + +``` + +**Figure 7. Percent of productive TCRs.** Distribution of productive TCRs across sample timepoints. A productive TCR is a DNA/RNA sequence that can be translated into a protein sequence, i.e. it does not contain a premature stop codon or an out of frame rearrangement. 
The percent of productive TCRs is calculated as: + +$$ Percent \text{ } productive \text{ } TCRs = \frac P N $$ + +where $P$ is the number of productive TCRs and $N$ is the total number of TCRs in a given sample. + + +## Average productive CDR3 Length {#sec-cdr3-len} + +The incredible diversity is generated during the development of T-cells through a process of genetic recombination, where different gene segments (V, D, and J) are pieced together. The CDR3 region spans the junction of these segments and includes random nucleotide additions and deletions, creating a unique sequence for almost every T-cell. **The length of this CDR3 region, is a direct consequence of this recombination process.** + +The CDR3 length distribution serves as a primary metric for assessing library complexity and detecting technical biases introduced during library preparation or sequencing. While biological clonality impacts this metric, deviations in non-expanded samples often signal workflow failures rather than biological reality. + +```{python} +#| output: asis +#| echo: false + +# Create a mapping for timepoint order using the 'meta' dataframe +if 'meta' in locals() and timepoint_col in meta.columns and timepoint_order_col in meta.columns: + time_order_map = meta.set_index(timepoint_col)[timepoint_order_col].to_dict() +else: + # Fallback: if 'meta' isn't loaded, try to use columns in the main df + if timepoint_order_col in df.columns: + time_order_map = df.set_index(timepoint_col)[timepoint_order_col].to_dict() + else: + time_order_map = {} # No sorting map available + +# 1. Define General Exclusions +exclude_cols = ['sample', 'file', 'clonality', 'counts', 'total_counts', + 'num_clones', 'num_TCRs', 'simpson_index', 'simpson_index_corrected', + 'num_prod', 'num_nonprod', 'pct_prod', 'pct_nonprod', + 'productive_cdr3_avg_len', 'num_convergent','ratio_convergent'] + +# 2. 
Define Groups for "All Samples" Tab (Exclude timepoint/order) +exclude_from_all_samples = [timepoint_col, timepoint_order_col, 'protocol_day', alias_col] + +group_opts_all = [ + col for col in df.columns + if col not in exclude_cols + and col not in exclude_from_all_samples + and df[col].nunique() < 35 +] + +# 3. Define Groups for "By Patient" Tab (Only timepoint) +group_opts_patient = [timepoint_col] if timepoint_col in df.columns else [] + +unique_patients = sorted(df[subject_col].dropna().unique().tolist()) + +# --- 1. DEFINE PLOTTING FUNCTION --- +def create_cdr3_len_plot(data, group_col, custom_order=None): + """ + custom_order: Optional list of x-axis values in the desired order. + """ + if data.empty or data[group_col].dropna().empty: + return + + # Determine order: Use custom list if provided, otherwise sort alphabetically + if custom_order: + unique_cats = [x for x in custom_order if x in data[group_col].unique()] + else: + unique_cats = sorted(data[group_col].dropna().unique().tolist()) + + # Base Plot + fig = px.box( + data, + x=group_col, + y='productive_cdr3_avg_len', + color=group_col, + points='all', + hover_data=['alias'], + category_orders={group_col: unique_cats}, + template="simple_white", + title=f"CDR3 Length by {group_col}" + ) + + # --- BIGGER DOTS (size=10) --- + fig.update_traces(width=0.5, marker=dict(size=10, opacity=0.7)) + + for trace in fig.data: + if isinstance(trace, go.Box): + trace.pointpos = 0 + trace.jitter = 0.2 + + # Stats + if len(unique_cats) >= 2: + pairs = list(combinations(unique_cats, 2)) + y_max = data['productive_cdr3_avg_len'].max() + y_range = y_max - data['productive_cdr3_avg_len'].min() + if y_range == 0: y_range = 1 + step_size = y_range * 0.13 + stack_counter = 0 + + for t1, t2 in pairs: + group1 = data[data[group_col] == t1]['productive_cdr3_avg_len'].dropna() + group2 = data[data[group_col] == t2]['productive_cdr3_avg_len'].dropna() + + if len(group1) < 2 or len(group2) < 2: continue + + try: + stat, p_value = 
mannwhitneyu(group1, group2, alternative='two-sided') + except ValueError: continue + + if p_value >= 0.05: continue + + if p_value < 0.001: symbol = '***' + elif p_value < 0.01: symbol = '**' + elif p_value < 0.05: symbol = '*' + + y_bracket = y_max + (y_range * 0.15) + (stack_counter * step_size) + y_text = y_bracket + (y_range * 0.02) + + # Draw Bracket + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t2, y1=y_bracket, + line=dict(color="black", width=1.5)) + + tick_len = y_range * 0.02 + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t1, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + fig.add_shape(type="line", xref="x", yref="y", + x0=t2, y0=y_bracket, x1=t2, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + + try: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + except: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + + fig.add_annotation( + x=x_center, y=y_text, text=f"{symbol}
p={p_value:.3f}", + showarrow=False, font=dict(size=10, color="black") + ) + stack_counter += 1 + top_margin = 60 + (stack_counter * 40) + else: + top_margin = 60 + + fig.update_layout( + xaxis_title=group_col, + yaxis_title="Avg CDR3 Length (ntds)", + xaxis=dict(tickfont=dict(size=15)), + margin=dict(t=top_margin), + showlegend=False, + width=600, + height=600, + plot_bgcolor='rgba(0,0,0,0)' + ) + fig.update_yaxes(showgrid=True, gridcolor='lightgrey') + fig.show() + +# --- 2. GENERATE TABS --- + +if len(unique_patients) <= 10: + print("::::: {.panel-tabset}\n") + + # --- TAB A: ALL SAMPLES --- + print("## All Samples\n") + print(":::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"### {group}\n") + create_cdr3_len_plot(df, group) + print("\n") + print("::::\n") + + # --- TAB B: BY PATIENT --- + print("## By Patient\n") + print(":::: {.panel-tabset}\n") + + for pat in unique_patients: + print(f"### {pat}\n") + + pat_df = df[df[subject_col] == pat] + + for group in group_opts_patient: + if pat_df[group].nunique() > 0: + + # --- APPLY CUSTOM SORTING IF AVAILABLE --- + custom_order = None + if group == timepoint_col and time_order_map: + pat_tps = pat_df[group].dropna().unique().tolist() + custom_order = sorted(pat_tps, key=lambda x: time_order_map.get(x, 999)) + + create_cdr3_len_plot(pat_df, group, custom_order) + else: + print(f"No data for {group} in patient {pat}.") + print("\n") + + print("::::\n") + print(":::::\n") + +else: + # --- STANDARD VIEW (>10 Patients) --- + print("::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"## {group}\n") + create_cdr3_len_plot(df, group) + print("\n") + print(":::\n") + +``` +**Figure 8. Average Productive CDR3 Length** Visualizes the frequency of CDR3 nucleotide lengths. 
This plot is used to verify the expected Gaussian distribution typical of polyclonal repertoires and to identify skewness indicative of technical artifacts or clonal dominance + +**Quality Control Interpretations** + +**Validation of Library Complexity (Gaussian Fit):** A high-quality, polyclonal library must exhibit a near-Gaussian (bell-shaped) distribution of CDR3 lengths. + +- Significant deviations (e.g., flat, multimodal, or ragged distributions) in control or baseline samples indicate poor library complexity, RNA degradation, or sampling bottlenecks (insufficient template starting material). + +**Distinguishing Expansion from Bias:** While clonal expansion biologically causes skew, technical artifacts can mimic this signal. + +- A sharp peak at a single length in a sample expected to be naive or healthy suggests PCR bias or amplicon contamination rather than true biological expansion. +- An abundance of unusually short (<15 ntds or <5 AA) or long (>90 ntds or >30 AA) CDR3s often indicates misalignment, primer dimers, or non-specific amplification products that survived upstream filtering. + + +## Sequencing Depth vs Richness {#sec-depth-richness} + +**Does a sample appears "diverse" because it is biologically complex, or simply because it was sequenced more deeply than others?** +In an ideal experiment, **we would sequence until we reach saturation** (the point where sequencing more reads no longer reveals new clonotypes). On this plot (log-log scale), a saturated cohort appears as a horizontal plateau: increasing Total Counts (x-axis) does not significantly increase Richness (y-axis). This indicates that the library has captured the true biological ceiling of the sample, allowing for valid comparisons between patients. + +```{python} +#| output: asis +#| fig-width: 6 +#| fig-height: 5 + +# --- 1. 
Merge the missing counts data --- + +# Merge: Inner join ensures we only plot samples present in both +total_counts = concat_df.groupby('sample')['counts'].sum().to_frame(name='total_counts') +bias_plot_df = plot_df.merge(total_counts, on='sample', how='inner') + +# --- 2. Setup Grouping Options --- +exclude_cols = ['sample', 'file', 'total_counts', 'filename', 'sample_id', + 'shannon_entropy', 'gini_coefficient', 'inverse_simpson', + 'hill_q0', 'hill_q1', 'hill_q2', 'mean_len', 'std_len', timepoint_order_col] + +group_opts = [ + col for col in bias_plot_df.columns + if col not in exclude_cols and bias_plot_df[col].nunique() < 35 +] + +# --- 3. Start Quarto Tabset --- +print("::: {.panel-tabset}") + +for col in group_opts: + print(f"## {col}") + + # Calculate Correlation (Log-Log) + # We use log because diversity vs depth follows a power law. + x_val = bias_plot_df['total_counts'] + y_val = bias_plot_df['hill_q0'] # Richness + + # Handle zeros safely for log calculation + x_log = np.log1p(x_val) + y_log = np.log1p(y_val) + + if len(x_log) > 1: + corr, _ = pearsonr(x_log, y_log) + status = "⚠️ BIAS" if corr > 0.8 else "Pass" + title_str = f'Depth Bias by {col}\nR = {corr:.2f} ({status})' + else: + corr = 0 + title_str = f'Depth Bias by {col}\n(Not enough data)' + + # Create Plot + plt.figure(figsize=(6, 5)) + sns.set_theme(style="whitegrid", context="paper") + + # Force Categorical Coloring + hue_data = bias_plot_df[col].astype(str) + + ax = sns.scatterplot( + data=bias_plot_df, + x='total_counts', + y='hill_q0', + hue=hue_data, + style=hue_data, + s=80, + alpha=0.8, + palette='viridis', + edgecolor='black', + linewidth=0.5 + ) + + # Log Scales are crucial for this specific plot + ax.set_xscale('log') + ax.set_yscale('log') + + plt.title(title_str, fontsize=12) + plt.xlabel('Sequencing Depth (Total Counts)', fontsize=10) + plt.ylabel('Richness (Unique Clones)', fontsize=10) + + # Add diagonal warning if correlation is high + if corr > 0.8: + plt.text(0.05, 0.9, 
"Strong Depth Bias", + transform=ax.transAxes, color='red', fontsize=9, weight='bold') + + # Compact Legend + plt.legend( + bbox_to_anchor=(1.02, 1), + loc='upper left', + title=col, + fontsize='small', + title_fontsize='small', + markerscale=0.7, + frameon=False + ) + + plt.tight_layout() + plt.show() + + print("\n") + +print(":::") + +``` + +**Figure 9. Evaluation of Sequencing Depth Bias.** This scatter plot correlates sequencing depth (Total Counts, x-axis) with observed richness (Unique Clones, y-axis) on a log-log scale. + +A **strong positive correlation** (Pearson’s $R > 0.8$) **typically signals a technical failure in experimental design**. If your samples form a strict **diagonal line from the bottom-left to the top-right**, your **"diversity" metric is merely a proxy for library size**. In this scenario, **samples with higher read counts will artificially appear more diverse, not because of biology, but because the sampling depth was insufficient to capture the rare tail of the repertoire**. Diversity **comparisons** in such a cohort **are statistically invalid without subsampling (rarefaction)**. + +Samples with **both low sequencing depth and low richness** represent **technical dropouts**. These libraries likely suffered from **low RNA input or poor amplification efficiency**. They often lack sufficient statistical power to characterize the repertoire and **should be excluded**. + +Samples that exhibit **high sequencing depth but paradoxically low richness** are of particular interest. This discordant position—deep sampling yielding few unique clones—indicates that the repertoire is dominated by a **massive clonal expansion** (e.g., leukemia or an acute viral response) or suffers from **technical PCR "jackpotting"**. +Finding a sample with **low read counts but extreme richness** is mathematically suspicious. 
It implies that almost every read is a unique clonotype, a pattern often caused by **sequencing errors** (phantoms) or **contamination** with high-diversity amplicon debris. + +### Rarefaction Curve + +You **cannot fairly compare the diversity of a sample with 1 million reads to one with 10,000 reads**. The **deeper sample will always look richer simply because you looked harder**. Rarefaction simulates what your deep samples would look like if you had stopped sequencing earlier. + +```{python} + +# --------------------------------------------------------- +# Compute Basic Metrics: Depth & Richness +# --------------------------------------------------------- + +qc_stats = concat_df.groupby('sample').agg( + sequencing_depth=('counts', 'sum'), + clonotype_richness=('CDR3b', 'nunique') +).reset_index() + +# We extract just the metadata columns we need from 'df' +meta_safe = meta[['sample', 'alias', 'origin', timepoint_col]].drop_duplicates() + +# Merge metadata into qc_stats +qc_stats = qc_stats.merge(meta_safe, left_on='sample', right_on='sample', how='left') + +# --------------------------------------------------------- +# Rarefaction Curve Calculation +# --------------------------------------------------------- +# Rarefaction answers: "If we sequenced everyone to the same depth, how many clones would we find?" + + +def get_rarefaction_curve(sample_df, sample, steps=50): + """ + Simulates rarefaction by expanding counts and shuffling. 
+ """ + # Flat array where each element is a clone ID, repeated by its count + # Use integer IDs for memory efficiency instead of strings + counts = sample_df['counts'].values + n_reads = counts.sum() + + if n_reads == 0: + return pd.DataFrame() + + # Generate clone IDs (0 to N-1) + ids = np.arange(len(counts)) + + # e.g., if clone 0 has 2 counts -> [0, 0] + # WARNING: High memory usage for very deep sequencing + flat_repertoire = np.repeat(ids, counts) + + # Shuffle to simulate random sampling + np.random.shuffle(flat_repertoire) + + # Define depths to check (from 0 to total reads) + depths = np.linspace(0, n_reads, num=steps, dtype=int) + depths = np.unique(depths)[1:] # Remove 0 and duplicates + + results = [] + for d in depths: + subsample = flat_repertoire[:d] + n_unique = len(np.unique(subsample)) + results.append({'sample': sample, 'depth': d, 'richness': n_unique}) + + return pd.DataFrame(results) + +rarefaction_data = [] + +unique_samples = concat_df['sample'].unique() +for s in unique_samples: + s_df = concat_df[concat_df['sample'] == s] + r_df = get_rarefaction_curve(s_df, s) + rarefaction_data.append(r_df) + +df_rarefaction = pd.concat(rarefaction_data) + +# Merge metadata into rarefaction data using the safe source +df_rarefaction = df_rarefaction.merge(meta_safe, on='sample', how='left') + +# Rarefaction Curves +min_depth = qc_stats['sequencing_depth'].min() + +# 2. Plot with Log X-Axis +fig_rare = px.line(df_rarefaction, x='depth', y='richness', color='alias', + title='Rarefaction Curves (Diversity vs Depth)', + labels={'depth': 'Sequencing Depth (Total Counts)', 'richness': 'Identified Clonotypes'}, + hover_data=['origin']) + +# 3. 
Add Vertical Line with Numerical Label +min_depth = qc_stats['sequencing_depth'].min() +fig_rare.add_vline(x=min_depth, line_dash="dash", line_color="gray", + annotation_text=f"Lowest Depth: {min_depth}") + +max_limit = qc_stats['sequencing_depth'].quantile(0.90) +fig_rare.update_xaxes(range=[0, max_limit]) + +fig_rare.show() +``` +**Figure 10. Rarefaction Analysis of TCR Repertoire Diversity.** Rarefaction curves depict the accumulation of unique clonotypes (richness) as a function of sequencing depth (Total Counts) for each sample. The dotted vertical line marks the depth of the smallest library. + +Follow each sample's line from left to right to track how new clonotypes are discovered as sequencing depth increases. In an **ideal scenario**, you want to see a **saturation Plateau**: the curve shoots up initially and then flattens out into a horizontal line. This indicates success; you have sequenced the sample deeply enough to exhaust the available diversity, and spending more money on reads would yield no new information. + +You will often see ***two distinct deviations from this ideal***: +- **Lazy Climber:** A curve that **rises very slowly**. This signals **clonal dominance**; the **sequencer is reading millions of molecules, but they are all the same few sequences**. This is a biological reality, not a technical failure, so these samples should be kept. +- **Unsatisfied Climber:** A curve that **shoots diagonally upward** even at maximum depth, never bending toward horizontal. This indicates **undersampling**; the library is far more diverse than your sequencing budget allowed. Because the "Richness" value here is just a lower bound of an unknown total, these samples often cannot be accurately compared and may need to be discarded. + +The **vertical dashed line** on your plot marks the **Lowest Sequencing Depth in your entire cohort**. To perform a valid statistical comparison, you must mathematically "downsample" every patient to this threshold. 
This forces a difficult trade-off: if your lowest sample has an unusable depth (e.g., < 1,000 reads), keeping it would require you to throw away 90% of the data from your high-quality samples just to make them comparable. In such cases, it is better to **delete the single bad sample rather than degrade the resolution of the entire cohort**. + +**Re-sequencing?** +This visualization also serves as the financial evidence for deciding whether to re-sequence a dropout sample or not: +- If the curve is **rising steeply**, the sample was **undersampled**. **Re-sequencing is possible** and will yield missing data. +- If the **curve has plateaued**, the **sample is naturally low-diversity** (or the library prep failed). **Re-sequencing would be a waste of money**, as deeper reads will not recover new clonotypes. + +**Expected Richness** +Always verify the magnitude of your Y-axis ("Identified Clonotypes"). For a typical bulk human blood sample, you should expect to see between $10^3$ and $10^5$ unique clonotypes. If your plot shows a maximum of only ~90 clones (as seen in some synthetic or filtered datasets), this is a major red flag for real biological data. Such a low count in primary tissue usually implies a library prep failure, an oligoclonal cell line, or an artifact of synthetic simulation. + + +::: {.callout-caution title="Warning"} +Not all diversity metrics are equally sensitive to library size: + +**High Sensitivity (Depth Matters!) $\rightarrow$ Use Rarefied Data** +Metrics that rely on counting the **number of unique clones** are heavily biased by sequencing depth. +* **Richness ($q=0$):** A direct count of unique clonotypes. This is the most sensitive metric. +* **Shannon Entropy ($q=1$):** While it considers frequency, it is still significantly influenced by the "long tail" of rare clones. +* **Clonality:** Because it is calculated as $1 - \text{Normalized Entropy}$ (which relies on Richness), it inherits this bias.
A deep sample will often appear "less clonal" than a shallow one simply due to the math. + +**Low Sensitivity (Depth is Irrelevant) $\rightarrow$ Use Full Data** +Metrics that measure **dominance** or **inequality** focus on the most abundant clones. Because rare clones (singletons) contribute almost zero to these calculations, adding more reads does not significantly change the result. +- **Simpson Index ($q=2$):** Measures the probability of picking two identical clones. It is mathematically weighted toward the top clones, making it extremely robust to depth differences. +- **Gini Coefficient:** Measures the inequality of the distribution (Lorenz curve). It stabilizes quickly and does not require downsampling. +- **Gene Usage (V/J):** Based on percentages (frequency), not raw counts. +::: + +## V-Gene Usage PCA {#sec-pca} + +```{python} +#| output: asis +#| fig-width: 6 +#| fig-height: 5 + +# --- 1. PCA Calculation Function (Unchanged) --- +def analyze_v_usage(df): + """ + Expects df with columns: ['sample', 'TRBV', 'counts'] + """ + # Sum counts if a sample has multiple clones with same V + matrix = df.groupby(['sample', 'TRBV'])['counts'].sum().unstack(fill_value=0) + + # Normalize (frequencies) + freq_matrix = matrix.div(matrix.sum(axis=1), axis=0) + + # Standardize and PCA + scaler = StandardScaler() + scaled_data = scaler.fit_transform(freq_matrix) + + pca = PCA(n_components=2) + components = pca.fit_transform(scaled_data) + + pca_df = pd.DataFrame(data=components, columns=['PC1', 'PC2'], index=freq_matrix.index) + + return freq_matrix, pca_df + +# --- 2. 
Run Analysis & Prepare Data --- +# Calculate PCA +_, pca_df = analyze_v_usage(concat_df) + +# Reset index so 'sample' becomes a column we can merge on +if pca_df.index.name == 'sample' or 'sample' not in pca_df.columns: + pca_df = pca_df.reset_index() + # Ensure the column is named 'sample' (sometimes it resets to 'index') + if 'index' in pca_df.columns and 'sample' not in pca_df.columns: + pca_df = pca_df.rename(columns={'index': 'sample'}) + +# Merge with Metadata +# inner join ensures we only plot samples we have metadata for +plot_data = pd.merge(pca_df, meta, on='sample', how='inner') + +# --- 3. Determine Grouping Columns --- +exclude_cols = ['sample', 'file', 'total_counts', 'filename', 'sample_id'] +group_opts = [ + col for col in meta.columns + if col not in exclude_cols and meta[col].nunique() < 35 +] + +# --- 4. Generate Tabs --- +print("::: {.panel-tabset}") + +for col in group_opts: + print(f"## {col}") + + plt.figure(figsize=(7, 6)) # Slightly larger to accommodate outliers + sns.set_theme(style="whitegrid", context="paper") + + # Force categorical coloring + hue_data = plot_data[col].astype(str) + + # Main Scatter Plot + sns.scatterplot( + data=plot_data, + x='PC1', + y='PC2', + hue=hue_data, + style=hue_data, + s=100, + alpha=0.8, + palette='viridis', + edgecolor='black', + linewidth=0.5 + ) + + # --- Outlier Labeling Logic --- + # Label points that are > 3 Std Devs away from the center + # This helps identify samples with "Weird" V-Gene usage (e.g. 
PCR failure) + outlier_mask = (np.abs(plot_data['PC1']) > 3 * plot_data['PC1'].std()) | \ + (np.abs(plot_data['PC2']) > 3 * plot_data['PC2'].std()) + + outliers = plot_data[outlier_mask] + + # Draw text for outliers + # We use a slight offset so text doesn't sit exactly on the dot + for _, row in outliers.iterrows(): + plt.text( + row['PC1'] + 0.1, + row['PC2'] + 0.1, + str(row['sample']), + fontsize=8, + color='black', + weight='bold' + ) + + plt.title(f'TRBV Gene Usage PCA (Colored by {col})', fontsize=12) + plt.xlabel(f'PC1 ({col})', fontsize=10) + plt.ylabel('PC2', fontsize=10) + + # Legend + plt.legend( + bbox_to_anchor=(1.02, 1), + loc='upper left', + title=col, + fontsize='small', + title_fontsize='small', + markerscale=0.7, + frameon=False + ) + + plt.tight_layout() + plt.show() + + print("\n") + +print(":::") + +``` +**Figure 11. Dimensionality Reduction of TRBV Gene Usage.** Principal Component Analysis (PCA) plot visualizing the variation in T-cell Receptor Beta Variable (TRBV) gene usage frequencies across samples, colored by metadata variables. + +This plot compresses the high-dimensional complexity of T-cell receptor usage (e.g., TRBV1, TRBV2... TRBV30) into a 2D map. Each dot represents a sample, and the **distance between dots represents the similarity of their "V-gene fingerprints."** **If two samples are close together, they are using the V-genes in almost identical proportions**. If they are far apart, their repertoire structures are radically different. + +**Technical Failures 🚩** +One **purpose** of this visualization is to **detect technical artifacts**. When you toggle the tabs to color by **Sequencing Batch for same condition**, you should ideally see a randomly **mixed cloud**. This indicates that the V-gene usage is **consistent** across your **experiments**. 
However, **if you see distinct, non-overlapping islands** corresponding to different batches (e.g., all "Batch A" samples cluster on the left, and "Batch B" samples on the right), **you have a severe Batch Effect**. This usually implies a change in the wet-lab protocol, such as a new set of multiplex primers that amplify genes with different efficiencies. If observed, **you cannot compare samples across these batches without statistical correction** (e.g., ComBat). + +**Biological Interpretation 🦠** +In the **context of cancer treatment** (especially Checkpoint Inhibitors like PD-1 blockade), **we are looking for Remodeling**. We want to **know if the drug "woke up" the immune system and caused it to expand new T-cell armies (clones) to fight the tumor. For example, if "Responders" form a distinct island separate from "Non-Responders", this suggests that having a specific bias toward a certain V-gene, TRBV-19 for example, pre-disposes a patient to fight the cancer effectively. + +An outlier on this plot can have a **specific biological cause**: + +- ***Monoclonal Expansion***. If a patient has a **massive expansion of a single clone** (e.g., a clone using TRBV7-2 takes up 50% of the repertoire), that sample will be pulled violently toward the "TRBV7-2 direction" in PCA space. Finding such an **outlier in a "healthy" control often indicates a PCR jackpotting error or contamination**. + +## Outlier Detection {#sec-outlier-flag} + +This aggregates everything. We calculate Z-scores for every metric. If a sample is $>3$ standard deviations away from the mean, we flag it. +```{python} + +# CDR3 Length Distribution & Distances +def analyze_cdr3_lengths(df): + """ + Expects df with columns: ['sample', 'CDR3b'] + """ + # Calculate lengths + df['cdr3_len'] = df['CDR3b'].apply(len) + + # 1. Basic Stats per sample + stats = df.groupby('sample')['cdr3_len'].agg( + mean_len='mean', + std_len='std', + skew_len=lambda x: skew(x) + ) + + # 2. 
Cohort "Ideal" Distribution + # We aggregate ALL samples to create a reference distribution + cohort_distribution = df['cdr3_len'].values + + # 3. Distance Calculation + # How far is each sample from the cohort average? + distances = [] + for sample in stats.index: + sample_dist = df[df['sample'] == sample]['cdr3_len'].values + + # Earth Mover's Distance (Wasserstein) + # Robust metric for histogram similarity + wd = wasserstein_distance(sample_dist, cohort_distribution) + distances.append(wd) + + stats['cdr3_len_dist_to_cohort'] = distances + return stats + +def run_qc_pipeline(df): + div_df = calculate_diversity_metrics(df) + len_df = analyze_cdr3_lengths(df) + _, pca_df = analyze_v_usage(df) + + # Merge all metrics into one master QC table + qc_table = div_df.join(len_df).join(pca_df) + + # Outlier Detection Logic (Z-Scores) + # We define which columns we care about for outliers + metrics_to_check = ['shannon_entropy', 'inverse_simpson', 'mean_len', 'cdr3_len_dist_to_cohort', 'PC1', 'PC2'] + + flags = [] + + for metric in metrics_to_check: + mu = qc_table[metric].mean() + sigma = qc_table[metric].std() + + # Calculate Z-score + z_score = (qc_table[metric] - mu) / sigma + + # Find outliers (|z| > 3 is standard, maybe use 2.5 for stricter QC) + outliers = qc_table[np.abs(z_score) > 3].index.tolist() + + for out in outliers: + flags.append({ + 'sample_id': out, + 'metric': metric, + 'value': qc_table.loc[out, metric], + 'z_score': z_score[out], + 'flag': f"Extreme {metric}" + }) + + return qc_table, pd.DataFrame(flags) + +qc_table,flags = run_qc_pipeline(concat_df) +print("\nMetrics values for all samples\n") +print(qc_table) +print("\nThe following samples had a metric that deviates from the rest (|z-score|>3):\n") +print(flags) +``` diff --git a/notebooks/template_sample.qmd b/notebooks/template_sample.qmd new file mode 100644 index 0000000..61f942c --- /dev/null +++ b/notebooks/template_sample.qmd @@ -0,0 +1,2772 @@ +--- +title: "TCRtoolkit Sample Report" 
+format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +jupyter: python3 +--- + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name = '' +project_dir = '' +sample_table = '' + +``` + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +x_cols='origin,timepoint' + +# Define files +project_dir=f"{project_dir}" + +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_csv = f"{project_dir}/annotate/concatenated_cdr3.tsv" +v_family_csv= f"{project_dir}/sample/v_family.csv" +j_family_csv= f"{project_dir}/sample/j_family.csv" + +# Define dirs +tcrdist_dir = f"{project_dir}/tcrdist3/" +olga_dir = f"{project_dir}/olga/" +tcrpheno_dir = f"{project_dir}/tcrpheno/" +VDJdb_dir = f"{project_dir}/vdjdb/" +convergence_dir = f"{project_dir}/convergence/" + +``` + +```{python} +#| code-fold: true + +# 1. 
Load Packages +import datetime +import glob +import itertools +import math +import os +import sys +import h5py +import igraph as ig +import matplotlib.pyplot as plt +import matplotlib.ticker as ticker +# import networkx as nx +import numpy as np +import pandas as pd +import plotly.express as px +import plotly.graph_objects as go +import scipy.cluster.hierarchy as sch +import seaborn as sns +from IPython.display import Image +from matplotlib.colors import LinearSegmentedColormap +from scipy.sparse import csr_matrix +from scipy.stats import gaussian_kde +import plotly.io as pio +from scipy import stats +from itertools import combinations +from scipy.stats import mannwhitneyu +from plotly.subplots import make_subplots + +import warnings +warnings.filterwarnings( + "ignore", + category=FutureWarning, + module="plotly" +) + +# 2. Print pipeline parameters + +# print('Project Name: ' + project_name) +# print('Workflow command: ' + workflow_cmd) +# print('Date and time: ' + str(datetime.datetime.now())) + +# 3. 
Loading data +## Reading sample metadata +meta = pd.read_csv(sample_table, sep=',') +meta.drop(columns=['file'], inplace=True) +meta_cols = meta.columns.tolist() + +## Reading combined repertoire statistics +df = pd.read_csv(sample_stats_csv, sep=',') +df = pd.merge(df, meta, on='sample', how='left') +df = df[meta_cols + [c for c in df.columns if c not in meta_cols]] + +## Reading V gene family usage +v_family = pd.read_csv(v_family_csv, sep=',') +v_family = pd.merge(v_family, meta, on='sample', how='left') +v_family = v_family[meta_cols + [c for c in v_family.columns if c not in meta_cols]] +v_family = v_family.sort_values(by=[subject_col]) + +## Reading J gene family usage +j_family = pd.read_csv(j_family_csv, sep=',') +j_family = pd.merge(j_family, meta, on='sample', how='left') +j_family = j_family[meta_cols + [c for c in j_family.columns if c not in meta_cols]] +j_family = j_family.sort_values(by=[subject_col]) + +## Reading concatenated cdr3 file +concat_df = pd.read_csv(concat_csv, sep='\t') +concat_df = concat_df.merge(meta[['sample', 'origin', subject_col, alias_col, timepoint_col, timepoint_order_col]], on='sample', how='left') + +# 4. Define important variables +id_vars = [s.strip() for s in x_cols.split(',') if s.strip()] + +``` + +# Repertoire Composition {#sec-composition} + + +### Clonal Frequency Categories + +This stacked bar plot provides a snapshot of your T-cell repertoire's structure, revealing the balance between dominant, frequent clones and the diverse pool of rare clones. By categorizing each TCR based on its frequency, we can infer the state of the immune system. + +Each bar is a complete TCR repertoire from one sample, broken down into three key functional categories: +- **Highly Frequent (or HyperFrequent):** Clones constituting > 1% of the total productive TCR reads. +- **Frequent (or Large):** Clones with a frequency between 0.1% and 1%. +- **Non-Frequent (or Small/Rare):** Clones with a frequency < 0.1%. 
+Clonal frequency ia calculated as: +$$f_i = \frac{\text{Read count for clone } i}{\text{Total reads for all productive TCR clones}} \times 100\%$$ + +```{python} +#| output: asis +#| echo: false + +import pandas as pd +import plotly.graph_objects as go + +def create_expansion_stacked_barplot_per_individual(df, subject_col, origin_col, timepoint_order_col): + """ + Generates a stacked bar plot grouped by sample, sorted by TIMEPOINT ORDER. + Generates one Quarto tab per individual with dropdowns for origin. + Includes a horizontal threshold line for sequencing depth (3000 counts). + """ + # --- Ensure 'counts' column is numeric --- + try: + df['counts'] = pd.to_numeric(df['counts']) + except ValueError: + print("Error: 'counts' column could not be converted to a numeric type.") + return + + subjects = [s for s in df[subject_col].unique() if pd.notna(s)] + + print("::: {.panel-tabset}\n") + + for subject in sorted(subjects): + print(f"## {subject}\n") + + subject_df = df[df[subject_col] == subject].copy() + origins = [o for o in subject_df[origin_col].unique() if pd.notna(o)] + processed_origins = {} + + for origin in sorted(origins): + origin_df = subject_df[subject_df[origin_col] == origin].copy() + unique_samples = origin_df['sample'].unique() + processed_data = [] + + for sample_name in unique_samples: + sample_df = origin_df[origin_df['sample'] == sample_name].copy() + + total_reads = sample_df['counts'].sum() + if total_reads == 0: + continue + + sample_df['frequency'] = (sample_df['counts'] / total_reads) * 100 + + highly_expanded = sample_df[sample_df['frequency'] > 1] + expanded = sample_df[(sample_df['frequency'] >= 0.1) & (sample_df['frequency'] <= 1)] + non_expanded = sample_df[sample_df['frequency'] < 0.1] + + total_clones = len(sample_df) + if total_clones == 0: + continue + + # --- NEW: Extract timepoint_order for this sample --- + # We assume all rows for a given sample have the same timepoint_order + tp_order = sample_df[timepoint_order_col].iloc[0] 
+ + processed_data.append({ + 'Sample': sample_name, + 'TotalReads': total_reads, + 'timepoint_order': tp_order, # Store it for sorting + 'Highly Frequent': (len(highly_expanded) / total_clones) * 100, + 'Frequent': (len(expanded) / total_clones) * 100, + 'Non-Frequent': (len(non_expanded) / total_clones) * 100 + }) + + if processed_data: + plot_df = pd.DataFrame(processed_data) + + # --- MODIFIED: Sort by timepoint_order instead of TotalReads --- + plot_df = plot_df.sort_values(by='timepoint_order', ascending=True).reset_index(drop=True) + processed_origins[origin] = plot_df + + if not processed_origins: + print(f"*(No valid data for {subject})*\n\n") + continue + + fig = go.Figure() + + categories = ['Highly Frequent', 'Frequent', 'Non-Frequent'] + colors = ['#d62728', '#ff7f0e', '#1f77b4'] + traces_per_origin = 4 + + buttons = [] + initial_shapes = [] + initial_annotations = [] + initial_x_array = [] + initial_title = "" + + for i, (origin, plot_df) in enumerate(processed_origins.items()): + is_first = (i == 0) + + # Stacked Bars + for category, color in zip(categories, colors): + fig.add_trace(go.Bar( + x=plot_df['Sample'], + y=plot_df[category], + name=category, + marker_color=color, + text=[f'{y:.1f}%' for y in plot_df[category]], + textposition='inside', + offsetgroup=0, + visible=is_first, + legendgroup=category, + showlegend=is_first + )) + + # Depth Line + fig.add_trace(go.Scatter( + x=plot_df['Sample'], + y=plot_df['TotalReads'], + name='Total Reads', + mode='lines+markers', + line=dict(color='black', width=3, dash='dot'), + marker=dict(symbol='circle', size=8, color='black'), + yaxis='y2', + visible=is_first, + legendgroup='Total Reads', + showlegend=is_first + )) + + # --- MODIFIED: Horizontal Threshold Line for 3000 counts --- + shapes = [dict( + type="line", xref="paper", x0=0, x1=1, + yref="y2", y0=3000, y1=3000, # Anchored to the secondary Y axis (TotalReads) + line=dict(color="red", width=2, dash="dot") + )] + + # Place the annotation dynamically 
based on the length of the x-axis + annotations = [dict( + x=0.5, xref="paper", + y=3000, yref="y2", + text="Reliability Threshold (3k counts)", + showarrow=True, arrowhead=2, arrowsize=1, arrowwidth=2, arrowcolor="red", + ax=0, ay=-30, # Offsets the text slightly above the line + font=dict(color="red", size=11, weight="bold") + )] + + # Build Dropdown Button + visibility_array = [False] * (len(processed_origins) * traces_per_origin) + visibility_array[i * traces_per_origin : (i + 1) * traces_per_origin] = [True] * traces_per_origin + + button = dict( + label=str(origin), + method="update", + args=[ + {"visible": visibility_array}, + { + "shapes": shapes, + "annotations": annotations, + "title.text": f'Clonal Expansion Status: {subject} ({origin})', + "xaxis.categoryarray": plot_df['Sample'].tolist() + } + ] + ) + buttons.append(button) + + if is_first: + initial_shapes = shapes + initial_annotations = annotations + initial_title = f'Clonal Expansion Status: {subject} ({origin})' + initial_x_array = plot_df['Sample'].tolist() + + # Layout Setup + layout_dict = dict( + barmode='stack', + title=initial_title, + xaxis=dict( + title='Sample', tickangle=-90, tickfont=dict(size=10), + categoryorder='array', categoryarray=initial_x_array + ), + yaxis=dict(title='Percentage of Clones (%)', range=[0, 100]), + yaxis2=dict( + title='Total Counts (Depth)', overlaying='y', + side='right', showgrid=False, type='log' + ), + legend=dict(x=1.15, y=1), + margin=dict(r=150, t=100), + font=dict(size=12), + plot_bgcolor='white', paper_bgcolor='white', + shapes=initial_shapes, + annotations=initial_annotations + ) + + if len(processed_origins) > 1: + layout_dict['updatemenus'] = [dict( + active=0, + buttons=buttons, + x=0.0, + xanchor="left", + y=1.15, + yanchor="top", + pad={"r": 10, "t": 10} + )] + layout_dict['margin']['t'] = 120 + + fig.update_layout(**layout_dict) + fig.update_traces(textfont_size=11) + fig.show() + + print("\n\n") + + print(":::\n") + +# --- Generate Plot --- +# 
Make sure your timepoint order variable is passed here
+create_expansion_stacked_barplot_per_individual(
+    concat_df,
+    subject_col=subject_col,
+    origin_col='origin',
+    timepoint_order_col=timepoint_order_col
+)
+
+```
+
+**Figure 1. Clonal Expansion Categories Relative to Sequencing Depth.** This stacked bar chart illustrates the percentage of clones categorized as non-Frequent, Frequent, or highly Frequent for each sample. The overlaid black dotted line tracks the total sequencing read count on a secondary axis, highlighting the influence of sampling depth on the detection of Frequent clones. A horizontal red threshold line ($>3,000$ counts) demarcates samples with sufficient data for reliable clonality assessment from low-depth libraries where expansion metrics may be unstable.
+
+**Understanding Each Category**
+
+**Highly Frequent Clones (> 1%)**
+
+- What they are: These are the dominant "fighters" of the immune system. A single clone making up more than 1% of the entire repertoire is exceptionally frequent and almost certainly represents a T-cell that has undergone massive proliferation in response to a specific antigen.
+
+- Biological Insight: A large "Highly Frequent" portion indicates a highly focused, or oligoclonal, immune response. This is characteristic of:
+    - An acute viral infection (e.g., flu, COVID-19).
+    - A strong response against cancer antigens (especially with immunotherapy).
+    - Certain autoimmune diseases where a specific self-antigen is targeted.
+
+**Frequent Clones (0.1% - 1%)**
+
+- What they are: These are moderately frequent clones. They are participating in an immune response but are not as dominant as the hyperfrequent clones. They can represent secondary responders or clones from a previous infection that have settled into a memory state.
+
+- Biological Insight: This category provides a picture of the active, but not overwhelming, immune activity. 
A significant portion of the repertoire in this category suggests a broad, active response without being dominated by a single specificity.
+
+**Non-Frequent Clones (< 0.1%)**
+
+- What they are: This category represents the vast diversity of the TCR repertoire. It includes the massive pool of naïve T-cells waiting for an antigen and the wide variety of memory cells from past immune encounters.
+
+- Biological Insight: A large "Non-Frequent" portion indicates a highly diverse, or polyclonal, repertoire. This is the hallmark of a healthy, resting immune system with the potential to respond to a vast array of future threats. It is your immunological "reservoir."
+
+::: {.callout-warning title="Be mindful about Sequencing Depth"}
+The classification of "Non-Frequent" clones (frequency < 0.1%) is mathematically impossible in samples with fewer than 1,000 total counts, as the minimum detectable frequency (a single clone) exceeds the 0.1% threshold (1/900 = 0.11%). Consequently, low-depth samples will artificially appear entirely "Frequent" or "Highly Frequent." For robust interpretation of the "Non-Frequent" category, a minimum depth of 3,000 counts is required; any sample below 1,000 counts should be considered technically insufficient for this analysis.
+:::
+
+**Comparing timepoint samples within Patients**
+The real power of this plot comes from comparing samples, such as before (Pre) and after (Post) a treatment or vaccination.
+
+Look for a shift towards "Highly Frequent": If the red "Highly Frequent" bar grows significantly from the "Pre" to the "Post" sample, it strongly suggests a successful immune response was mounted. The specific T-cells recognizing the target antigen (e.g., from a vaccine or tumor) have proliferated dramatically. 
+ +Look for a return to diversity: If a sample starts with a large "Highly Frequent" portion (e.g., during an infection) and then shifts towards a larger "Non-Frequent" portion at a later timepoint, it can signify the contraction phase of an immune response. The dominant clones recede after clearing the antigen, and the repertoire returns to a more diverse, homeostatic state. + +By analyzing the changing proportions of these categories, you can build a powerful narrative about how the immune system is responding to disease, vaccination, or therapy. + + +## Clonal Expansion {#sec-expansion} + +### Clone size Distribution + +This table is useful because it reveals the identities of the dominant clones driving the immune response in each sample. By listing the most highly expanded T-cell receptors, **you can pinpoint the specific "effector" cells that have Frequent most, likely in response to a key antigen from a tumor or pathogen.** Having their precise sequences allows you to track these key clones over time, compare them across different patients to find shared public clones, and provides the necessary information for downstream functional studies to determine their antigen specificity. + +```{python} +#| output: asis +#| echo: false + +import plotly.graph_objects as go + +def create_top_clones_tables_by_patient(df, n_clones=15): + """ + Generates Quarto tabs per patient, containing an interactive Plotly table. + The dropdown menu navigates between timepoints for that specific patient. + """ + + # Start Quarto tabset + print("::: {.panel-tabset}\n") + + # Ensure we iterate through patients logically + unique_patients = sorted(df[subject_col].dropna().unique()) + + for pat in unique_patients: + print(f"## {pat}\n") + + # Isolate data for this specific patient + pat_df = df[df[subject_col] == pat].copy() + + # We assume timepoint_order exists to keep the dropdown chronological. + # If not, it falls back to alphabetical sorting of the timepoint strings. 
+ if timepoint_order_col in pat_df.columns: + unique_timepoints = pat_df.sort_values(timepoint_order_col)[timepoint_col].unique() + else: + unique_timepoints = sorted(pat_df[timepoint_col].dropna().unique()) + + fig = go.Figure() + buttons = [] + + # Build a table trace for each timepoint + for i, tp in enumerate(unique_timepoints): + tp_df = pat_df[pat_df[timepoint_col] == tp].copy() + tp_df = tp_df[tp_df['counts'] > 0] + + if tp_df.empty: + continue + + # Grab the top clones + top_n_clones = tp_df.nlargest(n_clones, 'counts') + + # Safely extract columns (avoids KeyErrors if TRBV/TRBJ are missing in some runs) + cols_to_show = ['CDR3b', 'TRBV', 'TRBJ', 'counts'] + available_cols = [c for c in cols_to_show if c in top_n_clones.columns] + display_df = top_n_clones[available_cols].copy() + + # Clean up headers for display + display_df.columns = [c.capitalize() if c == 'counts' else c for c in display_df.columns] + + fig.add_trace( + go.Table( + header=dict(values=list(display_df.columns), + fill_color='paleturquoise', + align='left', + font=dict(size=12, color='black')), + cells=dict(values=[display_df[col] for col in display_df.columns], + fill_color='lavender', + align='left', + font=dict(size=11, color='black')), + name=str(tp), + visible=(i == 0) # Only render the first timepoint initially + ) + ) + + # If the patient had no valid data > 0 across any timepoint, skip plotting + if not fig.data: + print(f"*(No clonal data available for {pat})*\n\n") + continue + + # Build dropdown logic + for trace in fig.data: + tp_name = trace.name + visibility_mask = [tr.name == tp_name for tr in fig.data] + + button = dict( + label=tp_name, # Dropdown shows Timepoint + method="restyle", + args=[ + {"visible": visibility_mask}, + {"title.text": f"Top {n_clones} Expanded Clones: {pat}"} + ] + ) + buttons.append(button) + + first_tp = fig.data[0].name + + fig.update_layout( + updatemenus=[ + dict( + active=0, + buttons=buttons, + direction="down", + pad={"r": 10, "t": 10}, + 
showactive=True, + x=0.0, # Aligned to the left for cleaner layout + xanchor="left", + y=1.15, + yanchor="top" + ) + ], + title_text=f"Top {n_clones} Expanded Clones: {pat}", + font=dict(size=12), + margin=dict(t=100, b=20, l=10, r=10), # Added top margin so dropdown doesn't clip title + height=400 # Constrain height so the table doesn't consume the entire screen + ) + + fig.show() + print("\n\n") + + # End Quarto tabset + print(":::\n") + +# --- Run --- +create_top_clones_tables_by_patient(concat_df, n_clones=15) + +``` +**Table 1. Top 15 most expanded TCR clones** TCRs are rearranged decreasingly based on the "Counts" column. + + +**Visualizing the clone size distribution provides an intuitive snapshot of the repertoire's entire structure.** This plot reveals the balance between the vast number of rare clones (the "tail") and the few highly expanded clones (the "head") that often drive an immune response. A distribution heavily skewed to the left with a short tail signifies a diverse, polyclonal repertoire with many different rare T-cells. In contrast, a distribution with a long, heavy tail extending to the right is a clear visual indicator of an oligoclonal repertoire, dominated by large, expanded clones. This allows for a quick, qualitative assessment of a sample's clonality and diversity, complementing more quantitative summary metrics. +**By comparing all distributions, you can visually assess the overall impact of your experimental condition across the entire patient cohort.** + +```{python} +#| output: asis +#| echo: false + +import pandas as pd +import numpy as np +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from scipy.stats import mannwhitneyu + +def create_violin_with_heatmap_inset(df, meta_df, origin_col='origin'): + """ + Generates a composite plot (Violin + Heatmap) for each patient. + - Violin: Distribution of clone counts (log10). + - Heatmap: Pairwise Mann-Whitney U test significance. 
+ - Grouped by Patient, Sorted by Timepoint, Labeled by Alias. + - Dropdown menu to toggle between Origins if multiple exist. + """ + + # --- 1. Data Prep --- + try: + df['counts'] = pd.to_numeric(df['counts']) + except ValueError: + print("Error: 'counts' column could not be converted to a numeric type.") + return + + # Filter for valid counts + plot_df = df[df['counts'] > 0].copy() + if plot_df.empty: return + + plot_df['log10_counts'] = np.log10(plot_df['counts']) + + # --- 2. Merge with Metadata --- + # Assuming df already has the metadata merged based on your previous code structure + merged_df = plot_df + merged_df = merged_df.dropna(subset=[subject_col]) + + # Get list of unique patients + patients = sorted(merged_df[subject_col].unique()) + + # --- 3. Generate Tabs --- + print("::: {.panel-tabset}\n") + + for pat in patients: + print(f"## {pat}\n") + + pat_df = merged_df[merged_df[subject_col] == pat].copy() + + # Identify unique origins for this patient + origins = sorted([o for o in pat_df[origin_col].unique() if pd.notna(o)]) + + # Pre-process data for each origin to ensure we only build valid dropdown options + processed_origins = {} + for origin in origins: + origin_df = pat_df[pat_df[origin_col] == origin].copy() + + sample_order_df = origin_df[[alias_col, timepoint_order_col]].drop_duplicates().sort_values(timepoint_order_col) + ordered_aliases = sample_order_df[alias_col].tolist() + n_samples = len(ordered_aliases) + + if n_samples < 1: continue + + # Calculate Pairwise Statistics (Matrix) + p_matrix = np.full((n_samples, n_samples), np.nan) + text_matrix = np.full((n_samples, n_samples), "", dtype=object) + + for i, alias1 in enumerate(ordered_aliases): + for j, alias2 in enumerate(ordered_aliases): + if i <= j: continue # Lower triangle only + + g1 = origin_df[origin_df[alias_col] == alias1]['log10_counts'] + g2 = origin_df[origin_df[alias_col] == alias2]['log10_counts'] + + if len(g1) < 2 or len(g2) < 2: continue + + try: + stat, p = 
mannwhitneyu(g1, g2, alternative='two-sided') + except ValueError: continue + + log_p = -np.log10(p + 1e-300) + p_matrix[i, j] = log_p + + if p < 0.0001: star = "****" + elif p < 0.001: star = "***" + elif p < 0.01: star = "**" + elif p < 0.05: star = "*" + else: star = "" + + text_matrix[i, j] = star + + processed_origins[origin] = { + 'df': origin_df, + 'aliases': ordered_aliases, + 'p_matrix': p_matrix, + 'text_matrix': text_matrix + } + + if not processed_origins: + print(f"*(No valid data for {pat})*\n\n") + continue + + # --- 4. Create Composite Figure --- + fig = make_subplots( + rows=1, cols=2, + column_widths=[0.6, 0.4], + horizontal_spacing=0.15, + specs=[[{"type": "xy"}, {"type": "heatmap"}]] + ) + + buttons = [] + traces_per_origin = 2 # 1 Violin + 1 Heatmap + + initial_title = "" + initial_x_array = [] + + # Iterate through valid origins and build traces + for i, (origin, data) in enumerate(processed_origins.items()): + is_first = (i == 0) + + # Trace 1: Violin Plot (Left) + fig.add_trace( + go.Violin( + x=data['df'][alias_col], + y=data['df']['log10_counts'], + box_visible=True, + meanline_visible=True, + points=False, + line_color='#1f77b4', + fillcolor='#1f77b4', + opacity=0.6, + name="Distribution", + width=0.7, + spanmode='hard', + showlegend=False, + visible=is_first + ), + row=1, col=1 + ) + + # Trace 2: Significance Heatmap (Right) + fig.add_trace( + go.Heatmap( + z=data['p_matrix'], + x=data['aliases'], + y=data['aliases'], + text=data['text_matrix'], + texttemplate="%{text}", + textfont={"size": 10}, + colorscale="Reds", + zmin=0, zmax=5, + showscale=True, + colorbar=dict( + title="-log10(p)", + titleside="top", + thickness=10, + len=0.5, + x=1.05, + y=0.5 + ), + hovertemplate="%{y} vs %{x}
Sig: %{text}
-log10(p): %{z:.2f}", + visible=is_first + ), + row=1, col=2 + ) + + # --- Dropdown Logic for this Origin --- + visibility_array = [False] * (len(processed_origins) * traces_per_origin) + visibility_array[i * traces_per_origin : (i + 1) * traces_per_origin] = [True] * traces_per_origin + + button = dict( + label=str(origin), + method="update", + args=[ + {"visible": visibility_array}, + { + "title.text": f"Clone Size Distribution: {pat} ({origin})", + "xaxis.categoryarray": data['aliases'], # Update Violin X + "xaxis2.categoryarray": data['aliases'], # Update Heatmap X + "yaxis2.categoryarray": data['aliases'] # Update Heatmap Y + } + ] + ) + buttons.append(button) + + if is_first: + initial_title = f"Clone Size Distribution: {pat} ({origin})" + initial_x_array = data['aliases'] + + # --- 5. Final Layout Setup --- + layout_dict = dict( + title_text=initial_title, + width=1000, + height=600, + plot_bgcolor='rgba(0,0,0,0)', + margin=dict(t=60, b=100, l=50, r=50) + ) + + # Add updatemenus only if there are multiple origins + if len(processed_origins) > 1: + layout_dict['updatemenus'] = [dict( + active=0, + buttons=buttons, + x=0.0, + xanchor="left", + y=1.15, + yanchor="top", + pad={"r": 10, "t": 10} + )] + layout_dict['margin']['t'] = 120 # Give room for the dropdown + + fig.update_layout(**layout_dict) + + # Apply initial Axis Formatting based on the first origin + # Subplot 1 (Violin) -> xaxis, yaxis + fig.update_yaxes(title_text="log10(Clone Count)", row=1, col=1, showgrid=True, gridcolor='lightgrey') + fig.update_xaxes( + title_text="Sample (Alias)", row=1, col=1, tickangle=-45, + categoryorder='array', categoryarray=initial_x_array + ) + + # Subplot 2 (Heatmap) -> xaxis2, yaxis2 + fig.update_xaxes( + side="bottom", showgrid=False, row=1, col=2, tickfont=dict(size=9), tickangle=-45, + categoryorder='array', categoryarray=initial_x_array + ) + fig.update_yaxes( + side="left", showgrid=False, row=1, col=2, tickfont=dict(size=9), + categoryorder='array', 
categoryarray=initial_x_array
+        )
+
+        fig.show()
+        print("\n")
+
+    print(":::\n")
+
+# --- Run ---
+# Example of running it (make sure your global columns are defined)
+create_violin_with_heatmap_inset(concat_df, meta, origin_col='origin')
+
+```
+
+**Figure 2. Clone Density** Violin plots showing the number of sequencing counts (y-axis; log-scale) across all samples (x-axis). A value around 0 on the y-axis represents clones that are non-expanded.
+
+## Gene Family usage {#sec-gene-family-usage}
+### V Gene Family usage
+
+**V (Variable), D (Diversity), and J (Joining) genes are the genetic building blocks a T-cell uses to construct a unique T-cell receptor (TCR) through a "mix-and-match" process called V(D)J recombination**. To create the immense diversity needed to recognize countless potential threats, each developing T-cell randomly selects and permanently stitches together one V, one D (for the beta chain only), and one J gene segment from a large genetic library. This process creates a single, unique TCR gene for that cell, with the most critical, hypervariable region known as the CDR3 being formed at the junction where these segments meet, which is the part that directly binds to antigens. 🧬
+
+The V gene family usage of the TCRs in each sample is shown in the plots below. The x-axis shows the timepoint collected for each individual, and the y-axis shows the proportion of TCRs that use each V gene family.
+The V gene usage proportion, $V_k$, is calculated via:
+
+$$
+V_k = \frac{N_{k}}{T} \quad\text{,}\quad T = \sum\limits_{i=1}^{N} N_i
+$$
+
+where $N_{k}$ is the number of TCRs that use the $k$ th V gene, and $T$ is the total number of TCRs in the sample.
+
+```{python}
+
+def prepare_data(df, gene_type):
+    """
+    Transforms the raw data frame into a long format with proportions.
+    Generic for any gene type ('v', 'j', etc.).
+ """ + gene_col = f'{gene_type}_gene' + total_col = f'total_{gene_type}_genes' + geneName = 'TRBV' if gene_type == 'v' else 'TRBJ' + + TRB_gene_cols=df.columns.str.startswith(geneName) + df_long = pd.melt(df, id_vars=[subject_col]+id_vars, + value_vars=df.columns[TRB_gene_cols].tolist(), var_name=gene_col, value_name='count') + + # Calculate proportions + df_long['proportion'] = df_long.groupby([subject_col, timepoint_col])['count'].transform(lambda x: x / x.sum() if x.sum() > 0 else 0) + + # Calculate total counts for hover text + total_counts = df_long.groupby([subject_col, timepoint_col])['count'].sum().reset_index() \ + .rename(columns={'count': total_col}) + + # Merge total counts back + df_long = pd.merge(df_long, total_counts, on=[subject_col, timepoint_col]) + return df_long + +def create_vj_navigation_plot(df, patient_list, gene_type): + """ + Creates an interactive Plotly figure with a dropdown menu. + Generic for any gene type. + """ + gene_col = f'{gene_type}_gene' + total_col = f'total_{gene_type}_genes' + + if df.empty or not patient_list: + return go.Figure().update_layout(title_text="No data to display.") + + fig = go.Figure() + all_genes = sorted(df[gene_col].unique()) + colors = px.colors.qualitative.Plotly + color_map = {gene: colors[i % len(colors)] for i, gene in enumerate(all_genes)} + + # Loop to add traces for each patient and gene + for i, patient in enumerate(patient_list): + patient_df = df[df[subject_col] == patient] + for gene in all_genes: + plot_df = patient_df[patient_df[gene_col] == gene] + if plot_df.empty: + timepoints = sorted(patient_df[timepoint_col].unique()) + # Ensure we get the correct total counts for each timepoint + totals = [patient_df[patient_df.timepoint == tp][total_col].iloc[0] if not patient_df[patient_df.timepoint == tp].empty else 0 for tp in timepoints] + plot_df = pd.DataFrame({ + timepoint_col: timepoints, + 'proportion': [0] * len(timepoints), + total_col: totals + }) + + fig.add_trace(go.Bar( + 
x=plot_df[timepoint_col], y=plot_df['proportion'], name=gene, + marker_color=color_map.get(gene), visible=(i == 0), + text=plot_df[total_col] if gene == all_genes[0] else None, + textposition='outside', hoverinfo='x+y+name', + legendgroup=gene, + showlegend=(i == 0) + )) + + # Create dropdown buttons + buttons = [] + num_genes = len(all_genes) + for i, patient in enumerate(patient_list): + visibility = [False] * (len(patient_list) * num_genes) + start_index = i * num_genes + end_index = (i + 1) * num_genes + visibility[start_index:end_index] = [True] * num_genes + button = dict( + method="restyle", + args=[{"visible": visibility}, {"title.text": f"Patient: {patient}"}], + label=patient + ) + buttons.append(button) + + fig.update_layout( + template="simple_white", + title_text=f"Patient: {patient_list[0]}", + xaxis_title="Timepoint", + yaxis_title="Proportion", + yaxis=dict(range=[0, 1.15]), + barmode="stack", + legend=dict( + orientation="v", x=1.02, xanchor="left", + y=0.5, yanchor="middle", + title=f"{gene_type.upper()}-Gene" # Dynamic legend title + ), + updatemenus=[ + dict( + active=0, buttons=buttons, direction="down", + pad={"r": 10, "t": 10}, showactive=True, + x=1.0, xanchor="right", y=1.19, yanchor="top" + ) + ] + ) + return fig + +``` + +```{python} +# Prepare the V-gene data +v_family_long = prepare_data(v_family, 'v') + +# Split into groups +v_timepoint_counts = v_family_long.groupby(subject_col)[timepoint_col].nunique() +v_single_tp_patients = v_timepoint_counts[v_timepoint_counts == 1].index.tolist() +v_multi_tp_patients = v_timepoint_counts[v_timepoint_counts > 1].index.tolist() + +v_df_single = v_family_long[v_family_long[subject_col].isin(v_single_tp_patients)] +v_df_multi = v_family_long[v_family_long[subject_col].isin(v_multi_tp_patients)] +``` + +::: {.panel-tabset} +## Multiple Timepoints +```{python} +# Display plot for the MULTIPLE timepoints tab. 
+
+if not v_df_multi.empty:
+    fig_v_multi = create_vj_navigation_plot(v_df_multi, v_multi_tp_patients, 'v')
+    fig_v_multi.show()
+else:
+    print("No patients with multiple timepoints found for V-genes.")
+
+```
+## Single Timepoint
+```{python}
+# Display plot for the SINGLE timepoint tab.
+if not v_df_single.empty:
+    fig_single = create_vj_navigation_plot(v_df_single, v_single_tp_patients, 'v')
+    fig_single.show()
+else:
+    print("No patients with a single timepoint found.")
+```
+:::
+**Figure 6. TRBV gene usage** Stacked barplot showing gene proportion (y-axis) at a given sample (x-axis). Samples containing multiple timepoints are shown side-by-side to facilitate comparisons.
+
+
+**Use this visualization to detect biased gene usage, which is a strong indicator of a significant, antigen-driven immune response.**
+When T-cells responding to a specific antigen proliferate, the V and J genes they are built from will naturally increase in proportion to all other genes in the repertoire. Comparing the usage profiles between two timepoints is the most powerful application of this chart.
+Look for significant changes: A bar that dramatically increases in height from one timepoint to the next indicates that an immune response involving T-cells from that specific gene family has been initiated or has expanded.
+*Example scenario:*
+If you observe that the usage of the TRBV7-9 gene increases from 4% in a pre-treatment sample to 35% in a post-treatment sample, this provides strong evidence of a treatment-induced response driven by T-cells that utilize the TRBV7-9 gene segment.
+
+### J Gene Family usage
+
+Likewise, we can visualize the TRB J gene usage. 
+
+```{python}
+# Prepare J-gene data using the reusable function
+j_family_long = prepare_data(j_family, 'j')
+
+# Split into groups
+j_timepoint_counts = j_family_long.groupby(subject_col)[timepoint_col].nunique()
+j_single_tp_patients = j_timepoint_counts[j_timepoint_counts == 1].index.tolist()
+j_multi_tp_patients = j_timepoint_counts[j_timepoint_counts > 1].index.tolist()
+
+j_df_single = j_family_long[j_family_long[subject_col].isin(j_single_tp_patients)]
+j_df_multi = j_family_long[j_family_long[subject_col].isin(j_multi_tp_patients)]
+```
+
+::: {.panel-tabset}
+## Multiple Timepoints
+```{python}
+# Display plot for the MULTIPLE timepoints tab.
+if not j_df_multi.empty:
+    fig_j_multi = create_vj_navigation_plot(j_df_multi, j_multi_tp_patients, 'j')
+    fig_j_multi.show()
+else:
+    print("No patients with multiple timepoints found for J-genes.")
+
+```
+## Single Timepoint
+```{python}
+# Display plot for the SINGLE timepoint tab.
+if not j_df_single.empty:
+    fig_single = create_vj_navigation_plot(j_df_single, j_single_tp_patients, 'j')
+    fig_single.show()
+else:
+    print("No patients with a single timepoint found.")
+```
+:::
+**Figure 7. TRBJ gene usage** Stacked barplot showing gene proportion (y-axis) at a given sample (x-axis). Samples containing multiple timepoints are shown side-by-side to facilitate comparisons.
+ +# TCR Sequence Generation {#sec-cdr3_generation} + +## TCR Convergence {#sec-convergence} + +**What is TCR Convergence?** +TCR convergence describes the **phenomenon where different T-cell precursors**, through the random V(D)J recombination process, **independently generate T-cell receptors that have identical amino acid sequences but are encoded by different nucleotide (DNA) sequences.** + +**Why is it Important to Look At?** +It's crucial to analyze TCR convergence because while a single expanded clone might arise by chance, **finding multiple, genetically distinct T-cell lineages that have independently evolved to recognize the same target** (a common antigen from a virus or tumor, for instance) **is a strong indicator of antigen-driven selection**. This "convergent evolution" highlights the most effective and functionally important TCRs in an immune response, making them prime candidates for developing targeted immunotherapies, vaccines, and diagnostic biomarkers. + +**Sequencing Depth Bias** +TCR convergence (finding multiple nucleotide sequences that encode the same amino acid CDR3) is heavily influenced by the sequencing depth. As you sequence deeper, capturing more unique nucleotide sequences, the probability of randomly finding a second sequence that codes for the same amino acid increases mathematically, regardless of biology. + +You expect to see a strong positive correlation. Samples with higher sequencing depth (X-axis) will almost always have higher raw convergence counts (Y-axis). That being said, you cannot compare Sample A (100k reads) to Sample B (10k reads) using raw counts. Sample A will always look "better" simply because we looked harder. + +```{python} +#| output: asis +#| echo: false + + +# ============================================================================== +# 1. 
LOAD AND PROCESS CONVERGENCE FILES (METHOD 2: O/E) +# ============================================================================== + +files = glob.glob(os.path.join(convergence_dir, '*_tcr_convergence.tsv')) + +conv_data = [] +for f in files: + filename = os.path.basename(f) + sample_id = filename.replace('_tcr_convergence.tsv', '') + if sample_id.endswith('_pseudobulk'): + sample_id = sample_id[:-11] + if sample_id.endswith('_airr'): + sample_id = sample_id[:-5] + + try: + temp_df = pd.read_csv(f, sep='\t') + + # Metrics + M = temp_df['convergence'].sum() # Total NT + R = len(temp_df) # Unique AA + raw_events = M - R # Raw Convergence + + # Null Model (O/E) + num_convergent_aa = np.sum(temp_df['convergence'] >= 2) + observed_frac = num_convergent_aa / R if R > 0 else 0 + + if R > 1 and M > 0: + w = 1.0 / R + prob_0 = (1.0 - w) ** M + prob_1 = M * w * ((1.0 - w) ** (M - 1)) + expected_frac = 1.0 - prob_0 - prob_1 + else: + expected_frac = np.nan + + oe_ratio = observed_frac / expected_frac if (expected_frac > 0) else np.nan + + conv_data.append({ + 'sample': sample_id, + 'M_total_nt': M, + 'raw_events': raw_events, + 'convergence_OE_ratio': oe_ratio + }) + + except Exception as e: + print(f"Error reading {sample_id}: {e}") + +# Merge with Metadata +conv_df = pd.DataFrame(conv_data) +merged_df = pd.merge(conv_df, meta, on='sample', how='inner') + +# ============================================================================== +# 2. 
FILTER & CONVERT COLUMNS +# ============================================================================== + +exclude_cols = ['sample', 'file', 'M_total_nt', 'raw_events', 'convergence_OE_ratio'] +group_opts = [] + +for col in merged_df.columns: + if col in exclude_cols: + continue + # Check unique count + if merged_df[col].nunique() < 30: + # Convert to string to ensure discrete colors + merged_df[col] = merged_df[col].astype(str) + group_opts.append(col) + +group_opts = sorted(group_opts) +hover_cols = list(merged_df.columns) + +# ============================================================================== +# PLOT SET 1: RAW CONVERGENCE VS DEPTH (QC) +# ============================================================================== + +# Calculate Global Correlation Stats (Depth vs Events) once +# We drop NaNs to ensure accurate stats +stat_df = merged_df[['M_total_nt', 'raw_events']].dropna() +slope, intercept, r_value, p_value, std_err = stats.linregress( + stat_df['M_total_nt'], stat_df['raw_events'] +) + +# Create line coordinates for the regression line +line_x = np.array([stat_df['M_total_nt'].min(), stat_df['M_total_nt'].max()]) +line_y = slope * line_x + intercept + +print("::: {.panel-tabset}\n") + +for i, col in enumerate(group_opts): + print(f"## {col}\n") + + use_symbol = col if merged_df[col].nunique() <= 30 else None + + # Base Scatter Plot + fig = px.scatter( + merged_df, + x='M_total_nt', + y='raw_events', + color=col, + symbol=use_symbol, + hover_name='sample', + hover_data=hover_cols, + title=f"Raw Convergence vs. 
Depth (Colored by {col})", + labels={'M_total_nt': 'Total NT Clonotypes (Depth)', + 'raw_events': 'Raw Convergence Events'}, + template="plotly_white", + opacity=0.8, + category_orders={col: sorted(merged_df[col].unique())} + ) + + # Add Global Regression Line + fig.add_trace(go.Scatter( + x=line_x, + y=line_y, + mode='lines', + name='Global Trend', + line=dict(color='red', width=2, dash='dash') + )) + + # Add Annotation with R and p value + # Formats p-value: if extremely small, use scientific notation + p_text = "< 0.001" if p_value < 0.001 else f"{p_value:.4f}" + + fig.add_annotation( + x=0.05, + y=0.95, + xref="paper", + yref="paper", + text=f"Pearson R: {r_value:.3f}
p-value: {p_text}", + showarrow=False, + bgcolor="rgba(255, 255, 255, 0.8)", + bordercolor="black", + borderwidth=1 + ) + + fig.update_traces(marker=dict(size=10, line=dict(width=1, color='DarkSlateGrey'))) + fig.show() + print("\n") + +print(":::\n") + +``` +**Figure 4. Sequencing Depth Bias in TCR Convergence**. This scatter plot correlates the total number of nucleotide clonotypes (sequencing depth) with the raw count of convergent events (multiple nucleotide sequences encoding the same amino acid CDR3). A strong positive correlation confirms that observed convergence is highly dependent on sampling effort, as deeper sequencing increases the probability of detecting random collisions. + +**Solution 1: Look at the proportion of convergent TCRs** +```{python} +#| output: asis +#| echo: false + +import plotly.express as px +import plotly.graph_objects as go +from scipy.stats import mannwhitneyu +from itertools import combinations +import pandas as pd + +# --- 0. CONFIGURATION & SETUP --- +origin_col = 'origin' # <--- Make sure this matches your dataframe's column name + +# Create a mapping for timepoint order using the 'meta' dataframe +if 'meta' in locals() and timepoint_col in meta.columns and timepoint_order_col in meta.columns: + time_order_map = meta.set_index(timepoint_col)[timepoint_order_col].to_dict() +else: + if timepoint_order_col in df.columns: + time_order_map = df.set_index(timepoint_col)[timepoint_order_col].to_dict() + else: + time_order_map = {} + +# 1. Define General Exclusions +exclude_cols = ['sample', 'file', 'clonality', 'counts', 'total_counts', + 'num_clones', 'num_TCRs', 'simpson_index', 'simpson_index_corrected', + 'num_prod', 'num_nonprod', 'pct_prod', 'pct_nonprod', + 'productive_cdr3_avg_len', 'num_convergent','ratio_convergent'] + +# 2. 
Define Groups for "All Samples" Tab (Exclude timepoint/order) +exclude_from_all_samples = [timepoint_col, timepoint_order_col, 'protocol_day', alias_col] + +group_opts_all = [ + col for col in df.columns + if col not in exclude_cols + and col not in exclude_from_all_samples + and df[col].nunique() < 35 +] + +# 3. Define Groups for "By Patient" Tab (Only timepoint) +group_opts_patient = [timepoint_col] if timepoint_col in df.columns else [] + +unique_patients = sorted(df[subject_col].dropna().unique().tolist()) + + +# --- 1. DEFINE BASE PLOTTING FUNCTION --- +def create_convergent_plot(data, group_col, custom_order=None): + """ + Returns a Plotly figure object (instead of showing it directly). + """ + if data.empty or data[group_col].dropna().empty: + return None + + if custom_order: + unique_cats = [x for x in custom_order if x in data[group_col].unique()] + else: + unique_cats = sorted(data[group_col].dropna().unique().tolist()) + + # Base Plot + fig = px.box( + data, + x=group_col, + y='ratio_convergent', + color=group_col, + points='all', + hover_data=['sample'], + category_orders={group_col: unique_cats}, + template="simple_white", + title=f"Convergent TCRs by {group_col}" + ) + + # BIGGER DOTS (size=10) + fig.update_traces(width=0.5, marker=dict(size=10, opacity=0.7)) + + for trace in fig.data: + if isinstance(trace, go.Box): + trace.pointpos = 0 + trace.jitter = 0.2 + + # Stats + top_margin = 60 + if len(unique_cats) >= 2: + pairs = list(combinations(unique_cats, 2)) + y_max = data['ratio_convergent'].max() + y_range = y_max - data['ratio_convergent'].min() + if y_range == 0: y_range = 1 + step_size = y_range * 0.13 + stack_counter = 0 + + for t1, t2 in pairs: + group1 = data[data[group_col] == t1]['ratio_convergent'].dropna() + group2 = data[data[group_col] == t2]['ratio_convergent'].dropna() + + if len(group1) < 2 or len(group2) < 2: continue + + try: + stat, p_value = mannwhitneyu(group1, group2, alternative='two-sided') + except ValueError: continue + + 
if p_value >= 0.05: continue + + if p_value < 0.001: symbol = '***' + elif p_value < 0.01: symbol = '**' + elif p_value < 0.05: symbol = '*' + + y_bracket = y_max + (y_range * 0.15) + (stack_counter * step_size) + y_text = y_bracket + (y_range * 0.02) + + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t2, y1=y_bracket, + line=dict(color="black", width=1.5)) + + tick_len = y_range * 0.02 + fig.add_shape(type="line", xref="x", yref="y", + x0=t1, y0=y_bracket, x1=t1, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + fig.add_shape(type="line", xref="x", yref="y", + x0=t2, y0=y_bracket, x1=t2, y1=y_bracket - tick_len, + line=dict(color="black", width=1.5)) + + try: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + except: x_center = (unique_cats.index(t1) + unique_cats.index(t2)) / 2 + + fig.add_annotation( + x=x_center, y=y_text, text=f"{symbol}
p={p_value:.3f}", + showarrow=False, font=dict(size=10, color="black") + ) + stack_counter += 1 + top_margin = 60 + (stack_counter * 40) + + fig.update_layout( + xaxis_title=group_col, + yaxis_title="Convergent TCRs Ratio", + xaxis=dict(tickfont=dict(size=15), categoryorder='array', categoryarray=unique_cats), + margin=dict(t=top_margin), + showlegend=False, + width=600, + height=600, + plot_bgcolor='rgba(0,0,0,0)' + ) + fig.update_yaxes(showgrid=True, gridcolor='lightgrey') + + return fig + + +# --- 2. DROPDOWN WRAPPER FUNCTION --- +def plot_patient_with_dropdown(pat_df, group_col, origin_col, custom_order=None): + """ + Handles plotting for a single patient. If multiple origins exist, + combines them into a single figure with a dropdown menu. + """ + if origin_col not in pat_df.columns: + fig = create_convergent_plot(pat_df, group_col, custom_order) + if fig: fig.show() + return + + origins = sorted(pat_df[origin_col].dropna().unique().tolist()) + + if len(origins) == 0: + return + elif len(origins) == 1: + # Standard plot if only one origin exists + fig = create_convergent_plot(pat_df, group_col, custom_order) + if fig: + fig.update_layout(title=f"Convergent TCRs by {group_col} ({origins[0]})") + fig.show() + return + + # --- Multiple Origins: Master Dropdown Figure --- + master_fig = go.Figure() + buttons = [] + trace_visibility_list = [] + origin_layouts = [] + + current_trace_idx = 0 + valid_origins = [] + + for origin in origins: + orig_df = pat_df[pat_df[origin_col] == origin] + if orig_df.empty: continue + + orig_custom_order = [x for x in custom_order if x in orig_df[group_col].unique()] if custom_order else None + + # Generate the standalone plot for this origin + temp_fig = create_convergent_plot(orig_df, group_col, orig_custom_order) + if not temp_fig: continue + + # Extract traces and add to master + n_traces = len(temp_fig.data) + for trace in temp_fig.data: + trace.visible = (len(valid_origins) == 0) # Only first origin is visible by default + 
master_fig.add_trace(trace) + + trace_visibility_list.append((current_trace_idx, current_trace_idx + n_traces)) + current_trace_idx += n_traces + + # Extract layout elements (brackets, axes) to swap out later + cat_array = temp_fig.layout.xaxis.categoryarray + if cat_array is None and orig_custom_order is not None: + cat_array = orig_custom_order + + origin_layouts.append({ + 'shapes': list(temp_fig.layout.shapes or []), + 'annotations': list(temp_fig.layout.annotations or []), + 'margin_t': max(temp_fig.layout.margin.t or 60, 120), # Need min 120px for the dropdown UI + 'x_categories': cat_array + }) + valid_origins.append(origin) + + if not valid_origins: return + + # Generate Dropdown Buttons + total_traces = current_trace_idx + for i, origin in enumerate(valid_origins): + visibility = [False] * total_traces + start, end = trace_visibility_list[i] + for j in range(start, end): + visibility[j] = True + + layout_data = origin_layouts[i] + + button = dict( + label=str(origin), + method="update", + args=[ + {"visible": visibility}, + { + "shapes": layout_data['shapes'], + "annotations": layout_data['annotations'], + "title.text": f"Convergent TCRs by {group_col} ({origin})", + "margin.t": layout_data['margin_t'], + "xaxis.categoryarray": layout_data['x_categories'] + } + ] + ) + buttons.append(button) + + # Apply baseline layout using the first valid origin + master_fig.update_layout( + shapes=origin_layouts[0]['shapes'], + annotations=origin_layouts[0]['annotations'], + title_text=f"Convergent TCRs by {group_col} ({valid_origins[0]})", + margin_t=origin_layouts[0]['margin_t'], + xaxis_title=group_col, + yaxis_title="Convergent TCRs Ratio", + xaxis=dict( + tickfont=dict(size=15), + categoryorder='array', + categoryarray=origin_layouts[0]['x_categories'] + ), + showlegend=False, + width=600, height=600, + plot_bgcolor='rgba(0,0,0,0)', + updatemenus=[dict( + active=0, + buttons=buttons, + x=0.0, xanchor="left", y=1.15, yanchor="top", + pad={"r": 10, "t": 10} + )] + ) 
+ master_fig.update_yaxes(showgrid=True, gridcolor='lightgrey') + master_fig.show() + + +# --- 3. GENERATE TABS --- + +if len(unique_patients) <= 10: + print("::::: {.panel-tabset}\n") + + # --- TAB A: ALL SAMPLES --- + print("## All Samples\n") + print(":::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"### {group}\n") + fig = create_convergent_plot(df, group) + if fig: fig.show() + print("\n") + print("::::\n") + + # --- TAB B: BY PATIENT --- + print("## By Patient\n") + print(":::: {.panel-tabset}\n") + + for pat in unique_patients: + print(f"### {pat}\n") + + pat_df = df[df[subject_col] == pat] + + for group in group_opts_patient: + if pat_df[group].nunique() > 0: + + # Apply custom sorting + custom_order = None + if group == timepoint_col and time_order_map: + pat_tps = pat_df[group].dropna().unique().tolist() + custom_order = sorted(pat_tps, key=lambda x: time_order_map.get(x, 999)) + + # Utilize the new dropdown function + plot_patient_with_dropdown(pat_df, group, origin_col, custom_order) + else: + print(f"*(No data for {group} in patient {pat}.)*") + print("\n") + + print("::::\n") + print(":::::\n") + +else: + # --- STANDARD VIEW (>10 Patients) --- + print("::: {.panel-tabset}\n") + for group in group_opts_all: + print(f"## {group}\n") + fig = create_convergent_plot(df, group) + if fig: fig.show() + print("\n") + print(":::\n") + +``` + +**Figure 3. TCR Convergence** The ratio of convergent TCRs to total TCRs. A convergent TCR is a TCR that is generated via 2 or more unique nucleotide sequences via codon degeneracy. + +**Solution 2: Depth-Normalized O/E Ratio** +To eliminate this bias, we move from raw counts to a probabilistic model. We use a "Multinomial Occupancy Model" to calculate an Observed/Expected (O/E) Ratio. This ratio normalizes for depth. A deep library has a higher "Expected" value, so its denominator increases, canceling out the inflation in the "Observed" numerator. 
+ +- **Observed (O):** The actual fraction of amino acid clonotypes in your sample that are convergent (encoded by $\ge 2$ nucleotide sequences). +- **Expected (E):** The probability that 2 or more nucleotide sequences would "collide" $P(\ge 2)$ and form the same amino acid purely by chance, given the exact size (depth) and diversity of that specific library. + - **Multinomial Occupancy Model (Balls and Bins problem):** + - Balls ($M$): Nucleotide Sequences in sample. + - Bins ($R$): Amino Acid Sequences in sample. + If you blindly throw $M$ balls into $R$ bins, some bins will naturally catch two or more balls purely by accident, this is "Random Convergence." The Multinomial Occupancy Model calculates the exact probability of this accident happening: + - **Probability of Empty:** The chance a specific amino acid gets zero nucleotide sequences is $P(\text{0}) = (1 - w)^M$ + - **Probability of Singleton:** The chance it gets exactly one nucleotide sequence (no convergence) is $P(\text{1}) = M \cdot w \cdot (1 - w)^{M-1}$ + - **Probability of Convergence:** The chance it gets two or more (The "Collision") is $P(\ge 2) = 1 - P(\text{0}) - P(\text{1})$ + Where $w$ is the probability of hitting a specific bin, estimated as $1/R$ for a uniform null. + +```{python} +#| output: asis +#| echo: false + +# ============================================================================== +# PLOT SET 2: NORMALIZED O/E RATIO VS METADATA +# ============================================================================== +# Helper function to convert p-value to stars +def get_sig_text(p): + if p < 0.0001: return "****" + elif p < 0.001: return "***" + elif p < 0.01: return "**" + elif p < 0.05: return "*" + else: return "ns" + +print("::: {.panel-tabset}\n") + +for i, col in enumerate(group_opts): + print(f"## {col}\n") + + # 1. 
Prepare Data & Order + # Sort categories to ensure the x-axis matches our iteration order + cat_order = sorted(merged_df[col].astype(str).unique()) + + # Create the base Strip Plot + fig = px.strip( + merged_df, + x=col, + y='convergence_OE_ratio', + color=col, + hover_name='sample', + hover_data=hover_cols, + title=f"Normalized Convergence (O/E) by {col}", + labels={'convergence_OE_ratio': 'O/E Enrichment Ratio'}, + template="plotly_white", + category_orders={col: cat_order} # Lock the order + ) + + # 2. Calculate Stats (Mann-Whitney U) between ALL pairs + # ----------------------------------------------------- + valid_comparisons = [] + + # Get the max y-value to know where to start drawing lines + y_max = merged_df['convergence_OE_ratio'].max() + # Define a step size for stacking brackets (e.g., 8% of the range) + y_range = y_max - merged_df['convergence_OE_ratio'].min() + step_size = y_range * 0.1 if y_range > 0 else 1.0 + + # Generate all pairs + pairs = list(combinations(cat_order, 2)) + + curr_y_offset = y_max + (step_size * 0.5) # Start slightly above data + + for group1, group2 in pairs: + # Extract the two groups + data1 = merged_df[merged_df[col].astype(str) == group1]['convergence_OE_ratio'].dropna() + data2 = merged_df[merged_df[col].astype(str) == group2]['convergence_OE_ratio'].dropna() + + # Skip if not enough data + if len(data1) < 2 or len(data2) < 2: + continue + + # Perform Mann-Whitney U Test + u_stat, p_val = stats.mannwhitneyu(data1, data2, alternative='two-sided') + + # Determine label (only plotting significant ones or labeling 'ns') + # Here we plot ALL for completeness, or filter for p < 0.05 if preferred + sig_label = get_sig_text(p_val) + + # OPTIONAL: Only show significant brackets to avoid clutter + if p_val < 0.05: + # 3. 
Add the Bracket and Annotation + # --------------------------------- + + # Draw the bracket line (horizontal with feet) + # Note: On categorical axes, x-coords are the category strings + + fig.add_shape( + type="line", + xref="x", yref="y", + x0=group1, y0=curr_y_offset, + x1=group2, y1=curr_y_offset, + line=dict(color="black", width=1) + ) + + # Add the text (stars) + fig.add_annotation( + x=(cat_order.index(group1) + cat_order.index(group2)) / 2, # Centered by index + y=curr_y_offset, + xref="x", # Use index based reference for centering logic if needed, but this works with text too if using graph_objects correctly or just map to index + # Actually, centering text on categorical axis can be tricky. + # A safer bet is to use the categorical names: + # But Plotly doesn't interpolate strings. + # We simply place it on the line. + text=sig_label, + showarrow=False, + yshift=5, + font=dict(size=10) + ) + + # Increment offset so next bracket is higher + curr_y_offset += step_size + + # 4. Final Layout Touches + fig.add_hline(y=1.0, line_dash="dash", line_color="red", annotation_text="Null Expectation") + fig.update_traces(marker=dict(size=8, line=dict(width=1, color='DarkSlateGrey'))) + + # Increase top margin to fit the brackets + fig.update_layout(margin=dict(t=50 + int(curr_y_offset - y_max)*2)) # naive scaling + + fig.show() + print("\n") + +print(":::\n") +``` + +**Figure 5. Depth-Normalized Assessment of Biological Convergence.** This plot displays the Observed/Expected (O/E) convergence ratio for each sample, grouped by metadata variables. The red dashed line at $y=1.0$ represents the null expectation where observed convergence equals that predicted by random chance given the sample's depth and diversity. Values significantly above 1.0 would indicate biological selection pressure, while the observed values below 1.0 suggest that the convergence in these samples does not exceed random statistical expectations. + + +**Interpretation** +The Signal vs. 
The Noise. The O/E Ratio ($y$-axis on Plot 2) isolates biological selection pressure from statistical chance.
+
+- O/E $\approx$ 1.0 (The "Boring" Zone)
+  - **Meaning:** Random Chance. The convergence you see is statistical noise. The repertoire is behaving like a random bucket of sequences.
+  - **Biology:** No evidence of strong antigenic selection driving these specific convergent events.
+- O/E $\gg$ 1.0 (The "Signal" Zone)
+  - **Meaning:** Antigenic Selection. You have significantly more convergence than the null model predicts.
+  - **Biology:** An external force (antigen) is actively selecting for specific structural solutions (amino acid motifs), forcing independent clones to converge on the same target despite different genetic origins. This is the hallmark of a focused immune response.
+- **Relative Constraint (When O/E < 1.0)**
+  In datasets where all samples fall below the null expectation of 1.0, the biological interpretation shifts from detecting "Antigenic Selection" to quantifying Relative Constraint. A value consistently below 1.0 indicates that the repertoire is hyper-diverse and less convergent than a random mathematical model would predict—effectively, the immune system is avoiding structural overlap. However, statistically significant differences between groups (e.g., $p < 0.05$ via Mann-Whitney U) remain biologically critical.
+
+**Technical Nuances & Limitations**
+
+- In samples with extremely low diversity ($R < 100$ clones), the "Expected" probability calculation can become unstable or noisy. Interpret extremely high O/E ratios in very small libraries with caution.
+- In highly oligoclonal samples (e.g., TILs dominated by 1-2 massive clones), the "Effective" diversity is low. The model accounts for this, but extreme dominance can mask rarer convergent events in the tail of the distribution.
+ +## TCR generation probabilities {#sec-olga} + +Calculating these probabilities in bulk TCR-seq data is valuable because **it provides a baseline understanding of the "expected" frequency of each TCR**. By comparing observed TCR frequencies against their generation probabilities, it is easier to identify TCRs that are "overrepresented" (clonally expanded) due to an immune response (e.g., to infection or cancer) rather than just being common due to biases in the generation process itself. This helps distinguish antigen-driven selection from inherent generation biases, offering clearer insights into immune repertoire dynamics and responses. + +```{python} +#| output: asis +#| echo: false + +import os +import glob +import pandas as pd +import numpy as np +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from scipy.stats import mannwhitneyu + +def create_pgen_tabs(meta_df, origin_col='origin'): + """ + Generates multi-tabbed violin plots for TCR Generation Probability (Pgen). + - Tab 1: All Samples (Grouped by Metadata) -> With Heatmap Inset + - Tab 2: By Patient (Grouped by Timepoint) -> With Heatmap Inset & Origin Dropdown + """ + # --- 1. Find Data Files --- + file_pattern = os.path.join(olga_dir, '*_tcr_pgen.tsv') + sample_files = sorted(glob.glob(file_pattern)) + + if not sample_files: + print(f"Error: No files found at: {file_pattern}") + return + + all_data_list = [] + + # --- 2. 
Load and Process Data --- + for file_path in sample_files: + filename = os.path.basename(file_path) + sample_name = filename.replace('_tcr_pgen.tsv', '') + + try: + # Load data + df = pd.read_csv(file_path, sep='\t') + + # Filter invalid pgen + df = df[df['pgen'] > 0].copy() + if df.empty: continue + + # Calculate log probabilities + log_probs = np.log10(df['pgen']) + + # Check if duplicate_count exists, otherwise assume 1 + if 'duplicate_count' in df.columns: + counts = df['duplicate_count'].astype(int) + # Expand data based on counts (Weighting) + weighted_log_probs = np.repeat(log_probs, counts) + else: + weighted_log_probs = log_probs.values + + # --- SAFETY DOWNSAMPLING --- + # To prevent browser crash with massive scatter plots + if len(weighted_log_probs) > 2000: + weighted_log_probs = np.random.choice(weighted_log_probs, 2000, replace=False) + + temp_df = pd.DataFrame({ + 'log10_pgen': weighted_log_probs, + 'sample': sample_name + }) + all_data_list.append(temp_df) + + except Exception as e: + continue + + if not all_data_list: + print("No valid data found.") + return + + # --- 3. 
Aggregate and Merge Metadata --- + pgen_df = pd.concat(all_data_list, ignore_index=True) + + cols_to_use = ['sample', subject_col, alias_col, timepoint_order_col] + exclude_cols = ['sample', 'file', 'filename', 'filepath', alias_col, timepoint_order_col, subject_col, timepoint_col] + group_opts = [c for c in meta_df.columns if c not in exclude_cols and meta_df[c].nunique() < 35] + + # Merge + plot_df = pgen_df.merge(meta_df, on='sample', how='inner') + + # --- HELPER 1: STANDARD PLOT WITH HEATMAP (For 'All Samples' tab) --- + def plot_violin_with_heatmap(data, x_col, title, x_order=None): + unique_cats = data[x_col].unique() + if x_order: + unique_cats = [x for x in x_order if x in unique_cats] + else: + unique_cats = sorted(unique_cats) + + n_cats = len(unique_cats) + + p_matrix = np.full((n_cats, n_cats), np.nan) + text_matrix = np.full((n_cats, n_cats), "", dtype=object) + + for i, c1 in enumerate(unique_cats): + for j, c2 in enumerate(unique_cats): + if i <= j: continue + g1 = data[data[x_col] == c1]['log10_pgen'] + g2 = data[data[x_col] == c2]['log10_pgen'] + if len(g1) < 2 or len(g2) < 2: continue + try: stat, p = mannwhitneyu(g1, g2, alternative='two-sided') + except: continue + p_matrix[i, j] = -np.log10(p + 1e-300) + + if p < 0.0001: star = "****" + elif p < 0.001: star = "***" + elif p < 0.01: star = "**" + elif p < 0.05: star = "*" + else: star = "" + text_matrix[i, j] = star + + fig = make_subplots( + rows=1, cols=2, column_widths=[0.6, 0.4], horizontal_spacing=0.15, + specs=[[{"type": "xy"}, {"type": "heatmap"}]] + ) + + fig.add_trace( + go.Violin( + x=data[x_col], y=data['log10_pgen'], box_visible=True, points=False, + line_color='#1f77b4', opacity=0.6, name="Pgen", showlegend=False, + width=0.7, spanmode='hard' + ), row=1, col=1 + ) + + fig.add_trace( + go.Heatmap( + z=p_matrix, x=unique_cats, y=unique_cats, text=text_matrix, + texttemplate="%{text}", textfont={"size": 8}, colorscale="Reds", + zmin=0, zmax=5, showscale=True, + 
colorbar=dict(title="-log10(p)", titleside="top", orientation="v", thickness=10, len=0.5, x=1.1, y=0.5), + hovertemplate="%{y} vs %{x}
Sig: %{text}
-log10(p): %{z:.2f}" + ), row=1, col=2 + ) + + fig.update_layout(title=title, width=1000, height=600, plot_bgcolor='rgba(0,0,0,0)', margin=dict(t=60, b=100, l=50, r=50)) + fig.update_xaxes(tickangle=-45, row=1, col=1, categoryorder='array', categoryarray=unique_cats) + fig.update_yaxes(title_text="log10 P(gen)", row=1, col=1, showgrid=True, gridcolor='lightgrey') + fig.update_xaxes(showgrid=False, tickangle=-45, tickfont=dict(size=8), row=1, col=2, categoryorder='array', categoryarray=unique_cats) + fig.update_yaxes(showgrid=False, tickfont=dict(size=8), side="left", row=1, col=2, categoryorder='array', categoryarray=unique_cats) + + fig.show() + + # --- HELPER 2: PATIENT PLOT WITH ORIGIN DROPDOWN --- + def plot_patient_with_dropdown(pat_df, title_prefix): + if origin_col not in pat_df.columns: + plot_violin_with_heatmap(pat_df, alias_col, title_prefix) + return + + origins = sorted(pat_df[origin_col].dropna().unique()) + if len(origins) == 0: + return + elif len(origins) == 1: + plot_violin_with_heatmap(pat_df, alias_col, f"{title_prefix} ({origins[0]})") + return + + # Pre-process all origins + processed_origins = [] + for origin in origins: + origin_df = pat_df[pat_df[origin_col] == origin].copy() + if timepoint_order_col in origin_df.columns: + origin_df = origin_df.sort_values(timepoint_order_col) + + unique_cats = origin_df[alias_col].unique().tolist() + n_cats = len(unique_cats) + if n_cats < 1: continue + + p_matrix = np.full((n_cats, n_cats), np.nan) + text_matrix = np.full((n_cats, n_cats), "", dtype=object) + + for i, c1 in enumerate(unique_cats): + for j, c2 in enumerate(unique_cats): + if i <= j: continue + g1 = origin_df[origin_df[alias_col] == c1]['log10_pgen'] + g2 = origin_df[origin_df[alias_col] == c2]['log10_pgen'] + if len(g1) < 2 or len(g2) < 2: continue + try: stat, p = mannwhitneyu(g1, g2, alternative='two-sided') + except: continue + p_matrix[i, j] = -np.log10(p + 1e-300) + + if p < 0.0001: star = "****" + elif p < 0.001: star = "***" + elif 
p < 0.01: star = "**" + elif p < 0.05: star = "*" + else: star = "" + text_matrix[i, j] = star + + processed_origins.append({ + 'origin': origin, 'df': origin_df, 'cats': unique_cats, + 'p_matrix': p_matrix, 'text_matrix': text_matrix + }) + + if not processed_origins: return + + # Build Master Figure + fig = make_subplots( + rows=1, cols=2, column_widths=[0.6, 0.4], horizontal_spacing=0.15, + specs=[[{"type": "xy"}, {"type": "heatmap"}]] + ) + + buttons = [] + initial_cats = processed_origins[0]['cats'] + + for i, data in enumerate(processed_origins): + is_first = (i == 0) + + # Violin + fig.add_trace( + go.Violin( + x=data['df'][alias_col], y=data['df']['log10_pgen'], + box_visible=True, points=False, line_color='#1f77b4', opacity=0.6, + name="Pgen", showlegend=False, width=0.7, spanmode='hard', visible=is_first + ), row=1, col=1 + ) + + # Heatmap + fig.add_trace( + go.Heatmap( + z=data['p_matrix'], x=data['cats'], y=data['cats'], text=data['text_matrix'], + texttemplate="%{text}", textfont={"size": 8}, colorscale="Reds", zmin=0, zmax=5, showscale=True, + colorbar=dict(title="-log10(p)", titleside="top", orientation="v", thickness=10, len=0.5, x=1.1, y=0.5), + hovertemplate="%{y} vs %{x}
Sig: %{text}
-log10(p): %{z:.2f}", visible=is_first + ), row=1, col=2 + ) + + # Dropdown Logic + visibility = [False] * (len(processed_origins) * 2) + visibility[i * 2] = True + visibility[i * 2 + 1] = True + + button = dict( + label=str(data['origin']), method="update", + args=[ + {"visible": visibility}, + { + "title.text": f"{title_prefix} ({data['origin']})", + "xaxis.categoryarray": data['cats'], + "xaxis2.categoryarray": data['cats'], + "yaxis2.categoryarray": data['cats'] + } + ] + ) + buttons.append(button) + + # Apply Layout + fig.update_layout( + title_text=f"{title_prefix} ({processed_origins[0]['origin']})", + width=1000, height=600, plot_bgcolor='rgba(0,0,0,0)', + margin=dict(t=120, b=100, l=50, r=50), # 120 margin provides room for dropdown + updatemenus=[dict( + active=0, buttons=buttons, x=0.0, xanchor="left", y=1.15, yanchor="top", pad={"r": 10, "t": 10} + )] + ) + + # Apply Base Axes (linked to first origin) + fig.update_xaxes(tickangle=-45, row=1, col=1, categoryorder='array', categoryarray=initial_cats) + fig.update_yaxes(title_text="log10 P(gen)", row=1, col=1, showgrid=True, gridcolor='lightgrey') + fig.update_xaxes(showgrid=False, tickangle=-45, tickfont=dict(size=8), row=1, col=2, categoryorder='array', categoryarray=initial_cats) + fig.update_yaxes(showgrid=False, tickfont=dict(size=8), side="left", row=1, col=2, categoryorder='array', categoryarray=initial_cats) + + fig.show() + + # --- 4. 
Generate Quarto Tabs --- + print("::: {.panel-tabset}\n") + + # ========================================================= + # TAB 1: ALL SAMPLES + # ========================================================= + print("## All Samples\n") + print("::: {.panel-tabset}\n") + for group in group_opts: + print(f"### By {group}\n") + group_df = plot_df.dropna(subset=[group]).copy() + group_df[group] = group_df[group].astype(str) + plot_violin_with_heatmap(group_df, group, f"Pgen by {group}") + print("\n") + print(":::\n") + + # ========================================================= + # TAB 2: BY PATIENT (With Dropdown for Multiple Origins) + # ========================================================= + print("## By Patient\n") + print("::: {.panel-tabset}\n") + + unique_patients = sorted(plot_df[subject_col].unique()) + for pat in unique_patients: + print(f"### {pat}\n") + pat_df = plot_df[plot_df[subject_col] == pat].copy() + + # Use our new dropdown-aware plotting function + plot_patient_with_dropdown(pat_df, title_prefix=f"Pgen Distribution: {pat}") + print("\n") + + print(":::\n") + print(":::\n") # Close Main Tabset + +# --- Run --- +# Make sure to set origin_col dynamically if your column name isn't 'origin' +create_pgen_tabs(meta, origin_col='origin') + +``` + +**Figure 11 TCR Generation Probabilities** Probabilities shown above are weighted, accounting for the counts per clone within a repertoire. + +# Functional Prediction {#sec-function} + +## TCRdist3 {#sec-tcrdist} + +[TCRdist](https://elifesciences.org/articles/68605) is a **metric that quantifies the similarity between any two T-cell receptors (TCRs) by calculating a biochemically-aware distance based on their amino acid sequences**. 
Instead of just checking if two TCR sequences are identical, **TCRdist focuses on the key regions that bind to antigens: Complementarity-Determining Regions (CDRs).** It compares the amino acid sequences of the CDR1, CDR2 (obtained from the V gene), and especially the hypervariable CDR3 loops of two TCRs. The "distance" is calculated using a substitution matrix (like BLOSUM62) that scores how similar two different amino acids are in their biochemical properties. Swapping two functionally similar amino acids results in a small distance penalty, while swapping two very different ones results in a large penalty. The total distance is a weighted sum of these scores, with the CDR3 distance contributing the most. + +```{python} +#| output: asis +#| echo: false + +import os +import glob +import h5py +import pandas as pd +import numpy as np +import plotly.graph_objects as go +from plotly.subplots import make_subplots +from scipy.stats import mannwhitneyu +from scipy.sparse import csr_matrix + +def create_distance_tabs_with_heatmap(meta_df, origin_col='origin'): + """ + Generates multi-tabbed violin plots for TCR Distances. + - Tab 1: All Samples (Grouped by Metadata) -> With Pairwise Heatmap Inset + - Tab 2: By Patient (Grouped by Timepoint) -> With Pairwise Heatmap Inset & Origin Dropdown + """ + + # --- 1. 
FIND AND LOAD DATA --- + file_pattern = os.path.join(tcrdist_dir, '*_distance_matrix.hdf5') + patient_files = sorted(glob.glob(file_pattern)) + + if not patient_files: + print(f"Error: No files found at: {file_pattern}") + return + + all_data_list = [] + + for file_path in patient_files: + filename = os.path.basename(file_path) + sample_id = filename.replace('_distance_matrix.hdf5', '') + + try: + with h5py.File(file_path, 'r') as f: + data = f['data'][:] + indices = f['indices'][:] + indptr = f['indptr'][:] + + sparse_matrix = csr_matrix((data, indices, indptr)) + dense_matrix = sparse_matrix.toarray() + # Symmetrize + dense_matrix = np.maximum(dense_matrix, dense_matrix.T) + + # Extract upper triangle (exclude diagonal 0 and lower triangle) + upper_triangle = dense_matrix[np.triu_indices(dense_matrix.shape[0], k=1)] + + # Filter valid (ignore -1 or 0 if those indicate missing/self) + valid_distances = upper_triangle[(upper_triangle != -1) & (upper_triangle != 0)] + + if len(valid_distances) > 0: + # --- SAFETY DOWNSAMPLING --- + # Critical for performance in browser-based plots + if len(valid_distances) > 1000: + valid_distances = np.random.choice(valid_distances, 1000, replace=False) + + temp_df = pd.DataFrame({ + 'distance': valid_distances, + 'sample': sample_id + }) + all_data_list.append(temp_df) + + except Exception as e: + continue + + if not all_data_list: + print("No valid data found.") + return + + # --- 2. 
MERGE METADATA --- + dist_df = pd.concat(all_data_list, ignore_index=True) + + cols_to_use = ['sample', subject_col, alias_col, timepoint_order_col] + + plot_df = dist_df.merge(meta_df, on='sample', how='inner') + + exclude_cols = ['sample', 'file', 'filename', 'filepath', alias_col, timepoint_order_col, subject_col, timepoint_col] + group_opts = [c for c in meta_df.columns if c not in exclude_cols and meta_df[c].nunique() < 35] + + # --- HELPER 1: STANDARD PLOT WITH HEATMAP --- + def plot_violin_with_heatmap(data, x_col, title, x_order=None): + unique_cats = data[x_col].unique() + if x_order: + unique_cats = [x for x in x_order if x in unique_cats] + else: + unique_cats = sorted(unique_cats) + + n_cats = len(unique_cats) + + p_matrix = np.full((n_cats, n_cats), np.nan) + text_matrix = np.full((n_cats, n_cats), "", dtype=object) + + for i, c1 in enumerate(unique_cats): + for j, c2 in enumerate(unique_cats): + if i <= j: continue + g1 = data[data[x_col] == c1]['distance'] + g2 = data[data[x_col] == c2]['distance'] + + if len(g1) < 2 or len(g2) < 2: continue + + try: stat, p = mannwhitneyu(g1, g2, alternative='two-sided') + except: continue + + p_matrix[i, j] = -np.log10(p + 1e-300) + + if p < 0.0001: star = "****" + elif p < 0.001: star = "***" + elif p < 0.01: star = "**" + elif p < 0.05: star = "*" + else: star = "" + text_matrix[i, j] = star + + fig = make_subplots( + rows=1, cols=2, column_widths=[0.6, 0.4], horizontal_spacing=0.15, + specs=[[{"type": "xy"}, {"type": "heatmap"}]] + ) + + fig.add_trace( + go.Violin( + x=data[x_col], y=data['distance'], box_visible=True, points=False, + line_color='#1f77b4', opacity=0.6, name="Distance", showlegend=False, + width=0.7, spanmode='hard' + ), row=1, col=1 + ) + + fig.add_trace( + go.Heatmap( + z=p_matrix, x=unique_cats, y=unique_cats, text=text_matrix, + texttemplate="%{text}", textfont={"size": 8}, colorscale="Reds", + zmin=0, zmax=5, showscale=True, + colorbar=dict(title="-log10(p)", titleside="top", 
orientation="v", thickness=10, len=0.5, x=1.1, y=0.5), + hovertemplate="%{y} vs %{x}
Sig: %{text}
-log10(p): %{z:.2f}" + ), row=1, col=2 + ) + + fig.update_layout(title=title, width=1000, height=600, plot_bgcolor='rgba(0,0,0,0)', margin=dict(t=60, b=100, l=50, r=50)) + fig.update_xaxes(tickangle=-45, row=1, col=1, categoryorder='array', categoryarray=unique_cats) + fig.update_yaxes(title_text="TCR Distance", row=1, col=1, showgrid=True, gridcolor='lightgrey') + fig.update_xaxes(showgrid=False, tickangle=-45, tickfont=dict(size=8), row=1, col=2, categoryorder='array', categoryarray=unique_cats) + fig.update_yaxes(showgrid=False, tickfont=dict(size=8), side="left", row=1, col=2, categoryorder='array', categoryarray=unique_cats) + + fig.show() + + # --- HELPER 2: PATIENT PLOT WITH ORIGIN DROPDOWN --- + def plot_patient_with_dropdown(pat_df, title_prefix): + if origin_col not in pat_df.columns: + plot_violin_with_heatmap(pat_df, alias_col, title_prefix) + return + + origins = sorted(pat_df[origin_col].dropna().unique()) + if len(origins) == 0: + return + elif len(origins) == 1: + plot_violin_with_heatmap(pat_df, alias_col, f"{title_prefix} ({origins[0]})") + return + + # Pre-process all origins + processed_origins = [] + for origin in origins: + origin_df = pat_df[pat_df[origin_col] == origin].copy() + if timepoint_order_col in origin_df.columns: + origin_df = origin_df.sort_values(timepoint_order_col) + + unique_cats = origin_df[alias_col].unique().tolist() + n_cats = len(unique_cats) + if n_cats < 1: continue + + p_matrix = np.full((n_cats, n_cats), np.nan) + text_matrix = np.full((n_cats, n_cats), "", dtype=object) + + for i, c1 in enumerate(unique_cats): + for j, c2 in enumerate(unique_cats): + if i <= j: continue + g1 = origin_df[origin_df[alias_col] == c1]['distance'] + g2 = origin_df[origin_df[alias_col] == c2]['distance'] + if len(g1) < 2 or len(g2) < 2: continue + try: stat, p = mannwhitneyu(g1, g2, alternative='two-sided') + except: continue + p_matrix[i, j] = -np.log10(p + 1e-300) + + if p < 0.0001: star = "****" + elif p < 0.001: star = "***" + elif p < 
0.01: star = "**" + elif p < 0.05: star = "*" + else: star = "" + text_matrix[i, j] = star + + processed_origins.append({ + 'origin': origin, 'df': origin_df, 'cats': unique_cats, + 'p_matrix': p_matrix, 'text_matrix': text_matrix + }) + + if not processed_origins: return + + # Build Master Figure + fig = make_subplots( + rows=1, cols=2, column_widths=[0.6, 0.4], horizontal_spacing=0.15, + specs=[[{"type": "xy"}, {"type": "heatmap"}]] + ) + + buttons = [] + initial_cats = processed_origins[0]['cats'] + + for i, data in enumerate(processed_origins): + is_first = (i == 0) + + # Violin + fig.add_trace( + go.Violin( + x=data['df'][alias_col], y=data['df']['distance'], + box_visible=True, points=False, line_color='#1f77b4', opacity=0.6, + name="Distance", showlegend=False, width=0.7, spanmode='hard', visible=is_first + ), row=1, col=1 + ) + + # Heatmap + fig.add_trace( + go.Heatmap( + z=data['p_matrix'], x=data['cats'], y=data['cats'], text=data['text_matrix'], + texttemplate="%{text}", textfont={"size": 8}, colorscale="Reds", zmin=0, zmax=5, showscale=True, + colorbar=dict(title="-log10(p)", titleside="top", orientation="v", thickness=10, len=0.5, x=1.1, y=0.5), + hovertemplate="%{y} vs %{x}
Sig: %{text}
-log10(p): %{z:.2f}", visible=is_first + ), row=1, col=2 + ) + + # Dropdown Logic + visibility = [False] * (len(processed_origins) * 2) + visibility[i * 2] = True + visibility[i * 2 + 1] = True + + button = dict( + label=str(data['origin']), method="update", + args=[ + {"visible": visibility}, + { + "title.text": f"{title_prefix} ({data['origin']})", + "xaxis.categoryarray": data['cats'], + "xaxis2.categoryarray": data['cats'], + "yaxis2.categoryarray": data['cats'] + } + ] + ) + buttons.append(button) + + # Apply Layout + fig.update_layout( + title_text=f"{title_prefix} ({processed_origins[0]['origin']})", + width=1000, height=600, plot_bgcolor='rgba(0,0,0,0)', + margin=dict(t=120, b=100, l=50, r=50), # 120 margin provides room for dropdown + updatemenus=[dict( + active=0, buttons=buttons, x=0.0, xanchor="left", y=1.15, yanchor="top", pad={"r": 10, "t": 10} + )] + ) + + # Apply Base Axes + fig.update_xaxes(tickangle=-45, row=1, col=1, categoryorder='array', categoryarray=initial_cats) + fig.update_yaxes(title_text="TCR Distance", row=1, col=1, showgrid=True, gridcolor='lightgrey') + fig.update_xaxes(showgrid=False, tickangle=-45, tickfont=dict(size=8), row=1, col=2, categoryorder='array', categoryarray=initial_cats) + fig.update_yaxes(showgrid=False, tickfont=dict(size=8), side="left", row=1, col=2, categoryorder='array', categoryarray=initial_cats) + + fig.show() + + # --- 3. 
GENERATE TABS --- + print("::: {.panel-tabset}\n") + + # ========================================================= + # TAB 1: ALL SAMPLES + # ========================================================= + print("## All Samples\n") + print("::: {.panel-tabset}\n") + + for group in group_opts: + print(f"### By {group}\n") + group_df = plot_df.dropna(subset=[group]).copy() + group_df[group] = group_df[group].astype(str) + plot_violin_with_heatmap(group_df, group, f"Distance Distribution by {group}") + print("\n") + + print(":::\n") + + # ========================================================= + # TAB 2: BY PATIENT + # ========================================================= + print("## By Patient\n") + print("::: {.panel-tabset}\n") + + unique_patients = sorted(plot_df[subject_col].unique()) + + for pat in unique_patients: + print(f"### {pat}\n") + pat_df = plot_df[plot_df[subject_col] == pat].copy() + + # Pass to the new dropdown wrapper + plot_patient_with_dropdown(pat_df, title_prefix=f"Distance Distribution: {pat}") + + print("\n") + + print(":::\n") + print(":::\n") + +# --- Execute --- +create_distance_tabs_with_heatmap(meta, origin_col='origin') + +``` + +**Figure 8. Sample Distances** Plot showing biochemical distances between sample's TCRs grouped by metadata variables. + +**Why This Metric Is Helpful?** 🌐 +The primary benefit of TCRdist is its ability to identify groups of T-cells that are likely to recognize the same antigen, even if their TCRs are not identical. This phenomenon is known as cross-reactivity. +By calculating the distance between all TCRs in your sample, you can move beyond analyzing single, identical clones and instead identify functional "neighborhoods" of similar TCRs. This provides a much more comprehensive and biologically accurate picture of an immune response. It allows you to group related T-cells together to see the full breadth of the response to a specific antigen, rather than just the single most expanded clone. 
+ + +## TCR specificity (VDJdb) {#sec-VDJdb} + +[VDJdb](https://vdjdb.cdr3.net/) is a curated, public **database that catalogs T-cell receptor (TCR) sequences with known antigen specificities**. It serves as a valuable resource for immunologists and bioinformaticians, providing a standardized repository of information **linking a TCR's sequence to the specific epitope (antigen) it recognizes, along with the presenting MHC molecule**. The data within VDJdb is manually aggregated from a wide range of published studies and includes both TCR alpha and beta chains, providing rich context for each entry. + +You can query VDJdb by TCR sequence. You can search for a specific CDR3 sequence to determine if its antigen specificity has been previously reported. This ability to **match your experimental TCR sequences to a reference database allows you to annotate your bulk-TCR sequencing data and gain biological insights**. For example, by comparing your bulk-TCR repertoire to VDJdb, you can infer which antigens your T-cells have previously encountered or are actively responding to. + +```{python} +#| output: asis +#| echo: false + +import os +import glob +import pandas as pd +import plotly.graph_objects as go + +def plot_vdjdb_species_specificity(VDJdb_dir, meta_df, subject_col='subject_id'): + """ + Parses VDJdb annotation summaries and generates Quarto tabs per patient, + with a dropdown menu to select specific timepoints. + """ + file_pattern = os.path.join(VDJdb_dir, '*.annot.summary.txt') + summary_files = sorted(glob.glob(file_pattern)) + + if not summary_files: + print(f"Error: No summary files found in {VDJdb_dir}") + return + + all_data = [] + + # --- 1. Parse and Aggregate VDJdb Files --- + for file in summary_files: + # Extract the sample name from the filename. 
+ basename = os.path.basename(file) + sample_name = basename.replace('.annot.summary.txt', '') + + try: + df = pd.read_csv(file, sep='\t') + + # Filter strictly for the species annotation rows + species_df = df[df['db.column.name'] == 'antigen.species'].copy() + + if species_df.empty: + continue + + # Clean and format data + species_df['sample'] = sample_name + species_df['Species'] = species_df['db.column.value'] + species_df['Frequency (%)'] = species_df['frequency'] * 100 # Convert to % + + all_data.append(species_df[['sample', 'Species', 'Frequency (%)', 'reads']]) + + except Exception as e: + print(f"Error reading {basename}: {e}") + + if not all_data: + print("No species annotations found across the parsed files.") + return + + combined_df = pd.concat(all_data, ignore_index=True) + + # --- 2. Merge with Metadata (Using timepoint_col instead of alias_col) --- + plot_df = combined_df.merge( + meta_df[['sample', subject_col, timepoint_col, timepoint_order_col]], + on='sample', + how='inner' + ) + + # --- 3. Generate Quarto Tabs and Plotly Figures --- + print("::: {.panel-tabset}\n") + + subjects = sorted(plot_df[subject_col].dropna().unique()) + + for subject in subjects: + print(f"## {subject}\n") + + subj_data = plot_df[plot_df[subject_col] == subject] + + # Sort timepoints chronologically based on your timepoint_order column + timepoints = subj_data.sort_values(timepoint_order_col)[timepoint_col].unique() + + fig = go.Figure() + buttons = [] + + for i, tp in enumerate(timepoints): + tp_data = subj_data[subj_data[timepoint_col] == tp].sort_values('Frequency (%)', ascending=False) + + # Add a trace for each timepoint (only the first one is visible by default) + fig.add_trace( + go.Bar( + x=tp_data['Species'], + y=tp_data['Frequency (%)'], + name=str(tp), + visible=(i == 0), + marker_color='#1f77b4', + hovertemplate="Target: %{x}
Frequency: %{y:.2f}%
Reads: %{customdata}", + customdata=tp_data['reads'] + ) + ) + + # Build visibility array for the dropdown button [True, False, False...] + visibility = [False] * len(timepoints) + visibility[i] = True + + button = dict( + label=str(tp), + method="update", + args=[ + {"visible": visibility}, + {"title": f"TCR Specificity - {subject} ({tp})"} + ] + ) + buttons.append(button) + + # Update layout with the dropdown menu + fig.update_layout( + updatemenus=[dict( + active=0, + buttons=buttons, + direction="down", + pad={"r": 10, "t": 10}, + showactive=True, + x=1.0, # Position on the right side + xanchor="left", + y=1.15, + yanchor="top" + )], + title_text=f"TCR Specificity - {subject} ({timepoints[0]})", + xaxis_title="Predicted Antigen Species", + yaxis_title="Repertoire Frequency (%)", + plot_bgcolor='white', + margin=dict(t=80, b=100, l=50, r=200), # Extra right margin to prevent menu cutoff + height=500 + ) + + # Scientific styling + fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True, tickangle=45) + fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True, gridcolor='lightgrey') + + fig.show() + print("\n\n") + + print(":::\n") + +# Run the function +plot_vdjdb_species_specificity(VDJdb_dir, meta) + +``` +**Figure 14: Top 15 recognized epitopes identified in bulk-TCR data.** The number of unique TCR clonotypes in the sample that match a known epitope in the VDJdb database is shown on the x-axis. The plot highlights the most abundant antigen specificities inferred from the T-cell repertoire. + +The plot reveals the primary epitopes recognized by the TCR repertoire, providing crucial insight into the targets of the immune response. By identifying these epitopes, we can **infer the biological context of the T-cell activity**, which may include responses to viral infections. Understanding these specificities can also **offer clues about TCR cross-reactivity, a key consideration for developing novel targeted therapies**. 
+ +```{python} +#| output: asis +#| echo: false + +import os +import glob +import pandas as pd +import plotly.graph_objects as go + +def plot_vdjdb_sunburst_hierarchy(VDJdb_dir, meta_df, subject_col='subject_id'): + """ + Reads detailed VDJdb match files to create a hierarchical Sunburst chart: + Species (Inner) -> Gene (Middle) -> TCR Sequence (Outer). + Includes Quarto tabs per patient and a dropdown per timepoint. + """ + # NOTE: We must target the detailed match files, NOT the summary files. + # Adjust this pattern if your pipeline names them differently (e.g., '*.annot.txt') + file_pattern = os.path.join(VDJdb_dir, '*vdjmatch.txt') + match_files = sorted(glob.glob(file_pattern)) + + if not match_files: + print(f"Error: No detailed match files found in {VDJdb_dir}") + return + + all_data = [] + + # --- 1. Parse and Deduplicate VDJdb Match Files --- + for file in match_files: + basename = os.path.basename(file) + sample_name = basename.replace('.vdjmatch.txt', '') # Adjust replacement string if needed + + try: + df = pd.read_csv(file, sep='\t') + + # Keep only the columns we need for the hierarchy and deduplication + cols_to_keep = ['count', 'cdr3aa', 'antigen.species', 'antigen.gene', 'vdjdb.score'] + df = df[[c for c in cols_to_keep if c in df.columns]].copy() + df['sample'] = sample_name + + # THE FIX: Sort by VDJdb score (highest confidence first) and drop cross-assignments. + # This forces each TCR to strictly align with its single best target. 
+ if 'vdjdb.score' in df.columns: + df = df.sort_values('vdjdb.score', ascending=False) + df = df.drop_duplicates(subset=['sample', 'cdr3aa']) + + all_data.append(df) + + except Exception as e: + print(f"Error reading {basename}: {e}") + + if not all_data: + print("No valid data found to plot.") + return + + combined_df = pd.concat(all_data, ignore_index=True) + + # Fill NAs so Plotly's hierarchy engine doesn't crash on missing genes/species + combined_df['antigen.species'] = combined_df['antigen.species'].fillna('Unknown Species') + combined_df['antigen.gene'] = combined_df['antigen.gene'].fillna('Unknown Gene') + combined_df['cdr3aa'] = combined_df['cdr3aa'].fillna('Unknown TCR') + + # --- 2. Merge with Metadata --- + plot_df = combined_df.merge( + meta_df[['sample', subject_col, timepoint_col, timepoint_order_col]], + on='sample', + how='inner' + ) + + # --- 3. Helper Function to Build the Plotly Hierarchy --- + def build_sunburst_df(df_tp): + # Level 3: TCR Level (Outer Ring) + df_tcr = df_tp.groupby(['antigen.species', 'antigen.gene', 'cdr3aa']).agg( + total_counts=('count', 'sum') + ).reset_index() + df_tcr['id'] = df_tcr['antigen.species'] + '_' + df_tcr['antigen.gene'] + '_' + df_tcr['cdr3aa'] + df_tcr['parent'] = df_tcr['antigen.species'] + '_' + df_tcr['antigen.gene'] + df_tcr['label'] = df_tcr['cdr3aa'] + df_tcr['n_unique_tcrs'] = 1 # A single sequence is always 1 unique TCR + + # Level 2: Gene Level (Middle Ring) + df_gene = df_tp.groupby(['antigen.species', 'antigen.gene']).agg( + total_counts=('count', 'sum'), + n_unique_tcrs=('cdr3aa', 'nunique') + ).reset_index() + df_gene['id'] = df_gene['antigen.species'] + '_' + df_gene['antigen.gene'] + df_gene['parent'] = df_gene['antigen.species'] + df_gene['label'] = df_gene['antigen.gene'] + + # Level 1: Species Level (Inner Ring) + df_species = df_tp.groupby('antigen.species').agg( + total_counts=('count', 'sum'), + n_unique_tcrs=('cdr3aa', 'nunique') + ).reset_index() + df_species['id'] = 
df_species['antigen.species'] + df_species['parent'] = 'Annotated Repertoire' + df_species['label'] = df_species['antigen.species'] + + # Level 0: Root Level (Center) + df_root = pd.DataFrame([{ + 'id': 'Annotated Repertoire', + 'parent': '', + 'label': 'Annotated
Repertoire', + 'total_counts': df_tp['count'].sum(), + 'n_unique_tcrs': df_tp['cdr3aa'].nunique() + }]) + + return pd.concat([df_root, df_species, df_gene, df_tcr], ignore_index=True) + + # --- 4. Generate Quarto Tabs and Interactive Plots --- + print("::: {.panel-tabset}\n") + + subjects = sorted(plot_df[subject_col].dropna().unique()) + + for subject in subjects: + print(f"## {subject}\n") + + subj_data = plot_df[plot_df[subject_col] == subject] + timepoints = subj_data.sort_values(timepoint_order_col)[timepoint_col].unique() + + fig = go.Figure() + buttons = [] + + for i, tp in enumerate(timepoints): + tp_data = subj_data[subj_data[timepoint_col] == tp] + sb_df = build_sunburst_df(tp_data) + + fig.add_trace( + go.Sunburst( + ids=sb_df['id'], + labels=sb_df['label'], + parents=sb_df['parent'], + values=sb_df['total_counts'], + branchvalues='total', + customdata=sb_df[['n_unique_tcrs']], + hovertemplate="%{label}
Total Read Counts: %{value}
Unique TCRs: %{customdata[0]}", + visible=(i == 0), + marker=dict(colorscale='Blues') + ) + ) + + # Setup visibility array for dropdown + visibility = [False] * len(timepoints) + visibility[i] = True + + button = dict( + label=str(tp), + method="update", + args=[ + {"visible": visibility}, + {"title": f"TCR Target Architecture - {subject} ({tp})"} + ] + ) + buttons.append(button) + + fig.update_layout( + updatemenus=[dict( + active=0, + buttons=buttons, + direction="down", + pad={"r": 10, "t": 10}, + showactive=True, + x=1.0, + xanchor="left", + y=1.1, + yanchor="top" + )], + title_text=f"TCR Target Architecture - {subject} ({timepoints[0]})", + margin=dict(t=80, b=50, l=50, r=200), + height=700 + ) + + fig.show() + print("\n\n") + + print(":::\n") + +# Run the function +plot_vdjdb_sunburst_hierarchy(VDJdb_dir, meta) + +``` +**Figure 15: TCR Repertoire Mapping to specif gene epitopes:** Sunburst chart visualizing the relationships between T-cell receptor (TCR) sequences and their known antigen specificities from the VDJdb curated database. The central ring represents the species, branching out to their specific genes, and finally to the individual TCR sequences from your dataset that match those specificities. The size of each segment corresponds to the number of TCRs associated with that particular specificity. + +```{python} +#| output: asis +#| echo: false + +import os +import glob +import pandas as pd +import plotly.graph_objects as go + +def plot_vdjdb_sunburst_reversed(VDJdb_dir, meta_df, subject_col='subject_id'): + """ + Reads detailed VDJdb match files to create a hierarchical Sunburst chart: + TCR Sequence (Inner) -> Gene (Middle) -> Species (Outer). + """ + file_pattern = os.path.join(VDJdb_dir, '*vdjmatch.txt') + match_files = sorted(glob.glob(file_pattern)) + + if not match_files: + print(f"Error: No detailed match files found in {VDJdb_dir}") + return + + all_data = [] + + # --- 1. 
Parse and Deduplicate VDJdb Match Files --- + for file in match_files: + basename = os.path.basename(file) + sample_name = basename.replace('.vdjmatch.txt', '') + + try: + df = pd.read_csv(file, sep='\t') + + cols_to_keep = ['count', 'cdr3aa', 'antigen.species', 'antigen.gene', 'vdjdb.score'] + df = df[[c for c in cols_to_keep if c in df.columns]].copy() + df['sample'] = sample_name + + # Deduplicate: Keep highest scoring target per TCR + if 'vdjdb.score' in df.columns: + df = df.sort_values('vdjdb.score', ascending=False) + df = df.drop_duplicates(subset=['sample', 'cdr3aa']) + + all_data.append(df) + + except Exception as e: + print(f"Error reading {basename}: {e}") + + if not all_data: + print("No valid data found to plot.") + return + + combined_df = pd.concat(all_data, ignore_index=True) + + # Fill NAs to prevent hierarchy breaks + combined_df['antigen.species'] = combined_df['antigen.species'].fillna('Unknown Species') + combined_df['antigen.gene'] = combined_df['antigen.gene'].fillna('Unknown Gene') + combined_df['cdr3aa'] = combined_df['cdr3aa'].fillna('Unknown TCR') + + # --- 2. Merge with Metadata --- + plot_df = combined_df.merge( + meta_df[['sample', subject_col, timepoint_col, timepoint_order_col]], + on='sample', + how='inner' + ) + + # --- 3. 
REVERSED Hierarchy Function --- + def build_sunburst_df(df_tp): + # Level 1: TCR Level (Inner Ring) + df_tcr = df_tp.groupby('cdr3aa').agg( + total_counts=('count', 'sum') + ).reset_index() + df_tcr['id'] = df_tcr['cdr3aa'] + df_tcr['parent'] = 'Annotated Repertoire' + df_tcr['label'] = df_tcr['cdr3aa'] + df_tcr['n_unique_tcrs'] = 1 + + # Level 2: Gene Level (Middle Ring) + df_gene = df_tp.groupby(['cdr3aa', 'antigen.gene']).agg( + total_counts=('count', 'sum') + ).reset_index() + df_gene['id'] = df_gene['cdr3aa'] + '_' + df_gene['antigen.gene'] + df_gene['parent'] = df_gene['cdr3aa'] + df_gene['label'] = df_gene['antigen.gene'] + df_gene['n_unique_tcrs'] = 1 + + # Level 3: Species Level (Outer Ring) + df_species = df_tp.groupby(['cdr3aa', 'antigen.gene', 'antigen.species']).agg( + total_counts=('count', 'sum') + ).reset_index() + df_species['id'] = df_species['cdr3aa'] + '_' + df_species['antigen.gene'] + '_' + df_species['antigen.species'] + df_species['parent'] = df_species['cdr3aa'] + '_' + df_species['antigen.gene'] + df_species['label'] = df_species['antigen.species'] + df_species['n_unique_tcrs'] = 1 + + # Level 0: Root Level (Center) + df_root = pd.DataFrame([{ + 'id': 'Annotated Repertoire', + 'parent': '', + 'label': 'Annotated
Repertoire', + 'total_counts': df_tp['count'].sum(), + 'n_unique_tcrs': df_tp['cdr3aa'].nunique() + }]) + + return pd.concat([df_root, df_tcr, df_gene, df_species], ignore_index=True) + + # --- 4. Generate Quarto Tabs and Interactive Plots --- + print("::: {.panel-tabset}\n") + + subjects = sorted(plot_df[subject_col].dropna().unique()) + + for subject in subjects: + print(f"## {subject}\n") + + subj_data = plot_df[plot_df[subject_col] == subject] + timepoints = subj_data.sort_values(timepoint_order_col)[timepoint_col].unique() + + fig = go.Figure() + buttons = [] + + for i, tp in enumerate(timepoints): + tp_data = subj_data[subj_data[timepoint_col] == tp] + sb_df = build_sunburst_df(tp_data) + + fig.add_trace( + go.Sunburst( + ids=sb_df['id'], + labels=sb_df['label'], + parents=sb_df['parent'], + values=sb_df['total_counts'], + branchvalues='total', + customdata=sb_df[['n_unique_tcrs']], + hovertemplate="%{label}
Total Read Counts: %{value}
Unique TCRs: %{customdata[0]}", + visible=(i == 0), + marker=dict(colorscale='Blues') + ) + ) + + visibility = [False] * len(timepoints) + visibility[i] = True + + button = dict( + label=str(tp), + method="update", + args=[ + {"visible": visibility}, + {"title": f"Reversed Target Architecture - {subject} ({tp})"} + ] + ) + buttons.append(button) + + fig.update_layout( + updatemenus=[dict( + active=0, + buttons=buttons, + direction="down", + pad={"r": 10, "t": 10}, + showactive=True, + x=1.0, + xanchor="left", + y=1.1, + yanchor="top" + )], + title_text=f"Reversed Target Architecture - {subject} ({timepoints[0]})", + margin=dict(t=80, b=50, l=50, r=200), + height=800 # Increased height slightly to accommodate the dense inner ring + ) + + fig.show() + print("\n\n") + + print(":::\n") + +# Run the function +plot_vdjdb_sunburst_reversed(VDJdb_dir, meta) + +``` +**Figure 16: Gene Epitopes Mapping to TCR sequences:** Sunburst chart visualizing the relationships antigens and T-cell receptor (TCR) sequences. The central ring represents TCRs sequences, branching out to their specific genes, and finally to the species the antigen belongs to. Only TCRs with a Freq>2 are shown, for visualization purposes. + +The **first chart is an antigen-centric visualization.** It is structured hierarchically, starting with the broad species and branching out to specific gene epitopes and the individual TCR clonotypes that recognize them. This top-down view is ideal for gaining a **high-level understanding of the immune response**. It allows you to see **which antigens or pathogens are driving the T-cell activity in a sample and to quantify the breadth and magnitude of the response against them.** + +In contrast, the **second chart offers a TCR-centric view** by reversing the hierarchy. It starts from the individual TCR clonotype and links it to its known epitope and antigen. This **bottom-up perspective is crucial for understanding the function of individual T-cell clones**. 
By tracing a highly expanded clone, you can **identify all its known specificities, which is a key approach to studying TCR cross-reactivity.** The ability of a single TCR to recognize multiple epitopes is a critical biological concept that informs the design of targeted immunotherapies. + +Together, these two visualizations offer a complete picture. The antigen-centric chart reveals the population-level response—what the immune system is responding to—while the TCR-centric chart provides a functional interpretation of individual clones—what a specific T-cell can do. **This integrated analysis allows for a systematic and detailed investigation of the antigen-specific landscape, from the broad immune response to the functional role of a single T-cell clone.** + +::: {.callout-important title="Important"} +**A Note on Interpreting Database Matches** +In this section, we try to annotate our TCRs by matching them against public databases of TCRs with known antigen specificities. While this is a powerful step, it's critical to understand the current limitations of these databases, especially when analyzing samples. + +***The Viral Bias of Public Databases*** +Most large, well-annotated TCR databases are heavily biased toward specificities for common viral antigens (like CMV, EBV, and Influenza). This is because much of the foundational research in this area focused on infectious diseases. + +***What This Means for Your Cancer Data*** +When you search these databases with TCRs from a tumor sample, you should expect the following: + +**You WILL find matches to viral TCRs**. This is perfectly normal and reflects the **patient's lifelong immune history**. These T-cells are often **"bystanders" in the tumor** and **not directly involved in the anti-cancer response**. + +**You will likely NOT find matches for most tumor-specific TCRs**. 
TCRs that recognize tumor neoantigens are typically "private" (unique to that patient's specific cancer) and are therefore absent from public databases. + +***Why Is This Step Still Valuable? 🤔*** +- Characterize the Repertoire: It helps you identify and label a known portion of the T-cell repertoire, giving you a clearer picture of the immune context. +- Filter and Prioritize: By identifying the "public" virus-specific TCRs, you can computationally set them aside. This allows you to focus your downstream analysis on the remaining unannotated TCRs, which are more likely to contain the novel, tumor-reactive clones you're searching for. +- Positive Control: Finding common viral TCRs confirms that your sequencing and analysis pipeline is working correctly. +::: diff --git a/notebooks/template_sharing.qmd b/notebooks/template_sharing.qmd new file mode 100644 index 0000000..1667ace --- /dev/null +++ b/notebooks/template_sharing.qmd @@ -0,0 +1,863 @@ +--- +title: "TCRtoolkit Sharing Report" +format: + html: + theme: flatly + toc: true + toc_depth: 3 + code-fold: true + embed-resources: true + number-sections: true + smooth-scroll: true + grid: + body-width: 1000px + margin-width: 300px +jupyter: python3 +--- + +```{python} +#| tags: [parameters] +#| include: false + +# --------------------------------------------------------- +# BASE PARAMETERS +# --------------------------------------------------------- +workflow_cmd = '' +project_name = '' +project_dir = '' +sample_table = '' + +``` + + +```{python} +#| include: false + +# --------------------------------------------------------- +# DERIVED PATHS +# --------------------------------------------------------- + +# Define files +project_dir=f"{project_dir}" + +sample_stats_csv = f"{project_dir}/sample/sample_stats.csv" +concat_csv = f"{project_dir}/annotate/concatenated_cdr3.tsv" # f"{project_dir}/compare/concatenated_cdr3.txt" + +pgen_csv = f"{project_dir}/tcrsharing/cdr3_sharing.tsv" +giana_tsv = 
f"{project_dir}/giana/giana_RotationEncodingBL62.txt" + +# Define dirs +tcrpheno_dir = f"{project_dir}/tcrpheno/" +pseudobulk_dir = f"{project_dir}/pseudobulk/" + +``` + + +```{python} +# Load Packages +from IPython.display import Image +import os +import datetime +import sys +import pandas as pd +import math +import matplotlib.pyplot as plt +import seaborn as sns +from matplotlib.colors import LinearSegmentedColormap +import plotly.express as px +import plotly.graph_objects as go +import numpy as np +import matplotlib.ticker as ticker +from scipy.stats import gaussian_kde +import h5py +import glob +from scipy.sparse import csr_matrix +import scipy.cluster.hierarchy as sch +# import networkx as nx +import itertools +import igraph as ig +import logomaker +import io +import base64 +import json +from IPython.display import HTML, display +import warnings +import matplotlib.pyplot as plt +from upsetplot import from_contents, plot as upset_plot + + +# Print pipeline parameters + +# print('Project Name: ' + project_name) +# print('Workflow command: ' + workflow_cmd) +# print('Pipeline Directory: ' + project_dir) +# print('Date and time: ' + str(datetime.datetime.now())) + +# - Loading data + +# Load metadata +## Reading sample metadata +meta = pd.read_csv(sample_table, sep=',') + +## Reading concatenated cdr3 file +concat_df = pd.read_csv(concat_csv, sep='\t') + +# Reading compare outputs +prob_generation_df = pd.read_csv(pgen_csv, sep='\t') +giana_df = pd.read_csv(giana_tsv, comment='#', sep='\t') + +``` + + +## TCR Publicity {#sec-publicity} + +In the discovery brief notebook, we extracted the specific amino acid sequences of the top "public" clones shared across multiple individuals. However, we can also map the extent of TCR sharing directly against their clonal frequency sizes. + +This plot provides a powerful, two-dimensional view of your T-cell repertoire, helping you move beyond simple clone lists to understand the functional landscape of the immune response. 
The central question it helps answer is: **Are the T-cell clones found across many different samples (widely shared) also the ones that are most active and expanded?**

This plot is generated by first calculating the frequency of each unique TCR clone within every sample from the raw data. For each clone, two key metrics are then determined: the total number of samples it appears in (its sharing level) and its single highest frequency across all of those samples (its maximum expansion). **Clones are then categorized as 'Highly Expanded' (>1%), 'Expanded' (0.1%-1%), or 'Non-expanded' (<0.1%) based on this maximum frequency value (see sample Notebook for a detailed explanation)**. The final stacked bar plot visualizes the count of unique TCRs for each sharing level on the x-axis, with the colored segments revealing the proportion of clones that fall into each expansion category.

**A T-cell clone that is both widely shared and highly expanded suggests it recognizes a common and potent antigen present in multiple individuals, such as a shared tumor antigen.** You can quickly generate hypotheses about the nature of the immune responses within your cohort, distinguishing between powerful shared responses and potent individual-specific ones. Identifying these clones is critical as they represent key players in a common immune response and are high-priority targets for further research.

```{python}
#| label: fig-tcr-sharing
#| fig-cap: "**TCR Sharing by Maximum Clonal Expansion.** Number of samples where a TCR has an exact match on the amino acid level. Color represents clonal expansion category using the highest frequency across all samples."

# --- 1. Load and Process Actual Data ---
# We assume 'concat_df' has columns ['CDR3b', 'counts', 'sample']
# We assume 'meta' has columns ['sample', subject_col]
raw_df = concat_df.copy()

# Rename columns to standard internal names
raw_df.rename(columns={'CDR3b': 'cdr3_sequence', 'sample': 'sample_id'}, inplace=True)

# Calculate frequency per SAMPLE first (Expansion is a property of a specific physical sample)
sample_total_counts = raw_df.groupby('sample_id')['counts'].transform('sum')
raw_df['frequency'] = raw_df['counts'] / sample_total_counts

# --- CRITICAL STEP: Merge with Metadata to get Patient IDs ---
# We merge on 'sample_id' (which matches 'sample' in meta)
raw_df = pd.merge(raw_df, meta[['sample', subject_col]], left_on='sample_id', right_on='sample', how='left')

# --- 2. Process Data to Get Summary Metrics (BY PATIENT) ---
# We group by TCR and calculate:
# 1. How many unique PATIENTS (subject_id) have this TCR?
# 2. What is the maximum frequency this TCR reached in ANY sample?
summary_df = raw_df.groupby('cdr3_sequence').agg(
    total_patients=(subject_col, 'nunique'),
    max_frequency=('frequency', 'max')
).reset_index()

# --- 3. Define Clonal Expansion Categories ---
# NOTE(review): three bins require three labels; the middle label/color were
# lost to tag stripping in a previous edit and are reconstructed here —
# pd.cut would raise ValueError with only two labels for three intervals.
bins = [0, 0.001, 0.01, np.inf]
labels = ['Non-Frequent (Freq<0.1%)', 'Frequent (0.1<Freq<1%)', 'Highly Frequent (Freq>1%)']
summary_df['expansion_category'] = pd.cut(summary_df['max_frequency'], bins=bins, labels=labels, right=True)

# --- 4. Prepare Data for Plotting ---
# We now group by 'total_patients' instead of 'total_samples'
plot_data = summary_df.groupby(['total_patients', 'expansion_category'], observed=True).size().reset_index(name='tcr_count')
pivoted_data = plot_data.pivot(index='total_patients', columns='expansion_category', values='tcr_count').fillna(0)

# Reorder columns to match the logical progression of expansion
if not all(label in pivoted_data.columns for label in labels):
    for label in labels:
        if label not in pivoted_data.columns:
            pivoted_data[label] = 0
pivoted_data = pivoted_data[labels]

# --- 5. Create the Interactive Stacked Bar Plot with Plotly ---
fig = go.Figure()

# Define a color palette
colors = {
    'Non-Frequent (Freq<0.1%)': '#3498db',   # Blue
    'Frequent (0.1<Freq<1%)': '#f39c12',     # Orange (reconstructed — TODO confirm original colour)
    'Highly Frequent (Freq>1%)': '#e74c3c'   # Red
}

# Add a bar trace for each expansion category
for phenotype in pivoted_data.columns:
    fig.add_trace(go.Bar(
        x=pivoted_data.index,
        y=pivoted_data[phenotype],
        name=phenotype,
        marker_color=colors[phenotype],
        hovertemplate=(
            "Patient Sharing: %{x} Patients<br>" +
            "Expansion: " + phenotype + "<br>" +
            "TCR Count: %{y}"
        )
    ))

# --- 6. Customize the Plot Layout ---
fig.update_layout(
    barmode='stack',
    xaxis_title='Number of Patients Sharing TCR',
    yaxis_title='Number of Unique TCRs (log scale)',
    yaxis_type='log',
    legend_title='Max Expansion Category',
    template='plotly_white',
    # Ensure x-axis ticks are integers (you can't share with 1.5 patients)
    xaxis=dict(dtick=1)
)

# Handle the log scale y-axis range robustly
# We add a small buffer so bars with count=1 are visible
max_val = pivoted_data.sum(axis=1).max()
if max_val > 0:
    fig.update_yaxes(range=[np.log10(0.8), np.log10(max_val * 1.5)])

fig.show()

```

::: {.callout-warning title="Warning"}
Be mindful that the y-axis of this plot uses a logarithmic scale, this visual compression means the height of the bars does not scale linearly with their value. A bar representing 1,000 clones will not appear ten times taller than a bar for 100 clones. **Always hover over the plot segments to see the precise numerical counts, especially when comparing categories.**
:::

## Distinguishing Convergent Selection from Recombination Bias

Identifying "public" clonotypes shared across multiple individuals is a standard approach for discovering potentially antigen-specific TCRs. However, overlapping sequences do not implicitly confirm a shared immune response. Due to inherent structural biases in V(D)J recombination, certain amino acid sequences are generated with remarkably high frequencies across the population by chance alone.
To isolate true convergent, antigen-driven selection from baseline overlap, it is necessary to evaluate the sharing of public clones in the context of their Generation Probability ($P_{gen}$).
By mapping the number of individuals sharing a specific TCR against its theoretical generation probability, we can stratify public clones into two distinct biological categories:

- **High-Probability Overlap (Right side):** Sequences with high $\log_{10} P_{gen}$ values (e.g., $-5$) are structurally favored during recombination. Their presence in multiple patients is statistically expected and often represents baseline population overlap rather than a targeted response to a specific disease or therapy.
- **Convergent Selection (Left side):** Sequences shared across multiple patients that possess exceptionally low $\log_{10} P_{gen}$ values (e.g., $< -15$) represent highly improbable recombination events. The independent generation and expansion of these rare sequences in different individuals provides strong evidence of convergent immune selection driven by a common antigen.

Integrating $P_{gen}$ into public clone analysis effectively filters out recombination noise, allowing you to prioritize high-confidence candidates for downstream validation.

```{python}
#| label: fig-tcr-sharing-2
#| fig-cap: "**TCR sharing and generation probability correlation.** Relationship between the likelihood of a T-cell receptor (TCR) being generated (generation probability, Pgen) and the number of samples it is shared across."

import numpy as np
import pandas as pd
import plotly.express as px

# --- 1. Data Processing: Calculate Patient Sharing & Hover Info ---

# Merge TCR data with Metadata to link Samples -> Subject IDs and Timepoints
merged_df = pd.merge(concat_df, meta[['sample', subject_col, timepoint_col]], on='sample', how='left')

# Group by TCR (CDR3b) and aggregate the exact data we need for plotting and hovering
agg_df = merged_df.groupby('CDR3b').agg(
    total_patients=(subject_col, 'nunique'),
    individuals=(subject_col, lambda x: ', '.join(sorted(set(x.dropna().astype(str))))),
    timepoints=(timepoint_col, lambda x: ', '.join(sorted(set(x.dropna().astype(str)))))
).reset_index()

# Merge this with your existing generation probability dataframe
plot_df = pd.merge(prob_generation_df[['CDR3b', 'pgen']], agg_df, on='CDR3b', how='inner')

# Calculate the log10 of the generation probability
plot_df['log10_pgen'] = np.log10(plot_df['pgen'])


# --- 2. Interactive Scatter Plot Code (Plotly) ---

fig = px.scatter(
    plot_df,
    x='log10_pgen',
    y='total_patients',
    hover_name='CDR3b',  # Puts the sequence at the top of the tooltip in bold
    hover_data={
        'log10_pgen': ':.2f',     # Format to 2 decimal places
        'total_patients': False,  # Hide this because it's already obvious from the Y-axis
        'individuals': True,      # Show the list of individuals
        'timepoints': True,       # Show the list of timepoints
        'pgen': False             # Hide raw pgen to keep the tooltip clean
    },
    labels={
        'log10_pgen': 'log10(Generation Probability)',
        'individuals': 'Individual(s)',
        'timepoints': 'Timepoints'
    },
    title='Shared TCRs (by Patient) vs log10(Generation Probability)',
    opacity=0.6,
    color_discrete_sequence=['blue']
)

# Force the layout to match standard scientific styling (white background, integer Y-ticks)
fig.update_layout(
    plot_bgcolor='white',
    yaxis=dict(
        title='Number of Shared Patients',
        tickmode='linear',
        tick0=1,
        dtick=1,  # Forces ticks to be 1, 2, 3... (you can't share a clone with 1.5 patients)
        showgrid=True,
        gridcolor='lightgrey'
    ),
    xaxis=dict(
        showgrid=True,
        gridcolor='lightgrey'
    ),
    # Add a clean border
    margin=dict(l=20, r=20, t=50, b=20)
)

fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True)

fig.show()


```


## Similar TCR sequences across all samples {#sec-GIANA}

### GIANA TCR clusters composition

In the "Discovery Brief" notebook, we introduced the highest-abundance GIANA clusters, providing a top-level summary table that detailed the specific individuals, timepoints, and constituent TCR sequences for the most dominant metaclones.

However, simply identifying the existence of these top clusters is only the first step. To fully understand the architecture of the convergent immune response, we must look beyond the top hits and examine the internal composition of the entire clustered network.

```{python}
#| label: giana-clusters
#| fig-cap: "**GIANA clusters found in each sample/timepoint.** Sunburst chart displays the hierarchical contribution of T-cell receptor (TCR) clusters across all samples. The central circle represents the entire dataset, the middle ring breaks down the data by individual sample, and the outer ring shows the specific clusters within each sample. The size and color of each segment correspond to the total number of TCRs, and you can hover over any section to see the exact counts of both total and unique TCRs"

giana_tsv = project_dir + "/giana/giana_RotationEncodingBL62.txt"
giana_df = pd.read_csv(giana_tsv, comment='#', sep='\t')
giana_df = giana_df.merge(meta[['sample', subject_col, timepoint_col, alias_col]], on='sample', how='left')

# Safety catch: Plotly hierarchy will break if parent/id strings are NaN
giana_df = giana_df.dropna(subset=[subject_col])

# --- 1.
# --- 1. Manually Prepare Data for Each Hierarchy Level ---

# Level 2: Cluster Level (Innermost Ring)
# Group by alias_col instead of 'sample'
df_cluster = giana_df.groupby([alias_col, 'cluster']).agg(
    total_counts=('counts', 'sum'),
    n_unique_tcrs=('CDR3b', 'nunique')
).reset_index()

# Create a unique ID for each cluster to prevent clashes using the alias
df_cluster['id'] = df_cluster[alias_col] + '_' + df_cluster['cluster'].astype(str)
df_cluster['parent'] = df_cluster[alias_col]
df_cluster['label'] = df_cluster['cluster'].astype(str)


# Level 1: Alias Level (Middle Ring)
# Aggregate the data up to the alias level
df_sample = giana_df.groupby(alias_col).agg(
    total_counts=('counts', 'sum'),
    n_unique_tcrs=('CDR3b', 'nunique')
).reset_index()

df_sample['id'] = df_sample[alias_col]
df_sample['parent'] = 'All Samples'  # The parent of each alias is the root
df_sample['label'] = df_sample[alias_col]


# Level 0: Root Level (Center)
# This is the top-level aggregation for the entire dataset.
total_counts_root = giana_df['counts'].sum()
unique_tcrs_root = giana_df['CDR3b'].nunique()

df_root = pd.DataFrame({
    'id': ['All Samples'],
    'parent': [''],  # The root has no parent
    'label': ['All Samples'],
    'total_counts': [total_counts_root],
    'n_unique_tcrs': [unique_tcrs_root]
})

# --- 2. Combine all levels into a single DataFrame for Plotly ---
sunburst_df = pd.concat([
    df_root,
    df_sample,
    df_cluster
])


# --- 3. Create the Sunburst Chart using plotly.graph_objects ---
# NOTE(review): '<br>' separators restored — the previous tag stripping left
# this string literal broken across physical lines (a syntax error).
fig = go.Figure(go.Sunburst(
    ids=sunburst_df['id'],
    labels=sunburst_df['label'],
    parents=sunburst_df['parent'],
    values=sunburst_df['total_counts'],
    branchvalues='total',
    customdata=sunburst_df[['n_unique_tcrs']],
    hovertemplate="%{label}<br>Total TCRs: %{value}<br>Unique TCRs: %{customdata[0]}",
    marker=dict(
        colors=sunburst_df['total_counts'],
        colorscale='Bluyl',
        colorbar=dict(
            title="Total TCR Counts",
            lenmode="fraction",
            len=0.75,
            thickness=20,
            x=0.95,
            y=0.5,
            title_side="right"
        )
    )
))

# --- 4. Customize Layout ---
fig.update_layout(
    title_text="TCR Cluster Contribution by Sample",
    font_size=16,
    title_font_size=24,
    title_x=0.5,
    margin=dict(t=50, l=0, r=0, b=0)
)

fig.show()


```

```{python}
#| label: giana-clusters-2
#| fig-cap: "**Samples/timepoints found in each GIANA cluster.** Sunburst chart providing a 'cluster-centric' view of the TCR repertoire, designed to show how individual samples contribute to each shared immune response. The hierarchy flows from the central root ('All Clusters') to the middle ring (individual TCR clusters) and finally to the outer ring (the samples that make up each cluster). The size and color of each segment reflect the total TCR counts, and you can hover over any section to see the precise numbers for both total and unique TCRs."

# --- 1. Prepare Data for the Hierarchy (Root -> Cluster -> Alias) ---

# Level 2: Alias Level (Outer Ring)
# This is the most detailed level, showing which aliases are in each cluster.
df_alias_level = (giana_df.groupby(['cluster', alias_col])
                  .agg(total_counts=('counts', 'sum'), n_unique_tcrs=('CDR3b', 'nunique'))
                  .reset_index()
                  .assign(id=lambda x: x['cluster'].astype(str) + '_' + x[alias_col],
                          parent=lambda x: x['cluster'].astype(str),
                          label=lambda x: x[alias_col]))

# Level 1: Cluster Level (Middle Ring)
# Aggregate the data up to the cluster level.
df_cluster_level = (giana_df.groupby('cluster')
                    .agg(total_counts=('counts', 'sum'), n_unique_tcrs=('CDR3b', 'nunique'))
                    .reset_index()
                    .assign(id=lambda x: x['cluster'].astype(str),
                            parent='All Clusters',
                            label=lambda x: x['cluster'].astype(str)))

# Level 0: Root Level (Center)
# This is the top-level aggregation for the entire dataset.
df_root = pd.DataFrame([{
    'id': 'All Clusters',
    'parent': '',
    'label': 'All Clusters',
    'total_counts': giana_df['counts'].sum(),
    'n_unique_tcrs': giana_df['CDR3b'].nunique()
}])

# --- 2. Combine all levels into a single DataFrame ---
sunburst_df = pd.concat([df_root, df_cluster_level, df_alias_level])

# --- 3. Create the Sunburst Chart ---
# NOTE(review): '<br>' separators restored — the previous tag stripping left
# this string literal broken across physical lines (a syntax error).
fig = go.Figure(go.Sunburst(
    ids=sunburst_df['id'],
    labels=sunburst_df['label'],
    parents=sunburst_df['parent'],
    values=sunburst_df['total_counts'],
    branchvalues='total',
    customdata=sunburst_df[['n_unique_tcrs']],
    hovertemplate="%{label}<br>Total TCRs: %{value}<br>Unique TCRs: %{customdata[0]}",
    marker=dict(
        colors=sunburst_df['total_counts'],
        colorscale='Bluyl',
        colorbar=dict(
            title="Total TCR Counts",
            lenmode="fraction",
            len=0.75,
            thickness=20,
            x=0.95,
            y=0.5,
            title_side="right"
        )
    )
))

# --- 4. Customize Layout ---
fig.update_layout(
    title_text="Sample Contribution by TCR Cluster",
    font_size=16,
    title_font_size=24,
    title_x=0.5,
    margin=dict(t=50, l=0, r=0, b=0)
)

fig.show()

```


**Biological Insight**
The previous visualizations offer two complementary perspectives on the TCR data. **The first plot highlights individual samples**, showing all the TCR clusters they contain. This allows you to appreciate the total number of TCRs per cluster and the sum of clone sizes. **The second visualization**, the sunburst chart, **emphasizes the clusters themselves**, revealing which samples contribute to each one. This is crucial for identifying clusters of TCRs with similar sequences that are shared between patients.

The clone size shown in both visualizations is a key piece of information. **Since clonal expansion is associated with antigen recognition, a larger clone size suggests a more significant immune response.** This insight helps you better interpret the biological relevance of each cluster.

To get the most out of this plot, it is recommended to compare samples across different conditions, such as "pre-" and "post-treatment." This way, you can directly visualize how a therapy impacts the immune response. Look for new or expanding clusters in your post-treatment samples, as this provides strong evidence of treatment-induced immune activation and helps you pinpoint the exact TCRs driving the therapeutic effect.

### GIANA clusters over time

Now, we shift our focus to the intra-patient dynamics to understand how the TCR repertoire evolves over time within a single individual.
+ +The UpSet plot below visualizes the intersection of GIANA clusters across multiple timepoints. + +```{python} +#| output: asis +#| echo: false +#| warning: false + +# Get list of unique patients +patients = giana_df[subject_col].unique() + +# 1. Pre-filter for patients that actually have multiple timepoints +valid_patients = [] +for patient in patients: + pat_df = giana_df[giana_df[subject_col] == patient] + clusters_by_timepoint = pat_df.groupby(timepoint_col)['cluster'].apply(set) + if len(clusters_by_timepoint) >= 2: + valid_patients.append((patient, clusters_by_timepoint)) + +# 2. Conditionally generate the tabset +if valid_patients: + print("::: {.panel-tabset}\n") + + for patient, clusters_by_timepoint in valid_patients: + print(f"## {patient}\n") + + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + fig = plt.figure(figsize=(10, 5)) + + try: + upset_data = from_contents(clusters_by_timepoint) + upset_plot(upset_data, show_counts=True, sort_by='cardinality', fig=fig) + plt.suptitle(f"Cluster Sharing over Time: {patient}", fontsize=14) + plt.show() + except Exception as e: + print(f"\n*Could not generate plot for {patient}: {e}*\n") + + plt.close(fig) + print("\n") + + print(":::\n") +``` +**Longitudinal tracking of shared TCR clusters in patients.** This UpSet plot visualizes the intersection of GIANA clusters across distinct timepoints. The horizontal bar chart (bottom left) displays the total number of unique clusters identified at each timepoint. The vertical bar chart (top) represents the size of each intersection. The connectivity matrix (bottom dots) indicates which timepoints are included in each intersection. + +### Metaclonotype enrichment between timepoints (e.g. pre- and post-treatment) + +Visualizing the enrichment of metaclonotypes post-treatment is crucial because it provides direct evidence that a therapy is successfully activating the immune system against a disease like cancer. 
+ +These expanding clusters represent the specific T-cell "armies" that have been mobilized and are proliferating to attack their target. Identifying these enriched metaclonotypes is the most direct way to discover the exact, functionally relevant T-cells that are driving a patient's response. This knowledge is vital for developing biomarkers to predict treatment success and for isolating potent TCR sequences to create next-generation personalized immunotherapies. + +```{python} +#| output: asis +#| echo: false +#| label: giana-clusters-4 +#| fig-cap: "**GIANA cluster enrichment** Scatter plot comparing the abundance of each T-cell receptor (TCR) metaclonotype between timepoints for an individual patient. Each point represents a single metaclonotype, with its position determined by its abundance on a log scale at each timepoint." + +import itertools +import plotly.graph_objects as go +import pandas as pd + +def create_patient_tabs_abundance_scatter(df): + """ + Generates Quarto tabs per patient containing a normalized scatter plot. + Uses a slider to navigate through timepoint comparisons. + """ + df = df.copy() + df[timepoint_col] = df[timepoint_col].astype(str) + + # Start Quarto tabset + print("::: {.panel-tabset}\n") + + all_patients = sorted(df[subject_col].dropna().unique()) + + for patient_id in all_patients: + print(f"## {patient_id}\n") + + patient_df = df[df[subject_col] == patient_id] + + # A. Calculate Total Depth per Timepoint for Normalization + sample_depths = patient_df.groupby(timepoint_col)['counts'].sum() + + # B. Pivot: Raw Counts + patient_pivot = patient_df.pivot_table( + index='cluster', + columns=timepoint_col, + values='counts', + aggfunc='sum' + ).fillna(0) + + # C. 
Normalize to CPM (Counts Per Million) + patient_cpm = patient_pivot.div(sample_depths, axis=1) * 1e6 + patient_cpm = patient_cpm.fillna(0) + + timepoints = patient_cpm.columns.tolist() + + # Skip if patient doesn't have at least 2 timepoints to compare + if len(timepoints) < 2: + print(f"*(Not enough timepoints to generate dynamics plot for {patient_id})*\n\n") + continue + + fig = go.Figure() + plot_metadata = [] + timepoint_pairs = list(itertools.combinations(timepoints, 2)) + + # --- Generate Traces for this Patient --- + for tp1, tp2 in timepoint_pairs: + relevant_clusters_df = patient_cpm[ + (patient_cpm[tp1] > 0) | (patient_cpm[tp2] > 0) + ] + + if relevant_clusters_df.empty: + continue + + # Log Transformation with Pseudocount + x_data = relevant_clusters_df[tp1] + 1 + y_data = relevant_clusters_df[tp2] + 1 + + trace_name = f"{tp1} vs {tp2}" + + fig.add_trace( + go.Scatter( + x=x_data, y=y_data, mode='markers', name=trace_name, + text=relevant_clusters_df.index, + hovertemplate=( + f'Cluster: %{{text}}
' + f'{tp1}: %{{x:.2f}} CPM (adj)
' + f'{tp2}: %{{y:.2f}} CPM (adj)' + '' + ), + visible=False, + marker=dict(opacity=0.6, size=8, color='#1f77b4') + ) + ) + + max_val = max(x_data.max(), y_data.max()) + plot_metadata.append({ + 'tp1': tp1, + 'tp2': tp2, + 'trace_index': len(fig.data) - 1, + 'max_val': max_val, + 'label': f"{tp1} vs {tp2}" + }) + + if not fig.data: + print(f"*(No overlapping cluster data found for {patient_id})*\n\n") + continue + + # --- Build Slider for this Patient --- + steps = [] + for meta in plot_metadata: + visible_array = [False] * len(fig.data) + visible_array[meta['trace_index']] = True + + shape = dict( + type='line', x0=1, y0=1, x1=meta['max_val'], y1=meta['max_val'], + line=dict(color='grey', dash='dash', width=1) + ) + + step = dict( + method="update", + args=[ + {"visible": visible_array}, + { + "title.text": f"Cluster Dynamics - {patient_id}: {meta['tp1']} vs {meta['tp2']}", + "xaxis.title": f"{meta['tp1']} [Log(CPM+1)]", + "yaxis.title": f"{meta['tp2']} [Log(CPM+1)]", + "shapes": [shape] + } + ], + label=meta['label'] + ) + steps.append(step) + + slider = [dict( + active=0, + currentvalue={"prefix": "Comparison: "}, + pad={"t": 50}, + steps=steps + )] + + # --- Initialize First View --- + first_meta = plot_metadata[0] + fig.data[first_meta['trace_index']].visible = True + + initial_shape = dict( + type='line', x0=1, y0=1, x1=first_meta['max_val'], y1=first_meta['max_val'], + line=dict(color='grey', dash='dash', width=1) + ) + + # --- Final Layout Update --- + fig.update_layout( + sliders=slider, + title_text=f"Cluster Dynamics - {patient_id}: {first_meta['tp1']} vs {first_meta['tp2']}", + xaxis_title=f"{first_meta['tp1']} [Log(CPM+1)]", + yaxis_title=f"{first_meta['tp2']} [Log(CPM+1)]", + xaxis_type="log", + yaxis_type="log", + showlegend=False, + shapes=[initial_shape], + margin=dict(b=100, t=100, l=50, r=50), + height=600, + plot_bgcolor='white' + ) + + # Add clean axis lines and grid + fig.update_xaxes(showline=True, linewidth=1, linecolor='black', mirror=True, 
gridcolor='lightgrey') + fig.update_yaxes(showline=True, linewidth=1, linecolor='black', mirror=True, gridcolor='lightgrey') + + fig.show() + print("\n\n") + + # End Quarto tabset + print(":::\n") + +# Run +giana_df = giana_df.dropna(subset=[subject_col]) +create_patient_tabs_abundance_scatter(giana_df) + +``` + +**Understand the plot** +The plot shows the abundance of metaclones at two given timepoints. + +**Abundance** is calculated as Log(CPM + 1). +**Counts Per Million (CPM):** The counts for all TCRs within a specific cluster are summed, divided by the total counts in that sample.timepoint, and multiplied by 1,000,000. This normalizes the data, allowing for fair comparisons across samples with different sequencing depths. + +**The Pseudo-count (+1):** A pseudo-count of 1 is added to every CPM value before plotting on a logarithmic scale. This is a mathematical necessity to handle clusters that have zero counts at a given timepoint (since the log of 0 is undefined). + +**How to Interpret the Plot** +Each point on the plot represents a single metaclonotype. **The diagonal line (y=x) is the baseline; any cluster falling on this line maintained a stable relative abundance between the two timepoints.** + +- **Above the Diagonal:** Clusters that expanded (increased in relative abundance) by the second timepoint. +- **Below the Diagonal:** Clusters that contracted (decreased in relative abundance) by the second timepoint. +- **The Far-Left Boundary ($X = 1$):** Clusters that were completely absent at the first timepoint but emerged by the second timepoint. +- **The Bottom Boundary ($Y = 1$):** Clusters that were present at the first timepoint but completely vanished (zero counts) by the second timepoint. + +**What This Represents in a Cancer Context** +In the context of cancer immunotherapy (like checkpoint inhibitors or cancer vaccines), the enrichment of specific metaclonotypes post-treatment is a critical finding. 
**It provides strong evidence that the therapy was successful in stimulating an anti-tumor immune response.** + +These expanding clusters represent the T-cell "armies" that have been activated and are proliferating to recognize and attack cancer cells. Identifying these treatment-responsive metaclonotypes is of immense importance because: + +- They are likely tumor-reactive: These are the T-cells that are doing the work of fighting the cancer. + +- They are potential biomarkers: The magnitude of this expansion could correlate with a patient's clinical outcome, serving as a biomarker to predict who will respond to therapy. + +- They are candidates for next-generation therapies: The TCR sequences within these expanding clusters can be isolated and used to develop highly targeted and personalized TCR-T cell therapies. 🔬 + +**Visualizing this enrichment is not just a quality check; it is a discovery tool. It allows you to directly identify the most biologically active and therapeutically relevant T-cells that are driving a patient's response to cancer treatment.** + +### Public and Private Immunity at the TCR cluster Level +This Upset plot offers a powerful, high-level view of how immune responses are shared across patients. Crucially, it operates at the metaclonotype level, where each cluster represents a group of functionally similar T-cell receptors (TCRs), rather than just showing the sharing of exact, identical sequences. This approach provides a more biologically relevant picture of shared immunity, as TCRs with slight variations can still recognize the same antigen. + +```{python} +#| warning: false +#| label: giana-clusters-5 +#| fig-cap: "**Upset plot visualizing sharing of TCR GIANA clusters across the patient cohort.** The top vertical bars represent the number of clusters shared exclusively by the group of patients identified by the connected dots in the matrix below. 
The horizontal bars on the left show the total number of unique metaclonotypes found in each individual patient, providing an overview of both public and private immune repertoires." + +# Group by Patient (Subject) +clusters_by_patient = giana_df.groupby(subject_col)['cluster'].apply(set) + +# Filter out patients with empty sets if necessary +clusters_by_patient = clusters_by_patient[clusters_by_patient.apply(len) > 0] + +with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + + # Generate Upset Data + upset_data = from_contents(clusters_by_patient) + + # Plot + upset_plot(upset_data, show_counts=True, sort_by='cardinality') + plt.suptitle("Public Clusters: Shared Across Patients", fontsize=16) + plt.show() +``` + +**What Biological Information Can You Obtain? 🔬** +By analyzing shared metaclonotypes, you can identify patterns of shared functional immunity. + +**Identifying a Public Functional Response:** Look for tall bars with multiple connected dots. This signifies that a significant number of metaclonotypes are shared across those patients. These represent a "public" immune response, where different individuals have TCRs that are functionally equipped to recognize the same common and important antigens, like those from a virus or a shared tumor type. This is the first step toward discovering functionally relevant biomarkers or developing targeted immunotherapies. + +**Characterizing the Private Functional Repertoire:** The bars corresponding to single dots highlight private immune responses. A patient with a large number of unique metaclonotypes has a diverse and personalized functional repertoire, possibly targeting patient-specific neoantigens. + + +```{python} +#| label: giana-clusters-6 + +import plotly.graph_objects as go +import pandas as pd + +# --- 1. Identify Shared Clusters --- +patient_counts_per_cluster = giana_df.groupby('cluster')[subject_col].nunique() + +# Filter to get only the clusters shared by more than one patient. 
+shared_cluster_ids = patient_counts_per_cluster[patient_counts_per_cluster > 1].index +shared_clusters_df = giana_df[giana_df['cluster'].isin(shared_cluster_ids)].copy() + +# --- 2. Format Data for the Table (Timepoint + Sequence) --- +shared_clusters_df['display_info'] = ( + shared_clusters_df[timepoint_col].astype(str) + ': ' + shared_clusters_df['CDR3b'] +) + +aggregated_df = shared_clusters_df.groupby(['cluster', subject_col])['display_info'].apply( + lambda x: '<br>'.join(x) +).reset_index() + +# --- 3. Pivot to Wide Format (One Column Per Patient) --- +wide_table = aggregated_df.pivot(index='cluster', columns=subject_col, values='display_info') + +# --- 4. Add Counts and Sort --- +wide_table.insert(0, 'Number of Patients', patient_counts_per_cluster[shared_cluster_ids]) +wide_table.sort_values(by='Number of Patients', ascending=False, inplace=True) +wide_table.fillna('-', inplace=True) +wide_table.reset_index(inplace=True) +wide_table.rename(columns={'cluster': 'Cluster ID'}, inplace=True) + + +# --- 5. Create the Interactive Plotly Table --- +def create_interactive_cluster_table(df): + """Creates an interactive Plotly table from the shared cluster data.""" + + # Separate standard columns from patient columns to apply widths dynamically + patient_cols = [c for c in df.columns if c not in ['Cluster ID', 'Number of Patients']] + + # Match the 180 and 140 pixel widths from your target format + col_widths = [180, 140] + [180] * len(patient_cols) + total_table_width = sum(col_widths) + + fig = go.Figure(data=[go.Table( + columnwidth=col_widths, + header=dict( + values=list(df.columns), + fill_color='paleturquoise', + align='left', + font=dict(size=12, color='black', weight='bold'), + line_color='darkslategray' + ), + cells=dict( + values=[df[col] for col in df.columns], + fill_color='lavender', # Solid color to match your example + align='left', + font=dict(size=11, color='black'), + line_color='darkslategray' + ) + )]) + + # Update layout to match your target margins and width + fig.update_layout( + title_text="TCR Clusters Shared Across Patients (with Timepoints)", + title_x=0.5, + margin=dict(l=10, r=10, t=50, b=10), + width=max(total_table_width, 800) + ) + + return fig + +# Generate and display +interactive_table_fig = create_interactive_cluster_table(wide_table) +interactive_table_fig.show() + +``` +**Table. 
TCR GIANA clusters shared across more than one patient.** The clusters are sorted in descending order based on the number of patients they are found in. For each cluster, the table details the specific patients it is present in and lists the unique TCR CDR3 sequences belonging to that cluster within each patient. + +Each row in this table details a single T-cell receptor (TCR) metaclonotype (cluster) that is part of the public immune repertoire. This table of shared metaclonotypes is a launchpad for translational research, providing a **direct list of high-priority T-cell receptors (TCRs) that are likely involved in a common immune response.** + + diff --git a/subworkflows/local/compare.nf b/subworkflows/local/compare.nf index 9e2453e..073f9f1 100644 --- a/subworkflows/local/compare.nf +++ b/subworkflows/local/compare.nf @@ -33,4 +33,12 @@ workflow COMPARE { TCRSHARING_SCATTERPLOT( TCRSHARING_CALC.out.shared_cdr3 ) + + emit: + // Emits the absolute output directory path once compare-level results are ready + // for reporting. Collects from both final plotting processes to ensure both complete. + outdir = TCRSHARING_HISTOGRAM.out + .mix(TCRSHARING_SCATTERPLOT.out) + .collect() + .map { _ -> file(params.outdir).toAbsolutePath().toString() } } \ No newline at end of file diff --git a/subworkflows/local/patient.nf b/subworkflows/local/patient.nf index d8a55d3..b1b71ce 100644 --- a/subworkflows/local/patient.nf +++ b/subworkflows/local/patient.nf @@ -50,5 +50,18 @@ workflow PATIENT { GLIPH2_TURBOGLIPH( PATIENT_CONCATENATE.out.patient_cdr3 ) + ch_gliph2_outdir = GLIPH2_TURBOGLIPH.out.all_motifs + .first() + .map { _ -> file(params.outdir).toAbsolutePath().toString() } + } else { + ch_gliph2_outdir = Channel.empty() } + + emit: + // Emits the absolute output directory path once patient-level results are ready. 
+ patient_outdir = PATIENT_CALC.out.jaccard_mat + .first() + .map { _ -> file(params.outdir).toAbsolutePath().toString() } + // Emits only when use_gliph2 = true; empty channel otherwise. + gliph2_outdir = ch_gliph2_outdir } \ No newline at end of file diff --git a/subworkflows/local/report.nf b/subworkflows/local/report.nf new file mode 100644 index 0000000..2abb175 --- /dev/null +++ b/subworkflows/local/report.nf @@ -0,0 +1,31 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +include { RENDER_NOTEBOOK } from '../../modules/local/report/render_notebook' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + RUN MAIN SUBWORKFLOW +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// Single-channel listener: receives tuples of (report_name, notebook_path, data_dir) +// and renders each notebook to HTML. 
+workflow REPORT { + + take: + ch_reports // channel of tuples: tuple(val report_name, path notebook, val data_dir) + + main: + RENDER_NOTEBOOK( + ch_reports, + params.project_name, + workflow.commandLine + ) + + emit: + reports = RENDER_NOTEBOOK.out +} diff --git a/subworkflows/local/sample.nf b/subworkflows/local/sample.nf index 1454d1f..46287c7 100644 --- a/subworkflows/local/sample.nf +++ b/subworkflows/local/sample.nf @@ -6,7 +6,6 @@ */ include { SAMPLE_CALC } from '../../modules/local/sample/sample_calc' -include { SAMPLE_PLOT } from '../../modules/local/sample/sample_plot' include { SAMPLE_AGGREGATE as SAMPLE_AGG_STAT } from '../../modules/local/sample/sample_aggregate' include { SAMPLE_AGGREGATE as SAMPLE_AGG_V } from '../../modules/local/sample/sample_aggregate' include { SAMPLE_AGGREGATE as SAMPLE_AGG_D } from '../../modules/local/sample/sample_aggregate' @@ -46,15 +45,6 @@ workflow SAMPLE { SAMPLE_AGG_D(d_family_csv_files, "d_family.csv") SAMPLE_AGG_J(j_family_csv_files, "j_family.csv") - /////// =================== PLOT SAMPLE =================== /////// - - SAMPLE_PLOT ( - file(params.samplesheet), - file(params.sample_stats_template), - SAMPLE_AGG_STAT.out.aggregated_csv, - SAMPLE_AGG_V.out.aggregated_csv - ) - TCRDIST3_MATRIX( sample_map, params.matrix_sparsity, @@ -111,4 +101,9 @@ workflow SAMPLE { VDJDB_VDJMATCH (sample_map, VDJDB_GET.out.ref_db) + emit: + // Emits the absolute output directory path once SAMPLE_AGG_STAT completes, + // signalling that sample-level results are ready for reporting. 
+ outdir = SAMPLE_AGG_STAT.out.aggregated_csv.map { _ -> file(params.outdir).toAbsolutePath().toString() } + } \ No newline at end of file diff --git a/tests/modules/local/report/render_notebook.nf.test b/tests/modules/local/report/render_notebook.nf.test new file mode 100644 index 0000000..598137b --- /dev/null +++ b/tests/modules/local/report/render_notebook.nf.test @@ -0,0 +1,72 @@ +nextflow_process { + + name "Test RENDER_NOTEBOOK" + script "modules/local/report/render_notebook.nf" + process "RENDER_NOTEBOOK" + + test("Should produce an HTML output file (stub)") { + + tag "stub" + + options "-stub" + + when { + params { + project_name = "test_project" + samplesheet = "${projectDir}/tests/fixtures/valid_samplesheet.csv" + outdir = "out" + } + process { + """ + input[0] = tuple( + "sample_report", + file("${projectDir}/notebooks/template_sample.qmd"), + "${projectDir}/out" + ) + input[1] = "test_project" + input[2] = "nextflow run main.nf" + """ + } + } + + then { + assert process.success + assert process.out.size() == 1 + assert path(process.out[0]).exists() + assert path(process.out[0]).getName().endsWith(".html") + assert path(process.out[0]).getName() == "sample_report.html" + } + } + + test("Should handle compare report name (stub)") { + + tag "stub" + + options "-stub" + + when { + params { + project_name = "compare_project" + samplesheet = "${projectDir}/tests/fixtures/valid_samplesheet.csv" + outdir = "out" + } + process { + """ + input[0] = tuple( + "sharing_report", + file("${projectDir}/notebooks/template_sharing.qmd"), + "${projectDir}/out" + ) + input[1] = "compare_project" + input[2] = "nextflow run main.nf --workflow_level compare" + """ + } + } + + then { + assert process.success + assert process.out.size() == 1 + assert path(process.out[0]).getName() == "sharing_report.html" + } + } +} diff --git a/workflows/tcrtoolkit.nf b/workflows/tcrtoolkit.nf index 633d0af..c0c4066 100644 --- a/workflows/tcrtoolkit.nf +++ b/workflows/tcrtoolkit.nf @@ -17,6 
+17,7 @@ include { PATIENT } from '../subworkflows/local/patient' include { COMPARE } from '../subworkflows/local/compare' include { VALIDATE_PARAMS } from '../subworkflows/local/validate_params' include { ANNOTATE } from '../subworkflows/local/annotate' +include { REPORT } from '../subworkflows/local/report' include { PSEUDOBULK_PHENOTYPE }from '../subworkflows/local/pseudobulk_phenotype' @@ -98,6 +99,9 @@ workflow TCRTOOLKIT { ANNOTATE( sample_map_final ) } + // Accumulate reporting tuples: (report_name, notebook_path, data_dir) + ch_reports = Channel.empty() + // Running sample level analysis if (levels.contains('sample')) { SAMPLE( @@ -105,11 +109,21 @@ workflow TCRTOOLKIT { ANNOTATE.out.cdr3_pgen, ANNOTATE.out.olga_stats ) + ch_reports = ch_reports.mix( + SAMPLE.out.outdir.map { dir -> + tuple("sample_report", file(params.sample_stats_template), dir) + } + ) } // Running patient analysis if (levels.contains('patient')) { PATIENT( ANNOTATE.out.processed_samples ) + ch_reports = ch_reports.mix( + PATIENT.out.gliph2_outdir.map { dir -> + tuple("gliph2_report", file(params.gliph2_report_template), dir) + } + ) } // Running comparison analysis @@ -118,7 +132,14 @@ workflow TCRTOOLKIT { ANNOTATE.out.concat_cdr3_sorted, ANNOTATE.out.cdr3_pgen ) + ch_reports = ch_reports.mix( + COMPARE.out.outdir.map { dir -> + tuple("sharing_report", file(params.compare_stats_template), dir) + } + ) } + + REPORT( ch_reports ) } /* From e53f7ad21df03f07cb4dd7b3ede8cd95d2f57127 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 13:02:57 +0000 Subject: [PATCH 2/6] Fix typos in template_sample.qmd and template_sharing.qmd Agent-Logs-Url: https://github.com/KarchinLab/TCRtoolkit/sessions/3ad3c5f4-c1cd-4a3c-a8a8-638a69efe87c Co-authored-by: dimalvovs <1246862+dimalvovs@users.noreply.github.com> --- notebooks/template_sample.qmd | 2 +- notebooks/template_sharing.qmd | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/notebooks/template_sample.qmd b/notebooks/template_sample.qmd index 61f942c..e77bf9d 100644 --- a/notebooks/template_sample.qmd +++ b/notebooks/template_sample.qmd @@ -360,7 +360,7 @@ create_expansion_stacked_barplot_per_individual( ``` -**Figure 1. Clonal Expansion Categories Relative to Sequencing Depth.** This stacked bar chart illustrates the percentage of clones categorized as non-Frequent, Frequent, or highly Frequent for each sample. The overlaid black dotted line tracks the total sequencing read count on a secondary axis, highlighting the influece of sampling depth and the detection of Frequent clones. A vertical red threshold line ($>3,000$ counts) demarcates samples with sufficient data for reliable clonality assessment from low-depth libraries where expansion metrics may be unstable. +**Figure 1. Clonal Expansion Categories Relative to Sequencing Depth.** This stacked bar chart illustrates the percentage of clones categorized as non-Frequent, Frequent, or highly Frequent for each sample. The overlaid black dotted line tracks the total sequencing read count on a secondary axis, highlighting the influence of sampling depth and the detection of Frequent clones. A vertical red threshold line ($>3,000$ counts) demarcates samples with sufficient data for reliable clonality assessment from low-depth libraries where expansion metrics may be unstable. **Understanding Each Category** diff --git a/notebooks/template_sharing.qmd b/notebooks/template_sharing.qmd index 1667ace..a0e6037 100644 --- a/notebooks/template_sharing.qmd +++ b/notebooks/template_sharing.qmd @@ -115,7 +115,7 @@ In the discovery brief notebook, we extracted the specific amino acid sequences This plot provides a powerful, two-dimensional view of your T-cell repertoire, helping you move beyond simple clone lists to understand the functional landscape of the immune response. 
The central question it helps answer is: **Are the T-cell clones found across many different samples (widely shared) also the ones that are most active and expanded?** -This plot is generated by first calculating the frequency of each unique TCR clone within every sample from the raw data. For each clone, two key metrics are then determined: the total number of samples it appears in (its sharing level) and its single highest frequency across all of those samples (its maximum expansion). **Clones are then categorized as 'Highly Expanded' (>1%), 'Expanded' (0.1%-1%), or 'Non-expanded' (<0.1%) based on this maximum frequency value (see sample Notebook for a detailed axplanation)**. The final stacked bar plot visualizes the count of unique TCRs for each sharing level on the x-axis, with the colored segments revealing the proportion of clones that fall into each expansion category. +This plot is generated by first calculating the frequency of each unique TCR clone within every sample from the raw data. For each clone, two key metrics are then determined: the total number of samples it appears in (its sharing level) and its single highest frequency across all of those samples (its maximum expansion). **Clones are then categorized as 'Highly Expanded' (>1%), 'Expanded' (0.1%-1%), or 'Non-expanded' (<0.1%) based on this maximum frequency value (see sample Notebook for a detailed explanation)**. The final stacked bar plot visualizes the count of unique TCRs for each sharing level on the x-axis, with the colored segments revealing the proportion of clones that fall into each expansion category. **A T-cell clone that is both widely shared and highly expanded suggests it recognizes a common and potent antigen present in multiple individuals, such as a shared tumor antigen.** You can quickly generate hypotheses about the nature of the immune responses within your cohort, distinguishing between powerful shared responses and potent individual-specific ones. 
Identifying these clones is critical as they represent key players in a common immune response and are high-priority targets for further research. From 21c7949c5aaf5b96661c9092c1e9acb7f7128feb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 18:18:18 +0000 Subject: [PATCH 3/6] Fix RENDER_NOTEBOOK tests: use process.out[0][0] to access emission value Agent-Logs-Url: https://github.com/KarchinLab/TCRtoolkit/sessions/b6f09317-3a96-40cd-9260-76676dc70c4e Co-authored-by: dimalvovs <1246862+dimalvovs@users.noreply.github.com> --- tests/modules/local/report/render_notebook.nf.test | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/modules/local/report/render_notebook.nf.test b/tests/modules/local/report/render_notebook.nf.test index 598137b..677be03 100644 --- a/tests/modules/local/report/render_notebook.nf.test +++ b/tests/modules/local/report/render_notebook.nf.test @@ -31,10 +31,10 @@ nextflow_process { then { assert process.success - assert process.out.size() == 1 - assert path(process.out[0]).exists() - assert path(process.out[0]).getName().endsWith(".html") - assert path(process.out[0]).getName() == "sample_report.html" + assert process.out[0].size() == 1 + assert path(process.out[0][0]).exists() + assert path(process.out[0][0]).getName().endsWith(".html") + assert path(process.out[0][0]).getName() == "sample_report.html" } } @@ -65,8 +65,8 @@ nextflow_process { then { assert process.success - assert process.out.size() == 1 - assert path(process.out[0]).getName() == "sharing_report.html" + assert process.out[0].size() == 1 + assert path(process.out[0][0]).getName() == "sharing_report.html" } } } From 31cf4c6f96cbdd93a0ed2de1b73cc09eef2f42ae Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:19:19 +0000 Subject: [PATCH 4/6] Fix RENDER_NOTEBOOK tests: use getFileName().toString() instead 
of getName() Agent-Logs-Url: https://github.com/KarchinLab/TCRtoolkit/sessions/3dd1579f-e401-4eb6-bb5b-8d55c17757de Co-authored-by: dimalvovs <1246862+dimalvovs@users.noreply.github.com> --- tests/modules/local/report/render_notebook.nf.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/modules/local/report/render_notebook.nf.test b/tests/modules/local/report/render_notebook.nf.test index 677be03..40c24e4 100644 --- a/tests/modules/local/report/render_notebook.nf.test +++ b/tests/modules/local/report/render_notebook.nf.test @@ -33,8 +33,8 @@ nextflow_process { assert process.success assert process.out[0].size() == 1 assert path(process.out[0][0]).exists() - assert path(process.out[0][0]).getName().endsWith(".html") - assert path(process.out[0][0]).getName() == "sample_report.html" + assert path(process.out[0][0]).getFileName().toString().endsWith(".html") + assert path(process.out[0][0]).getFileName().toString() == "sample_report.html" } } @@ -66,7 +66,7 @@ nextflow_process { then { assert process.success assert process.out[0].size() == 1 - assert path(process.out[0][0]).getName() == "sharing_report.html" + assert path(process.out[0][0]).getFileName().toString() == "sharing_report.html" } } } From afc87bf3bec7451e3f3f7bacbd1341001e841cdb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 19:42:43 +0000 Subject: [PATCH 5/6] Fix integration test: remove deleted sample_plot/compare_plot imports from pseudobulk_phenotype.nf Agent-Logs-Url: https://github.com/KarchinLab/TCRtoolkit/sessions/5a46743e-13d6-4f38-b705-d7137b74cbf9 Co-authored-by: dimalvovs <1246862+dimalvovs@users.noreply.github.com> --- subworkflows/local/pseudobulk_phenotype.nf | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/subworkflows/local/pseudobulk_phenotype.nf b/subworkflows/local/pseudobulk_phenotype.nf index 3ccb8c0..222b183 100644 --- 
a/subworkflows/local/pseudobulk_phenotype.nf +++ b/subworkflows/local/pseudobulk_phenotype.nf @@ -9,10 +9,7 @@ include { SAMPLE_AGGREGATE as SAMPLE_AGG_STAT_PHENO } from '../../modules/local/ include { SAMPLE_AGGREGATE as SAMPLE_AGG_V_PHENO } from '../../modules/local/sample/sample_aggregate' include { SAMPLE_AGGREGATE as SAMPLE_AGG_D_PHENO } from '../../modules/local/sample/sample_aggregate' include { SAMPLE_AGGREGATE as SAMPLE_AGG_J_PHENO } from '../../modules/local/sample/sample_aggregate' -include { SAMPLE_PLOT as SAMPLE_PLOT_PHENO } from '../../modules/local/sample/sample_plot' - include { COMPARE_CALC as COMPARE_CALC_PHENO } from '../../modules/local/compare/compare_calc' -include { COMPARE_PLOT as COMPARE_PLOT_PHENO } from '../../modules/local/compare/compare_plot' include { ANNOTATE_CONCATENATE as COMPARE_CONCATENATE_PHENO } from '../../modules/local/annotate' workflow PSEUDOBULK_PHENOTYPE { @@ -65,13 +62,6 @@ workflow PSEUDOBULK_PHENOTYPE { SAMPLE_AGG_V_PHENO(v_family_csv_files, "v_family.csv") SAMPLE_AGG_D_PHENO(d_family_csv_files, "d_family.csv") SAMPLE_AGG_J_PHENO(j_family_csv_files, "j_family.csv") - - SAMPLE_PLOT_PHENO( - samplesheet_pheno, - file(params.sample_stats_template), - SAMPLE_AGG_STAT_PHENO.out.aggregated_csv, - SAMPLE_AGG_V_PHENO.out.aggregated_csv - ) } ch_phenotype_files_transformed @@ -84,15 +74,6 @@ workflow PSEUDOBULK_PHENOTYPE { COMPARE_CONCATENATE_PHENO( samplesheet_pheno, all_sample_files ) - - COMPARE_PLOT_PHENO( samplesheet_pheno, - COMPARE_CALC_PHENO.out.jaccard_mat, - COMPARE_CALC_PHENO.out.sorensen_mat, - COMPARE_CALC_PHENO.out.morisita_mat, - file(params.compare_stats_template), - params.project_name, - all_sample_files - ) } emit: From 2356fc2b5010f13586df59032550e6a3064bac03 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 7 Apr 2026 20:04:11 +0000 Subject: [PATCH 6/6] Fix RENDER_NOTEBOOK: resolve samplesheet to absolute path for Quarto Agent-Logs-Url: 
https://github.com/KarchinLab/TCRtoolkit/sessions/8a7adee6-ce70-45d2-afb7-7389abd2c7e7 Co-authored-by: dimalvovs <1246862+dimalvovs@users.noreply.github.com> --- modules/local/report/render_notebook.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/local/report/render_notebook.nf b/modules/local/report/render_notebook.nf index 567a004..5b98d4d 100644 --- a/modules/local/report/render_notebook.nf +++ b/modules/local/report/render_notebook.nf @@ -21,7 +21,7 @@ process RENDER_NOTEBOOK { -P project_name:${project_name} \\ -P workflow_cmd:'${workflow_cmd}' \\ -P project_dir:${data_dir} \\ - -P sample_table:${params.samplesheet} \\ + -P sample_table:${file(params.samplesheet)} \\ --to html """