diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..fa4ccb2
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,142 @@
+FROM ubuntu:20.04
+
+# Set environment variables to make the build non-interactive
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Etc/UTC
+
+# Update and install prerequisites for adding the PPA
+RUN apt-get update && apt-get install -y \
+ software-properties-common \
+ build-essential \
+ libssl-dev \
+ libffi-dev \
+ curl \
+ lsb-release \
+ && add-apt-repository -y ppa:deadsnakes/ppa \
+ && apt-get update \
+ && apt-get install -y \
+ python3.10 \
+ python3.10-dev \
+ python3.10-venv \
+ python3.10-distutils \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install pip using the official get-pip.py script
+RUN ln -sf /usr/bin/python3.10 /usr/bin/python
+RUN curl -O https://bootstrap.pypa.io/get-pip.py && python get-pip.py
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Copy the requirements.txt file to the working directory
+COPY requirements.txt .
+
+RUN pip install -r requirements.txt
+
+RUN apt-get update && \
+ # apt-get install -y libx11-dev libgl1-mesa-dev && \
+ apt-get install -y libx11-dev libgl1-mesa-dev libxcomposite-dev libxrandr-dev libxss-dev libxcursor-dev
+
+RUN apt-get install -y libx11-xcb1 libxcb-xinerama0 libxkbcommon0 libglib2.0-0
+
+
+RUN apt-get update && apt-get install -y \
+ libx11-xcb1 \
+ libxcb-util1 \
+ # libxcb-xinerama0 \
+ libxcb-icccm4 \
+ libxcb-image0 \
+ libxcb-keysyms1 \
+ libxcb-randr0 \
+ libxcb-render-util0 \
+ libxcb-render0 \
+ libxcb-shape0 \
+ libxcb-shm0 \
+ libxcb-sync1 \
+ libxcb-xfixes0 \
+ libxcb-xkb1 \
+ x11-utils \
+ libxkbcommon-x11-0
+
+RUN export QT_QPA_PLATFORM_PLUGIN_PATH=/usr/local/lib/python3.10/dist-packages/PyQt5/Qt5/plugins/platforms
+RUN export QT_DEBUG_PLUGINS=1
+
+
+# Copy Cellscanner files to the working directory
+COPY cellscanner ./
+
+# Specify the command to run the application (optional)
+CMD ["python", "Cellscanner.py"]
+
+
+
+# docker run --rm -it --entrypoint /bin/bash cellscanner
+# docker run -e DISPLAY=$DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -v .:/media cellscanner
+
+
+
+
+# # FROM DEEPSEEK AS ALTERNATIVE -- NOT WORKING MOUHAHA
+# FROM ubuntu:20.04
+
+# # Set environment variables
+# ENV DEBIAN_FRONTEND=noninteractive \
+# TZ=Etc/UTC \
+# QT_QPA_PLATFORM_PLUGIN_PATH=/usr/local/lib/python3.10/dist-packages/PyQt5/Qt5/plugins/platforms \
+# QT_DEBUG_PLUGINS=1
+
+# # Combine all package installations into a single layer
+# RUN apt-get update && apt-get install -y --no-install-recommends \
+# software-properties-common \
+# build-essential \
+# libssl-dev \
+# libffi-dev \
+# curl \
+# lsb-release \
+# python3.10 \
+# python3.10-dev \
+# python3.10-venv \
+# python3.10-distutils \
+# libx11-dev \
+# libgl1-mesa-dev \
+# libxcomposite-dev \
+# libxrandr-dev \
+# libxss-dev \
+# libxcursor-dev \
+# libx11-xcb1 \
+# libxcb-xinerama0 \
+# libxkbcommon0 \
+# libglib2.0-0 \
+# libxcb-util1 \
+# libxcb-icccm4 \
+# libxcb-image0 \
+# libxcb-keysyms1 \
+# libxcb-randr0 \
+# libxcb-render-util0 \
+# libxcb-render0 \
+# libxcb-shape0 \
+# libxcb-shm0 \
+# libxcb-sync1 \
+# libxcb-xfixes0 \
+# libxcb-xkb1 \
+# x11-utils \
+# libxkbcommon-x11-0 && \
+# add-apt-repository -y ppa:deadsnakes/ppa && \
+# apt-get clean && \
+# rm -rf /var/lib/apt/lists/*
+
+# # Install pip and setup Python
+# RUN ln -sf /usr/bin/python3.10 /usr/bin/python && \
+# curl -sS https://bootstrap.pypa.io/get-pip.py | python -
+
+# WORKDIR /app
+
+# # Install Python dependencies first (better layer caching)
+# COPY requirements.txt .
+# RUN pip install --no-cache-dir -r requirements.txt
+
+# # Copy application files
+# COPY cellscanner ./
+
+# CMD ["python", "Cellscanner.py"]
diff --git a/README.md b/README.md
index 7ce01b0..881a661 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ To run CellScanner with its GUI, you may now run:
# Always remember to activate your conda environment, if you set one for CellScanner
conda activate cellscanner
# If `python` returns an error message that is not there, try with `python3` instead
-python Cellscanner.py
+python cellscanner/Cellscanner.py
```
This will pop-up CellScanner where you can now import your data, fill in your training parameters and
@@ -90,7 +90,7 @@ Once your configuration file is ready, you may run CellScanner CLI :
```bash
conda activate cellscanner
-python CellscannerCLI.py --config config.yml
+python cellscanner/CellscannerCLI.py --config config.yml
```
@@ -107,5 +107,22 @@ For the new features that have been added, a manuscript is in process. :pencil:
+## Docker
+
+On Linux, remember to disable access control for local connections first:
+
+ ```bash
+ xhost +local:
+ ```
+
+ then, you need to run something like:
+
+ ```bash
+ docker run -e DISPLAY=$DISPLAY -v /tmp/.X11-unix:/tmp/.X11-unix -v ./Testfiles:/csFiles hariszaf/cell_scanner
+ ```
+
+ where `Testfiles` is the directory where you have your input data and where CellScanner will return its findings on your local machine.
+
+ `csFiles` is the output directory of CellScanner within the container, so make sure you always keep it like that in your command.
diff --git a/Tutorial/Images/GUI.png b/Tutorial/Images/GUI.png
deleted file mode 100644
index ed2dd5d..0000000
Binary files a/Tutorial/Images/GUI.png and /dev/null differ
diff --git a/Tutorial/Images/Import_data_step.png b/Tutorial/Images/Import_data_step.png
deleted file mode 100644
index de6f13c..0000000
Binary files a/Tutorial/Images/Import_data_step.png and /dev/null differ
diff --git a/Tutorial/Images/Run_prediction_step.png b/Tutorial/Images/Run_prediction_step.png
deleted file mode 100644
index edc1ff3..0000000
Binary files a/Tutorial/Images/Run_prediction_step.png and /dev/null differ
diff --git a/Tutorial/Images/Train_model_step.png b/Tutorial/Images/Train_model_step.png
deleted file mode 100644
index 77910e2..0000000
Binary files a/Tutorial/Images/Train_model_step.png and /dev/null differ
diff --git a/Tutorial/Images/growthcurves.png b/Tutorial/Images/growthcurves.png
deleted file mode 100644
index 008047d..0000000
Binary files a/Tutorial/Images/growthcurves.png and /dev/null differ
diff --git a/Tutorial/README.md b/Tutorial/README.md
deleted file mode 100644
index 2fef417..0000000
--- a/Tutorial/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-# Tutorial for CellScanner 2.0
-
-## Introduction
-For the tutorial, we will use flow cytometry files coming from a growth experiment with two gut bacterial species, *Roseburia intestinalis* (RI) and *Bacteroides thetaiotaomicron* (BT). These two species were grown in mono- and co-culture for up to 120 hours, as shown below:
-
-
-In this experiment, dead/live staining with propidium iodide and SYBR-Green was applied. Propidium iodide is a stain that enters cells with broken membranes, which we therefore count as dead. SYBR-Green is a DNA-binding molecule that helps distinguish cells from background particles that do not contain DNA. Thus, viable cells should stain green and not red.
-
-If you are interested in the biological background of the experiment, please check out the [article](https://www.nature.com/articles/s41396-023-01501-1).
-
-The flow cytometry data for the growth curves shown above are available at [flowrepository.org](https://flowrepository.org/id/FR-FCM-Z6YM).
-We are going to work here only with one time point (50 hours). You can find the files used in the tutorial [here](http://msysbiology.com/documents/CellScanner/CS2TutorialFiles.zip).
-
-When you open CellScanner, you see the graphical user interface (GUI) shown below. Please be patient, opening the GUI can sometimes take a minute.
-
-
-
-## Import Data
-The first step is to import the data. You can do this by clicking on **"Import Data"**. You can select blank cultures and mono-cultures by clicking on the corresponding fc files. Note that **you can select several files at once**! Optionally, you can also specify an output directory where results will be saved. If you do not specify one, results will go in an output folder created on the fly inside the CellScanner directory. If you previously trained a model for your data, you can also re-use it. Here, we work with two blank files and three biological replicates for each monoculture, with the samples collected at 50 hours.
-
-
-
-## Train Model
-Next, we open the **"Train Model"** panel. If the CellScanner window becomes too big for your screen, close the "Import Data" panel. Here, we are going to use default values as shown below. UMAP is run first to remove debris. Essentially, this is done by clustering events from blanks and monocultures and then removing events from monocultures that are too similar to events in blanks. Next, a neural model is trained on the filtered monocultures.
-
-TODO: gating based on stains to be explained here
-
-
-
-Model training should be fast (within one minute). Model performance files will be stored in a sub-folder in your specified output folder (if you did not specify one, then in the CellScanner folder). The sub-folder name starts with *working_files* and ends with a time stamp. It contains another folder called "model", in which you will find a number of files encoding the trained neural network, a file called *model_statistics.csv* and two html files, which will open in your browser when clicked. The first shows a UMAP projection before and the second one after filtering. An event is filtered if its neighbors in the UMAP embedding do not have the same label (the number of neighbors considered is among CellScanner's parameters). The "model_statistics.csv" file contains information about classification performance, including accuracy, precision, recall, F1 score and the confusion matrix.
-
-## Run prediction
-We are now ready to apply the trained neural network on one or several cocultures. For this, we open the **"Run Prediction"** panel by clicking on it. As with monocultures, several coculture files can be selected and imported at once. If more than one coculture is selected, the trained neural network will be applied to each coculture in turn. Here, we are importing six replicates of the coculture (btriA-F). **Optionally, the "uncertainty" thresholding can be enabled** by clicking the box next to "Apply filtering on the predictions based on their uncertainty scores". Events that cannot be easily assigned to one species have a high uncertainty (entropy). CellScanner automatically computes an uncertainty threshold that maximizes model performance. If uncertainty thresholding is enabled, events with uncertainty above this threshold will be filtered out. Note that the threshold can be manually adjusted. Next, we specify three flow cytometer channels to be used in the visualization. Clicking "Predict" will then launch the prediction step.
-
-
-
-The prediction should also happen within one minute. The output is stored in a folder called "Prediction" (followed by a time stamp) that is either located in the specified output folder or the CellScanner folder.
-For each coculture, the following files are generated:
-
-- prediction_counts.csv, which contains the predicted counts for debris (blank), for each species, and also for the unknown events if uncertainty thresholding was enabled
-- raw_predictions.csv, which is the fc file extended with prediction results (labels and, if enabled, uncertainties)
-- uncertainty_counts.csv, which lists the number of uncertain events per label if uncertainty thresholding was enabled
-- 3D_coculture_predictions_species.html plots events in a 3D plot spanned by the three selected flow cytometer channels and colors them by species
-- 3D_coculture_predictions_uncertainty.html is the same with events colored by prediction uncertainty
-- sub-folder "gated" provides more information and a plot on gating if stains were provided
-- sub-folder "heterogeneity_results" quantifies and visualizes overall and species-specific heterogeneity
-
-If more than one coculture file was provided, "merged_prediction_counts.csv" will list the counts for each coculture, and "merged_uncertainty_counts.csv" will list the number of uncertain events in each category for each coculture.
-
-Below is the result for the six coculture replicates:
-
-| Species | Coculure 1 | Coculture 2 | Coculture 3 | Coculture 4 | Coculture 5 | Coculture 6 |
-| ----------- | ----------- | ------- | ----| -----| ---- | ------ |
-| BT | 140116 | 158746 | 140214 | 142779 | 154802 | 144496
-| RI | 80022 | 40105 | 89645 | 75130 | 94461 | 90365 |
-| Blank | 664 | 705 | 677 | 594| 817 | 687 |
-| Unknown | 29 | 39 | 38 | 45 | 246 | 127 |
-
-
-At 50 hours, the coculture is dominated by *Bacteroides thetaiotaomicron* according to CellScanner.
-
-
-
-
-
diff --git a/Cellscanner.py b/cellscanner/Cellscanner.py
old mode 100644
new mode 100755
similarity index 70%
rename from Cellscanner.py
rename to cellscanner/Cellscanner.py
index ed603a4..3d7d27a
--- a/Cellscanner.py
+++ b/cellscanner/Cellscanner.py
@@ -1,21 +1,5 @@
-# CellScanner.py
-import os
-import sys
-
-from PyQt5.QtCore import Qt
-from PyQt5.QtGui import QPixmap, QFont
-from PyQt5.QtWidgets import QApplication, QMainWindow, QVBoxLayout, QHBoxLayout,\
- QPushButton, QWidget, QLabel, QScrollArea
-
-from scripts.helpers import button_style, get_app_dir
-from scripts.ImportFiles import ImportFilePanel
-from scripts.TrainingModel import TrainModelPanel
-from scripts.Prediction import PredictionPanel
-
-
+#!/usr/bin/env python
"""
-CellScanner - A Comprehensive Flow Cytometry Data Analysis Tool
-
==========================================
Overview
==========================================
@@ -36,78 +20,52 @@
- **Heterogeneity Analysis**: Perform heterogeneity assessments using both simple range-based methods and MiniBatchKMeans clustering.
- **Interactive Visualizations**: Generate and save interactive 3D scatter plots, pie charts, and bar charts to visualize predictions, gating results, and heterogeneity measures.
-
==========================================
-Modules and Functions
+Cellscanner.py
==========================================
-1. **get_app_dir()**:
- - **Description**: Determines the base path of the application, accommodating both development and bundled executable environments (e.g., PyInstaller).
- - **Returns**: Absolute path to the directory where the script or executable is located.
-
-2. **get_abs_path(relative_path)**:
- - **Description**: Converts a relative file path to an absolute path based on the application's base directory.
- - **Parameters**:
- - `relative_path` (str): The relative path of the resource.
- - **Returns**: Absolute path to the specified resource.
-
-3. **NeuralNetworkGUI (class)**:
- - **Description**: The primary GUI class inheriting from `QMainWindow`. It orchestrates the layout, user interactions, and integrates various panels for data importation, model training, prediction, and analysis.
- - **Attributes**:
- - `model`: Trained neural network model.
- - `scaler`: Scaler object for data preprocessing.
- - `label_encoder`: Encoder for translating model predictions into readable labels.
- - **Methods**:
- - `__init__()`: Initializes the GUI components, including title, logo, buttons, and panels.
- - `init_predict_panel()`: Configures the prediction panel with options for selecting coculture files, axis channels, gating parameters, and initiating predictions.
- - `toggle_file_panel()`: Shows or hides the data import panel based on user interaction.
- - `toggle_train_panel()`: Shows or hides the model training panel.
- - `toggle_gating_options()`: Displays or conceals gating options contingent on the gating checkbox state.
- - `toggle_predict_panel()`: Shows or hides the prediction panel.
- - `choose_coculture_file()`: Facilitates the selection of coculture files and populates axis selection dropdowns with appropriate channels.
- - `run_prediction()`: Executes the prediction workflow, including loading models, predicting species, applying gating, performing heterogeneity analysis, and generating visualizations.
- - `open_documentation()`: Opens the application's documentation webpage in the user's default web browser.
-
-4. **Main Application Loop**:
- - **Description**: Initializes the QApplication, instantiates the main GUI window, and executes the application loop to render the GUI.
+Initializes the QApplication, instantiates the main GUI window, and executes the application loop to render the GUI.
+
Usage:
- Import Files: Allows users to import monoculture and blank files for analysis.
- Train Neural Network: Provides an interface to train a neural network model on the imported data.
- Predict Coculture: Allows users to select a coculture file, predict species within the sample, and optionally apply gating and heterogeneity analysis.
+"""
+import os
+import sys
-Dependencies:
-- Python 3.x
-- PyQt5
-- TensorFlow
-- fcsparser
-- joblib
-- numpy
-- shutil
-- atexit
+from PyQt5.QtCore import Qt
+from PyQt5.QtGui import QPixmap, QFont
+from PyQt5.QtWidgets import QApplication, QMainWindow, QVBoxLayout, QHBoxLayout,\
+ QPushButton, QWidget, QLabel, QScrollArea
+from scripts.helpers import get_app_dir
+from scripts.GUIhelpers import button_style
+from scripts.ImportFiles import ImportFilePanel
+from scripts.TrainingModel import TrainModelPanel
+from scripts.Prediction import PredictionPanel
+
+"""
Authors:
- Ermis Ioannis Michail Delopoulos
- Haris Zafeiropoulos
Date: 2024-2025
-
"""
class NeuralNetworkGUI(QMainWindow):
"""
Main class of the CellScanner GUI.
It builds a PyQT app with 3 main panels:
- - Importing files
- - Training
- - Prediction
-
- NOTE: The Training panel is delivered thanks to the TrainModelPanel class of the TrainingModel.py
- The Importing files and the Predictions panels though, they are described as features of the NeuralNetworkGUI class.
+ - Importing files (:class:`scripts.ImportFiles.ImportFilePanel`)
+ - Training (:class:`scripts.TrainingModel.TrainModelPanel`)
+ - Prediction (:class:`scripts.Prediction.PredictionPanel`)
"""
def __init__(self):
super().__init__()
self.setWindowTitle("CellScanner")
- self.setGeometry(100, 100, 850, 1800)
+ # setGeometry(x, y, width, height) x, y stand for top-left corner
+ self.setGeometry(100, 100, 950, 2100)
# Initialize model-related attributes
self.model = None
@@ -207,21 +165,21 @@ def __init__(self):
self.setCentralWidget(scroll_area)
def toggle_file_panel(self):
- # Toggle the visibility of the file import panel
+ """Shows or hides the data import panel based on user interaction."""
if self.file_panel.isVisible():
self.file_panel.hide()
else:
self.file_panel.show()
def toggle_train_panel(self):
- # Toggle the visibility of the train model panel
+ """Shows or hides the model training panel."""
if self.train_panel.isVisible():
self.train_panel.hide()
else:
self.train_panel.show()
def toggle_predict_panel(self):
- # Toggle the visibility of the predict panel
+ """Shows or hides the predict panel"""
if self.predict_panel.isVisible():
self.predict_panel.hide()
else:
diff --git a/CellscannerCLI.py b/cellscanner/CellscannerCLI.py
old mode 100644
new mode 100755
similarity index 75%
rename from CellscannerCLI.py
rename to cellscanner/CellscannerCLI.py
index cc7e115..7af646e
--- a/CellscannerCLI.py
+++ b/cellscanner/CellscannerCLI.py
@@ -1,14 +1,16 @@
+#!/usr/bin/env python
+"""
+CellScanner Command Line Interface main class.
+
+Besides using CellScanner through its GUI, you may use it through a CLI.
+To this end, you should first complete a [`config.yml`](../config.yml) file, providing the necessary parameters.
+"""
import os
import sys
import yaml
import argparse
import fcsparser
from collections import defaultdict
-# Load CellScanner features
-from scripts.apply_umap import process_files
-from scripts.nn import prepare_for_training, train_neural_network
-from scripts.run_prediction import predict, merge_prediction_results
-from scripts.helpers import get_app_dir, time_based_dir, load_model_from_files, Stain
class CellScannerCLI():
@@ -24,6 +26,9 @@ def __init__(self, args):
# Load config file
conf = load_yaml(args.config)
+ # Load CellScanner features
+ from scripts.helpers import get_app_dir, load_model_from_files
+
# Output dir
outdir = conf.get("output_directory").get("path")
if os.getcwd() == "/app":
@@ -85,35 +90,50 @@ def __init__(self, args):
self.stain1_train, self.stain2_train = None, None
self.stain1_predict, self.stain2_predict, self.extra_stains = None, None, None
if self.gating:
+
# Training
self.stain1_train = get_stain_params("stain1_train", conf)
self.stain2_train = get_stain_params("stain2_train", conf)
+
# Predict
self.stain1_predict = get_stain_params("stain1_predict", conf)
self.stain2_predict = get_stain_params("stain2_predict", conf)
+
# Extra stains for predict
self.extra_stains = get_extra_stains(conf)
self._channel_sannity_check()
def _channel_sannity_check(self):
-
+ """
+ Checks whether a channel provided by the user is actually among those in the .fcs files.
+ """
basic_stains = [self.stain1_train, self.stain2_train, self.stain1_predict, self.stain2_predict]
- print(self.blank_files)
- _, data_df = fcsparser.parse(list(self.blank_files)[0], reformat_meta=True)
+
+ if self.prev_trained_model is None:
+ _, data_df = fcsparser.parse(list(self.blank_files)[0], reformat_meta=True)
+ else:
+ _, data_df = fcsparser.parse(list(self.coculture_files)[0], reformat_meta=True)
all_channels = data_df.columns
for stain in basic_stains:
- if stain.channel not in all_channels:
+ if stain.channel is not None and stain.channel not in all_channels:
raise ValueError(f"Channel provided for gating {stain.channel} not present in the .fcs files provided.")
for stain in self.extra_stains:
if stain not in all_channels:
raise ValueError(f"Channel provided for gating {stain.channel} not present in the .fcs files provided.")
+
print("Valid channel names.")
def train_model(self):
+ """
+ A wrapper for the main model-training related CellScanner functions.
+ """
+ from scripts.apply_umap import process_files
+ from scripts.nn import prepare_for_training, train_neural_network
+
print("\nAbout to preprocess input files.")
cleaned_data = process_files(
n_events = self.events, umap_n_neighbors=self.n_neighbors,
@@ -144,6 +164,13 @@ def train_model(self):
print("Model complete!")
def predict_coculture(self):
+ """
+ A wrapper for the running prediction step - related CellScanner functions.
+ In case where several co-culture files have been provided (samples), CellScanner makes its prediction per sample
+ and in the end merges them in a single file.
+ """
+ from scripts.helpers import time_based_dir, merge_prediction_results
+ from scripts.run_prediction import predict
print("About to start predicting co-culture profiles.")
@@ -191,7 +218,6 @@ def predict_coculture(self):
"filter_out_uncertain": self.filter_out_uncertain,
"uncertainty_threshold": self.uncertainty_threshold
}
-
# Add specific parameters based on gating
if self.gating:
predict_params.update({
@@ -212,9 +238,13 @@ def predict_coculture(self):
merge_prediction_results(self.predict_dir, "uncertainty")
-def load_yaml(yaml_file):
+def load_yaml(yaml_file: str):
"""
Load a yaml file
+
+ :param yaml_file: path to the YAML file
+ :return: A ``dict`` with the YAML file parameters
+ :rtype: dict
"""
with open(yaml_file, 'r') as f:
try:
@@ -225,26 +255,21 @@ def load_yaml(yaml_file):
sys.exit(1)
-def parse_dicts(dir_list, entity, names=None):
+def parse_dicts(dir_list: [dict], entity: str, names: str=None):
"""
- Processes a list of directory info to extract file paths and optionally map them to names.
-
- Parameters:
- -----------
- dir_list : list of dicts
- List containing directory info, each with 'path' and 'filenames'.
- entity : str
- The entity being processed (e.g., "species_files").
- names : str, optional
- The key to map filenames to names (labels). If omitted, only file paths are returned.
-
- Returns:
- --------
- set or tuple
- - If `names` is provided, returns a tuple of:
- - A set of file paths.
- - A dictionary mapping names to file paths.
- - Otherwise, returns only the set of file paths.
+ Processes a list of directory info to extract file paths and optionally map them to names.
+ If ``names`` is provided, returns a ``tuple`` including a set of file paths and a dictionary with the labels assigned as keys
+ and their corresponding files as values.
+ Otherwise, returns only the set of file paths.
+
+ :param dir_list: List containing directory info, each with 'path' and 'filenames'.
+ :param entity: The entity being processed (e.g., "species_files").
+ :param names: The key to map filenames to names (labels). If omitted, only file paths are returned.
+
+ :return all_files: A set of file paths
+ :rtype: set
+ :return all_maps: A dictionary mapping names to file paths.
+ :rtype: dict
"""
all_files = set()
if names:
@@ -295,7 +320,13 @@ def parse_dicts(dir_list, entity, names=None):
return (all_files, all_maps) if names else all_files
-def get_param_value(param, conf):
+def get_param_value(param: str, conf: dict):
+ """
+ Get values of a specific parameter from the YAML configuration file
+
+ :param param: Parameter to get their value
+ :param conf: Parameters as loaded from the YAML file
+ """
v = conf.get(param, {}).get("value") or conf.get(param, {}).get("name") or conf.get(param, {}).get("path")
if v is None:
v = conf.get(param).get("default")
@@ -304,12 +335,11 @@ def get_param_value(param, conf):
return v
-def get_extra_stains(conf):
+def get_extra_stains(conf: dict):
"""
Get extra stains provided by the user
- :return extra_stains (Dict): A dictionary with channel name as key and a set with the sign, threshold and label
- of the stain as value
+ :return: A dictionary with channel name as key and a set with the sign, threshold and label of the stain as their value
"""
extra_stains = {}
extras = conf.get("extra_stains").get("stains")
@@ -323,9 +353,13 @@ def get_extra_stains(conf):
return extra_stains
-def get_stain_params(stain, conf):
+def get_stain_params(stain: str, conf: dict):
"""
Build a Stain instance based on the configuration file.
+
+ :param stain: Stain entry on the config.yaml file; it can take one of the following values:
+ ``stain1_train``, ``stain2_train``, ``stain1_predict``, ``stain2_predict``, ``extra_stains``
+ :param conf: Parameters as loaded from the YAML configuration file
"""
# Get params from the yaml file
params = conf.get(stain)
@@ -335,12 +369,26 @@ def get_stain_params(stain, conf):
return build_stain(stain, channel, sign, value)
-def build_stain(stain, channel, sign, value):
+def build_stain(stain: str, channel: str, sign: str, value: int):
+ """
+ Builds a :class:`Stain` based on the user's settings, as loaded by the :func:`get_stain_params`
+
+ :param stain: Stain name as mentioned in the configuration YAML file to in process
+ :param channel: Name of the channel
+ :param sign: Sign of the relationship that needs to hold; can be either ``>``, ``<`` in the GUI version,
+ or ``higher_than``, ``lower_than`` in the CLI
+ :param value: Threshold of the channel value
+ """
+ from scripts.helpers import Stain
+
# Check if all stain params are there
- if not all([sign, value]) and channel!=None:
+ if not all([sign, value]) and channel is not None:
missing = [k for k, v in {"channel": channel, "sign": sign, "value": value}.items() if v is None]
raise ValueError(f"Please provide {' and '.join(missing)} for {stain}.")
+ elif channel is None:
+ return Stain(channel=None, sign=None, value=None)
+
return Stain(channel=channel, sign=sign, value=value)
diff --git a/logo.png b/cellscanner/logo.png
similarity index 100%
rename from logo.png
rename to cellscanner/logo.png
diff --git a/scripts/GUIhelpers.py b/cellscanner/scripts/GUIhelpers.py
similarity index 55%
rename from scripts/GUIhelpers.py
rename to cellscanner/scripts/GUIhelpers.py
index 1ab3a50..5b96ce0 100644
--- a/scripts/GUIhelpers.py
+++ b/cellscanner/scripts/GUIhelpers.py
@@ -1,4 +1,9 @@
+"""
+A set of classes to support the GUI.
+
+"""
import os
+import re
import fcsparser
import numpy as np
@@ -7,7 +12,12 @@
QSpinBox, QVBoxLayout, QCheckBox
)
+from .helpers import Stain, NOT_APPLICABLE
+
class AxisSelector(QWidget):
+ """
+ Box for the user to choose among the channels on the .fcs as the channel to be plotted in the 3D-plots.
+ """
def __init__(self, label_text, parent=None):
super().__init__(parent)
layout = QHBoxLayout(self)
@@ -22,21 +32,25 @@ def set_items(self, items):
class StainSelector(QWidget):
- def __init__(self, label_text, tooltip_text, parent=None):
+ """
+ Set of boxes for the user to choose among the channels on the .fcs as the channel to be used for a stain,
+ its sign (>,<) and to set its value (an integer).
+ """
+ def __init__(self, label_text, tooltip_text, label, parent=None):
super().__init__(parent)
layout = QHBoxLayout(self)
self.label = QLabel(label_text, self)
self.combo = QComboBox(self)
self.combo.setToolTip(tooltip_text)
- self.combo.addItem("Not applicable")
+ self.combo.addItem(NOT_APPLICABLE)
self.relation = QComboBox(self)
self.relation.addItems(['>', '<'])
self.threshold = QLineEdit(self)
self.threshold.setPlaceholderText(
- "Enter threshold. All events where the threshold is met will be classified as dead."
+ f"Enter threshold. All events where the threshold is met will be classified as {label}."
)
layout.addWidget(self.label)
@@ -48,12 +62,25 @@ def __init__(self, label_text, tooltip_text, parent=None):
def set_items(self, items):
self.combo.clear()
- self.combo.addItem("Not applicable") # Keep default
+ self.combo.addItem(NOT_APPLICABLE) # Keep default
self.combo.addItems(items)
class LabeledComboBox(QWidget):
+ """
+ A labeled combo box widget for selecting parameter values from a predefined list.
+ This widget consists of a label and a combo box, allowing users to select a value
+ from a specified list. It is useful for settings where choices are restricted to
+ predefined options.
+
+ Args:
+ label_text (str): The text to display in the label.
+ items (list of str, optional): The list of selectable values for the combo box. Defaults to an empty list.
+ default (str, optional): The default value to be pre-selected in the combo box. If specified, must be included in the items.
+ parent (QWidget, optional): The parent widget, if applicable.
+
+ """
def __init__(self, label_text, items=None, default=None, parent=None):
super().__init__(parent)
@@ -77,6 +104,32 @@ def __init__(self, label_text, items=None, default=None, parent=None):
class LabeledSpinBox(QWidget):
+ """
+ A labeled spin box widget for selecting integer values within a specified range.
+
+ This widget consists of a label and a spin box, allowing users to select a numerical
+ value by incrementing or decrementing within a defined range.
+
+ Example:
+ A spin box for setting a parameter with a range from 10 to 100, step size of 5,
+ and default value of 20:
+
+ ```
+ spin_box = LabeledSpinBox("Select Value:", min_value=10, max_value=100, step=5, default_value=20)
+ ```
+
+ Attributes:
+ label (QLabel): A QLabel displaying the provided label text.
+ spin_box (QSpinBox): A spin box allowing integer selection.
+
+ Args:
+ label_text (str): The text to display in the label.
+ min_value (int, optional): The minimum allowable value in the spin box. Default is 0.
+ max_value (int, optional): The maximum allowable value in the spin box. Default is 1000.
+ step (int, optional): The increment/decrement step size. Default is 1.
+ default_value (int, optional): The default selected value in the spin box. Default is 0.
+ parent (QWidget, optional): The parent widget, if applicable.
+ """
def __init__(self, label_text, min_value=0, max_value=1000, step=1, default_value=0, parent=None):
super().__init__(parent)
@@ -135,10 +188,11 @@ class GatingMixin:
A thread on mixin:
https://stackoverflow.com/questions/533631/what-is-a-mixin-and-why-is-it-useful
- This mixin defines the `toggle_gating_options` method, which shows or hides
+ This mixin defines the :func:`toggle_gating_options` method, which shows or hides
UI elements related to gating based on the state of a checkbox.
"""
def toggle_gating_options(self):
+ """Displays or conceals gating options contingent on the gating checkbox state."""
is_checked = self.gating_checkbox.isChecked()
if self.get_host_class_name() == "TrainModelPanel":
@@ -146,11 +200,8 @@ def toggle_gating_options(self):
if is_checked and len(self.file_panel.blank_files) > 0:
# Update all stain selectors
for selector in self.stain_selectors:
- selector.set_items(self.file_panel.numeric_colums_set)
-
- else:
- print(self.get_host_class_name())
-
+ # selector.set_items(self.file_panel.numeric_columns_set)
+ selector.set_items(self.file_panel.channels)
for selector in self.stain_selectors:
selector.label.setVisible(is_checked)
@@ -161,7 +212,7 @@ def toggle_gating_options(self):
try:
self.new_stain_button.setVisible(is_checked)
except:
- print("No need for extra stain at the training step")
+ print("No need for extra stain at the training step besides the two main ones (SYBR and PI).")
pass
def get_host_class_name(self):
@@ -169,30 +220,31 @@ def get_host_class_name(self):
class GatingCheckBox:
"""
+ Mixin that builds the "Apply line gating" checkbox and the gating threshold message, attaching them to the host panel's layout.
"""
def gating_checkbox(self):
# Add a checkbox to apply gating
self.gating_layout = QVBoxLayout()
self.gating_checkbox = QCheckBox("Apply line gating", self)
- self.gating_checkbox.setToolTip(GuiMessages.GATING_CHECHBOX)
+ self.gating_checkbox.setToolTip(_GuiMessages.GATING_CHECHBOX)
self.gating_layout.addChildWidget(self.gating_checkbox)
self.gating_checkbox.stateChanged.connect(self.toggle_gating_options)
try:
self.predict_panel_layout.addWidget(self.gating_checkbox)
except:
- self.train_gating_layout.addWidget(self.gating_checkbox)
+ self.train_panel_layout.addWidget(self.gating_checkbox)
# Add message for strain thresholds
self.thresholds_layout = QHBoxLayout()
self.threshold_message = QLabel(
- GuiMessages.GATING_THRESHOLD,
+ _GuiMessages.GATING_THRESHOLD,
self
)
self.thresholds_layout.addWidget(self.threshold_message)
try:
self.predict_panel_layout.addLayout(self.thresholds_layout)
except:
- self.train_gating_layout.addLayout(self.thresholds_layout)
+ self.train_panel_layout.addLayout(self.thresholds_layout)
class LiveDeadDebrisSelectors:
@@ -202,28 +254,16 @@ class LiveDeadDebrisSelectors:
"""
def basic_stains(self):
- # Stain 1 selection (for live/dead)
- tooltip_for_stain_1 = (
- "Select the channel that will be used for gating live/dead cells. "
- "All events where the threshold is met will be classified as dead."
- )
- # Stain 2 selection (for debris, optional)
- tooltip_for_stain_2 = (
- "Select the channel that will be used for gating all cells. "
- "All events where the threshold is met will be classified as cells. "
- "The rest of the events will be classified as debris."
- )
-
# Pair of basic stains
- self.stain1_selector = StainSelector("Staining inactive cells (e.g. PI):", tooltip_for_stain_1, self)
- self.stain2_selector = StainSelector("Staining all cells (e.g. SYBR/DAPI):", tooltip_for_stain_2, self)
+ self.stain1_selector = StainSelector("Staining all cells (e.g. SYBR/DAPI):", _GuiMessages.TP_STAIN_1, "cell", self)
+ self.stain2_selector = StainSelector("Staining inactive (dead) cells (e.g. PI):", _GuiMessages.TP_STAIN_2, "dead", self)
try:
self.predict_panel_layout.addWidget(self.stain1_selector)
self.predict_panel_layout.addWidget(self.stain2_selector)
except:
- self.train_gating_layout.addWidget(self.stain1_selector)
- self.train_gating_layout.addWidget(self.stain2_selector)
+ self.train_panel_layout.addWidget(self.stain1_selector)
+ self.train_panel_layout.addWidget(self.stain2_selector)
self.stain_selectors = [
self.stain1_selector,
@@ -232,8 +272,10 @@ def basic_stains(self):
# ToolTips
-class GuiMessages:
-
+class _GuiMessages:
+ """
+ Messages to be shown across the app.
+ """
UNCERTAINTY_TOOLTIP = (
"Set threshold for filtering out uncertain predictions. "
"If you just trained a model, CellScanner computed already the threshold allowing the highest accuracy and set it as default. "
@@ -251,17 +293,16 @@ class GuiMessages:
AXIS_SELECTION = "Choose the Channels that will be used as x, y, z axis for the 3D plot:"
-
GATING_CHECHBOX = (
"When staining for both inactive and total cells, CellScanner will also return"
"the living cells, by combining findings from these 2 stains."
)
+
GATING_THRESHOLD = (
"Important: Some visualization software may transform raw data."
"Ensure you set the threshold based on the raw data, not post-transformation."
)
-
COLUMN_NAMES_ERROR = (
"Column names on your coculture files differ. Please make sure you only include files sharing the same column names."
)
@@ -277,20 +318,88 @@ class GuiMessages:
"Every time you click on the Select Files button, previsouly selected files are removed."
)
+ OUTPUT_DIR = (
+ "Optional. Provide output directory where intermediate files and predictions will be saved."
+ )
-def load_fcs_file(fcss):
+ # Stain 1 selection (for debris, optional) sybr-green
+ TP_STAIN_1 = (
+ "Select the channel that will be used for gating all cells. "
+ "All events where the threshold is met will be classified as cells. "
+ "The rest of the events will be classified as debris."
+ )
+ # Stain 2 selection (for live/dead)
+ TP_STAIN_2 = (
+ "Select the channel that will be used for gating live/dead cells. "
+ "Select the channel that will be used for gating live/dead cells. "
+ "All events where the threshold is met will be classified as dead. "
+ "The rest of the events will be classified as live."
+ )
+
+
+def get_stains_from_panel(Panel):
"""
- :param fcss: List of fcs files provided by the user
- :return sample_to_df:
- :return sample_numeric_columns:
+ Build Stain instances for the two main stain types of living/dead and cells/not cells cases.
+ In this case, no label is part of the Stain instance.
+ Function to be used only in the GUI framework.
+
+ Arguments:
+ Panel (:class:`PredictionPanel` | :class:`TrainModelPanel`):
+ Returns:
+ stain1 (Stain)
+ stain2 (Stain)
"""
+ # Stain 1
+
+ stain_1 = Panel.stain1_selector.combo.currentText() # It should be the column name
+
+ if stain_1 != NOT_APPLICABLE:
+
+ match = re.search(r"\[(.*?)\]", stain_1) # In case we chose a channel with a second name in brackets
+ stain1_channel = match.group(1) if match else stain_1
+ stain1_relation = Panel.stain1_selector.relation.currentText()
+ stain1_threshold = float(Panel.stain1_selector.threshold.text())
+
+ stain1 = Stain(stain1_channel, stain1_relation, stain1_threshold)
+
+ else:
+
+ stain1 = Stain(channel=None, sign=None, value=None)
+
+ # Stain 2
+ stain_2 = Panel.stain2_selector.combo.currentText() # It should be the column name
+ if stain_2 != NOT_APPLICABLE:
+
+ match = re.search(r"\[(.*?)\]", stain_2)
+ stain2_channel = match.group(1) if match else stain_2
+ stain2_relation = Panel.stain2_selector.relation.currentText()
+ stain2_threshold = float(Panel.stain2_selector.threshold.text()) if Panel.stain2_selector.threshold.text() else None
+ stain2 = Stain(stain2_channel, stain2_relation, stain2_threshold)
+ else:
+ stain2 = Stain(channel=None, sign=None, value=None)
+
+ return stain1, stain2
+
+def extact_channel(long_channel):
+ match = re.search(r"\[(.*?)\]", long_channel) # In case we chose a channel with a second name in brackets
+ channel = match.group(1) if match else long_channel
+ return channel
+
+
+def load_fcs_file(fcss):
+ """
+ Loads .fcs file from a list of .fcs files.
+
+ :param fcss: List of fcs files provided by the user
+ :return sample_to_df: a dictionary with sample name as key and the dataframe with the fcs data loaded as their value
+ :return sample_numeric_columns: a dictionary with sample name as key and the numeric columns of the dataframe with the fcs data loaded as their value
+ """
sample_to_df = {}
sample_numeric_columns = {}
for fcs in fcss:
- _, data_df = fcsparser.parse(fcs, reformat_meta=True)
+ meta, data_df = fcsparser.parse(fcs, reformat_meta=True)
# Drop the 'Time' column if it exists
if 'Time' in data_df.columns:
@@ -303,5 +412,31 @@ def load_fcs_file(fcss):
sample_numeric_columns[sample_file_basename] = numeric_columns
sample_to_df[sample] = data_df
- return sample_to_df, sample_numeric_columns, numeric_columns
+ return sample_to_df, sample_numeric_columns, numeric_columns, meta
+
+def button_style(
+ font_size=12, padding=5, color="black", bck_col="#90EE90",
+ bck_col_hov="#7FCF7F", bck_col_clicked="#72B572", radius=5
+ ):
+ """
+ A button style
+ :return style: A string that can be directly assigned as a button-style in PyQt5 apps.
+ """
+ style = f"""
+ QPushButton {{
+ font-size: {font_size}px;
+ font-weight: bold;
+ padding: {padding}px;
+ color: {color};
+ background-color: {bck_col}; /* Light green color */
+ border-radius: {radius}px;
+ }}
+ QPushButton:hover {{
+ background-color: {bck_col_hov}; /* Slightly darker green on hover */
+ }}
+ QPushButton:pressed {{
+ background-color: {bck_col_clicked}; /* Even darker when pressed */
+ }}
+ """
+ return style
diff --git a/scripts/ImportFiles.py b/cellscanner/scripts/ImportFiles.py
similarity index 86%
rename from scripts/ImportFiles.py
rename to cellscanner/scripts/ImportFiles.py
index 0daddb7..c6ab60b 100644
--- a/scripts/ImportFiles.py
+++ b/cellscanner/scripts/ImportFiles.py
@@ -1,40 +1,40 @@
-# ImportFiles.py
-import os
-import shutil
-from PyQt5.QtWidgets import QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLineEdit, \
- QFileDialog, QInputDialog, QMessageBox, QLabel, QSpinBox
-import fcsparser
-
-from .helpers import get_app_dir, time_based_dir, button_style, load_model_from_files
-from .GUIhelpers import LabeledSpinBox, GuiMessages, load_fcs_file
-
"""
-ImportFiles.py
-
-This module is part of the CellScanner application.
-The `ImportFilePanel` class provides a user interface panel for selecting and managing flow cytometry files
+The :class:`ImportFilePanel` class provides a user interface panel for selecting and managing flow cytometry files
for monoculture species 1, monoculture species 2, and blank samples. The selected files are copied to a
working directory, where they can be processed by other components of the application.
Key Features:
-- Allows users to select `.fcs` files for species 1, species 2, and blank samples.
+
+- Allows users to select ``.fcs`` files for species 1, species 2, and blank samples.
- Files are copied to a working directory to ensure the original files remain unchanged.
- Displays metadata and the first few rows of data for each selected file.
- Handles file parsing using the `fcsparser` library.
- Supports cleanup of the working directory to remove copied files.
-Classes:
-- ImportFilePanel: A QWidget subclass that provides the interface for selecting files and managing the working directory.
+"""
+import os
+import shutil
+import fcsparser
+
+from PyQt5.QtWidgets import QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QLineEdit, \
+ QFileDialog, QInputDialog, QMessageBox
-Author: Ermis Ioannis Michail Delopoulos
-Date: 22/08/2024
+from .helpers import get_app_dir, time_based_dir, load_model_from_files, get_channels
+from .GUIhelpers import button_style, LabeledSpinBox, _GuiMessages, load_fcs_file
"""
+Authors:
+ - Ermis Ioannis Michail Delopoulos
+ - Haris Zafeiropoulos
+Date: 2024-2025
+"""
class ImportFilePanel(QWidget):
-
+ """
+ A QWidget subclass that provides the interface for selecting files and managing the working directory.
+ """
def __init__(self, parent=None):
super().__init__(parent)
self.layout = QVBoxLayout(self)
@@ -55,7 +55,7 @@ def __init__(self, parent=None):
blank_layout = QHBoxLayout()
self.blank_button = QPushButton("Select Blank Files (.fcs)", self)
self.blank_button.setStyleSheet(button_style())
- self.blank_button.setToolTip(GuiMessages.BLANKS_MULTIFILES)
+ self.blank_button.setToolTip(_GuiMessages.BLANKS_MULTIFILES)
self.blank_button.clicked.connect(self.select_blank_files)
blank_layout.addWidget(self.blank_button)
self.layout.addLayout(blank_layout)
@@ -64,12 +64,12 @@ def __init__(self, parent=None):
self.blank_files = []
# Button to add a new species selection
- self.init_species_layout()
+ self._init_species_layout()
# Previously trained model button
self.previously_trained_model_button = QPushButton("Add Model", self)
self.previously_trained_model_button.setStyleSheet(button_style(bck_col="#f7c67d", bck_col_hov="#deb270"))
- self.previously_trained_model_button.setToolTip(GuiMessages.PREVIOUSLY_TRAINED_MODEL)
+ self.previously_trained_model_button.setToolTip(_GuiMessages.PREVIOUSLY_TRAINED_MODEL)
self.previously_trained_model_button.clicked.connect(self.add_prev_trained_model)
self.layout.addWidget(self.previously_trained_model_button)
@@ -82,20 +82,23 @@ def __init__(self, parent=None):
# Model load
self.model_loaded = False
- self.toggle_const_scaling()
+ self._toggle_const_scaling()
# Button for output dir
if os.getcwd() != "/app":
+
self.output_dir = get_app_dir()
outdir_layout = QHBoxLayout()
self.output_dir_button = QPushButton("Set output directory", self)
self.output_dir_button.setStyleSheet(button_style(bck_col="#f7c67d", bck_col_hov="#deb270"))
- self.output_dir_button.setToolTip(
- "Optional. Provide output directory where intermediate files and predictions will be saved.")
+ self.output_dir_button.setToolTip(_GuiMessages.OUTPUT_DIR)
self.output_dir_button.clicked.connect(self.select_directory)
outdir_layout.addWidget(self.output_dir_button)
self.layout.addLayout(outdir_layout)
+ else:
+ self.output_dir = "/csFiles"
+
def add_species(self):
"""Add a new species selection panel."""
@@ -174,8 +177,12 @@ def select_blank_files(self):
blank_files.append(dest_file)
# Keep the numeric columns to be used in the TrainModelPanel in case user applies line gating
- _, _, numeric_columns = load_fcs_file(original_files)
- self.numeric_colums_set = set(numeric_columns)
+ _, _, numeric_columns, meta = load_fcs_file(original_files)
+ # channels_df = meta["_channels_"]
+ # channels_df["long_channel"] = channels_df.apply(lambda row: f"{row['$PnN']} [{row['$PnS']}]" if row["$PnN"] != row["$PnS"] else row["$PnN"], axis=1)
+ # self.channels = set(channels_df["long_channel"])
+ self.channels = get_channels(meta["_channels_"])
+ self.numeric_columns_set = set(numeric_columns)
# Blank filenames
self.blank_files = blank_files
@@ -225,17 +232,17 @@ def add_prev_trained_model(self):
self.species_files = {}
except:
QMessageBox.information(self, "Model loading", "Model files were not loaded!")
- self.toggle_const_scaling()
+ self._toggle_const_scaling()
- def toggle_const_scaling(self):
+ def _toggle_const_scaling(self):
# Show or hide scaling constant layout based on model_loaded flag
self.model_loaded = True if self.model is not None else False
self.scaling_constant.label.setVisible(self.model_loaded)
self.scaling_constant.setVisible(self.model_loaded)
- def init_species_layout(self):
+ def _init_species_layout(self):
self.species_layout = QHBoxLayout()
self.add_species_button = QPushButton("Add Species", self)
self.add_species_button.setStyleSheet(button_style())
@@ -243,5 +250,3 @@ def init_species_layout(self):
self.add_species_button.clicked.connect(self.add_species)
self.species_layout.addWidget(self.add_species_button)
self.layout.addLayout(self.species_layout)
-
-
diff --git a/scripts/Prediction.py b/cellscanner/scripts/Prediction.py
similarity index 86%
rename from scripts/Prediction.py
rename to cellscanner/scripts/Prediction.py
index 3a6343e..211edd2 100644
--- a/scripts/Prediction.py
+++ b/cellscanner/scripts/Prediction.py
@@ -22,13 +22,11 @@
from PyQt5.QtCore import Qt, QThread, pyqtSignal, QObject
import os
-import fcsparser
-import numpy as np
-from .helpers import button_style, time_based_dir
-from .run_prediction import predict, merge_prediction_results
+from .run_prediction import predict
+from .helpers import time_based_dir, merge_prediction_results, get_channels, NOT_APPLICABLE
from .GUIhelpers import (
- AxisSelector, LiveDeadDebrisSelectors, GatingMixin, GatingCheckBox, GuiMessages,
+ button_style, _GuiMessages, AxisSelector, LiveDeadDebrisSelectors, GatingMixin, GatingCheckBox,
iterate_stains, load_fcs_file
)
@@ -61,7 +59,7 @@ def __init__(self, file_panel, train_panel, parent=None):
self.predict_panel_layout.addWidget(self.choose_coculture_file_button)
# Add a text label for selecting the x, y, z axes
- self.axis_selection_label = QLabel(GuiMessages.AXIS_SELECTION, self)
+ self.axis_selection_label = QLabel(_GuiMessages.AXIS_SELECTION, self)
self.predict_panel_layout.addWidget(self.axis_selection_label)
# X,Y,Z axis
@@ -80,8 +78,8 @@ def __init__(self, file_panel, train_panel, parent=None):
]
# Add a checkbox to apply uncertainty filtering
- self.uncertainty_filtering_checkbox = QCheckBox(GuiMessages.UNCERTAINTY_CHECKBOX, self)
- self.uncertainty_filtering_checkbox.stateChanged.connect(self.toggle_uncertainty_filterint_options)
+ self.uncertainty_filtering_checkbox = QCheckBox(_GuiMessages.UNCERTAINTY_CHECKBOX, self)
+ self.uncertainty_filtering_checkbox.stateChanged.connect(self._toggle_uncertainty_filterint_options)
self.predict_panel_layout.addWidget(self.uncertainty_filtering_checkbox)
# Scaling constant for uncertainty filtering
@@ -90,10 +88,10 @@ def __init__(self, file_panel, train_panel, parent=None):
self.uncertainty_threshold_layout.addWidget(self.uncertainty_threshold_label)
self.uncertainty_threshold = QDoubleSpinBox(self)
- self.uncertainty_threshold.setToolTip(GuiMessages.UNCERTAINTY_TOOLTIP)
+ self.uncertainty_threshold.setToolTip(_GuiMessages.UNCERTAINTY_TOOLTIP)
self.uncertainty_threshold.setRange(-1.0, 10.0)
self.uncertainty_threshold.setSingleStep(0.01)
- self.update_uncertainty_threshold()
+ self._update_uncertainty_threshold()
self.uncertainty_threshold_layout.addWidget(self.uncertainty_threshold)
self.predict_panel_layout.addLayout(self.uncertainty_threshold_layout)
@@ -113,7 +111,7 @@ def __init__(self, file_panel, train_panel, parent=None):
# Hide gating and uncertainty filtering options initially
self.toggle_gating_options() # NOTE: from the GatingMixin mixin class, passed in the base classes of the PredictionPanel
- self.toggle_uncertainty_filterint_options()
+ self._toggle_uncertainty_filterint_options()
# Add gating layout to the predict one
self.predict_panel_layout.addLayout(self.gating_layout) # NOTE: (clarification) the gating_layout is there, thanks to the gating_checkbox mixin class
@@ -126,7 +124,7 @@ def __init__(self, file_panel, train_panel, parent=None):
def fire_predict(self):
try:
- self.start_loading_cursor()
+ self._start_loading_cursor()
self.samples_number = len(self.sample_to_df)
if self.samples_number == 0:
raise ValueError("Coculture data have not been provided.")
@@ -135,12 +133,12 @@ def fire_predict(self):
self.thread = QThread()
self.worker = WorkerPredict(PredictPanel=self)
self.worker.moveToThread(self.thread)
- self.worker.error_signal.connect(self.on_error)
+ self.worker.error_signal.connect(self._on_error)
self.thread.started.connect(self.worker.run_predict)
# Apply UMAP & train neural network
- self.worker.finished_signal.connect(self.prediction_completed)
+ self.worker.finished_signal.connect(self._prediction_completed)
self.worker.finished_signal.connect(self.thread.quit)
# Ensure the thread finishes properly but does not exit the app
@@ -150,17 +148,20 @@ def fire_predict(self):
self.thread.start()
except Exception as e:
- self.on_error(str(e))
- self.stop_loading_cursor()
+ self._on_error(str(e))
+ self._stop_loading_cursor()
def choose_coculture_file(self):
+ """
+ Facilitates the selection of coculture files and populates axis selection dropdowns with appropriate channels.
+ """
select_coculture_message = ["Select Coculture File", "", "Flow Cytometry Files (*.fcs);;All Files (*)"]
coculture_filepath, _ = QFileDialog.getOpenFileNames(self, *select_coculture_message)
if coculture_filepath:
try:
# Load fcs files
- sample_to_df, sample_numeric_columns, numeric_columns = load_fcs_file(coculture_filepath)
+ sample_to_df, sample_numeric_columns, numeric_columns, meta = load_fcs_file(coculture_filepath)
# Show files selected in the button
self.choose_coculture_file_button.setText(",".join(sample_to_df.keys())) # Display only the filename, not the full path
@@ -168,32 +169,35 @@ def choose_coculture_file(self):
# Check if all files share the same numeric column names
all_same = all(value.equals(list(sample_numeric_columns.values())[0]) for value in sample_numeric_columns.values())
if not all_same:
- self.on_error(GuiMessages.COLUMN_NAMES_ERROR)
+ self._on_error(_GuiMessages.COLUMN_NAMES_ERROR)
# Populate the combo boxes with the numeric column names
- self.numeric_colums_set = set(numeric_columns)
+ self.numeric_columns_set = set(numeric_columns)
+ self.channels = get_channels(meta["_channels_"])
# Update all axis selectors
for selector in self.axis_selectors:
- selector.set_items(self.numeric_colums_set)
+ # selector.set_items(self.numeric_columns_set)
+ selector.set_items(self.channels)
# Update all stain selectors
for selector in self.stain_selectors:
- selector.set_items(self.numeric_colums_set)
+ # selector.set_items(self.numeric_columns_set)
+ selector.set_items(self.channels)
self.channels_on_stain_buttons()
# Keep dictionary with sample names (key) and their corresponding data_df (value)
self.sample_to_df = sample_to_df
except:
- self.on_error("Something went off with your coculture files.")
+ self._on_error("Something went off with your coculture files.")
else:
print("No coculture file selected.")
self.choose_coculture_file_button.setText(select_coculture_message[0])
- def on_error(self, message):
+ def _on_error(self, message):
try:
- self.stop_loading_cursor()
+ self._stop_loading_cursor()
QMessageBox.critical(self, "Error", message)
except Exception as e:
print(f"Error displaying the message: {e}")
@@ -201,25 +205,25 @@ def on_error(self, message):
# Ensure that the thread is not running after error
self.thread = None
- def start_loading_cursor(self):
+ def _start_loading_cursor(self):
QApplication.setOverrideCursor(Qt.WaitCursor)
- def stop_loading_cursor(self):
+ def _stop_loading_cursor(self):
QApplication.restoreOverrideCursor()
- def prediction_completed(self):
- self.stop_loading_cursor()
+ def _prediction_completed(self):
+ self._stop_loading_cursor()
QMessageBox.information(self, "Prediction Complete", f"Predictions have been saved in {self.predict_dir}.")
self.thread = None
- def toggle_uncertainty_filterint_options(self):
+ def _toggle_uncertainty_filterint_options(self):
is_checked = self.uncertainty_filtering_checkbox.isChecked()
self.filter_out_uncertain = True
self.uncertainty_threshold_label.setVisible(is_checked)
self.uncertainty_threshold.setVisible(is_checked)
- self.update_uncertainty_threshold()
+ self._update_uncertainty_threshold()
- def update_uncertainty_threshold(self):
+ def _update_uncertainty_threshold(self):
if self.train_panel.cs_uncertainty_threshold is not None:
self.uncertainty_threshold.setValue(self.train_panel.cs_uncertainty_threshold)
else:
@@ -238,7 +242,7 @@ def build_stain_inputs(self):
stain_layout = QHBoxLayout()
stain_description = QLabel("Staining cells:", self)
stain_combo = QComboBox(self)
- stain_combo.setToolTip(GuiMessages.USER_STAIN_TOOLTIP)
+ stain_combo.setToolTip(_GuiMessages.USER_STAIN_TOOLTIP)
stain_relation = QComboBox(self)
stain_relation.addItems(['>', '<'])
stain_threshold = QDoubleSpinBox(self) # QLineEdit(self)
@@ -261,7 +265,7 @@ def build_stain_inputs(self):
except:
print("No coculture file yet.")
pass
- stain_combo.addItem("Not applicable")
+ stain_combo.addItem(NOT_APPLICABLE)
def channels_on_stain_buttons(self):
for i in range(self.gating_layout.count()):
@@ -276,7 +280,7 @@ def channels_on_stain_buttons(self):
component = stain_layout.itemAt(j).widget()
if isinstance(component, QComboBox):
if not any(component.itemText(i) == '>' for i in range(component.count())):
- component.addItems(self.numeric_colums_set)
+ component.addItems(self.numeric_columns_set)
class WorkerPredict(QObject):
diff --git a/scripts/TrainingModel.py b/cellscanner/scripts/TrainingModel.py
similarity index 78%
rename from scripts/TrainingModel.py
rename to cellscanner/scripts/TrainingModel.py
index c3f7005..a48e342 100644
--- a/scripts/TrainingModel.py
+++ b/cellscanner/scripts/TrainingModel.py
@@ -1,31 +1,21 @@
+"""
+Panel for setting the parameters and performing:
+- UMAP on the training data
+- training of a neural network model
+- and evaluating its performance.
+
+Usage:
+- The `TrainModelPanel` is integrated into the main application window and handles the entire model training pipeline.
+
+"""
from PyQt5.QtWidgets import QWidget, QVBoxLayout, QHBoxLayout, QPushButton, QComboBox,\
QGroupBox, QLabel, QMessageBox, QApplication, QSpinBox
from PyQt5.QtCore import Qt, QThread, pyqtSignal, QObject
-from .helpers import button_style
from .apply_umap import process_files
-from .GUIhelpers import LabeledComboBox, LabeledSpinBox, LiveDeadDebrisSelectors, GatingMixin, GatingCheckBox
+from .GUIhelpers import LabeledComboBox, LabeledSpinBox, LiveDeadDebrisSelectors, GatingMixin, GatingCheckBox, button_style
"""
-TrainingModel.py
-
-This module is part of the CellScanner application, responsible for processing flow cytometry data, training
-a neural network model, and evaluating its performance. The `TrainModelPanel` class provides a user interface
-panel for selecting training parameters and initiating the training process.
-
-Key Features:
-- Allows users to select the number of random events to sample from each monoculture and blank file.
-- Handles the preprocessing of `.fcs` files, including file parsing, data sampling, and scaling.
-- Implements UMAP for dimensionality reduction and filtering of data based on nearest neighbors.
-- Trains a neural network model using the processed data.
-- Evaluates the trained model and saves performance metrics, including a confusion matrix and classification report.
-
-Classes:
-- TrainModelPanel: A QWidget subclass that provides the interface for training the neural network model.
-
-Usage:
-- The `TrainModelPanel` is integrated into the main application window and handles the entire model training pipeline.
-
Authors:
- Ermis Ioannis Michail Delopoulos
- Haris Zafeiropoulos
@@ -34,15 +24,16 @@
"""
class TrainModelPanel(QWidget, LiveDeadDebrisSelectors, GatingMixin, GatingCheckBox):
+ """
+ User interface panel for selecting training parameters and initiating the training process.
+ Inherits QWidget and a set of mixin classes (LiveDeadDebrisSelectors, GatingMixin, GatingCheckBox) enabling line gating.
+ """
def __init__(self, file_panel, parent=None):
- """
- Training panel using the mixin classes for gating
- """
super().__init__(parent)
self.file_panel = file_panel
- self.layout = QVBoxLayout(self)
+ self.train_panel_layout = QVBoxLayout(self)
# Group box for "File Settings"
self.file_settings_group = QGroupBox("File Settings", self)
@@ -67,7 +58,7 @@ def __init__(self, file_panel, parent=None):
# Add the event_layout into file_settings_layout, then add file_settings_group to the main layout.
file_settings_layout.addLayout(event_layout)
- self.layout.addWidget(self.file_settings_group)
+ self.train_panel_layout.addWidget(self.file_settings_group)
# -----------------------------------------------------------
@@ -96,7 +87,7 @@ def __init__(self, file_panel, parent=None):
umap_layout.addWidget(self.umap_mindist_combo)
# Add the UMAP group box to the main layout
- self.layout.addWidget(self.umap_group)
+ self.train_panel_layout.addWidget(self.umap_group)
# -----------------------------------------------------------
@@ -128,7 +119,7 @@ def __init__(self, file_panel, parent=None):
nn_layout.addWidget(self.nn_nonblank_combo)
# Add the NN group box to the main layout
- self.layout.addWidget(self.nn_group)
+ self.train_panel_layout.addWidget(self.nn_group)
# -----------------------------------------------------------
@@ -155,13 +146,13 @@ def __init__(self, file_panel, parent=None):
model_settings_layout.addWidget(self.patience_combo)
# Add the QGroupBox to your main layout
- self.layout.addWidget(self.model_settings_group)
+ self.train_panel_layout.addWidget(self.model_settings_group)
# -----------------------------------------------------------
# Gating option at the training step
- self.train_gating = QGroupBox("Line gating", self)
- self.train_gating_layout = QVBoxLayout(self.train_gating)
+ # self.train_gating = QGroupBox("Line gating", self)
+ # self.train_gating_layout = QVBoxLayout(self.train_gating)
# Add a checkbox to apply gating
self.gating_checkbox() # NOTE: from the GatingCheckBox mixin class, passed in the class definition
@@ -172,7 +163,7 @@ def __init__(self, file_panel, parent=None):
# Initially hide / show after click on gating checkbox
self.toggle_gating_options()
- self.layout.addWidget(self.train_gating)
+ # self.train_panel_layout.addWidget(self.train_gating)
# -----------------------------------------------------------
@@ -181,7 +172,7 @@ def __init__(self, file_panel, parent=None):
self.process_button.setStyleSheet(button_style())
self.process_button.clicked.connect(self.start_training_process)
- self.layout.addWidget(self.process_button)
+ self.train_panel_layout.addWidget(self.process_button)
# Store the processed and filtered dataframe
self.cleaned_data = None
@@ -193,21 +184,21 @@ def __init__(self, file_panel, parent=None):
# Keep a reference to best model
self.best_model = None
- def start_loading_cursor(self):
+ def _start_loading_cursor(self):
QApplication.setOverrideCursor(Qt.WaitCursor)
- def stop_loading_cursor(self):
+ def _stop_loading_cursor(self):
QApplication.restoreOverrideCursor()
def start_training_process(self):
"""
- Establihes a thread and a worker to execute the run_process_files().
+ Establishes a thread and a worker to execute the :func:`run_process_files`.
The signals of the worker allows not to exit the app in case of error
and to return a success message when complete.
"""
try:
#start the loading cursor
- self.start_loading_cursor()
+ self._start_loading_cursor()
if not self.file_panel.species_files and not self.file_panel.blank_files:
raise ValueError("No files selected. Please import files.")
@@ -216,11 +207,11 @@ def start_training_process(self):
self.thread = QThread()
self.worker = WorkerProcessFiles(TrainModelPanel=self)
self.worker.moveToThread(self.thread)
- self.worker.error_signal.connect(self.on_error)
+ self.worker.error_signal.connect(self._on_error)
self.thread.started.connect(self.worker.run_process_files)
# Apply UMAP & train neural network
- self.worker.finished_signal.connect(self.on_finished)
+ self.worker.finished_signal.connect(self._on_finished)
self.worker.finished_signal.connect(self.thread.quit)
# Ensure the thread finishes properly but does not exit the app
@@ -230,19 +221,19 @@ def start_training_process(self):
self.thread.start()
except Exception as e:
- self.on_error(str(e))
+ self._on_error(str(e))
- def on_finished(self):
- self.stop_loading_cursor()
+ def _on_finished(self):
+ self._stop_loading_cursor()
QMessageBox.information(self, "Success",
f"Training process completed successfully. Suggested thresholds equals to {self.cs_uncertainty_threshold}"
)
self.thread = None
- def on_error(self, message):
+ def _on_error(self, message):
try:
- self.stop_loading_cursor()
+ self._stop_loading_cursor()
QMessageBox.critical(self, "Error", message)
except Exception as e:
print(f"Error displaying the message: {e}")
@@ -254,7 +245,7 @@ class WorkerProcessFiles(QObject):
"""
Worker class for processing files in a separate thread.
- This worker is responsible for running `process_files()` without freezing the main UI.
+ This worker is responsible for running :func:`process_files` without freezing the main UI.
It emits signals to indicate success or failure, allowing the main UI to handle errors properly.
"""
finished_signal = pyqtSignal() # Define a signal for completion
@@ -266,11 +257,13 @@ def __init__(self, TrainModelPanel=None):
def run_process_files(self):
+ """Run the :func:`process_files` and send success/failure signals to the related thread"""
try:
self.TrainModelPanel = process_files(self.TrainModelPanel)
self.finished_signal.emit() # Emit the finished signal when done
except Exception as e:
- print("fuck this")
- self.error_signal.emit(f"Error during prediction: {str(e)}")
+ error_message = str(e) # Extract only the error message
+ self.error_signal.emit(error_message) # Emit it to the GUI
+ # self.error_signal.emit(f"Error during prediction: {str(e)}")
self.TrainModelPanel.thread.quit()
diff --git a/scripts/__init__.py b/cellscanner/scripts/__init__.py
similarity index 100%
rename from scripts/__init__.py
rename to cellscanner/scripts/__init__.py
diff --git a/scripts/apply_umap.py b/cellscanner/scripts/apply_umap.py
similarity index 57%
rename from scripts/apply_umap.py
rename to cellscanner/scripts/apply_umap.py
index 636e4ba..1b7b4b0 100644
--- a/scripts/apply_umap.py
+++ b/cellscanner/scripts/apply_umap.py
@@ -1,41 +1,55 @@
+"""
+Line gating, if asked, is applied to remove entries corresponding to dead and/or debris cells.
+Remaining entries from all .fcs files are combined.
+UMAP is then applied to reduce dimensionality of the data.
+Nearest Neighbors are calculated and a filtering step is applied where only entries whose neighbours have the same label
+are kept for the training of the Neural Network step.
+"""
import os
import umap
-import fcsparser
import pandas as pd
import numpy as np
+import fcsparser
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
+
from .nn import prepare_for_training
-from .helpers import Stain, get_stains_from_panel, apply_gating
+from .helpers import Stain, apply_gating
+from .GUIhelpers import get_stains_from_panel
from .illustrations import umap_plot
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+ from .TrainingModel import TrainModelPanel # Import only for static type checkers
-def process_file(file, species_name, n_events, stain_1, stain_2, model_dir): # , scaling_constant=150
+def process_file(file: str, species_name: str, n_events: int, stain_1: Stain, stain_2: Stain, model_dir: str) -> pd.DataFrame:
"""
Processes import .fcs files by first gating (if asked) and then sampling their entries
to only keep a subset of them for the training step.
- :param file (str): Path to the .fcs file
- :param species_name (str): Name of the species corresponding to the .fcs file
- :param n_events (int): Number of entries of the .fcs file to keep for model training
- :param stain_1 (Strain): User parameters for the (live/dead) staining
- :param stain_2 (Stain): User parammeters for the (cells/debris) staining
- :param model_dir (str): Path for model-related output files to be saved
+ :param file: Path to the .fcs file
+ :param species_name: Name of the species corresponding to the .fcs file
+ :param n_events: Number of entries of the .fcs file to keep for model training
+ :param stain_1: User parameters for the (live/dead) staining
+ :param stain_2: User parameters for the (cells/debris) staining
+ :param model_dir: Path for model-related output files to be saved
- :return sampled_df: The gated (if asked) and sampled entris of the .fcs file to be used for the model training
+ :return: The gated (if asked) and sampled entries of the .fcs file to be used for the model training
"""
_, df = fcsparser.parse(file, reformat_meta=True)
if 'Time' in df.columns:
df = df.drop(columns=['Time']) # Remove Time column
- print("Processing file: ", species_name)
-
- if stain_1 is None and stain_2 is None:
- print("No gating for the training step.")
+ if stain_1.channel is None and stain_2.channel is None:
+ print(f"No gating, no further processing before the training step for file: {file}")
+ print(f"File is probably a blank. Is it? {species_name}")
else:
+ print(f"Gating file: {file}")
# Apply gating
with open(os.path.join(model_dir, "gating_input_data.txt"), "w") as f:
+
# Writing species name and original number of entries
f.write(f"species name: {species_name}\n")
f.write(f"original number of entries: {df.shape}\n")
@@ -43,21 +57,29 @@ def process_file(file, species_name, n_events, stain_1, stain_2, model_dir):
# Apply gating process
try:
gated_df, _ = apply_gating(df, stain_1, stain_2)
- except ValueError as e:
- raise ValueError(f"Error processing species {species_name}: {e}") from e
+ print(f"Gating performed fine for file: {file}")
+ except Exception as e:
+ # raise ValueError(f"Error applying gating for species {species_name}, file {file}: {e}") from e
+ raise e
# Print and write columns to the file
f.write(f"df.columns: {df.columns.tolist()}\n")
f.write(f"gated_df.columns: {gated_df.columns.tolist()}\n")
# Apply gating for stain 1 if channel is not None
- if stain_1.channel is not None:
- df = df[gated_df["dead"] == False]
+ if isinstance(stain_1, Stain) and stain_1.channel:
+ gating_condition = gated_df["cell"] == False
+ gating_condition = gating_condition.reindex(df.index, fill_value=False)
+ df = df[gating_condition]
+ # df = df[gated_df["dead"] == False]
f.write(f"number of entries after gating for stain1: {df.shape}\n")
# Apply gating for stain 2 if channel is not None
- if stain_2.channel is not None:
- df = df[gated_df["cell"] == True]
+ if isinstance(stain_2, Stain) and stain_2.channel:
+ gating_condition = gated_df["dead"] == True
+ gating_condition = gating_condition.reindex(df.index, fill_value=False)
+ df = df[gating_condition]
+ # df = df[gated_df["cell"] == True]
f.write(f"number of entries after gating for stain2: {df.shape}\n")
# Keep a subset of the entries for the training part
@@ -67,9 +89,19 @@ def process_file(file, species_name, n_events, stain_1, stain_2, model_dir):
return sampled_df
-def process_files(TrainPanel=None, **kwargs):
+def process_files(TrainPanel: "TrainModelPanel" = None, **kwargs):
"""
- Main function for UMAP application.
+ This function behaves differently depending on how it is called:
+ - When used from the GUI, it receives an instance of :class:`TrainPanel`.
+ - When used from the CLI, it receives a set of keyword arguments provided through the config.yml file.
+
+ The function applies process_file(), in both blank and monoculture .fcs files.
+ It then merges the filtered entries in a single df which transforms using the StandardScaler() to apply UMAP.
+
+ :param TrainPanel: An instance of the :class:`TrainPanel` when called from the GUI.
+ :param kwargs: User parameter settings as keyword arguments when called from the CLI.
+
+ :return: In case of the CLI, the filtered entries are returned (``cleaned_df``).
"""
gui = False
if type(TrainPanel).__name__ == "TrainModelPanel":
@@ -86,15 +118,14 @@ def process_files(TrainPanel=None, **kwargs):
"working_directory": TrainPanel.file_panel.working_directory,
}
- gating = TrainPanel.gating_checkbox.isChecked()
+ # NOTE (Haris Zafeiropoulos, 2025-03-31):
+ # If gating is not been clicked at all, stain_1 and stain_2 will be blank stains
+ stain_1, stain_2 = get_stains_from_panel(TrainPanel)
- if gating:
- stain_1, stain_2 = get_stains_from_panel(TrainPanel)
- print("Stains loaded:", stain_1)
- else:
- stain_1, stain_2 = None, None
gui = True
+
else:
+
# Read parameters from kwargs
required_keys = [
"n_events", "umap_n_neighbors", "umap_min_dist",
@@ -117,6 +148,11 @@ def process_files(TrainPanel=None, **kwargs):
model_dir = os.path.join(working_directory, "model")
os.makedirs(model_dir, exist_ok=True)
+ # Build a map between the species name and their index on the key list
+ label_map = {species_name: idx for idx, species_name in enumerate(species_files_names_dict.keys())}
+ # Add Blanks as the last ones
+ label_map['Blank'] = len(label_map)
+
# Process files for all species dynamically
all_species_dataframes = []
for species_name, species_files in species_files_names_dict.items():
@@ -132,7 +168,9 @@ def process_files(TrainPanel=None, **kwargs):
)
)
except Exception as e:
- raise Exception(f"Error while processing species file {species_name}: {e}") from e # Corrected
+ # raise Exception(f"Error while processing species file {species_name}: {e}") from e # Corrected
+ raise e
+
all_species_dataframes.append(species_dataframes)
# Process blanks
@@ -148,38 +186,36 @@ def process_files(TrainPanel=None, **kwargs):
)
)
except Exception as e:
- raise(f"Error processing blank file {blank_file}: {e}") from e
+ raise e
+ # raise(f"Error processing blank file {blank_file}: {e}") from e
# Combine data
+ print("Build unified dataframe")
combined_df = pd.concat([df for species_dataframes in all_species_dataframes for df in species_dataframes] + blank_dataframes)
+ columns_to_plot = combined_df.columns.difference(['Species']).tolist() # All column names except of those in the list
+ data_subset = combined_df[columns_to_plot].values
+ # -----------------------------------------------
+ # Implement dimensionality reduction using UMAP
+ # -----------------------------------------------
- columns_to_plot = combined_df.columns.difference(['Species']).tolist()
- data_subset = combined_df[columns_to_plot].values
+ print("Run UMAP...")
+
+ # Scale: (x - u) / s
scaled_data_subset = StandardScaler().fit_transform(data_subset)
- # Dimensionality reduction using UMAP
- print("Build UMAP reducer")
+ # Init a reducer based on user's settings
reducer = umap.UMAP(
n_components=3,
n_neighbors=umap_n_neighbors,
min_dist=umap_min_dist
)
- # Fit and transform the data
+ # Run UMAP: Fit and transform the data
embedding = reducer.fit_transform(scaled_data_subset)
- label_map = {species_name: idx for idx, species_name in enumerate(species_files_names_dict.keys())}
- label_map['Blank'] = len(label_map)
mapped_labels = combined_df['Species'].map(label_map).values
- # Plot UMAP before filtering
- umap_plot(combined_df, embedding, model_dir, "Before", None)
-
- # -----------------------
- # Call nn basic class
- # -----------------------
-
# Nearest Neighbors filtering
print("Instantiate the Nearest Neighbors Model")
nn = NearestNeighbors(n_neighbors=50)
@@ -189,24 +225,33 @@ def process_files(TrainPanel=None, **kwargs):
print("Find Nearest Neighbors")
_, indices = nn.kneighbors(embedding)
- indices_to_keep = []
+ # Entries are parsed and indices of those that the one of the following cases applies are kept:
+ # If the point is NOT "Blank", keeps it only if enough neighbors have the same label (nonblank_threshold).
+ # If the point IS "Blank", keeps it only if enough neighbors are also "Blank" (blank_threshold).
+ indices_to_keep = []
for i in range(len(embedding)):
neighbor_labels = mapped_labels[indices[i][1:]] if len(indices[i]) > 1 else []
if mapped_labels[i] != label_map['Blank']:
- non_blank_neighbors = np.sum(neighbor_labels == mapped_labels[i])
- if non_blank_neighbors >= nonblank_threshold:
+ nonblank_neighbors = np.sum(neighbor_labels == mapped_labels[i])
+ if nonblank_neighbors >= nonblank_threshold:
indices_to_keep.append(i)
else:
blank_neighbors = np.sum(neighbor_labels == label_map['Blank'])
if blank_neighbors >= blank_threshold:
indices_to_keep.append(i)
+ # Only entries kept after NN filtering are kept to be used for the training of the NN model
cleaned_data = combined_df.iloc[indices_to_keep]
+ # Plot UMAP before filtering
+ umap_plot(combined_df, embedding, model_dir, "Before", None)
+
# Plot UMAP after filtering
umap_plot(cleaned_data, embedding, model_dir, "After", indices_to_keep)
print("Data processing and UMAP filtering successful.")
+
+ # Exit function based on interface
if gui:
TrainPanel.cleaned_data = cleaned_data
prepare_for_training(TrainPanel)
diff --git a/cellscanner/scripts/helpers.py b/cellscanner/scripts/helpers.py
new file mode 100644
index 0000000..2d04223
--- /dev/null
+++ b/cellscanner/scripts/helpers.py
@@ -0,0 +1,321 @@
+"""
+Helper functions to support CellScanner main tasks.
+"""
+
+import os, sys
+from datetime import datetime
+from dataclasses import dataclass
+from typing import Optional
+import numpy as np
+import pandas as pd
+from .illustrations import gating_plot
+
+NOT_APPLICABLE = "Not applicable"
+
+
+@dataclass
+class Stain:
+ channel: str
+ sign: str
+ value: float
+ label: Optional[str] = None
+
+
+def get_app_dir():
+ """Get absolute path relative to the executable location."""
+ if hasattr(sys, '_MEIPASS'):
+ base_path = sys._MEIPASS
+ else:
+ base_path = os.path.dirname(os.path.abspath(__file__))
+ return base_path
+
+
+def get_abs_path(relative_path):
+ """Get absolute path to a resource, relative to the base directory."""
+ return os.path.join(get_app_dir(), relative_path)
+
+
+def time_based_dir(prefix, base_path, multiple_cocultures=False):
+ timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M")
+ time_dir_name = "_".join([prefix, timestamp])
+ if os.getcwd() == "/app":
+ time_dir = os.path.join("/media", time_dir_name)
+ else:
+ time_dir = os.path.join(base_path, time_dir_name)
+ if os.path.exists(time_dir) and multiple_cocultures is False:
+ base, ext = os.path.splitext(time_dir)
+ counter = 1
+ while os.path.exists(f"{base}_{counter}{ext}"):
+ counter += 1
+ time_dir = f"{base}_{counter}{ext}"
+ return time_dir
+
+
+def load_model_from_files(trained_model_dir):
+ """
+ Loads a previously trained model from CellScanner.
+
+ :param trained_model_dir: Path to the directory with the previously trained model files. Under trained_model_dir,
+ the user needs to make sure there are all three following files: "trained_model.keras", "scaler.pkl", "label_encoder.pkl"
+ :raises ValueError: If any of the 3 required files is missing.
+ """
+ print("Loading model from files")
+ from tensorflow.keras.models import load_model
+ import joblib
+
+ modelfiles = ["trained_model.keras", "scaler.pkl", "label_encoder.pkl"]
+ model_path, scaler_path, le_path = [os.path.join(trained_model_dir, x) for x in modelfiles]
+
+ try:
+ model = load_model(model_path)
+ scaler = joblib.load(scaler_path)
+ label_encoder = joblib.load(le_path)
+ return model, scaler, label_encoder
+
+ except Exception as e:
+ print(f"Error loading model or preprocessing objects: {e}")
+ raise ValueError(f"No valid model directory. Check whether all 3 required files are there and valid.")
+
+
+def create_file_path(output_dir, sample, name, extension):
+ """Helper function to create file paths."""
+ if sample:
+ return os.path.join(output_dir, f"{sample}_{name}.{extension}")
+ return os.path.join(output_dir, f"{name}.{extension}")
+
+
+def get_channels(channels_df):
+ """
+ :param channels_df: A pd.DataFrame returned as part of the fcsparser loading function
+
+ """
+ channels_df["long_channel"] = channels_df.apply(
+ lambda row: f"{row['$PnN']} [{row['$PnS']}]"
+ if row["$PnN"] != row["$PnS"]
+ else row["$PnN"],
+ axis=1
+ )
+ channels = set(channels_df["long_channel"])
+ return channels
+
+
+def stain_sannity_check(df, label, channel, sign, threshold):
+ """
+ Checks if gating applied for a stain returns both True and False cases.
+ If not, raises an error so the user refines their thresholds.
+ """
+ counts = df[label].value_counts()
+ if True not in counts.index or False not in counts.index:
+ stain_min, stain_max = np.min(df[channel]), np.max(df[channel])
+ raise ValueError(
+ f"Invalid gating. Please check the gating thresholds."
+ f"Stain {channel} ranges between {stain_min} and {stain_max}, while current gating thresholds are {sign} {threshold}."
+ )
+
+
+def apply_gating(data_df: pd.DataFrame,
+ stain1: Stain = None,
+ stain2: Stain = None,
+ extra_stains: dict =None
+ ):
+ """
+ Applies line gating to a dataset based on fluorescence or marker intensity thresholds.
+
+ This function evaluates whether the values of specified stains (channels) in the dataset
+ meet their respective thresholds. It assigns labels (True or False) based on the gating
+ criteria.
+
+ Args:
+ data_df (:class:`pandas.DataFrame`): The dataframe containing intensity values for different stains.
+ stain1 (Stain, optional): An instance of the :class:`Stain` class representing the first stain used for gating.
+ stain2 (Stain, optional): An instance of the :class:`Stain` class representing the second stain used for gating.
+ extra_stains (Dict, optional): A dictionary with the label of a stain as key, and its channel, sign, and threshold as their value for multi-channel gating.
+
+ Returns:
+ pandas.DataFrame: A dataframe with the gating results, where each row is labeled as True or False.
+ list: A list of labels assigned to each row based on the applied gating criteria.
+
+ Example:
+ ```
+ gated_df, labels = apply_gating(data, stain1=Stain("CD3", threshold=500), stain2=Stain("CD19", threshold=200))
+ ```
+ """
+
+ all_labels = []
+
+ # Copy the DataFrame to not change the original data
+ gated_data_df = data_df.copy()
+
+ # Temporarily remove the 'predictions' column to avoid issues with numeric operations -- irrelevant in TRAINING
+ predictions_column = gated_data_df.pop('predictions') if 'predictions' in gated_data_df.columns else None
+
+ # Reintegrate the 'predictions' column after the arcsinh transformation
+ if predictions_column is not None:
+ gated_data_df['predictions'] = predictions_column
+
+ if stain1.channel is not None:
+ """ STAIN FOR CELLS / DEBRIS (sybr green) """
+ if stain1.channel is not None and stain1.channel != NOT_APPLICABLE:
+
+ # Initialize the 'state' column with 'not dead'
+ gated_data_df['cell'] = False
+
+ # Apply gating based on the first stain (live/dead)
+ if stain1.sign in ['>', 'greater_than']:
+ gated_data_df.loc[gated_data_df[stain1.channel] > stain1.value, 'cell'] = True
+
+ elif stain1.sign in ['<', 'less_than']:
+ gated_data_df.loc[gated_data_df[stain1.channel] < stain1.value, 'cell'] = True
+
+ # Sanity check
+ try:
+ stain_sannity_check(gated_data_df, "cell", stain1.channel, stain1.sign, stain1.value)
+ all_labels.append("cell")
+
+ except Exception as e:
+ raise ValueError(f"Gating failed for stain1: {stain1.channel}") from e # Preserve original traceback
+
+ if stain2.channel is not None:
+ """ STAIN FOR LIVE / DEAD (PI) """
+ if stain2.channel is not None and stain2.channel != NOT_APPLICABLE:
+
+ # Initialize the 'state' column with 'not dead'
+ gated_data_df['dead'] = False
+
+ # Apply gating based on the first stain (live/dead)
+ if stain2.sign in ['>', 'greater_than']:
+ gated_data_df.loc[gated_data_df[stain2.channel] > stain2.value, 'dead'] = True
+
+ elif stain2.sign in ['<', 'less_than']:
+ gated_data_df.loc[gated_data_df[stain2.channel] < stain2.value, 'dead'] = True
+
+ # Sanity check
+ try:
+ stain_sannity_check(gated_data_df, "dead", stain2.channel, stain2.sign, stain2.value)
+ all_labels.append("dead")
+
+ except Exception as e:
+ raise ValueError(f"Sanity check failed for stain2: {stain2.channel}") from e # Preserve original traceback
+
+ # Apply gating on extra stains
+ if extra_stains is not None:
+ for channel, details in extra_stains.items():
+ sign, threshold, label = details
+ # Create the comparison operator dynamically
+ condition = gated_data_df[channel] > threshold if sign == ">" else gated_data_df[channel] < threshold
+ gated_data_df[label] = condition
+ try:
+ stain_sannity_check(gated_data_df, label, channel, sign, threshold)
+ except ValueError as e:
+ raise ValueError(f"Gating failed for extra stain: {e}") from e # Preserve original traceback
+
+ all_labels.append(label)
+
+ return gated_data_df, all_labels
+
+
+def save_gating_results(gated_data_df, output_dir, sample, x_axis, y_axis, z_axis, all_labels):
+ """
+ Counts entries in a dataframe for each of the labels in all_labels and exports those in a csv file.
+ It then calls for the :func:`gating_plot` function to export relative visual components.
+
+ :param gated_data_df: A dataframe with a 'predictions' column with the species predicted name label columns (True/False) to count
+ :param output_dir: Path where results will be saved
+ :param sample: Sample name
+ :param x_axis: Name of the X-axis to be plotted (channel among those on the .fcs file)
+ :param y_axis: Name of the Y-axis to be plotted (channel among those on the .fcs file)
+ :param z_axis: Name of the Z-axis to be plotted (channel among those on the .fcs file)
+ :param all_labels: A list with all the labels for the the stains provided
+ """
+
+ # Create a directory for gating results
+ gated_dir = os.path.join(output_dir, 'gated')
+ os.makedirs(gated_dir, exist_ok=True)
+
+ # Iterate over each species and calculate the state counts
+ species_names = list(gated_data_df['predictions'].unique())
+
+ gated_data_df.to_csv(
+ os.path.join(gated_dir, "_".join([sample, 'raw', 'gating.csv']))
+ )
+ print(
+ f"File with gating raw findings for sample {sample} saved at:\n",
+ os.path.join(gated_dir, "_".join([sample, 'raw', 'gating.csv']))
+ )
+ # Plot status if both stains provided
+ gating_plot(gated_data_df, species_names, x_axis, y_axis, z_axis, gated_dir, sample, all_labels)
+ print("3D scatter plot for gated data saved to:", gated_dir)
+
+
+def merge_prediction_results(output_dir, prediction_type):
+ """
+ Merge prediction and uncertainty output files into a single file for each case when multiple coculture files are provided.
+
+ :param output_dir: Output directory where CellScanner prediction files were saved
+ :param prediction_type: Type of CellScanner output file; 'prediction' (counts) or 'uncertainty' (heterogeneity)
+ """
+
+ if prediction_type not in ["prediction", "uncertainty"]:
+ raise ValueError("Please provide a valid prediction_type: 'prediction|uncertainty'")
+
+ if prediction_type == "prediction":
+
+ patterns = ["_".join([prediction_type, "counts"]), "state_counts", "dead_counts", "cell_counts" ]
+
+ # Loop through all files in the directory
+ dfs = []
+ for file_name in os.listdir(output_dir):
+
+ matched_pattern = next((pattern for pattern in patterns if pattern in file_name), None)
+ if matched_pattern is None:
+ continue # Skip files that don't match any pattern
+
+ # Read each file as a DataFrame
+ file_path = os.path.join(output_dir, file_name)
+ df = pd.read_csv(file_path, index_col = 0) # Make sure you keep first column as index of the dataframe
+
+ # Name the "count" column based on the filename (without extension)
+ new_column_name = file_name.split(matched_pattern)[0][:-1]
+ df.columns = [new_column_name]
+ dfs.append(df)
+
+ pattern = matched_pattern
+
+ # Merge all DataFrames on the "predictions" column
+ result = pd.concat(dfs, axis=1)
+ result = result.dropna(how='all')
+ unknowns = [x for x in result.index if "Unknown" in x]
+ if len(unknowns) > 0:
+ sum_unknowns = result.loc[unknowns].sum()
+ result = result.drop(index=unknowns)
+ result.loc["Unknown"] = sum_unknowns
+ else:
+
+ pattern = "heterogeneity_results"
+ output_dir = os.path.join(output_dir, "heterogeneity_results")
+
+ # Loop through all files in the directory
+ dfs = []
+ for file_name in os.listdir(output_dir):
+ if pattern not in file_name:
+ continue
+ file_path = os.path.join(output_dir, file_name)
+
+ # Read each file as a DataFrame
+ df = pd.read_csv(file_path, sep=",") # Adjust separator if needed
+
+ # Rename the "count" column to the filename (without extension)
+ new_column_name = file_name.split(pattern)[0][:-1]
+ df = df.rename(columns={"count": new_column_name})
+ dfs.append(df)
+
+ # Merge all DataFrames on the "predictions" column
+ result = pd.concat(dfs, axis=1).loc[:,~pd.concat(dfs, axis=1).columns.duplicated()]
+
+ # Save the final result to a CSV file
+ try:
+ merged_filename = "".join(["merged_", pattern, ".csv"])
+ merged_file = os.path.join(output_dir, merged_filename)
+ result.to_csv(merged_file, index=True)
+ except:
+ print("No merging case. Please go through the output files of each sample.")
diff --git a/scripts/illustrations.py b/cellscanner/scripts/illustrations.py
similarity index 100%
rename from scripts/illustrations.py
rename to cellscanner/scripts/illustrations.py
diff --git a/scripts/nn.py b/cellscanner/scripts/nn.py
similarity index 96%
rename from scripts/nn.py
rename to cellscanner/scripts/nn.py
index 615cbf7..dae045c 100644
--- a/scripts/nn.py
+++ b/cellscanner/scripts/nn.py
@@ -173,11 +173,13 @@ def prepare_for_training(TrainPanel=None, **kwargs):
X_arcsinh = np.arcsinh(X / scaling_constant)
# 4. Scaling
+ # The standard score of a sample `x` :
+ # z = (x - u) / s
+ # where `u` is the mean of the training samples and `s` is the standard deviation of the training samples.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_arcsinh)
-
- # (Skipping PCA, so X_whitened = X_scaled)
- X_whitened = X_scaled
+ # NOTE: Skipping PCA, so X_whitened in previous implementations, is now replaced by X_scaled
+ # X_whitened = X_scaled
# Save scaler for future use/prediction
model_dir = os.path.join(working_directory, "model") # get_abs_path('model/statistics')
@@ -193,16 +195,17 @@ def prepare_for_training(TrainPanel=None, **kwargs):
if gui:
# Store entire dataset
- TrainPanel.X = X_whitened
+ TrainPanel.X = X_scaled
TrainPanel.y = y_categorical
TrainPanel.scaler = scaler
TrainPanel.le = le
print("Success: Data preparation done.")
+ # In GUI, call the main function for training the Neural Network.
train_neural_network(TrainPanel)
else:
- return X_whitened, y_categorical, scaler, le
+ return X_scaled, y_categorical, scaler, le
def build_model(input_dim, num_classes):
diff --git a/scripts/run_prediction.py b/cellscanner/scripts/run_prediction.py
similarity index 67%
rename from scripts/run_prediction.py
rename to cellscanner/scripts/run_prediction.py
index cc61350..99375e1 100644
--- a/scripts/run_prediction.py
+++ b/cellscanner/scripts/run_prediction.py
@@ -2,19 +2,20 @@
import math
import numpy as np
import pandas as pd
-from typing import List
+from scipy.stats import entropy
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import pairwise_distances
-from scipy.stats import entropy
-from .helpers import create_file_path, get_stains_from_panel, apply_gating, save_gating_results
+from .helpers import create_file_path, apply_gating, save_gating_results
+from .GUIhelpers import get_stains_from_panel, extact_channel
from .illustrations import species_plot, uncertainty_plot, heterogeneity_pie_chart, heterogeneity_bar_plot, create_color_map
# Main function to be called from the worker
def predict(PredictionPanel=None, **kwargs):
"""
- Runs
+ Executes the prediction workflow, including loading models,
+ predicting species, applying gating, performing heterogeneity analysis, and generating visualizations.
:param PredictionPanel:
:param kwargs:
@@ -23,9 +24,6 @@ def predict(PredictionPanel=None, **kwargs):
if type(PredictionPanel).__name__ == "PredictionPanel":
-
- print(PredictionPanel.__dict__)
-
# Attempt to retrieve components from file_panel first
model, scaler, label_encoder, scaling_constant = get_model_components(PredictionPanel.file_panel)
@@ -36,9 +34,10 @@ def predict(PredictionPanel=None, **kwargs):
data_df = PredictionPanel.data_df
output_dir = PredictionPanel.predict_dir
- x_axis_combo = PredictionPanel.x_axis_selector.combo.currentText()
- y_axis_combo = PredictionPanel.y_axis_selector.combo.currentText()
- z_axis_combo = PredictionPanel.z_axis_selector.combo.currentText()
+ x_axis_combo = extact_channel(PredictionPanel.x_axis_selector.combo.currentText())
+ y_axis_combo = extact_channel(PredictionPanel.y_axis_selector.combo.currentText())
+ z_axis_combo = extact_channel(PredictionPanel.z_axis_selector.combo.currentText())
+
gating = PredictionPanel.gating_checkbox.isChecked()
sample = PredictionPanel.sample
@@ -118,22 +117,11 @@ def predict(PredictionPanel=None, **kwargs):
if filter_out_uncertain:
data_df_pred.loc[data_df_pred["uncertainties"] > uncertainty_threshold, "predictions"] = "Unknown"
- # Save prediction results and plot the 3D scatter plot
- species_list = list(index_to_species.values())
- save_prediction_results(
- data_df_pred,
- species_list,
- output_dir,
- x_axis_combo, y_axis_combo, z_axis_combo,
- sample=sample,
- scaling_constant=scaling_constant,
- uncertainty_threshold=uncertainty_threshold,
- filter_out_uncertain=filter_out_uncertain
- )
-
# Gating -- may return a "state" column mentioning live - dead cells, it may not
if gating:
+ print("Run gating...")
+
# Apply gating
gating_df, all_labels = apply_gating(
data_df_pred,
@@ -149,23 +137,42 @@ def predict(PredictionPanel=None, **kwargs):
)
# Perform heterogeneity analysis
hetero_df = gating_df.drop(columns=all_labels)
-
+ data_df_pred = gating_df.copy()
else:
hetero_df = data_df_pred.copy()
# Calculate heterogeneity
+ species_list = list(index_to_species.values())
run_heterogeneity(hetero_df, species_list, output_dir, sample)
+ # Save prediction results and plot the 3D scatter plot
+ save_prediction_results(
+ data_df_pred,
+ species_list,
+ output_dir,
+ x_axis_combo, y_axis_combo, z_axis_combo,
+ sample=sample,
+ scaling_constant=scaling_constant,
+ uncertainty_threshold=uncertainty_threshold,
+ filter_out_uncertain=filter_out_uncertain,
+ all_labels = all_labels if 'all_labels' in locals() else None
+ )
+
if not gui:
return data_df_pred
# Functions to be used by the predict()
-def predict_species(data_df, model, scaler, label_encoder, scaling_constant):
+def predict_species(data_df: pd.DataFrame,
+ model: "tensorflow.keras.Sequential",
+ scaler: "sklearn.preprocessing.StandardScaler",
+ label_encoder: "sklearn.preprocessing.LabelEncoder",
+ scaling_constant: int
+ ):
"""
-
:param data_df:
:param model:
+ :type model: `keras.models.Sequential` or `Sequential API documentation `
:param scaler:
:param label_encoder:
:param scaling_constant:
@@ -201,16 +208,17 @@ def predict_species(data_df, model, scaler, label_encoder, scaling_constant):
def save_prediction_results(
- data_df: pd.DataFrame, species_list: List,
+ data_df: pd.DataFrame, species_list: list,
output_dir: str,
- x_axis, y_axis, z_axis,
+ x_axis: str, y_axis: str, z_axis: str,
sample: str = None,
scaling_constant: int = 150,
uncertainty_threshold: float = 0.5,
- filter_out_uncertain: bool = False
+ filter_out_uncertain: bool = False,
+ all_labels: list = None
):
"""
- Saves prediction file for a coculture CellScanner prediction along with its corresponding species and uncertainty plolts.
+ Saves prediction file for a coculture sample CellScanner prediction along with its corresponding species and uncertainty plots.
"""
# Ensure `data_df` is still a DataFrame and not an ndarray
if not isinstance(data_df, pd.DataFrame):
@@ -221,16 +229,15 @@ def save_prediction_results(
outfile_prediction_counts = create_file_path(output_dir, sample, 'prediction_counts', 'csv')
plot_path_species = create_file_path(output_dir, sample, '3D_coculture_predictions_species', 'html')
- # Save predictions and prediction counts to a CSV file
+ # Save raw predictions and prediction counts to a CSV file
data_df.to_csv(outfile_predictions)
- prediction_counts = data_df['predictions'].value_counts()
- prediction_counts.to_csv(outfile_prediction_counts)
- print("Prediction counts saved to:", outfile_prediction_counts)
# Calculate and save uncertainty counts by species
if filter_out_uncertain:
- outfile_uncertainties = create_file_path(output_dir, sample, 'uncertainty_counts', 'csv')
- plot_path_uncertainty = create_file_path(output_dir, sample, '3D_coculture_predictions_uncertainty', 'html')
+ uncertaint_dir = os.path.join(output_dir, "uncertainty_counts")
+ os.makedirs(uncertaint_dir, exist_ok=True)
+ outfile_uncertainties = create_file_path(uncertaint_dir, sample, 'uncertainty_counts', 'csv')
+ plot_path_uncertainty = create_file_path(uncertaint_dir, sample, '3D_coculture_predictions_uncertainty', 'html')
uncertainty_counts = data_df.groupby('predictions')['uncertainties'].agg(
greater_than=lambda x: (x > uncertainty_threshold).sum(),
less_than=lambda x: (x <= uncertainty_threshold).sum()
@@ -238,6 +245,84 @@ def save_prediction_results(
uncertainty_counts.to_csv(outfile_uncertainties)
print("Uncertainty counts by species saved to:", outfile_uncertainties)
+ # ===============
+ # Build prediction output file
+ # NOTE: Check on PR #28 for output descriptions
+ # ===============
+ df = data_df.copy()
+ counts_df = pd.DataFrame(columns=["count"]) # NOTE: pandas value_counts() returns its counts under a column named "count", so we match that name here
+ species_names = list(df['predictions'].unique())
+
+ # Uncertainty higher in hierarchy than all
+ if "Unknown" in species_names:
+
+ # Add counts to counts_df
+ unknown_df = df[df["predictions"]== "Unknown"]
+ counts_df.loc["Unknown"] = {"count": unknown_df.shape[0]}
+
+ # Remove from gate df
+ df = df[df["predictions"] != "Unknown"]
+ species_names.remove("Unknown")
+
+ # If both basic stains there, cell/debris precedes
+ if {"cell", "dead"}.issubset(df.columns):
+
+ for species in species_names:
+ # NOTE: If the `cell` column is False, the entry is classified as debris
+ sp_debris = df[(df['predictions'] == species) & (df["cell"] == False)]
+ counts_df.loc["_".join([species, "debris"])] = {"count": sp_debris.shape[0]}
+
+ # Remove species debris from working df
+ df = df[~df.index.isin(sp_debris.index)]
+
+ # Count how many dead/live from the remaining
+ # NOTE: If the `dead` column is True, the entry is classified as dead
+ sp_dead = df[df['predictions'] == species]["dead"].value_counts()
+ counts_df.loc["_".join([species, "live"])] = sp_dead.get(False, None) if False in sp_dead else print(f"Species: {species} has no live entries")
+ counts_df.loc["_".join([species, "dead"])] = sp_dead.get(True, None) if True in sp_dead else print(f"Species: {species} has no dead entries")
+
+ elif "cell" in df.columns:
+
+ for species in species_names:
+ sp_debris = df[df['predictions'] == species]["cell"].value_counts()
+ counts_df.loc["_".join([species, "debris"])] = sp_debris.get(False, None) if False in sp_debris else print(f"Species: {species} has no debris entries")
+ counts_df.loc[species] = sp_debris.get(True, None) if True in sp_debris else print(f"Species: {species} has no entries not being debris.")
+
+ elif "dead" in df.columns:
+
+ for species in species_names:
+
+ # Count how many dead/live from the remaining
+ sp_dead = df[df['predictions'] == species]["dead"].value_counts()
+ counts_df.loc["_".join([species, "live"])] = sp_dead.get(False, None) if False in sp_dead else print(f"Species: {species} has no live entries")
+ counts_df.loc["_".join([species, "dead"])] = sp_dead.get(True, None) if True in sp_dead else print(f"Species: {species} has no dead entries")
+ else:
+
+ if all_labels is not None:
+ for label in all_labels:
+ for species in species_names:
+ sp_df = df[df['predictions'] == species][label].value_counts()
+ counts_df.loc["_".join([species, "not", label])] = (
+ sp_df.get(False, None)
+ if False in sp_df
+ else print(f"Species: {species} has no entries of not being {label}")
+ )
+ counts_df.loc["_".join([species, label])] = (
+ sp_df.get(True, None)
+ if True in sp_df
+ else print(f"Species: {species} has no dead entries of {label}")
+ )
+ print("ATTENTION! In this case the number of entries is not in line with the entries on the .fcs file")
+ else:
+ print("Basic case where no gating - no stains at all.")
+ counts_df = pd.concat([counts_df, df['predictions'].value_counts()])
+
+ counts_df.to_csv(outfile_prediction_counts)
+
+ # ----------------
+ # PLOTS
+ # ----------------
+
# Perform arcsinh transformation on numeric columns
coculture_data_numeric = data_df.drop(columns=['predictions', 'uncertainties'])
coculture_data_arcsin = np.arcsinh(coculture_data_numeric / scaling_constant)
@@ -265,7 +350,7 @@ def save_prediction_results(
)
-def run_heterogeneity(df, species_list, output_dir, sample):
+def run_heterogeneity(df: pd.DataFrame, species_list: list, output_dir: str, sample: str):
"""
Calculate, plot and export to files heterogeneity metrics.
@@ -279,13 +364,13 @@ def run_heterogeneity(df, species_list, output_dir, sample):
os.makedirs(heterogeneity_dir, exist_ok=True)
hetero_df = df.select_dtypes(include='number')
- hetero_df.drop('uncertainties', axis=1, inplace=True)
+ try:
+ hetero_df.drop('uncertainties', axis=1, inplace=True)
+ except:
+ print("No uncertainties; dropped in gating_df.drop(columns=all_labels).")
+ pass
hetero_df['predictions'] = df['predictions']
- print("\n\n>>>>>>>>>>>>>>>>>>>>>>>>>>")
- print(hetero_df.columns)
- print(">>>>>>>>>>>>>>>>>>>>>>>>>>\n\n")
-
# Compute heterogeneity measures for the sample
try:
hetero1 = hetero_simple(hetero_df.iloc[:, :-1])
@@ -293,11 +378,6 @@ def run_heterogeneity(df, species_list, output_dir, sample):
except ValueError as e:
raise ValueError("Error calculating heterogeneity.") from e
- # Create and save heterogeneity plots
- save_heterogeneity_plots(hetero1, hetero2, heterogeneity_dir, sample)
- res_file = "_".join([sample, "heterogeneity_results.csv"])
- hetero_res_file = os.path.join(heterogeneity_dir, res_file)
-
# Compute heterogeneity metrics for each species
hetero_results_list = []
for species in species_list:
@@ -309,15 +389,13 @@ def run_heterogeneity(df, species_list, output_dir, sample):
hetero1 = hetero_simple(df.iloc[:, :-1])
hetero2 = hetero_mini_batch(df.iloc[:, :-1], species)
- # Build heterogeneity plots
- save_heterogeneity_plots(hetero1, hetero2, heterogeneity_dir, sample, species)
-
# Append the result as a dictionary to the list
hetero_results_list.append({
"Species": species,
"Simple Heterogeneity": hetero1,
"Medoid Heterogeneity": hetero2
})
+
except ValueError as e:
raise ValueError("Error calculating heterogeneity.") from e
@@ -325,16 +403,22 @@ def run_heterogeneity(df, species_list, output_dir, sample):
hetero_results_df = pd.DataFrame(hetero_results_list)
# Save the DataFrame to a CSV file
+ res_file = "_".join([sample, "heterogeneity_results.csv"])
+ hetero_res_file = os.path.join(heterogeneity_dir, res_file)
hetero_results_df.to_csv(hetero_res_file, sep="\t", index=False)
-def hetero_simple(data):
- """Calculate simple heterogeneity as the sum of mean ranges across all channels."""
+def hetero_simple(data: pd.DataFrame):
+ """
+ Calculate simple heterogeneity as the sum of mean ranges across all channels.
+
+ :return: numpy.int64
+ """
ranges = data.apply(np.ptp, axis=0)
return np.sum(ranges.mean())
-def hetero_mini_batch(data, species=None, type='av_diss'):
+def hetero_mini_batch(data: pd.DataFrame, species: str=None, type='av_diss'):
"""
Uses a variant of K-Means clustering that is faster and more memory-efficient asking for a single cluster
similar to computing the mean (or geometric center) of all points.
@@ -345,7 +429,7 @@ def hetero_mini_batch(data, species=None, type='av_diss'):
:param data:
:param species:
:param type:
- :return result: Maximum distance between the centroid and data points
+ :return result: numpy.float64 - Maximum distance between the centroid and data points
"""
# Use MiniBatchKMeans as an alternative
if data.shape[0] == 0:
@@ -368,8 +452,9 @@ def hetero_mini_batch(data, species=None, type='av_diss'):
return result
-def save_heterogeneity_plots(hetero1, hetero2, output_dir, sample, species = None):
+def _save_heterogeneity_plots(hetero1, hetero2, output_dir, sample, species = None):
"""
+ NOTE: DEPRECATED
Exports html heterogeneity pie chart and bar plot
"""
# Values corresponding to each measure
@@ -388,44 +473,6 @@ def save_heterogeneity_plots(hetero1, hetero2, output_dir, sample, species = Non
heterogeneity_bar_plot(labels, metrics_data, colors, output_dir, sample, species, plot_width, plot_height)
-def merge_prediction_results(output_dir, prediction_type):
- """
- Merge prediction and uncertainty output files into a single file for each case when multiple coculture files are provided.
-
- :param output_dir: Output directory where CellScanner prediction files were saved
- :param prediction_type: TYpe of CellScanner output file; `prediction` (counts) or `uncertainty` (heretogeneity)
- """
- if prediction_type not in ["prediction", "uncertainty"]:
- raise ValueError(f"Please provide a valide prediction_type: 'prediction|uncertainty'")
-
- if prediction_type == "prediction":
- pattern = "_".join([prediction_type, "counts"])
- else:
- pattern = "heterogeneity_results"
- output_dir = os.path.join(output_dir, "heterogeneity_results")
-
- # Loop through all files in the directory
- dfs = []
- for file_name in os.listdir(output_dir):
- if pattern not in file_name:
- continue
- file_path = os.path.join(output_dir, file_name)
- # Read each file as a DataFrame
- df = pd.read_csv(file_path, sep=",") # Adjust separator if needed
- # Rename the "count" column to the filename (without extension)
- new_column_name = file_name.split(pattern)[0][:-1]
- df = df.rename(columns={"count": new_column_name})
- dfs.append(df)
-
- # Merge all DataFrames on the "predictions" column
- result = pd.concat(dfs, axis=1).loc[:,~pd.concat(dfs, axis=1).columns.duplicated()]
-
- # Save the final result to a CSV file
- merged_filename = "".join(["merged_", pattern, ".csv"])
- merged_file = os.path.join(output_dir, merged_filename)
- result.to_csv(merged_file, index=False)
-
-
def get_model_components(panel):
"""
Helper function to retrieve model, scaler, label encoder, and scaling constant from a panel.
diff --git a/config.yml b/config.yml
index ecebb54..14d574b 100644
--- a/config.yml
+++ b/config.yml
@@ -6,7 +6,7 @@
# Import data
# ---------------------
output_directory:
- path: CellScanner/cs_output
+ path: Testfiles/cs_output
required: false
description: >
Directory where CellScanner will save intermediate files and findings.
@@ -18,7 +18,7 @@ output_directory:
blank_files:
directories:
- directory:
- path: CellScanner/CS2TutorialFiles
+ path: Testfiles/CS2TutorialFiles
filenames:
- 01-t12_d50_wc_btri_control-H7.fcs
- 01-t12_d50_wc_btri_control-H8.fcs
@@ -34,7 +34,7 @@ blank_files:
species_files:
directories:
- directory:
- path: CellScanner/CS2TutorialFiles
+ path: Testfiles/CS2TutorialFiles
filenames:
- 01-t12_d50_wc_btA-D1.fcs
- 01-t12_d50_wc_btB-D2.fcs
@@ -60,7 +60,7 @@ species_files:
coculture_files:
directories:
- directory:
- path: CellScanner/CS2TutorialFiles
+ path: Testfiles/CS2TutorialFiles
filenames:
- 01-t12_d50_wc_btriA-D7.fcs
- 01-t12_d50_wc_btriB-D8.fcs
@@ -73,7 +73,7 @@ coculture_files:
You can use both relative paths and paths based on the home directory (~).
prev_trained_model:
- path:
+ path: Testfiles/cs_output/model
required: false
default:
description: >
@@ -205,8 +205,8 @@ z_axis:
type: str
filter_out_uncertain:
- value: false
- threshold: 0.5
+ value: true
+ threshold: 0.6
required: false
default: false
description: >
@@ -225,13 +225,6 @@ filter_out_uncertain:
# ---------------------
# Gating parameters
# ---------------------
-gating:
- value: true
- required: false
- default: false
- description: >
- Set whether you wish data to be gated or not.
- type: bool
# ------------------------- For all stains -------------------------
#
@@ -257,10 +250,28 @@ gating:
# label:
#
+gating:
+ value: true
+ required: false
+ default: false
+ description: >
+ Set whether you wish data to be gated or not.
+ type: bool
+
stain1_train:
- channel: FITC-A
- sign: greater_than
- value: 2000000
+ channel:
+ sign:
+ value:
+ description: >
+ This stain specifically describes total cells;
+ all events where the threshold is met will be classified as `cells`. The rest of the events will be classified as `debris`.
+ required: false
+ type: Stain
+
+stain2_train:
+ channel:
+ sign:
+ value:
required: false
sign_values:
- greater_than
@@ -270,34 +281,27 @@ stain1_train:
Any event meeting the threshold will be classified as `dead`.
type: Stain
-stain2_train:
- channel: PerCP-H
- sign: greater_than
- value: 500000
- description: >
- This stain specifically describes total cells;
- all events where the threshold is met will be classified as `cells`. The rest of the events will be classified as `debris`.
- required: false
- type: Stain
-
+# DNA stain: cells - debris ("$LABEL_debris";) SYBR Green -- FITC-A
stain1_predict:
channel: FITC-A
sign: greater_than
- value: 2000000
+ value: 500000
required: false
description: >
- This stain specifically marks dead cells;
- Any event meeting the threshold will be classified as `dead`.
+ This stain specifically describes total cells.
+ All events meeting the threshold will be classified as `cell`. The rest of the events will be classified as `debris`.
type: Stain
+# Dead/live stain: dead - live ("$LABEL_dead") PI -- PerCP-H
stain2_predict:
channel: PerCP-H
sign: greater_than
- value: 500000
+ value: 2000000
required: false
description: >
- This stain specifically describes total cells;
- all events where the threshold is met will be classified as `cells`. The rest of the events will be classified as `debris`.
+ This stain specifically marks dead cells.
+ All events meeting the threshold will be classified as `dead`. The rest will be classified as `live`, but be careful!
+ You need to combine this with stain1 to make sure you have living cells.
type: Stain
extra_stains:
diff --git a/docs/_static/Run_prediction_step.png b/docs/_static/Run_prediction_step.png
index edc1ff3..f3b9f09 100644
Binary files a/docs/_static/Run_prediction_step.png and b/docs/_static/Run_prediction_step.png differ
diff --git a/docs/_static/Train_model_step.png b/docs/_static/Train_model_step.png
index 5305508..b4759bf 100644
Binary files a/docs/_static/Train_model_step.png and b/docs/_static/Train_model_step.png differ
diff --git a/docs/_static/cell_count_plot.png b/docs/_static/cell_count_plot.png
new file mode 100644
index 0000000..4b3ba6f
Binary files /dev/null and b/docs/_static/cell_count_plot.png differ
diff --git a/docs/_static/line_gating.png b/docs/_static/line_gating.png
index 58389f5..bb8ef19 100644
Binary files a/docs/_static/line_gating.png and b/docs/_static/line_gating.png differ
diff --git a/docs/_static/line_gating_setting.png b/docs/_static/line_gating_setting.png
new file mode 100644
index 0000000..9797eea
Binary files /dev/null and b/docs/_static/line_gating_setting.png differ
diff --git a/docs/_static/umap_after_filtering.png b/docs/_static/umap_after_filtering.png
new file mode 100644
index 0000000..5d89f86
Binary files /dev/null and b/docs/_static/umap_after_filtering.png differ
diff --git a/docs/about/known-issues.rst b/docs/about/known-issues.rst
index 3f0120f..51fc1ce 100644
--- a/docs/about/known-issues.rst
+++ b/docs/about/known-issues.rst
@@ -7,7 +7,7 @@ Please see the following links:
.. bullet_list::
- - :link:`Issues on GitHub <{GitHubLink}/issues>`
+ - :link:`Issues on GitHub `
.. Example for hyperlink:
.. :link:`asdas `
diff --git a/docs/conf.py b/docs/conf.py
index 7e2fff7..6890909 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -1,41 +1,42 @@
-
# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-
import os
import sys
-sys.path.insert(0, os.path.abspath('./extensions'))
# -- Project information -----------------------------------------------------
-
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
-project = "μGrowthDB"
+project = "CellScanner"
organization = "Lab of Microbial Systems Biology"
author = f"{organization} & Contributors"
copyright = f"2025, {author}"
version = "0.0.1"
-# https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-release
release = version
# -- General configuration ---------------------------------------------------
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
+sys.path.insert(0, os.path.abspath('./extensions'))
+
+# Add any Sphinx extension module names here, as strings.
+# They can be extensions coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+# IMPORTANT NOTE: The order you give the extensions in the extensions list MATTERS!
+# e.g. https://github.com/sphinx-doc/sphinx/issues/4221
extensions = [
- "sphinx_inline_tabs",
- "sphinx_design",
- "sphinx_issues",
+
+ # To link to pyqt5 docs
+ "sphinx.ext.autodoc",
+ "sphinx.ext.intersphinx",
+ "sphinx.ext.mathjax",
+ "sphinx.ext.viewcode",
+ "sphinx.ext.napoleon",
+ "sphinx.ext.autosummary",
+ "sphinx_qt_documentation",
+
+ "nbsphinx",
+ "autoapi.extension",
+ "sphinx_search.extension",
# For using CONTRIBUTING.md.
"myst_parser",
@@ -47,71 +48,50 @@
"tags",
"links",
"hacks",
+ # "notfound.extension", ## not in the bac_Growt
- "notfound.extension",
-
- # These extensions require RTDs to work so they will not work locally.
- "hoverxref.extension",
- "sphinx_search.extension",
-
- "autoapi.extension",
- # "recommonmark",
]
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = [
- '_build',
- 'Thumbs.db',
- '.DS_Store',
- ".env",
- "extensions",
- "**/includes",
- "README.md",
- "design-tabs.js", # We are using inline-tabs and this throws errors/warnings
+# -- Options for autoapi -------------------------------------------------------
+autoapi_dirs = ["../cellscanner"]
+autoapi_ignore = []
+
+# NOTE: The autoapi_options and the functions autoapi_skip_member() and setup()
+# make sure class attributes are not shown on the API
+autoapi_options = [
+ "members",
+ "undoc-members",
+ "show-inheritance",
]
-# https://github.com/readthedocs/readthedocs.org/issues/4603
-# `tags` come from the `extensions` folder, where the tags.py is located, including the `Tags` class.
-if os.environ.get('PLATFORM') == "READTHEDOCS":
- tags.add('readthedocs')
- tags.add("birp")
- tags.add("hdrp")
- tags.add("urp")
-else:
- notfound_no_urls_prefix = True
+def autoapi_skip_member(app, what, name, obj, skip, options):
+ # Skip all attributes globally
+ if what == "attribute":
+ return True
+ return None
-# -- Features ----------------------------------------------------------------
+def setup(app):
+ app.connect("autoapi-skip-member", autoapi_skip_member)
-# Auto numbering of figures
-numfig = True
+# Enable typehints
+autodoc_typehints = "signature"
-# GitHub repo
-issues_github_path = "msysbio/CellScanner"
+# Napoleon settings
+napoleon_numpy_docstring = True
-# https://sphinx-hoverxref.readthedocs.io/en/latest/usage.html#tooltip-on-all-ref-roles
-hoverxref_auto_ref = True
-hoverxref_role_types = {
- "ref": "tooltip", # for hoverxref_auto_ref config
-}
+# The master toctree document.
+master_doc = "index"
+
+pygments_style = "sphinx"
# -- Options for HTML output -------------------------------------------------
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-# https://github.com/pradyunsg/furo
-# https://pradyunsg.me/furo/
html_theme = 'furo'
html_title = "CellScanner"
html_short_title = "CellScanner"
-# html_logo = '../logo/crest-oceanrender-logo.svg'
html_logo = '_static/logo.png'
html_favicon = '_static/favicon.ico'
-
html_theme_options = {
"light_logo": 'logo.png', # "crest-oceanrender-logo.svg",
"dark_logo": 'logo-dark.png', # "crest-oceanrender-logo-dark.svg",
@@ -144,143 +124,48 @@
html_output_encoding = "utf-8"
-# -- Options for PDF output --------------------------------------------------
-# Customise PDF here. maketitle overrides the cover page.
-latex_elements = {
- # "maketitle": "\\input{your_cover.tex}"
- # "maketitle": "\\sphinxmaketitle",
-}
+# -- Features ----------------------------------------------------------------
-# latex_logo = "../logo/crest-oceanrender-logomark512.png"
-latex_logo = "_static/logo.png"
+# Auto numbering of figures
+numfig = True
-# -- Templating --------------------------------------------------------------
+issues_github_path = "hariszaf/cellscanner"
-# The default role will be used for `` so we do not need to do :get:``.
-default_role = "get"
+mathjax_path = (
+ "https://cdn.mathjax.org/mathjax/latest/"
+ "MathJax.js?config=TeX-AMS-MML_HTMLorMML"
+)
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
-# "replace" substitutions are static/global:
-# |name1| replace:: value
-# |name1|
-# Cannot do this:
-# |name2| replace:: |name1|
-# Inline content has no nested parsing.
-
-# "set" only supports inline content. It will pass its contents to the parser so roles will be processed. Brace
-# substitution is supported and is text only (it will lose any nodes). Use it when you need substitutions in role
-# content.
-# .. set:: LongName Example
-# .. set:: ShortName :abbr:`{LongName}`
-# An example of using `ShortName`.
-
-# For links where you want to use substitutions, use the link role:
-# .. set Something Example Page
-# .. set BaseURL https://example.com
-# :link:`Link Text for {Something} <{BaseURL}/example>`
-# Pass the URL within the angle brackets. Brace substitution will work and will be text only for URLs and support nodes
-# for the link text.
-#
-# For URLs, it is best to use braces even in "set" as they don't require being enclosed in escaped whitespace:
-# .. set:: Link `LinkBase`\ /something/\ `LinkPart`\ /example.html
-# Versus:
-# .. set:: Link {LinkBase}/something/{LinkPart}/example.html
-
-# The following will be included before every page:
-rst_prolog = f"""
-.. tags::
-.. set:: AssetVersion {version}
-
-"""
-rst_prolog = rst_prolog + """
-.. set:: RPMinVersion 10.10
-.. set:: UPMDocLinkBase \https://docs.unity3d.com/Packages
-.. set:: RPDocLinkBase \https://docs.unity3d.com/Packages/com.unity.render-pipelines.
-.. set:: UnityMinVersion 2020.3
-.. set:: UnityDocsLinkBase https://docs.unity3d.com/{UnityMinVersion}/Documentation
-.. set:: UnityDocLink https://docs.unity3d.com/{UnityMinVersion}/Documentation/Manual
-.. set:: UnityDocScriptLink {UnityDocsLinkBase}/ScriptReference
-.. set:: UnityIssueLink https://issuetracker.unity3d.com/product/unity/issues/guid
-.. set:: AssetStoreLinkBase \https://assetstore.unity.com/packages/tools/particles-effects
-.. set:: DocLinkBase https://mgrowthdb.readthedocs.io/en/{AssetVersion}
-.. set:: GitHubLink \https://github.com/msysbio/CellScanner
-.. set:: WikiLink \{GitHubLink}/wiki
-
-.. set:: SGDocLink {UPMDocLinkBase}/com.unity.shadergraph@{RPMinVersion}/manual
-
-.. set:: [BIRP] :guilabel:`BIRP`
-.. set:: BIRPNameLong Built-in
-.. set:: BIRPNameShort BIRP
-.. set:: BIRPNameSlug birp
-.. set:: BIRP :abbr:`{BIRPNameShort} ({BIRPNameLong} Render Pipeline)`
-.. set:: BIRPMinVersion `RPMinVersion`
-.. set:: BIRPDocLink {UnityDocLink}/
-.. set:: BIRPAssetDocLink {DocLinkBase}?rp={BIRPNameSlug}
-
-.. set:: [URP] :guilabel:`URP`
-.. set:: URPNameLong Universal
-.. set:: URPNameShort URP
-.. set:: URPNameSlug urp
-.. set:: URP :abbr:`{URPNameShort} ({URPNameLong} Render Pipeline)`
-.. set:: URPMinVersion `RPMinVersion`
-.. set:: URPDocLink {RPDocLinkBase}universal@{URPMinVersion}/manual
-.. set:: URPAssetLink {AssetStoreLinkBase}/crest-ocean-system-urp-141674
-.. set:: URPAssetDocLink {DocLinkBase}/?rp={URPNameSlug}
-
-.. set:: [HDRP] :guilabel:`HDRP`
-.. set:: HDRPNameLong High Definition
-.. set:: HDRPNameShort HDRP
-.. set:: HDRPNameSlug hdrp
-.. set:: HDRP :abbr:`{HDRPNameShort} ({HDRPNameLong} Render Pipeline)`
-.. set:: HDRPMinVersion `RPMinVersion`
-.. set:: HDRPDocLink {RPDocLinkBase}high-definition@{HDRPMinVersion}/manual
-.. set:: HDRPAssetLink {AssetStoreLinkBase}/crest-ocean-system-hdrp-164158
-.. set:: HDRPAssetDocLink {DocLinkBase}/?rp={HDRPNameSlug}
-
-.. set:: Crest *Crest*
-
-.. set:: TAA :abbr:`TAA (Temporal Anti-Aliasing)`
-.. set:: SMAA :abbr:`SMAA (Subpixel Morphological Anti-Aliasing)`
-.. set:: SPI :abbr:`SPI (Single-Pass Instanced)`
-.. set:: MP :abbr:`MP (Multi-Pass)`
-.. set:: FFT :abbr:`FFT (Fast Fourier Transform)`
-.. set:: GC :abbr:`GC (Garbage Collector)`
-.. set:: SSR :abbr:`SSR (Screen-Space Reflections)`
-.. set:: SSAO :abbr:`SSAO (Screen-Space Ambient Occlusion)`
-.. set:: SAO :abbr:`SSAO (Scalable Ambient Occlusion)`
-.. set:: STPP :abbr:`STPP (Spatial-Temporal Post-Processing)`
-
-.. set:: DWP2 :abbr:`DWP2 (Dynamic Water Physics 2)`
-
-.. set:: Time.time :link:`Time.time <{UnityDocScriptLink}/Time-time.html>`
-.. set:: Time.timeScale :link:`Time.timeScale <{UnityDocScriptLink}/Time-timeScale.html>`
-.. set:: Timeline :link:`Timeline <{UPMDocLinkBase}/com.unity.timeline@1.5/manual/tl_about.html>`
-.. set:: Playable_Director :link:`Playable Director <{UPMDocLinkBase}/com.unity.timeline@1.5/manual/play_director.html>`
-.. set:: Master_Stack :link:`Master Stack <{SGDocLink}/Master-Stack.html>`
-.. set:: HDRP_Lit_Shader :link:`Lit Shader <{HDRPDocLink}/Lit-Shader.html>`
-.. set:: URP_Lit_Shader :link:`Lit Shader <{URPDocLink}/lit-shader.html>`
-"""
-
-# -- Debugging ---------------------------------------------------------------
-
-# For debugging if you want to always have a tag on or off
-# tags.add("tag")
-# tags.remove("tag")
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
-# -- Options for autoapi -------------------------------------------------------
-autoapi_type = "python"
-autoapi_dirs = ["../scripts"]
-autoapi_keep_files = True
-autoapi_root = "api"
-autoapi_member_order = "groupwise"
+# myst_enable_extensions = ["colon_fence"]
+
+# -- Templating --------------------------------------------------------------
+
+# The default role will be used for `` so we do not need to do :get:``.
+default_role = "get"
# -- Options for markdown -------------------------------------------------------
-# source_suffix = ['.rst', '.md']
# No need to manually register .md, as myst_parser handles it
source_suffix = {
'.rst': 'restructuredtext',
'.md': 'markdown', # This is registered automatically by myst_parser
+}
+
+intersphinx_mapping = {
+ "python": ("https://docs.python.org/3", None),
+ "PyQt5.QtWidgets": ("https://www.riverbankcomputing.com/static/Docs/PyQt5", None),
+ "PySide6": ("https://doc.qt.io/qtforpython/", None),
+ "Numpy": ("https://docs.scipy.org/doc/numpy/", None),
+ "pandas": ("https://pandas.pydata.org/pandas-docs/stable/", None),
+ "packaging": ("https://packaging.pypa.io/en/latest/", None),
+ # from here: https://github.com/GPflow/tensorflow-intersphinx/
+ "tensorflow": ("https://www.tensorflow.org/api_docs/python", "https://github.com/GPflow/tensorflow-intersphinx/raw/master/tf2_py_objects.inv"),
+ 'sklearn': ('http://scikit-learn.org/stable', None)
}
\ No newline at end of file
diff --git a/docs/faqs/faqs.rst b/docs/faqs/faqs.rst
index 8c61e5b..4b6e5bd 100644
--- a/docs/faqs/faqs.rst
+++ b/docs/faqs/faqs.rst
@@ -5,6 +5,6 @@ Frequently Asked Questions
When you submit read data to CellScanner, we store and accession your files within.
.. :ref: ``
-As part of your run..
+As part of your run..
diff --git a/docs/index.rst b/docs/index.rst
index 684ce71..e26db9e 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -7,26 +7,6 @@ CellScanner
:target: https://cellscanner.readthedocs.io/en/latest/?badge=latest
:alt: Documentation Status
-.. NOTE:
-.. Subsequent captions are broken in PDFs: https://github.com/sphinx-doc/sphinx/issues/4977.
-
-.. NOTE:
-.. :numbered: has bugs with PDFs: https://github.com/sphinx-doc/sphinx/issues/4318.
-
-.. NOTE:
-.. only directive does not work with tocree directive for HTML.
-
-.. .. only:: latex
-..
-.. .. toctree::
-.. :hidden:
-.. :caption: User Guide
-..
-.. about/introduction
-
-.. NOTE:
-.. ":numbered: 1" means numbering is only one deep. Needed for the version history.
-
.. toctree::
:numbered: 1
@@ -36,7 +16,6 @@ CellScanner
about/background
about/known-issues
about/history
-.. about/integrations
.. toctree::
@@ -44,6 +23,7 @@ CellScanner
:maxdepth: 3
:caption: Tutorial
+ tutorials/install
tutorials/gui
@@ -55,35 +35,7 @@ CellScanner
faqs/faqs
-
.. toctree::
:maxdepth: 3
:caption: Developer Guide
- .. dev/contributing
-
-
-.. ===============================================
-
-
-
-.. NOTE:
-.. Tried to have only the title show in the ToC, but it looks like Sphinx is ignoring toctree options.
-
-.. .. only:: latex
-..
-.. .. toctree::
-..
-.. meta/history
-
-.. TODO:
-.. user/support
-
-.. only:: html
-
- .. .. toctree::
- .. :maxdepth: 3
- .. :caption: Developer Guide
-
- .. dev/contributing
-
diff --git a/docs/requirements.txt b/docs/requirements.txt
index 7bd4b9d..b68ae7b 100644
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -9,4 +9,7 @@ sphinx-hoverxref==1.3.0
sphinx-inline-tabs==2023.4.21
sphinx-issues==3.0.1
sphinx-notfound-page==1.0.0
-sphinx-autoapi==3.4.0
\ No newline at end of file
+sphinx-autoapi==3.4.0
+sphinx-qt-documentation==0.4.1
+PyQt5
+nbsphinx
\ No newline at end of file
diff --git a/docs/tutorials/gui.md b/docs/tutorials/gui.md
index 7494a90..a33aab5 100644
--- a/docs/tutorials/gui.md
+++ b/docs/tutorials/gui.md
@@ -22,87 +22,123 @@ The flow cytometry data for the growth curves shown above are available at
We are going to work here only with one time point (50 hours). You can find the files used in the tutorial
[here](http://msysbiology.com/documents/CellScanner/CS2TutorialFiles.zip).
-When you open CellScanner, you see the graphical user interface (GUI) shown below. Please be patient, opening the GUI can sometimes take a minute.
+
+When you open CellScanner, you see the graphical user interface (GUI) shown below. Please be patient, opening the GUI can sometimes take a minute. For further information on how to install CellScanner, you may have a look [here](./install.md).

## Import Data
-The first step is to import the data. You can do this by clicking on **"Import Data"**. When you click the *Select blank file* button, you can navigate to the corresponding fcs files. When you click the *Add Species* button, a new button appears next to a text field that allows you to assign the name to the fcs files belonging to the same species.
-Note that **you can select several files at once** for both blank and mono-culture files!
+The first step is to import the data. Flow cytometer (fcs) files are basically large tables with events (particles or cells) as rows and measurements in different channels as columns. Blanks are fcs files collected for the cell-free medium. A monoculture file is an fcs file for a single species, ideally inoculated in the same medium used for the blank.
+We will now read these files into CellScanner.
+For this, click on **"Import Data"** to open the panel dedicated to data import tasks. When you click the *Select blank file* button, you can navigate to the corresponding fcs files. When you click the *Add Species* button, a new button appears next to a text field that allows you to assign the name to the fcs files belonging to the same species.
+Note that **you can select several files at once** for both blank and monoculture files!
Optionally, you can also specify an output directory where results will be saved.
If you do not specify one, results will go in an output folder created on the fly inside the CellScanner directory.
-If you previously trained a model for your data, you can also re-use it. Here, we work with two blank files and three biological replicates for each monoculture, with the samples collected at 50 hours.
+If you previously trained a model for your data, you can also re-use it. Here, we work with two blank files and three biological replicates for each monoculture, with the samples collected at 50 hours. Thus, three files are loaded for each species.

+
## Train Model
Next, we open the **Train Model** panel.
If the CellScanner window becomes too big for your screen, close the **Import Data** panel.
-Here, we are going to use default values as shown below.
-UMAP is run first to remove debris.
-Essentially, this is done by clustering events from blanks and monocultures and then removing events from monocultures that are too similar to events in blanks.
-Next, a model (here a neural network) is trained on the filtered monocultures.
-
-Optionally, you can apply gating to the mono-cultures. If you wish to do so, please open the **Run prediction** panel and click the *Apply line gating* checkbox. In this example, samples were treated with SYBR Green and propidium iodide. The latter is not membrane-permeable and is a red flurescent stain. Thus, it can enter cells only if their membrane is ruptured and cells stained red are therefore treated as dead.
-In contrast, SYBR Green is a membrane-permeable green fluorescent stain that binds to DNA. Thus, any event that is not green does not contain DNA and should better not be counted as a cell. The thresholds have to be specified as a function of the intensity values for the corresponding channels in mono- and cocultures. Usually, the software accompanying your flow cytometer can visualise intensities in different channels as histograms and scatter plots, thereby helping you to select thresholds. Here, we will set the thresholds as follows:
-- Staining inactive cells: FL2-H > 2000000
-- Staining all cells: FL1-A > 500000
+UMAP is run first to improve the separation between the monocultures and the blanks. Monoculture events that resemble blank events or events in other monocultures will disturb the classification and should be removed. This is in essence what UMAP does. It embeds events in a space defined by the flow cytometry measurements. We then identify the nearest neighbors of each event in that space. An event is filtered if less than the specified number of neighbors in the UMAP embedding have the same label (i.e. the same monoculture or blank).
+Next, a model (a neural network) is trained on the filtered monocultures and blanks. The number of events used for training is another parameter and is specified per file. Thus, if several fcs files were provided per species, the number of events considered for training is the file number per species times the specified parameter. The number of folds determines into how many sub-sets the data will be partitioned for training and testing. By default, there is no such partitioning and training is carried out on 80% of the events. The epoch number specifies how often the model is trained on the data and the early stopping criterion determines after how many training rounds the model should stop in case its performance no longer improves. Here, we are going to use the default values for UMAP and the neural network as shown below.
+You can **launch the model training step by clicking on** the green **Train** button.

+**Optionally**, you can **apply gating** to the monocultures, which is then carried out before UMAP. If you wish to do so, please click the *Apply line gating* checkbox. In this example, as mentioned above, samples were treated with SYBR Green and propidium iodide. The latter is a red fluorescent stain that cannot pass the cell membrane. Thus, it can enter cells only if their membrane is ruptured and cells stained red are therefore treated as dead.
+In contrast, SYBR Green is a membrane-permeable green fluorescent stain that binds to DNA. Thus, any event that is not green does not contain DNA and should better not be counted as a cell. The thresholds have to be specified as a function of the intensity values for the corresponding channels across all monocultures. Usually, the software accompanying your flow cytometer can visualise intensities in different channels as histograms and scatter plots, thereby helping you to select thresholds. Here, we will set the thresholds as follows:
+
+
+
+The plot below illustrates the gating carried out. FITC and PerCP refer to the green and red fluorescence channel, respectively (A and H stand for area and height of the fluorescence signal). All events above the red line and all events to the left of the green line will be treated as dead cells or debris respectively and removed. The dead-cell filter is chosen leniently here to make sure that only cells with a strong red signal are treated as dead.
+
+
+
+
Model training should be fast (within one minute).
-Model performance files will be stored in a sub-folder in your specified output folder (if you did not specify one, then in the CellScanner folder).
-The sub-folder name starts with `working_files` and ends with a time stamp.
-It contains another folder called `model`, in which you will find a number of files encoding the trained neural network, a file called `model_statistics.csv` and two html files, which will open in your browser when clicked.
-The first shows a UMAP projection before and the second one after filtering.
-An event is filtered if its neighbors in the UMAP embedding do not have the same label (the number of neighbors considered is among CellScanner's parameters).
-The `model_statistics.csv` file contains information about classification performance, including accuracy, precision, recall, F1 score and the confusion matrix.
+Model performance files will be stored in a sub-folder in your specified output folder (if you did not specify one, then in the CellScanner folder, in `cellscanner/scripts`).
+The sub-folder name starts with `working_files` and ends with a time stamp. Please note that if training is rerun, training result files can be overwritten if no new output folder is specified.
+The `working_files` folder contains the input files and another folder called `model`, in which you will find a number of files encoding the trained neural network, a file called `model_statistics.csv` and two html files, `umap_Before_filtering.html` and `umap_After_filtering.html`, which will open in your browser when clicked.
+The first shows a UMAP projection before and the second one after filtering of events.
+The **`model_statistics.csv` file contains information about classification performance, including accuracy, precision, recall, F1 score and the confusion matrix**.
+
+This is the confusion matrix for our tutorial files:
+
+| Species | Blank | BT | RI |
+| ----------- | ----------- | ------- | ----|
+| Blank | 84 | 0 | 0 |
+| BT | 0 | 592 | 2 |
+| RI | 0 | 1 | 591 |
+
+The confusion matrix is computed by creating in-silico communities with known composition and feeding them to the trained model. It shows how well each species is separated from the other species and from debris (blank). Here, the model performed very well, with only a few events that were misclassified.
+
+This separation is also seen in the UMAP space plotted in `umap_After_filtering.html`, as shown below. In the html file, the plot is interactive and you can click on the label to hide the corresponding events from the plot.
+
+
+
## Run prediction
We are now ready to apply the trained neural network on one or several cocultures.
For this, we open the **Run Prediction** panel by clicking on it.
As with monocultures, several coculture files can be selected and imported at once.
If more than one coculture is selected, the trained neural network will be applied to each coculture in turn.
-Here, we are importing six replicates of the coculture (btriA-F).
-**Optionally, the "uncertainty" thresholding can be enabled** by clicking the box next to
+Here, we are importing six replicates of the coculture (btriA-btriF).
+
+Next, we specify three flow cytometer channels to be used in the visualization. Here, these are the forward scatter (FSC), side scatter (SSC) and green fluorescence (FITC) channel.
+
+**Optionally, the "uncertainty" thresholding can be enabled** by clicking the checkbox next to
*"Apply filtering on the predictions based on their uncertainty scores"*.
-Events that cannot be easily assigned to one species have a high **uncertainty** (entropy).
-CellScanner automatically computes an uncertainty threshold that maximizes model performance.
-If uncertainty thresholding is enabled, events with uncertainty above this threshold will be filtered out.
-Note that the threshold can be manually adjusted.
-Next, we specify three flow cytometer channels to be used in the visualization.
+Events that cannot be easily assigned to one species (or blank) have a high **uncertainty** (entropy).
+CellScanner automatically computes an uncertainty threshold that maximizes model performance and displays it in the field next to the checkbox for uncertainty filtering. If uncertainty thresholding is enabled, events with uncertainty above this threshold will be filtered out. Note that the threshold can be manually adjusted.
-We can also **optionally** use the line gating that we already appplied to the monocultures, which will prefilter fcs files before UMAP. In this example, FL1 and FL2 are the green and red channel, respectively, whereas H and A are height and area of the light signal. All events with FL2-H values above the threshold will be treated as dead cells and removed. All events with FL1-A values above the threshold will be counted as cells, so events below it will be removed as DNA-free debris. This implements a simple line gating as shown in the scatter plot below for one coculture. Of note, the same thresholds are applied to all mono- and cocultures and should therefore be carefully selected based on visual inspection of intensities in corresponding fcs files.
-
+Optionally, line gating can also be applied to the coculture. Here, we are going to use the same thresholds as for the monocultures.
Clicking *"Predict"* will then launch the prediction step.

-The prediction should also happen within one minute. The output is stored in a folder called "Prediction" (followed by a time stamp) that is either located in the specified output folder or the CellScanner folder.
-For each coculture, the following files are generated:
+The prediction should also happen within one minute. The output is stored in a folder called "Prediction" (followed by a time stamp) that is either located in the specified output folder or the `CellScanner/cellscanner/scripts` folder. For each coculture, the following files are generated (file names start with coculture name):
-- `prediction_counts.csv`, which contains the predicted counts for debris (blank), for each species, and also for the unknown events if uncertainty thresholding was enabled
-- `raw_predictions.csv`, which is the fc file extended with prediction results (labels and, if enabled, uncertainties)
-- `uncertainty_counts.csv`, which lists the number of uncertain events per label if uncertainty thresholding was enabled
- `3D_coculture_predictions_species.html` plots events in a 3D plot spanned by the three selected flow cytometer channels and colors them by species
-- `3D_coculture_predictions_uncertainty.html` is the same with events colored by prediction uncertainty
-- sub-folder `gated` provides more information and a plot on gating if stains were provided
-- sub-folder `heterogeneity_results` quantifies and visualizes overall and species-specific heterogeneity
+- **`prediction_counts.csv`**, which contains the predicted counts for debris (blank), for each species, and also for the unknown events if uncertainty thresholding was enabled
+- `raw_predictions.csv`, which is the fcs file of the coculture extended with prediction results (labels and, if enabled, uncertainties)
-If more than one coculture file was provided, `merged_prediction_counts.csv` will list the counts for each coculture, and `merged_uncertainty_counts.csv` will list the number of uncertain events in each category for each coculture.
-Below is the result for the six coculture replicates:
+In addition, three subfolders are created:
-| Species | Coculure 1 | Coculture 2 | Coculture 3 | Coculture 4 | Coculture 5 | Coculture 6 |
-| ----------- | ----------- | ------- | ----| -----| ---- | ------ |
-| BT | 140116 | 158746 | 140214 | 142779 | 154802 | 144496
-| RI | 80022 | 40105 | 89645 | 75130 | 94461 | 90365 |
-| Blank | 664 | 705 | 677 | 594| 817 | 687 |
-| Unknown | 29 | 39 | 38 | 45 | 246 | 127 |
+- subfolder `gated` contains 3D plots for different gating categories depending on gating input parameters as well as `raw_gating.csv`, which is the input file with labels resulting from gating appended
+- subfolder `heterogeneity_results` quantifies overall and species-specific heterogeneity, for each coculture separately as well as for all co-cultures together in a merged file called `merged_heterogeneity_results.csv`
+- subfolder `uncertainty_counts` is created when an uncertainty filter was specified and contains two files for each co-culture, namely `3D_coculture_predictions_uncertainty.html` with a 3D plot in which events are colored by uncertainty and `uncertainty_counts.csv`, which lists the number of uncertain events per label
-At 50 hours, the coculture is dominated by *Bacteroides thetaiotaomicron* according to CellScanner.
+If more than one coculture file was provided, **`merged_prediction_counts.csv`** will list the counts in the different categories for each coculture.
+Below is the final result for the six coculture replicates (found in `merged_prediction_counts.csv`):
+
+| Species | Coculture A | Coculture B | Coculture C | Coculture D | Coculture E | Coculture F |
+| ----------- | ----------- | ------- | ----| -----| ---- | ------ |
+| BT_debris | 7035.0 | 4779.0 | 4320.0 | 6514.0 | 40316.0 | 19945.0 |
+| **BT_live** | 132000.0 | 152150.0 | 135189.0 | 135168.0 | 111288.0 | 122545.0 |
+| RI_debris | 238.0 | 90.0 | 251.0 | 214.0 | 376.0 | 366.0 |
+| **RI_live** | 76367.0 | 36993.0 | 84986.0 | 71656.0 | 90392.0 | 86118.0 |
+| RI_dead | 957.0 | 1371.0 | 1163.0 | 1002.0 | 895.0 | 867.0 |
+| Blank_debris | 602.0 | 631.0 | 641.0 | 547.0 | 675.0 | 621.0 |
+| Blank_notdead | 40.0 | 74.0 | 29.0 | 34.0 | 179.0 | 87.0 |
+| Unknown | 3592.0 | 3507.0 | 3995.0 | 3413.0 | 6205.0 | 5126.0 |
+
+This table classifies the co-culture events in a number of categories.
+The neural network was trained to distinguish not only species from each other but also from events in the blanks (which do not contain cells). Events labeled as `blank` are therefore co-culture events that the neural network considered too similar to events encountered in blanks.
+`debris` refers to events filtered out after classification because their signal was too weak in the specified channel (here FITC-A) and `dead` refers to events with a strong red signal that means that the cell membrane was compromised. Both categories only appear if the corresponding stains were specified.
+Finally, `unknown` are events that could not be clearly classified by the neural network as a species or a blank.
+The order of these filters is as follows: only events that are not unknown can be classified as debris and only events not classified as debris can be classified as live or dead. The total count of events in a coculture sample is independent of the specified filters.
+
+The total cell count can be obtained by summing the dead and live cell counts. Of note, no dead Bacteroides cells were detected with the lenient dead-cell threshold specified here and therefore the live cell count is equal to the total cell count in this case.
+
+Thus, at 50 hours, the coculture is dominated by *Bacteroides thetaiotaomicron* according to CellScanner.
+
+
diff --git a/docs/tutorials/install.md b/docs/tutorials/install.md
new file mode 100644
index 0000000..6ccfd7f
--- /dev/null
+++ b/docs/tutorials/install.md
@@ -0,0 +1,51 @@
+Installation
+============
+
+CellScanner has been tested with Python 3.12.x on Linux, macOS and Windows platforms.
+
+In the following sections we describe how you can access it.
+
+
+## Linux and macOS
+
+To run CellScanner you first need to get it along with its corresponding dependencies.
+To this end, you may run the following chunk to get CellScanner and create a `conda` environment for it:
+
+```bash
+git clone https://github.com/msysbio/CellScanner.git
+cd CellScanner
+conda create -n cellscanner python=3.12.2
+conda activate cellscanner
+pip install -r requirements.txt
+```
+
+Then, to fire the GUI you just need to run
+
+```bash
+./cellscanner/Cellscanner.py
+```
+
+from the root folder of the repo.
+
+
+## Windows
+
+In Windows, you may also follow the steps described above for the Linux and macOS systems.
+If you try through a WSL, however, an X11 server is required, which you would have to set up on your own.
+
+Alternatively, you can build an `.exe` on your own.
+**Attention!** Do not use a WSL for this. Also, make sure that [`pyinstaller`](https://pyinstaller.org/en/stable/) is available.
+
+Then, after you make sure you have activated the `cellscanner` conda environment, you may run:
+
+```bash
+pyinstaller --onefile --icon=logo.ico --add-data "logo.png:." Cellscanner.py
+```
+
+
+**.. or**
+
+you can simply download the `.exe` of CellScanner v2.0 from [here]().
+
+
+
diff --git a/scripts/helpers.py b/scripts/helpers.py
deleted file mode 100644
index 511b182..0000000
--- a/scripts/helpers.py
+++ /dev/null
@@ -1,272 +0,0 @@
-import os
-import sys
-from datetime import datetime
-from dataclasses import dataclass
-from typing import Optional
-import numpy as np
-import pandas as pd
-from .illustrations import gating_plot
-
-def get_app_dir():
- """Get absolute path relative to the executable location."""
- if hasattr(sys, '_MEIPASS'):
- base_path = sys._MEIPASS
- else:
- base_path = os.path.dirname(os.path.abspath(__file__))
- return base_path
-
-
-def get_abs_path(relative_path):
- """Get absolute path to a resource, relative to the base directory."""
- return os.path.join(get_app_dir(), relative_path)
-
-
-def time_based_dir(prefix, base_path, multiple_cocultures=False):
- timestamp = datetime.now().strftime("%d-%m-%Y_%H-%M")
- time_dir_name = "_".join([prefix, timestamp])
- if os.getcwd() == "/app":
- time_dir = os.path.join("/media", time_dir_name)
- else:
- time_dir = os.path.join(base_path, time_dir_name)
- if os.path.exists(time_dir) and multiple_cocultures is False:
- base, ext = os.path.splitext(time_dir)
- counter = 1
- while os.path.exists(f"{base}_{counter}{ext}"):
- counter += 1
- time_dir = f"{base}_{counter}{ext}"
- return time_dir
-
-
-def button_style(font_size=12, padding=5, color="black", bck_col="#90EE90",
- bck_col_hov="#7FCF7F", bck_col_clicked="#72B572", radius=5):
- style = f"""
- QPushButton {{
- font-size: {font_size}px;
- font-weight: bold;
- padding: {padding}px;
- color: {color};
- background-color: {bck_col}; /* Light green color */
- border-radius: {radius}px;
- }}
- QPushButton:hover {{
- background-color: {bck_col_hov}; /* Slightly darker green on hover */
- }}
- QPushButton:pressed {{
- background-color: {bck_col_clicked}; /* Even darker when pressed */
- }}
- """
- return style
-
-
-def load_model_from_files(trained_model_dir):
-
- print("Loading model from files")
- from tensorflow.keras.models import load_model
- import joblib
-
- modelfiles = ["trained_model.keras", "scaler.pkl", "label_encoder.pkl"]
- model_path, scaler_path, le_path = [os.path.join(trained_model_dir, x) for x in modelfiles]
-
- try:
- model = load_model(model_path)
- scaler = joblib.load(scaler_path)
- label_encoder = joblib.load(le_path)
- return model, scaler, label_encoder
-
- except Exception as e:
- print(f"Error loading model or preprocessing objects: {e}")
- raise ValueError(f"No valid model directory. Check whether all 3 required files are there and valid.")
-
-
-def create_file_path(output_dir, sample, name, extension):
- """Helper function to create file paths."""
- if sample:
- return os.path.join(output_dir, f"{sample}_{name}.{extension}")
- return os.path.join(output_dir, f"{name}.{extension}")
-
-
-def get_stains_from_panel(Panel):
- """
- Build Stain instances for the two main stain types of living/dead and cells/not cells cases.
- In this case, no label is part of the Stain instance.
- Function to be used only in the GUI framework.
-
- Arguments:
- PredictionPanel:
- Returns:
- stain1 (Stain)
- stain2 (Stain)
- """
- # Stain 1
- stain_1 = Panel.stain1_selector.combo.currentText() # It should be the column name
- if stain_1 != "Not applicable":
- stain1_channel = stain_1
- stain1_relation = Panel.stain1_selector.relation.currentText()
- stain1_threshold = float(Panel.stain1_selector.threshold.text())
- stain1 = Stain(stain1_channel, stain1_relation, stain1_threshold)
- else:
- stain1 = None
-
- # Stain 2
- stain_2 = Panel.stain2_selector.combo.currentText() # It should be the column name
- if stain_2 != "Not applicable":
- stain2_channel = stain_2
- stain2_relation = Panel.stain2_selector.relation.currentText()
- stain2_threshold = float(Panel.stain2_selector.threshold.text()) if Panel.stain2_selector.threshold.text() else None
- stain2 = Stain(stain2_channel, stain2_relation, stain2_threshold)
- else:
- stain2 = None
-
- return stain1, stain2
-
-
-def stain_sannity_check(df, label, channel, sign, threshold):
- """
- Checks if gating applied for a stain returns both True and False cases.
- If not, raises an error so the user refines their thresholds.
- """
- counts = df[label].value_counts()
- if True not in counts.index or False not in counts.index:
- stain_min, stain_max = np.min(df[channel]), np.max(df[channel])
- raise ValueError(
- f"Invalid gating. Please check the gating thresholds."
- f"Stain {channel} ranges between {stain_min} and {stain_max}, while current gating thresholds are {sign} {threshold}."
- )
-
-
-def apply_gating(data_df,
- stain1=None,
- stain2=None,
- extra_stains=None
- ):
-
- all_labels = []
-
- # Copy the DataFrame to not change the original data
- gated_data_df = data_df.copy()
-
- # Temporarily remove the 'predictions' column to avoid issues with numeric operations -- irrelevant in TRAINING
- predictions_column = gated_data_df.pop('predictions') if 'predictions' in gated_data_df.columns else None
-
- # Reintegrate the 'predictions' column after the arcsinh transformation
- if predictions_column is not None:
- gated_data_df['predictions'] = predictions_column
-
- if stain1 is not None:
-
- if stain1.channel is not None and stain1.channel != "Not applicable":
-
- # Initialize the 'state' column with 'not dead'
- gated_data_df['dead'] = False
- # Apply gating based on the first stain (live/dead)
- if stain1.sign in ['>', 'greater_than']:
- gated_data_df.loc[gated_data_df[stain1.channel] > stain1.value, 'dead'] = True
- elif stain1.sign in ['<', 'less_than']:
- gated_data_df.loc[gated_data_df[stain1.channel] < stain1.value, 'dead'] = True
- # Sannity check
- try:
- stain_sannity_check(gated_data_df, "dead", stain1.channel, stain1.sign, stain1.value)
- all_labels.append("dead")
- except ValueError as e:
- raise ValueError(f"Gating failed for stain1: {e}") from e # Preserve original traceback
-
- if stain2 is not None:
- if stain2.channel is not None and stain2.channel != "Not applicable":
-
- # Initialize the 'state' column with 'not dead'
- gated_data_df['cell'] = False
- # Apply gating based on the first stain (live/dead)
- if stain2.sign in ['>', 'greater_than']:
- gated_data_df.loc[gated_data_df[stain2.channel] > stain2.value, 'cell'] = True
- elif stain2.sign in ['<', 'less_than']:
- gated_data_df.loc[gated_data_df[stain2.channel] < stain2.value, 'cell'] = True
- # Sannity check
- try:
- stain_sannity_check(gated_data_df, "cell", stain2.channel, stain2.sign, stain2.value)
- all_labels.append("cell")
- except ValueError as e:
- raise ValueError(f"Gating failed for stain2: {e}") from e # Preserve original traceback
-
- # Apply gating based on the second stain (debris)
- if stain1 is not None and stain2 is not None:
- if stain2.channel and stain2.value and stain1.channel:
-
- gated_data_df["state"] = "debris"
-
- gated_data_df["state"].loc[
- (gated_data_df["dead"] == False) & (gated_data_df["cell"] == True)
- ] = "live"
-
- gated_data_df["state"].loc[
- (gated_data_df["dead"] == True) & (gated_data_df["cell"] == True)
- ] = "inactive"
-
- all_labels.append("state")
-
- # Apply gating on extra stains
- if extra_stains is not None:
- for channel, details in extra_stains.items():
- sign, threshold, label = details
- # Create the comparison operator dynamically
- condition = gated_data_df[channel] > threshold if sign == ">" else gated_data_df[channel] < threshold
- gated_data_df[label] = condition
- try:
- stain_sannity_check(gated_data_df, label, channel, sign, threshold)
- except ValueError as e:
- raise ValueError(f"Gating failed for extra stain: {e}") from e # Preserve original traceback
-
- all_labels.append(label)
-
- return gated_data_df, all_labels
-
-
-
-def save_gating_results(gated_data_df, output_dir, sample, x_axis, y_axis, z_axis, all_labels):
- """
-
- """
- # Create a directory for gating results
- gated_dir = os.path.join(output_dir, 'gated')
- os.makedirs(gated_dir, exist_ok=True)
-
- # Initialize an empty DataFrame to hold all state counts
- combined_counts_df = pd.DataFrame()
-
- # Iterate over each species and calculate the state counts
- species_names = gated_data_df['predictions'].unique()
- if "state" in all_labels:
- all_labels.remove("dead") ; all_labels.remove("cell")
-
- for species in species_names:
- species_df = pd.DataFrame()
- for label in all_labels:
- if label == "state":
- s = gated_data_df[gated_data_df['predictions'] == species][label].value_counts()
- else:
- s = gated_data_df[gated_data_df['predictions'] == species][label].value_counts()
- if s.index[0] == True:
- s.index = [label, "_".join(["not", label])] if len(s.index) == 2 else [label]
- else:
- s.index = ["_".join(["not", label]), label] if len(s.index) == 2 else ["_".join(["not", label])] # Default for False case
- s.name = species
- species_df = pd.concat([species_df, s], axis=0)
-
- combined_counts_df = pd.concat([combined_counts_df, species_df], axis=1)
-
- # Save the combined state counts to a single CSV file
- combined_counts_df.to_csv(
- os.path.join(gated_dir, "_".join([sample,'gating.csv']))
- )
-
- # Plot status if both stains provided
- gating_plot(gated_data_df, species_names, x_axis, y_axis, z_axis, gated_dir, sample, all_labels)
-
- print("3D scatter plot for gated data saved to:", gated_dir)
-
-
-@dataclass
-class Stain:
- channel: str
- sign: str
- value: float
- label: Optional[str] = None